|
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 150.0,
"eval_steps": 500,
"global_step": 304500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.25,
"grad_norm": 0.13165710866451263,
"learning_rate": 3.119868637110016e-05,
"loss": 0.8403,
"step": 500
},
{
"epoch": 0.49,
"grad_norm": 0.676755428314209,
"learning_rate": 3.114737274220033e-05,
"loss": 0.8005,
"step": 1000
},
{
"epoch": 0.74,
"grad_norm": 0.5474051237106323,
"learning_rate": 3.109605911330049e-05,
"loss": 0.7733,
"step": 1500
},
{
"epoch": 0.99,
"grad_norm": 0.311629056930542,
"learning_rate": 3.104474548440066e-05,
"loss": 0.7443,
"step": 2000
},
{
"epoch": 1.23,
"grad_norm": 0.3807041645050049,
"learning_rate": 3.099343185550082e-05,
"loss": 0.7288,
"step": 2500
},
{
"epoch": 1.48,
"grad_norm": 0.38295918703079224,
"learning_rate": 3.0942118226600984e-05,
"loss": 0.7143,
"step": 3000
},
{
"epoch": 1.72,
"grad_norm": 3.78216814994812,
"learning_rate": 3.089080459770115e-05,
"loss": 0.7101,
"step": 3500
},
{
"epoch": 1.97,
"grad_norm": 0.34924811124801636,
"learning_rate": 3.0839490968801314e-05,
"loss": 0.6937,
"step": 4000
},
{
"epoch": 2.22,
"grad_norm": 1.0352131128311157,
"learning_rate": 3.078817733990148e-05,
"loss": 0.6884,
"step": 4500
},
{
"epoch": 2.46,
"grad_norm": 1.3734943866729736,
"learning_rate": 3.0736863711001644e-05,
"loss": 0.682,
"step": 5000
},
{
"epoch": 2.71,
"grad_norm": 0.4328332841396332,
"learning_rate": 3.0685550082101805e-05,
"loss": 0.6766,
"step": 5500
},
{
"epoch": 2.96,
"grad_norm": 0.6840975284576416,
"learning_rate": 3.0634236453201974e-05,
"loss": 0.6673,
"step": 6000
},
{
"epoch": 3.2,
"grad_norm": 0.43340444564819336,
"learning_rate": 3.0582922824302135e-05,
"loss": 0.665,
"step": 6500
},
{
"epoch": 3.45,
"grad_norm": 0.46208104491233826,
"learning_rate": 3.05316091954023e-05,
"loss": 0.657,
"step": 7000
},
{
"epoch": 3.69,
"grad_norm": 0.4085042178630829,
"learning_rate": 3.0480295566502465e-05,
"loss": 0.6554,
"step": 7500
},
{
"epoch": 3.94,
"grad_norm": 0.9567029476165771,
"learning_rate": 3.042898193760263e-05,
"loss": 0.6493,
"step": 8000
},
{
"epoch": 4.19,
"grad_norm": 0.449690043926239,
"learning_rate": 3.0377668308702792e-05,
"loss": 0.6504,
"step": 8500
},
{
"epoch": 4.43,
"grad_norm": 0.5416390299797058,
"learning_rate": 3.0326354679802957e-05,
"loss": 0.641,
"step": 9000
},
{
"epoch": 4.68,
"grad_norm": 0.48183944821357727,
"learning_rate": 3.0275041050903122e-05,
"loss": 0.6322,
"step": 9500
},
{
"epoch": 4.93,
"grad_norm": 0.4853132963180542,
"learning_rate": 3.0223727422003287e-05,
"loss": 0.6303,
"step": 10000
},
{
"epoch": 5.17,
"grad_norm": 0.6569094657897949,
"learning_rate": 3.017241379310345e-05,
"loss": 0.6226,
"step": 10500
},
{
"epoch": 5.42,
"grad_norm": 0.4652053415775299,
"learning_rate": 3.0121100164203614e-05,
"loss": 0.6211,
"step": 11000
},
{
"epoch": 5.67,
"grad_norm": 0.49975645542144775,
"learning_rate": 3.006978653530378e-05,
"loss": 0.6187,
"step": 11500
},
{
"epoch": 5.91,
"grad_norm": 0.39061611890792847,
"learning_rate": 3.0018472906403944e-05,
"loss": 0.6123,
"step": 12000
},
{
"epoch": 6.16,
"grad_norm": 0.5346322655677795,
"learning_rate": 2.996715927750411e-05,
"loss": 0.6088,
"step": 12500
},
{
"epoch": 6.4,
"grad_norm": 0.4497845768928528,
"learning_rate": 2.991584564860427e-05,
"loss": 0.6068,
"step": 13000
},
{
"epoch": 6.65,
"grad_norm": 0.4072265923023224,
"learning_rate": 2.9864532019704435e-05,
"loss": 0.6011,
"step": 13500
},
{
"epoch": 6.9,
"grad_norm": 0.47786733508110046,
"learning_rate": 2.9813218390804597e-05,
"loss": 0.6031,
"step": 14000
},
{
"epoch": 7.14,
"grad_norm": 0.47765907645225525,
"learning_rate": 2.9761904761904762e-05,
"loss": 0.6015,
"step": 14500
},
{
"epoch": 7.39,
"grad_norm": 0.45993849635124207,
"learning_rate": 2.9710591133004923e-05,
"loss": 0.5956,
"step": 15000
},
{
"epoch": 7.64,
"grad_norm": 0.46012237668037415,
"learning_rate": 2.965927750410509e-05,
"loss": 0.5958,
"step": 15500
},
{
"epoch": 7.88,
"grad_norm": 0.4168663024902344,
"learning_rate": 2.9607963875205253e-05,
"loss": 0.5918,
"step": 16000
},
{
"epoch": 8.13,
"grad_norm": 0.452117383480072,
"learning_rate": 2.955665024630542e-05,
"loss": 0.5871,
"step": 16500
},
{
"epoch": 8.37,
"grad_norm": 0.43112337589263916,
"learning_rate": 2.9505336617405583e-05,
"loss": 0.5868,
"step": 17000
},
{
"epoch": 8.62,
"grad_norm": 0.44672533869743347,
"learning_rate": 2.9454022988505745e-05,
"loss": 0.5882,
"step": 17500
},
{
"epoch": 8.87,
"grad_norm": 0.7204737067222595,
"learning_rate": 2.940270935960591e-05,
"loss": 0.5877,
"step": 18000
},
{
"epoch": 9.11,
"grad_norm": 0.7274026274681091,
"learning_rate": 2.9351395730706075e-05,
"loss": 0.5867,
"step": 18500
},
{
"epoch": 9.36,
"grad_norm": 0.398488849401474,
"learning_rate": 2.930008210180624e-05,
"loss": 0.5789,
"step": 19000
},
{
"epoch": 9.61,
"grad_norm": 0.4212868809700012,
"learning_rate": 2.9248768472906405e-05,
"loss": 0.5795,
"step": 19500
},
{
"epoch": 9.85,
"grad_norm": 0.42726728320121765,
"learning_rate": 2.9197454844006567e-05,
"loss": 0.5817,
"step": 20000
},
{
"epoch": 10.1,
"grad_norm": 0.4954194724559784,
"learning_rate": 2.914614121510673e-05,
"loss": 0.5778,
"step": 20500
},
{
"epoch": 10.34,
"grad_norm": 0.46405500173568726,
"learning_rate": 2.9094827586206897e-05,
"loss": 0.5772,
"step": 21000
},
{
"epoch": 10.59,
"grad_norm": 3.064875602722168,
"learning_rate": 2.904351395730706e-05,
"loss": 0.5739,
"step": 21500
},
{
"epoch": 10.84,
"grad_norm": 0.41212061047554016,
"learning_rate": 2.8992200328407223e-05,
"loss": 0.5755,
"step": 22000
},
{
"epoch": 11.08,
"grad_norm": 0.5741709470748901,
"learning_rate": 2.8940886699507388e-05,
"loss": 0.5726,
"step": 22500
},
{
"epoch": 11.33,
"grad_norm": 0.4510386288166046,
"learning_rate": 2.8889573070607553e-05,
"loss": 0.5737,
"step": 23000
},
{
"epoch": 11.58,
"grad_norm": 0.44956105947494507,
"learning_rate": 2.8838259441707718e-05,
"loss": 0.5733,
"step": 23500
},
{
"epoch": 11.82,
"grad_norm": 0.4094774127006531,
"learning_rate": 2.8786945812807883e-05,
"loss": 0.566,
"step": 24000
},
{
"epoch": 12.07,
"grad_norm": 0.3915603756904602,
"learning_rate": 2.8735632183908045e-05,
"loss": 0.5621,
"step": 24500
},
{
"epoch": 12.32,
"grad_norm": 0.42128342390060425,
"learning_rate": 2.868431855500821e-05,
"loss": 0.5647,
"step": 25000
},
{
"epoch": 12.56,
"grad_norm": 0.4155956506729126,
"learning_rate": 2.8633004926108375e-05,
"loss": 0.5654,
"step": 25500
},
{
"epoch": 12.81,
"grad_norm": 0.43520358204841614,
"learning_rate": 2.858169129720854e-05,
"loss": 0.5672,
"step": 26000
},
{
"epoch": 13.05,
"grad_norm": 0.42418190836906433,
"learning_rate": 2.8530377668308705e-05,
"loss": 0.5639,
"step": 26500
},
{
"epoch": 13.3,
"grad_norm": 0.4998001456260681,
"learning_rate": 2.8479064039408867e-05,
"loss": 0.5619,
"step": 27000
},
{
"epoch": 13.55,
"grad_norm": 0.3863075375556946,
"learning_rate": 2.842775041050903e-05,
"loss": 0.5588,
"step": 27500
},
{
"epoch": 13.79,
"grad_norm": 0.42726796865463257,
"learning_rate": 2.8376436781609196e-05,
"loss": 0.5619,
"step": 28000
},
{
"epoch": 14.04,
"grad_norm": 0.5370152592658997,
"learning_rate": 2.832512315270936e-05,
"loss": 0.5599,
"step": 28500
},
{
"epoch": 14.29,
"grad_norm": 0.5220550894737244,
"learning_rate": 2.8273809523809523e-05,
"loss": 0.5614,
"step": 29000
},
{
"epoch": 14.53,
"grad_norm": 0.4528503119945526,
"learning_rate": 2.8222495894909688e-05,
"loss": 0.5544,
"step": 29500
},
{
"epoch": 14.78,
"grad_norm": 0.34627479314804077,
"learning_rate": 2.8171182266009853e-05,
"loss": 0.5557,
"step": 30000
},
{
"epoch": 15.02,
"grad_norm": 0.4139779210090637,
"learning_rate": 2.8119868637110018e-05,
"loss": 0.5583,
"step": 30500
},
{
"epoch": 15.27,
"grad_norm": 0.529438316822052,
"learning_rate": 2.8068555008210183e-05,
"loss": 0.5511,
"step": 31000
},
{
"epoch": 15.52,
"grad_norm": 0.40643787384033203,
"learning_rate": 2.8017241379310345e-05,
"loss": 0.5533,
"step": 31500
},
{
"epoch": 15.76,
"grad_norm": 0.4177665114402771,
"learning_rate": 2.796592775041051e-05,
"loss": 0.5554,
"step": 32000
},
{
"epoch": 16.01,
"grad_norm": 0.3691587448120117,
"learning_rate": 2.7914614121510675e-05,
"loss": 0.551,
"step": 32500
},
{
"epoch": 16.26,
"grad_norm": 0.4278986155986786,
"learning_rate": 2.786330049261084e-05,
"loss": 0.5495,
"step": 33000
},
{
"epoch": 16.5,
"grad_norm": 0.36237508058547974,
"learning_rate": 2.7811986863711e-05,
"loss": 0.5516,
"step": 33500
},
{
"epoch": 16.75,
"grad_norm": 0.4242589771747589,
"learning_rate": 2.7760673234811166e-05,
"loss": 0.5499,
"step": 34000
},
{
"epoch": 17.0,
"grad_norm": 0.35249194502830505,
"learning_rate": 2.770935960591133e-05,
"loss": 0.5511,
"step": 34500
},
{
"epoch": 17.24,
"grad_norm": 0.42292988300323486,
"learning_rate": 2.7658045977011496e-05,
"loss": 0.5465,
"step": 35000
},
{
"epoch": 17.49,
"grad_norm": 0.49178698658943176,
"learning_rate": 2.760673234811166e-05,
"loss": 0.5463,
"step": 35500
},
{
"epoch": 17.73,
"grad_norm": 0.6154900789260864,
"learning_rate": 2.7555418719211823e-05,
"loss": 0.5459,
"step": 36000
},
{
"epoch": 17.98,
"grad_norm": 0.43449530005455017,
"learning_rate": 2.7504105090311988e-05,
"loss": 0.5478,
"step": 36500
},
{
"epoch": 18.23,
"grad_norm": 0.3909509479999542,
"learning_rate": 2.7452791461412153e-05,
"loss": 0.545,
"step": 37000
},
{
"epoch": 18.47,
"grad_norm": 0.3956120014190674,
"learning_rate": 2.7401477832512318e-05,
"loss": 0.5442,
"step": 37500
},
{
"epoch": 18.72,
"grad_norm": 0.369386225938797,
"learning_rate": 2.7350164203612483e-05,
"loss": 0.5496,
"step": 38000
},
{
"epoch": 18.97,
"grad_norm": 0.40281441807746887,
"learning_rate": 2.7298850574712645e-05,
"loss": 0.5415,
"step": 38500
},
{
"epoch": 19.21,
"grad_norm": 0.4241119623184204,
"learning_rate": 2.724753694581281e-05,
"loss": 0.5453,
"step": 39000
},
{
"epoch": 19.46,
"grad_norm": 0.35802990198135376,
"learning_rate": 2.7196223316912975e-05,
"loss": 0.5465,
"step": 39500
},
{
"epoch": 19.7,
"grad_norm": 0.3796702027320862,
"learning_rate": 2.714490968801314e-05,
"loss": 0.5407,
"step": 40000
},
{
"epoch": 19.95,
"grad_norm": 5.289798736572266,
"learning_rate": 2.70935960591133e-05,
"loss": 0.54,
"step": 40500
},
{
"epoch": 20.2,
"grad_norm": 0.43847599625587463,
"learning_rate": 2.7042282430213466e-05,
"loss": 0.5437,
"step": 41000
},
{
"epoch": 20.44,
"grad_norm": 0.3674856126308441,
"learning_rate": 2.699096880131363e-05,
"loss": 0.5394,
"step": 41500
},
{
"epoch": 20.69,
"grad_norm": 0.47424060106277466,
"learning_rate": 2.6939655172413793e-05,
"loss": 0.5412,
"step": 42000
},
{
"epoch": 20.94,
"grad_norm": 0.40056848526000977,
"learning_rate": 2.6888341543513958e-05,
"loss": 0.5404,
"step": 42500
},
{
"epoch": 21.18,
"grad_norm": 0.4091791808605194,
"learning_rate": 2.683702791461412e-05,
"loss": 0.5402,
"step": 43000
},
{
"epoch": 21.43,
"grad_norm": 0.4481904208660126,
"learning_rate": 2.6785714285714284e-05,
"loss": 0.5358,
"step": 43500
},
{
"epoch": 21.67,
"grad_norm": 0.44363269209861755,
"learning_rate": 2.673440065681445e-05,
"loss": 0.5378,
"step": 44000
},
{
"epoch": 21.92,
"grad_norm": 0.42436665296554565,
"learning_rate": 2.6683087027914614e-05,
"loss": 0.541,
"step": 44500
},
{
"epoch": 22.17,
"grad_norm": 0.4019184708595276,
"learning_rate": 2.6631773399014776e-05,
"loss": 0.5396,
"step": 45000
},
{
"epoch": 22.41,
"grad_norm": 0.4707052409648895,
"learning_rate": 2.658045977011494e-05,
"loss": 0.5369,
"step": 45500
},
{
"epoch": 22.66,
"grad_norm": 0.4337681233882904,
"learning_rate": 2.6529146141215106e-05,
"loss": 0.5383,
"step": 46000
},
{
"epoch": 22.91,
"grad_norm": 0.5081506967544556,
"learning_rate": 2.647783251231527e-05,
"loss": 0.5356,
"step": 46500
},
{
"epoch": 23.15,
"grad_norm": 0.4122790992259979,
"learning_rate": 2.6426518883415436e-05,
"loss": 0.5386,
"step": 47000
},
{
"epoch": 23.4,
"grad_norm": 0.3679068386554718,
"learning_rate": 2.6375205254515598e-05,
"loss": 0.5337,
"step": 47500
},
{
"epoch": 23.65,
"grad_norm": 0.4097256660461426,
"learning_rate": 2.6323891625615763e-05,
"loss": 0.5324,
"step": 48000
},
{
"epoch": 23.89,
"grad_norm": 0.47837790846824646,
"learning_rate": 2.6272577996715928e-05,
"loss": 0.5374,
"step": 48500
},
{
"epoch": 24.14,
"grad_norm": 0.33660951256752014,
"learning_rate": 2.6221264367816093e-05,
"loss": 0.5367,
"step": 49000
},
{
"epoch": 24.38,
"grad_norm": 0.40010085701942444,
"learning_rate": 2.6169950738916258e-05,
"loss": 0.5308,
"step": 49500
},
{
"epoch": 24.63,
"grad_norm": 0.4388476014137268,
"learning_rate": 2.611863711001642e-05,
"loss": 0.5323,
"step": 50000
},
{
"epoch": 24.88,
"grad_norm": 0.6001474857330322,
"learning_rate": 2.6067323481116584e-05,
"loss": 0.5373,
"step": 50500
},
{
"epoch": 25.12,
"grad_norm": 0.41148585081100464,
"learning_rate": 2.601600985221675e-05,
"loss": 0.5296,
"step": 51000
},
{
"epoch": 25.37,
"grad_norm": 0.36909425258636475,
"learning_rate": 2.5964696223316914e-05,
"loss": 0.5353,
"step": 51500
},
{
"epoch": 25.62,
"grad_norm": 0.4281887710094452,
"learning_rate": 2.5913382594417076e-05,
"loss": 0.528,
"step": 52000
},
{
"epoch": 25.86,
"grad_norm": 0.37007570266723633,
"learning_rate": 2.586206896551724e-05,
"loss": 0.5332,
"step": 52500
},
{
"epoch": 26.11,
"grad_norm": 0.44517025351524353,
"learning_rate": 2.5810755336617406e-05,
"loss": 0.5291,
"step": 53000
},
{
"epoch": 26.35,
"grad_norm": 0.3227912187576294,
"learning_rate": 2.575944170771757e-05,
"loss": 0.5298,
"step": 53500
},
{
"epoch": 26.6,
"grad_norm": 0.5226913690567017,
"learning_rate": 2.5708128078817736e-05,
"loss": 0.5269,
"step": 54000
},
{
"epoch": 26.85,
"grad_norm": 0.4505539536476135,
"learning_rate": 2.5656814449917897e-05,
"loss": 0.5321,
"step": 54500
},
{
"epoch": 27.09,
"grad_norm": 0.33736366033554077,
"learning_rate": 2.5605500821018062e-05,
"loss": 0.5299,
"step": 55000
},
{
"epoch": 27.34,
"grad_norm": 0.5179808735847473,
"learning_rate": 2.5554187192118227e-05,
"loss": 0.5301,
"step": 55500
},
{
"epoch": 27.59,
"grad_norm": 0.46012309193611145,
"learning_rate": 2.5502873563218392e-05,
"loss": 0.5259,
"step": 56000
},
{
"epoch": 27.83,
"grad_norm": 0.3472147583961487,
"learning_rate": 2.5451559934318557e-05,
"loss": 0.5279,
"step": 56500
},
{
"epoch": 28.08,
"grad_norm": 0.4225232005119324,
"learning_rate": 2.540024630541872e-05,
"loss": 0.5268,
"step": 57000
},
{
"epoch": 28.33,
"grad_norm": 0.3676544725894928,
"learning_rate": 2.5348932676518884e-05,
"loss": 0.5285,
"step": 57500
},
{
"epoch": 28.57,
"grad_norm": 0.4334140717983246,
"learning_rate": 2.529761904761905e-05,
"loss": 0.5306,
"step": 58000
},
{
"epoch": 28.82,
"grad_norm": 0.3348256051540375,
"learning_rate": 2.5246305418719214e-05,
"loss": 0.5256,
"step": 58500
},
{
"epoch": 29.06,
"grad_norm": 0.3783932030200958,
"learning_rate": 2.5194991789819376e-05,
"loss": 0.528,
"step": 59000
},
{
"epoch": 29.31,
"grad_norm": 0.3242786228656769,
"learning_rate": 2.514367816091954e-05,
"loss": 0.5261,
"step": 59500
},
{
"epoch": 29.56,
"grad_norm": 0.36754658818244934,
"learning_rate": 2.5092364532019706e-05,
"loss": 0.5242,
"step": 60000
},
{
"epoch": 29.8,
"grad_norm": 0.3502654433250427,
"learning_rate": 2.504105090311987e-05,
"loss": 0.5284,
"step": 60500
},
{
"epoch": 30.05,
"grad_norm": 0.3878660798072815,
"learning_rate": 2.4989737274220036e-05,
"loss": 0.5235,
"step": 61000
},
{
"epoch": 30.3,
"grad_norm": 0.37782377004623413,
"learning_rate": 2.4938423645320197e-05,
"loss": 0.5232,
"step": 61500
},
{
"epoch": 30.54,
"grad_norm": 0.49442175030708313,
"learning_rate": 2.4887110016420362e-05,
"loss": 0.5259,
"step": 62000
},
{
"epoch": 30.79,
"grad_norm": 0.9456015229225159,
"learning_rate": 2.4835796387520527e-05,
"loss": 0.5261,
"step": 62500
},
{
"epoch": 31.03,
"grad_norm": 0.35271939635276794,
"learning_rate": 2.4784482758620692e-05,
"loss": 0.5227,
"step": 63000
},
{
"epoch": 31.28,
"grad_norm": 0.3738420903682709,
"learning_rate": 2.4733169129720854e-05,
"loss": 0.5234,
"step": 63500
},
{
"epoch": 31.53,
"grad_norm": 0.4006379544734955,
"learning_rate": 2.468185550082102e-05,
"loss": 0.5226,
"step": 64000
},
{
"epoch": 31.77,
"grad_norm": 0.3207830786705017,
"learning_rate": 2.4630541871921184e-05,
"loss": 0.5237,
"step": 64500
},
{
"epoch": 32.02,
"grad_norm": 0.4237392246723175,
"learning_rate": 2.457922824302135e-05,
"loss": 0.5209,
"step": 65000
},
{
"epoch": 32.27,
"grad_norm": 0.328134685754776,
"learning_rate": 2.4527914614121514e-05,
"loss": 0.5278,
"step": 65500
},
{
"epoch": 32.51,
"grad_norm": 0.4005485773086548,
"learning_rate": 2.4476600985221675e-05,
"loss": 0.5221,
"step": 66000
},
{
"epoch": 32.76,
"grad_norm": 0.4421192705631256,
"learning_rate": 2.442528735632184e-05,
"loss": 0.5199,
"step": 66500
},
{
"epoch": 33.0,
"grad_norm": 0.38501447439193726,
"learning_rate": 2.4373973727422005e-05,
"loss": 0.5208,
"step": 67000
},
{
"epoch": 33.25,
"grad_norm": 0.38499945402145386,
"learning_rate": 2.432266009852217e-05,
"loss": 0.521,
"step": 67500
},
{
"epoch": 33.5,
"grad_norm": 0.31156325340270996,
"learning_rate": 2.4271346469622335e-05,
"loss": 0.5162,
"step": 68000
},
{
"epoch": 33.74,
"grad_norm": 0.4041023552417755,
"learning_rate": 2.4220032840722497e-05,
"loss": 0.5199,
"step": 68500
},
{
"epoch": 33.99,
"grad_norm": 0.4398662745952606,
"learning_rate": 2.4168719211822662e-05,
"loss": 0.5203,
"step": 69000
},
{
"epoch": 34.24,
"grad_norm": 0.4758293330669403,
"learning_rate": 2.4117405582922824e-05,
"loss": 0.5236,
"step": 69500
},
{
"epoch": 34.48,
"grad_norm": 0.4214634597301483,
"learning_rate": 2.406609195402299e-05,
"loss": 0.5187,
"step": 70000
},
{
"epoch": 34.73,
"grad_norm": 0.41065362095832825,
"learning_rate": 2.401477832512315e-05,
"loss": 0.5171,
"step": 70500
},
{
"epoch": 34.98,
"grad_norm": 0.3957192599773407,
"learning_rate": 2.3963464696223315e-05,
"loss": 0.5203,
"step": 71000
},
{
"epoch": 35.22,
"grad_norm": 0.3957277536392212,
"learning_rate": 2.391215106732348e-05,
"loss": 0.5205,
"step": 71500
},
{
"epoch": 35.47,
"grad_norm": 0.362846702337265,
"learning_rate": 2.3860837438423645e-05,
"loss": 0.5206,
"step": 72000
},
{
"epoch": 35.71,
"grad_norm": 0.3435048758983612,
"learning_rate": 2.380952380952381e-05,
"loss": 0.5179,
"step": 72500
},
{
"epoch": 35.96,
"grad_norm": 0.4027571976184845,
"learning_rate": 2.3758210180623972e-05,
"loss": 0.5179,
"step": 73000
},
{
"epoch": 36.21,
"grad_norm": 0.40012747049331665,
"learning_rate": 2.3706896551724137e-05,
"loss": 0.5198,
"step": 73500
},
{
"epoch": 36.45,
"grad_norm": 0.36184847354888916,
"learning_rate": 2.3655582922824302e-05,
"loss": 0.5171,
"step": 74000
},
{
"epoch": 36.7,
"grad_norm": 0.4480341672897339,
"learning_rate": 2.3604269293924467e-05,
"loss": 0.5172,
"step": 74500
},
{
"epoch": 36.95,
"grad_norm": 0.5025932788848877,
"learning_rate": 2.355295566502463e-05,
"loss": 0.5176,
"step": 75000
},
{
"epoch": 37.19,
"grad_norm": 0.4158807694911957,
"learning_rate": 2.3501642036124793e-05,
"loss": 0.5144,
"step": 75500
},
{
"epoch": 37.44,
"grad_norm": 0.3783656060695648,
"learning_rate": 2.345032840722496e-05,
"loss": 0.5146,
"step": 76000
},
{
"epoch": 37.68,
"grad_norm": 0.37473171949386597,
"learning_rate": 2.3399014778325123e-05,
"loss": 0.5189,
"step": 76500
},
{
"epoch": 37.93,
"grad_norm": 0.40515512228012085,
"learning_rate": 2.334770114942529e-05,
"loss": 0.5155,
"step": 77000
},
{
"epoch": 38.18,
"grad_norm": 0.39813145995140076,
"learning_rate": 2.329638752052545e-05,
"loss": 0.5165,
"step": 77500
},
{
"epoch": 38.42,
"grad_norm": 0.4116913676261902,
"learning_rate": 2.3245073891625615e-05,
"loss": 0.5144,
"step": 78000
},
{
"epoch": 38.67,
"grad_norm": 0.37143009901046753,
"learning_rate": 2.319376026272578e-05,
"loss": 0.5167,
"step": 78500
},
{
"epoch": 38.92,
"grad_norm": 0.38151097297668457,
"learning_rate": 2.3142446633825945e-05,
"loss": 0.5137,
"step": 79000
},
{
"epoch": 39.16,
"grad_norm": 0.4646279215812683,
"learning_rate": 2.309113300492611e-05,
"loss": 0.5147,
"step": 79500
},
{
"epoch": 39.41,
"grad_norm": 0.3832203447818756,
"learning_rate": 2.303981937602627e-05,
"loss": 0.5151,
"step": 80000
},
{
"epoch": 39.66,
"grad_norm": 0.4361591637134552,
"learning_rate": 2.2988505747126437e-05,
"loss": 0.5141,
"step": 80500
},
{
"epoch": 39.9,
"grad_norm": 0.3762683868408203,
"learning_rate": 2.29371921182266e-05,
"loss": 0.5142,
"step": 81000
},
{
"epoch": 40.15,
"grad_norm": 0.3509220480918884,
"learning_rate": 2.2885878489326767e-05,
"loss": 0.5143,
"step": 81500
},
{
"epoch": 40.39,
"grad_norm": 0.3234823942184448,
"learning_rate": 2.2834564860426928e-05,
"loss": 0.5111,
"step": 82000
},
{
"epoch": 40.64,
"grad_norm": 0.3884125053882599,
"learning_rate": 2.2783251231527093e-05,
"loss": 0.5093,
"step": 82500
},
{
"epoch": 40.89,
"grad_norm": 0.45307618379592896,
"learning_rate": 2.2731937602627258e-05,
"loss": 0.5157,
"step": 83000
},
{
"epoch": 41.13,
"grad_norm": 0.4034999907016754,
"learning_rate": 2.2680623973727423e-05,
"loss": 0.5164,
"step": 83500
},
{
"epoch": 41.38,
"grad_norm": 0.35115841031074524,
"learning_rate": 2.2629310344827588e-05,
"loss": 0.5151,
"step": 84000
},
{
"epoch": 41.63,
"grad_norm": 0.3815375864505768,
"learning_rate": 2.257799671592775e-05,
"loss": 0.512,
"step": 84500
},
{
"epoch": 41.87,
"grad_norm": 0.4422209560871124,
"learning_rate": 2.2526683087027915e-05,
"loss": 0.5088,
"step": 85000
},
{
"epoch": 42.12,
"grad_norm": 0.35160163044929504,
"learning_rate": 2.247536945812808e-05,
"loss": 0.5123,
"step": 85500
},
{
"epoch": 42.36,
"grad_norm": 0.4204586446285248,
"learning_rate": 2.2424055829228245e-05,
"loss": 0.5104,
"step": 86000
},
{
"epoch": 42.61,
"grad_norm": 0.2954113185405731,
"learning_rate": 2.237274220032841e-05,
"loss": 0.5116,
"step": 86500
},
{
"epoch": 42.86,
"grad_norm": 0.40945374965667725,
"learning_rate": 2.232142857142857e-05,
"loss": 0.5152,
"step": 87000
},
{
"epoch": 43.1,
"grad_norm": 0.3709004819393158,
"learning_rate": 2.2270114942528736e-05,
"loss": 0.5157,
"step": 87500
},
{
"epoch": 43.35,
"grad_norm": 0.4811795651912689,
"learning_rate": 2.22188013136289e-05,
"loss": 0.5118,
"step": 88000
},
{
"epoch": 43.6,
"grad_norm": 0.36660343408584595,
"learning_rate": 2.2167487684729066e-05,
"loss": 0.5099,
"step": 88500
},
{
"epoch": 43.84,
"grad_norm": 0.4243915379047394,
"learning_rate": 2.2116174055829228e-05,
"loss": 0.512,
"step": 89000
},
{
"epoch": 44.09,
"grad_norm": 0.34263724088668823,
"learning_rate": 2.2064860426929393e-05,
"loss": 0.5116,
"step": 89500
},
{
"epoch": 44.33,
"grad_norm": 0.374990314245224,
"learning_rate": 2.2013546798029558e-05,
"loss": 0.5092,
"step": 90000
},
{
"epoch": 44.58,
"grad_norm": 0.33906516432762146,
"learning_rate": 2.1962233169129723e-05,
"loss": 0.5092,
"step": 90500
},
{
"epoch": 44.83,
"grad_norm": 0.3953360915184021,
"learning_rate": 2.1910919540229888e-05,
"loss": 0.509,
"step": 91000
},
{
"epoch": 45.07,
"grad_norm": 0.34916388988494873,
"learning_rate": 2.185960591133005e-05,
"loss": 0.5095,
"step": 91500
},
{
"epoch": 45.32,
"grad_norm": 0.3636263310909271,
"learning_rate": 2.1808292282430215e-05,
"loss": 0.5091,
"step": 92000
},
{
"epoch": 45.57,
"grad_norm": 0.38837483525276184,
"learning_rate": 2.175697865353038e-05,
"loss": 0.5077,
"step": 92500
},
{
"epoch": 45.81,
"grad_norm": 0.3651277422904968,
"learning_rate": 2.1705665024630545e-05,
"loss": 0.5108,
"step": 93000
},
{
"epoch": 46.06,
"grad_norm": 0.5110191702842712,
"learning_rate": 2.1654351395730706e-05,
"loss": 0.5093,
"step": 93500
},
{
"epoch": 46.31,
"grad_norm": 0.36329400539398193,
"learning_rate": 2.160303776683087e-05,
"loss": 0.5093,
"step": 94000
},
{
"epoch": 46.55,
"grad_norm": 0.42045408487319946,
"learning_rate": 2.1551724137931036e-05,
"loss": 0.5073,
"step": 94500
},
{
"epoch": 46.8,
"grad_norm": 0.5139952301979065,
"learning_rate": 2.15004105090312e-05,
"loss": 0.5053,
"step": 95000
},
{
"epoch": 47.04,
"grad_norm": 0.4146077632904053,
"learning_rate": 2.1449096880131366e-05,
"loss": 0.5073,
"step": 95500
},
{
"epoch": 47.29,
"grad_norm": 0.5339871048927307,
"learning_rate": 2.1397783251231528e-05,
"loss": 0.5083,
"step": 96000
},
{
"epoch": 47.54,
"grad_norm": 0.4209000766277313,
"learning_rate": 2.1346469622331693e-05,
"loss": 0.5098,
"step": 96500
},
{
"epoch": 47.78,
"grad_norm": 0.3803653419017792,
"learning_rate": 2.1295155993431854e-05,
"loss": 0.5083,
"step": 97000
},
{
"epoch": 48.03,
"grad_norm": 0.35340991616249084,
"learning_rate": 2.124384236453202e-05,
"loss": 0.5059,
"step": 97500
},
{
"epoch": 48.28,
"grad_norm": 0.3598570227622986,
"learning_rate": 2.1192528735632184e-05,
"loss": 0.5026,
"step": 98000
},
{
"epoch": 48.52,
"grad_norm": 0.3424580991268158,
"learning_rate": 2.1141215106732346e-05,
"loss": 0.5092,
"step": 98500
},
{
"epoch": 48.77,
"grad_norm": 0.37450504302978516,
"learning_rate": 2.108990147783251e-05,
"loss": 0.5052,
"step": 99000
},
{
"epoch": 49.01,
"grad_norm": 0.38776275515556335,
"learning_rate": 2.1038587848932676e-05,
"loss": 0.5059,
"step": 99500
},
{
"epoch": 49.26,
"grad_norm": 0.4117530882358551,
"learning_rate": 2.098727422003284e-05,
"loss": 0.5059,
"step": 100000
},
{
"epoch": 49.51,
"grad_norm": 0.43969395756721497,
"learning_rate": 2.0935960591133003e-05,
"loss": 0.5061,
"step": 100500
},
{
"epoch": 49.75,
"grad_norm": 0.39242610335350037,
"learning_rate": 2.0884646962233168e-05,
"loss": 0.5056,
"step": 101000
},
{
"epoch": 50.0,
"grad_norm": 0.6274676322937012,
"learning_rate": 2.0833333333333333e-05,
"loss": 0.5063,
"step": 101500
},
{
"epoch": 50.25,
"grad_norm": 0.5223700404167175,
"learning_rate": 2.0782019704433498e-05,
"loss": 0.5029,
"step": 102000
},
{
"epoch": 50.49,
"grad_norm": 0.36774346232414246,
"learning_rate": 2.0730706075533663e-05,
"loss": 0.5048,
"step": 102500
},
{
"epoch": 50.74,
"grad_norm": 0.3601688742637634,
"learning_rate": 2.0679392446633824e-05,
"loss": 0.5054,
"step": 103000
},
{
"epoch": 50.99,
"grad_norm": 0.35829395055770874,
"learning_rate": 2.062807881773399e-05,
"loss": 0.5077,
"step": 103500
},
{
"epoch": 51.23,
"grad_norm": 0.394145667552948,
"learning_rate": 2.0576765188834154e-05,
"loss": 0.5071,
"step": 104000
},
{
"epoch": 51.48,
"grad_norm": 0.39394277334213257,
"learning_rate": 2.052545155993432e-05,
"loss": 0.503,
"step": 104500
},
{
"epoch": 51.72,
"grad_norm": 0.42796215415000916,
"learning_rate": 2.047413793103448e-05,
"loss": 0.5015,
"step": 105000
},
{
"epoch": 51.97,
"grad_norm": 0.342929869890213,
"learning_rate": 2.0422824302134646e-05,
"loss": 0.5041,
"step": 105500
},
{
"epoch": 52.22,
"grad_norm": 0.34425219893455505,
"learning_rate": 2.037151067323481e-05,
"loss": 0.5048,
"step": 106000
},
{
"epoch": 52.46,
"grad_norm": 0.40054482221603394,
"learning_rate": 2.0320197044334976e-05,
"loss": 0.505,
"step": 106500
},
{
"epoch": 52.71,
"grad_norm": 0.41854235529899597,
"learning_rate": 2.026888341543514e-05,
"loss": 0.5028,
"step": 107000
},
{
"epoch": 52.96,
"grad_norm": 0.39581912755966187,
"learning_rate": 2.0217569786535302e-05,
"loss": 0.5019,
"step": 107500
},
{
"epoch": 53.2,
"grad_norm": 0.3793180286884308,
"learning_rate": 2.0166256157635467e-05,
"loss": 0.5025,
"step": 108000
},
{
"epoch": 53.45,
"grad_norm": 0.42270803451538086,
"learning_rate": 2.0114942528735632e-05,
"loss": 0.505,
"step": 108500
},
{
"epoch": 53.69,
"grad_norm": 0.4206148684024811,
"learning_rate": 2.0063628899835797e-05,
"loss": 0.5032,
"step": 109000
},
{
"epoch": 53.94,
"grad_norm": 0.535724401473999,
"learning_rate": 2.0012315270935962e-05,
"loss": 0.5041,
"step": 109500
},
{
"epoch": 54.19,
"grad_norm": 0.4594982862472534,
"learning_rate": 1.9961001642036124e-05,
"loss": 0.5011,
"step": 110000
},
{
"epoch": 54.43,
"grad_norm": 0.38085901737213135,
"learning_rate": 1.990968801313629e-05,
"loss": 0.5043,
"step": 110500
},
{
"epoch": 54.68,
"grad_norm": 0.4274803102016449,
"learning_rate": 1.9858374384236454e-05,
"loss": 0.5065,
"step": 111000
},
{
"epoch": 54.93,
"grad_norm": 0.4163910150527954,
"learning_rate": 1.980706075533662e-05,
"loss": 0.5,
"step": 111500
},
{
"epoch": 55.17,
"grad_norm": 0.42667436599731445,
"learning_rate": 1.975574712643678e-05,
"loss": 0.5018,
"step": 112000
},
{
"epoch": 55.42,
"grad_norm": 0.3834710717201233,
"learning_rate": 1.9704433497536946e-05,
"loss": 0.4994,
"step": 112500
},
{
"epoch": 55.67,
"grad_norm": 0.39337608218193054,
"learning_rate": 1.965311986863711e-05,
"loss": 0.5032,
"step": 113000
},
{
"epoch": 55.91,
"grad_norm": 0.48434558510780334,
"learning_rate": 1.9601806239737276e-05,
"loss": 0.5046,
"step": 113500
},
{
"epoch": 56.16,
"grad_norm": 0.4098414480686188,
"learning_rate": 1.955049261083744e-05,
"loss": 0.5032,
"step": 114000
},
{
"epoch": 56.4,
"grad_norm": 0.4482068419456482,
"learning_rate": 1.9499178981937602e-05,
"loss": 0.501,
"step": 114500
},
{
"epoch": 56.65,
"grad_norm": 0.4341977834701538,
"learning_rate": 1.9447865353037767e-05,
"loss": 0.5021,
"step": 115000
},
{
"epoch": 56.9,
"grad_norm": 0.40422672033309937,
"learning_rate": 1.9396551724137932e-05,
"loss": 0.5003,
"step": 115500
},
{
"epoch": 57.14,
"grad_norm": 0.35205212235450745,
"learning_rate": 1.9345238095238097e-05,
"loss": 0.4969,
"step": 116000
},
{
"epoch": 57.39,
"grad_norm": 0.4301392734050751,
"learning_rate": 1.9293924466338262e-05,
"loss": 0.4977,
"step": 116500
},
{
"epoch": 57.64,
"grad_norm": 0.3865519165992737,
"learning_rate": 1.9242610837438424e-05,
"loss": 0.5031,
"step": 117000
},
{
"epoch": 57.88,
"grad_norm": 0.4432295262813568,
"learning_rate": 1.919129720853859e-05,
"loss": 0.5036,
"step": 117500
},
{
"epoch": 58.13,
"grad_norm": 0.44078487157821655,
"learning_rate": 1.9139983579638754e-05,
"loss": 0.4985,
"step": 118000
},
{
"epoch": 58.37,
"grad_norm": 0.5300925374031067,
"learning_rate": 1.908866995073892e-05,
"loss": 0.5014,
"step": 118500
},
{
"epoch": 58.62,
"grad_norm": 0.39328643679618835,
"learning_rate": 1.903735632183908e-05,
"loss": 0.4988,
"step": 119000
},
{
"epoch": 58.87,
"grad_norm": 0.3447715938091278,
"learning_rate": 1.8986042692939245e-05,
"loss": 0.5013,
"step": 119500
},
{
"epoch": 59.11,
"grad_norm": 0.4861631989479065,
"learning_rate": 1.893472906403941e-05,
"loss": 0.4991,
"step": 120000
},
{
"epoch": 59.36,
"grad_norm": 0.3493654131889343,
"learning_rate": 1.8883415435139575e-05,
"loss": 0.499,
"step": 120500
},
{
"epoch": 59.61,
"grad_norm": 0.4206608235836029,
"learning_rate": 1.883210180623974e-05,
"loss": 0.499,
"step": 121000
},
{
"epoch": 59.85,
"grad_norm": 0.3103729486465454,
"learning_rate": 1.8780788177339902e-05,
"loss": 0.4998,
"step": 121500
},
{
"epoch": 60.1,
"grad_norm": 0.3733006417751312,
"learning_rate": 1.8729474548440067e-05,
"loss": 0.4981,
"step": 122000
},
{
"epoch": 60.34,
"grad_norm": 0.459492564201355,
"learning_rate": 1.8678160919540232e-05,
"loss": 0.499,
"step": 122500
},
{
"epoch": 60.59,
"grad_norm": 0.35242483019828796,
"learning_rate": 1.8626847290640397e-05,
"loss": 0.4971,
"step": 123000
},
{
"epoch": 60.84,
"grad_norm": 0.3865768611431122,
"learning_rate": 1.857553366174056e-05,
"loss": 0.4961,
"step": 123500
},
{
"epoch": 61.08,
"grad_norm": 0.3595926761627197,
"learning_rate": 1.8524220032840724e-05,
"loss": 0.4987,
"step": 124000
},
{
"epoch": 61.33,
"grad_norm": 0.36663565039634705,
"learning_rate": 1.847290640394089e-05,
"loss": 0.5008,
"step": 124500
},
{
"epoch": 61.58,
"grad_norm": 0.33786624670028687,
"learning_rate": 1.842159277504105e-05,
"loss": 0.4975,
"step": 125000
},
{
"epoch": 61.82,
"grad_norm": 0.38265419006347656,
"learning_rate": 1.8370279146141215e-05,
"loss": 0.4958,
"step": 125500
},
{
"epoch": 62.07,
"grad_norm": 0.353635311126709,
"learning_rate": 1.8318965517241377e-05,
"loss": 0.4981,
"step": 126000
},
{
"epoch": 62.32,
"grad_norm": 0.35221561789512634,
"learning_rate": 1.8267651888341542e-05,
"loss": 0.4964,
"step": 126500
},
{
"epoch": 62.56,
"grad_norm": 0.47486743330955505,
"learning_rate": 1.8216338259441707e-05,
"loss": 0.5008,
"step": 127000
},
{
"epoch": 62.81,
"grad_norm": 0.40547123551368713,
"learning_rate": 1.8165024630541872e-05,
"loss": 0.4943,
"step": 127500
},
{
"epoch": 63.05,
"grad_norm": 0.3869677484035492,
"learning_rate": 1.8113711001642037e-05,
"loss": 0.4975,
"step": 128000
},
{
"epoch": 63.3,
"grad_norm": 0.487684428691864,
"learning_rate": 1.80623973727422e-05,
"loss": 0.4964,
"step": 128500
},
{
"epoch": 63.55,
"grad_norm": 0.4795319139957428,
"learning_rate": 1.8011083743842363e-05,
"loss": 0.4972,
"step": 129000
},
{
"epoch": 63.79,
"grad_norm": 0.40214163064956665,
"learning_rate": 1.795977011494253e-05,
"loss": 0.4977,
"step": 129500
},
{
"epoch": 64.04,
"grad_norm": 0.41338789463043213,
"learning_rate": 1.7908456486042693e-05,
"loss": 0.4969,
"step": 130000
},
{
"epoch": 64.29,
"grad_norm": 0.46818971633911133,
"learning_rate": 1.7857142857142855e-05,
"loss": 0.4939,
"step": 130500
},
{
"epoch": 64.53,
"grad_norm": 0.5682018399238586,
"learning_rate": 1.780582922824302e-05,
"loss": 0.4955,
"step": 131000
},
{
"epoch": 64.78,
"grad_norm": 0.3598514497280121,
"learning_rate": 1.7754515599343185e-05,
"loss": 0.4978,
"step": 131500
},
{
"epoch": 65.02,
"grad_norm": 0.45837077498435974,
"learning_rate": 1.770320197044335e-05,
"loss": 0.4962,
"step": 132000
},
{
"epoch": 65.27,
"grad_norm": 0.40763700008392334,
"learning_rate": 1.7651888341543515e-05,
"loss": 0.4972,
"step": 132500
},
{
"epoch": 65.52,
"grad_norm": 0.41413307189941406,
"learning_rate": 1.7600574712643677e-05,
"loss": 0.4948,
"step": 133000
},
{
"epoch": 65.76,
"grad_norm": 1.8902608156204224,
"learning_rate": 1.7549261083743842e-05,
"loss": 0.4952,
"step": 133500
},
{
"epoch": 66.01,
"grad_norm": 0.4681415855884552,
"learning_rate": 1.7497947454844007e-05,
"loss": 0.4949,
"step": 134000
},
{
"epoch": 66.26,
"grad_norm": 0.41859620809555054,
"learning_rate": 1.744663382594417e-05,
"loss": 0.4927,
"step": 134500
},
{
"epoch": 66.5,
"grad_norm": 0.380066454410553,
"learning_rate": 1.7395320197044333e-05,
"loss": 0.4984,
"step": 135000
},
{
"epoch": 66.75,
"grad_norm": 0.4095420837402344,
"learning_rate": 1.7344006568144498e-05,
"loss": 0.4942,
"step": 135500
},
{
"epoch": 67.0,
"grad_norm": 0.3269331753253937,
"learning_rate": 1.7292692939244663e-05,
"loss": 0.4964,
"step": 136000
},
{
"epoch": 67.24,
"grad_norm": 0.4390511214733124,
"learning_rate": 1.7241379310344828e-05,
"loss": 0.4931,
"step": 136500
},
{
"epoch": 67.49,
"grad_norm": 0.37324196100234985,
"learning_rate": 1.7190065681444993e-05,
"loss": 0.4943,
"step": 137000
},
{
"epoch": 67.73,
"grad_norm": 0.37749549746513367,
"learning_rate": 1.7138752052545155e-05,
"loss": 0.4967,
"step": 137500
},
{
"epoch": 67.98,
"grad_norm": 0.42331600189208984,
"learning_rate": 1.708743842364532e-05,
"loss": 0.4908,
"step": 138000
},
{
"epoch": 68.23,
"grad_norm": 0.4857657551765442,
"learning_rate": 1.7036124794745485e-05,
"loss": 0.4933,
"step": 138500
},
{
"epoch": 68.47,
"grad_norm": 0.40747708082199097,
"learning_rate": 1.698481116584565e-05,
"loss": 0.4921,
"step": 139000
},
{
"epoch": 68.72,
"grad_norm": 0.5494533181190491,
"learning_rate": 1.6933497536945815e-05,
"loss": 0.4929,
"step": 139500
},
{
"epoch": 68.97,
"grad_norm": 0.44840362668037415,
"learning_rate": 1.6882183908045977e-05,
"loss": 0.4963,
"step": 140000
},
{
"epoch": 69.21,
"grad_norm": 0.42726320028305054,
"learning_rate": 1.683087027914614e-05,
"loss": 0.4945,
"step": 140500
},
{
"epoch": 69.46,
"grad_norm": 0.3714928925037384,
"learning_rate": 1.6779556650246307e-05,
"loss": 0.4946,
"step": 141000
},
{
"epoch": 69.7,
"grad_norm": 0.44776561856269836,
"learning_rate": 1.672824302134647e-05,
"loss": 0.4932,
"step": 141500
},
{
"epoch": 69.95,
"grad_norm": 0.36242425441741943,
"learning_rate": 1.6676929392446633e-05,
"loss": 0.4927,
"step": 142000
},
{
"epoch": 70.2,
"grad_norm": 0.40916842222213745,
"learning_rate": 1.6625615763546798e-05,
"loss": 0.4931,
"step": 142500
},
{
"epoch": 70.44,
"grad_norm": 0.3582199513912201,
"learning_rate": 1.6574302134646963e-05,
"loss": 0.4909,
"step": 143000
},
{
"epoch": 70.69,
"grad_norm": 0.39829275012016296,
"learning_rate": 1.6522988505747128e-05,
"loss": 0.49,
"step": 143500
},
{
"epoch": 70.94,
"grad_norm": 0.362525075674057,
"learning_rate": 1.6471674876847293e-05,
"loss": 0.4933,
"step": 144000
},
{
"epoch": 71.18,
"grad_norm": 0.425618976354599,
"learning_rate": 1.6420361247947455e-05,
"loss": 0.4968,
"step": 144500
},
{
"epoch": 71.43,
"grad_norm": 0.4285431504249573,
"learning_rate": 1.636904761904762e-05,
"loss": 0.4922,
"step": 145000
},
{
"epoch": 71.67,
"grad_norm": 0.36322301626205444,
"learning_rate": 1.6317733990147785e-05,
"loss": 0.491,
"step": 145500
},
{
"epoch": 71.92,
"grad_norm": 0.41439762711524963,
"learning_rate": 1.626642036124795e-05,
"loss": 0.4906,
"step": 146000
},
{
"epoch": 72.17,
"grad_norm": 0.3911706805229187,
"learning_rate": 1.621510673234811e-05,
"loss": 0.4888,
"step": 146500
},
{
"epoch": 72.41,
"grad_norm": 0.42668530344963074,
"learning_rate": 1.6163793103448276e-05,
"loss": 0.4925,
"step": 147000
},
{
"epoch": 72.66,
"grad_norm": 0.4418216943740845,
"learning_rate": 1.611247947454844e-05,
"loss": 0.4921,
"step": 147500
},
{
"epoch": 72.91,
"grad_norm": 0.5424076318740845,
"learning_rate": 1.6061165845648606e-05,
"loss": 0.4888,
"step": 148000
},
{
"epoch": 73.15,
"grad_norm": 0.38305142521858215,
"learning_rate": 1.600985221674877e-05,
"loss": 0.491,
"step": 148500
},
{
"epoch": 73.4,
"grad_norm": 0.4137153923511505,
"learning_rate": 1.5958538587848933e-05,
"loss": 0.4882,
"step": 149000
},
{
"epoch": 73.65,
"grad_norm": 0.46136030554771423,
"learning_rate": 1.5907224958949098e-05,
"loss": 0.4907,
"step": 149500
},
{
"epoch": 73.89,
"grad_norm": 0.39286741614341736,
"learning_rate": 1.5855911330049263e-05,
"loss": 0.4884,
"step": 150000
},
{
"epoch": 74.14,
"grad_norm": 0.4563937783241272,
"learning_rate": 1.5804597701149428e-05,
"loss": 0.494,
"step": 150500
},
{
"epoch": 74.38,
"grad_norm": 0.4879554510116577,
"learning_rate": 1.5753284072249593e-05,
"loss": 0.4908,
"step": 151000
},
{
"epoch": 74.63,
"grad_norm": 0.48795485496520996,
"learning_rate": 1.5701970443349755e-05,
"loss": 0.4949,
"step": 151500
},
{
"epoch": 74.88,
"grad_norm": 0.3959016799926758,
"learning_rate": 1.565065681444992e-05,
"loss": 0.4868,
"step": 152000
},
{
"epoch": 75.12,
"grad_norm": 0.46233025193214417,
"learning_rate": 1.559934318555008e-05,
"loss": 0.4851,
"step": 152500
},
{
"epoch": 75.37,
"grad_norm": 0.33422186970710754,
"learning_rate": 1.5548029556650246e-05,
"loss": 0.488,
"step": 153000
},
{
"epoch": 75.62,
"grad_norm": 0.4503116309642792,
"learning_rate": 1.549671592775041e-05,
"loss": 0.4885,
"step": 153500
},
{
"epoch": 75.86,
"grad_norm": 0.4860435426235199,
"learning_rate": 1.5445402298850576e-05,
"loss": 0.4891,
"step": 154000
},
{
"epoch": 76.11,
"grad_norm": 0.40661612153053284,
"learning_rate": 1.539408866995074e-05,
"loss": 0.4884,
"step": 154500
},
{
"epoch": 76.35,
"grad_norm": 0.4261013865470886,
"learning_rate": 1.5342775041050903e-05,
"loss": 0.487,
"step": 155000
},
{
"epoch": 76.6,
"grad_norm": 0.4988757073879242,
"learning_rate": 1.5291461412151068e-05,
"loss": 0.4909,
"step": 155500
},
{
"epoch": 76.85,
"grad_norm": 0.4792279005050659,
"learning_rate": 1.5240147783251233e-05,
"loss": 0.491,
"step": 156000
},
{
"epoch": 77.09,
"grad_norm": 0.43279606103897095,
"learning_rate": 1.5188834154351396e-05,
"loss": 0.492,
"step": 156500
},
{
"epoch": 77.34,
"grad_norm": 0.44450482726097107,
"learning_rate": 1.5137520525451561e-05,
"loss": 0.4855,
"step": 157000
},
{
"epoch": 77.59,
"grad_norm": 0.42101508378982544,
"learning_rate": 1.5086206896551724e-05,
"loss": 0.4883,
"step": 157500
},
{
"epoch": 77.83,
"grad_norm": 0.48337703943252563,
"learning_rate": 1.503489326765189e-05,
"loss": 0.4869,
"step": 158000
},
{
"epoch": 78.08,
"grad_norm": 0.4778783321380615,
"learning_rate": 1.4983579638752054e-05,
"loss": 0.4869,
"step": 158500
},
{
"epoch": 78.33,
"grad_norm": 0.35785460472106934,
"learning_rate": 1.4932266009852218e-05,
"loss": 0.4926,
"step": 159000
},
{
"epoch": 78.57,
"grad_norm": 0.39978745579719543,
"learning_rate": 1.4880952380952381e-05,
"loss": 0.4849,
"step": 159500
},
{
"epoch": 78.82,
"grad_norm": 0.36985936760902405,
"learning_rate": 1.4829638752052544e-05,
"loss": 0.4849,
"step": 160000
},
{
"epoch": 79.06,
"grad_norm": 0.5067600607872009,
"learning_rate": 1.477832512315271e-05,
"loss": 0.4875,
"step": 160500
},
{
"epoch": 79.31,
"grad_norm": 0.5925462245941162,
"learning_rate": 1.4727011494252873e-05,
"loss": 0.4895,
"step": 161000
},
{
"epoch": 79.56,
"grad_norm": 0.43907538056373596,
"learning_rate": 1.4675697865353038e-05,
"loss": 0.4851,
"step": 161500
},
{
"epoch": 79.8,
"grad_norm": 0.39582559466362,
"learning_rate": 1.4624384236453203e-05,
"loss": 0.485,
"step": 162000
},
{
"epoch": 80.05,
"grad_norm": 0.3945811688899994,
"learning_rate": 1.4573070607553366e-05,
"loss": 0.49,
"step": 162500
},
{
"epoch": 80.3,
"grad_norm": 0.5007498264312744,
"learning_rate": 1.452175697865353e-05,
"loss": 0.4871,
"step": 163000
},
{
"epoch": 80.54,
"grad_norm": 0.3767457902431488,
"learning_rate": 1.4470443349753694e-05,
"loss": 0.4873,
"step": 163500
},
{
"epoch": 80.79,
"grad_norm": 0.584635317325592,
"learning_rate": 1.4419129720853859e-05,
"loss": 0.4829,
"step": 164000
},
{
"epoch": 81.03,
"grad_norm": 0.41645538806915283,
"learning_rate": 1.4367816091954022e-05,
"loss": 0.4893,
"step": 164500
},
{
"epoch": 81.28,
"grad_norm": 0.4460589289665222,
"learning_rate": 1.4316502463054187e-05,
"loss": 0.4867,
"step": 165000
},
{
"epoch": 81.53,
"grad_norm": 0.4328470528125763,
"learning_rate": 1.4265188834154352e-05,
"loss": 0.4844,
"step": 165500
},
{
"epoch": 81.77,
"grad_norm": 0.41708043217658997,
"learning_rate": 1.4213875205254516e-05,
"loss": 0.4869,
"step": 166000
},
{
"epoch": 82.02,
"grad_norm": 0.4869779348373413,
"learning_rate": 1.416256157635468e-05,
"loss": 0.4847,
"step": 166500
},
{
"epoch": 82.27,
"grad_norm": 0.3944786489009857,
"learning_rate": 1.4111247947454844e-05,
"loss": 0.4856,
"step": 167000
},
{
"epoch": 82.51,
"grad_norm": 0.36754748225212097,
"learning_rate": 1.4059934318555009e-05,
"loss": 0.4811,
"step": 167500
},
{
"epoch": 82.76,
"grad_norm": 0.4370395839214325,
"learning_rate": 1.4008620689655172e-05,
"loss": 0.4841,
"step": 168000
},
{
"epoch": 83.0,
"grad_norm": 0.48854583501815796,
"learning_rate": 1.3957307060755337e-05,
"loss": 0.4866,
"step": 168500
},
{
"epoch": 83.25,
"grad_norm": 0.4292881488800049,
"learning_rate": 1.39059934318555e-05,
"loss": 0.4831,
"step": 169000
},
{
"epoch": 83.5,
"grad_norm": 0.5421557426452637,
"learning_rate": 1.3854679802955666e-05,
"loss": 0.4876,
"step": 169500
},
{
"epoch": 83.74,
"grad_norm": 0.4469299614429474,
"learning_rate": 1.380336617405583e-05,
"loss": 0.4857,
"step": 170000
},
{
"epoch": 83.99,
"grad_norm": 0.4676869213581085,
"learning_rate": 1.3752052545155994e-05,
"loss": 0.4849,
"step": 170500
},
{
"epoch": 84.24,
"grad_norm": 0.4767548739910126,
"learning_rate": 1.3700738916256159e-05,
"loss": 0.4858,
"step": 171000
},
{
"epoch": 84.48,
"grad_norm": 0.36193403601646423,
"learning_rate": 1.3649425287356322e-05,
"loss": 0.4849,
"step": 171500
},
{
"epoch": 84.73,
"grad_norm": 0.4789485037326813,
"learning_rate": 1.3598111658456487e-05,
"loss": 0.4835,
"step": 172000
},
{
"epoch": 84.98,
"grad_norm": 0.4937196373939514,
"learning_rate": 1.354679802955665e-05,
"loss": 0.4851,
"step": 172500
},
{
"epoch": 85.22,
"grad_norm": 0.5894652009010315,
"learning_rate": 1.3495484400656816e-05,
"loss": 0.4827,
"step": 173000
},
{
"epoch": 85.47,
"grad_norm": 0.37527599930763245,
"learning_rate": 1.3444170771756979e-05,
"loss": 0.4797,
"step": 173500
},
{
"epoch": 85.71,
"grad_norm": 0.4133756160736084,
"learning_rate": 1.3392857142857142e-05,
"loss": 0.4845,
"step": 174000
},
{
"epoch": 85.96,
"grad_norm": 0.4965701103210449,
"learning_rate": 1.3341543513957307e-05,
"loss": 0.4849,
"step": 174500
},
{
"epoch": 86.21,
"grad_norm": 0.37571871280670166,
"learning_rate": 1.329022988505747e-05,
"loss": 0.4846,
"step": 175000
},
{
"epoch": 86.45,
"grad_norm": 0.4651111364364624,
"learning_rate": 1.3238916256157635e-05,
"loss": 0.4837,
"step": 175500
},
{
"epoch": 86.7,
"grad_norm": 0.43609529733657837,
"learning_rate": 1.3187602627257799e-05,
"loss": 0.4829,
"step": 176000
},
{
"epoch": 86.95,
"grad_norm": 0.4729571044445038,
"learning_rate": 1.3136288998357964e-05,
"loss": 0.4846,
"step": 176500
},
{
"epoch": 87.19,
"grad_norm": 0.3406832814216614,
"learning_rate": 1.3084975369458129e-05,
"loss": 0.4798,
"step": 177000
},
{
"epoch": 87.44,
"grad_norm": 0.40902179479599,
"learning_rate": 1.3033661740558292e-05,
"loss": 0.4846,
"step": 177500
},
{
"epoch": 87.68,
"grad_norm": 0.44439247250556946,
"learning_rate": 1.2982348111658457e-05,
"loss": 0.4858,
"step": 178000
},
{
"epoch": 87.93,
"grad_norm": 0.36519864201545715,
"learning_rate": 1.293103448275862e-05,
"loss": 0.4826,
"step": 178500
},
{
"epoch": 88.18,
"grad_norm": 0.49141621589660645,
"learning_rate": 1.2879720853858785e-05,
"loss": 0.4843,
"step": 179000
},
{
"epoch": 88.42,
"grad_norm": 0.45474326610565186,
"learning_rate": 1.2828407224958949e-05,
"loss": 0.4837,
"step": 179500
},
{
"epoch": 88.67,
"grad_norm": 0.522074282169342,
"learning_rate": 1.2777093596059114e-05,
"loss": 0.4814,
"step": 180000
},
{
"epoch": 88.92,
"grad_norm": 0.48209860920906067,
"learning_rate": 1.2725779967159279e-05,
"loss": 0.4837,
"step": 180500
},
{
"epoch": 89.16,
"grad_norm": 0.5469162464141846,
"learning_rate": 1.2674466338259442e-05,
"loss": 0.4809,
"step": 181000
},
{
"epoch": 89.41,
"grad_norm": 0.5114103555679321,
"learning_rate": 1.2623152709359607e-05,
"loss": 0.4848,
"step": 181500
},
{
"epoch": 89.66,
"grad_norm": 0.3811694085597992,
"learning_rate": 1.257183908045977e-05,
"loss": 0.4774,
"step": 182000
},
{
"epoch": 89.9,
"grad_norm": 0.4292524456977844,
"learning_rate": 1.2520525451559935e-05,
"loss": 0.4812,
"step": 182500
},
{
"epoch": 90.15,
"grad_norm": 1.8638391494750977,
"learning_rate": 1.2469211822660099e-05,
"loss": 0.4837,
"step": 183000
},
{
"epoch": 90.39,
"grad_norm": 0.5892685055732727,
"learning_rate": 1.2417898193760264e-05,
"loss": 0.4792,
"step": 183500
},
{
"epoch": 90.64,
"grad_norm": 0.41100233793258667,
"learning_rate": 1.2366584564860427e-05,
"loss": 0.4832,
"step": 184000
},
{
"epoch": 90.89,
"grad_norm": 0.44678598642349243,
"learning_rate": 1.2315270935960592e-05,
"loss": 0.484,
"step": 184500
},
{
"epoch": 91.13,
"grad_norm": 0.46170106530189514,
"learning_rate": 1.2263957307060757e-05,
"loss": 0.4826,
"step": 185000
},
{
"epoch": 91.38,
"grad_norm": 0.46171411871910095,
"learning_rate": 1.221264367816092e-05,
"loss": 0.4819,
"step": 185500
},
{
"epoch": 91.63,
"grad_norm": 0.45027804374694824,
"learning_rate": 1.2161330049261085e-05,
"loss": 0.4841,
"step": 186000
},
{
"epoch": 91.87,
"grad_norm": 0.407806396484375,
"learning_rate": 1.2110016420361248e-05,
"loss": 0.4772,
"step": 186500
},
{
"epoch": 92.12,
"grad_norm": 0.4288312792778015,
"learning_rate": 1.2058702791461412e-05,
"loss": 0.4809,
"step": 187000
},
{
"epoch": 92.36,
"grad_norm": 0.41084253787994385,
"learning_rate": 1.2007389162561575e-05,
"loss": 0.4806,
"step": 187500
},
{
"epoch": 92.61,
"grad_norm": 0.4116561710834503,
"learning_rate": 1.195607553366174e-05,
"loss": 0.4829,
"step": 188000
},
{
"epoch": 92.86,
"grad_norm": 0.4914272725582123,
"learning_rate": 1.1904761904761905e-05,
"loss": 0.4812,
"step": 188500
},
{
"epoch": 93.1,
"grad_norm": 0.4653904139995575,
"learning_rate": 1.1853448275862068e-05,
"loss": 0.4787,
"step": 189000
},
{
"epoch": 93.35,
"grad_norm": 0.4210495948791504,
"learning_rate": 1.1802134646962233e-05,
"loss": 0.4773,
"step": 189500
},
{
"epoch": 93.6,
"grad_norm": 0.40525516867637634,
"learning_rate": 1.1750821018062397e-05,
"loss": 0.4807,
"step": 190000
},
{
"epoch": 93.84,
"grad_norm": 0.3981895446777344,
"learning_rate": 1.1699507389162562e-05,
"loss": 0.481,
"step": 190500
},
{
"epoch": 94.09,
"grad_norm": 0.846139669418335,
"learning_rate": 1.1648193760262725e-05,
"loss": 0.4792,
"step": 191000
},
{
"epoch": 94.33,
"grad_norm": 0.42289435863494873,
"learning_rate": 1.159688013136289e-05,
"loss": 0.4788,
"step": 191500
},
{
"epoch": 94.58,
"grad_norm": 0.3914598226547241,
"learning_rate": 1.1545566502463055e-05,
"loss": 0.4837,
"step": 192000
},
{
"epoch": 94.83,
"grad_norm": 0.5285059809684753,
"learning_rate": 1.1494252873563218e-05,
"loss": 0.4799,
"step": 192500
},
{
"epoch": 95.07,
"grad_norm": 0.5075964331626892,
"learning_rate": 1.1442939244663383e-05,
"loss": 0.4812,
"step": 193000
},
{
"epoch": 95.32,
"grad_norm": 0.43209370970726013,
"learning_rate": 1.1391625615763547e-05,
"loss": 0.4796,
"step": 193500
},
{
"epoch": 95.57,
"grad_norm": 0.41466042399406433,
"learning_rate": 1.1340311986863712e-05,
"loss": 0.4775,
"step": 194000
},
{
"epoch": 95.81,
"grad_norm": 0.3891516625881195,
"learning_rate": 1.1288998357963875e-05,
"loss": 0.4798,
"step": 194500
},
{
"epoch": 96.06,
"grad_norm": 0.3929733633995056,
"learning_rate": 1.123768472906404e-05,
"loss": 0.4833,
"step": 195000
},
{
"epoch": 96.31,
"grad_norm": 0.6019779443740845,
"learning_rate": 1.1186371100164205e-05,
"loss": 0.4788,
"step": 195500
},
{
"epoch": 96.55,
"grad_norm": 0.46904659271240234,
"learning_rate": 1.1135057471264368e-05,
"loss": 0.4775,
"step": 196000
},
{
"epoch": 96.8,
"grad_norm": 0.4163939356803894,
"learning_rate": 1.1083743842364533e-05,
"loss": 0.4812,
"step": 196500
},
{
{ |
|
"epoch": 97.04, |
|
"grad_norm": 0.4934261441230774, |
|
"learning_rate": 1.1032430213464697e-05, |
|
"loss": 0.4808, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 97.29, |
|
"grad_norm": 0.33812370896339417, |
|
"learning_rate": 1.0981116584564862e-05, |
|
"loss": 0.4773, |
|
"step": 197500 |
|
}, |
|
{ |
|
"epoch": 97.54, |
|
"grad_norm": 0.5475727319717407, |
|
"learning_rate": 1.0929802955665025e-05, |
|
"loss": 0.4784, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 97.78, |
|
"grad_norm": 0.4021857976913452, |
|
"learning_rate": 1.087848932676519e-05, |
|
"loss": 0.4784, |
|
"step": 198500 |
|
}, |
|
{ |
|
"epoch": 98.03, |
|
"grad_norm": 0.5285155773162842, |
|
"learning_rate": 1.0827175697865353e-05, |
|
"loss": 0.4814, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 98.28, |
|
"grad_norm": 0.42629796266555786, |
|
"learning_rate": 1.0775862068965518e-05, |
|
"loss": 0.4825, |
|
"step": 199500 |
|
}, |
|
{ |
|
"epoch": 98.52, |
|
"grad_norm": 0.38368546962738037, |
|
"learning_rate": 1.0724548440065683e-05, |
|
"loss": 0.4763, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 98.77, |
|
"grad_norm": 0.46770936250686646, |
|
"learning_rate": 1.0673234811165846e-05, |
|
"loss": 0.4752, |
|
"step": 200500 |
|
}, |
|
{ |
|
"epoch": 99.01, |
|
"grad_norm": 0.4895331561565399, |
|
"learning_rate": 1.062192118226601e-05, |
|
"loss": 0.4804, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 99.26, |
|
"grad_norm": 0.48920783400535583, |
|
"learning_rate": 1.0570607553366173e-05, |
|
"loss": 0.4795, |
|
"step": 201500 |
|
}, |
|
{ |
|
"epoch": 99.51, |
|
"grad_norm": 0.4836702346801758, |
|
"learning_rate": 1.0519293924466338e-05, |
|
"loss": 0.4767, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 99.75, |
|
"grad_norm": 0.3899345397949219, |
|
"learning_rate": 1.0467980295566501e-05, |
|
"loss": 0.4784, |
|
"step": 202500 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"grad_norm": 1.3630714416503906, |
|
"learning_rate": 1.0416666666666666e-05, |
|
"loss": 0.4802, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 100.25, |
|
"grad_norm": 0.48866012692451477, |
|
"learning_rate": 1.0365353037766831e-05, |
|
"loss": 0.4804, |
|
"step": 203500 |
|
}, |
|
{ |
|
"epoch": 100.49, |
|
"grad_norm": 0.4920850694179535, |
|
"learning_rate": 1.0314039408866995e-05, |
|
"loss": 0.4754, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 100.74, |
|
"grad_norm": 0.4752250611782074, |
|
"learning_rate": 1.026272577996716e-05, |
|
"loss": 0.4807, |
|
"step": 204500 |
|
}, |
|
{ |
|
"epoch": 100.99, |
|
"grad_norm": 0.4868377447128296, |
|
"learning_rate": 1.0211412151067323e-05, |
|
"loss": 0.476, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 101.23, |
|
"grad_norm": 0.4661599397659302, |
|
"learning_rate": 1.0160098522167488e-05, |
|
"loss": 0.4773, |
|
"step": 205500 |
|
}, |
|
{ |
|
"epoch": 101.48, |
|
"grad_norm": 0.5039206147193909, |
|
"learning_rate": 1.0108784893267651e-05, |
|
"loss": 0.4805, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 101.72, |
|
"grad_norm": 0.4316484332084656, |
|
"learning_rate": 1.0057471264367816e-05, |
|
"loss": 0.4737, |
|
"step": 206500 |
|
}, |
|
{ |
|
"epoch": 101.97, |
|
"grad_norm": 0.36680227518081665, |
|
"learning_rate": 1.0006157635467981e-05, |
|
"loss": 0.4753, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 102.22, |
|
"grad_norm": 0.4917042553424835, |
|
"learning_rate": 9.954844006568145e-06, |
|
"loss": 0.4771, |
|
"step": 207500 |
|
}, |
|
{ |
|
"epoch": 102.46, |
|
"grad_norm": 0.3863958716392517, |
|
"learning_rate": 9.90353037766831e-06, |
|
"loss": 0.4782, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 102.71, |
|
"grad_norm": 0.495717316865921, |
|
"learning_rate": 9.852216748768473e-06, |
|
"loss": 0.4763, |
|
"step": 208500 |
|
}, |
|
{ |
|
"epoch": 102.96, |
|
"grad_norm": 0.5366889238357544, |
|
"learning_rate": 9.800903119868638e-06, |
|
"loss": 0.4765, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 103.2, |
|
"grad_norm": 0.39207398891448975, |
|
"learning_rate": 9.749589490968801e-06, |
|
"loss": 0.4752, |
|
"step": 209500 |
|
}, |
|
{ |
|
"epoch": 103.45, |
|
"grad_norm": 0.4651632308959961, |
|
"learning_rate": 9.698275862068966e-06, |
|
"loss": 0.4769, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 103.69, |
|
"grad_norm": 0.5077354907989502, |
|
"learning_rate": 9.646962233169131e-06, |
|
"loss": 0.4732, |
|
"step": 210500 |
|
}, |
|
{ |
|
"epoch": 103.94, |
|
"grad_norm": 0.39063769578933716, |
|
"learning_rate": 9.595648604269294e-06, |
|
"loss": 0.4773, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 104.19, |
|
"grad_norm": 0.48797911405563354, |
|
"learning_rate": 9.54433497536946e-06, |
|
"loss": 0.4769, |
|
"step": 211500 |
|
}, |
|
{ |
|
"epoch": 104.43, |
|
"grad_norm": 0.49572136998176575, |
|
"learning_rate": 9.493021346469623e-06, |
|
"loss": 0.4742, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 104.68, |
|
"grad_norm": 0.48770076036453247, |
|
"learning_rate": 9.441707717569788e-06, |
|
"loss": 0.4777, |
|
"step": 212500 |
|
}, |
|
{ |
|
"epoch": 104.93, |
|
"grad_norm": 0.45337972044944763, |
|
"learning_rate": 9.390394088669951e-06, |
|
"loss": 0.4731, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 105.17, |
|
"grad_norm": 0.4577206075191498, |
|
"learning_rate": 9.339080459770116e-06, |
|
"loss": 0.478, |
|
"step": 213500 |
|
}, |
|
{ |
|
"epoch": 105.42, |
|
"grad_norm": 0.4246939718723297, |
|
"learning_rate": 9.28776683087028e-06, |
|
"loss": 0.4743, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 105.67, |
|
"grad_norm": 0.510725736618042, |
|
"learning_rate": 9.236453201970444e-06, |
|
"loss": 0.4772, |
|
"step": 214500 |
|
}, |
|
{ |
|
"epoch": 105.91, |
|
"grad_norm": 0.4228347837924957, |
|
"learning_rate": 9.185139573070608e-06, |
|
"loss": 0.4745, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 106.16, |
|
"grad_norm": 0.5115532875061035, |
|
"learning_rate": 9.133825944170771e-06, |
|
"loss": 0.4753, |
|
"step": 215500 |
|
}, |
|
{ |
|
"epoch": 106.4, |
|
"grad_norm": 0.4812858998775482, |
|
"learning_rate": 9.082512315270936e-06, |
|
"loss": 0.4753, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 106.65, |
|
"grad_norm": 0.5218610167503357, |
|
"learning_rate": 9.0311986863711e-06, |
|
"loss": 0.4786, |
|
"step": 216500 |
|
}, |
|
{ |
|
"epoch": 106.9, |
|
"grad_norm": 0.5687581896781921, |
|
"learning_rate": 8.979885057471264e-06, |
|
"loss": 0.4733, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 107.14, |
|
"grad_norm": 0.4318714141845703, |
|
"learning_rate": 8.928571428571428e-06, |
|
"loss": 0.4756, |
|
"step": 217500 |
|
}, |
|
{ |
|
"epoch": 107.39, |
|
"grad_norm": 0.5956067442893982, |
|
"learning_rate": 8.877257799671593e-06, |
|
"loss": 0.4755, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 107.64, |
|
"grad_norm": 0.5590375065803528, |
|
"learning_rate": 8.825944170771758e-06, |
|
"loss": 0.4759, |
|
"step": 218500 |
|
}, |
|
{ |
|
"epoch": 107.88, |
|
"grad_norm": 0.5045220851898193, |
|
"learning_rate": 8.774630541871921e-06, |
|
"loss": 0.4734, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 108.13, |
|
"grad_norm": 0.466327428817749, |
|
"learning_rate": 8.723316912972086e-06, |
|
"loss": 0.4753, |
|
"step": 219500 |
|
}, |
|
{ |
|
"epoch": 108.37, |
|
"grad_norm": 0.4527340829372406, |
|
"learning_rate": 8.672003284072249e-06, |
|
"loss": 0.4728, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 108.62, |
|
"grad_norm": 0.398181676864624, |
|
"learning_rate": 8.620689655172414e-06, |
|
"loss": 0.4742, |
|
"step": 220500 |
|
}, |
|
{ |
|
"epoch": 108.87, |
|
"grad_norm": 0.5564948320388794, |
|
"learning_rate": 8.569376026272577e-06, |
|
"loss": 0.4742, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 109.11, |
|
"grad_norm": 0.4469916522502899, |
|
"learning_rate": 8.518062397372742e-06, |
|
"loss": 0.4721, |
|
"step": 221500 |
|
}, |
|
{ |
|
"epoch": 109.36, |
|
"grad_norm": 0.45744919776916504, |
|
"learning_rate": 8.466748768472907e-06, |
|
"loss": 0.4731, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 109.61, |
|
"grad_norm": 0.5253536105155945, |
|
"learning_rate": 8.41543513957307e-06, |
|
"loss": 0.4755, |
|
"step": 222500 |
|
}, |
|
{ |
|
"epoch": 109.85, |
|
"grad_norm": 0.5014792680740356, |
|
"learning_rate": 8.364121510673236e-06, |
|
"loss": 0.4745, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 110.1, |
|
"grad_norm": 0.44078388810157776, |
|
"learning_rate": 8.312807881773399e-06, |
|
"loss": 0.4735, |
|
"step": 223500 |
|
}, |
|
{ |
|
"epoch": 110.34, |
|
"grad_norm": 0.5724578499794006, |
|
"learning_rate": 8.261494252873564e-06, |
|
"loss": 0.4742, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 110.59, |
|
"grad_norm": 0.5114606022834778, |
|
"learning_rate": 8.210180623973727e-06, |
|
"loss": 0.4736, |
|
"step": 224500 |
|
}, |
|
{ |
|
"epoch": 110.84, |
|
"grad_norm": 0.5526043176651001, |
|
"learning_rate": 8.158866995073892e-06, |
|
"loss": 0.4753, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 111.08, |
|
"grad_norm": 0.36584803462028503, |
|
"learning_rate": 8.107553366174056e-06, |
|
"loss": 0.4758, |
|
"step": 225500 |
|
}, |
|
{ |
|
"epoch": 111.33, |
|
"grad_norm": 0.5053611397743225, |
|
"learning_rate": 8.05623973727422e-06, |
|
"loss": 0.4699, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 111.58, |
|
"grad_norm": 0.47798970341682434, |
|
"learning_rate": 8.004926108374386e-06, |
|
"loss": 0.4694, |
|
"step": 226500 |
|
}, |
|
{ |
|
"epoch": 111.82, |
|
"grad_norm": 0.42536744475364685, |
|
"learning_rate": 7.953612479474549e-06, |
|
"loss": 0.4728, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 112.07, |
|
"grad_norm": 0.451180100440979, |
|
"learning_rate": 7.902298850574714e-06, |
|
"loss": 0.4788, |
|
"step": 227500 |
|
}, |
|
{ |
|
"epoch": 112.32, |
|
"grad_norm": 0.4156660735607147, |
|
"learning_rate": 7.850985221674877e-06, |
|
"loss": 0.4726, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 112.56, |
|
"grad_norm": 0.5824641585350037, |
|
"learning_rate": 7.79967159277504e-06, |
|
"loss": 0.4713, |
|
"step": 228500 |
|
}, |
|
{ |
|
"epoch": 112.81, |
|
"grad_norm": 0.36896491050720215, |
|
"learning_rate": 7.748357963875206e-06, |
|
"loss": 0.4742, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 113.05, |
|
"grad_norm": 0.46299970149993896, |
|
"learning_rate": 7.69704433497537e-06, |
|
"loss": 0.4724, |
|
"step": 229500 |
|
}, |
|
{ |
|
"epoch": 113.3, |
|
"grad_norm": 0.43692949414253235, |
|
"learning_rate": 7.645730706075534e-06, |
|
"loss": 0.4757, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 113.55, |
|
"grad_norm": 0.47571897506713867, |
|
"learning_rate": 7.594417077175698e-06, |
|
"loss": 0.469, |
|
"step": 230500 |
|
}, |
|
{ |
|
"epoch": 113.79, |
|
"grad_norm": 0.5554032921791077, |
|
"learning_rate": 7.543103448275862e-06, |
|
"loss": 0.4706, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 114.04, |
|
"grad_norm": 0.5689848065376282, |
|
"learning_rate": 7.491789819376027e-06, |
|
"loss": 0.4727, |
|
"step": 231500 |
|
}, |
|
{ |
|
"epoch": 114.29, |
|
"grad_norm": 0.43025562167167664, |
|
"learning_rate": 7.4404761904761905e-06, |
|
"loss": 0.4741, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 114.53, |
|
"grad_norm": 0.4013258218765259, |
|
"learning_rate": 7.389162561576355e-06, |
|
"loss": 0.4756, |
|
"step": 232500 |
|
}, |
|
{ |
|
"epoch": 114.78, |
|
"grad_norm": 0.4342605769634247, |
|
"learning_rate": 7.337848932676519e-06, |
|
"loss": 0.4682, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 115.02, |
|
"grad_norm": 0.4698561131954193, |
|
"learning_rate": 7.286535303776683e-06, |
|
"loss": 0.4734, |
|
"step": 233500 |
|
}, |
|
{ |
|
"epoch": 115.27, |
|
"grad_norm": 0.6582350134849548, |
|
"learning_rate": 7.235221674876847e-06, |
|
"loss": 0.4752, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 115.52, |
|
"grad_norm": 0.4564856290817261, |
|
"learning_rate": 7.183908045977011e-06, |
|
"loss": 0.4716, |
|
"step": 234500 |
|
}, |
|
{ |
|
"epoch": 115.76, |
|
"grad_norm": 0.5398574471473694, |
|
"learning_rate": 7.132594417077176e-06, |
|
"loss": 0.4695, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 116.01, |
|
"grad_norm": 0.4755443036556244, |
|
"learning_rate": 7.08128078817734e-06, |
|
"loss": 0.47, |
|
"step": 235500 |
|
}, |
|
{ |
|
"epoch": 116.26, |
|
"grad_norm": 0.49625080823898315, |
|
"learning_rate": 7.0299671592775045e-06, |
|
"loss": 0.4724, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 116.5, |
|
"grad_norm": 0.44892576336860657, |
|
"learning_rate": 6.978653530377669e-06, |
|
"loss": 0.4716, |
|
"step": 236500 |
|
}, |
|
{ |
|
"epoch": 116.75, |
|
"grad_norm": 0.38705721497535706, |
|
"learning_rate": 6.927339901477833e-06, |
|
"loss": 0.4727, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 117.0, |
|
"grad_norm": 0.4698588252067566, |
|
"learning_rate": 6.876026272577997e-06, |
|
"loss": 0.4709, |
|
"step": 237500 |
|
}, |
|
{ |
|
"epoch": 117.24, |
|
"grad_norm": 0.5400373339653015, |
|
"learning_rate": 6.824712643678161e-06, |
|
"loss": 0.4708, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 117.49, |
|
"grad_norm": 0.4378606379032135, |
|
"learning_rate": 6.773399014778325e-06, |
|
"loss": 0.4755, |
|
"step": 238500 |
|
}, |
|
{ |
|
"epoch": 117.73, |
|
"grad_norm": 0.531797468662262, |
|
"learning_rate": 6.7220853858784894e-06, |
|
"loss": 0.4668, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 117.98, |
|
"grad_norm": 0.5268296003341675, |
|
"learning_rate": 6.670771756978654e-06, |
|
"loss": 0.4714, |
|
"step": 239500 |
|
}, |
|
{ |
|
"epoch": 118.23, |
|
"grad_norm": 0.5087544918060303, |
|
"learning_rate": 6.619458128078818e-06, |
|
"loss": 0.4736, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 118.47, |
|
"grad_norm": 0.5163730382919312, |
|
"learning_rate": 6.568144499178982e-06, |
|
"loss": 0.4687, |
|
"step": 240500 |
|
}, |
|
{ |
|
"epoch": 118.72, |
|
"grad_norm": 0.5498948097229004, |
|
"learning_rate": 6.516830870279146e-06, |
|
"loss": 0.4703, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 118.97, |
|
"grad_norm": 0.4399949610233307, |
|
"learning_rate": 6.46551724137931e-06, |
|
"loss": 0.4682, |
|
"step": 241500 |
|
}, |
|
{ |
|
"epoch": 119.21, |
|
"grad_norm": 0.5559102892875671, |
|
"learning_rate": 6.414203612479474e-06, |
|
"loss": 0.4705, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 119.46, |
|
"grad_norm": 0.5814414620399475, |
|
"learning_rate": 6.362889983579639e-06, |
|
"loss": 0.4719, |
|
"step": 242500 |
|
}, |
|
{ |
|
"epoch": 119.7, |
|
"grad_norm": 0.477924108505249, |
|
"learning_rate": 6.3115763546798035e-06, |
|
"loss": 0.4681, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 119.95, |
|
"grad_norm": 0.6359853744506836, |
|
"learning_rate": 6.260262725779968e-06, |
|
"loss": 0.4724, |
|
"step": 243500 |
|
}, |
|
{ |
|
"epoch": 120.2, |
|
"grad_norm": 0.43428707122802734, |
|
"learning_rate": 6.208949096880132e-06, |
|
"loss": 0.4705, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 120.44, |
|
"grad_norm": 0.5314174294471741, |
|
"learning_rate": 6.157635467980296e-06, |
|
"loss": 0.4671, |
|
"step": 244500 |
|
}, |
|
{ |
|
"epoch": 120.69, |
|
"grad_norm": 0.5369435548782349, |
|
"learning_rate": 6.10632183908046e-06, |
|
"loss": 0.4719, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 120.94, |
|
"grad_norm": 0.4553278684616089, |
|
"learning_rate": 6.055008210180624e-06, |
|
"loss": 0.469, |
|
"step": 245500 |
|
}, |
|
{ |
|
"epoch": 121.18, |
|
"grad_norm": 0.4905393123626709, |
|
"learning_rate": 6.0036945812807875e-06, |
|
"loss": 0.4698, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 121.43, |
|
"grad_norm": 0.5438905954360962, |
|
"learning_rate": 5.9523809523809525e-06, |
|
"loss": 0.4719, |
|
"step": 246500 |
|
}, |
|
{ |
|
"epoch": 121.67, |
|
"grad_norm": 0.5870608687400818, |
|
"learning_rate": 5.901067323481117e-06, |
|
"loss": 0.4693, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 121.92, |
|
"grad_norm": 0.5743767619132996, |
|
"learning_rate": 5.849753694581281e-06, |
|
"loss": 0.4676, |
|
"step": 247500 |
|
}, |
|
{ |
|
"epoch": 122.17, |
|
"grad_norm": 0.5572515726089478, |
|
"learning_rate": 5.798440065681445e-06, |
|
"loss": 0.4691, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 122.41, |
|
"grad_norm": 0.5879077315330505, |
|
"learning_rate": 5.747126436781609e-06, |
|
"loss": 0.4664, |
|
"step": 248500 |
|
}, |
|
{ |
|
"epoch": 122.66, |
|
"grad_norm": 0.48736339807510376, |
|
"learning_rate": 5.695812807881773e-06, |
|
"loss": 0.469, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 122.91, |
|
"grad_norm": 0.39413416385650635, |
|
"learning_rate": 5.6444991789819375e-06, |
|
"loss": 0.4716, |
|
"step": 249500 |
|
}, |
|
{ |
|
"epoch": 123.15, |
|
"grad_norm": 0.5965219140052795, |
|
"learning_rate": 5.5931855500821024e-06, |
|
"loss": 0.469, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 123.4, |
|
"grad_norm": 0.48718106746673584, |
|
"learning_rate": 5.541871921182267e-06, |
|
"loss": 0.4684, |
|
"step": 250500 |
|
}, |
|
{ |
|
"epoch": 123.65, |
|
"grad_norm": 0.5316205620765686, |
|
"learning_rate": 5.490558292282431e-06, |
|
"loss": 0.4691, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 123.89, |
|
"grad_norm": 0.5446661114692688, |
|
"learning_rate": 5.439244663382595e-06, |
|
"loss": 0.4678, |
|
"step": 251500 |
|
}, |
|
{ |
|
"epoch": 124.14, |
|
"grad_norm": 0.41561558842658997, |
|
"learning_rate": 5.387931034482759e-06, |
|
"loss": 0.4676, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 124.38, |
|
"grad_norm": 0.4405909776687622, |
|
"learning_rate": 5.336617405582923e-06, |
|
"loss": 0.4741, |
|
"step": 252500 |
|
}, |
|
{ |
|
"epoch": 124.63, |
|
"grad_norm": 0.5378337502479553, |
|
"learning_rate": 5.2853037766830865e-06, |
|
"loss": 0.4685, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 124.88, |
|
"grad_norm": 0.5120199918746948, |
|
"learning_rate": 5.233990147783251e-06, |
|
"loss": 0.465, |
|
"step": 253500 |
|
}, |
|
{ |
|
"epoch": 125.12, |
|
"grad_norm": 0.5217579007148743, |
|
"learning_rate": 5.182676518883416e-06, |
|
"loss": 0.4602, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 125.37, |
|
"grad_norm": 0.4467191696166992, |
|
"learning_rate": 5.13136288998358e-06, |
|
"loss": 0.4676, |
|
"step": 254500 |
|
}, |
|
{ |
|
"epoch": 125.62, |
|
"grad_norm": 0.5314244031906128, |
|
"learning_rate": 5.080049261083744e-06, |
|
"loss": 0.4729, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 125.86, |
|
"grad_norm": 0.5660964250564575, |
|
"learning_rate": 5.028735632183908e-06, |
|
"loss": 0.4701, |
|
"step": 255500 |
|
}, |
|
{ |
|
"epoch": 126.11, |
|
"grad_norm": 0.6407245993614197, |
|
"learning_rate": 4.977422003284072e-06, |
|
"loss": 0.4654, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 126.35, |
|
"grad_norm": 0.6110518574714661, |
|
"learning_rate": 4.926108374384236e-06, |
|
"loss": 0.4674, |
|
"step": 256500 |
|
}, |
|
{ |
|
"epoch": 126.6, |
|
"grad_norm": 0.5407283306121826, |
|
"learning_rate": 4.8747947454844006e-06, |
|
"loss": 0.4711, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 126.85, |
|
"grad_norm": 0.5181688070297241, |
|
"learning_rate": 4.8234811165845656e-06, |
|
"loss": 0.4705, |
|
"step": 257500 |
|
}, |
|
{ |
|
"epoch": 127.09, |
|
"grad_norm": 0.6265803575515747, |
|
"learning_rate": 4.77216748768473e-06, |
|
"loss": 0.4688, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 127.34, |
|
"grad_norm": 0.38801223039627075, |
|
"learning_rate": 4.720853858784894e-06, |
|
"loss": 0.4673, |
|
"step": 258500 |
|
}, |
|
{ |
|
"epoch": 127.59, |
|
"grad_norm": 0.5875949263572693, |
|
"learning_rate": 4.669540229885058e-06, |
|
"loss": 0.4643, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 127.83, |
|
"grad_norm": 0.5323173403739929, |
|
"learning_rate": 4.618226600985222e-06, |
|
"loss": 0.4697, |
|
"step": 259500 |
|
}, |
|
{ |
|
"epoch": 128.08, |
|
"grad_norm": 0.5532727241516113, |
|
"learning_rate": 4.5669129720853855e-06, |
|
"loss": 0.4682, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 128.33, |
|
"grad_norm": 0.5300949811935425, |
|
"learning_rate": 4.51559934318555e-06, |
|
"loss": 0.4687, |
|
"step": 260500 |
|
}, |
|
{ |
|
"epoch": 128.57, |
|
"grad_norm": 0.7153774499893188, |
|
"learning_rate": 4.464285714285714e-06, |
|
"loss": 0.4678, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 128.82, |
|
"grad_norm": 0.5258143544197083, |
|
"learning_rate": 4.412972085385879e-06, |
|
"loss": 0.4706, |
|
"step": 261500 |
|
}, |
|
{ |
|
"epoch": 129.06, |
|
"grad_norm": 0.5947761535644531, |
|
"learning_rate": 4.361658456486043e-06, |
|
"loss": 0.4661, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 129.31, |
|
"grad_norm": 0.5369092226028442, |
|
"learning_rate": 4.310344827586207e-06, |
|
"loss": 0.4687, |
|
"step": 262500 |
|
}, |
|
{ |
|
"epoch": 129.56, |
|
"grad_norm": 0.5336856245994568, |
|
"learning_rate": 4.259031198686371e-06, |
|
"loss": 0.4626, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 129.8, |
|
"grad_norm": 0.5533296465873718, |
|
"learning_rate": 4.207717569786535e-06, |
|
"loss": 0.4691, |
|
"step": 263500 |
|
}, |
|
{ |
|
"epoch": 130.05, |
|
"grad_norm": 0.579079270362854, |
|
"learning_rate": 4.1564039408866995e-06, |
|
"loss": 0.4665, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 130.3, |
|
"grad_norm": 0.6150951385498047, |
|
"learning_rate": 4.105090311986864e-06, |
|
"loss": 0.4667, |
|
"step": 264500 |
|
}, |
|
{ |
|
"epoch": 130.54, |
|
"grad_norm": 0.5602375864982605, |
|
"learning_rate": 4.053776683087028e-06, |
|
"loss": 0.4706, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 130.79, |
|
"grad_norm": 0.58797287940979, |
|
"learning_rate": 4.002463054187193e-06, |
|
"loss": 0.4664, |
|
"step": 265500 |
|
}, |
|
{ |
|
"epoch": 131.03, |
|
"grad_norm": 0.39354264736175537, |
|
"learning_rate": 3.951149425287357e-06, |
|
"loss": 0.465, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 131.28, |
|
"grad_norm": 0.5420950055122375, |
|
"learning_rate": 3.89983579638752e-06, |
|
"loss": 0.466, |
|
"step": 266500 |
|
}, |
|
{ |
|
"epoch": 131.53, |
|
"grad_norm": 0.5339276790618896, |
|
"learning_rate": 3.848522167487685e-06, |
|
"loss": 0.4626, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 131.77, |
|
"grad_norm": 0.6729488372802734, |
|
"learning_rate": 3.797208538587849e-06, |
|
"loss": 0.4693, |
|
"step": 267500 |
|
}, |
|
{ |
|
"epoch": 132.02, |
|
"grad_norm": 0.5628036260604858, |
|
"learning_rate": 3.7458949096880136e-06, |
|
"loss": 0.466, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 132.27, |
|
"grad_norm": 0.5665927529335022, |
|
"learning_rate": 3.6945812807881773e-06, |
|
"loss": 0.4625, |
|
"step": 268500 |
|
}, |
|
{ |
|
"epoch": 132.51, |
|
"grad_norm": 0.5044068098068237, |
|
"learning_rate": 3.6432676518883415e-06, |
|
"loss": 0.4644, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 132.76, |
|
"grad_norm": 0.504945695400238, |
|
"learning_rate": 3.5919540229885056e-06, |
|
"loss": 0.4695, |
|
"step": 269500 |
|
}, |
|
{ |
|
"epoch": 133.0, |
|
"grad_norm": 0.4701473116874695, |
|
"learning_rate": 3.54064039408867e-06, |
|
"loss": 0.4652, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 133.25, |
|
"grad_norm": 0.5132540464401245, |
|
"learning_rate": 3.4893267651888343e-06, |
|
"loss": 0.4647, |
|
"step": 270500 |
|
}, |
|
{ |
|
"epoch": 133.5, |
|
"grad_norm": 0.652473509311676, |
|
"learning_rate": 3.4380131362889985e-06, |
|
"loss": 0.4674, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 133.74, |
|
"grad_norm": 0.5050608515739441, |
|
"learning_rate": 3.3866995073891626e-06, |
|
"loss": 0.4661, |
|
"step": 271500 |
|
}, |
|
{ |
|
"epoch": 133.99, |
|
"grad_norm": 0.6116757988929749, |
|
"learning_rate": 3.335385878489327e-06, |
|
"loss": 0.4684, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 134.24, |
|
"grad_norm": 0.4631555676460266, |
|
"learning_rate": 3.284072249589491e-06, |
|
"loss": 0.4695, |
|
"step": 272500 |
|
}, |
|
{ |
|
"epoch": 134.48, |
|
"grad_norm": 0.5643542408943176, |
|
"learning_rate": 3.232758620689655e-06, |
|
"loss": 0.4658, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 134.73, |
|
"grad_norm": 0.6550360918045044, |
|
"learning_rate": 3.1814449917898197e-06, |
|
"loss": 0.4696, |
|
"step": 273500 |
|
}, |
|
{ |
|
"epoch": 134.98, |
|
"grad_norm": 0.5142180919647217, |
|
"learning_rate": 3.130131362889984e-06, |
|
"loss": 0.4624, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 135.22, |
|
"grad_norm": 0.5686226487159729, |
|
"learning_rate": 3.078817733990148e-06, |
|
"loss": 0.4638, |
|
"step": 274500 |
|
}, |
|
{ |
|
"epoch": 135.47, |
|
"grad_norm": 0.551745593547821, |
|
"learning_rate": 3.027504105090312e-06, |
|
"loss": 0.4655, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 135.71, |
|
"grad_norm": 0.5265413522720337, |
|
"learning_rate": 2.9761904761904763e-06, |
|
"loss": 0.4649, |
|
"step": 275500 |
|
}, |
|
{ |
|
"epoch": 135.96, |
|
"grad_norm": 0.504638671875, |
|
"learning_rate": 2.9248768472906404e-06, |
|
"loss": 0.4658, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 136.21, |
|
"grad_norm": 0.5403110980987549, |
|
"learning_rate": 2.8735632183908046e-06, |
|
"loss": 0.4659, |
|
"step": 276500 |
|
}, |
|
{ |
|
"epoch": 136.45, |
|
"grad_norm": 0.7049803137779236, |
|
"learning_rate": 2.8222495894909687e-06, |
|
"loss": 0.4657, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 136.7, |
|
"grad_norm": 0.46327295899391174, |
|
"learning_rate": 2.7709359605911333e-06, |
|
"loss": 0.4676, |
|
"step": 277500 |
|
}, |
|
{ |
|
"epoch": 136.95, |
|
"grad_norm": 0.7414257526397705, |
|
"learning_rate": 2.7196223316912975e-06, |
|
"loss": 0.4673, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 137.19, |
|
"grad_norm": 0.528343915939331, |
|
"learning_rate": 2.6683087027914616e-06, |
|
"loss": 0.4659, |
|
"step": 278500 |
|
}, |
|
{ |
|
"epoch": 137.44, |
|
"grad_norm": 0.6523202657699585, |
|
"learning_rate": 2.6169950738916253e-06, |
|
"loss": 0.4656, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 137.68, |
|
"grad_norm": 0.496900737285614, |
|
"learning_rate": 2.56568144499179e-06, |
|
"loss": 0.4659, |
|
"step": 279500 |
|
}, |
|
{ |
|
"epoch": 137.93, |
|
"grad_norm": 0.6677756309509277, |
|
"learning_rate": 2.514367816091954e-06, |
|
"loss": 0.4644, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 138.18, |
|
"grad_norm": 0.5693820118904114, |
|
"learning_rate": 2.463054187192118e-06, |
|
"loss": 0.4648, |
|
"step": 280500 |
|
}, |
|
{ |
|
"epoch": 138.42, |
|
"grad_norm": 0.7365754842758179, |
|
"learning_rate": 2.4117405582922828e-06, |
|
"loss": 0.4653, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 138.67, |
|
"grad_norm": 0.7357062697410583, |
|
"learning_rate": 2.360426929392447e-06, |
|
"loss": 0.4666, |
|
"step": 281500 |
|
}, |
|
{ |
|
"epoch": 138.92, |
|
"grad_norm": 0.6288078427314758, |
|
"learning_rate": 2.309113300492611e-06, |
|
"loss": 0.4628, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 139.16, |
|
"grad_norm": 0.5149612426757812, |
|
"learning_rate": 2.257799671592775e-06, |
|
"loss": 0.4629, |
|
"step": 282500 |
|
}, |
|
{ |
|
"epoch": 139.41, |
|
"grad_norm": 0.572669267654419, |
|
"learning_rate": 2.2064860426929394e-06, |
|
"loss": 0.4633, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 139.66, |
|
"grad_norm": 0.5936819314956665, |
|
"learning_rate": 2.1551724137931035e-06, |
|
"loss": 0.4657, |
|
"step": 283500 |
|
}, |
|
{ |
|
"epoch": 139.9, |
|
"grad_norm": 0.5847501754760742, |
|
"learning_rate": 2.1038587848932677e-06, |
|
"loss": 0.4664, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 140.15, |
|
"grad_norm": 0.514391303062439, |
|
"learning_rate": 2.052545155993432e-06, |
|
"loss": 0.4645, |
|
"step": 284500 |
|
}, |
|
{ |
|
"epoch": 140.39, |
|
"grad_norm": 0.6398583650588989, |
|
"learning_rate": 2.0012315270935964e-06, |
|
"loss": 0.4658, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 140.64, |
|
"grad_norm": 0.8860574960708618, |
|
"learning_rate": 1.94991789819376e-06, |
|
"loss": 0.4646, |
|
"step": 285500 |
|
}, |
|
{ |
|
"epoch": 140.89, |
|
"grad_norm": 0.5785859823226929, |
|
"learning_rate": 1.8986042692939245e-06, |
|
"loss": 0.4656, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 141.13, |
|
"grad_norm": 0.5147131681442261, |
|
"learning_rate": 1.8472906403940887e-06, |
|
"loss": 0.4636, |
|
"step": 286500 |
|
}, |
|
{ |
|
"epoch": 141.38, |
|
"grad_norm": 0.49100789427757263, |
|
"learning_rate": 1.7959770114942528e-06, |
|
"loss": 0.4641, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 141.63, |
|
"grad_norm": 0.5071477293968201, |
|
"learning_rate": 1.7446633825944172e-06, |
|
"loss": 0.4652, |
|
"step": 287500 |
|
}, |
|
{ |
|
"epoch": 141.87, |
|
"grad_norm": 0.5541560649871826, |
|
"learning_rate": 1.6933497536945813e-06, |
|
"loss": 0.4661, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 142.12, |
|
"grad_norm": 0.5373649597167969, |
|
"learning_rate": 1.6420361247947455e-06, |
|
"loss": 0.466, |
|
"step": 288500 |
|
}, |
|
{ |
|
"epoch": 142.36, |
|
"grad_norm": 0.6542106866836548, |
|
"learning_rate": 1.5907224958949098e-06, |
|
"loss": 0.4596, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 142.61, |
|
"grad_norm": 0.41288328170776367, |
|
"learning_rate": 1.539408866995074e-06, |
|
"loss": 0.4633, |
|
"step": 289500 |
|
}, |
|
{ |
|
"epoch": 142.86, |
|
"grad_norm": 0.6059596538543701, |
|
"learning_rate": 1.4880952380952381e-06, |
|
"loss": 0.4648, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 143.1, |
|
"grad_norm": 0.5577875375747681, |
|
"learning_rate": 1.4367816091954023e-06, |
|
"loss": 0.4688, |
|
"step": 290500 |
|
}, |
|
{ |
|
"epoch": 143.35, |
|
"grad_norm": 0.5939833521842957, |
|
"learning_rate": 1.3854679802955667e-06, |
|
"loss": 0.4653, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 143.6, |
|
"grad_norm": 0.5009748935699463, |
|
"learning_rate": 1.3341543513957308e-06, |
|
"loss": 0.4599, |
|
"step": 291500 |
|
}, |
|
{ |
|
"epoch": 143.84, |
|
"grad_norm": 0.6264510154724121, |
|
"learning_rate": 1.282840722495895e-06, |
|
"loss": 0.4619, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 144.09, |
|
"grad_norm": 0.4556388258934021, |
|
"learning_rate": 1.231527093596059e-06, |
|
"loss": 0.4654, |
|
"step": 292500 |
|
}, |
|
{ |
|
"epoch": 144.33, |
|
"grad_norm": 0.5879510045051575, |
|
"learning_rate": 1.1802134646962235e-06, |
|
"loss": 0.4672, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 144.58, |
|
"grad_norm": 0.515290379524231, |
|
"learning_rate": 1.1288998357963874e-06, |
|
"loss": 0.4622, |
|
"step": 293500 |
|
}, |
|
{ |
|
"epoch": 144.83, |
|
"grad_norm": 0.6605761647224426, |
|
"learning_rate": 1.0775862068965518e-06, |
|
"loss": 0.4621, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 145.07, |
|
"grad_norm": 0.5039830803871155, |
|
"learning_rate": 1.026272577996716e-06, |
|
"loss": 0.4619, |
|
"step": 294500 |
|
}, |
|
{ |
|
"epoch": 145.32, |
|
"grad_norm": 0.5800752639770508, |
|
"learning_rate": 9.7495894909688e-07, |
|
"loss": 0.4631, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 145.57, |
|
"grad_norm": 0.7062521576881409, |
|
"learning_rate": 9.236453201970443e-07, |
|
"loss": 0.464, |
|
"step": 295500 |
|
}, |
|
{ |
|
"epoch": 145.81, |
|
"grad_norm": 0.5099909901618958, |
|
"learning_rate": 8.723316912972086e-07, |
|
"loss": 0.4618, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 146.06, |
|
"grad_norm": 0.5134597420692444, |
|
"learning_rate": 8.210180623973727e-07, |
|
"loss": 0.4643, |
|
"step": 296500 |
|
}, |
|
{ |
|
"epoch": 146.31, |
|
"grad_norm": 0.497597336769104, |
|
"learning_rate": 7.69704433497537e-07, |
|
"loss": 0.4643, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 146.55, |
|
"grad_norm": 0.613549530506134, |
|
"learning_rate": 7.183908045977011e-07, |
|
"loss": 0.4598, |
|
"step": 297500 |
|
}, |
|
{ |
|
"epoch": 146.8, |
|
"grad_norm": 0.5238372683525085, |
|
"learning_rate": 6.670771756978654e-07, |
|
"loss": 0.4653, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 147.04, |
|
"grad_norm": 0.673534095287323, |
|
"learning_rate": 6.157635467980296e-07, |
|
"loss": 0.4617, |
|
"step": 298500 |
|
}, |
|
{ |
|
"epoch": 147.29, |
|
"grad_norm": 0.6518653631210327, |
|
"learning_rate": 5.644499178981937e-07, |
|
"loss": 0.4625, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 147.54, |
|
"grad_norm": 0.564731776714325, |
|
"learning_rate": 5.13136288998358e-07, |
|
"loss": 0.4607, |
|
"step": 299500 |
|
}, |
|
{ |
|
"epoch": 147.78, |
|
"grad_norm": 0.5317474603652954, |
|
"learning_rate": 4.6182266009852216e-07, |
|
"loss": 0.4635, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 148.03, |
|
"grad_norm": 0.5215335488319397, |
|
"learning_rate": 4.1050903119868637e-07, |
|
"loss": 0.4642, |
|
"step": 300500 |
|
}, |
|
{ |
|
"epoch": 148.28, |
|
"grad_norm": 0.5172483325004578, |
|
"learning_rate": 3.5919540229885057e-07, |
|
"loss": 0.4593, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 148.52, |
|
"grad_norm": 0.49218830466270447, |
|
"learning_rate": 3.078817733990148e-07, |
|
"loss": 0.466, |
|
"step": 301500 |
|
}, |
|
{ |
|
"epoch": 148.77, |
|
"grad_norm": 0.502910852432251, |
|
"learning_rate": 2.56568144499179e-07, |
|
"loss": 0.4622, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 149.01, |
|
"grad_norm": 0.4450142979621887, |
|
"learning_rate": 2.0525451559934318e-07, |
|
"loss": 0.4653, |
|
"step": 302500 |
|
}, |
|
{ |
|
"epoch": 149.26, |
|
"grad_norm": 0.6324329376220703, |
|
"learning_rate": 1.539408866995074e-07, |
|
"loss": 0.4606, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 149.51, |
|
"grad_norm": 0.5597870945930481, |
|
"learning_rate": 1.0262725779967159e-07, |
|
"loss": 0.4619, |
|
"step": 303500 |
|
}, |
|
{ |
|
"epoch": 149.75, |
|
"grad_norm": 0.6023632884025574, |
|
"learning_rate": 5.1313628899835796e-08, |
|
"loss": 0.4671, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 150.0, |
|
"grad_norm": 1.9860941171646118, |
|
"learning_rate": 0.0, |
|
"loss": 0.4623, |
|
"step": 304500 |
|
}, |
|
{
"epoch": 150.0,
"step": 304500,
"total_flos": 2.4610424381138534e+20,
"train_loss": 0.5040761357951047,
"train_runtime": 31987.2375,
"train_samples_per_second": 76.123,
"train_steps_per_second": 9.519
}
],
"logging_steps": 500,
"max_steps": 304500,
"num_input_tokens_seen": 0,
"num_train_epochs": 150,
"save_steps": 1000000000,
"total_flos": 2.4610424381138534e+20,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}