|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.5, |
|
"eval_steps": 1000, |
|
"global_step": 30000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 6.840716361999512, |
|
"learning_rate": 1.9600000000000002e-05, |
|
"loss": 3.0263, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.3429949283599854, |
|
"learning_rate": 3.960000000000001e-05, |
|
"loss": 1.6744, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 5.441232681274414, |
|
"learning_rate": 5.96e-05, |
|
"loss": 1.5933, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.98654842376709, |
|
"learning_rate": 7.960000000000001e-05, |
|
"loss": 1.5397, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.9203338623046875, |
|
"learning_rate": 9.960000000000001e-05, |
|
"loss": 1.5443, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.706545352935791, |
|
"learning_rate": 9.966779661016949e-05, |
|
"loss": 1.5593, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.7645795345306396, |
|
"learning_rate": 9.932881355932204e-05, |
|
"loss": 1.569, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.51504647731781, |
|
"learning_rate": 9.898983050847458e-05, |
|
"loss": 1.5094, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.9701757431030273, |
|
"learning_rate": 9.865084745762713e-05, |
|
"loss": 1.5007, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.04455304145813, |
|
"learning_rate": 9.831186440677966e-05, |
|
"loss": 1.5154, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 1.4852064847946167, |
|
"eval_runtime": 93.0778, |
|
"eval_samples_per_second": 10.744, |
|
"eval_steps_per_second": 2.686, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.8170721530914307, |
|
"learning_rate": 9.797288135593221e-05, |
|
"loss": 1.4842, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.3413264751434326, |
|
"learning_rate": 9.763389830508474e-05, |
|
"loss": 1.4884, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.753246784210205, |
|
"learning_rate": 9.72949152542373e-05, |
|
"loss": 1.464, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 3.4083104133605957, |
|
"learning_rate": 9.695593220338983e-05, |
|
"loss": 1.4739, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.3532192707061768, |
|
"learning_rate": 9.661694915254238e-05, |
|
"loss": 1.5031, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 2.6274845600128174, |
|
"learning_rate": 9.627796610169491e-05, |
|
"loss": 1.4936, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 3.692795991897583, |
|
"learning_rate": 9.593898305084746e-05, |
|
"loss": 1.4842, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 2.3271782398223877, |
|
"learning_rate": 9.56e-05, |
|
"loss": 1.527, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.4074125289916992, |
|
"learning_rate": 9.526101694915255e-05, |
|
"loss": 1.4918, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.9484646320343018, |
|
"learning_rate": 9.492203389830509e-05, |
|
"loss": 1.459, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"eval_loss": 1.489809274673462, |
|
"eval_runtime": 93.0028, |
|
"eval_samples_per_second": 10.752, |
|
"eval_steps_per_second": 2.688, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.8115673065185547, |
|
"learning_rate": 9.458305084745763e-05, |
|
"loss": 1.4285, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.803442358970642, |
|
"learning_rate": 9.424406779661016e-05, |
|
"loss": 1.5765, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.2882965803146362, |
|
"learning_rate": 9.39084745762712e-05, |
|
"loss": 1.4512, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.9917691946029663, |
|
"learning_rate": 9.356949152542374e-05, |
|
"loss": 1.5322, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.6726502180099487, |
|
"learning_rate": 9.323050847457627e-05, |
|
"loss": 1.5219, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 3.595461845397949, |
|
"learning_rate": 9.289152542372882e-05, |
|
"loss": 1.444, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 2.5381546020507812, |
|
"learning_rate": 9.255254237288135e-05, |
|
"loss": 1.4474, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 3.1601195335388184, |
|
"learning_rate": 9.22135593220339e-05, |
|
"loss": 1.4111, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 2.147916078567505, |
|
"learning_rate": 9.187457627118645e-05, |
|
"loss": 1.4551, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.6436530351638794, |
|
"learning_rate": 9.153559322033899e-05, |
|
"loss": 1.4915, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 1.458221197128296, |
|
"eval_runtime": 93.0923, |
|
"eval_samples_per_second": 10.742, |
|
"eval_steps_per_second": 2.686, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 2.2148969173431396, |
|
"learning_rate": 9.119661016949152e-05, |
|
"loss": 1.4816, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 2.3700318336486816, |
|
"learning_rate": 9.085762711864407e-05, |
|
"loss": 1.4501, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.5613566637039185, |
|
"learning_rate": 9.05186440677966e-05, |
|
"loss": 1.4646, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 2.175220251083374, |
|
"learning_rate": 9.017966101694916e-05, |
|
"loss": 1.5036, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 12.032076835632324, |
|
"learning_rate": 8.98406779661017e-05, |
|
"loss": 1.4563, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.2785927057266235, |
|
"learning_rate": 8.950169491525424e-05, |
|
"loss": 1.5178, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.5785298347473145, |
|
"learning_rate": 8.916271186440679e-05, |
|
"loss": 1.4257, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 3.1845338344573975, |
|
"learning_rate": 8.882372881355932e-05, |
|
"loss": 1.4742, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 2.2537214756011963, |
|
"learning_rate": 8.848474576271187e-05, |
|
"loss": 1.4538, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 2.092677354812622, |
|
"learning_rate": 8.814576271186441e-05, |
|
"loss": 1.4436, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"eval_loss": 1.4766532182693481, |
|
"eval_runtime": 93.0059, |
|
"eval_samples_per_second": 10.752, |
|
"eval_steps_per_second": 2.688, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 2.3880198001861572, |
|
"learning_rate": 8.780677966101696e-05, |
|
"loss": 1.4706, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.7497975826263428, |
|
"learning_rate": 8.746779661016949e-05, |
|
"loss": 1.4361, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 2.1048712730407715, |
|
"learning_rate": 8.713220338983051e-05, |
|
"loss": 1.4484, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 2.206047534942627, |
|
"learning_rate": 8.679322033898306e-05, |
|
"loss": 1.4685, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 2.012110471725464, |
|
"learning_rate": 8.64542372881356e-05, |
|
"loss": 1.429, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 2.0992116928100586, |
|
"learning_rate": 8.611525423728813e-05, |
|
"loss": 1.5028, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 4.423650741577148, |
|
"learning_rate": 8.577627118644068e-05, |
|
"loss": 1.4644, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.7697961330413818, |
|
"learning_rate": 8.543728813559323e-05, |
|
"loss": 1.4537, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 3.925752878189087, |
|
"learning_rate": 8.509830508474576e-05, |
|
"loss": 1.4298, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.335164189338684, |
|
"learning_rate": 8.475932203389832e-05, |
|
"loss": 1.4534, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"eval_loss": 1.4249181747436523, |
|
"eval_runtime": 93.056, |
|
"eval_samples_per_second": 10.746, |
|
"eval_steps_per_second": 2.687, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 2.5609335899353027, |
|
"learning_rate": 8.442033898305085e-05, |
|
"loss": 1.4444, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.4093282222747803, |
|
"learning_rate": 8.40813559322034e-05, |
|
"loss": 1.4737, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.9211537837982178, |
|
"learning_rate": 8.374237288135593e-05, |
|
"loss": 1.4108, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.1686183214187622, |
|
"learning_rate": 8.340338983050848e-05, |
|
"loss": 1.4603, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 2.1233601570129395, |
|
"learning_rate": 8.306440677966102e-05, |
|
"loss": 1.414, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.9452950954437256, |
|
"learning_rate": 8.272881355932204e-05, |
|
"loss": 1.3891, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 2.8304834365844727, |
|
"learning_rate": 8.238983050847457e-05, |
|
"loss": 1.4634, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 2.8340227603912354, |
|
"learning_rate": 8.205084745762712e-05, |
|
"loss": 1.4711, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.5472098588943481, |
|
"learning_rate": 8.171186440677967e-05, |
|
"loss": 1.4079, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.8752516508102417, |
|
"learning_rate": 8.137288135593221e-05, |
|
"loss": 1.4255, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 1.433774471282959, |
|
"eval_runtime": 93.0758, |
|
"eval_samples_per_second": 10.744, |
|
"eval_steps_per_second": 2.686, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 4.256100177764893, |
|
"learning_rate": 8.103389830508476e-05, |
|
"loss": 1.4102, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.966732144355774, |
|
"learning_rate": 8.069491525423729e-05, |
|
"loss": 1.4337, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.9053303599357605, |
|
"learning_rate": 8.035593220338984e-05, |
|
"loss": 1.4517, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 2.6552069187164307, |
|
"learning_rate": 8.001694915254237e-05, |
|
"loss": 1.459, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 2.2912251949310303, |
|
"learning_rate": 7.967796610169492e-05, |
|
"loss": 1.3857, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 2.4170544147491455, |
|
"learning_rate": 7.933898305084746e-05, |
|
"loss": 1.4032, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.7848442792892456, |
|
"learning_rate": 7.900000000000001e-05, |
|
"loss": 1.4374, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 4.551065444946289, |
|
"learning_rate": 7.866101694915254e-05, |
|
"loss": 1.4481, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.3571339845657349, |
|
"learning_rate": 7.832203389830509e-05, |
|
"loss": 1.4213, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.5277926921844482, |
|
"learning_rate": 7.798305084745763e-05, |
|
"loss": 1.429, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"eval_loss": 1.394229769706726, |
|
"eval_runtime": 93.0895, |
|
"eval_samples_per_second": 10.742, |
|
"eval_steps_per_second": 2.686, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 2.962740421295166, |
|
"learning_rate": 7.764406779661018e-05, |
|
"loss": 1.4396, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 3.3276052474975586, |
|
"learning_rate": 7.730508474576271e-05, |
|
"loss": 1.4628, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.9193934202194214, |
|
"learning_rate": 7.696610169491526e-05, |
|
"loss": 1.4234, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.859925627708435, |
|
"learning_rate": 7.662711864406779e-05, |
|
"loss": 1.4427, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 2.7623603343963623, |
|
"learning_rate": 7.628813559322034e-05, |
|
"loss": 1.4046, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 2.3409364223480225, |
|
"learning_rate": 7.594915254237289e-05, |
|
"loss": 1.4351, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.8644472360610962, |
|
"learning_rate": 7.561016949152543e-05, |
|
"loss": 1.4281, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 2.663987398147583, |
|
"learning_rate": 7.527118644067796e-05, |
|
"loss": 1.3743, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.3779089450836182, |
|
"learning_rate": 7.493220338983051e-05, |
|
"loss": 1.4531, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.2310881614685059, |
|
"learning_rate": 7.459322033898304e-05, |
|
"loss": 1.427, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"eval_loss": 1.4101628065109253, |
|
"eval_runtime": 92.9756, |
|
"eval_samples_per_second": 10.756, |
|
"eval_steps_per_second": 2.689, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 2.426481246948242, |
|
"learning_rate": 7.425762711864407e-05, |
|
"loss": 1.4215, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 3.706364154815674, |
|
"learning_rate": 7.391864406779662e-05, |
|
"loss": 1.3344, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 2.511159658432007, |
|
"learning_rate": 7.357966101694915e-05, |
|
"loss": 1.4132, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 2.31131911277771, |
|
"learning_rate": 7.32406779661017e-05, |
|
"loss": 1.4357, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 2.3403029441833496, |
|
"learning_rate": 7.290169491525423e-05, |
|
"loss": 1.455, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 2.386819362640381, |
|
"learning_rate": 7.256271186440679e-05, |
|
"loss": 1.374, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 12.603606224060059, |
|
"learning_rate": 7.222372881355932e-05, |
|
"loss": 1.461, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.5920355319976807, |
|
"learning_rate": 7.188474576271187e-05, |
|
"loss": 1.423, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 3.7238869667053223, |
|
"learning_rate": 7.15457627118644e-05, |
|
"loss": 1.3845, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 4.266007900238037, |
|
"learning_rate": 7.120677966101695e-05, |
|
"loss": 1.4293, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 1.4198664426803589, |
|
"eval_runtime": 93.0273, |
|
"eval_samples_per_second": 10.75, |
|
"eval_steps_per_second": 2.687, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.2583582401275635, |
|
"learning_rate": 7.08677966101695e-05, |
|
"loss": 1.4032, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 3.374390125274658, |
|
"learning_rate": 7.052881355932204e-05, |
|
"loss": 1.3469, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.431976318359375, |
|
"learning_rate": 7.018983050847457e-05, |
|
"loss": 1.3861, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.953474760055542, |
|
"learning_rate": 6.985084745762712e-05, |
|
"loss": 1.3406, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.3457430601119995, |
|
"learning_rate": 6.951186440677967e-05, |
|
"loss": 1.4132, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.713928699493408, |
|
"learning_rate": 6.91728813559322e-05, |
|
"loss": 1.4555, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 3.0444705486297607, |
|
"learning_rate": 6.883389830508476e-05, |
|
"loss": 1.3372, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 2.5544724464416504, |
|
"learning_rate": 6.849491525423729e-05, |
|
"loss": 1.4265, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 3.4634218215942383, |
|
"learning_rate": 6.815593220338984e-05, |
|
"loss": 1.4021, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 2.940263509750366, |
|
"learning_rate": 6.781694915254237e-05, |
|
"loss": 1.3975, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"eval_loss": 1.3409844636917114, |
|
"eval_runtime": 92.9614, |
|
"eval_samples_per_second": 10.757, |
|
"eval_steps_per_second": 2.689, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 2.2668895721435547, |
|
"learning_rate": 6.748135593220339e-05, |
|
"loss": 1.3879, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.5967912673950195, |
|
"learning_rate": 6.714237288135593e-05, |
|
"loss": 1.4197, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 4.100217819213867, |
|
"learning_rate": 6.680338983050848e-05, |
|
"loss": 1.4081, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.7082469463348389, |
|
"learning_rate": 6.646440677966101e-05, |
|
"loss": 1.4103, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.1128108501434326, |
|
"learning_rate": 6.612542372881356e-05, |
|
"loss": 1.44, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.1471731662750244, |
|
"learning_rate": 6.57864406779661e-05, |
|
"loss": 1.412, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 3.2873263359069824, |
|
"learning_rate": 6.544745762711865e-05, |
|
"loss": 1.4407, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 3.7035319805145264, |
|
"learning_rate": 6.51084745762712e-05, |
|
"loss": 1.3792, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 2.7462477684020996, |
|
"learning_rate": 6.476949152542373e-05, |
|
"loss": 1.3676, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.895967721939087, |
|
"learning_rate": 6.443050847457628e-05, |
|
"loss": 1.4092, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"eval_loss": 1.4003005027770996, |
|
"eval_runtime": 93.0106, |
|
"eval_samples_per_second": 10.751, |
|
"eval_steps_per_second": 2.688, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 2.039684295654297, |
|
"learning_rate": 6.409152542372881e-05, |
|
"loss": 1.4015, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.625591516494751, |
|
"learning_rate": 6.375254237288136e-05, |
|
"loss": 1.4362, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 3.4086389541625977, |
|
"learning_rate": 6.34135593220339e-05, |
|
"loss": 1.4192, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.9734954833984375, |
|
"learning_rate": 6.307457627118645e-05, |
|
"loss": 1.4413, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 3.1704461574554443, |
|
"learning_rate": 6.273559322033898e-05, |
|
"loss": 1.3856, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 3.659949541091919, |
|
"learning_rate": 6.239661016949153e-05, |
|
"loss": 1.4044, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.8390787839889526, |
|
"learning_rate": 6.205762711864406e-05, |
|
"loss": 1.3704, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.7644896507263184, |
|
"learning_rate": 6.171864406779662e-05, |
|
"loss": 1.4389, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 2.0395686626434326, |
|
"learning_rate": 6.137966101694915e-05, |
|
"loss": 1.397, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.8224962949752808, |
|
"learning_rate": 6.10406779661017e-05, |
|
"loss": 1.351, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.4065361022949219, |
|
"eval_runtime": 93.0021, |
|
"eval_samples_per_second": 10.752, |
|
"eval_steps_per_second": 2.688, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 3.049715995788574, |
|
"learning_rate": 6.070169491525424e-05, |
|
"loss": 1.3315, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 2.025336503982544, |
|
"learning_rate": 6.0362711864406786e-05, |
|
"loss": 1.3722, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 2.9472439289093018, |
|
"learning_rate": 6.002372881355932e-05, |
|
"loss": 1.3185, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 3.2052855491638184, |
|
"learning_rate": 5.968474576271187e-05, |
|
"loss": 1.3029, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 3.9783244132995605, |
|
"learning_rate": 5.9345762711864405e-05, |
|
"loss": 1.3617, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 6.090597152709961, |
|
"learning_rate": 5.900677966101695e-05, |
|
"loss": 1.4188, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 2.486492156982422, |
|
"learning_rate": 5.866779661016949e-05, |
|
"loss": 1.3771, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 2.2206759452819824, |
|
"learning_rate": 5.832881355932204e-05, |
|
"loss": 1.3583, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 2.092623710632324, |
|
"learning_rate": 5.7989830508474577e-05, |
|
"loss": 1.344, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 1.9661636352539062, |
|
"learning_rate": 5.765084745762712e-05, |
|
"loss": 1.3364, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"eval_loss": 1.3436501026153564, |
|
"eval_runtime": 92.9061, |
|
"eval_samples_per_second": 10.764, |
|
"eval_steps_per_second": 2.691, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 2.299858331680298, |
|
"learning_rate": 5.7311864406779656e-05, |
|
"loss": 1.327, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 1.999542474746704, |
|
"learning_rate": 5.697288135593221e-05, |
|
"loss": 1.3852, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 4.422431468963623, |
|
"learning_rate": 5.6633898305084755e-05, |
|
"loss": 1.3812, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 1.9813380241394043, |
|
"learning_rate": 5.629491525423729e-05, |
|
"loss": 1.3517, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 1.8180012702941895, |
|
"learning_rate": 5.595593220338984e-05, |
|
"loss": 1.3715, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 3.0727899074554443, |
|
"learning_rate": 5.5616949152542374e-05, |
|
"loss": 1.3278, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 1.4579344987869263, |
|
"learning_rate": 5.527796610169492e-05, |
|
"loss": 1.3649, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 1.9728525876998901, |
|
"learning_rate": 5.493898305084746e-05, |
|
"loss": 1.2894, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 3.2770729064941406, |
|
"learning_rate": 5.4600000000000006e-05, |
|
"loss": 1.3748, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 4.356542587280273, |
|
"learning_rate": 5.4261016949152545e-05, |
|
"loss": 1.3153, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"eval_loss": 1.3724653720855713, |
|
"eval_runtime": 92.8763, |
|
"eval_samples_per_second": 10.767, |
|
"eval_steps_per_second": 2.692, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 1.5841625928878784, |
|
"learning_rate": 5.392203389830509e-05, |
|
"loss": 1.3295, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 8.736273765563965, |
|
"learning_rate": 5.35864406779661e-05, |
|
"loss": 1.3146, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 3.8798937797546387, |
|
"learning_rate": 5.324745762711865e-05, |
|
"loss": 1.2944, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 3.1168253421783447, |
|
"learning_rate": 5.2908474576271194e-05, |
|
"loss": 1.2995, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 1.7689605951309204, |
|
"learning_rate": 5.256949152542373e-05, |
|
"loss": 1.3166, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 2.00744891166687, |
|
"learning_rate": 5.223050847457628e-05, |
|
"loss": 1.3385, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 1.9501924514770508, |
|
"learning_rate": 5.189152542372881e-05, |
|
"loss": 1.3564, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 2.614849328994751, |
|
"learning_rate": 5.1552542372881366e-05, |
|
"loss": 1.3326, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 2.0598857402801514, |
|
"learning_rate": 5.12135593220339e-05, |
|
"loss": 1.2322, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 2.12611985206604, |
|
"learning_rate": 5.0874576271186445e-05, |
|
"loss": 1.354, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_loss": 1.368465542793274, |
|
"eval_runtime": 93.0272, |
|
"eval_samples_per_second": 10.75, |
|
"eval_steps_per_second": 2.687, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 1.287361979484558, |
|
"learning_rate": 5.0535593220338984e-05, |
|
"loss": 1.3178, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 3.340104579925537, |
|
"learning_rate": 5.019661016949153e-05, |
|
"loss": 1.3893, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 2.4543209075927734, |
|
"learning_rate": 4.985762711864407e-05, |
|
"loss": 1.3597, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 2.6554243564605713, |
|
"learning_rate": 4.951864406779661e-05, |
|
"loss": 1.2903, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 1.685389518737793, |
|
"learning_rate": 4.917966101694915e-05, |
|
"loss": 1.2902, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 1.9464205503463745, |
|
"learning_rate": 4.88406779661017e-05, |
|
"loss": 1.3333, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 2.7877588272094727, |
|
"learning_rate": 4.850169491525424e-05, |
|
"loss": 1.3016, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 1.88291335105896, |
|
"learning_rate": 4.816271186440678e-05, |
|
"loss": 1.382, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 4.62669038772583, |
|
"learning_rate": 4.782372881355933e-05, |
|
"loss": 1.3343, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 2.525221824645996, |
|
"learning_rate": 4.748474576271187e-05, |
|
"loss": 1.2822, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"eval_loss": 1.3313556909561157, |
|
"eval_runtime": 93.0605, |
|
"eval_samples_per_second": 10.746, |
|
"eval_steps_per_second": 2.686, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 2.8074324131011963, |
|
"learning_rate": 4.714576271186441e-05, |
|
"loss": 1.3487, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 3.902251958847046, |
|
"learning_rate": 4.680677966101695e-05, |
|
"loss": 1.2796, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 3.72412109375, |
|
"learning_rate": 4.646779661016949e-05, |
|
"loss": 1.3161, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 1.7499101161956787, |
|
"learning_rate": 4.612881355932204e-05, |
|
"loss": 1.3477, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 2.252373695373535, |
|
"learning_rate": 4.578983050847458e-05, |
|
"loss": 1.3288, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 3.353426933288574, |
|
"learning_rate": 4.5450847457627125e-05, |
|
"loss": 1.3472, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 2.308408260345459, |
|
"learning_rate": 4.5111864406779664e-05, |
|
"loss": 1.3213, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 4.369659423828125, |
|
"learning_rate": 4.4772881355932204e-05, |
|
"loss": 1.2767, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 1.9831510782241821, |
|
"learning_rate": 4.443389830508475e-05, |
|
"loss": 1.2989, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 1.2581738233566284, |
|
"learning_rate": 4.409491525423729e-05, |
|
"loss": 1.3224, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"eval_loss": 1.3554507493972778, |
|
"eval_runtime": 92.9689, |
|
"eval_samples_per_second": 10.756, |
|
"eval_steps_per_second": 2.689, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 5.903542518615723, |
|
"learning_rate": 4.375593220338983e-05, |
|
"loss": 1.3135, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 2.0982234477996826, |
|
"learning_rate": 4.3416949152542375e-05, |
|
"loss": 1.3036, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 1.5168031454086304, |
|
"learning_rate": 4.3077966101694915e-05, |
|
"loss": 1.3223, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 2.655869722366333, |
|
"learning_rate": 4.273898305084746e-05, |
|
"loss": 1.3036, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 3.1604421138763428, |
|
"learning_rate": 4.24e-05, |
|
"loss": 1.3172, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 1.9133104085922241, |
|
"learning_rate": 4.206101694915254e-05, |
|
"loss": 1.3596, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 5.270731449127197, |
|
"learning_rate": 4.1722033898305086e-05, |
|
"loss": 1.3021, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 1.63645339012146, |
|
"learning_rate": 4.1383050847457626e-05, |
|
"loss": 1.3197, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 3.240687370300293, |
|
"learning_rate": 4.104406779661017e-05, |
|
"loss": 1.347, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 2.892014265060425, |
|
"learning_rate": 4.070508474576271e-05, |
|
"loss": 1.2844, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"eval_loss": 1.3619292974472046, |
|
"eval_runtime": 92.9703, |
|
"eval_samples_per_second": 10.756, |
|
"eval_steps_per_second": 2.689, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 1.62366783618927, |
|
"learning_rate": 4.036949152542373e-05, |
|
"loss": 1.3391, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 3.9142508506774902, |
|
"learning_rate": 4.0030508474576275e-05, |
|
"loss": 1.2751, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 3.78511381149292, |
|
"learning_rate": 3.9691525423728814e-05, |
|
"loss": 1.3169, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 2.912883758544922, |
|
"learning_rate": 3.9352542372881354e-05, |
|
"loss": 1.3224, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 2.096040725708008, |
|
"learning_rate": 3.90135593220339e-05, |
|
"loss": 1.2708, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 5.406456470489502, |
|
"learning_rate": 3.867457627118644e-05, |
|
"loss": 1.3068, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 1.6677026748657227, |
|
"learning_rate": 3.8335593220338986e-05, |
|
"loss": 1.3202, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 2.0527544021606445, |
|
"learning_rate": 3.7996610169491525e-05, |
|
"loss": 1.2886, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 3.8199574947357178, |
|
"learning_rate": 3.7657627118644065e-05, |
|
"loss": 1.3367, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 2.5120060443878174, |
|
"learning_rate": 3.731864406779661e-05, |
|
"loss": 1.2945, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"eval_loss": 1.3452004194259644, |
|
"eval_runtime": 93.0461, |
|
"eval_samples_per_second": 10.747, |
|
"eval_steps_per_second": 2.687, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 1.0111308097839355, |
|
"learning_rate": 3.697966101694915e-05, |
|
"loss": 1.3713, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 2.966914176940918, |
|
"learning_rate": 3.66406779661017e-05, |
|
"loss": 1.3124, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 4.211293697357178, |
|
"learning_rate": 3.630508474576271e-05, |
|
"loss": 1.3617, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 1.8540898561477661, |
|
"learning_rate": 3.596610169491525e-05, |
|
"loss": 1.3393, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 4.376151084899902, |
|
"learning_rate": 3.56271186440678e-05, |
|
"loss": 1.304, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 3.297484874725342, |
|
"learning_rate": 3.528813559322034e-05, |
|
"loss": 1.276, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 3.258236885070801, |
|
"learning_rate": 3.4949152542372885e-05, |
|
"loss": 1.3191, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 4.905130863189697, |
|
"learning_rate": 3.4610169491525425e-05, |
|
"loss": 1.3228, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 2.611107110977173, |
|
"learning_rate": 3.4271186440677964e-05, |
|
"loss": 1.2464, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 4.044171333312988, |
|
"learning_rate": 3.393220338983051e-05, |
|
"loss": 1.2852, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"eval_loss": 1.322507619857788, |
|
"eval_runtime": 93.0207, |
|
"eval_samples_per_second": 10.75, |
|
"eval_steps_per_second": 2.688, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 2.803262710571289, |
|
"learning_rate": 3.359322033898305e-05, |
|
"loss": 1.32, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 1.6002579927444458, |
|
"learning_rate": 3.325423728813559e-05, |
|
"loss": 1.2831, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 5.442947864532471, |
|
"learning_rate": 3.2915254237288136e-05, |
|
"loss": 1.2849, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 3.7041471004486084, |
|
"learning_rate": 3.257627118644068e-05, |
|
"loss": 1.2846, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 4.188296794891357, |
|
"learning_rate": 3.223728813559322e-05, |
|
"loss": 1.3026, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 3.6854124069213867, |
|
"learning_rate": 3.189830508474577e-05, |
|
"loss": 1.2429, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 3.1174068450927734, |
|
"learning_rate": 3.155932203389831e-05, |
|
"loss": 1.2728, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 1.2928777933120728, |
|
"learning_rate": 3.1220338983050854e-05, |
|
"loss": 1.291, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 3.058570146560669, |
|
"learning_rate": 3.088135593220339e-05, |
|
"loss": 1.2786, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 3.311408042907715, |
|
"learning_rate": 3.054237288135593e-05, |
|
"loss": 1.2607, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"eval_loss": 1.2871655225753784, |
|
"eval_runtime": 93.0004, |
|
"eval_samples_per_second": 10.753, |
|
"eval_steps_per_second": 2.688, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 3.0149199962615967, |
|
"learning_rate": 3.0203389830508476e-05, |
|
"loss": 1.2897, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 3.8012924194335938, |
|
"learning_rate": 2.986440677966102e-05, |
|
"loss": 1.3003, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 1.5058399438858032, |
|
"learning_rate": 2.952542372881356e-05, |
|
"loss": 1.245, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 5.085143089294434, |
|
"learning_rate": 2.9186440677966104e-05, |
|
"loss": 1.2702, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 1.6351230144500732, |
|
"learning_rate": 2.8847457627118647e-05, |
|
"loss": 1.28, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 3.3916802406311035, |
|
"learning_rate": 2.8508474576271187e-05, |
|
"loss": 1.2621, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 2.4449045658111572, |
|
"learning_rate": 2.816949152542373e-05, |
|
"loss": 1.2968, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 2.8779613971710205, |
|
"learning_rate": 2.7830508474576273e-05, |
|
"loss": 1.3546, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 1.7994945049285889, |
|
"learning_rate": 2.7491525423728816e-05, |
|
"loss": 1.2766, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 3.81644344329834, |
|
"learning_rate": 2.7152542372881355e-05, |
|
"loss": 1.2648, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"eval_loss": 1.322618007659912, |
|
"eval_runtime": 92.9974, |
|
"eval_samples_per_second": 10.753, |
|
"eval_steps_per_second": 2.688, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 3.8521928787231445, |
|
"learning_rate": 2.6813559322033898e-05, |
|
"loss": 1.2689, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 3.50951886177063, |
|
"learning_rate": 2.647457627118644e-05, |
|
"loss": 1.3133, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 2.187098979949951, |
|
"learning_rate": 2.6135593220338984e-05, |
|
"loss": 1.2842, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 2.276773691177368, |
|
"learning_rate": 2.5796610169491527e-05, |
|
"loss": 1.252, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 1.8711966276168823, |
|
"learning_rate": 2.5457627118644066e-05, |
|
"loss": 1.2994, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 2.4090774059295654, |
|
"learning_rate": 2.511864406779661e-05, |
|
"loss": 1.2686, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 4.18037748336792, |
|
"learning_rate": 2.4779661016949156e-05, |
|
"loss": 1.318, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 3.396822452545166, |
|
"learning_rate": 2.4440677966101695e-05, |
|
"loss": 1.27, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 3.4309394359588623, |
|
"learning_rate": 2.4101694915254238e-05, |
|
"loss": 1.2888, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 1.140385627746582, |
|
"learning_rate": 2.376271186440678e-05, |
|
"loss": 1.2936, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"eval_loss": 1.3380318880081177, |
|
"eval_runtime": 93.0298, |
|
"eval_samples_per_second": 10.749, |
|
"eval_steps_per_second": 2.687, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 1.4485564231872559, |
|
"learning_rate": 2.3423728813559324e-05, |
|
"loss": 1.2744, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 2.8173444271087646, |
|
"learning_rate": 2.3084745762711867e-05, |
|
"loss": 1.2774, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 2.9735991954803467, |
|
"learning_rate": 2.2745762711864406e-05, |
|
"loss": 1.3241, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 2.011720657348633, |
|
"learning_rate": 2.240677966101695e-05, |
|
"loss": 1.2912, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 2.2044718265533447, |
|
"learning_rate": 2.2067796610169492e-05, |
|
"loss": 1.2904, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 3.7060163021087646, |
|
"learning_rate": 2.1728813559322035e-05, |
|
"loss": 1.2289, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 4.124302864074707, |
|
"learning_rate": 2.1389830508474575e-05, |
|
"loss": 1.2544, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 3.379523992538452, |
|
"learning_rate": 2.105084745762712e-05, |
|
"loss": 1.2604, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 7.474296569824219, |
|
"learning_rate": 2.0711864406779664e-05, |
|
"loss": 1.274, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 5.122653007507324, |
|
"learning_rate": 2.0372881355932207e-05, |
|
"loss": 1.2985, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.274635910987854, |
|
"eval_runtime": 93.0906, |
|
"eval_samples_per_second": 10.742, |
|
"eval_steps_per_second": 2.686, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 2.563617467880249, |
|
"learning_rate": 2.0033898305084746e-05, |
|
"loss": 1.1996, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 3.481621265411377, |
|
"learning_rate": 1.969491525423729e-05, |
|
"loss": 1.2156, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 2.6262869834899902, |
|
"learning_rate": 1.9355932203389832e-05, |
|
"loss": 1.221, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 3.437523603439331, |
|
"learning_rate": 1.9016949152542375e-05, |
|
"loss": 1.1814, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 4.948596477508545, |
|
"learning_rate": 1.8677966101694915e-05, |
|
"loss": 1.1972, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 3.0667967796325684, |
|
"learning_rate": 1.8338983050847458e-05, |
|
"loss": 1.2229, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 2.1678037643432617, |
|
"learning_rate": 1.8e-05, |
|
"loss": 1.182, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 2.7943167686462402, |
|
"learning_rate": 1.7661016949152543e-05, |
|
"loss": 1.1889, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 1.5044032335281372, |
|
"learning_rate": 1.7322033898305083e-05, |
|
"loss": 1.1796, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 3.931037425994873, |
|
"learning_rate": 1.6983050847457626e-05, |
|
"loss": 1.2159, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"eval_loss": 1.33604097366333, |
|
"eval_runtime": 93.1418, |
|
"eval_samples_per_second": 10.736, |
|
"eval_steps_per_second": 2.684, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 5.764918804168701, |
|
"learning_rate": 1.6644067796610172e-05, |
|
"loss": 1.1995, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 2.17740535736084, |
|
"learning_rate": 1.6305084745762715e-05, |
|
"loss": 1.1873, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 3.145355224609375, |
|
"learning_rate": 1.5966101694915255e-05, |
|
"loss": 1.2101, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 5.353292942047119, |
|
"learning_rate": 1.563050847457627e-05, |
|
"loss": 1.2076, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 2.399566411972046, |
|
"learning_rate": 1.5291525423728814e-05, |
|
"loss": 1.2099, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 3.8052713871002197, |
|
"learning_rate": 1.4952542372881357e-05, |
|
"loss": 1.1932, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 4.528957843780518, |
|
"learning_rate": 1.4613559322033898e-05, |
|
"loss": 1.1838, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 3.597487449645996, |
|
"learning_rate": 1.4274576271186441e-05, |
|
"loss": 1.187, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 3.2705302238464355, |
|
"learning_rate": 1.3935593220338982e-05, |
|
"loss": 1.1829, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 3.251126289367676, |
|
"learning_rate": 1.3596610169491525e-05, |
|
"loss": 1.1888, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"eval_loss": 1.3534026145935059, |
|
"eval_runtime": 93.1511, |
|
"eval_samples_per_second": 10.735, |
|
"eval_steps_per_second": 2.684, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 3.9193084239959717, |
|
"learning_rate": 1.325762711864407e-05, |
|
"loss": 1.2176, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 1.7747070789337158, |
|
"learning_rate": 1.2918644067796613e-05, |
|
"loss": 1.1845, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 1.7094638347625732, |
|
"learning_rate": 1.2579661016949154e-05, |
|
"loss": 1.2025, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 4.238964557647705, |
|
"learning_rate": 1.2240677966101695e-05, |
|
"loss": 1.1665, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 1.669313669204712, |
|
"learning_rate": 1.1901694915254238e-05, |
|
"loss": 1.1792, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 8.226383209228516, |
|
"learning_rate": 1.1562711864406781e-05, |
|
"loss": 1.1591, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 3.0040552616119385, |
|
"learning_rate": 1.1223728813559322e-05, |
|
"loss": 1.2112, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 1.9113960266113281, |
|
"learning_rate": 1.0884745762711865e-05, |
|
"loss": 1.164, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 2.2811169624328613, |
|
"learning_rate": 1.0545762711864406e-05, |
|
"loss": 1.1807, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 3.2554876804351807, |
|
"learning_rate": 1.020677966101695e-05, |
|
"loss": 1.2362, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"eval_loss": 1.315274715423584, |
|
"eval_runtime": 93.1384, |
|
"eval_samples_per_second": 10.737, |
|
"eval_steps_per_second": 2.684, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 3.656125545501709, |
|
"learning_rate": 9.867796610169492e-06, |
|
"loss": 1.1712, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 4.444190502166748, |
|
"learning_rate": 9.528813559322035e-06, |
|
"loss": 1.2532, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 5.304549217224121, |
|
"learning_rate": 9.189830508474576e-06, |
|
"loss": 1.1848, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 4.003425121307373, |
|
"learning_rate": 8.85084745762712e-06, |
|
"loss": 1.1601, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 4.701766490936279, |
|
"learning_rate": 8.511864406779662e-06, |
|
"loss": 1.1682, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 4.16826057434082, |
|
"learning_rate": 8.172881355932203e-06, |
|
"loss": 1.1806, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 3.569227695465088, |
|
"learning_rate": 7.833898305084746e-06, |
|
"loss": 1.1295, |
|
"step": 27700 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 6.8756890296936035, |
|
"learning_rate": 7.494915254237289e-06, |
|
"loss": 1.1863, |
|
"step": 27800 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 4.841982364654541, |
|
"learning_rate": 7.155932203389831e-06, |
|
"loss": 1.1651, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 6.766154766082764, |
|
"learning_rate": 6.816949152542373e-06, |
|
"loss": 1.1831, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"eval_loss": 1.3225314617156982, |
|
"eval_runtime": 93.1704, |
|
"eval_samples_per_second": 10.733, |
|
"eval_steps_per_second": 2.683, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 4.122812747955322, |
|
"learning_rate": 6.477966101694915e-06, |
|
"loss": 1.1934, |
|
"step": 28100 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 2.2169628143310547, |
|
"learning_rate": 6.1389830508474575e-06, |
|
"loss": 1.1044, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 3.0423035621643066, |
|
"learning_rate": 5.8e-06, |
|
"loss": 1.1876, |
|
"step": 28300 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 3.517133951187134, |
|
"learning_rate": 5.4610169491525424e-06, |
|
"loss": 1.2411, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 3.3230624198913574, |
|
"learning_rate": 5.1220338983050845e-06, |
|
"loss": 1.189, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 5.101288795471191, |
|
"learning_rate": 4.7830508474576274e-06, |
|
"loss": 1.1435, |
|
"step": 28600 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 3.061377763748169, |
|
"learning_rate": 4.4440677966101695e-06, |
|
"loss": 1.1902, |
|
"step": 28700 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 3.9341752529144287, |
|
"learning_rate": 4.1050847457627124e-06, |
|
"loss": 1.22, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 3.691490888595581, |
|
"learning_rate": 3.766101694915254e-06, |
|
"loss": 1.2105, |
|
"step": 28900 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 3.67085599899292, |
|
"learning_rate": 3.427118644067797e-06, |
|
"loss": 1.1909, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"eval_loss": 1.294958472251892, |
|
"eval_runtime": 93.09, |
|
"eval_samples_per_second": 10.742, |
|
"eval_steps_per_second": 2.686, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 4.373409748077393, |
|
"learning_rate": 3.088135593220339e-06, |
|
"loss": 1.2423, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 4.1964006423950195, |
|
"learning_rate": 2.7491525423728816e-06, |
|
"loss": 1.1482, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 4.233890056610107, |
|
"learning_rate": 2.4135593220338984e-06, |
|
"loss": 1.1824, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 3.9279072284698486, |
|
"learning_rate": 2.0745762711864404e-06, |
|
"loss": 1.1466, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 3.39454984664917, |
|
"learning_rate": 1.7355932203389831e-06, |
|
"loss": 1.1302, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 1.3635748624801636, |
|
"learning_rate": 1.3966101694915254e-06, |
|
"loss": 1.214, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 3.5587875843048096, |
|
"learning_rate": 1.057627118644068e-06, |
|
"loss": 1.1981, |
|
"step": 29700 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 7.449352741241455, |
|
"learning_rate": 7.186440677966102e-07, |
|
"loss": 1.1196, |
|
"step": 29800 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 2.7902631759643555, |
|
"learning_rate": 3.796610169491526e-07, |
|
"loss": 1.1452, |
|
"step": 29900 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 4.423699855804443, |
|
"learning_rate": 4.067796610169492e-08, |
|
"loss": 1.197, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_loss": 1.2965623140335083, |
|
"eval_runtime": 93.1115, |
|
"eval_samples_per_second": 10.74, |
|
"eval_steps_per_second": 2.685, |
|
"step": 30000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 30000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 2000, |
|
"total_flos": 4.8306377981952e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|