{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 48.184357541899445,
  "eval_steps": 500,
  "global_step": 34500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.7,
      "grad_norm": 4.0421319007873535,
      "learning_rate": 1.9720670391061455e-05,
      "loss": 0.408,
      "step": 500
    },
    {
      "epoch": 1.4,
      "grad_norm": 4.579340934753418,
      "learning_rate": 1.9441340782122907e-05,
      "loss": 0.303,
      "step": 1000
    },
    {
      "epoch": 2.09,
      "grad_norm": 5.2814459800720215,
      "learning_rate": 1.9162011173184357e-05,
      "loss": 0.2487,
      "step": 1500
    },
    {
      "epoch": 2.79,
      "grad_norm": 4.263625144958496,
      "learning_rate": 1.888268156424581e-05,
      "loss": 0.1642,
      "step": 2000
    },
    {
      "epoch": 3.49,
      "grad_norm": 16.92409896850586,
      "learning_rate": 1.8603351955307266e-05,
      "loss": 0.1116,
      "step": 2500
    },
    {
      "epoch": 4.19,
      "grad_norm": 0.10890375822782516,
      "learning_rate": 1.8324022346368716e-05,
      "loss": 0.0992,
      "step": 3000
    },
    {
      "epoch": 4.89,
      "grad_norm": 4.843805313110352,
      "learning_rate": 1.804469273743017e-05,
      "loss": 0.0723,
      "step": 3500
    },
    {
      "epoch": 5.59,
      "grad_norm": 0.6422103643417358,
      "learning_rate": 1.776536312849162e-05,
      "loss": 0.0543,
      "step": 4000
    },
    {
      "epoch": 6.28,
      "grad_norm": 0.04145967587828636,
      "learning_rate": 1.7486033519553075e-05,
      "loss": 0.0479,
      "step": 4500
    },
    {
      "epoch": 6.98,
      "grad_norm": 0.18585649132728577,
      "learning_rate": 1.7206703910614527e-05,
      "loss": 0.0472,
      "step": 5000
    },
    {
      "epoch": 7.68,
      "grad_norm": 1.1734449863433838,
      "learning_rate": 1.6927374301675977e-05,
      "loss": 0.037,
      "step": 5500
    },
    {
      "epoch": 8.38,
      "grad_norm": 5.980381965637207,
      "learning_rate": 1.664804469273743e-05,
      "loss": 0.0352,
      "step": 6000
    },
    {
      "epoch": 9.08,
      "grad_norm": 0.02630673162639141,
      "learning_rate": 1.6368715083798886e-05,
      "loss": 0.031,
      "step": 6500
    },
    {
      "epoch": 9.78,
      "grad_norm": 0.34635427594184875,
      "learning_rate": 1.6089385474860336e-05,
      "loss": 0.0249,
      "step": 7000
    },
    {
      "epoch": 10.47,
      "grad_norm": 0.038617122918367386,
      "learning_rate": 1.581005586592179e-05,
      "loss": 0.0236,
      "step": 7500
    },
    {
      "epoch": 11.17,
      "grad_norm": 0.011197003535926342,
      "learning_rate": 1.553072625698324e-05,
      "loss": 0.024,
      "step": 8000
    },
    {
      "epoch": 11.87,
      "grad_norm": 0.04000287503004074,
      "learning_rate": 1.5251396648044694e-05,
      "loss": 0.0257,
      "step": 8500
    },
    {
      "epoch": 12.57,
      "grad_norm": 0.01364846620708704,
      "learning_rate": 1.4972067039106146e-05,
      "loss": 0.0185,
      "step": 9000
    },
    {
      "epoch": 13.27,
      "grad_norm": 0.030796006321907043,
      "learning_rate": 1.4692737430167599e-05,
      "loss": 0.0193,
      "step": 9500
    },
    {
      "epoch": 13.97,
      "grad_norm": 0.008376175537705421,
      "learning_rate": 1.4413407821229052e-05,
      "loss": 0.0177,
      "step": 10000
    },
    {
      "epoch": 14.66,
      "grad_norm": 0.004392937291413546,
      "learning_rate": 1.4134078212290503e-05,
      "loss": 0.0127,
      "step": 10500
    },
    {
      "epoch": 15.36,
      "grad_norm": 0.0029910472221672535,
      "learning_rate": 1.3854748603351957e-05,
      "loss": 0.014,
      "step": 11000
    },
    {
      "epoch": 16.06,
      "grad_norm": 0.0029884399846196175,
      "learning_rate": 1.357541899441341e-05,
      "loss": 0.0156,
      "step": 11500
    },
    {
      "epoch": 16.76,
      "grad_norm": 7.065054416656494,
      "learning_rate": 1.3296089385474861e-05,
      "loss": 0.0132,
      "step": 12000
    },
    {
      "epoch": 17.46,
      "grad_norm": 0.002008031355217099,
      "learning_rate": 1.3016759776536314e-05,
      "loss": 0.0105,
      "step": 12500
    },
    {
      "epoch": 18.16,
      "grad_norm": 0.004853041376918554,
      "learning_rate": 1.2737430167597766e-05,
      "loss": 0.0091,
      "step": 13000
    },
    {
      "epoch": 18.85,
      "grad_norm": 0.009278771467506886,
      "learning_rate": 1.2458100558659219e-05,
      "loss": 0.0144,
      "step": 13500
    },
    {
      "epoch": 19.55,
      "grad_norm": 0.002231143182143569,
      "learning_rate": 1.2178770949720671e-05,
      "loss": 0.0099,
      "step": 14000
    },
    {
      "epoch": 20.25,
      "grad_norm": 0.003844016930088401,
      "learning_rate": 1.1899441340782123e-05,
      "loss": 0.0132,
      "step": 14500
    },
    {
      "epoch": 20.95,
      "grad_norm": 0.0021888066548854113,
      "learning_rate": 1.1620111731843577e-05,
      "loss": 0.0126,
      "step": 15000
    },
    {
      "epoch": 21.65,
      "grad_norm": 16.925048828125,
      "learning_rate": 1.134078212290503e-05,
      "loss": 0.0077,
      "step": 15500
    },
    {
      "epoch": 22.35,
      "grad_norm": 0.0013325487961992621,
      "learning_rate": 1.1061452513966481e-05,
      "loss": 0.0108,
      "step": 16000
    },
    {
      "epoch": 23.04,
      "grad_norm": 0.0013474252773448825,
      "learning_rate": 1.0782122905027934e-05,
      "loss": 0.0078,
      "step": 16500
    },
    {
      "epoch": 23.74,
      "grad_norm": 0.0014491812326014042,
      "learning_rate": 1.0502793296089386e-05,
      "loss": 0.0048,
      "step": 17000
    },
    {
      "epoch": 24.44,
      "grad_norm": 0.0027435130905359983,
      "learning_rate": 1.0223463687150838e-05,
      "loss": 0.0072,
      "step": 17500
    },
    {
      "epoch": 25.14,
      "grad_norm": 0.06291136890649796,
      "learning_rate": 9.944134078212291e-06,
      "loss": 0.008,
      "step": 18000
    },
    {
      "epoch": 25.84,
      "grad_norm": 0.021899724379181862,
      "learning_rate": 9.664804469273744e-06,
      "loss": 0.0087,
      "step": 18500
    },
    {
      "epoch": 26.54,
      "grad_norm": 15.27237319946289,
      "learning_rate": 9.385474860335197e-06,
      "loss": 0.0052,
      "step": 19000
    },
    {
      "epoch": 27.23,
      "grad_norm": 0.009569833986461163,
      "learning_rate": 9.106145251396648e-06,
      "loss": 0.0073,
      "step": 19500
    },
    {
      "epoch": 27.93,
      "grad_norm": 0.0003999493783339858,
      "learning_rate": 8.826815642458101e-06,
      "loss": 0.0048,
      "step": 20000
    },
    {
      "epoch": 28.63,
      "grad_norm": 0.0033733926247805357,
      "learning_rate": 8.547486033519554e-06,
      "loss": 0.0056,
      "step": 20500
    },
    {
      "epoch": 29.33,
      "grad_norm": 0.04343196749687195,
      "learning_rate": 8.268156424581007e-06,
      "loss": 0.0056,
      "step": 21000
    },
    {
      "epoch": 30.03,
      "grad_norm": 0.0003582398348953575,
      "learning_rate": 7.988826815642458e-06,
      "loss": 0.0031,
      "step": 21500
    },
    {
      "epoch": 30.73,
      "grad_norm": 0.00021692673908546567,
      "learning_rate": 7.709497206703911e-06,
      "loss": 0.0035,
      "step": 22000
    },
    {
      "epoch": 31.42,
      "grad_norm": 0.00029874147730879486,
      "learning_rate": 7.430167597765364e-06,
      "loss": 0.0026,
      "step": 22500
    },
    {
      "epoch": 32.12,
      "grad_norm": 0.0002098192780977115,
      "learning_rate": 7.150837988826816e-06,
      "loss": 0.0048,
      "step": 23000
    },
    {
      "epoch": 32.82,
      "grad_norm": 0.010424572043120861,
      "learning_rate": 6.871508379888268e-06,
      "loss": 0.0054,
      "step": 23500
    },
    {
      "epoch": 33.52,
      "grad_norm": 0.1450434923171997,
      "learning_rate": 6.592178770949721e-06,
      "loss": 0.0031,
      "step": 24000
    },
    {
      "epoch": 34.22,
      "grad_norm": 0.0003349222242832184,
      "learning_rate": 6.312849162011173e-06,
      "loss": 0.0029,
      "step": 24500
    },
    {
      "epoch": 34.92,
      "grad_norm": 0.0007802957552485168,
      "learning_rate": 6.033519553072626e-06,
      "loss": 0.0023,
      "step": 25000
    },
    {
      "epoch": 35.61,
      "grad_norm": 0.0005215730634517968,
      "learning_rate": 5.754189944134078e-06,
      "loss": 0.0021,
      "step": 25500
    },
    {
      "epoch": 36.31,
      "grad_norm": 0.00011289090616628528,
      "learning_rate": 5.474860335195531e-06,
      "loss": 0.0026,
      "step": 26000
    },
    {
      "epoch": 37.01,
      "grad_norm": 0.013641629368066788,
      "learning_rate": 5.195530726256983e-06,
      "loss": 0.0032,
      "step": 26500
    },
    {
      "epoch": 37.71,
      "grad_norm": 0.0002486356534063816,
      "learning_rate": 4.916201117318436e-06,
      "loss": 0.002,
      "step": 27000
    },
    {
      "epoch": 38.41,
      "grad_norm": 9.347203013021499e-05,
      "learning_rate": 4.636871508379888e-06,
      "loss": 0.0012,
      "step": 27500
    },
    {
      "epoch": 39.11,
      "grad_norm": 0.00012943429464939982,
      "learning_rate": 4.357541899441341e-06,
      "loss": 0.0026,
      "step": 28000
    },
    {
      "epoch": 39.8,
      "grad_norm": 0.001636040979065001,
      "learning_rate": 4.078212290502794e-06,
      "loss": 0.0021,
      "step": 28500
    },
    {
      "epoch": 40.5,
      "grad_norm": 0.009456774219870567,
      "learning_rate": 3.798882681564246e-06,
      "loss": 0.0016,
      "step": 29000
    },
    {
      "epoch": 41.2,
      "grad_norm": 0.000818349071778357,
      "learning_rate": 3.5195530726256988e-06,
      "loss": 0.0011,
      "step": 29500
    },
    {
      "epoch": 41.9,
      "grad_norm": 6.146980012999848e-05,
      "learning_rate": 3.240223463687151e-06,
      "loss": 0.0016,
      "step": 30000
    },
    {
      "epoch": 42.6,
      "grad_norm": 4.060289211338386e-05,
      "learning_rate": 2.9608938547486037e-06,
      "loss": 0.0007,
      "step": 30500
    },
    {
      "epoch": 43.3,
      "grad_norm": 0.00016211264301091433,
      "learning_rate": 2.6815642458100562e-06,
      "loss": 0.0032,
      "step": 31000
    },
    {
      "epoch": 43.99,
      "grad_norm": 0.0002249513054266572,
      "learning_rate": 2.4022346368715087e-06,
      "loss": 0.0011,
      "step": 31500
    },
    {
      "epoch": 44.69,
      "grad_norm": 0.00012292143946979195,
      "learning_rate": 2.1229050279329612e-06,
      "loss": 0.0006,
      "step": 32000
    },
    {
      "epoch": 45.39,
      "grad_norm": 5.348429112927988e-05,
      "learning_rate": 1.8435754189944135e-06,
      "loss": 0.0007,
      "step": 32500
    },
    {
      "epoch": 46.09,
      "grad_norm": 5.2925995987607166e-05,
      "learning_rate": 1.564245810055866e-06,
      "loss": 0.0021,
      "step": 33000
    },
    {
      "epoch": 46.79,
      "grad_norm": 6.103613850427791e-05,
      "learning_rate": 1.2849162011173185e-06,
      "loss": 0.0009,
      "step": 33500
    },
    {
      "epoch": 47.49,
      "grad_norm": 0.0004408392414916307,
      "learning_rate": 1.005586592178771e-06,
      "loss": 0.0013,
      "step": 34000
    },
    {
      "epoch": 48.18,
      "grad_norm": 0.0011864238185808063,
      "learning_rate": 7.262569832402236e-07,
      "loss": 0.0013,
      "step": 34500
    }
  ],
  "logging_steps": 500,
  "max_steps": 35800,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 500,
  "total_flos": 2.970794017734924e+16,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}