|
{ |
|
"best_metric": 0.8066528066528067, |
|
"best_model_checkpoint": "test-hasy-5/checkpoint-18935", |
|
"epoch": 100.0, |
|
"eval_steps": 500, |
|
"global_step": 54100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 4.6960930824279785, |
|
"learning_rate": 1.9815157116451017e-05, |
|
"loss": 3.9645, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.3970893970893971, |
|
"eval_loss": 3.429507255554199, |
|
"eval_runtime": 2.0488, |
|
"eval_samples_per_second": 234.77, |
|
"eval_steps_per_second": 29.773, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 3.364806652069092, |
|
"learning_rate": 1.9630314232902035e-05, |
|
"loss": 3.4258, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.4781704781704782, |
|
"eval_loss": 2.879011392593384, |
|
"eval_runtime": 1.9708, |
|
"eval_samples_per_second": 244.059, |
|
"eval_steps_per_second": 30.951, |
|
"step": 1082 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 3.35432767868042, |
|
"learning_rate": 1.944547134935305e-05, |
|
"loss": 3.04, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.5467775467775468, |
|
"eval_loss": 2.4893012046813965, |
|
"eval_runtime": 2.0175, |
|
"eval_samples_per_second": 238.411, |
|
"eval_steps_per_second": 30.235, |
|
"step": 1623 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 4.459615230560303, |
|
"learning_rate": 1.9260628465804068e-05, |
|
"loss": 2.793, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.5738045738045738, |
|
"eval_loss": 2.2005958557128906, |
|
"eval_runtime": 1.962, |
|
"eval_samples_per_second": 245.161, |
|
"eval_steps_per_second": 31.091, |
|
"step": 2164 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 4.481846332550049, |
|
"learning_rate": 1.9075785582255083e-05, |
|
"loss": 2.5551, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.6340956340956341, |
|
"eval_loss": 1.9055824279785156, |
|
"eval_runtime": 2.0221, |
|
"eval_samples_per_second": 237.875, |
|
"eval_steps_per_second": 30.167, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 5.55, |
|
"grad_norm": 4.1784281730651855, |
|
"learning_rate": 1.88909426987061e-05, |
|
"loss": 2.3662, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.6632016632016632, |
|
"eval_loss": 1.7023240327835083, |
|
"eval_runtime": 1.9666, |
|
"eval_samples_per_second": 244.588, |
|
"eval_steps_per_second": 31.018, |
|
"step": 3246 |
|
}, |
|
{ |
|
"epoch": 6.47, |
|
"grad_norm": 6.06294059753418, |
|
"learning_rate": 1.8706099815157116e-05, |
|
"loss": 2.1965, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.6798336798336798, |
|
"eval_loss": 1.5739575624465942, |
|
"eval_runtime": 1.9941, |
|
"eval_samples_per_second": 241.216, |
|
"eval_steps_per_second": 30.591, |
|
"step": 3787 |
|
}, |
|
{ |
|
"epoch": 7.39, |
|
"grad_norm": 4.912960052490234, |
|
"learning_rate": 1.8521256931608135e-05, |
|
"loss": 2.1397, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.6943866943866944, |
|
"eval_loss": 1.4560521841049194, |
|
"eval_runtime": 1.9835, |
|
"eval_samples_per_second": 242.507, |
|
"eval_steps_per_second": 30.754, |
|
"step": 4328 |
|
}, |
|
{ |
|
"epoch": 8.32, |
|
"grad_norm": 5.236889362335205, |
|
"learning_rate": 1.833641404805915e-05, |
|
"loss": 1.9955, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.7234927234927235, |
|
"eval_loss": 1.3202540874481201, |
|
"eval_runtime": 2.0536, |
|
"eval_samples_per_second": 234.218, |
|
"eval_steps_per_second": 29.703, |
|
"step": 4869 |
|
}, |
|
{ |
|
"epoch": 9.24, |
|
"grad_norm": 5.675503253936768, |
|
"learning_rate": 1.8151571164510168e-05, |
|
"loss": 1.9282, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.738045738045738, |
|
"eval_loss": 1.2246184349060059, |
|
"eval_runtime": 2.0017, |
|
"eval_samples_per_second": 240.293, |
|
"eval_steps_per_second": 30.474, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 10.17, |
|
"grad_norm": 4.67825174331665, |
|
"learning_rate": 1.7966728280961186e-05, |
|
"loss": 1.8368, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.738045738045738, |
|
"eval_loss": 1.1823257207870483, |
|
"eval_runtime": 1.9774, |
|
"eval_samples_per_second": 243.246, |
|
"eval_steps_per_second": 30.848, |
|
"step": 5951 |
|
}, |
|
{ |
|
"epoch": 11.09, |
|
"grad_norm": 4.809859275817871, |
|
"learning_rate": 1.77818853974122e-05, |
|
"loss": 1.812, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.7214137214137214, |
|
"eval_loss": 1.1297953128814697, |
|
"eval_runtime": 2.0307, |
|
"eval_samples_per_second": 236.864, |
|
"eval_steps_per_second": 30.039, |
|
"step": 6492 |
|
}, |
|
{ |
|
"epoch": 12.01, |
|
"grad_norm": 5.255190849304199, |
|
"learning_rate": 1.759704251386322e-05, |
|
"loss": 1.7353, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 12.94, |
|
"grad_norm": 8.597217559814453, |
|
"learning_rate": 1.7412199630314234e-05, |
|
"loss": 1.7195, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.7484407484407485, |
|
"eval_loss": 1.0423070192337036, |
|
"eval_runtime": 2.0193, |
|
"eval_samples_per_second": 238.201, |
|
"eval_steps_per_second": 30.208, |
|
"step": 7033 |
|
}, |
|
{ |
|
"epoch": 13.86, |
|
"grad_norm": 6.453842639923096, |
|
"learning_rate": 1.7227356746765253e-05, |
|
"loss": 1.6314, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.7422037422037422, |
|
"eval_loss": 1.0077309608459473, |
|
"eval_runtime": 2.0783, |
|
"eval_samples_per_second": 231.439, |
|
"eval_steps_per_second": 29.351, |
|
"step": 7574 |
|
}, |
|
{ |
|
"epoch": 14.79, |
|
"grad_norm": 8.70645523071289, |
|
"learning_rate": 1.7042513863216268e-05, |
|
"loss": 1.5979, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.7463617463617463, |
|
"eval_loss": 1.00509512424469, |
|
"eval_runtime": 1.9889, |
|
"eval_samples_per_second": 241.847, |
|
"eval_steps_per_second": 30.671, |
|
"step": 8115 |
|
}, |
|
{ |
|
"epoch": 15.71, |
|
"grad_norm": 7.348147392272949, |
|
"learning_rate": 1.6857670979667286e-05, |
|
"loss": 1.5656, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.7692307692307693, |
|
"eval_loss": 0.9325113296508789, |
|
"eval_runtime": 1.9923, |
|
"eval_samples_per_second": 241.43, |
|
"eval_steps_per_second": 30.618, |
|
"step": 8656 |
|
}, |
|
{ |
|
"epoch": 16.64, |
|
"grad_norm": 6.420931816101074, |
|
"learning_rate": 1.66728280961183e-05, |
|
"loss": 1.5414, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.7733887733887734, |
|
"eval_loss": 0.8889437913894653, |
|
"eval_runtime": 1.9727, |
|
"eval_samples_per_second": 243.822, |
|
"eval_steps_per_second": 30.921, |
|
"step": 9197 |
|
}, |
|
{ |
|
"epoch": 17.56, |
|
"grad_norm": 8.127350807189941, |
|
"learning_rate": 1.6487985212569316e-05, |
|
"loss": 1.5342, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.7484407484407485, |
|
"eval_loss": 0.9072721600532532, |
|
"eval_runtime": 1.9933, |
|
"eval_samples_per_second": 241.303, |
|
"eval_steps_per_second": 30.602, |
|
"step": 9738 |
|
}, |
|
{ |
|
"epoch": 18.48, |
|
"grad_norm": 6.122061252593994, |
|
"learning_rate": 1.6303142329020334e-05, |
|
"loss": 1.4898, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.7713097713097713, |
|
"eval_loss": 0.8425627946853638, |
|
"eval_runtime": 1.9868, |
|
"eval_samples_per_second": 242.099, |
|
"eval_steps_per_second": 30.703, |
|
"step": 10279 |
|
}, |
|
{ |
|
"epoch": 19.41, |
|
"grad_norm": 6.640945911407471, |
|
"learning_rate": 1.611829944547135e-05, |
|
"loss": 1.4731, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.7442827442827443, |
|
"eval_loss": 0.862506091594696, |
|
"eval_runtime": 1.9786, |
|
"eval_samples_per_second": 243.096, |
|
"eval_steps_per_second": 30.829, |
|
"step": 10820 |
|
}, |
|
{ |
|
"epoch": 20.33, |
|
"grad_norm": 6.019400119781494, |
|
"learning_rate": 1.5933456561922367e-05, |
|
"loss": 1.451, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_accuracy": 0.762993762993763, |
|
"eval_loss": 0.8015209436416626, |
|
"eval_runtime": 1.9644, |
|
"eval_samples_per_second": 244.864, |
|
"eval_steps_per_second": 31.053, |
|
"step": 11361 |
|
}, |
|
{ |
|
"epoch": 21.26, |
|
"grad_norm": 5.140503406524658, |
|
"learning_rate": 1.5748613678373382e-05, |
|
"loss": 1.4578, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_accuracy": 0.7588357588357588, |
|
"eval_loss": 0.8520306944847107, |
|
"eval_runtime": 2.0001, |
|
"eval_samples_per_second": 240.484, |
|
"eval_steps_per_second": 30.498, |
|
"step": 11902 |
|
}, |
|
{ |
|
"epoch": 22.18, |
|
"grad_norm": 15.190984725952148, |
|
"learning_rate": 1.55637707948244e-05, |
|
"loss": 1.4126, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_accuracy": 0.7713097713097713, |
|
"eval_loss": 0.7928301692008972, |
|
"eval_runtime": 1.9822, |
|
"eval_samples_per_second": 242.66, |
|
"eval_steps_per_second": 30.774, |
|
"step": 12443 |
|
}, |
|
{ |
|
"epoch": 23.11, |
|
"grad_norm": 11.220525741577148, |
|
"learning_rate": 1.5378927911275416e-05, |
|
"loss": 1.3626, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.7837837837837838, |
|
"eval_loss": 0.754388689994812, |
|
"eval_runtime": 1.975, |
|
"eval_samples_per_second": 243.545, |
|
"eval_steps_per_second": 30.886, |
|
"step": 12984 |
|
}, |
|
{ |
|
"epoch": 24.03, |
|
"grad_norm": 3.5185582637786865, |
|
"learning_rate": 1.5194085027726432e-05, |
|
"loss": 1.3905, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 24.95, |
|
"grad_norm": 8.19352912902832, |
|
"learning_rate": 1.5009242144177449e-05, |
|
"loss": 1.3694, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_accuracy": 0.7775467775467776, |
|
"eval_loss": 0.7698755860328674, |
|
"eval_runtime": 2.0179, |
|
"eval_samples_per_second": 238.368, |
|
"eval_steps_per_second": 30.23, |
|
"step": 13525 |
|
}, |
|
{ |
|
"epoch": 25.88, |
|
"grad_norm": 6.003907680511475, |
|
"learning_rate": 1.4824399260628467e-05, |
|
"loss": 1.3612, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_accuracy": 0.7775467775467776, |
|
"eval_loss": 0.7602183818817139, |
|
"eval_runtime": 1.9833, |
|
"eval_samples_per_second": 242.521, |
|
"eval_steps_per_second": 30.756, |
|
"step": 14066 |
|
}, |
|
{ |
|
"epoch": 26.8, |
|
"grad_norm": 6.613931655883789, |
|
"learning_rate": 1.4639556377079484e-05, |
|
"loss": 1.2963, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_accuracy": 0.7713097713097713, |
|
"eval_loss": 0.7532169818878174, |
|
"eval_runtime": 2.0706, |
|
"eval_samples_per_second": 232.305, |
|
"eval_steps_per_second": 29.461, |
|
"step": 14607 |
|
}, |
|
{ |
|
"epoch": 27.73, |
|
"grad_norm": 7.66683292388916, |
|
"learning_rate": 1.44547134935305e-05, |
|
"loss": 1.3009, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.7920997920997921, |
|
"eval_loss": 0.7012535929679871, |
|
"eval_runtime": 1.9606, |
|
"eval_samples_per_second": 245.335, |
|
"eval_steps_per_second": 31.113, |
|
"step": 15148 |
|
}, |
|
{ |
|
"epoch": 28.65, |
|
"grad_norm": 7.342077255249023, |
|
"learning_rate": 1.4269870609981517e-05, |
|
"loss": 1.2598, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_accuracy": 0.7796257796257796, |
|
"eval_loss": 0.7084705233573914, |
|
"eval_runtime": 1.9824, |
|
"eval_samples_per_second": 242.632, |
|
"eval_steps_per_second": 30.77, |
|
"step": 15689 |
|
}, |
|
{ |
|
"epoch": 29.57, |
|
"grad_norm": 5.679790019989014, |
|
"learning_rate": 1.4085027726432534e-05, |
|
"loss": 1.2565, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_accuracy": 0.7775467775467776, |
|
"eval_loss": 0.7023281455039978, |
|
"eval_runtime": 1.9659, |
|
"eval_samples_per_second": 244.668, |
|
"eval_steps_per_second": 31.029, |
|
"step": 16230 |
|
}, |
|
{ |
|
"epoch": 30.5, |
|
"grad_norm": 5.493412971496582, |
|
"learning_rate": 1.390018484288355e-05, |
|
"loss": 1.2735, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_accuracy": 0.7775467775467776, |
|
"eval_loss": 0.7047860026359558, |
|
"eval_runtime": 1.9718, |
|
"eval_samples_per_second": 243.937, |
|
"eval_steps_per_second": 30.936, |
|
"step": 16771 |
|
}, |
|
{ |
|
"epoch": 31.42, |
|
"grad_norm": 6.2688093185424805, |
|
"learning_rate": 1.3715341959334567e-05, |
|
"loss": 1.2743, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_accuracy": 0.7920997920997921, |
|
"eval_loss": 0.6794067621231079, |
|
"eval_runtime": 1.9764, |
|
"eval_samples_per_second": 243.372, |
|
"eval_steps_per_second": 30.864, |
|
"step": 17312 |
|
}, |
|
{ |
|
"epoch": 32.35, |
|
"grad_norm": 10.169917106628418, |
|
"learning_rate": 1.3530499075785584e-05, |
|
"loss": 1.2441, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_accuracy": 0.7858627858627859, |
|
"eval_loss": 0.693196713924408, |
|
"eval_runtime": 1.972, |
|
"eval_samples_per_second": 243.92, |
|
"eval_steps_per_second": 30.934, |
|
"step": 17853 |
|
}, |
|
{ |
|
"epoch": 33.27, |
|
"grad_norm": 8.05045223236084, |
|
"learning_rate": 1.33456561922366e-05, |
|
"loss": 1.2282, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_accuracy": 0.7941787941787942, |
|
"eval_loss": 0.7038751840591431, |
|
"eval_runtime": 1.992, |
|
"eval_samples_per_second": 241.466, |
|
"eval_steps_per_second": 30.623, |
|
"step": 18394 |
|
}, |
|
{ |
|
"epoch": 34.2, |
|
"grad_norm": 5.410665035247803, |
|
"learning_rate": 1.3160813308687617e-05, |
|
"loss": 1.2204, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"eval_accuracy": 0.8066528066528067, |
|
"eval_loss": 0.6860660910606384, |
|
"eval_runtime": 1.9773, |
|
"eval_samples_per_second": 243.258, |
|
"eval_steps_per_second": 30.85, |
|
"step": 18935 |
|
}, |
|
{ |
|
"epoch": 35.12, |
|
"grad_norm": 11.123208045959473, |
|
"learning_rate": 1.2975970425138634e-05, |
|
"loss": 1.1808, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_accuracy": 0.7962577962577962, |
|
"eval_loss": 0.6589930057525635, |
|
"eval_runtime": 1.9769, |
|
"eval_samples_per_second": 243.305, |
|
"eval_steps_per_second": 30.856, |
|
"step": 19476 |
|
}, |
|
{ |
|
"epoch": 36.04, |
|
"grad_norm": 6.165465354919434, |
|
"learning_rate": 1.279112754158965e-05, |
|
"loss": 1.1933, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 36.97, |
|
"grad_norm": 6.407803535461426, |
|
"learning_rate": 1.2606284658040667e-05, |
|
"loss": 1.1928, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"eval_accuracy": 0.7817047817047817, |
|
"eval_loss": 0.678415834903717, |
|
"eval_runtime": 1.9844, |
|
"eval_samples_per_second": 242.388, |
|
"eval_steps_per_second": 30.739, |
|
"step": 20017 |
|
}, |
|
{ |
|
"epoch": 37.89, |
|
"grad_norm": 4.849668979644775, |
|
"learning_rate": 1.2421441774491683e-05, |
|
"loss": 1.1914, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_accuracy": 0.7962577962577962, |
|
"eval_loss": 0.6559053659439087, |
|
"eval_runtime": 1.9912, |
|
"eval_samples_per_second": 241.56, |
|
"eval_steps_per_second": 30.634, |
|
"step": 20558 |
|
}, |
|
{ |
|
"epoch": 38.82, |
|
"grad_norm": 9.1309232711792, |
|
"learning_rate": 1.2236598890942698e-05, |
|
"loss": 1.1856, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"eval_accuracy": 0.7962577962577962, |
|
"eval_loss": 0.6769025325775146, |
|
"eval_runtime": 2.0066, |
|
"eval_samples_per_second": 239.713, |
|
"eval_steps_per_second": 30.4, |
|
"step": 21099 |
|
}, |
|
{ |
|
"epoch": 39.74, |
|
"grad_norm": 5.001546382904053, |
|
"learning_rate": 1.2051756007393715e-05, |
|
"loss": 1.1585, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_accuracy": 0.8004158004158004, |
|
"eval_loss": 0.64976966381073, |
|
"eval_runtime": 1.9804, |
|
"eval_samples_per_second": 242.874, |
|
"eval_steps_per_second": 30.801, |
|
"step": 21640 |
|
}, |
|
{ |
|
"epoch": 40.67, |
|
"grad_norm": 14.044866561889648, |
|
"learning_rate": 1.1866913123844732e-05, |
|
"loss": 1.1713, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"eval_accuracy": 0.7920997920997921, |
|
"eval_loss": 0.6447434425354004, |
|
"eval_runtime": 1.9973, |
|
"eval_samples_per_second": 240.829, |
|
"eval_steps_per_second": 30.542, |
|
"step": 22181 |
|
}, |
|
{ |
|
"epoch": 41.59, |
|
"grad_norm": 10.289350509643555, |
|
"learning_rate": 1.1682070240295748e-05, |
|
"loss": 1.1183, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"eval_accuracy": 0.7713097713097713, |
|
"eval_loss": 0.6748064756393433, |
|
"eval_runtime": 1.9672, |
|
"eval_samples_per_second": 244.509, |
|
"eval_steps_per_second": 31.008, |
|
"step": 22722 |
|
}, |
|
{ |
|
"epoch": 42.51, |
|
"grad_norm": 12.7116117477417, |
|
"learning_rate": 1.1497227356746765e-05, |
|
"loss": 1.1564, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"eval_accuracy": 0.7920997920997921, |
|
"eval_loss": 0.6545261740684509, |
|
"eval_runtime": 1.9659, |
|
"eval_samples_per_second": 244.669, |
|
"eval_steps_per_second": 31.029, |
|
"step": 23263 |
|
}, |
|
{ |
|
"epoch": 43.44, |
|
"grad_norm": 3.0720624923706055, |
|
"learning_rate": 1.1312384473197783e-05, |
|
"loss": 1.1215, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"eval_accuracy": 0.7879417879417879, |
|
"eval_loss": 0.6690270900726318, |
|
"eval_runtime": 1.9635, |
|
"eval_samples_per_second": 244.971, |
|
"eval_steps_per_second": 31.067, |
|
"step": 23804 |
|
}, |
|
{ |
|
"epoch": 44.36, |
|
"grad_norm": 7.927094459533691, |
|
"learning_rate": 1.11275415896488e-05, |
|
"loss": 1.1008, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"eval_accuracy": 0.7879417879417879, |
|
"eval_loss": 0.659792423248291, |
|
"eval_runtime": 1.9747, |
|
"eval_samples_per_second": 243.578, |
|
"eval_steps_per_second": 30.89, |
|
"step": 24345 |
|
}, |
|
{ |
|
"epoch": 45.29, |
|
"grad_norm": 8.912357330322266, |
|
"learning_rate": 1.0942698706099817e-05, |
|
"loss": 1.1344, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"eval_accuracy": 0.8024948024948025, |
|
"eval_loss": 0.6550182104110718, |
|
"eval_runtime": 2.0112, |
|
"eval_samples_per_second": 239.156, |
|
"eval_steps_per_second": 30.33, |
|
"step": 24886 |
|
}, |
|
{ |
|
"epoch": 46.21, |
|
"grad_norm": 9.598004341125488, |
|
"learning_rate": 1.0757855822550833e-05, |
|
"loss": 1.126, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"eval_accuracy": 0.7858627858627859, |
|
"eval_loss": 0.6521425247192383, |
|
"eval_runtime": 1.9713, |
|
"eval_samples_per_second": 244.004, |
|
"eval_steps_per_second": 30.944, |
|
"step": 25427 |
|
}, |
|
{ |
|
"epoch": 47.13, |
|
"grad_norm": 4.670881271362305, |
|
"learning_rate": 1.057301293900185e-05, |
|
"loss": 1.125, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_accuracy": 0.7817047817047817, |
|
"eval_loss": 0.6812848448753357, |
|
"eval_runtime": 2.016, |
|
"eval_samples_per_second": 238.588, |
|
"eval_steps_per_second": 30.258, |
|
"step": 25968 |
|
}, |
|
{ |
|
"epoch": 48.06, |
|
"grad_norm": 8.11451244354248, |
|
"learning_rate": 1.0388170055452866e-05, |
|
"loss": 1.0682, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 48.98, |
|
"grad_norm": 8.960821151733398, |
|
"learning_rate": 1.0203327171903883e-05, |
|
"loss": 1.0855, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"eval_accuracy": 0.7858627858627859, |
|
"eval_loss": 0.6419298052787781, |
|
"eval_runtime": 1.974, |
|
"eval_samples_per_second": 243.673, |
|
"eval_steps_per_second": 30.902, |
|
"step": 26509 |
|
}, |
|
{ |
|
"epoch": 49.91, |
|
"grad_norm": 3.053118944168091, |
|
"learning_rate": 1.00184842883549e-05, |
|
"loss": 1.0452, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"eval_accuracy": 0.8004158004158004, |
|
"eval_loss": 0.6550863981246948, |
|
"eval_runtime": 2.0504, |
|
"eval_samples_per_second": 234.587, |
|
"eval_steps_per_second": 29.75, |
|
"step": 27050 |
|
}, |
|
{ |
|
"epoch": 50.83, |
|
"grad_norm": 5.4594340324401855, |
|
"learning_rate": 9.833641404805916e-06, |
|
"loss": 1.0626, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 51.0, |
|
"eval_accuracy": 0.7920997920997921, |
|
"eval_loss": 0.6675499081611633, |
|
"eval_runtime": 1.9991, |
|
"eval_samples_per_second": 240.605, |
|
"eval_steps_per_second": 30.513, |
|
"step": 27591 |
|
}, |
|
{ |
|
"epoch": 51.76, |
|
"grad_norm": 8.158236503601074, |
|
"learning_rate": 9.648798521256933e-06, |
|
"loss": 1.0155, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 52.0, |
|
"eval_accuracy": 0.7920997920997921, |
|
"eval_loss": 0.6945971846580505, |
|
"eval_runtime": 1.9873, |
|
"eval_samples_per_second": 242.042, |
|
"eval_steps_per_second": 30.696, |
|
"step": 28132 |
|
}, |
|
{ |
|
"epoch": 52.68, |
|
"grad_norm": 5.626604080200195, |
|
"learning_rate": 9.46395563770795e-06, |
|
"loss": 1.0319, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 53.0, |
|
"eval_accuracy": 0.7796257796257796, |
|
"eval_loss": 0.6942130923271179, |
|
"eval_runtime": 1.966, |
|
"eval_samples_per_second": 244.665, |
|
"eval_steps_per_second": 31.028, |
|
"step": 28673 |
|
}, |
|
{ |
|
"epoch": 53.6, |
|
"grad_norm": 6.82182502746582, |
|
"learning_rate": 9.279112754158966e-06, |
|
"loss": 1.0488, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 54.0, |
|
"eval_accuracy": 0.7983367983367984, |
|
"eval_loss": 0.6496003866195679, |
|
"eval_runtime": 2.0069, |
|
"eval_samples_per_second": 239.673, |
|
"eval_steps_per_second": 30.395, |
|
"step": 29214 |
|
}, |
|
{ |
|
"epoch": 54.53, |
|
"grad_norm": 7.865675926208496, |
|
"learning_rate": 9.094269870609981e-06, |
|
"loss": 1.0558, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 55.0, |
|
"eval_accuracy": 0.8045738045738046, |
|
"eval_loss": 0.6465332508087158, |
|
"eval_runtime": 1.9938, |
|
"eval_samples_per_second": 241.25, |
|
"eval_steps_per_second": 30.595, |
|
"step": 29755 |
|
}, |
|
{ |
|
"epoch": 55.45, |
|
"grad_norm": 7.172035217285156, |
|
"learning_rate": 8.909426987060998e-06, |
|
"loss": 0.9913, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 56.0, |
|
"eval_accuracy": 0.7920997920997921, |
|
"eval_loss": 0.6654109954833984, |
|
"eval_runtime": 1.9693, |
|
"eval_samples_per_second": 244.248, |
|
"eval_steps_per_second": 30.975, |
|
"step": 30296 |
|
}, |
|
{ |
|
"epoch": 56.38, |
|
"grad_norm": 6.30518102645874, |
|
"learning_rate": 8.724584103512016e-06, |
|
"loss": 1.0555, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 57.0, |
|
"eval_accuracy": 0.7962577962577962, |
|
"eval_loss": 0.656141996383667, |
|
"eval_runtime": 1.9741, |
|
"eval_samples_per_second": 243.66, |
|
"eval_steps_per_second": 30.901, |
|
"step": 30837 |
|
}, |
|
{ |
|
"epoch": 57.3, |
|
"grad_norm": 3.0917370319366455, |
|
"learning_rate": 8.539741219963033e-06, |
|
"loss": 0.9803, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 58.0, |
|
"eval_accuracy": 0.7941787941787942, |
|
"eval_loss": 0.673220157623291, |
|
"eval_runtime": 1.993, |
|
"eval_samples_per_second": 241.346, |
|
"eval_steps_per_second": 30.607, |
|
"step": 31378 |
|
}, |
|
{ |
|
"epoch": 58.23, |
|
"grad_norm": 8.285308837890625, |
|
"learning_rate": 8.35489833641405e-06, |
|
"loss": 1.0393, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 59.0, |
|
"eval_accuracy": 0.7817047817047817, |
|
"eval_loss": 0.6892696619033813, |
|
"eval_runtime": 1.9798, |
|
"eval_samples_per_second": 242.949, |
|
"eval_steps_per_second": 30.811, |
|
"step": 31919 |
|
}, |
|
{ |
|
"epoch": 59.15, |
|
"grad_norm": 3.1396327018737793, |
|
"learning_rate": 8.170055452865066e-06, |
|
"loss": 0.9677, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"eval_accuracy": 0.8045738045738046, |
|
"eval_loss": 0.6823599934577942, |
|
"eval_runtime": 2.0127, |
|
"eval_samples_per_second": 238.985, |
|
"eval_steps_per_second": 30.308, |
|
"step": 32460 |
|
}, |
|
{ |
|
"epoch": 60.07, |
|
"grad_norm": 12.875879287719727, |
|
"learning_rate": 7.985212569316083e-06, |
|
"loss": 1.0366, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 61.0, |
|
"grad_norm": 8.770364761352539, |
|
"learning_rate": 7.8003696857671e-06, |
|
"loss": 1.0082, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 61.0, |
|
"eval_accuracy": 0.7941787941787942, |
|
"eval_loss": 0.6618274450302124, |
|
"eval_runtime": 2.0008, |
|
"eval_samples_per_second": 240.403, |
|
"eval_steps_per_second": 30.488, |
|
"step": 33001 |
|
}, |
|
{ |
|
"epoch": 61.92, |
|
"grad_norm": 6.0600972175598145, |
|
"learning_rate": 7.615526802218115e-06, |
|
"loss": 1.0096, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 62.0, |
|
"eval_accuracy": 0.7837837837837838, |
|
"eval_loss": 0.6691136360168457, |
|
"eval_runtime": 1.9817, |
|
"eval_samples_per_second": 242.719, |
|
"eval_steps_per_second": 30.781, |
|
"step": 33542 |
|
}, |
|
{ |
|
"epoch": 62.85, |
|
"grad_norm": 10.777630805969238, |
|
"learning_rate": 7.430683918669132e-06, |
|
"loss": 0.9685, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 63.0, |
|
"eval_accuracy": 0.8024948024948025, |
|
"eval_loss": 0.6792653203010559, |
|
"eval_runtime": 2.0645, |
|
"eval_samples_per_second": 232.985, |
|
"eval_steps_per_second": 29.547, |
|
"step": 34083 |
|
}, |
|
{ |
|
"epoch": 63.77, |
|
"grad_norm": 3.9615447521209717, |
|
"learning_rate": 7.245841035120148e-06, |
|
"loss": 0.9847, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 64.0, |
|
"eval_accuracy": 0.7837837837837838, |
|
"eval_loss": 0.6894533634185791, |
|
"eval_runtime": 2.0054, |
|
"eval_samples_per_second": 239.847, |
|
"eval_steps_per_second": 30.417, |
|
"step": 34624 |
|
}, |
|
{ |
|
"epoch": 64.7, |
|
"grad_norm": 9.38687801361084, |
|
"learning_rate": 7.060998151571166e-06, |
|
"loss": 0.9639, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 65.0, |
|
"eval_accuracy": 0.7733887733887734, |
|
"eval_loss": 0.7297117114067078, |
|
"eval_runtime": 2.0147, |
|
"eval_samples_per_second": 238.744, |
|
"eval_steps_per_second": 30.277, |
|
"step": 35165 |
|
}, |
|
{ |
|
"epoch": 65.62, |
|
"grad_norm": 12.292973518371582, |
|
"learning_rate": 6.876155268022182e-06, |
|
"loss": 0.9776, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 66.0, |
|
"eval_accuracy": 0.7920997920997921, |
|
"eval_loss": 0.6561179757118225, |
|
"eval_runtime": 1.9845, |
|
"eval_samples_per_second": 242.381, |
|
"eval_steps_per_second": 30.739, |
|
"step": 35706 |
|
}, |
|
{ |
|
"epoch": 66.54, |
|
"grad_norm": 14.023015022277832, |
|
"learning_rate": 6.691312384473199e-06, |
|
"loss": 1.0074, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 67.0, |
|
"eval_accuracy": 0.7775467775467776, |
|
"eval_loss": 0.6998913884162903, |
|
"eval_runtime": 1.9686, |
|
"eval_samples_per_second": 244.338, |
|
"eval_steps_per_second": 30.987, |
|
"step": 36247 |
|
}, |
|
{ |
|
"epoch": 67.47, |
|
"grad_norm": 13.870222091674805, |
|
"learning_rate": 6.506469500924215e-06, |
|
"loss": 0.9466, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 68.0, |
|
"eval_accuracy": 0.7941787941787942, |
|
"eval_loss": 0.6880961656570435, |
|
"eval_runtime": 1.9686, |
|
"eval_samples_per_second": 244.34, |
|
"eval_steps_per_second": 30.987, |
|
"step": 36788 |
|
}, |
|
{ |
|
"epoch": 68.39, |
|
"grad_norm": 6.1949639320373535, |
|
"learning_rate": 6.321626617375231e-06, |
|
"loss": 0.9425, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 69.0, |
|
"eval_accuracy": 0.7962577962577962, |
|
"eval_loss": 0.6805587410926819, |
|
"eval_runtime": 1.9709, |
|
"eval_samples_per_second": 244.052, |
|
"eval_steps_per_second": 30.95, |
|
"step": 37329 |
|
}, |
|
{ |
|
"epoch": 69.32, |
|
"grad_norm": 7.145143508911133, |
|
"learning_rate": 6.136783733826248e-06, |
|
"loss": 0.9594, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 70.0, |
|
"eval_accuracy": 0.7900207900207901, |
|
"eval_loss": 0.7202461361885071, |
|
"eval_runtime": 2.0125, |
|
"eval_samples_per_second": 239.001, |
|
"eval_steps_per_second": 30.31, |
|
"step": 37870 |
|
}, |
|
{ |
|
"epoch": 70.24, |
|
"grad_norm": 9.215810775756836, |
|
"learning_rate": 5.951940850277265e-06, |
|
"loss": 0.9311, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 71.0, |
|
"eval_accuracy": 0.7754677754677755, |
|
"eval_loss": 0.7161967754364014, |
|
"eval_runtime": 1.977, |
|
"eval_samples_per_second": 243.297, |
|
"eval_steps_per_second": 30.855, |
|
"step": 38411 |
|
}, |
|
{ |
|
"epoch": 71.16, |
|
"grad_norm": 6.461187362670898, |
|
"learning_rate": 5.767097966728281e-06, |
|
"loss": 0.9429, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 72.0, |
|
"eval_accuracy": 0.7920997920997921, |
|
"eval_loss": 0.7284368276596069, |
|
"eval_runtime": 2.0663, |
|
"eval_samples_per_second": 232.783, |
|
"eval_steps_per_second": 29.521, |
|
"step": 38952 |
|
}, |
|
{ |
|
"epoch": 72.09, |
|
"grad_norm": 11.850204467773438, |
|
"learning_rate": 5.582255083179298e-06, |
|
"loss": 0.9666, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 73.0, |
|
"eval_accuracy": 0.7962577962577962, |
|
"eval_loss": 0.6871474981307983, |
|
"eval_runtime": 1.97, |
|
"eval_samples_per_second": 244.162, |
|
"eval_steps_per_second": 30.964, |
|
"step": 39493 |
|
}, |
|
{ |
|
"epoch": 73.01, |
|
"grad_norm": 8.0579252243042, |
|
"learning_rate": 5.3974121996303146e-06, |
|
"loss": 0.932, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 73.94, |
|
"grad_norm": 1.1972132921218872, |
|
"learning_rate": 5.212569316081332e-06, |
|
"loss": 0.945, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 74.0, |
|
"eval_accuracy": 0.7941787941787942, |
|
"eval_loss": 0.6778899431228638, |
|
"eval_runtime": 2.0376, |
|
"eval_samples_per_second": 236.067, |
|
"eval_steps_per_second": 29.938, |
|
"step": 40034 |
|
}, |
|
{ |
|
"epoch": 74.86, |
|
"grad_norm": 5.484439849853516, |
|
"learning_rate": 5.027726432532349e-06, |
|
"loss": 0.9387, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 75.0, |
|
"eval_accuracy": 0.7941787941787942, |
|
"eval_loss": 0.735752522945404, |
|
"eval_runtime": 1.9762, |
|
"eval_samples_per_second": 243.395, |
|
"eval_steps_per_second": 30.867, |
|
"step": 40575 |
|
}, |
|
{ |
|
"epoch": 75.79, |
|
"grad_norm": 2.0908420085906982, |
|
"learning_rate": 4.8428835489833645e-06, |
|
"loss": 0.9132, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 76.0, |
|
"eval_accuracy": 0.7941787941787942, |
|
"eval_loss": 0.7043733596801758, |
|
"eval_runtime": 1.9764, |
|
"eval_samples_per_second": 243.367, |
|
"eval_steps_per_second": 30.864, |
|
"step": 41116 |
|
}, |
|
{ |
|
"epoch": 76.71, |
|
"grad_norm": 10.380330085754395, |
|
"learning_rate": 4.658040665434381e-06, |
|
"loss": 0.9181, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 77.0, |
|
"eval_accuracy": 0.7962577962577962, |
|
"eval_loss": 0.7041053771972656, |
|
"eval_runtime": 2.0006, |
|
"eval_samples_per_second": 240.43, |
|
"eval_steps_per_second": 30.491, |
|
"step": 41657 |
|
}, |
|
{ |
|
"epoch": 77.63, |
|
"grad_norm": 9.135781288146973, |
|
"learning_rate": 4.473197781885398e-06, |
|
"loss": 0.9218, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 78.0, |
|
"eval_accuracy": 0.7941787941787942, |
|
"eval_loss": 0.6986111998558044, |
|
"eval_runtime": 1.9688, |
|
"eval_samples_per_second": 244.309, |
|
"eval_steps_per_second": 30.983, |
|
"step": 42198 |
|
}, |
|
{ |
|
"epoch": 78.56, |
|
"grad_norm": 17.338001251220703, |
|
"learning_rate": 4.288354898336414e-06, |
|
"loss": 0.8621, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 79.0, |
|
"eval_accuracy": 0.8004158004158004, |
|
"eval_loss": 0.6909247040748596, |
|
"eval_runtime": 1.997, |
|
"eval_samples_per_second": 240.86, |
|
"eval_steps_per_second": 30.546, |
|
"step": 42739 |
|
}, |
|
{ |
|
"epoch": 79.48, |
|
"grad_norm": 6.793923854827881, |
|
"learning_rate": 4.103512014787431e-06, |
|
"loss": 0.9236, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"eval_accuracy": 0.7983367983367984, |
|
"eval_loss": 0.7135599851608276, |
|
"eval_runtime": 1.9949, |
|
"eval_samples_per_second": 241.121, |
|
"eval_steps_per_second": 30.579, |
|
"step": 43280 |
|
}, |
|
{ |
|
"epoch": 80.41, |
|
"grad_norm": 3.9345781803131104, |
|
"learning_rate": 3.918669131238448e-06, |
|
"loss": 0.8667, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 81.0, |
|
"eval_accuracy": 0.8024948024948025, |
|
"eval_loss": 0.7008742094039917, |
|
"eval_runtime": 1.992, |
|
"eval_samples_per_second": 241.461, |
|
"eval_steps_per_second": 30.622, |
|
"step": 43821 |
|
}, |
|
{ |
|
"epoch": 81.33, |
|
"grad_norm": 16.883420944213867, |
|
"learning_rate": 3.7338262476894642e-06, |
|
"loss": 0.8856, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 82.0, |
|
"eval_accuracy": 0.7920997920997921, |
|
"eval_loss": 0.7127683162689209, |
|
"eval_runtime": 2.0451, |
|
"eval_samples_per_second": 235.201, |
|
"eval_steps_per_second": 29.828, |
|
"step": 44362 |
|
}, |
|
{ |
|
"epoch": 82.26, |
|
"grad_norm": 7.969069480895996, |
|
"learning_rate": 3.548983364140481e-06, |
|
"loss": 0.917, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 83.0, |
|
"eval_accuracy": 0.7983367983367984, |
|
"eval_loss": 0.7134777307510376, |
|
"eval_runtime": 2.0529, |
|
"eval_samples_per_second": 234.298, |
|
"eval_steps_per_second": 29.714, |
|
"step": 44903 |
|
}, |
|
{ |
|
"epoch": 83.18, |
|
"grad_norm": 1.545163631439209, |
|
"learning_rate": 3.3641404805914975e-06, |
|
"loss": 0.8835, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 84.0, |
|
"eval_accuracy": 0.7900207900207901, |
|
"eval_loss": 0.7295302748680115, |
|
"eval_runtime": 1.9747, |
|
"eval_samples_per_second": 243.583, |
|
"eval_steps_per_second": 30.891, |
|
"step": 45444 |
|
}, |
|
{ |
|
"epoch": 84.1, |
|
"grad_norm": 5.072544097900391, |
|
"learning_rate": 3.1792975970425146e-06, |
|
"loss": 0.8879, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 85.0, |
|
"eval_accuracy": 0.7900207900207901, |
|
"eval_loss": 0.7449509501457214, |
|
"eval_runtime": 2.0759, |
|
"eval_samples_per_second": 231.71, |
|
"eval_steps_per_second": 29.385, |
|
"step": 45985 |
|
}, |
|
{ |
|
"epoch": 85.03, |
|
"grad_norm": 5.645694732666016, |
|
"learning_rate": 2.9944547134935308e-06, |
|
"loss": 0.9114, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 85.95, |
|
"grad_norm": 5.065194129943848, |
|
"learning_rate": 2.8096118299445474e-06, |
|
"loss": 0.8764, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 86.0, |
|
"eval_accuracy": 0.7941787941787942, |
|
"eval_loss": 0.7362204194068909, |
|
"eval_runtime": 1.9869, |
|
"eval_samples_per_second": 242.081, |
|
"eval_steps_per_second": 30.7, |
|
"step": 46526 |
|
}, |
|
{ |
|
"epoch": 86.88, |
|
"grad_norm": 5.654088020324707, |
|
"learning_rate": 2.624768946395564e-06, |
|
"loss": 0.8674, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 87.0, |
|
"eval_accuracy": 0.7941787941787942, |
|
"eval_loss": 0.7232093811035156, |
|
"eval_runtime": 2.0232, |
|
"eval_samples_per_second": 237.746, |
|
"eval_steps_per_second": 30.151, |
|
"step": 47067 |
|
}, |
|
{ |
|
"epoch": 87.8, |
|
"grad_norm": 12.72859001159668, |
|
"learning_rate": 2.4399260628465807e-06, |
|
"loss": 0.8583, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 88.0, |
|
"eval_accuracy": 0.7941787941787942, |
|
"eval_loss": 0.7407870888710022, |
|
"eval_runtime": 1.9934, |
|
"eval_samples_per_second": 241.296, |
|
"eval_steps_per_second": 30.601, |
|
"step": 47608 |
|
}, |
|
{ |
|
"epoch": 88.72, |
|
"grad_norm": 6.526777744293213, |
|
"learning_rate": 2.2550831792975973e-06, |
|
"loss": 0.881, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 89.0, |
|
"eval_accuracy": 0.8004158004158004, |
|
"eval_loss": 0.7377821803092957, |
|
"eval_runtime": 1.9802, |
|
"eval_samples_per_second": 242.901, |
|
"eval_steps_per_second": 30.804, |
|
"step": 48149 |
|
}, |
|
{ |
|
"epoch": 89.65, |
|
"grad_norm": 8.497318267822266, |
|
"learning_rate": 2.070240295748614e-06, |
|
"loss": 0.8668, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 90.0, |
|
"eval_accuracy": 0.7900207900207901, |
|
"eval_loss": 0.7473007440567017, |
|
"eval_runtime": 2.0137, |
|
"eval_samples_per_second": 238.867, |
|
"eval_steps_per_second": 30.293, |
|
"step": 48690 |
|
}, |
|
{ |
|
"epoch": 90.57, |
|
"grad_norm": 6.455136775970459, |
|
"learning_rate": 1.8853974121996305e-06, |
|
"loss": 0.8779, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 91.0, |
|
"eval_accuracy": 0.7983367983367984, |
|
"eval_loss": 0.7438368201255798, |
|
"eval_runtime": 1.9731, |
|
"eval_samples_per_second": 243.774, |
|
"eval_steps_per_second": 30.915, |
|
"step": 49231 |
|
}, |
|
{ |
|
"epoch": 91.5, |
|
"grad_norm": 5.713993072509766, |
|
"learning_rate": 1.700554528650647e-06, |
|
"loss": 0.8717, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 92.0, |
|
"eval_accuracy": 0.8004158004158004, |
|
"eval_loss": 0.7389739751815796, |
|
"eval_runtime": 1.9686, |
|
"eval_samples_per_second": 244.34, |
|
"eval_steps_per_second": 30.987, |
|
"step": 49772 |
|
}, |
|
{ |
|
"epoch": 92.42, |
|
"grad_norm": 5.342690467834473, |
|
"learning_rate": 1.5157116451016638e-06, |
|
"loss": 0.8781, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 93.0, |
|
"eval_accuracy": 0.7983367983367984, |
|
"eval_loss": 0.7473535537719727, |
|
"eval_runtime": 1.98, |
|
"eval_samples_per_second": 242.926, |
|
"eval_steps_per_second": 30.808, |
|
"step": 50313 |
|
}, |
|
{ |
|
"epoch": 93.35, |
|
"grad_norm": 9.870634078979492, |
|
"learning_rate": 1.3308687615526802e-06, |
|
"loss": 0.8845, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 94.0, |
|
"eval_accuracy": 0.7900207900207901, |
|
"eval_loss": 0.7445840835571289, |
|
"eval_runtime": 1.9776, |
|
"eval_samples_per_second": 243.222, |
|
"eval_steps_per_second": 30.845, |
|
"step": 50854 |
|
}, |
|
{ |
|
"epoch": 94.27, |
|
"grad_norm": 8.909347534179688, |
|
"learning_rate": 1.1460258780036969e-06, |
|
"loss": 0.8623, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 95.0, |
|
"eval_accuracy": 0.7920997920997921, |
|
"eval_loss": 0.7315581440925598, |
|
"eval_runtime": 1.9728, |
|
"eval_samples_per_second": 243.814, |
|
"eval_steps_per_second": 30.92, |
|
"step": 51395 |
|
}, |
|
{ |
|
"epoch": 95.19, |
|
"grad_norm": 10.748625755310059, |
|
"learning_rate": 9.611829944547135e-07, |
|
"loss": 0.8341, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 96.0, |
|
"eval_accuracy": 0.7879417879417879, |
|
"eval_loss": 0.7457364201545715, |
|
"eval_runtime": 2.0017, |
|
"eval_samples_per_second": 240.29, |
|
"eval_steps_per_second": 30.473, |
|
"step": 51936 |
|
}, |
|
{ |
|
"epoch": 96.12, |
|
"grad_norm": 3.179774761199951, |
|
"learning_rate": 7.763401109057302e-07, |
|
"loss": 0.8766, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 97.0, |
|
"eval_accuracy": 0.7920997920997921, |
|
"eval_loss": 0.743617832660675, |
|
"eval_runtime": 1.999, |
|
"eval_samples_per_second": 240.625, |
|
"eval_steps_per_second": 30.516, |
|
"step": 52477 |
|
}, |
|
{ |
|
"epoch": 97.04, |
|
"grad_norm": 12.243720054626465, |
|
"learning_rate": 5.914972273567468e-07, |
|
"loss": 0.8101, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 97.97, |
|
"grad_norm": 18.670886993408203, |
|
"learning_rate": 4.066543438077634e-07, |
|
"loss": 0.8681, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 98.0, |
|
"eval_accuracy": 0.7900207900207901, |
|
"eval_loss": 0.7483807802200317, |
|
"eval_runtime": 2.0039, |
|
"eval_samples_per_second": 240.035, |
|
"eval_steps_per_second": 30.441, |
|
"step": 53018 |
|
}, |
|
{ |
|
"epoch": 98.89, |
|
"grad_norm": 8.483085632324219, |
|
"learning_rate": 2.2181146025878005e-07, |
|
"loss": 0.8635, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 99.0, |
|
"eval_accuracy": 0.7941787941787942, |
|
"eval_loss": 0.7391884922981262, |
|
"eval_runtime": 1.9875, |
|
"eval_samples_per_second": 242.013, |
|
"eval_steps_per_second": 30.692, |
|
"step": 53559 |
|
}, |
|
{ |
|
"epoch": 99.82, |
|
"grad_norm": 10.068202018737793, |
|
"learning_rate": 3.696857670979668e-08, |
|
"loss": 0.8091, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"eval_accuracy": 0.7920997920997921, |
|
"eval_loss": 0.7390549182891846, |
|
"eval_runtime": 2.0448, |
|
"eval_samples_per_second": 235.228, |
|
"eval_steps_per_second": 29.831, |
|
"step": 54100 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"step": 54100, |
|
"total_flos": 3.355193271048192e+19, |
|
"train_loss": 1.2517024893769495, |
|
"train_runtime": 5380.7558, |
|
"train_samples_per_second": 80.379, |
|
"train_steps_per_second": 10.054 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 54100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 100, |
|
"save_steps": 500, |
|
"total_flos": 3.355193271048192e+19, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|