{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.722424669879707,
  "eval_steps": 10000,
  "global_step": 380000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.1,
      "learning_rate": 9.83937751004016e-06,
      "loss": 1.4799,
      "step": 10000
    },
    {
      "epoch": 0.1,
      "eval_loss": 1.0304539203643799,
      "eval_runtime": 69.6877,
      "eval_samples_per_second": 1434.973,
      "eval_steps_per_second": 14.952,
      "step": 10000
    },
    {
      "epoch": 0.2,
      "learning_rate": 9.638654618473896e-06,
      "loss": 0.9736,
      "step": 20000
    },
    {
      "epoch": 0.2,
      "eval_loss": 0.9039866328239441,
      "eval_runtime": 65.2654,
      "eval_samples_per_second": 1532.205,
      "eval_steps_per_second": 15.966,
      "step": 20000
    },
    {
      "epoch": 0.29,
      "learning_rate": 9.437951807228917e-06,
      "loss": 0.8922,
      "step": 30000
    },
    {
      "epoch": 0.29,
      "eval_loss": 0.8560834527015686,
      "eval_runtime": 64.4642,
      "eval_samples_per_second": 1551.248,
      "eval_steps_per_second": 16.164,
      "step": 30000
    },
    {
      "epoch": 0.39,
      "learning_rate": 9.237228915662652e-06,
      "loss": 0.8531,
      "step": 40000
    },
    {
      "epoch": 0.39,
      "eval_loss": 0.8226799368858337,
      "eval_runtime": 64.3868,
      "eval_samples_per_second": 1553.114,
      "eval_steps_per_second": 16.183,
      "step": 40000
    },
    {
      "epoch": 0.49,
      "learning_rate": 9.036526104417672e-06,
      "loss": 0.829,
      "step": 50000
    },
    {
      "epoch": 0.49,
      "eval_loss": 0.8042359948158264,
      "eval_runtime": 64.3396,
      "eval_samples_per_second": 1554.254,
      "eval_steps_per_second": 16.195,
      "step": 50000
    },
    {
      "epoch": 0.59,
      "learning_rate": 8.835823293172691e-06,
      "loss": 0.811,
      "step": 60000
    },
    {
      "epoch": 0.59,
      "eval_loss": 0.7950388789176941,
      "eval_runtime": 64.5291,
      "eval_samples_per_second": 1549.687,
      "eval_steps_per_second": 16.148,
      "step": 60000
    },
    {
      "epoch": 0.69,
      "learning_rate": 8.635120481927711e-06,
      "loss": 0.7976,
      "step": 70000
    },
    {
      "epoch": 0.69,
      "eval_loss": 0.7777369022369385,
      "eval_runtime": 64.862,
      "eval_samples_per_second": 1541.734,
      "eval_steps_per_second": 16.065,
      "step": 70000
    },
    {
      "epoch": 0.78,
      "learning_rate": 8.434417670682732e-06,
      "loss": 0.7872,
      "step": 80000
    },
    {
      "epoch": 0.78,
      "eval_loss": 0.7702258825302124,
      "eval_runtime": 64.4619,
      "eval_samples_per_second": 1551.303,
      "eval_steps_per_second": 16.165,
      "step": 80000
    },
    {
      "epoch": 0.88,
      "learning_rate": 8.233714859437752e-06,
      "loss": 0.778,
      "step": 90000
    },
    {
      "epoch": 0.88,
      "eval_loss": 0.760698139667511,
      "eval_runtime": 64.4008,
      "eval_samples_per_second": 1552.775,
      "eval_steps_per_second": 16.18,
      "step": 90000
    },
    {
      "epoch": 0.98,
      "learning_rate": 8.033012048192772e-06,
      "loss": 0.7695,
      "step": 100000
    },
    {
      "epoch": 0.98,
      "eval_loss": 0.7517786622047424,
      "eval_runtime": 64.56,
      "eval_samples_per_second": 1548.946,
      "eval_steps_per_second": 16.14,
      "step": 100000
    },
    {
      "epoch": 1.08,
      "learning_rate": 7.832309236947791e-06,
      "loss": 0.7638,
      "step": 110000
    },
    {
      "epoch": 1.08,
      "eval_loss": 0.7468145489692688,
      "eval_runtime": 64.9838,
      "eval_samples_per_second": 1538.846,
      "eval_steps_per_second": 16.035,
      "step": 110000
    },
    {
      "epoch": 1.18,
      "learning_rate": 7.631606425702813e-06,
      "loss": 0.7573,
      "step": 120000
    },
    {
      "epoch": 1.18,
      "eval_loss": 0.739628255367279,
      "eval_runtime": 64.2595,
      "eval_samples_per_second": 1556.191,
      "eval_steps_per_second": 16.216,
      "step": 120000
    },
    {
      "epoch": 1.27,
      "learning_rate": 7.4309036144578315e-06,
      "loss": 0.7512,
      "step": 130000
    },
    {
      "epoch": 1.27,
      "eval_loss": 0.7346537709236145,
      "eval_runtime": 64.911,
      "eval_samples_per_second": 1540.57,
      "eval_steps_per_second": 16.053,
      "step": 130000
    },
    {
      "epoch": 1.37,
      "learning_rate": 7.2301807228915665e-06,
      "loss": 0.7468,
      "step": 140000
    },
    {
      "epoch": 1.37,
      "eval_loss": 0.7300976514816284,
      "eval_runtime": 64.8445,
      "eval_samples_per_second": 1542.15,
      "eval_steps_per_second": 16.069,
      "step": 140000
    },
    {
      "epoch": 1.47,
      "learning_rate": 7.029477911646587e-06,
      "loss": 0.7418,
      "step": 150000
    },
    {
      "epoch": 1.47,
      "eval_loss": 0.7252103686332703,
      "eval_runtime": 64.5503,
      "eval_samples_per_second": 1549.179,
      "eval_steps_per_second": 16.142,
      "step": 150000
    },
    {
      "epoch": 1.57,
      "learning_rate": 6.828775100401607e-06,
      "loss": 0.7379,
      "step": 160000
    },
    {
      "epoch": 1.57,
      "eval_loss": 0.7203709483146667,
      "eval_runtime": 64.4763,
      "eval_samples_per_second": 1550.958,
      "eval_steps_per_second": 16.161,
      "step": 160000
    },
    {
      "epoch": 1.67,
      "learning_rate": 6.628072289156627e-06,
      "loss": 0.7333,
      "step": 170000
    },
    {
      "epoch": 1.67,
      "eval_loss": 0.7167079448699951,
      "eval_runtime": 64.7852,
      "eval_samples_per_second": 1543.562,
      "eval_steps_per_second": 16.084,
      "step": 170000
    },
    {
      "epoch": 1.76,
      "learning_rate": 6.427369477911647e-06,
      "loss": 0.7298,
      "step": 180000
    },
    {
      "epoch": 1.76,
      "eval_loss": 0.7129059433937073,
      "eval_runtime": 64.609,
      "eval_samples_per_second": 1547.772,
      "eval_steps_per_second": 16.128,
      "step": 180000
    },
    {
      "epoch": 1.86,
      "learning_rate": 6.2266666666666675e-06,
      "loss": 0.7258,
      "step": 190000
    },
    {
      "epoch": 1.86,
      "eval_loss": 0.7089965343475342,
      "eval_runtime": 64.6825,
      "eval_samples_per_second": 1546.013,
      "eval_steps_per_second": 16.109,
      "step": 190000
    },
    {
      "epoch": 1.96,
      "learning_rate": 6.025963855421687e-06,
      "loss": 0.7229,
      "step": 200000
    },
    {
      "epoch": 1.96,
      "eval_loss": 0.7047411799430847,
      "eval_runtime": 64.3501,
      "eval_samples_per_second": 1553.999,
      "eval_steps_per_second": 16.193,
      "step": 200000
    },
    {
      "epoch": 2.06,
      "learning_rate": 5.825261044176708e-06,
      "loss": 0.7194,
      "step": 210000
    },
    {
      "epoch": 2.06,
      "eval_loss": 0.7026636004447937,
      "eval_runtime": 64.4785,
      "eval_samples_per_second": 1550.905,
      "eval_steps_per_second": 16.16,
      "step": 210000
    },
    {
      "epoch": 2.16,
      "learning_rate": 5.624558232931727e-06,
      "loss": 0.7171,
      "step": 220000
    },
    {
      "epoch": 2.16,
      "eval_loss": 0.702177107334137,
      "eval_runtime": 64.5209,
      "eval_samples_per_second": 1549.886,
      "eval_steps_per_second": 16.15,
      "step": 220000
    },
    {
      "epoch": 2.25,
      "learning_rate": 5.423855421686748e-06,
      "loss": 0.7138,
      "step": 230000
    },
    {
      "epoch": 2.25,
      "eval_loss": 0.6978575587272644,
      "eval_runtime": 64.573,
      "eval_samples_per_second": 1548.634,
      "eval_steps_per_second": 16.137,
      "step": 230000
    },
    {
      "epoch": 2.35,
      "learning_rate": 5.2231526104417676e-06,
      "loss": 0.7113,
      "step": 240000
    },
    {
      "epoch": 2.35,
      "eval_loss": 0.6939824819564819,
      "eval_runtime": 64.7414,
      "eval_samples_per_second": 1544.606,
      "eval_steps_per_second": 16.095,
      "step": 240000
    },
    {
      "epoch": 2.45,
      "learning_rate": 5.022449799196788e-06,
      "loss": 0.7087,
      "step": 250000
    },
    {
      "epoch": 2.45,
      "eval_loss": 0.6922757029533386,
      "eval_runtime": 64.5094,
      "eval_samples_per_second": 1550.162,
      "eval_steps_per_second": 16.153,
      "step": 250000
    },
    {
      "epoch": 2.55,
      "learning_rate": 4.821746987951808e-06,
      "loss": 0.7071,
      "step": 260000
    },
    {
      "epoch": 2.55,
      "eval_loss": 0.6915081739425659,
      "eval_runtime": 65.0605,
      "eval_samples_per_second": 1537.031,
      "eval_steps_per_second": 16.016,
      "step": 260000
    },
    {
      "epoch": 2.64,
      "learning_rate": 4.6210441767068274e-06,
      "loss": 0.7047,
      "step": 270000
    },
    {
      "epoch": 2.64,
      "eval_loss": 0.6870027184486389,
      "eval_runtime": 64.7199,
      "eval_samples_per_second": 1545.119,
      "eval_steps_per_second": 16.1,
      "step": 270000
    },
    {
      "epoch": 2.74,
      "learning_rate": 4.420341365461848e-06,
      "loss": 0.7027,
      "step": 280000
    },
    {
      "epoch": 2.74,
      "eval_loss": 0.6844470500946045,
      "eval_runtime": 64.906,
      "eval_samples_per_second": 1540.69,
      "eval_steps_per_second": 16.054,
      "step": 280000
    },
    {
      "epoch": 2.84,
      "learning_rate": 4.219638554216868e-06,
      "loss": 0.6999,
      "step": 290000
    },
    {
      "epoch": 2.84,
      "eval_loss": 0.6833365559577942,
      "eval_runtime": 64.5739,
      "eval_samples_per_second": 1548.614,
      "eval_steps_per_second": 16.137,
      "step": 290000
    },
    {
      "epoch": 2.94,
      "learning_rate": 4.018935742971888e-06,
      "loss": 0.6992,
      "step": 300000
    },
    {
      "epoch": 2.94,
      "eval_loss": 0.6834575533866882,
      "eval_runtime": 64.6706,
      "eval_samples_per_second": 1546.298,
      "eval_steps_per_second": 16.112,
      "step": 300000
    },
    {
      "epoch": 3.04,
      "learning_rate": 3.818253012048193e-06,
      "loss": 0.6975,
      "step": 310000
    },
    {
      "epoch": 3.04,
      "eval_loss": 0.6793897151947021,
      "eval_runtime": 64.8495,
      "eval_samples_per_second": 1542.031,
      "eval_steps_per_second": 16.068,
      "step": 310000
    },
    {
      "epoch": 3.13,
      "learning_rate": 3.617530120481928e-06,
      "loss": 0.6947,
      "step": 320000
    },
    {
      "epoch": 3.13,
      "eval_loss": 0.6794592142105103,
      "eval_runtime": 64.6554,
      "eval_samples_per_second": 1546.661,
      "eval_steps_per_second": 16.116,
      "step": 320000
    },
    {
      "epoch": 3.23,
      "learning_rate": 3.416827309236948e-06,
      "loss": 0.6947,
      "step": 330000
    },
    {
      "epoch": 3.23,
      "eval_loss": 0.6781629920005798,
      "eval_runtime": 65.93,
      "eval_samples_per_second": 1516.76,
      "eval_steps_per_second": 15.805,
      "step": 330000
    },
    {
      "epoch": 3.33,
      "learning_rate": 3.216124497991968e-06,
      "loss": 0.6932,
      "step": 340000
    },
    {
      "epoch": 3.33,
      "eval_loss": 0.6771513819694519,
      "eval_runtime": 64.7971,
      "eval_samples_per_second": 1543.279,
      "eval_steps_per_second": 16.081,
      "step": 340000
    },
    {
      "epoch": 3.43,
      "learning_rate": 3.015401606425703e-06,
      "loss": 0.6914,
      "step": 350000
    },
    {
      "epoch": 3.43,
      "eval_loss": 0.6757428050041199,
      "eval_runtime": 64.8856,
      "eval_samples_per_second": 1541.173,
      "eval_steps_per_second": 16.059,
      "step": 350000
    },
    {
      "epoch": 3.53,
      "learning_rate": 2.8146987951807233e-06,
      "loss": 0.6907,
      "step": 360000
    },
    {
      "epoch": 3.53,
      "eval_loss": 0.6730121374130249,
      "eval_runtime": 65.1098,
      "eval_samples_per_second": 1535.867,
      "eval_steps_per_second": 16.004,
      "step": 360000
    },
    {
      "epoch": 3.62,
      "learning_rate": 2.6139959839357434e-06,
      "loss": 0.689,
      "step": 370000
    },
    {
      "epoch": 3.62,
      "eval_loss": 0.6724720597267151,
      "eval_runtime": 65.1981,
      "eval_samples_per_second": 1533.787,
      "eval_steps_per_second": 15.982,
      "step": 370000
    },
    {
      "epoch": 3.72,
      "learning_rate": 2.413293172690763e-06,
      "loss": 0.6885,
      "step": 380000
    },
    {
      "epoch": 3.72,
      "eval_loss": 0.6744215488433838,
      "eval_runtime": 64.9859,
      "eval_samples_per_second": 1538.796,
      "eval_steps_per_second": 16.034,
      "step": 380000
    }
  ],
  "logging_steps": 10000,
  "max_steps": 500000,
  "num_train_epochs": 5,
  "save_steps": 10000,
  "total_flos": 6.301004166565632e+17,
  "trial_name": null,
  "trial_params": null
}