|
{ |
|
"best_metric": 0.6895740032196045, |
|
"best_model_checkpoint": "./checkpoints/llava-v1.5-13b/checkpoint-224", |
|
"epoch": 10.0, |
|
"eval_steps": 1.0, |
|
"global_step": 320, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03125, |
|
"grad_norm": 0.2380081706918525, |
|
"learning_rate": 0.0, |
|
"loss": 1.2458, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.03125, |
|
"eval_loss": 1.3161638975143433, |
|
"eval_runtime": 50.8995, |
|
"eval_samples_per_second": 3.929, |
|
"eval_steps_per_second": 0.255, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 0.20429495268987705, |
|
"learning_rate": 8.613531161467863e-06, |
|
"loss": 1.2003, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"eval_loss": 1.3161638975143433, |
|
"eval_runtime": 47.4818, |
|
"eval_samples_per_second": 4.212, |
|
"eval_steps_per_second": 0.274, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.09375, |
|
"grad_norm": 0.20616215800420787, |
|
"learning_rate": 1.3652123889719709e-05, |
|
"loss": 1.2622, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.09375, |
|
"eval_loss": 1.309991478919983, |
|
"eval_runtime": 47.4152, |
|
"eval_samples_per_second": 4.218, |
|
"eval_steps_per_second": 0.274, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 0.20155595022101944, |
|
"learning_rate": 1.7227062322935725e-05, |
|
"loss": 1.2845, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"eval_loss": 1.3013781309127808, |
|
"eval_runtime": 47.4814, |
|
"eval_samples_per_second": 4.212, |
|
"eval_steps_per_second": 0.274, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"grad_norm": 0.21113117474989132, |
|
"learning_rate": 2e-05, |
|
"loss": 1.246, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"eval_loss": 1.2892160415649414, |
|
"eval_runtime": 47.7209, |
|
"eval_samples_per_second": 4.191, |
|
"eval_steps_per_second": 0.272, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 0.21377946631015488, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2684, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"eval_loss": 1.2754532098770142, |
|
"eval_runtime": 47.5781, |
|
"eval_samples_per_second": 4.204, |
|
"eval_steps_per_second": 0.273, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.21875, |
|
"grad_norm": 0.2284268997618767, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2681, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.21875, |
|
"eval_loss": 1.2605774402618408, |
|
"eval_runtime": 47.5326, |
|
"eval_samples_per_second": 4.208, |
|
"eval_steps_per_second": 0.273, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.23585343568544442, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2407, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 1.244718313217163, |
|
"eval_runtime": 47.5001, |
|
"eval_samples_per_second": 4.211, |
|
"eval_steps_per_second": 0.274, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.28125, |
|
"grad_norm": 0.23051191992462533, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2766, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.28125, |
|
"eval_loss": 1.2285138368606567, |
|
"eval_runtime": 47.4631, |
|
"eval_samples_per_second": 4.214, |
|
"eval_steps_per_second": 0.274, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 0.22726394327484983, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2024, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"eval_loss": 1.2118008136749268, |
|
"eval_runtime": 47.4991, |
|
"eval_samples_per_second": 4.211, |
|
"eval_steps_per_second": 0.274, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.34375, |
|
"grad_norm": 0.25404890894461285, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2742, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.34375, |
|
"eval_loss": 1.1942989826202393, |
|
"eval_runtime": 49.2609, |
|
"eval_samples_per_second": 4.06, |
|
"eval_steps_per_second": 0.264, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 0.26336210916526287, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2258, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"eval_loss": 1.176426649093628, |
|
"eval_runtime": 49.0639, |
|
"eval_samples_per_second": 4.076, |
|
"eval_steps_per_second": 0.265, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.40625, |
|
"grad_norm": 0.29637148470746666, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2345, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.40625, |
|
"eval_loss": 1.1577811241149902, |
|
"eval_runtime": 49.1352, |
|
"eval_samples_per_second": 4.07, |
|
"eval_steps_per_second": 0.265, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"grad_norm": 0.2841880377627424, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0765, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"eval_loss": 1.1381279230117798, |
|
"eval_runtime": 49.25, |
|
"eval_samples_per_second": 4.061, |
|
"eval_steps_per_second": 0.264, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"grad_norm": 0.2773140636191091, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1812, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"eval_loss": 1.1178216934204102, |
|
"eval_runtime": 49.0879, |
|
"eval_samples_per_second": 4.074, |
|
"eval_steps_per_second": 0.265, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.3568607365552051, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1327, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 1.0954149961471558, |
|
"eval_runtime": 48.6546, |
|
"eval_samples_per_second": 4.111, |
|
"eval_steps_per_second": 0.267, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.53125, |
|
"grad_norm": 0.32574391414112897, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1162, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.53125, |
|
"eval_loss": 1.071275234222412, |
|
"eval_runtime": 48.5618, |
|
"eval_samples_per_second": 4.118, |
|
"eval_steps_per_second": 0.268, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"grad_norm": 0.4256864144638081, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1138, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"eval_loss": 1.0455905199050903, |
|
"eval_runtime": 48.4981, |
|
"eval_samples_per_second": 4.124, |
|
"eval_steps_per_second": 0.268, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.59375, |
|
"grad_norm": 0.31230014132112643, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0011, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.59375, |
|
"eval_loss": 1.0208789110183716, |
|
"eval_runtime": 48.4675, |
|
"eval_samples_per_second": 4.126, |
|
"eval_steps_per_second": 0.268, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 0.3025724039243594, |
|
"learning_rate": 2e-05, |
|
"loss": 1.109, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"eval_loss": 1.002480149269104, |
|
"eval_runtime": 48.5265, |
|
"eval_samples_per_second": 4.121, |
|
"eval_steps_per_second": 0.268, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.65625, |
|
"grad_norm": 0.27787879590501874, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0291, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.65625, |
|
"eval_loss": 0.9933492541313171, |
|
"eval_runtime": 50.0369, |
|
"eval_samples_per_second": 3.997, |
|
"eval_steps_per_second": 0.26, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"grad_norm": 0.4231294067130801, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0779, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"eval_loss": 0.9850385785102844, |
|
"eval_runtime": 50.0062, |
|
"eval_samples_per_second": 4.0, |
|
"eval_steps_per_second": 0.26, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.71875, |
|
"grad_norm": 0.42130097437373987, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0897, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.71875, |
|
"eval_loss": 0.9758670330047607, |
|
"eval_runtime": 50.1031, |
|
"eval_samples_per_second": 3.992, |
|
"eval_steps_per_second": 0.259, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.27711808063263893, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0739, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 0.9674506187438965, |
|
"eval_runtime": 50.0337, |
|
"eval_samples_per_second": 3.997, |
|
"eval_steps_per_second": 0.26, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"grad_norm": 0.2879649409281791, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0182, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"eval_loss": 0.9592065215110779, |
|
"eval_runtime": 50.0709, |
|
"eval_samples_per_second": 3.994, |
|
"eval_steps_per_second": 0.26, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"grad_norm": 0.19327450826076825, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0413, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"eval_loss": 0.9518552422523499, |
|
"eval_runtime": 50.0572, |
|
"eval_samples_per_second": 3.995, |
|
"eval_steps_per_second": 0.26, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.84375, |
|
"grad_norm": 0.19707021382445633, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9525, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.84375, |
|
"eval_loss": 0.9449941515922546, |
|
"eval_runtime": 50.0515, |
|
"eval_samples_per_second": 3.996, |
|
"eval_steps_per_second": 0.26, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 0.2420270757641518, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9658, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"eval_loss": 0.9378474354743958, |
|
"eval_runtime": 49.9299, |
|
"eval_samples_per_second": 4.006, |
|
"eval_steps_per_second": 0.26, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.90625, |
|
"grad_norm": 0.18074632782127534, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9866, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.90625, |
|
"eval_loss": 0.93099045753479, |
|
"eval_runtime": 50.0096, |
|
"eval_samples_per_second": 3.999, |
|
"eval_steps_per_second": 0.26, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 0.1936051126921734, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0128, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"eval_loss": 0.9244199991226196, |
|
"eval_runtime": 50.2469, |
|
"eval_samples_per_second": 3.98, |
|
"eval_steps_per_second": 0.259, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.96875, |
|
"grad_norm": 0.26164254459782943, |
|
"learning_rate": 2e-05, |
|
"loss": 0.88, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.96875, |
|
"eval_loss": 0.9175177216529846, |
|
"eval_runtime": 50.1695, |
|
"eval_samples_per_second": 3.986, |
|
"eval_steps_per_second": 0.259, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.18677152741688485, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9569, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.9108598828315735, |
|
"eval_runtime": 50.0387, |
|
"eval_samples_per_second": 3.997, |
|
"eval_steps_per_second": 0.26, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.03125, |
|
"grad_norm": 0.20486279036126417, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0208, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.03125, |
|
"eval_loss": 0.9042049646377563, |
|
"eval_runtime": 50.1472, |
|
"eval_samples_per_second": 3.988, |
|
"eval_steps_per_second": 0.259, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.0625, |
|
"grad_norm": 0.2004946169291112, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9931, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.0625, |
|
"eval_loss": 0.8980298042297363, |
|
"eval_runtime": 50.245, |
|
"eval_samples_per_second": 3.98, |
|
"eval_steps_per_second": 0.259, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.09375, |
|
"grad_norm": 0.1645872432258401, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0184, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.09375, |
|
"eval_loss": 0.8924428820610046, |
|
"eval_runtime": 50.3703, |
|
"eval_samples_per_second": 3.971, |
|
"eval_steps_per_second": 0.258, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"grad_norm": 0.18293519304435016, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0026, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"eval_loss": 0.8870412707328796, |
|
"eval_runtime": 50.0483, |
|
"eval_samples_per_second": 3.996, |
|
"eval_steps_per_second": 0.26, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.15625, |
|
"grad_norm": 0.17712548516246762, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9387, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.15625, |
|
"eval_loss": 0.881915271282196, |
|
"eval_runtime": 49.9751, |
|
"eval_samples_per_second": 4.002, |
|
"eval_steps_per_second": 0.26, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.1875, |
|
"grad_norm": 0.21472689311609464, |
|
"learning_rate": 2e-05, |
|
"loss": 0.958, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.1875, |
|
"eval_loss": 0.8768754601478577, |
|
"eval_runtime": 50.1204, |
|
"eval_samples_per_second": 3.99, |
|
"eval_steps_per_second": 0.259, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.21875, |
|
"grad_norm": 0.21117297910005806, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9922, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.21875, |
|
"eval_loss": 0.8718628883361816, |
|
"eval_runtime": 50.1732, |
|
"eval_samples_per_second": 3.986, |
|
"eval_steps_per_second": 0.259, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.17835587003909165, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9776, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_loss": 0.8669865131378174, |
|
"eval_runtime": 50.1148, |
|
"eval_samples_per_second": 3.991, |
|
"eval_steps_per_second": 0.259, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.28125, |
|
"grad_norm": 0.2092736372483734, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9731, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.28125, |
|
"eval_loss": 0.8619834780693054, |
|
"eval_runtime": 50.052, |
|
"eval_samples_per_second": 3.996, |
|
"eval_steps_per_second": 0.26, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.3125, |
|
"grad_norm": 0.2338857391910308, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9319, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.3125, |
|
"eval_loss": 0.8572126030921936, |
|
"eval_runtime": 50.1212, |
|
"eval_samples_per_second": 3.99, |
|
"eval_steps_per_second": 0.259, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.34375, |
|
"grad_norm": 0.19168719284572813, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9083, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.34375, |
|
"eval_loss": 0.8525611758232117, |
|
"eval_runtime": 50.1733, |
|
"eval_samples_per_second": 3.986, |
|
"eval_steps_per_second": 0.259, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"grad_norm": 0.20004868138433377, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9118, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"eval_loss": 0.8483461141586304, |
|
"eval_runtime": 50.1083, |
|
"eval_samples_per_second": 3.991, |
|
"eval_steps_per_second": 0.259, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.40625, |
|
"grad_norm": 0.19012965506122342, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8888, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.40625, |
|
"eval_loss": 0.8446614742279053, |
|
"eval_runtime": 50.1171, |
|
"eval_samples_per_second": 3.991, |
|
"eval_steps_per_second": 0.259, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.4375, |
|
"grad_norm": 0.21187005706805245, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9319, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.4375, |
|
"eval_loss": 0.8412036299705505, |
|
"eval_runtime": 50.0918, |
|
"eval_samples_per_second": 3.993, |
|
"eval_steps_per_second": 0.26, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.46875, |
|
"grad_norm": 0.19673832205926584, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9359, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.46875, |
|
"eval_loss": 0.8380417823791504, |
|
"eval_runtime": 50.2214, |
|
"eval_samples_per_second": 3.982, |
|
"eval_steps_per_second": 0.259, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.21712294106174318, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8511, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"eval_loss": 0.8353021740913391, |
|
"eval_runtime": 50.1617, |
|
"eval_samples_per_second": 3.987, |
|
"eval_steps_per_second": 0.259, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.53125, |
|
"grad_norm": 0.2138924779700934, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8695, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.53125, |
|
"eval_loss": 0.8327407836914062, |
|
"eval_runtime": 50.1442, |
|
"eval_samples_per_second": 3.988, |
|
"eval_steps_per_second": 0.259, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"grad_norm": 0.22387442384578618, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8518, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"eval_loss": 0.8301742076873779, |
|
"eval_runtime": 50.1867, |
|
"eval_samples_per_second": 3.985, |
|
"eval_steps_per_second": 0.259, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.59375, |
|
"grad_norm": 0.1975577146517192, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8868, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.59375, |
|
"eval_loss": 0.8275265693664551, |
|
"eval_runtime": 51.2257, |
|
"eval_samples_per_second": 3.904, |
|
"eval_steps_per_second": 0.254, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"grad_norm": 0.21474817057286624, |
|
"learning_rate": 2e-05, |
|
"loss": 0.767, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"eval_loss": 0.824796736240387, |
|
"eval_runtime": 51.276, |
|
"eval_samples_per_second": 3.9, |
|
"eval_steps_per_second": 0.254, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.65625, |
|
"grad_norm": 0.21105651676755652, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9219, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.65625, |
|
"eval_loss": 0.8221166729927063, |
|
"eval_runtime": 51.141, |
|
"eval_samples_per_second": 3.911, |
|
"eval_steps_per_second": 0.254, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.6875, |
|
"grad_norm": 0.20706475184742085, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8873, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.6875, |
|
"eval_loss": 0.819589376449585, |
|
"eval_runtime": 51.0045, |
|
"eval_samples_per_second": 3.921, |
|
"eval_steps_per_second": 0.255, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.71875, |
|
"grad_norm": 0.21722220033855957, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8956, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.71875, |
|
"eval_loss": 0.8176340460777283, |
|
"eval_runtime": 51.1941, |
|
"eval_samples_per_second": 3.907, |
|
"eval_steps_per_second": 0.254, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.20669001221665667, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9506, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"eval_loss": 0.8158826231956482, |
|
"eval_runtime": 52.1162, |
|
"eval_samples_per_second": 3.838, |
|
"eval_steps_per_second": 0.249, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.78125, |
|
"grad_norm": 0.22189732090066341, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8955, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.78125, |
|
"eval_loss": 0.814656674861908, |
|
"eval_runtime": 52.1361, |
|
"eval_samples_per_second": 3.836, |
|
"eval_steps_per_second": 0.249, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.8125, |
|
"grad_norm": 0.2030113892848459, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9108, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.8125, |
|
"eval_loss": 0.813343346118927, |
|
"eval_runtime": 52.2552, |
|
"eval_samples_per_second": 3.827, |
|
"eval_steps_per_second": 0.249, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.84375, |
|
"grad_norm": 0.2123201057569791, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8779, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.84375, |
|
"eval_loss": 0.8116877675056458, |
|
"eval_runtime": 52.1233, |
|
"eval_samples_per_second": 3.837, |
|
"eval_steps_per_second": 0.249, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 0.211551126937912, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9294, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"eval_loss": 0.8098442554473877, |
|
"eval_runtime": 52.1091, |
|
"eval_samples_per_second": 3.838, |
|
"eval_steps_per_second": 0.249, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.90625, |
|
"grad_norm": 0.24981344981629752, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8409, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.90625, |
|
"eval_loss": 0.8070770502090454, |
|
"eval_runtime": 53.4187, |
|
"eval_samples_per_second": 3.744, |
|
"eval_steps_per_second": 0.243, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.9375, |
|
"grad_norm": 0.2341550589775159, |
|
"learning_rate": 2e-05, |
|
"loss": 0.888, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.9375, |
|
"eval_loss": 0.8040286898612976, |
|
"eval_runtime": 53.2197, |
|
"eval_samples_per_second": 3.758, |
|
"eval_steps_per_second": 0.244, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.96875, |
|
"grad_norm": 0.2336241775649256, |
|
"learning_rate": 2e-05, |
|
"loss": 0.913, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 1.96875, |
|
"eval_loss": 0.8013430833816528, |
|
"eval_runtime": 53.1784, |
|
"eval_samples_per_second": 3.761, |
|
"eval_steps_per_second": 0.244, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.2414390628081758, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8754, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.7985894680023193, |
|
"eval_runtime": 53.2454, |
|
"eval_samples_per_second": 3.756, |
|
"eval_steps_per_second": 0.244, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.03125, |
|
"grad_norm": 0.2484104465653703, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8497, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.03125, |
|
"eval_loss": 0.7954932451248169, |
|
"eval_runtime": 53.3794, |
|
"eval_samples_per_second": 3.747, |
|
"eval_steps_per_second": 0.244, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.0625, |
|
"grad_norm": 0.23859744120942086, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8567, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.0625, |
|
"eval_loss": 0.7929843068122864, |
|
"eval_runtime": 55.517, |
|
"eval_samples_per_second": 3.602, |
|
"eval_steps_per_second": 0.234, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.09375, |
|
"grad_norm": 0.24584758647855462, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8489, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.09375, |
|
"eval_loss": 0.7903321981430054, |
|
"eval_runtime": 55.4151, |
|
"eval_samples_per_second": 3.609, |
|
"eval_steps_per_second": 0.235, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.125, |
|
"grad_norm": 0.2484917818304153, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9122, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.125, |
|
"eval_loss": 0.7877185344696045, |
|
"eval_runtime": 55.4069, |
|
"eval_samples_per_second": 3.61, |
|
"eval_steps_per_second": 0.235, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.15625, |
|
"grad_norm": 0.2184614083026819, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8355, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.15625, |
|
"eval_loss": 0.7852210998535156, |
|
"eval_runtime": 55.3381, |
|
"eval_samples_per_second": 3.614, |
|
"eval_steps_per_second": 0.235, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"grad_norm": 0.24978410070800153, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7968, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"eval_loss": 0.7827157378196716, |
|
"eval_runtime": 55.3708, |
|
"eval_samples_per_second": 3.612, |
|
"eval_steps_per_second": 0.235, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.21875, |
|
"grad_norm": 0.23059883325890385, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8783, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.21875, |
|
"eval_loss": 0.7805906534194946, |
|
"eval_runtime": 55.6033, |
|
"eval_samples_per_second": 3.597, |
|
"eval_steps_per_second": 0.234, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.23261007334915096, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7956, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"eval_loss": 0.7786691784858704, |
|
"eval_runtime": 55.0913, |
|
"eval_samples_per_second": 3.63, |
|
"eval_steps_per_second": 0.236, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.28125, |
|
"grad_norm": 0.25779598356574085, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8426, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.28125, |
|
"eval_loss": 0.7771151661872864, |
|
"eval_runtime": 55.0698, |
|
"eval_samples_per_second": 3.632, |
|
"eval_steps_per_second": 0.236, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.3125, |
|
"grad_norm": 0.2288243335971112, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8381, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.3125, |
|
"eval_loss": 0.7756838202476501, |
|
"eval_runtime": 54.8412, |
|
"eval_samples_per_second": 3.647, |
|
"eval_steps_per_second": 0.237, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.34375, |
|
"grad_norm": 0.24235644907977733, |
|
"learning_rate": 2e-05, |
|
"loss": 0.887, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.34375, |
|
"eval_loss": 0.7739972472190857, |
|
"eval_runtime": 54.9718, |
|
"eval_samples_per_second": 3.638, |
|
"eval_steps_per_second": 0.236, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.375, |
|
"grad_norm": 0.23666820017867402, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8007, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.375, |
|
"eval_loss": 0.7724328637123108, |
|
"eval_runtime": 55.0225, |
|
"eval_samples_per_second": 3.635, |
|
"eval_steps_per_second": 0.236, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.40625, |
|
"grad_norm": 0.22815737396609181, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8529, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 2.40625, |
|
"eval_loss": 0.7710004448890686, |
|
"eval_runtime": 55.321, |
|
"eval_samples_per_second": 3.615, |
|
"eval_steps_per_second": 0.235, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 2.4375, |
|
"grad_norm": 0.2701264871470739, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8515, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 2.4375, |
|
"eval_loss": 0.7695322632789612, |
|
"eval_runtime": 55.3045, |
|
"eval_samples_per_second": 3.616, |
|
"eval_steps_per_second": 0.235, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 2.46875, |
|
"grad_norm": 0.24363813951328234, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8587, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 2.46875, |
|
"eval_loss": 0.7689024209976196, |
|
"eval_runtime": 55.3009, |
|
"eval_samples_per_second": 3.617, |
|
"eval_steps_per_second": 0.235, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.30924701355253065, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9076, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_loss": 0.7676254510879517, |
|
"eval_runtime": 55.2365, |
|
"eval_samples_per_second": 3.621, |
|
"eval_steps_per_second": 0.235, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.53125, |
|
"grad_norm": 0.2665188280221636, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8445, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 2.53125, |
|
"eval_loss": 0.7661146521568298, |
|
"eval_runtime": 55.2775, |
|
"eval_samples_per_second": 3.618, |
|
"eval_steps_per_second": 0.235, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 2.5625, |
|
"grad_norm": 0.24674191720675534, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8882, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 2.5625, |
|
"eval_loss": 0.76513671875, |
|
"eval_runtime": 55.0857, |
|
"eval_samples_per_second": 3.631, |
|
"eval_steps_per_second": 0.236, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 2.59375, |
|
"grad_norm": 0.2736689405531704, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8336, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 2.59375, |
|
"eval_loss": 0.764373779296875, |
|
"eval_runtime": 55.2069, |
|
"eval_samples_per_second": 3.623, |
|
"eval_steps_per_second": 0.235, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"grad_norm": 0.290841287198557, |
|
"learning_rate": 2e-05, |
|
"loss": 0.795, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"eval_loss": 0.7632084488868713, |
|
"eval_runtime": 55.1009, |
|
"eval_samples_per_second": 3.63, |
|
"eval_steps_per_second": 0.236, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 2.65625, |
|
"grad_norm": 0.2912051076836381, |
|
"learning_rate": 2e-05, |
|
"loss": 0.772, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.65625, |
|
"eval_loss": 0.7618446350097656, |
|
"eval_runtime": 55.3717, |
|
"eval_samples_per_second": 3.612, |
|
"eval_steps_per_second": 0.235, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.6875, |
|
"grad_norm": 0.3169908538809109, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8148, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 2.6875, |
|
"eval_loss": 0.7599577307701111, |
|
"eval_runtime": 55.3931, |
|
"eval_samples_per_second": 3.611, |
|
"eval_steps_per_second": 0.235, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 2.71875, |
|
"grad_norm": 0.28780549186847426, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8154, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 2.71875, |
|
"eval_loss": 0.7583369612693787, |
|
"eval_runtime": 55.1679, |
|
"eval_samples_per_second": 3.625, |
|
"eval_steps_per_second": 0.236, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.30695250620091474, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9032, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"eval_loss": 0.7571613192558289, |
|
"eval_runtime": 55.1779, |
|
"eval_samples_per_second": 3.625, |
|
"eval_steps_per_second": 0.236, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 2.78125, |
|
"grad_norm": 0.2693887416759828, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8106, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 2.78125, |
|
"eval_loss": 0.7566004991531372, |
|
"eval_runtime": 55.1107, |
|
"eval_samples_per_second": 3.629, |
|
"eval_steps_per_second": 0.236, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"grad_norm": 0.2887583627563198, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8518, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"eval_loss": 0.7558963298797607, |
|
"eval_runtime": 55.2153, |
|
"eval_samples_per_second": 3.622, |
|
"eval_steps_per_second": 0.235, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.84375, |
|
"grad_norm": 0.3059402168979351, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7727, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 2.84375, |
|
"eval_loss": 0.7545350790023804, |
|
"eval_runtime": 55.3225, |
|
"eval_samples_per_second": 3.615, |
|
"eval_steps_per_second": 0.235, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 2.875, |
|
"grad_norm": 0.3096260477909968, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8477, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.875, |
|
"eval_loss": 0.7526452541351318, |
|
"eval_runtime": 55.4311, |
|
"eval_samples_per_second": 3.608, |
|
"eval_steps_per_second": 0.235, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.90625, |
|
"grad_norm": 0.31498884686525297, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7982, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 2.90625, |
|
"eval_loss": 0.7510760426521301, |
|
"eval_runtime": 55.4361, |
|
"eval_samples_per_second": 3.608, |
|
"eval_steps_per_second": 0.235, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 2.9375, |
|
"grad_norm": 0.31302830623184313, |
|
"learning_rate": 2e-05, |
|
"loss": 0.871, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 2.9375, |
|
"eval_loss": 0.7500898838043213, |
|
"eval_runtime": 55.3025, |
|
"eval_samples_per_second": 3.616, |
|
"eval_steps_per_second": 0.235, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 2.96875, |
|
"grad_norm": 0.3132608568779145, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8094, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 2.96875, |
|
"eval_loss": 0.7498895525932312, |
|
"eval_runtime": 55.2402, |
|
"eval_samples_per_second": 3.621, |
|
"eval_steps_per_second": 0.235, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.298645350091386, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7673, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.7493192553520203, |
|
"eval_runtime": 54.8718, |
|
"eval_samples_per_second": 3.645, |
|
"eval_steps_per_second": 0.237, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 3.03125, |
|
"grad_norm": 0.34042584783125357, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7336, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 3.03125, |
|
"eval_loss": 0.7476670742034912, |
|
"eval_runtime": 54.9305, |
|
"eval_samples_per_second": 3.641, |
|
"eval_steps_per_second": 0.237, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 3.0625, |
|
"grad_norm": 0.293099043801068, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8088, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 3.0625, |
|
"eval_loss": 0.745802640914917, |
|
"eval_runtime": 55.2051, |
|
"eval_samples_per_second": 3.623, |
|
"eval_steps_per_second": 0.235, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 3.09375, |
|
"grad_norm": 0.3042839507858426, |
|
"learning_rate": 2e-05, |
|
"loss": 0.787, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 3.09375, |
|
"eval_loss": 0.7439618110656738, |
|
"eval_runtime": 55.0065, |
|
"eval_samples_per_second": 3.636, |
|
"eval_steps_per_second": 0.236, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"grad_norm": 0.32992077073227005, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8296, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"eval_loss": 0.7424842715263367, |
|
"eval_runtime": 55.1254, |
|
"eval_samples_per_second": 3.628, |
|
"eval_steps_per_second": 0.236, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.15625, |
|
"grad_norm": 0.2798839747424062, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7642, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 3.15625, |
|
"eval_loss": 0.7414796948432922, |
|
"eval_runtime": 49.183, |
|
"eval_samples_per_second": 4.066, |
|
"eval_steps_per_second": 0.264, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 3.1875, |
|
"grad_norm": 0.3046631191964983, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8203, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 3.1875, |
|
"eval_loss": 0.7410265207290649, |
|
"eval_runtime": 48.1541, |
|
"eval_samples_per_second": 4.153, |
|
"eval_steps_per_second": 0.27, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 3.21875, |
|
"grad_norm": 0.3117517214859861, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8222, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 3.21875, |
|
"eval_loss": 0.7405675649642944, |
|
"eval_runtime": 47.7145, |
|
"eval_samples_per_second": 4.192, |
|
"eval_steps_per_second": 0.272, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.3412709249466801, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7459, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"eval_loss": 0.7395681738853455, |
|
"eval_runtime": 47.5855, |
|
"eval_samples_per_second": 4.203, |
|
"eval_steps_per_second": 0.273, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 3.28125, |
|
"grad_norm": 0.2917443566507923, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7849, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 3.28125, |
|
"eval_loss": 0.7387100458145142, |
|
"eval_runtime": 47.6344, |
|
"eval_samples_per_second": 4.199, |
|
"eval_steps_per_second": 0.273, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 3.3125, |
|
"grad_norm": 0.3054484743574741, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8354, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 3.3125, |
|
"eval_loss": 0.7384718060493469, |
|
"eval_runtime": 47.8373, |
|
"eval_samples_per_second": 4.181, |
|
"eval_steps_per_second": 0.272, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 3.34375, |
|
"grad_norm": 0.34986630381114014, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7069, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 3.34375, |
|
"eval_loss": 0.737342357635498, |
|
"eval_runtime": 47.5763, |
|
"eval_samples_per_second": 4.204, |
|
"eval_steps_per_second": 0.273, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 3.375, |
|
"grad_norm": 0.32324403145716496, |
|
"learning_rate": 2e-05, |
|
"loss": 0.767, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 3.375, |
|
"eval_loss": 0.7360101938247681, |
|
"eval_runtime": 47.5774, |
|
"eval_samples_per_second": 4.204, |
|
"eval_steps_per_second": 0.273, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 3.40625, |
|
"grad_norm": 0.3795969851258545, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7556, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 3.40625, |
|
"eval_loss": 0.7339167594909668, |
|
"eval_runtime": 47.5818, |
|
"eval_samples_per_second": 4.203, |
|
"eval_steps_per_second": 0.273, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 3.4375, |
|
"grad_norm": 0.34401062275458993, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7494, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 3.4375, |
|
"eval_loss": 0.7321068644523621, |
|
"eval_runtime": 47.7643, |
|
"eval_samples_per_second": 4.187, |
|
"eval_steps_per_second": 0.272, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 3.46875, |
|
"grad_norm": 0.3248480010385237, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8103, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 3.46875, |
|
"eval_loss": 0.7309197783470154, |
|
"eval_runtime": 49.5841, |
|
"eval_samples_per_second": 4.034, |
|
"eval_steps_per_second": 0.262, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.3572409124813593, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7972, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"eval_loss": 0.7301727533340454, |
|
"eval_runtime": 49.3728, |
|
"eval_samples_per_second": 4.051, |
|
"eval_steps_per_second": 0.263, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 3.53125, |
|
"grad_norm": 0.37348522775103665, |
|
"learning_rate": 2e-05, |
|
"loss": 0.88, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 3.53125, |
|
"eval_loss": 0.7292957305908203, |
|
"eval_runtime": 49.2192, |
|
"eval_samples_per_second": 4.063, |
|
"eval_steps_per_second": 0.264, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 3.5625, |
|
"grad_norm": 0.37667450960329546, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7518, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 3.5625, |
|
"eval_loss": 0.728556215763092, |
|
"eval_runtime": 49.0971, |
|
"eval_samples_per_second": 4.074, |
|
"eval_steps_per_second": 0.265, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 3.59375, |
|
"grad_norm": 0.3163628607304638, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7948, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 3.59375, |
|
"eval_loss": 0.7287828326225281, |
|
"eval_runtime": 49.0213, |
|
"eval_samples_per_second": 4.08, |
|
"eval_steps_per_second": 0.265, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 3.625, |
|
"grad_norm": 0.3038899302084592, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7791, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 3.625, |
|
"eval_loss": 0.7294514179229736, |
|
"eval_runtime": 51.9137, |
|
"eval_samples_per_second": 3.853, |
|
"eval_steps_per_second": 0.25, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 3.65625, |
|
"grad_norm": 0.3746448663122327, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7863, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 3.65625, |
|
"eval_loss": 0.7289304137229919, |
|
"eval_runtime": 51.3023, |
|
"eval_samples_per_second": 3.898, |
|
"eval_steps_per_second": 0.253, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 3.6875, |
|
"grad_norm": 0.4058937381299434, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7907, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 3.6875, |
|
"eval_loss": 0.7281011343002319, |
|
"eval_runtime": 50.8635, |
|
"eval_samples_per_second": 3.932, |
|
"eval_steps_per_second": 0.256, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 3.71875, |
|
"grad_norm": 0.31608065583227885, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8348, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 3.71875, |
|
"eval_loss": 0.7280247211456299, |
|
"eval_runtime": 50.4903, |
|
"eval_samples_per_second": 3.961, |
|
"eval_steps_per_second": 0.257, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.3375768031046084, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7783, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"eval_loss": 0.7281913757324219, |
|
"eval_runtime": 50.5906, |
|
"eval_samples_per_second": 3.953, |
|
"eval_steps_per_second": 0.257, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 3.78125, |
|
"grad_norm": 0.36047493494859845, |
|
"learning_rate": 2e-05, |
|
"loss": 0.765, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 3.78125, |
|
"eval_loss": 0.7269737124443054, |
|
"eval_runtime": 53.4722, |
|
"eval_samples_per_second": 3.74, |
|
"eval_steps_per_second": 0.243, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 3.8125, |
|
"grad_norm": 0.389743860171921, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8269, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 3.8125, |
|
"eval_loss": 0.7251996397972107, |
|
"eval_runtime": 53.4986, |
|
"eval_samples_per_second": 3.738, |
|
"eval_steps_per_second": 0.243, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 3.84375, |
|
"grad_norm": 0.33850935145960215, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7497, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 3.84375, |
|
"eval_loss": 0.723595142364502, |
|
"eval_runtime": 53.4196, |
|
"eval_samples_per_second": 3.744, |
|
"eval_steps_per_second": 0.243, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 3.875, |
|
"grad_norm": 0.3166770012114478, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7648, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 3.875, |
|
"eval_loss": 0.7223578095436096, |
|
"eval_runtime": 52.6143, |
|
"eval_samples_per_second": 3.801, |
|
"eval_steps_per_second": 0.247, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 3.90625, |
|
"grad_norm": 0.41948670305268276, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8306, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 3.90625, |
|
"eval_loss": 0.7206680774688721, |
|
"eval_runtime": 52.3885, |
|
"eval_samples_per_second": 3.818, |
|
"eval_steps_per_second": 0.248, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 3.9375, |
|
"grad_norm": 0.35580041105853477, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7945, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 3.9375, |
|
"eval_loss": 0.7196171283721924, |
|
"eval_runtime": 55.1225, |
|
"eval_samples_per_second": 3.628, |
|
"eval_steps_per_second": 0.236, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 3.96875, |
|
"grad_norm": 0.38411890663257114, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7466, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 3.96875, |
|
"eval_loss": 0.7188088297843933, |
|
"eval_runtime": 55.3068, |
|
"eval_samples_per_second": 3.616, |
|
"eval_steps_per_second": 0.235, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.3682220575203032, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6752, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 0.7181470990180969, |
|
"eval_runtime": 53.9116, |
|
"eval_samples_per_second": 3.71, |
|
"eval_steps_per_second": 0.241, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 4.03125, |
|
"grad_norm": 0.34160763542661665, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7788, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 4.03125, |
|
"eval_loss": 0.717949390411377, |
|
"eval_runtime": 53.8446, |
|
"eval_samples_per_second": 3.714, |
|
"eval_steps_per_second": 0.241, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 4.0625, |
|
"grad_norm": 0.35709301353799944, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8002, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 4.0625, |
|
"eval_loss": 0.7179380655288696, |
|
"eval_runtime": 53.9299, |
|
"eval_samples_per_second": 3.709, |
|
"eval_steps_per_second": 0.241, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 4.09375, |
|
"grad_norm": 0.3503147340749238, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7789, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 4.09375, |
|
"eval_loss": 0.7180312871932983, |
|
"eval_runtime": 53.4091, |
|
"eval_samples_per_second": 3.745, |
|
"eval_steps_per_second": 0.243, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 4.125, |
|
"grad_norm": 0.3931715546229069, |
|
"learning_rate": 2e-05, |
|
"loss": 0.762, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 4.125, |
|
"eval_loss": 0.717825710773468, |
|
"eval_runtime": 53.6366, |
|
"eval_samples_per_second": 3.729, |
|
"eval_steps_per_second": 0.242, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 4.15625, |
|
"grad_norm": 0.36864033862644363, |
|
"learning_rate": 2e-05, |
|
"loss": 0.829, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 4.15625, |
|
"eval_loss": 0.7178698182106018, |
|
"eval_runtime": 53.4891, |
|
"eval_samples_per_second": 3.739, |
|
"eval_steps_per_second": 0.243, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 4.1875, |
|
"grad_norm": 0.41393587587462155, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7624, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 4.1875, |
|
"eval_loss": 0.7181968092918396, |
|
"eval_runtime": 53.5395, |
|
"eval_samples_per_second": 3.736, |
|
"eval_steps_per_second": 0.243, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 4.21875, |
|
"grad_norm": 0.36727603900023204, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7572, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 4.21875, |
|
"eval_loss": 0.7187527418136597, |
|
"eval_runtime": 53.4818, |
|
"eval_samples_per_second": 3.74, |
|
"eval_steps_per_second": 0.243, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.3684078795455007, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7352, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"eval_loss": 0.7194793820381165, |
|
"eval_runtime": 53.4694, |
|
"eval_samples_per_second": 3.74, |
|
"eval_steps_per_second": 0.243, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 4.28125, |
|
"grad_norm": 0.42414766562621153, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7433, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 4.28125, |
|
"eval_loss": 0.7189603447914124, |
|
"eval_runtime": 53.8049, |
|
"eval_samples_per_second": 3.717, |
|
"eval_steps_per_second": 0.242, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 4.3125, |
|
"grad_norm": 0.40420796619211563, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7466, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 4.3125, |
|
"eval_loss": 0.7173956036567688, |
|
"eval_runtime": 53.4014, |
|
"eval_samples_per_second": 3.745, |
|
"eval_steps_per_second": 0.243, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 4.34375, |
|
"grad_norm": 0.36419740641344456, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7045, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 4.34375, |
|
"eval_loss": 0.7153105139732361, |
|
"eval_runtime": 53.285, |
|
"eval_samples_per_second": 3.753, |
|
"eval_steps_per_second": 0.244, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 4.375, |
|
"grad_norm": 0.384927357409491, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7437, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 4.375, |
|
"eval_loss": 0.7135314345359802, |
|
"eval_runtime": 53.4056, |
|
"eval_samples_per_second": 3.745, |
|
"eval_steps_per_second": 0.243, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 4.40625, |
|
"grad_norm": 0.37218579680263697, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7693, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 4.40625, |
|
"eval_loss": 0.7120725512504578, |
|
"eval_runtime": 53.5467, |
|
"eval_samples_per_second": 3.735, |
|
"eval_steps_per_second": 0.243, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 4.4375, |
|
"grad_norm": 0.38541382926033946, |
|
"learning_rate": 2e-05, |
|
"loss": 0.708, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 4.4375, |
|
"eval_loss": 0.7110380530357361, |
|
"eval_runtime": 53.4119, |
|
"eval_samples_per_second": 3.744, |
|
"eval_steps_per_second": 0.243, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 4.46875, |
|
"grad_norm": 0.4028726453247759, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7263, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 4.46875, |
|
"eval_loss": 0.7100683450698853, |
|
"eval_runtime": 53.4337, |
|
"eval_samples_per_second": 3.743, |
|
"eval_steps_per_second": 0.243, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.3736204162232246, |
|
"learning_rate": 2e-05, |
|
"loss": 0.698, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"eval_loss": 0.7093971371650696, |
|
"eval_runtime": 53.4582, |
|
"eval_samples_per_second": 3.741, |
|
"eval_steps_per_second": 0.243, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 4.53125, |
|
"grad_norm": 0.4179284798304916, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7611, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 4.53125, |
|
"eval_loss": 0.7089446783065796, |
|
"eval_runtime": 53.4752, |
|
"eval_samples_per_second": 3.74, |
|
"eval_steps_per_second": 0.243, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 4.5625, |
|
"grad_norm": 0.4038858950888911, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6652, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 4.5625, |
|
"eval_loss": 0.7089542150497437, |
|
"eval_runtime": 53.4741, |
|
"eval_samples_per_second": 3.74, |
|
"eval_steps_per_second": 0.243, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 4.59375, |
|
"grad_norm": 0.41740068710674544, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7319, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 4.59375, |
|
"eval_loss": 0.7090431451797485, |
|
"eval_runtime": 53.2419, |
|
"eval_samples_per_second": 3.756, |
|
"eval_steps_per_second": 0.244, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 4.625, |
|
"grad_norm": 0.4288335811568808, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6837, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 4.625, |
|
"eval_loss": 0.7088204026222229, |
|
"eval_runtime": 53.3614, |
|
"eval_samples_per_second": 3.748, |
|
"eval_steps_per_second": 0.244, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 4.65625, |
|
"grad_norm": 0.399955010119186, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7989, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 4.65625, |
|
"eval_loss": 0.7084855437278748, |
|
"eval_runtime": 53.4923, |
|
"eval_samples_per_second": 3.739, |
|
"eval_steps_per_second": 0.243, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 4.6875, |
|
"grad_norm": 0.41794643164255846, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7194, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 4.6875, |
|
"eval_loss": 0.7080708146095276, |
|
"eval_runtime": 53.639, |
|
"eval_samples_per_second": 3.729, |
|
"eval_steps_per_second": 0.242, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 4.71875, |
|
"grad_norm": 0.40953367303148197, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7354, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 4.71875, |
|
"eval_loss": 0.7077429890632629, |
|
"eval_runtime": 53.3837, |
|
"eval_samples_per_second": 3.746, |
|
"eval_steps_per_second": 0.244, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 0.5012282841513718, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7662, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"eval_loss": 0.7064151167869568, |
|
"eval_runtime": 53.3549, |
|
"eval_samples_per_second": 3.748, |
|
"eval_steps_per_second": 0.244, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 4.78125, |
|
"grad_norm": 0.4210784420989087, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7133, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 4.78125, |
|
"eval_loss": 0.7052726745605469, |
|
"eval_runtime": 53.5059, |
|
"eval_samples_per_second": 3.738, |
|
"eval_steps_per_second": 0.243, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 4.8125, |
|
"grad_norm": 0.43520348530514996, |
|
"learning_rate": 2e-05, |
|
"loss": 0.729, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 4.8125, |
|
"eval_loss": 0.7045274972915649, |
|
"eval_runtime": 53.8352, |
|
"eval_samples_per_second": 3.715, |
|
"eval_steps_per_second": 0.241, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 4.84375, |
|
"grad_norm": 0.4287647569802656, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6727, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 4.84375, |
|
"eval_loss": 0.7041358947753906, |
|
"eval_runtime": 53.7435, |
|
"eval_samples_per_second": 3.721, |
|
"eval_steps_per_second": 0.242, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 4.875, |
|
"grad_norm": 0.41883715320456333, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7755, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 4.875, |
|
"eval_loss": 0.7037128210067749, |
|
"eval_runtime": 53.8035, |
|
"eval_samples_per_second": 3.717, |
|
"eval_steps_per_second": 0.242, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 4.90625, |
|
"grad_norm": 0.40617584505395354, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7776, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 4.90625, |
|
"eval_loss": 0.703965425491333, |
|
"eval_runtime": 53.8731, |
|
"eval_samples_per_second": 3.712, |
|
"eval_steps_per_second": 0.241, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 4.9375, |
|
"grad_norm": 0.4085802225532245, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7628, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 4.9375, |
|
"eval_loss": 0.7040860056877136, |
|
"eval_runtime": 53.9059, |
|
"eval_samples_per_second": 3.71, |
|
"eval_steps_per_second": 0.241, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 4.96875, |
|
"grad_norm": 0.418039298119887, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7221, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 4.96875, |
|
"eval_loss": 0.7039948105812073, |
|
"eval_runtime": 53.7323, |
|
"eval_samples_per_second": 3.722, |
|
"eval_steps_per_second": 0.242, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.46118870048713073, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7029, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 0.703814685344696, |
|
"eval_runtime": 53.8975, |
|
"eval_samples_per_second": 3.711, |
|
"eval_steps_per_second": 0.241, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 5.03125, |
|
"grad_norm": 0.431474386110294, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6772, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 5.03125, |
|
"eval_loss": 0.7034456133842468, |
|
"eval_runtime": 51.1105, |
|
"eval_samples_per_second": 3.913, |
|
"eval_steps_per_second": 0.254, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 5.0625, |
|
"grad_norm": 0.39618929325750435, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8219, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 5.0625, |
|
"eval_loss": 0.7042189240455627, |
|
"eval_runtime": 47.2927, |
|
"eval_samples_per_second": 4.229, |
|
"eval_steps_per_second": 0.275, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 5.09375, |
|
"grad_norm": 0.4489132713249424, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6387, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 5.09375, |
|
"eval_loss": 0.7061256170272827, |
|
"eval_runtime": 47.387, |
|
"eval_samples_per_second": 4.221, |
|
"eval_steps_per_second": 0.274, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 5.125, |
|
"grad_norm": 0.5100329637159183, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7677, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 5.125, |
|
"eval_loss": 0.708121657371521, |
|
"eval_runtime": 47.3311, |
|
"eval_samples_per_second": 4.226, |
|
"eval_steps_per_second": 0.275, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 5.15625, |
|
"grad_norm": 0.525511631981176, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5956, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 5.15625, |
|
"eval_loss": 0.7091134786605835, |
|
"eval_runtime": 47.2978, |
|
"eval_samples_per_second": 4.229, |
|
"eval_steps_per_second": 0.275, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 5.1875, |
|
"grad_norm": 0.534675354231597, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7097, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 5.1875, |
|
"eval_loss": 0.7097848653793335, |
|
"eval_runtime": 47.4095, |
|
"eval_samples_per_second": 4.219, |
|
"eval_steps_per_second": 0.274, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 5.21875, |
|
"grad_norm": 0.47286903698857446, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7371, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 5.21875, |
|
"eval_loss": 0.7090296745300293, |
|
"eval_runtime": 47.4487, |
|
"eval_samples_per_second": 4.215, |
|
"eval_steps_per_second": 0.274, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"grad_norm": 0.4734705066820788, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7652, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"eval_loss": 0.7079525589942932, |
|
"eval_runtime": 47.4101, |
|
"eval_samples_per_second": 4.219, |
|
"eval_steps_per_second": 0.274, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 5.28125, |
|
"grad_norm": 0.46209764763985184, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6852, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 5.28125, |
|
"eval_loss": 0.7072803974151611, |
|
"eval_runtime": 47.3704, |
|
"eval_samples_per_second": 4.222, |
|
"eval_steps_per_second": 0.274, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 5.3125, |
|
"grad_norm": 0.4828284708486433, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6609, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 5.3125, |
|
"eval_loss": 0.7068901062011719, |
|
"eval_runtime": 47.425, |
|
"eval_samples_per_second": 4.217, |
|
"eval_steps_per_second": 0.274, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 5.34375, |
|
"grad_norm": 0.5230116179180577, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6872, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 5.34375, |
|
"eval_loss": 0.7058187127113342, |
|
"eval_runtime": 47.5711, |
|
"eval_samples_per_second": 4.204, |
|
"eval_steps_per_second": 0.273, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 5.375, |
|
"grad_norm": 0.48081340678536255, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7694, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 5.375, |
|
"eval_loss": 0.7044984698295593, |
|
"eval_runtime": 47.4233, |
|
"eval_samples_per_second": 4.217, |
|
"eval_steps_per_second": 0.274, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 5.40625, |
|
"grad_norm": 0.4787525602476421, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7342, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 5.40625, |
|
"eval_loss": 0.7032212018966675, |
|
"eval_runtime": 47.3534, |
|
"eval_samples_per_second": 4.224, |
|
"eval_steps_per_second": 0.275, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 5.4375, |
|
"grad_norm": 0.4871847582306217, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7562, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 5.4375, |
|
"eval_loss": 0.7019696235656738, |
|
"eval_runtime": 47.382, |
|
"eval_samples_per_second": 4.221, |
|
"eval_steps_per_second": 0.274, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 5.46875, |
|
"grad_norm": 0.47999745025553603, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7534, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 5.46875, |
|
"eval_loss": 0.7014529705047607, |
|
"eval_runtime": 47.4435, |
|
"eval_samples_per_second": 4.216, |
|
"eval_steps_per_second": 0.274, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 0.5168030891996357, |
|
"learning_rate": 2e-05, |
|
"loss": 0.707, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"eval_loss": 0.6993884444236755, |
|
"eval_runtime": 47.4943, |
|
"eval_samples_per_second": 4.211, |
|
"eval_steps_per_second": 0.274, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 5.53125, |
|
"grad_norm": 0.536450206978984, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7318, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 5.53125, |
|
"eval_loss": 0.6971662640571594, |
|
"eval_runtime": 47.4193, |
|
"eval_samples_per_second": 4.218, |
|
"eval_steps_per_second": 0.274, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 5.5625, |
|
"grad_norm": 0.45352543205020696, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7421, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 5.5625, |
|
"eval_loss": 0.6962605118751526, |
|
"eval_runtime": 47.3798, |
|
"eval_samples_per_second": 4.221, |
|
"eval_steps_per_second": 0.274, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 5.59375, |
|
"grad_norm": 0.5054883443109318, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6668, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 5.59375, |
|
"eval_loss": 0.6970357298851013, |
|
"eval_runtime": 47.3311, |
|
"eval_samples_per_second": 4.226, |
|
"eval_steps_per_second": 0.275, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 5.625, |
|
"grad_norm": 0.49584660418833293, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6548, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 5.625, |
|
"eval_loss": 0.6980059146881104, |
|
"eval_runtime": 47.299, |
|
"eval_samples_per_second": 4.228, |
|
"eval_steps_per_second": 0.275, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 5.65625, |
|
"grad_norm": 0.5114381326491793, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6691, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 5.65625, |
|
"eval_loss": 0.6995040774345398, |
|
"eval_runtime": 47.3887, |
|
"eval_samples_per_second": 4.22, |
|
"eval_steps_per_second": 0.274, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 5.6875, |
|
"grad_norm": 0.48550125668870825, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6525, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 5.6875, |
|
"eval_loss": 0.7020326256752014, |
|
"eval_runtime": 47.3838, |
|
"eval_samples_per_second": 4.221, |
|
"eval_steps_per_second": 0.274, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 5.71875, |
|
"grad_norm": 0.5860847796671736, |
|
"learning_rate": 2e-05, |
|
"loss": 0.674, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 5.71875, |
|
"eval_loss": 0.7027825713157654, |
|
"eval_runtime": 47.3875, |
|
"eval_samples_per_second": 4.221, |
|
"eval_steps_per_second": 0.274, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"grad_norm": 0.5535582209035479, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6643, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"eval_loss": 0.7025408148765564, |
|
"eval_runtime": 47.5534, |
|
"eval_samples_per_second": 4.206, |
|
"eval_steps_per_second": 0.273, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 5.78125, |
|
"grad_norm": 0.5443574176405931, |
|
"learning_rate": 2e-05, |
|
"loss": 0.709, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 5.78125, |
|
"eval_loss": 0.7007840871810913, |
|
"eval_runtime": 47.4469, |
|
"eval_samples_per_second": 4.215, |
|
"eval_steps_per_second": 0.274, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 5.8125, |
|
"grad_norm": 0.563830259704143, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6884, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 5.8125, |
|
"eval_loss": 0.6979361176490784, |
|
"eval_runtime": 49.1203, |
|
"eval_samples_per_second": 4.072, |
|
"eval_steps_per_second": 0.265, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 5.84375, |
|
"grad_norm": 0.5094956892765212, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7318, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 5.84375, |
|
"eval_loss": 0.6962587237358093, |
|
"eval_runtime": 49.1831, |
|
"eval_samples_per_second": 4.066, |
|
"eval_steps_per_second": 0.264, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 5.875, |
|
"grad_norm": 0.5264819980742595, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6746, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 5.875, |
|
"eval_loss": 0.694776713848114, |
|
"eval_runtime": 49.1994, |
|
"eval_samples_per_second": 4.065, |
|
"eval_steps_per_second": 0.264, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 5.90625, |
|
"grad_norm": 0.4737429304023209, |
|
"learning_rate": 2e-05, |
|
"loss": 0.664, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 5.90625, |
|
"eval_loss": 0.6939517855644226, |
|
"eval_runtime": 49.2438, |
|
"eval_samples_per_second": 4.061, |
|
"eval_steps_per_second": 0.264, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 5.9375, |
|
"grad_norm": 0.494163934813738, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6978, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 5.9375, |
|
"eval_loss": 0.6933834552764893, |
|
"eval_runtime": 49.3494, |
|
"eval_samples_per_second": 4.053, |
|
"eval_steps_per_second": 0.263, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 5.96875, |
|
"grad_norm": 0.4945972278087299, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6909, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 5.96875, |
|
"eval_loss": 0.6924250721931458, |
|
"eval_runtime": 50.3255, |
|
"eval_samples_per_second": 3.974, |
|
"eval_steps_per_second": 0.258, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.48872556688745233, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6622, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 0.6922193765640259, |
|
"eval_runtime": 50.4561, |
|
"eval_samples_per_second": 3.964, |
|
"eval_steps_per_second": 0.258, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 6.03125, |
|
"grad_norm": 0.5013452255378538, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7458, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 6.03125, |
|
"eval_loss": 0.6931161284446716, |
|
"eval_runtime": 50.5049, |
|
"eval_samples_per_second": 3.96, |
|
"eval_steps_per_second": 0.257, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 6.0625, |
|
"grad_norm": 0.48271161232093784, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7171, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 6.0625, |
|
"eval_loss": 0.6959040760993958, |
|
"eval_runtime": 50.2441, |
|
"eval_samples_per_second": 3.981, |
|
"eval_steps_per_second": 0.259, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 6.09375, |
|
"grad_norm": 0.5414562703154852, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6419, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 6.09375, |
|
"eval_loss": 0.7000604271888733, |
|
"eval_runtime": 50.4261, |
|
"eval_samples_per_second": 3.966, |
|
"eval_steps_per_second": 0.258, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 6.125, |
|
"grad_norm": 0.5074661247335385, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6881, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 6.125, |
|
"eval_loss": 0.7039622664451599, |
|
"eval_runtime": 51.5214, |
|
"eval_samples_per_second": 3.882, |
|
"eval_steps_per_second": 0.252, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 6.15625, |
|
"grad_norm": 0.5603468534764365, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7085, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 6.15625, |
|
"eval_loss": 0.7055023312568665, |
|
"eval_runtime": 51.7102, |
|
"eval_samples_per_second": 3.868, |
|
"eval_steps_per_second": 0.251, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 6.1875, |
|
"grad_norm": 0.5992190802422799, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7614, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 6.1875, |
|
"eval_loss": 0.7046856880187988, |
|
"eval_runtime": 51.5464, |
|
"eval_samples_per_second": 3.88, |
|
"eval_steps_per_second": 0.252, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 6.21875, |
|
"grad_norm": 0.6293684167527106, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6435, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 6.21875, |
|
"eval_loss": 0.7021151781082153, |
|
"eval_runtime": 51.5328, |
|
"eval_samples_per_second": 3.881, |
|
"eval_steps_per_second": 0.252, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"grad_norm": 0.591265449241434, |
|
"learning_rate": 2e-05, |
|
"loss": 0.688, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"eval_loss": 0.7002359628677368, |
|
"eval_runtime": 51.5812, |
|
"eval_samples_per_second": 3.877, |
|
"eval_steps_per_second": 0.252, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 6.28125, |
|
"grad_norm": 0.543141536526749, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7027, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 6.28125, |
|
"eval_loss": 0.6986366510391235, |
|
"eval_runtime": 52.6956, |
|
"eval_samples_per_second": 3.795, |
|
"eval_steps_per_second": 0.247, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 6.3125, |
|
"grad_norm": 0.5679656300203245, |
|
"learning_rate": 2e-05, |
|
"loss": 0.625, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 6.3125, |
|
"eval_loss": 0.698679506778717, |
|
"eval_runtime": 52.5102, |
|
"eval_samples_per_second": 3.809, |
|
"eval_steps_per_second": 0.248, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 6.34375, |
|
"grad_norm": 0.5285839896523021, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7687, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 6.34375, |
|
"eval_loss": 0.7005956768989563, |
|
"eval_runtime": 52.6067, |
|
"eval_samples_per_second": 3.802, |
|
"eval_steps_per_second": 0.247, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 6.375, |
|
"grad_norm": 0.6512964945211068, |
|
"learning_rate": 2e-05, |
|
"loss": 0.623, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 6.375, |
|
"eval_loss": 0.7013595104217529, |
|
"eval_runtime": 52.5428, |
|
"eval_samples_per_second": 3.806, |
|
"eval_steps_per_second": 0.247, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 6.40625, |
|
"grad_norm": 0.5295248631519638, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5941, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 6.40625, |
|
"eval_loss": 0.7016547322273254, |
|
"eval_runtime": 52.6142, |
|
"eval_samples_per_second": 3.801, |
|
"eval_steps_per_second": 0.247, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 6.4375, |
|
"grad_norm": 0.6134157701434021, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6506, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 6.4375, |
|
"eval_loss": 0.7009623646736145, |
|
"eval_runtime": 52.1942, |
|
"eval_samples_per_second": 3.832, |
|
"eval_steps_per_second": 0.249, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 6.46875, |
|
"grad_norm": 0.57886797614996, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6983, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 6.46875, |
|
"eval_loss": 0.6988092064857483, |
|
"eval_runtime": 52.2577, |
|
"eval_samples_per_second": 3.827, |
|
"eval_steps_per_second": 0.249, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 0.5593482836944472, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6348, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"eval_loss": 0.698823094367981, |
|
"eval_runtime": 52.2296, |
|
"eval_samples_per_second": 3.829, |
|
"eval_steps_per_second": 0.249, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 6.53125, |
|
"grad_norm": 0.662802162179718, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6206, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 6.53125, |
|
"eval_loss": 0.6990167498588562, |
|
"eval_runtime": 52.4316, |
|
"eval_samples_per_second": 3.814, |
|
"eval_steps_per_second": 0.248, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 6.5625, |
|
"grad_norm": 0.6874374231122908, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6033, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 6.5625, |
|
"eval_loss": 0.699796736240387, |
|
"eval_runtime": 52.3193, |
|
"eval_samples_per_second": 3.823, |
|
"eval_steps_per_second": 0.248, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 6.59375, |
|
"grad_norm": 0.6625766736772473, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6398, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 6.59375, |
|
"eval_loss": 0.6989737153053284, |
|
"eval_runtime": 52.1885, |
|
"eval_samples_per_second": 3.832, |
|
"eval_steps_per_second": 0.249, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 6.625, |
|
"grad_norm": 0.6563419096027812, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6119, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 6.625, |
|
"eval_loss": 0.6973609924316406, |
|
"eval_runtime": 52.1628, |
|
"eval_samples_per_second": 3.834, |
|
"eval_steps_per_second": 0.249, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 6.65625, |
|
"grad_norm": 0.5796353226697397, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7041, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 6.65625, |
|
"eval_loss": 0.6957942247390747, |
|
"eval_runtime": 52.2028, |
|
"eval_samples_per_second": 3.831, |
|
"eval_steps_per_second": 0.249, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 6.6875, |
|
"grad_norm": 0.5711947110504899, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6465, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 6.6875, |
|
"eval_loss": 0.696739673614502, |
|
"eval_runtime": 52.1849, |
|
"eval_samples_per_second": 3.833, |
|
"eval_steps_per_second": 0.249, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 6.71875, |
|
"grad_norm": 0.6619502413653232, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6563, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 6.71875, |
|
"eval_loss": 0.6960940361022949, |
|
"eval_runtime": 52.0996, |
|
"eval_samples_per_second": 3.839, |
|
"eval_steps_per_second": 0.25, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"grad_norm": 0.6587126256919645, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6505, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"eval_loss": 0.6959022283554077, |
|
"eval_runtime": 52.1062, |
|
"eval_samples_per_second": 3.838, |
|
"eval_steps_per_second": 0.249, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 6.78125, |
|
"grad_norm": 0.648164277941964, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5969, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 6.78125, |
|
"eval_loss": 0.6999121308326721, |
|
"eval_runtime": 51.9356, |
|
"eval_samples_per_second": 3.851, |
|
"eval_steps_per_second": 0.25, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 6.8125, |
|
"grad_norm": 0.6595860789738482, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5945, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 6.8125, |
|
"eval_loss": 0.7028067111968994, |
|
"eval_runtime": 52.2232, |
|
"eval_samples_per_second": 3.83, |
|
"eval_steps_per_second": 0.249, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 6.84375, |
|
"grad_norm": 0.7116894779822719, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7027, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 6.84375, |
|
"eval_loss": 0.7035638689994812, |
|
"eval_runtime": 52.1471, |
|
"eval_samples_per_second": 3.835, |
|
"eval_steps_per_second": 0.249, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 6.875, |
|
"grad_norm": 0.7581142336087988, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7171, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 6.875, |
|
"eval_loss": 0.6981176733970642, |
|
"eval_runtime": 52.1366, |
|
"eval_samples_per_second": 3.836, |
|
"eval_steps_per_second": 0.249, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 6.90625, |
|
"grad_norm": 0.6261292745909233, |
|
"learning_rate": 2e-05, |
|
"loss": 0.658, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 6.90625, |
|
"eval_loss": 0.6939045786857605, |
|
"eval_runtime": 52.2211, |
|
"eval_samples_per_second": 3.83, |
|
"eval_steps_per_second": 0.249, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 6.9375, |
|
"grad_norm": 0.7256427809370966, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6576, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 6.9375, |
|
"eval_loss": 0.6904327273368835, |
|
"eval_runtime": 52.1829, |
|
"eval_samples_per_second": 3.833, |
|
"eval_steps_per_second": 0.249, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 6.96875, |
|
"grad_norm": 0.6653711103404113, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6938, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 6.96875, |
|
"eval_loss": 0.6893274188041687, |
|
"eval_runtime": 51.899, |
|
"eval_samples_per_second": 3.854, |
|
"eval_steps_per_second": 0.25, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.6730688267524797, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7397, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 0.6895740032196045, |
|
"eval_runtime": 52.1977, |
|
"eval_samples_per_second": 3.832, |
|
"eval_steps_per_second": 0.249, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 7.03125, |
|
"grad_norm": 0.5832904533111831, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6366, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 7.03125, |
|
"eval_loss": 0.690305769443512, |
|
"eval_runtime": 51.0898, |
|
"eval_samples_per_second": 3.915, |
|
"eval_steps_per_second": 0.254, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 7.0625, |
|
"grad_norm": 0.7244416322910332, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5756, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 7.0625, |
|
"eval_loss": 0.6943302154541016, |
|
"eval_runtime": 47.5876, |
|
"eval_samples_per_second": 4.203, |
|
"eval_steps_per_second": 0.273, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 7.09375, |
|
"grad_norm": 0.6507055762944723, |
|
"learning_rate": 2e-05, |
|
"loss": 0.622, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 7.09375, |
|
"eval_loss": 0.7073258757591248, |
|
"eval_runtime": 47.5809, |
|
"eval_samples_per_second": 4.203, |
|
"eval_steps_per_second": 0.273, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 7.125, |
|
"grad_norm": 0.7122561204700196, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5908, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 7.125, |
|
"eval_loss": 0.7263233065605164, |
|
"eval_runtime": 47.544, |
|
"eval_samples_per_second": 4.207, |
|
"eval_steps_per_second": 0.273, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 7.15625, |
|
"grad_norm": 1.053512823308346, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6193, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 7.15625, |
|
"eval_loss": 0.7283624410629272, |
|
"eval_runtime": 47.5998, |
|
"eval_samples_per_second": 4.202, |
|
"eval_steps_per_second": 0.273, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 7.1875, |
|
"grad_norm": 1.0167138351900848, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5942, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 7.1875, |
|
"eval_loss": 0.7136476039886475, |
|
"eval_runtime": 47.5738, |
|
"eval_samples_per_second": 4.204, |
|
"eval_steps_per_second": 0.273, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 7.21875, |
|
"grad_norm": 0.7388726343392281, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6898, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 7.21875, |
|
"eval_loss": 0.7017656564712524, |
|
"eval_runtime": 47.5857, |
|
"eval_samples_per_second": 4.203, |
|
"eval_steps_per_second": 0.273, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 7.25, |
|
"grad_norm": 0.6255681554939039, |
|
"learning_rate": 2e-05, |
|
"loss": 0.669, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 7.25, |
|
"eval_loss": 0.6967242956161499, |
|
"eval_runtime": 47.7483, |
|
"eval_samples_per_second": 4.189, |
|
"eval_steps_per_second": 0.272, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 7.28125, |
|
"grad_norm": 0.7000438574267057, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6143, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 7.28125, |
|
"eval_loss": 0.694460391998291, |
|
"eval_runtime": 47.7828, |
|
"eval_samples_per_second": 4.186, |
|
"eval_steps_per_second": 0.272, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 7.3125, |
|
"grad_norm": 0.6658391411050186, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6737, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 7.3125, |
|
"eval_loss": 0.6925583481788635, |
|
"eval_runtime": 47.7913, |
|
"eval_samples_per_second": 4.185, |
|
"eval_steps_per_second": 0.272, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 7.34375, |
|
"grad_norm": 0.6473191970636399, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6347, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 7.34375, |
|
"eval_loss": 0.6907203793525696, |
|
"eval_runtime": 47.6866, |
|
"eval_samples_per_second": 4.194, |
|
"eval_steps_per_second": 0.273, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 7.375, |
|
"grad_norm": 0.703409963718735, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5991, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 7.375, |
|
"eval_loss": 0.6898574829101562, |
|
"eval_runtime": 47.6481, |
|
"eval_samples_per_second": 4.197, |
|
"eval_steps_per_second": 0.273, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 7.40625, |
|
"grad_norm": 0.6957469611517898, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6428, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 7.40625, |
|
"eval_loss": 0.6922276020050049, |
|
"eval_runtime": 47.7072, |
|
"eval_samples_per_second": 4.192, |
|
"eval_steps_per_second": 0.272, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 7.4375, |
|
"grad_norm": 0.7383281551578481, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6272, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 7.4375, |
|
"eval_loss": 0.6988270282745361, |
|
"eval_runtime": 47.5925, |
|
"eval_samples_per_second": 4.202, |
|
"eval_steps_per_second": 0.273, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 7.46875, |
|
"grad_norm": 0.7113722006702997, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6594, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 7.46875, |
|
"eval_loss": 0.7074680328369141, |
|
"eval_runtime": 47.7257, |
|
"eval_samples_per_second": 4.191, |
|
"eval_steps_per_second": 0.272, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 0.7233836456752487, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6003, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"eval_loss": 0.7172031402587891, |
|
"eval_runtime": 47.7463, |
|
"eval_samples_per_second": 4.189, |
|
"eval_steps_per_second": 0.272, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 7.53125, |
|
"grad_norm": 0.7452166529670862, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6463, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 7.53125, |
|
"eval_loss": 0.7228195071220398, |
|
"eval_runtime": 47.6283, |
|
"eval_samples_per_second": 4.199, |
|
"eval_steps_per_second": 0.273, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 7.5625, |
|
"grad_norm": 0.88949489838851, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6463, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 7.5625, |
|
"eval_loss": 0.7194420099258423, |
|
"eval_runtime": 47.6221, |
|
"eval_samples_per_second": 4.2, |
|
"eval_steps_per_second": 0.273, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 7.59375, |
|
"grad_norm": 0.7592408002786533, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6301, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 7.59375, |
|
"eval_loss": 0.7122278809547424, |
|
"eval_runtime": 47.7549, |
|
"eval_samples_per_second": 4.188, |
|
"eval_steps_per_second": 0.272, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 7.625, |
|
"grad_norm": 0.910753798896517, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7016, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 7.625, |
|
"eval_loss": 0.7019688487052917, |
|
"eval_runtime": 47.5592, |
|
"eval_samples_per_second": 4.205, |
|
"eval_steps_per_second": 0.273, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 7.65625, |
|
"grad_norm": 0.7861795541835009, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6107, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 7.65625, |
|
"eval_loss": 0.6964650750160217, |
|
"eval_runtime": 47.5842, |
|
"eval_samples_per_second": 4.203, |
|
"eval_steps_per_second": 0.273, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 7.6875, |
|
"grad_norm": 0.7162378610377871, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6474, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 7.6875, |
|
"eval_loss": 0.6934291124343872, |
|
"eval_runtime": 47.4792, |
|
"eval_samples_per_second": 4.212, |
|
"eval_steps_per_second": 0.274, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 7.71875, |
|
"grad_norm": 0.7261823254305776, |
|
"learning_rate": 2e-05, |
|
"loss": 0.636, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 7.71875, |
|
"eval_loss": 0.6925876140594482, |
|
"eval_runtime": 47.6623, |
|
"eval_samples_per_second": 4.196, |
|
"eval_steps_per_second": 0.273, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"grad_norm": 0.6757318335309442, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6249, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"eval_loss": 0.6934402585029602, |
|
"eval_runtime": 47.5464, |
|
"eval_samples_per_second": 4.206, |
|
"eval_steps_per_second": 0.273, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 7.78125, |
|
"grad_norm": 0.7182105984315053, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6676, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 7.78125, |
|
"eval_loss": 0.6956924200057983, |
|
"eval_runtime": 47.6014, |
|
"eval_samples_per_second": 4.202, |
|
"eval_steps_per_second": 0.273, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 7.8125, |
|
"grad_norm": 0.7231439954921842, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6719, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 7.8125, |
|
"eval_loss": 0.6987011432647705, |
|
"eval_runtime": 47.64, |
|
"eval_samples_per_second": 4.198, |
|
"eval_steps_per_second": 0.273, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 7.84375, |
|
"grad_norm": 0.7938681326839265, |
|
"learning_rate": 2e-05, |
|
"loss": 0.584, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 7.84375, |
|
"eval_loss": 0.7026040554046631, |
|
"eval_runtime": 47.6391, |
|
"eval_samples_per_second": 4.198, |
|
"eval_steps_per_second": 0.273, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 7.875, |
|
"grad_norm": 0.8011657536057513, |
|
"learning_rate": 2e-05, |
|
"loss": 0.594, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 7.875, |
|
"eval_loss": 0.7068576216697693, |
|
"eval_runtime": 47.635, |
|
"eval_samples_per_second": 4.199, |
|
"eval_steps_per_second": 0.273, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 7.90625, |
|
"grad_norm": 0.819763617578999, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6758, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 7.90625, |
|
"eval_loss": 0.7079121470451355, |
|
"eval_runtime": 47.6352, |
|
"eval_samples_per_second": 4.199, |
|
"eval_steps_per_second": 0.273, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 7.9375, |
|
"grad_norm": 0.7697343122686975, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6224, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 7.9375, |
|
"eval_loss": 0.7092974781990051, |
|
"eval_runtime": 47.5993, |
|
"eval_samples_per_second": 4.202, |
|
"eval_steps_per_second": 0.273, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 7.96875, |
|
"grad_norm": 0.8148531217392738, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5579, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 7.96875, |
|
"eval_loss": 0.7090660333633423, |
|
"eval_runtime": 47.5602, |
|
"eval_samples_per_second": 4.205, |
|
"eval_steps_per_second": 0.273, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.7576748044477204, |
|
"learning_rate": 2e-05, |
|
"loss": 0.609, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 0.7068901062011719, |
|
"eval_runtime": 47.5944, |
|
"eval_samples_per_second": 4.202, |
|
"eval_steps_per_second": 0.273, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 8.03125, |
|
"grad_norm": 0.814119412415159, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5816, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 8.03125, |
|
"eval_loss": 0.7052778005599976, |
|
"eval_runtime": 50.9012, |
|
"eval_samples_per_second": 3.929, |
|
"eval_steps_per_second": 0.255, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 8.0625, |
|
"grad_norm": 0.7940502590060119, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5974, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 8.0625, |
|
"eval_loss": 0.7055818438529968, |
|
"eval_runtime": 47.5726, |
|
"eval_samples_per_second": 4.204, |
|
"eval_steps_per_second": 0.273, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 8.09375, |
|
"grad_norm": 0.7373690747574106, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6267, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 8.09375, |
|
"eval_loss": 0.7084596753120422, |
|
"eval_runtime": 47.5924, |
|
"eval_samples_per_second": 4.202, |
|
"eval_steps_per_second": 0.273, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 8.125, |
|
"grad_norm": 0.8486372724795598, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6349, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 8.125, |
|
"eval_loss": 0.7118301391601562, |
|
"eval_runtime": 47.9994, |
|
"eval_samples_per_second": 4.167, |
|
"eval_steps_per_second": 0.271, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 8.15625, |
|
"grad_norm": 0.8391397763830329, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5575, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 8.15625, |
|
"eval_loss": 0.7155640125274658, |
|
"eval_runtime": 47.6071, |
|
"eval_samples_per_second": 4.201, |
|
"eval_steps_per_second": 0.273, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 8.1875, |
|
"grad_norm": 0.7928693737279656, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6777, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 8.1875, |
|
"eval_loss": 0.7209051251411438, |
|
"eval_runtime": 47.6324, |
|
"eval_samples_per_second": 4.199, |
|
"eval_steps_per_second": 0.273, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 8.21875, |
|
"grad_norm": 0.9171124624201488, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5582, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 8.21875, |
|
"eval_loss": 0.7233929634094238, |
|
"eval_runtime": 47.7509, |
|
"eval_samples_per_second": 4.188, |
|
"eval_steps_per_second": 0.272, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 8.25, |
|
"grad_norm": 0.9128766641132847, |
|
"learning_rate": 2e-05, |
|
"loss": 0.597, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 8.25, |
|
"eval_loss": 0.7227862477302551, |
|
"eval_runtime": 47.5667, |
|
"eval_samples_per_second": 4.205, |
|
"eval_steps_per_second": 0.273, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 8.28125, |
|
"grad_norm": 1.0298171058788395, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6262, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 8.28125, |
|
"eval_loss": 0.7159123420715332, |
|
"eval_runtime": 47.6441, |
|
"eval_samples_per_second": 4.198, |
|
"eval_steps_per_second": 0.273, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 8.3125, |
|
"grad_norm": 0.8345277253579861, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5973, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 8.3125, |
|
"eval_loss": 0.7099489569664001, |
|
"eval_runtime": 49.5358, |
|
"eval_samples_per_second": 4.037, |
|
"eval_steps_per_second": 0.262, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 8.34375, |
|
"grad_norm": 0.8270640865043484, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5418, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 8.34375, |
|
"eval_loss": 0.7083099484443665, |
|
"eval_runtime": 49.7373, |
|
"eval_samples_per_second": 4.021, |
|
"eval_steps_per_second": 0.261, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 8.375, |
|
"grad_norm": 0.8670483383004401, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5935, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 8.375, |
|
"eval_loss": 0.7091077566146851, |
|
"eval_runtime": 49.6764, |
|
"eval_samples_per_second": 4.026, |
|
"eval_steps_per_second": 0.262, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 8.40625, |
|
"grad_norm": 0.8373742279582174, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5947, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 8.40625, |
|
"eval_loss": 0.709764301776886, |
|
"eval_runtime": 49.5613, |
|
"eval_samples_per_second": 4.035, |
|
"eval_steps_per_second": 0.262, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 8.4375, |
|
"grad_norm": 0.9406584622840672, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6079, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 8.4375, |
|
"eval_loss": 0.7089658379554749, |
|
"eval_runtime": 49.6241, |
|
"eval_samples_per_second": 4.03, |
|
"eval_steps_per_second": 0.262, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 8.46875, |
|
"grad_norm": 0.9394463996884406, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5102, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 8.46875, |
|
"eval_loss": 0.7126440405845642, |
|
"eval_runtime": 50.6997, |
|
"eval_samples_per_second": 3.945, |
|
"eval_steps_per_second": 0.256, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 0.8618711805362732, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5883, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"eval_loss": 0.7210386395454407, |
|
"eval_runtime": 47.7127, |
|
"eval_samples_per_second": 4.192, |
|
"eval_steps_per_second": 0.272, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 8.53125, |
|
"grad_norm": 0.9598465596200918, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5958, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 8.53125, |
|
"eval_loss": 0.7250240445137024, |
|
"eval_runtime": 47.5731, |
|
"eval_samples_per_second": 4.204, |
|
"eval_steps_per_second": 0.273, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 8.5625, |
|
"grad_norm": 0.9512065591304456, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5701, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 8.5625, |
|
"eval_loss": 0.7265011072158813, |
|
"eval_runtime": 47.611, |
|
"eval_samples_per_second": 4.201, |
|
"eval_steps_per_second": 0.273, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 8.59375, |
|
"grad_norm": 1.0268459491950561, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6169, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 8.59375, |
|
"eval_loss": 0.723859965801239, |
|
"eval_runtime": 47.5959, |
|
"eval_samples_per_second": 4.202, |
|
"eval_steps_per_second": 0.273, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 8.625, |
|
"grad_norm": 0.9424594037649877, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6084, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 8.625, |
|
"eval_loss": 0.7198401093482971, |
|
"eval_runtime": 49.4929, |
|
"eval_samples_per_second": 4.041, |
|
"eval_steps_per_second": 0.263, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 8.65625, |
|
"grad_norm": 0.9035217720347092, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5512, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 8.65625, |
|
"eval_loss": 0.7168082594871521, |
|
"eval_runtime": 49.6613, |
|
"eval_samples_per_second": 4.027, |
|
"eval_steps_per_second": 0.262, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 8.6875, |
|
"grad_norm": 0.8659031266239389, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5863, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 8.6875, |
|
"eval_loss": 0.7159530520439148, |
|
"eval_runtime": 49.5693, |
|
"eval_samples_per_second": 4.035, |
|
"eval_steps_per_second": 0.262, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 8.71875, |
|
"grad_norm": 0.8740167542953284, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5667, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 8.71875, |
|
"eval_loss": 0.7145251631736755, |
|
"eval_runtime": 49.4465, |
|
"eval_samples_per_second": 4.045, |
|
"eval_steps_per_second": 0.263, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"grad_norm": 0.9263844516793406, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6124, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"eval_loss": 0.7149668335914612, |
|
"eval_runtime": 49.6649, |
|
"eval_samples_per_second": 4.027, |
|
"eval_steps_per_second": 0.262, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 8.78125, |
|
"grad_norm": 0.8604543323600852, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5688, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 8.78125, |
|
"eval_loss": 0.7160521149635315, |
|
"eval_runtime": 50.6672, |
|
"eval_samples_per_second": 3.947, |
|
"eval_steps_per_second": 0.257, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 8.8125, |
|
"grad_norm": 0.9357009474127106, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5463, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 8.8125, |
|
"eval_loss": 0.7187457084655762, |
|
"eval_runtime": 50.6875, |
|
"eval_samples_per_second": 3.946, |
|
"eval_steps_per_second": 0.256, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 8.84375, |
|
"grad_norm": 0.8237087244624672, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5393, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 8.84375, |
|
"eval_loss": 0.7205131649971008, |
|
"eval_runtime": 50.5794, |
|
"eval_samples_per_second": 3.954, |
|
"eval_steps_per_second": 0.257, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 8.875, |
|
"grad_norm": 0.8962206816300475, |
|
"learning_rate": 2e-05, |
|
"loss": 0.484, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 8.875, |
|
"eval_loss": 0.7228506207466125, |
|
"eval_runtime": 50.5953, |
|
"eval_samples_per_second": 3.953, |
|
"eval_steps_per_second": 0.257, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 8.90625, |
|
"grad_norm": 0.9983325109069782, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5592, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 8.90625, |
|
"eval_loss": 0.7194100022315979, |
|
"eval_runtime": 50.8657, |
|
"eval_samples_per_second": 3.932, |
|
"eval_steps_per_second": 0.256, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 8.9375, |
|
"grad_norm": 0.8875985843008509, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6679, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 8.9375, |
|
"eval_loss": 0.7146596312522888, |
|
"eval_runtime": 51.9576, |
|
"eval_samples_per_second": 3.849, |
|
"eval_steps_per_second": 0.25, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 8.96875, |
|
"grad_norm": 0.8611052694088349, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5812, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 8.96875, |
|
"eval_loss": 0.710852861404419, |
|
"eval_runtime": 51.9658, |
|
"eval_samples_per_second": 3.849, |
|
"eval_steps_per_second": 0.25, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.8497210900533776, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5212, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 0.7121503353118896, |
|
"eval_runtime": 51.6828, |
|
"eval_samples_per_second": 3.87, |
|
"eval_steps_per_second": 0.252, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 9.03125, |
|
"grad_norm": 0.8921157674462687, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5437, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 9.03125, |
|
"eval_loss": 0.7179412841796875, |
|
"eval_runtime": 51.9759, |
|
"eval_samples_per_second": 3.848, |
|
"eval_steps_per_second": 0.25, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 9.0625, |
|
"grad_norm": 0.9291292967074066, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5679, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 9.0625, |
|
"eval_loss": 0.7306573390960693, |
|
"eval_runtime": 51.603, |
|
"eval_samples_per_second": 3.876, |
|
"eval_steps_per_second": 0.252, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 9.09375, |
|
"grad_norm": 0.9871115113489229, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5744, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 9.09375, |
|
"eval_loss": 0.74213707447052, |
|
"eval_runtime": 51.5255, |
|
"eval_samples_per_second": 3.882, |
|
"eval_steps_per_second": 0.252, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 9.125, |
|
"grad_norm": 1.1662734879135015, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5274, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 9.125, |
|
"eval_loss": 0.7484179139137268, |
|
"eval_runtime": 51.3131, |
|
"eval_samples_per_second": 3.898, |
|
"eval_steps_per_second": 0.253, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 9.15625, |
|
"grad_norm": 1.096240777006249, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5864, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 9.15625, |
|
"eval_loss": 0.745439887046814, |
|
"eval_runtime": 51.1121, |
|
"eval_samples_per_second": 3.913, |
|
"eval_steps_per_second": 0.254, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 9.1875, |
|
"grad_norm": 0.944903135330694, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5131, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 9.1875, |
|
"eval_loss": 0.7430945038795471, |
|
"eval_runtime": 51.307, |
|
"eval_samples_per_second": 3.898, |
|
"eval_steps_per_second": 0.253, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 9.21875, |
|
"grad_norm": 1.0736115005040638, |
|
"learning_rate": 2e-05, |
|
"loss": 0.4866, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 9.21875, |
|
"eval_loss": 0.7417933940887451, |
|
"eval_runtime": 51.2372, |
|
"eval_samples_per_second": 3.903, |
|
"eval_steps_per_second": 0.254, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"grad_norm": 1.0688144195951634, |
|
"learning_rate": 2e-05, |
|
"loss": 0.509, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"eval_loss": 0.7381229996681213, |
|
"eval_runtime": 51.1494, |
|
"eval_samples_per_second": 3.91, |
|
"eval_steps_per_second": 0.254, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 9.28125, |
|
"grad_norm": 1.0276146013155785, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5708, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 9.28125, |
|
"eval_loss": 0.7391738891601562, |
|
"eval_runtime": 51.6779, |
|
"eval_samples_per_second": 3.87, |
|
"eval_steps_per_second": 0.252, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 9.3125, |
|
"grad_norm": 1.1618114955183, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5337, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 9.3125, |
|
"eval_loss": 0.7411096096038818, |
|
"eval_runtime": 51.5937, |
|
"eval_samples_per_second": 3.876, |
|
"eval_steps_per_second": 0.252, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 9.34375, |
|
"grad_norm": 1.08837375836462, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5241, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 9.34375, |
|
"eval_loss": 0.7420552968978882, |
|
"eval_runtime": 51.5437, |
|
"eval_samples_per_second": 3.88, |
|
"eval_steps_per_second": 0.252, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 9.375, |
|
"grad_norm": 1.0106379800787466, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5198, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 9.375, |
|
"eval_loss": 0.7437419295310974, |
|
"eval_runtime": 51.3565, |
|
"eval_samples_per_second": 3.894, |
|
"eval_steps_per_second": 0.253, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 9.40625, |
|
"grad_norm": 1.0700897207702011, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5107, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 9.40625, |
|
"eval_loss": 0.7382708787918091, |
|
"eval_runtime": 51.4533, |
|
"eval_samples_per_second": 3.887, |
|
"eval_steps_per_second": 0.253, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 9.4375, |
|
"grad_norm": 1.1021606769115393, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5679, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 9.4375, |
|
"eval_loss": 0.7324429154396057, |
|
"eval_runtime": 51.4117, |
|
"eval_samples_per_second": 3.89, |
|
"eval_steps_per_second": 0.253, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 9.46875, |
|
"grad_norm": 0.9792628984982289, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5509, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 9.46875, |
|
"eval_loss": 0.7311490774154663, |
|
"eval_runtime": 51.8022, |
|
"eval_samples_per_second": 3.861, |
|
"eval_steps_per_second": 0.251, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"grad_norm": 0.9256898215171215, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5824, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"eval_loss": 0.736283540725708, |
|
"eval_runtime": 51.7678, |
|
"eval_samples_per_second": 3.863, |
|
"eval_steps_per_second": 0.251, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 9.53125, |
|
"grad_norm": 0.993495109546069, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5452, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 9.53125, |
|
"eval_loss": 0.7425567507743835, |
|
"eval_runtime": 51.6022, |
|
"eval_samples_per_second": 3.876, |
|
"eval_steps_per_second": 0.252, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 9.5625, |
|
"grad_norm": 1.096995253097988, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5359, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 9.5625, |
|
"eval_loss": 0.7483149766921997, |
|
"eval_runtime": 51.5727, |
|
"eval_samples_per_second": 3.878, |
|
"eval_steps_per_second": 0.252, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 9.59375, |
|
"grad_norm": 1.1542996117677211, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5229, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 9.59375, |
|
"eval_loss": 0.7505038380622864, |
|
"eval_runtime": 51.846, |
|
"eval_samples_per_second": 3.858, |
|
"eval_steps_per_second": 0.251, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 9.625, |
|
"grad_norm": 1.1044494998416634, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5718, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 9.625, |
|
"eval_loss": 0.7511885166168213, |
|
"eval_runtime": 51.613, |
|
"eval_samples_per_second": 3.875, |
|
"eval_steps_per_second": 0.252, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 9.65625, |
|
"grad_norm": 1.0517094139644794, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5395, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 9.65625, |
|
"eval_loss": 0.750588059425354, |
|
"eval_runtime": 51.9083, |
|
"eval_samples_per_second": 3.853, |
|
"eval_steps_per_second": 0.25, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 9.6875, |
|
"grad_norm": 1.2320471917997522, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5266, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 9.6875, |
|
"eval_loss": 0.7492180466651917, |
|
"eval_runtime": 51.3612, |
|
"eval_samples_per_second": 3.894, |
|
"eval_steps_per_second": 0.253, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 9.71875, |
|
"grad_norm": 1.189122697506972, |
|
"learning_rate": 2e-05, |
|
"loss": 0.4893, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 9.71875, |
|
"eval_loss": 0.7448427081108093, |
|
"eval_runtime": 51.8761, |
|
"eval_samples_per_second": 3.855, |
|
"eval_steps_per_second": 0.251, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 9.75, |
|
"grad_norm": 1.1250245833360049, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5434, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 9.75, |
|
"eval_loss": 0.742850661277771, |
|
"eval_runtime": 51.4442, |
|
"eval_samples_per_second": 3.888, |
|
"eval_steps_per_second": 0.253, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 9.78125, |
|
"grad_norm": 1.0320917220089818, |
|
"learning_rate": 2e-05, |
|
"loss": 0.539, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 9.78125, |
|
"eval_loss": 0.7389761209487915, |
|
"eval_runtime": 51.609, |
|
"eval_samples_per_second": 3.875, |
|
"eval_steps_per_second": 0.252, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 9.8125, |
|
"grad_norm": 1.1419373892040323, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5077, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 9.8125, |
|
"eval_loss": 0.7384924292564392, |
|
"eval_runtime": 51.6937, |
|
"eval_samples_per_second": 3.869, |
|
"eval_steps_per_second": 0.251, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 9.84375, |
|
"grad_norm": 1.0260401820964369, |
|
"learning_rate": 2e-05, |
|
"loss": 0.534, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 9.84375, |
|
"eval_loss": 0.738023579120636, |
|
"eval_runtime": 51.5428, |
|
"eval_samples_per_second": 3.88, |
|
"eval_steps_per_second": 0.252, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 9.875, |
|
"grad_norm": 1.0164514553564235, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5514, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 9.875, |
|
"eval_loss": 0.7399526834487915, |
|
"eval_runtime": 51.6232, |
|
"eval_samples_per_second": 3.874, |
|
"eval_steps_per_second": 0.252, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 9.90625, |
|
"grad_norm": 1.1847056085947891, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5216, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 9.90625, |
|
"eval_loss": 0.7401251196861267, |
|
"eval_runtime": 51.7617, |
|
"eval_samples_per_second": 3.864, |
|
"eval_steps_per_second": 0.251, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 9.9375, |
|
"grad_norm": 1.075888871715244, |
|
"learning_rate": 2e-05, |
|
"loss": 0.511, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 9.9375, |
|
"eval_loss": 0.739520788192749, |
|
"eval_runtime": 51.7458, |
|
"eval_samples_per_second": 3.865, |
|
"eval_steps_per_second": 0.251, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 9.96875, |
|
"grad_norm": 1.16238118046427, |
|
"learning_rate": 2e-05, |
|
"loss": 0.546, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 9.96875, |
|
"eval_loss": 0.7371450662612915, |
|
"eval_runtime": 51.4519, |
|
"eval_samples_per_second": 3.887, |
|
"eval_steps_per_second": 0.253, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 1.109611378591182, |
|
"learning_rate": 2e-05, |
|
"loss": 0.4855, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 0.7406165599822998, |
|
"eval_runtime": 51.6984, |
|
"eval_samples_per_second": 3.869, |
|
"eval_steps_per_second": 0.251, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 320, |
|
"total_flos": 461377729855488.0, |
|
"train_loss": 0.11189348716288805, |
|
"train_runtime": 4599.9883, |
|
"train_samples_per_second": 2.174, |
|
"train_steps_per_second": 0.07 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 320, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 5, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 461377729855488.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|