|
{ |
|
"best_metric": 2.9399616718292236, |
|
"best_model_checkpoint": "./results/models/checkpoint-43032", |
|
"epoch": 44.0, |
|
"eval_steps": 500, |
|
"global_step": 43032, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.5112474437627812, |
|
"grad_norm": 0.056640625, |
|
"learning_rate": 0.001979550102249489, |
|
"loss": 3.8421, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 3.714660882949829, |
|
"eval_runtime": 2.4299, |
|
"eval_samples_per_second": 205.767, |
|
"eval_steps_per_second": 0.412, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 1.0224948875255624, |
|
"grad_norm": 0.0849609375, |
|
"learning_rate": 0.0019591002044989777, |
|
"loss": 3.7427, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.5337423312883436, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 0.0019386503067484661, |
|
"loss": 3.6832, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 3.5354397296905518, |
|
"eval_runtime": 2.3994, |
|
"eval_samples_per_second": 208.386, |
|
"eval_steps_per_second": 0.417, |
|
"step": 1956 |
|
}, |
|
{ |
|
"epoch": 2.044989775051125, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 0.001918200408997955, |
|
"loss": 3.5848, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.556237218813906, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.0018977505112474438, |
|
"loss": 3.474, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 3.3599166870117188, |
|
"eval_runtime": 2.2955, |
|
"eval_samples_per_second": 217.822, |
|
"eval_steps_per_second": 0.436, |
|
"step": 2934 |
|
}, |
|
{ |
|
"epoch": 3.067484662576687, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 0.0018773006134969327, |
|
"loss": 3.3901, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3.5787321063394684, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.0018568507157464215, |
|
"loss": 3.3209, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 3.2504658699035645, |
|
"eval_runtime": 2.7262, |
|
"eval_samples_per_second": 183.409, |
|
"eval_steps_per_second": 0.367, |
|
"step": 3912 |
|
}, |
|
{ |
|
"epoch": 4.08997955010225, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00183640081799591, |
|
"loss": 3.2742, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 4.601226993865031, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.0018159509202453987, |
|
"loss": 3.2314, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 3.1896719932556152, |
|
"eval_runtime": 2.6127, |
|
"eval_samples_per_second": 191.371, |
|
"eval_steps_per_second": 0.383, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 5.112474437627812, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.0017955010224948876, |
|
"loss": 3.2012, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 5.623721881390593, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0017750511247443764, |
|
"loss": 3.1715, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 3.143582820892334, |
|
"eval_runtime": 2.3471, |
|
"eval_samples_per_second": 213.027, |
|
"eval_steps_per_second": 0.426, |
|
"step": 5868 |
|
}, |
|
{ |
|
"epoch": 6.134969325153374, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.001754601226993865, |
|
"loss": 3.1533, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 6.6462167689161555, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0017341513292433537, |
|
"loss": 3.1316, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 3.1050572395324707, |
|
"eval_runtime": 2.7309, |
|
"eval_samples_per_second": 183.09, |
|
"eval_steps_per_second": 0.366, |
|
"step": 6846 |
|
}, |
|
{ |
|
"epoch": 7.157464212678937, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0017137014314928425, |
|
"loss": 3.1166, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 7.668711656441718, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 0.0016932515337423314, |
|
"loss": 3.1031, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 3.0762298107147217, |
|
"eval_runtime": 3.005, |
|
"eval_samples_per_second": 166.388, |
|
"eval_steps_per_second": 0.333, |
|
"step": 7824 |
|
}, |
|
{ |
|
"epoch": 8.1799591002045, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0016728016359918202, |
|
"loss": 3.0867, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 8.69120654396728, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0016523517382413088, |
|
"loss": 3.0785, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 3.063835382461548, |
|
"eval_runtime": 0.9235, |
|
"eval_samples_per_second": 541.411, |
|
"eval_steps_per_second": 1.083, |
|
"step": 8802 |
|
}, |
|
{ |
|
"epoch": 9.202453987730062, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0016319018404907975, |
|
"loss": 3.0683, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 9.713701431492842, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.0016114519427402863, |
|
"loss": 3.0582, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 3.043813943862915, |
|
"eval_runtime": 0.9286, |
|
"eval_samples_per_second": 538.419, |
|
"eval_steps_per_second": 1.077, |
|
"step": 9780 |
|
}, |
|
{ |
|
"epoch": 10.224948875255624, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0015910020449897751, |
|
"loss": 3.05, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 10.736196319018404, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.0015705521472392638, |
|
"loss": 3.0436, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_loss": 3.0265145301818848, |
|
"eval_runtime": 0.8628, |
|
"eval_samples_per_second": 579.525, |
|
"eval_steps_per_second": 1.159, |
|
"step": 10758 |
|
}, |
|
{ |
|
"epoch": 11.247443762781186, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0015501022494887526, |
|
"loss": 3.0351, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 11.758691206543967, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.0015296523517382412, |
|
"loss": 3.0337, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 3.0209317207336426, |
|
"eval_runtime": 1.0057, |
|
"eval_samples_per_second": 497.152, |
|
"eval_steps_per_second": 0.994, |
|
"step": 11736 |
|
}, |
|
{ |
|
"epoch": 12.269938650306749, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00150920245398773, |
|
"loss": 3.0209, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 12.781186094069529, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.001488752556237219, |
|
"loss": 3.0193, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_loss": 3.0119516849517822, |
|
"eval_runtime": 0.9281, |
|
"eval_samples_per_second": 538.717, |
|
"eval_steps_per_second": 1.077, |
|
"step": 12714 |
|
}, |
|
{ |
|
"epoch": 13.292433537832311, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.0014683026584867075, |
|
"loss": 3.0169, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 13.803680981595091, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.0014478527607361964, |
|
"loss": 3.0083, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_loss": 3.005497455596924, |
|
"eval_runtime": 0.9352, |
|
"eval_samples_per_second": 534.66, |
|
"eval_steps_per_second": 1.069, |
|
"step": 13692 |
|
}, |
|
{ |
|
"epoch": 14.314928425357873, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.001427402862985685, |
|
"loss": 3.0039, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 14.826175869120654, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.0014069529652351738, |
|
"loss": 3.0026, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_loss": 2.997267007827759, |
|
"eval_runtime": 0.8134, |
|
"eval_samples_per_second": 614.694, |
|
"eval_steps_per_second": 1.229, |
|
"step": 14670 |
|
}, |
|
{ |
|
"epoch": 15.337423312883436, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0013865030674846627, |
|
"loss": 2.9966, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 15.848670756646216, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0013660531697341513, |
|
"loss": 2.9944, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_loss": 2.986419916152954, |
|
"eval_runtime": 1.0985, |
|
"eval_samples_per_second": 455.159, |
|
"eval_steps_per_second": 0.91, |
|
"step": 15648 |
|
}, |
|
{ |
|
"epoch": 16.359918200409, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0013456032719836402, |
|
"loss": 2.9894, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 16.87116564417178, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.0013251533742331288, |
|
"loss": 2.9893, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_loss": 2.9802663326263428, |
|
"eval_runtime": 0.9902, |
|
"eval_samples_per_second": 504.964, |
|
"eval_steps_per_second": 1.01, |
|
"step": 16626 |
|
}, |
|
{ |
|
"epoch": 17.38241308793456, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0013047034764826176, |
|
"loss": 2.9857, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 17.893660531697343, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.0012842535787321062, |
|
"loss": 2.9819, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_loss": 2.978501796722412, |
|
"eval_runtime": 0.9842, |
|
"eval_samples_per_second": 508.013, |
|
"eval_steps_per_second": 1.016, |
|
"step": 17604 |
|
}, |
|
{ |
|
"epoch": 18.404907975460123, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.001263803680981595, |
|
"loss": 2.9785, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 18.916155419222903, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.001243353783231084, |
|
"loss": 2.9779, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_loss": 2.9747886657714844, |
|
"eval_runtime": 1.1337, |
|
"eval_samples_per_second": 441.025, |
|
"eval_steps_per_second": 0.882, |
|
"step": 18582 |
|
}, |
|
{ |
|
"epoch": 19.427402862985684, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0012229038854805726, |
|
"loss": 2.9719, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 19.938650306748468, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0012024539877300614, |
|
"loss": 2.9763, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_loss": 2.9712185859680176, |
|
"eval_runtime": 0.8388, |
|
"eval_samples_per_second": 596.063, |
|
"eval_steps_per_second": 1.192, |
|
"step": 19560 |
|
}, |
|
{ |
|
"epoch": 20.449897750511248, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00118200408997955, |
|
"loss": 2.9675, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 20.961145194274028, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.0011615541922290389, |
|
"loss": 2.9715, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_loss": 2.9683492183685303, |
|
"eval_runtime": 1.2443, |
|
"eval_samples_per_second": 401.838, |
|
"eval_steps_per_second": 0.804, |
|
"step": 20538 |
|
}, |
|
{ |
|
"epoch": 21.47239263803681, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0011411042944785277, |
|
"loss": 2.9674, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 21.983640081799592, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0011206543967280163, |
|
"loss": 2.9658, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_loss": 2.96321177482605, |
|
"eval_runtime": 1.0239, |
|
"eval_samples_per_second": 488.314, |
|
"eval_steps_per_second": 0.977, |
|
"step": 21516 |
|
}, |
|
{ |
|
"epoch": 22.494887525562373, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0011002044989775052, |
|
"loss": 2.9617, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_loss": 2.962963342666626, |
|
"eval_runtime": 1.176, |
|
"eval_samples_per_second": 425.172, |
|
"eval_steps_per_second": 0.85, |
|
"step": 22494 |
|
}, |
|
{ |
|
"epoch": 23.006134969325153, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.0010797546012269938, |
|
"loss": 2.9646, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 23.517382413087933, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.0010593047034764826, |
|
"loss": 2.959, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_loss": 2.9584052562713623, |
|
"eval_runtime": 0.8936, |
|
"eval_samples_per_second": 559.54, |
|
"eval_steps_per_second": 1.119, |
|
"step": 23472 |
|
}, |
|
{ |
|
"epoch": 24.028629856850717, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0010388548057259715, |
|
"loss": 2.9606, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 24.539877300613497, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00101840490797546, |
|
"loss": 2.9591, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_loss": 2.9545176029205322, |
|
"eval_runtime": 0.9712, |
|
"eval_samples_per_second": 514.829, |
|
"eval_steps_per_second": 1.03, |
|
"step": 24450 |
|
}, |
|
{ |
|
"epoch": 25.051124744376278, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.000997955010224949, |
|
"loss": 2.9567, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 25.562372188139058, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.0009775051124744376, |
|
"loss": 2.9566, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_loss": 2.9551963806152344, |
|
"eval_runtime": 0.8333, |
|
"eval_samples_per_second": 600.037, |
|
"eval_steps_per_second": 1.2, |
|
"step": 25428 |
|
}, |
|
{ |
|
"epoch": 26.073619631901842, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.0009570552147239264, |
|
"loss": 2.9551, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 26.584867075664622, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0009366053169734151, |
|
"loss": 2.9523, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_loss": 2.957306146621704, |
|
"eval_runtime": 0.7301, |
|
"eval_samples_per_second": 684.842, |
|
"eval_steps_per_second": 1.37, |
|
"step": 26406 |
|
}, |
|
{ |
|
"epoch": 27.096114519427402, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0009161554192229039, |
|
"loss": 2.9531, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 27.607361963190183, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0008957055214723927, |
|
"loss": 2.9524, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_loss": 2.9476640224456787, |
|
"eval_runtime": 0.8281, |
|
"eval_samples_per_second": 603.779, |
|
"eval_steps_per_second": 1.208, |
|
"step": 27384 |
|
}, |
|
{ |
|
"epoch": 28.118609406952967, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0008752556237218813, |
|
"loss": 2.9493, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 28.629856850715747, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0008548057259713702, |
|
"loss": 2.9487, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_loss": 2.9507575035095215, |
|
"eval_runtime": 0.9351, |
|
"eval_samples_per_second": 534.698, |
|
"eval_steps_per_second": 1.069, |
|
"step": 28362 |
|
}, |
|
{ |
|
"epoch": 29.141104294478527, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0008343558282208589, |
|
"loss": 2.95, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 29.652351738241308, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0008139059304703477, |
|
"loss": 2.9467, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_loss": 2.9493441581726074, |
|
"eval_runtime": 0.9913, |
|
"eval_samples_per_second": 504.379, |
|
"eval_steps_per_second": 1.009, |
|
"step": 29340 |
|
}, |
|
{ |
|
"epoch": 30.16359918200409, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.0007934560327198365, |
|
"loss": 2.9481, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 30.67484662576687, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0007730061349693251, |
|
"loss": 2.945, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_loss": 2.9469032287597656, |
|
"eval_runtime": 0.7298, |
|
"eval_samples_per_second": 685.073, |
|
"eval_steps_per_second": 1.37, |
|
"step": 30318 |
|
}, |
|
{ |
|
"epoch": 31.186094069529652, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.000752556237218814, |
|
"loss": 2.9452, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 31.697341513292432, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0007321063394683026, |
|
"loss": 2.9431, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_loss": 2.947190046310425, |
|
"eval_runtime": 0.8045, |
|
"eval_samples_per_second": 621.476, |
|
"eval_steps_per_second": 1.243, |
|
"step": 31296 |
|
}, |
|
{ |
|
"epoch": 32.20858895705521, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0007116564417177914, |
|
"loss": 2.9454, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 32.719836400818, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0006912065439672803, |
|
"loss": 2.9428, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_loss": 2.9455933570861816, |
|
"eval_runtime": 0.7507, |
|
"eval_samples_per_second": 666.034, |
|
"eval_steps_per_second": 1.332, |
|
"step": 32274 |
|
}, |
|
{ |
|
"epoch": 33.23108384458078, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0006707566462167689, |
|
"loss": 2.9424, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 33.74233128834356, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0006503067484662577, |
|
"loss": 2.9426, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_loss": 2.945394992828369, |
|
"eval_runtime": 0.91, |
|
"eval_samples_per_second": 549.459, |
|
"eval_steps_per_second": 1.099, |
|
"step": 33252 |
|
}, |
|
{ |
|
"epoch": 34.25357873210634, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0006298568507157464, |
|
"loss": 2.94, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 34.76482617586912, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0006094069529652352, |
|
"loss": 2.9438, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"eval_loss": 2.9434268474578857, |
|
"eval_runtime": 0.9642, |
|
"eval_samples_per_second": 518.581, |
|
"eval_steps_per_second": 1.037, |
|
"step": 34230 |
|
}, |
|
{ |
|
"epoch": 35.2760736196319, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0005889570552147239, |
|
"loss": 2.939, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 35.787321063394685, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.0005685071574642127, |
|
"loss": 2.9407, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_loss": 2.9426307678222656, |
|
"eval_runtime": 0.9434, |
|
"eval_samples_per_second": 530.024, |
|
"eval_steps_per_second": 1.06, |
|
"step": 35208 |
|
}, |
|
{ |
|
"epoch": 36.29856850715746, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.0005480572597137015, |
|
"loss": 2.9372, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 36.809815950920246, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0005276073619631901, |
|
"loss": 2.941, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"eval_loss": 2.9429283142089844, |
|
"eval_runtime": 0.9172, |
|
"eval_samples_per_second": 545.157, |
|
"eval_steps_per_second": 1.09, |
|
"step": 36186 |
|
}, |
|
{ |
|
"epoch": 37.32106339468303, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.000507157464212679, |
|
"loss": 2.9368, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 37.83231083844581, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.00048670756646216766, |
|
"loss": 2.9385, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_loss": 2.9418914318084717, |
|
"eval_runtime": 0.9937, |
|
"eval_samples_per_second": 503.192, |
|
"eval_steps_per_second": 1.006, |
|
"step": 37164 |
|
}, |
|
{ |
|
"epoch": 38.34355828220859, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.00046625766871165645, |
|
"loss": 2.9349, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 38.85480572597137, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.00044580777096114523, |
|
"loss": 2.9384, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"eval_loss": 2.943824291229248, |
|
"eval_runtime": 0.9909, |
|
"eval_samples_per_second": 504.6, |
|
"eval_steps_per_second": 1.009, |
|
"step": 38142 |
|
}, |
|
{ |
|
"epoch": 39.36605316973415, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00042535787321063397, |
|
"loss": 2.9386, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 39.877300613496935, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 0.0004049079754601227, |
|
"loss": 2.9364, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_loss": 2.941757917404175, |
|
"eval_runtime": 0.8169, |
|
"eval_samples_per_second": 612.068, |
|
"eval_steps_per_second": 1.224, |
|
"step": 39120 |
|
}, |
|
{ |
|
"epoch": 40.38854805725971, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.00038445807770961143, |
|
"loss": 2.9374, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 40.899795501022496, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 0.0003640081799591002, |
|
"loss": 2.9341, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"eval_loss": 2.9411299228668213, |
|
"eval_runtime": 0.9525, |
|
"eval_samples_per_second": 524.937, |
|
"eval_steps_per_second": 1.05, |
|
"step": 40098 |
|
}, |
|
{ |
|
"epoch": 41.41104294478528, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.00034355828220858896, |
|
"loss": 2.9323, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 41.922290388548056, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 0.00032310838445807774, |
|
"loss": 2.9361, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"eval_loss": 2.94166898727417, |
|
"eval_runtime": 0.7476, |
|
"eval_samples_per_second": 668.768, |
|
"eval_steps_per_second": 1.338, |
|
"step": 41076 |
|
}, |
|
{ |
|
"epoch": 42.43353783231084, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 0.0003026584867075665, |
|
"loss": 2.9344, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 42.94478527607362, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.0002822085889570552, |
|
"loss": 2.9345, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"eval_loss": 2.9405741691589355, |
|
"eval_runtime": 0.8055, |
|
"eval_samples_per_second": 620.71, |
|
"eval_steps_per_second": 1.241, |
|
"step": 42054 |
|
}, |
|
{ |
|
"epoch": 43.4560327198364, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 0.000261758691206544, |
|
"loss": 2.9342, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 43.967280163599185, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.00024130879345603273, |
|
"loss": 2.9347, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"eval_loss": 2.9399616718292236, |
|
"eval_runtime": 0.7311, |
|
"eval_samples_per_second": 683.897, |
|
"eval_steps_per_second": 1.368, |
|
"step": 43032 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 48900, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.377598882696192e+17, |
|
"train_batch_size": 1024, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|