{ "best_metric": 2.9399616718292236, "best_model_checkpoint": "./results/models/checkpoint-43032", "epoch": 44.0, "eval_steps": 500, "global_step": 43032, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5112474437627812, "grad_norm": 0.056640625, "learning_rate": 0.001979550102249489, "loss": 3.8421, "step": 500 }, { "epoch": 1.0, "eval_loss": 3.714660882949829, "eval_runtime": 2.4299, "eval_samples_per_second": 205.767, "eval_steps_per_second": 0.412, "step": 978 }, { "epoch": 1.0224948875255624, "grad_norm": 0.0849609375, "learning_rate": 0.0019591002044989777, "loss": 3.7427, "step": 1000 }, { "epoch": 1.5337423312883436, "grad_norm": 0.2236328125, "learning_rate": 0.0019386503067484661, "loss": 3.6832, "step": 1500 }, { "epoch": 2.0, "eval_loss": 3.5354397296905518, "eval_runtime": 2.3994, "eval_samples_per_second": 208.386, "eval_steps_per_second": 0.417, "step": 1956 }, { "epoch": 2.044989775051125, "grad_norm": 0.197265625, "learning_rate": 0.001918200408997955, "loss": 3.5848, "step": 2000 }, { "epoch": 2.556237218813906, "grad_norm": 0.26171875, "learning_rate": 0.0018977505112474438, "loss": 3.474, "step": 2500 }, { "epoch": 3.0, "eval_loss": 3.3599166870117188, "eval_runtime": 2.2955, "eval_samples_per_second": 217.822, "eval_steps_per_second": 0.436, "step": 2934 }, { "epoch": 3.067484662576687, "grad_norm": 0.2216796875, "learning_rate": 0.0018773006134969327, "loss": 3.3901, "step": 3000 }, { "epoch": 3.5787321063394684, "grad_norm": 0.263671875, "learning_rate": 0.0018568507157464215, "loss": 3.3209, "step": 3500 }, { "epoch": 4.0, "eval_loss": 3.2504658699035645, "eval_runtime": 2.7262, "eval_samples_per_second": 183.409, "eval_steps_per_second": 0.367, "step": 3912 }, { "epoch": 4.08997955010225, "grad_norm": 0.302734375, "learning_rate": 0.00183640081799591, "loss": 3.2742, "step": 4000 }, { "epoch": 4.601226993865031, "grad_norm": 0.44140625, "learning_rate": 0.0018159509202453987, "loss": 3.2314, "step": 4500 }, { "epoch": 5.0, "eval_loss": 3.1896719932556152, "eval_runtime": 2.6127, "eval_samples_per_second": 191.371, "eval_steps_per_second": 0.383, "step": 4890 }, { "epoch": 5.112474437627812, "grad_norm": 0.302734375, "learning_rate": 0.0017955010224948876, "loss": 3.2012, "step": 5000 }, { "epoch": 5.623721881390593, "grad_norm": 0.296875, "learning_rate": 0.0017750511247443764, "loss": 3.1715, "step": 5500 }, { "epoch": 6.0, "eval_loss": 3.143582820892334, "eval_runtime": 2.3471, "eval_samples_per_second": 213.027, "eval_steps_per_second": 0.426, "step": 5868 }, { "epoch": 6.134969325153374, "grad_norm": 0.318359375, "learning_rate": 0.001754601226993865, "loss": 3.1533, "step": 6000 }, { "epoch": 6.6462167689161555, "grad_norm": 0.283203125, "learning_rate": 0.0017341513292433537, "loss": 3.1316, "step": 6500 }, { "epoch": 7.0, "eval_loss": 3.1050572395324707, "eval_runtime": 2.7309, "eval_samples_per_second": 183.09, "eval_steps_per_second": 0.366, "step": 6846 }, { "epoch": 7.157464212678937, "grad_norm": 0.291015625, "learning_rate": 0.0017137014314928425, "loss": 3.1166, "step": 7000 }, { "epoch": 7.668711656441718, "grad_norm": 0.4296875, "learning_rate": 0.0016932515337423314, "loss": 3.1031, "step": 7500 }, { "epoch": 8.0, "eval_loss": 3.0762298107147217, "eval_runtime": 3.005, "eval_samples_per_second": 166.388, "eval_steps_per_second": 0.333, "step": 7824 }, { "epoch": 8.1799591002045, "grad_norm": 0.310546875, "learning_rate": 0.0016728016359918202, "loss": 3.0867, "step": 8000 }, { "epoch": 8.69120654396728, "grad_norm": 0.283203125, "learning_rate": 0.0016523517382413088, "loss": 3.0785, "step": 8500 }, { "epoch": 9.0, "eval_loss": 3.063835382461548, "eval_runtime": 0.9235, "eval_samples_per_second": 541.411, "eval_steps_per_second": 1.083, "step": 8802 }, { "epoch": 9.202453987730062, "grad_norm": 0.3125, "learning_rate": 0.0016319018404907975, "loss": 3.0683, "step": 9000 }, { "epoch": 9.713701431492842, "grad_norm": 0.39453125, "learning_rate": 0.0016114519427402863, "loss": 3.0582, "step": 9500 }, { "epoch": 10.0, "eval_loss": 3.043813943862915, "eval_runtime": 0.9286, "eval_samples_per_second": 538.419, "eval_steps_per_second": 1.077, "step": 9780 }, { "epoch": 10.224948875255624, "grad_norm": 0.341796875, "learning_rate": 0.0015910020449897751, "loss": 3.05, "step": 10000 }, { "epoch": 10.736196319018404, "grad_norm": 0.412109375, "learning_rate": 0.0015705521472392638, "loss": 3.0436, "step": 10500 }, { "epoch": 11.0, "eval_loss": 3.0265145301818848, "eval_runtime": 0.8628, "eval_samples_per_second": 579.525, "eval_steps_per_second": 1.159, "step": 10758 }, { "epoch": 11.247443762781186, "grad_norm": 0.287109375, "learning_rate": 0.0015501022494887526, "loss": 3.0351, "step": 11000 }, { "epoch": 11.758691206543967, "grad_norm": 0.34765625, "learning_rate": 0.0015296523517382412, "loss": 3.0337, "step": 11500 }, { "epoch": 12.0, "eval_loss": 3.0209317207336426, "eval_runtime": 1.0057, "eval_samples_per_second": 497.152, "eval_steps_per_second": 0.994, "step": 11736 }, { "epoch": 12.269938650306749, "grad_norm": 0.29296875, "learning_rate": 0.00150920245398773, "loss": 3.0209, "step": 12000 }, { "epoch": 12.781186094069529, "grad_norm": 0.333984375, "learning_rate": 0.001488752556237219, "loss": 3.0193, "step": 12500 }, { "epoch": 13.0, "eval_loss": 3.0119516849517822, "eval_runtime": 0.9281, "eval_samples_per_second": 538.717, "eval_steps_per_second": 1.077, "step": 12714 }, { "epoch": 13.292433537832311, "grad_norm": 0.373046875, "learning_rate": 0.0014683026584867075, "loss": 3.0169, "step": 13000 }, { "epoch": 13.803680981595091, "grad_norm": 0.337890625, "learning_rate": 0.0014478527607361964, "loss": 3.0083, "step": 13500 }, { "epoch": 14.0, "eval_loss": 3.005497455596924, "eval_runtime": 0.9352, "eval_samples_per_second": 534.66, "eval_steps_per_second": 1.069, "step": 13692 }, { "epoch": 14.314928425357873, "grad_norm": 0.373046875, "learning_rate": 0.001427402862985685, "loss": 3.0039, "step": 14000 }, { "epoch": 14.826175869120654, "grad_norm": 0.35546875, "learning_rate": 0.0014069529652351738, "loss": 3.0026, "step": 14500 }, { "epoch": 15.0, "eval_loss": 2.997267007827759, "eval_runtime": 0.8134, "eval_samples_per_second": 614.694, "eval_steps_per_second": 1.229, "step": 14670 }, { "epoch": 15.337423312883436, "grad_norm": 0.306640625, "learning_rate": 0.0013865030674846627, "loss": 2.9966, "step": 15000 }, { "epoch": 15.848670756646216, "grad_norm": 0.32421875, "learning_rate": 0.0013660531697341513, "loss": 2.9944, "step": 15500 }, { "epoch": 16.0, "eval_loss": 2.986419916152954, "eval_runtime": 1.0985, "eval_samples_per_second": 455.159, "eval_steps_per_second": 0.91, "step": 15648 }, { "epoch": 16.359918200409, "grad_norm": 0.3203125, "learning_rate": 0.0013456032719836402, "loss": 2.9894, "step": 16000 }, { "epoch": 16.87116564417178, "grad_norm": 0.3046875, "learning_rate": 0.0013251533742331288, "loss": 2.9893, "step": 16500 }, { "epoch": 17.0, "eval_loss": 2.9802663326263428, "eval_runtime": 0.9902, "eval_samples_per_second": 504.964, "eval_steps_per_second": 1.01, "step": 16626 }, { "epoch": 17.38241308793456, "grad_norm": 0.318359375, "learning_rate": 0.0013047034764826176, "loss": 2.9857, "step": 17000 }, { "epoch": 17.893660531697343, "grad_norm": 0.33203125, "learning_rate": 0.0012842535787321062, "loss": 2.9819, "step": 17500 }, { "epoch": 18.0, "eval_loss": 2.978501796722412, "eval_runtime": 0.9842, "eval_samples_per_second": 508.013, "eval_steps_per_second": 1.016, "step": 17604 }, { "epoch": 18.404907975460123, "grad_norm": 0.337890625, "learning_rate": 0.001263803680981595, "loss": 2.9785, "step": 18000 }, { "epoch": 18.916155419222903, "grad_norm": 0.30859375, "learning_rate": 0.001243353783231084, "loss": 2.9779, "step": 18500 }, { "epoch": 19.0, "eval_loss": 2.9747886657714844, "eval_runtime": 1.1337, "eval_samples_per_second": 441.025, "eval_steps_per_second": 0.882, "step": 18582 }, { "epoch": 19.427402862985684, "grad_norm": 0.296875, "learning_rate": 0.0012229038854805726, "loss": 2.9719, "step": 19000 }, { "epoch": 19.938650306748468, "grad_norm": 0.33984375, "learning_rate": 0.0012024539877300614, "loss": 2.9763, "step": 19500 }, { "epoch": 20.0, "eval_loss": 2.9712185859680176, "eval_runtime": 0.8388, "eval_samples_per_second": 596.063, "eval_steps_per_second": 1.192, "step": 19560 }, { "epoch": 20.449897750511248, "grad_norm": 0.298828125, "learning_rate": 0.00118200408997955, "loss": 2.9675, "step": 20000 }, { "epoch": 20.961145194274028, "grad_norm": 0.447265625, "learning_rate": 0.0011615541922290389, "loss": 2.9715, "step": 20500 }, { "epoch": 21.0, "eval_loss": 2.9683492183685303, "eval_runtime": 1.2443, "eval_samples_per_second": 401.838, "eval_steps_per_second": 0.804, "step": 20538 }, { "epoch": 21.47239263803681, "grad_norm": 0.2890625, "learning_rate": 0.0011411042944785277, "loss": 2.9674, "step": 21000 }, { "epoch": 21.983640081799592, "grad_norm": 0.322265625, "learning_rate": 0.0011206543967280163, "loss": 2.9658, "step": 21500 }, { "epoch": 22.0, "eval_loss": 2.96321177482605, "eval_runtime": 1.0239, "eval_samples_per_second": 488.314, "eval_steps_per_second": 0.977, "step": 21516 }, { "epoch": 22.494887525562373, "grad_norm": 0.296875, "learning_rate": 0.0011002044989775052, "loss": 2.9617, "step": 22000 }, { "epoch": 23.0, "eval_loss": 2.962963342666626, "eval_runtime": 1.176, "eval_samples_per_second": 425.172, "eval_steps_per_second": 0.85, "step": 22494 }, { "epoch": 23.006134969325153, "grad_norm": 0.392578125, "learning_rate": 0.0010797546012269938, "loss": 2.9646, "step": 22500 }, { "epoch": 23.517382413087933, "grad_norm": 0.345703125, "learning_rate": 0.0010593047034764826, "loss": 2.959, "step": 23000 }, { "epoch": 24.0, "eval_loss": 2.9584052562713623, "eval_runtime": 0.8936, "eval_samples_per_second": 559.54, "eval_steps_per_second": 1.119, "step": 23472 }, { "epoch": 24.028629856850717, "grad_norm": 0.33984375, "learning_rate": 0.0010388548057259715, "loss": 2.9606, "step": 23500 }, { "epoch": 24.539877300613497, "grad_norm": 0.333984375, "learning_rate": 0.00101840490797546, "loss": 2.9591, "step": 24000 }, { "epoch": 25.0, "eval_loss": 2.9545176029205322, "eval_runtime": 0.9712, "eval_samples_per_second": 514.829, "eval_steps_per_second": 1.03, "step": 24450 }, { "epoch": 25.051124744376278, "grad_norm": 0.279296875, "learning_rate": 0.000997955010224949, "loss": 2.9567, "step": 24500 }, { "epoch": 25.562372188139058, "grad_norm": 0.380859375, "learning_rate": 0.0009775051124744376, "loss": 2.9566, "step": 25000 }, { "epoch": 26.0, "eval_loss": 2.9551963806152344, "eval_runtime": 0.8333, "eval_samples_per_second": 600.037, "eval_steps_per_second": 1.2, "step": 25428 }, { "epoch": 26.073619631901842, "grad_norm": 0.3046875, "learning_rate": 0.0009570552147239264, "loss": 2.9551, "step": 25500 }, { "epoch": 26.584867075664622, "grad_norm": 0.2734375, "learning_rate": 0.0009366053169734151, "loss": 2.9523, "step": 26000 }, { "epoch": 27.0, "eval_loss": 2.957306146621704, "eval_runtime": 0.7301, "eval_samples_per_second": 684.842, "eval_steps_per_second": 1.37, "step": 26406 }, { "epoch": 27.096114519427402, "grad_norm": 0.318359375, "learning_rate": 0.0009161554192229039, "loss": 2.9531, "step": 26500 }, { "epoch": 27.607361963190183, "grad_norm": 0.318359375, "learning_rate": 0.0008957055214723927, "loss": 2.9524, "step": 27000 }, { "epoch": 28.0, "eval_loss": 2.9476640224456787, "eval_runtime": 0.8281, "eval_samples_per_second": 603.779, "eval_steps_per_second": 1.208, "step": 27384 }, { "epoch": 28.118609406952967, "grad_norm": 0.294921875, "learning_rate": 0.0008752556237218813, "loss": 2.9493, "step": 27500 }, { "epoch": 28.629856850715747, "grad_norm": 0.31640625, "learning_rate": 0.0008548057259713702, "loss": 2.9487, "step": 28000 }, { "epoch": 29.0, "eval_loss": 2.9507575035095215, "eval_runtime": 0.9351, "eval_samples_per_second": 534.698, "eval_steps_per_second": 1.069, "step": 28362 }, { "epoch": 29.141104294478527, "grad_norm": 0.287109375, "learning_rate": 0.0008343558282208589, "loss": 2.95, "step": 28500 }, { "epoch": 29.652351738241308, "grad_norm": 0.294921875, "learning_rate": 0.0008139059304703477, "loss": 2.9467, "step": 29000 }, { "epoch": 30.0, "eval_loss": 2.9493441581726074, "eval_runtime": 0.9913, "eval_samples_per_second": 504.379, "eval_steps_per_second": 1.009, "step": 29340 }, { "epoch": 30.16359918200409, "grad_norm": 0.26171875, "learning_rate": 0.0007934560327198365, "loss": 2.9481, "step": 29500 }, { "epoch": 30.67484662576687, "grad_norm": 0.283203125, "learning_rate": 0.0007730061349693251, "loss": 2.945, "step": 30000 }, { "epoch": 31.0, "eval_loss": 2.9469032287597656, "eval_runtime": 0.7298, "eval_samples_per_second": 685.073, "eval_steps_per_second": 1.37, "step": 30318 }, { "epoch": 31.186094069529652, "grad_norm": 0.28515625, "learning_rate": 0.000752556237218814, "loss": 2.9452, "step": 30500 }, { "epoch": 31.697341513292432, "grad_norm": 0.28125, "learning_rate": 0.0007321063394683026, "loss": 2.9431, "step": 31000 }, { "epoch": 32.0, "eval_loss": 2.947190046310425, "eval_runtime": 0.8045, "eval_samples_per_second": 621.476, "eval_steps_per_second": 1.243, "step": 31296 }, { "epoch": 32.20858895705521, "grad_norm": 0.265625, "learning_rate": 0.0007116564417177914, "loss": 2.9454, "step": 31500 }, { "epoch": 32.719836400818, "grad_norm": 0.287109375, "learning_rate": 0.0006912065439672803, "loss": 2.9428, "step": 32000 }, { "epoch": 33.0, "eval_loss": 2.9455933570861816, "eval_runtime": 0.7507, "eval_samples_per_second": 666.034, "eval_steps_per_second": 1.332, "step": 32274 }, { "epoch": 33.23108384458078, "grad_norm": 0.27734375, "learning_rate": 0.0006707566462167689, "loss": 2.9424, "step": 32500 }, { "epoch": 33.74233128834356, "grad_norm": 0.2890625, "learning_rate": 0.0006503067484662577, "loss": 2.9426, "step": 33000 }, { "epoch": 34.0, "eval_loss": 2.945394992828369, "eval_runtime": 0.91, "eval_samples_per_second": 549.459, "eval_steps_per_second": 1.099, "step": 33252 }, { "epoch": 34.25357873210634, "grad_norm": 0.3125, "learning_rate": 0.0006298568507157464, "loss": 2.94, "step": 33500 }, { "epoch": 34.76482617586912, "grad_norm": 0.2734375, "learning_rate": 0.0006094069529652352, "loss": 2.9438, "step": 34000 }, { "epoch": 35.0, "eval_loss": 2.9434268474578857, "eval_runtime": 0.9642, "eval_samples_per_second": 518.581, "eval_steps_per_second": 1.037, "step": 34230 }, { "epoch": 35.2760736196319, "grad_norm": 0.287109375, "learning_rate": 0.0005889570552147239, "loss": 2.939, "step": 34500 }, { "epoch": 35.787321063394685, "grad_norm": 0.263671875, "learning_rate": 0.0005685071574642127, "loss": 2.9407, "step": 35000 }, { "epoch": 36.0, "eval_loss": 2.9426307678222656, "eval_runtime": 0.9434, "eval_samples_per_second": 530.024, "eval_steps_per_second": 1.06, "step": 35208 }, { "epoch": 36.29856850715746, "grad_norm": 0.2578125, "learning_rate": 0.0005480572597137015, "loss": 2.9372, "step": 35500 }, { "epoch": 36.809815950920246, "grad_norm": 0.283203125, "learning_rate": 0.0005276073619631901, "loss": 2.941, "step": 36000 }, { "epoch": 37.0, "eval_loss": 2.9429283142089844, "eval_runtime": 0.9172, "eval_samples_per_second": 545.157, "eval_steps_per_second": 1.09, "step": 36186 }, { "epoch": 37.32106339468303, "grad_norm": 0.26171875, "learning_rate": 0.000507157464212679, "loss": 2.9368, "step": 36500 }, { "epoch": 37.83231083844581, "grad_norm": 0.2578125, "learning_rate": 0.00048670756646216766, "loss": 2.9385, "step": 37000 }, { "epoch": 38.0, "eval_loss": 2.9418914318084717, "eval_runtime": 0.9937, "eval_samples_per_second": 503.192, "eval_steps_per_second": 1.006, "step": 37164 }, { "epoch": 38.34355828220859, "grad_norm": 0.25390625, "learning_rate": 0.00046625766871165645, "loss": 2.9349, "step": 37500 }, { "epoch": 38.85480572597137, "grad_norm": 0.255859375, "learning_rate": 0.00044580777096114523, "loss": 2.9384, "step": 38000 }, { "epoch": 39.0, "eval_loss": 2.943824291229248, "eval_runtime": 0.9909, "eval_samples_per_second": 504.6, "eval_steps_per_second": 1.009, "step": 38142 }, { "epoch": 39.36605316973415, "grad_norm": 0.271484375, "learning_rate": 0.00042535787321063397, "loss": 2.9386, "step": 38500 }, { "epoch": 39.877300613496935, "grad_norm": 0.24609375, "learning_rate": 0.0004049079754601227, "loss": 2.9364, "step": 39000 }, { "epoch": 40.0, "eval_loss": 2.941757917404175, "eval_runtime": 0.8169, "eval_samples_per_second": 612.068, "eval_steps_per_second": 1.224, "step": 39120 }, { "epoch": 40.38854805725971, "grad_norm": 0.25, "learning_rate": 0.00038445807770961143, "loss": 2.9374, "step": 39500 }, { "epoch": 40.899795501022496, "grad_norm": 0.2431640625, "learning_rate": 0.0003640081799591002, "loss": 2.9341, "step": 40000 }, { "epoch": 41.0, "eval_loss": 2.9411299228668213, "eval_runtime": 0.9525, "eval_samples_per_second": 524.937, "eval_steps_per_second": 1.05, "step": 40098 }, { "epoch": 41.41104294478528, "grad_norm": 0.2451171875, "learning_rate": 0.00034355828220858896, "loss": 2.9323, "step": 40500 }, { "epoch": 41.922290388548056, "grad_norm": 0.2353515625, "learning_rate": 0.00032310838445807774, "loss": 2.9361, "step": 41000 }, { "epoch": 42.0, "eval_loss": 2.94166898727417, "eval_runtime": 0.7476, "eval_samples_per_second": 668.768, "eval_steps_per_second": 1.338, "step": 41076 }, { "epoch": 42.43353783231084, "grad_norm": 0.2470703125, "learning_rate": 0.0003026584867075665, "loss": 2.9344, "step": 41500 }, { "epoch": 42.94478527607362, "grad_norm": 0.255859375, "learning_rate": 0.0002822085889570552, "loss": 2.9345, "step": 42000 }, { "epoch": 43.0, "eval_loss": 2.9405741691589355, "eval_runtime": 0.8055, "eval_samples_per_second": 620.71, "eval_steps_per_second": 1.241, "step": 42054 }, { "epoch": 43.4560327198364, "grad_norm": 0.24609375, "learning_rate": 0.000261758691206544, "loss": 2.9342, "step": 42500 }, { "epoch": 43.967280163599185, "grad_norm": 0.2451171875, "learning_rate": 0.00024130879345603273, "loss": 2.9347, "step": 43000 }, { "epoch": 44.0, "eval_loss": 2.9399616718292236, "eval_runtime": 0.7311, "eval_samples_per_second": 683.897, "eval_steps_per_second": 1.368, "step": 43032 } ], "logging_steps": 500, "max_steps": 48900, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.377598882696192e+17, "train_batch_size": 1024, "trial_name": null, "trial_params": null }