{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 1956, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07668711656441718, "grad_norm": 4.499133110046387, "learning_rate": 1.948875255623722e-05, "loss": 1.9728, "step": 50 }, { "epoch": 0.15337423312883436, "grad_norm": 5.71051025390625, "learning_rate": 1.8977505112474438e-05, "loss": 0.461, "step": 100 }, { "epoch": 0.15337423312883436, "eval_loss": 0.22311069071292877, "eval_runtime": 5.1848, "eval_samples_per_second": 38.574, "eval_steps_per_second": 9.644, "step": 100 }, { "epoch": 0.23006134969325154, "grad_norm": 3.072005271911621, "learning_rate": 1.8466257668711657e-05, "loss": 0.2389, "step": 150 }, { "epoch": 0.3067484662576687, "grad_norm": 1.1518678665161133, "learning_rate": 1.795501022494888e-05, "loss": 0.1298, "step": 200 }, { "epoch": 0.3067484662576687, "eval_loss": 0.15515142679214478, "eval_runtime": 5.0536, "eval_samples_per_second": 39.576, "eval_steps_per_second": 9.894, "step": 200 }, { "epoch": 0.3834355828220859, "grad_norm": 0.24072186648845673, "learning_rate": 1.7443762781186097e-05, "loss": 0.1158, "step": 250 }, { "epoch": 0.4601226993865031, "grad_norm": 1.7901278734207153, "learning_rate": 1.6932515337423315e-05, "loss": 0.1306, "step": 300 }, { "epoch": 0.4601226993865031, "eval_loss": 0.14273914694786072, "eval_runtime": 5.0507, "eval_samples_per_second": 39.599, "eval_steps_per_second": 9.9, "step": 300 }, { "epoch": 0.5368098159509203, "grad_norm": 3.707313299179077, "learning_rate": 1.6421267893660533e-05, "loss": 0.0962, "step": 350 }, { "epoch": 0.6134969325153374, "grad_norm": 2.1014602184295654, "learning_rate": 1.591002044989775e-05, "loss": 0.1075, "step": 400 }, { "epoch": 0.6134969325153374, "eval_loss": 0.1252596527338028, "eval_runtime": 5.1358, "eval_samples_per_second": 38.942, "eval_steps_per_second": 9.736, "step": 400 }, { "epoch": 0.6901840490797546, "grad_norm": 0.3385634124279022, "learning_rate": 1.539877300613497e-05, "loss": 0.1164, "step": 450 }, { "epoch": 0.7668711656441718, "grad_norm": 0.6808003783226013, "learning_rate": 1.488752556237219e-05, "loss": 0.0819, "step": 500 }, { "epoch": 0.7668711656441718, "eval_loss": 0.12110447138547897, "eval_runtime": 5.0635, "eval_samples_per_second": 39.498, "eval_steps_per_second": 9.875, "step": 500 }, { "epoch": 0.843558282208589, "grad_norm": 0.19589178264141083, "learning_rate": 1.4376278118609408e-05, "loss": 0.0872, "step": 550 }, { "epoch": 0.9202453987730062, "grad_norm": 2.004000186920166, "learning_rate": 1.3865030674846627e-05, "loss": 0.0991, "step": 600 }, { "epoch": 0.9202453987730062, "eval_loss": 0.11007200181484222, "eval_runtime": 5.0747, "eval_samples_per_second": 39.411, "eval_steps_per_second": 9.853, "step": 600 }, { "epoch": 0.9969325153374233, "grad_norm": 1.948713779449463, "learning_rate": 1.3353783231083845e-05, "loss": 0.0886, "step": 650 }, { "epoch": 1.0736196319018405, "grad_norm": 1.6017649173736572, "learning_rate": 1.2842535787321065e-05, "loss": 0.0921, "step": 700 }, { "epoch": 1.0736196319018405, "eval_loss": 0.10996025800704956, "eval_runtime": 5.0876, "eval_samples_per_second": 39.312, "eval_steps_per_second": 9.828, "step": 700 }, { "epoch": 1.1503067484662577, "grad_norm": 4.096665382385254, "learning_rate": 1.2331288343558283e-05, "loss": 0.0645, "step": 750 }, { "epoch": 1.2269938650306749, "grad_norm": 6.811099529266357, "learning_rate": 1.1820040899795502e-05, "loss": 0.0692, "step": 800 }, { "epoch": 1.2269938650306749, "eval_loss": 0.10661312192678452, "eval_runtime": 5.0632, "eval_samples_per_second": 39.5, "eval_steps_per_second": 9.875, "step": 800 }, { "epoch": 1.303680981595092, "grad_norm": 0.8224550485610962, "learning_rate": 1.130879345603272e-05, "loss": 0.0431, "step": 850 }, { "epoch": 1.3803680981595092, "grad_norm": 0.07309632748365402, "learning_rate": 1.079754601226994e-05, "loss": 0.0557, "step": 900 }, { "epoch": 1.3803680981595092, "eval_loss": 0.10908563435077667, "eval_runtime": 5.0441, "eval_samples_per_second": 39.651, "eval_steps_per_second": 9.913, "step": 900 }, { "epoch": 1.4570552147239264, "grad_norm": 0.06727185100317001, "learning_rate": 1.0286298568507158e-05, "loss": 0.0531, "step": 950 }, { "epoch": 1.5337423312883436, "grad_norm": 0.17810319364070892, "learning_rate": 9.775051124744377e-06, "loss": 0.0546, "step": 1000 }, { "epoch": 1.5337423312883436, "eval_loss": 0.11184686422348022, "eval_runtime": 5.078, "eval_samples_per_second": 39.385, "eval_steps_per_second": 9.846, "step": 1000 }, { "epoch": 1.6104294478527608, "grad_norm": 0.6150496602058411, "learning_rate": 9.263803680981595e-06, "loss": 0.0921, "step": 1050 }, { "epoch": 1.687116564417178, "grad_norm": 1.6784019470214844, "learning_rate": 8.752556237218815e-06, "loss": 0.0754, "step": 1100 }, { "epoch": 1.687116564417178, "eval_loss": 0.10531575977802277, "eval_runtime": 5.0653, "eval_samples_per_second": 39.484, "eval_steps_per_second": 9.871, "step": 1100 }, { "epoch": 1.7638036809815951, "grad_norm": 1.444920539855957, "learning_rate": 8.241308793456033e-06, "loss": 0.0643, "step": 1150 }, { "epoch": 1.8404907975460123, "grad_norm": 6.627353668212891, "learning_rate": 7.730061349693252e-06, "loss": 0.0554, "step": 1200 }, { "epoch": 1.8404907975460123, "eval_loss": 0.10470691323280334, "eval_runtime": 5.0833, "eval_samples_per_second": 39.344, "eval_steps_per_second": 9.836, "step": 1200 }, { "epoch": 1.9171779141104295, "grad_norm": 6.533907413482666, "learning_rate": 7.218813905930471e-06, "loss": 0.0615, "step": 1250 }, { "epoch": 1.9938650306748467, "grad_norm": 0.020672863349318504, "learning_rate": 6.707566462167689e-06, "loss": 0.0585, "step": 1300 }, { "epoch": 1.9938650306748467, "eval_loss": 0.1054287850856781, "eval_runtime": 5.0922, "eval_samples_per_second": 39.276, "eval_steps_per_second": 9.819, "step": 1300 }, { "epoch": 2.0705521472392636, "grad_norm": 1.1368516683578491, "learning_rate": 6.1963190184049085e-06, "loss": 0.0505, "step": 1350 }, { "epoch": 2.147239263803681, "grad_norm": 2.824483633041382, "learning_rate": 5.685071574642127e-06, "loss": 0.058, "step": 1400 }, { "epoch": 2.147239263803681, "eval_loss": 0.10347571969032288, "eval_runtime": 5.1381, "eval_samples_per_second": 38.925, "eval_steps_per_second": 9.731, "step": 1400 }, { "epoch": 2.223926380368098, "grad_norm": 0.24446068704128265, "learning_rate": 5.173824130879346e-06, "loss": 0.0495, "step": 1450 }, { "epoch": 2.3006134969325154, "grad_norm": 0.29132819175720215, "learning_rate": 4.662576687116564e-06, "loss": 0.0643, "step": 1500 }, { "epoch": 2.3006134969325154, "eval_loss": 0.10253553092479706, "eval_runtime": 5.0736, "eval_samples_per_second": 39.419, "eval_steps_per_second": 9.855, "step": 1500 }, { "epoch": 2.3773006134969323, "grad_norm": 0.03403930738568306, "learning_rate": 4.1513292433537835e-06, "loss": 0.0454, "step": 1550 }, { "epoch": 2.4539877300613497, "grad_norm": 0.08365653455257416, "learning_rate": 3.6400817995910027e-06, "loss": 0.046, "step": 1600 }, { "epoch": 2.4539877300613497, "eval_loss": 0.10056012868881226, "eval_runtime": 5.0737, "eval_samples_per_second": 39.419, "eval_steps_per_second": 9.855, "step": 1600 }, { "epoch": 2.530674846625767, "grad_norm": 3.2596278190612793, "learning_rate": 3.1288343558282214e-06, "loss": 0.0423, "step": 1650 }, { "epoch": 2.607361963190184, "grad_norm": 4.449475288391113, "learning_rate": 2.61758691206544e-06, "loss": 0.041, "step": 1700 }, { "epoch": 2.607361963190184, "eval_loss": 0.10248824954032898, "eval_runtime": 5.0793, "eval_samples_per_second": 39.375, "eval_steps_per_second": 9.844, "step": 1700 }, { "epoch": 2.684049079754601, "grad_norm": 0.08391686528921127, "learning_rate": 2.1063394683026585e-06, "loss": 0.0432, "step": 1750 }, { "epoch": 2.7607361963190185, "grad_norm": 5.902612209320068, "learning_rate": 1.5950920245398775e-06, "loss": 0.0431, "step": 1800 }, { "epoch": 2.7607361963190185, "eval_loss": 0.1023348718881607, "eval_runtime": 5.0815, "eval_samples_per_second": 39.358, "eval_steps_per_second": 9.84, "step": 1800 }, { "epoch": 2.837423312883436, "grad_norm": 5.896852016448975, "learning_rate": 1.0838445807770962e-06, "loss": 0.0508, "step": 1850 }, { "epoch": 2.914110429447853, "grad_norm": 11.15097427368164, "learning_rate": 5.72597137014315e-07, "loss": 0.0367, "step": 1900 }, { "epoch": 2.914110429447853, "eval_loss": 0.10221439599990845, "eval_runtime": 5.104, "eval_samples_per_second": 39.185, "eval_steps_per_second": 9.796, "step": 1900 }, { "epoch": 2.9907975460122698, "grad_norm": 0.035228431224823, "learning_rate": 6.134969325153375e-08, "loss": 0.0398, "step": 1950 } ], "logging_steps": 50, "max_steps": 1956, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": -1956, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.1820994785378304e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }