{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9955423476968797, "eval_steps": 500, "global_step": 756, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0396235760277365, "grad_norm": 27.41085149828123, "learning_rate": 5e-06, "loss": 1.0725, "step": 10 }, { "epoch": 0.079247152055473, "grad_norm": 21.426346851591315, "learning_rate": 5e-06, "loss": 0.9867, "step": 20 }, { "epoch": 0.1188707280832095, "grad_norm": 0.8214529838984242, "learning_rate": 5e-06, "loss": 0.9489, "step": 30 }, { "epoch": 0.158494304110946, "grad_norm": 0.8934541351618693, "learning_rate": 5e-06, "loss": 0.9168, "step": 40 }, { "epoch": 0.1981178801386825, "grad_norm": 0.6396922311734878, "learning_rate": 5e-06, "loss": 0.903, "step": 50 }, { "epoch": 0.237741456166419, "grad_norm": 0.8318834132473311, "learning_rate": 5e-06, "loss": 0.884, "step": 60 }, { "epoch": 0.27736503219415554, "grad_norm": 0.6283992782157446, "learning_rate": 5e-06, "loss": 0.8732, "step": 70 }, { "epoch": 0.316988608221892, "grad_norm": 0.6596849388425453, "learning_rate": 5e-06, "loss": 0.8701, "step": 80 }, { "epoch": 0.35661218424962854, "grad_norm": 0.6758187675410737, "learning_rate": 5e-06, "loss": 0.8641, "step": 90 }, { "epoch": 0.396235760277365, "grad_norm": 1.0513265622155463, "learning_rate": 5e-06, "loss": 0.8597, "step": 100 }, { "epoch": 0.43585933630510154, "grad_norm": 0.5790780273693242, "learning_rate": 5e-06, "loss": 0.8538, "step": 110 }, { "epoch": 0.475482912332838, "grad_norm": 0.5821582037832116, "learning_rate": 5e-06, "loss": 0.8493, "step": 120 }, { "epoch": 0.5151064883605745, "grad_norm": 0.6354797019513801, "learning_rate": 5e-06, "loss": 0.8491, "step": 130 }, { "epoch": 0.5547300643883111, "grad_norm": 0.74047379799972, "learning_rate": 5e-06, "loss": 0.8519, "step": 140 }, { "epoch": 0.5943536404160475, "grad_norm": 0.6926741472374109, "learning_rate": 5e-06, "loss": 0.8388, "step": 150 }, { "epoch": 0.633977216443784, "grad_norm": 0.5805929955993835, "learning_rate": 5e-06, "loss": 0.842, "step": 160 }, { "epoch": 0.6736007924715206, "grad_norm": 0.6028836834161021, "learning_rate": 5e-06, "loss": 0.8381, "step": 170 }, { "epoch": 0.7132243684992571, "grad_norm": 0.5654340386019846, "learning_rate": 5e-06, "loss": 0.8341, "step": 180 }, { "epoch": 0.7528479445269936, "grad_norm": 0.5723158352099399, "learning_rate": 5e-06, "loss": 0.8375, "step": 190 }, { "epoch": 0.79247152055473, "grad_norm": 0.5198688688930553, "learning_rate": 5e-06, "loss": 0.8381, "step": 200 }, { "epoch": 0.8320950965824666, "grad_norm": 0.6050739761187648, "learning_rate": 5e-06, "loss": 0.8342, "step": 210 }, { "epoch": 0.8717186726102031, "grad_norm": 0.5397219218505803, "learning_rate": 5e-06, "loss": 0.8308, "step": 220 }, { "epoch": 0.9113422486379396, "grad_norm": 0.5613324111744644, "learning_rate": 5e-06, "loss": 0.8302, "step": 230 }, { "epoch": 0.950965824665676, "grad_norm": 0.5372566937252206, "learning_rate": 5e-06, "loss": 0.8272, "step": 240 }, { "epoch": 0.9905894006934126, "grad_norm": 0.4784741594645362, "learning_rate": 5e-06, "loss": 0.8269, "step": 250 }, { "epoch": 0.9985141158989599, "eval_loss": 0.8291837573051453, "eval_runtime": 271.0608, "eval_samples_per_second": 25.083, "eval_steps_per_second": 0.395, "step": 252 }, { "epoch": 1.030212976721149, "grad_norm": 0.87740197973562, "learning_rate": 5e-06, "loss": 0.8491, "step": 260 }, { "epoch": 1.0698365527488856, "grad_norm": 0.5678119260366427, "learning_rate": 5e-06, "loss": 0.785, "step": 270 }, { "epoch": 1.1094601287766221, "grad_norm": 0.6337004334606812, "learning_rate": 5e-06, "loss": 0.7887, "step": 280 }, { "epoch": 1.1490837048043585, "grad_norm": 0.6986139436792874, "learning_rate": 5e-06, "loss": 0.7862, "step": 290 }, { "epoch": 1.188707280832095, "grad_norm": 0.5874593683473379, "learning_rate": 5e-06, "loss": 0.7793, "step": 300 }, { "epoch": 1.2283308568598317, "grad_norm": 0.7975693973340302, "learning_rate": 5e-06, "loss": 0.7795, "step": 310 }, { "epoch": 1.267954432887568, "grad_norm": 0.5846939921280547, "learning_rate": 5e-06, "loss": 0.7798, "step": 320 }, { "epoch": 1.3075780089153046, "grad_norm": 0.7654979422005525, "learning_rate": 5e-06, "loss": 0.7832, "step": 330 }, { "epoch": 1.3472015849430412, "grad_norm": 0.6496190339072363, "learning_rate": 5e-06, "loss": 0.7835, "step": 340 }, { "epoch": 1.3868251609707776, "grad_norm": 0.7019021685342436, "learning_rate": 5e-06, "loss": 0.7841, "step": 350 }, { "epoch": 1.4264487369985142, "grad_norm": 0.697904784471384, "learning_rate": 5e-06, "loss": 0.7809, "step": 360 }, { "epoch": 1.4660723130262507, "grad_norm": 0.6140987998837893, "learning_rate": 5e-06, "loss": 0.7839, "step": 370 }, { "epoch": 1.505695889053987, "grad_norm": 0.6176502513567036, "learning_rate": 5e-06, "loss": 0.78, "step": 380 }, { "epoch": 1.5453194650817235, "grad_norm": 0.6195463728584585, "learning_rate": 5e-06, "loss": 0.7812, "step": 390 }, { "epoch": 1.5849430411094603, "grad_norm": 0.5860977422231922, "learning_rate": 5e-06, "loss": 0.7855, "step": 400 }, { "epoch": 1.6245666171371966, "grad_norm": 0.6058978073304615, "learning_rate": 5e-06, "loss": 0.7826, "step": 410 }, { "epoch": 1.664190193164933, "grad_norm": 0.5726428056331896, "learning_rate": 5e-06, "loss": 0.7767, "step": 420 }, { "epoch": 1.7038137691926698, "grad_norm": 0.6255990916494661, "learning_rate": 5e-06, "loss": 0.7776, "step": 430 }, { "epoch": 1.7434373452204062, "grad_norm": 0.621833549178633, "learning_rate": 5e-06, "loss": 0.7786, "step": 440 }, { "epoch": 1.7830609212481425, "grad_norm": 0.5435896897432534, "learning_rate": 5e-06, "loss": 0.7775, "step": 450 }, { "epoch": 1.8226844972758791, "grad_norm": 0.7495004498554898, "learning_rate": 5e-06, "loss": 0.7759, "step": 460 }, { "epoch": 1.8623080733036157, "grad_norm": 0.5987267949701414, "learning_rate": 5e-06, "loss": 0.778, "step": 470 }, { "epoch": 1.901931649331352, "grad_norm": 0.6388662201965061, "learning_rate": 5e-06, "loss": 0.7728, "step": 480 }, { "epoch": 1.9415552253590886, "grad_norm": 0.5711439995135582, "learning_rate": 5e-06, "loss": 0.7817, "step": 490 }, { "epoch": 1.9811788013868252, "grad_norm": 0.617975937550618, "learning_rate": 5e-06, "loss": 0.7778, "step": 500 }, { "epoch": 1.9970282317979198, "eval_loss": 0.8146935701370239, "eval_runtime": 271.9619, "eval_samples_per_second": 25.0, "eval_steps_per_second": 0.393, "step": 504 }, { "epoch": 2.0208023774145616, "grad_norm": 0.6930015338867856, "learning_rate": 5e-06, "loss": 0.8016, "step": 510 }, { "epoch": 2.060425953442298, "grad_norm": 0.6358377946223995, "learning_rate": 5e-06, "loss": 0.7316, "step": 520 }, { "epoch": 2.1000495294700348, "grad_norm": 0.8278609254484581, "learning_rate": 5e-06, "loss": 0.7265, "step": 530 }, { "epoch": 2.139673105497771, "grad_norm": 0.6962674479421791, "learning_rate": 5e-06, "loss": 0.7311, "step": 540 }, { "epoch": 2.1792966815255075, "grad_norm": 0.6727207101976576, "learning_rate": 5e-06, "loss": 0.7294, "step": 550 }, { "epoch": 2.2189202575532443, "grad_norm": 0.5817825946787325, "learning_rate": 5e-06, "loss": 0.7307, "step": 560 }, { "epoch": 2.2585438335809807, "grad_norm": 0.6436375631599169, "learning_rate": 5e-06, "loss": 0.7325, "step": 570 }, { "epoch": 2.298167409608717, "grad_norm": 0.9292364267574976, "learning_rate": 5e-06, "loss": 0.7352, "step": 580 }, { "epoch": 2.337790985636454, "grad_norm": 0.9489749212265489, "learning_rate": 5e-06, "loss": 0.7317, "step": 590 }, { "epoch": 2.37741456166419, "grad_norm": 0.7240317017944226, "learning_rate": 5e-06, "loss": 0.7274, "step": 600 }, { "epoch": 2.4170381376919265, "grad_norm": 0.6501433881507601, "learning_rate": 5e-06, "loss": 0.7301, "step": 610 }, { "epoch": 2.4566617137196634, "grad_norm": 0.8099751855974757, "learning_rate": 5e-06, "loss": 0.734, "step": 620 }, { "epoch": 2.4962852897473997, "grad_norm": 0.6496005537182832, "learning_rate": 5e-06, "loss": 0.7361, "step": 630 }, { "epoch": 2.535908865775136, "grad_norm": 0.5493051299286916, "learning_rate": 5e-06, "loss": 0.7327, "step": 640 }, { "epoch": 2.5755324418028724, "grad_norm": 0.5890719222816667, "learning_rate": 5e-06, "loss": 0.7346, "step": 650 }, { "epoch": 2.6151560178306092, "grad_norm": 0.5341897077649945, "learning_rate": 5e-06, "loss": 0.7314, "step": 660 }, { "epoch": 2.6547795938583456, "grad_norm": 0.5921523513579053, "learning_rate": 5e-06, "loss": 0.7352, "step": 670 }, { "epoch": 2.6944031698860824, "grad_norm": 0.5890344658716773, "learning_rate": 5e-06, "loss": 0.7328, "step": 680 }, { "epoch": 2.734026745913819, "grad_norm": 0.5886927133745706, "learning_rate": 5e-06, "loss": 0.7353, "step": 690 }, { "epoch": 2.773650321941555, "grad_norm": 0.7084337230865917, "learning_rate": 5e-06, "loss": 0.7359, "step": 700 }, { "epoch": 2.8132738979692915, "grad_norm": 0.6320863625704357, "learning_rate": 5e-06, "loss": 0.7289, "step": 710 }, { "epoch": 2.8528974739970283, "grad_norm": 0.6406803421798857, "learning_rate": 5e-06, "loss": 0.7321, "step": 720 }, { "epoch": 2.8925210500247647, "grad_norm": 0.5985842168545841, "learning_rate": 5e-06, "loss": 0.7363, "step": 730 }, { "epoch": 2.9321446260525015, "grad_norm": 0.5878885552435451, "learning_rate": 5e-06, "loss": 0.7317, "step": 740 }, { "epoch": 2.971768202080238, "grad_norm": 0.7738327584141713, "learning_rate": 5e-06, "loss": 0.7327, "step": 750 }, { "epoch": 2.9955423476968797, "eval_loss": 0.8128942847251892, "eval_runtime": 272.1686, "eval_samples_per_second": 24.981, "eval_steps_per_second": 0.393, "step": 756 }, { "epoch": 2.9955423476968797, "step": 756, "total_flos": 1266118777896960.0, "train_loss": 0.7958845669630343, "train_runtime": 44860.7812, "train_samples_per_second": 8.638, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 756, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1266118777896960.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }