{ "best_global_step": 43, "best_metric": 3.79597425, "best_model_checkpoint": "/workspace/output/v0-20250510-202602/checkpoint-43", "epoch": 0.9842632331902719, "eval_steps": 200, "global_step": 43, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022889842632331903, "grad_norm": 0.5022401213645935, "learning_rate": 2.5e-05, "loss": 5.9138689041137695, "memory(GiB)": 22.25, "step": 1, "token_acc": 0.2735191637630662, "train_speed(iter/s)": 0.017618 }, { "epoch": 0.045779685264663805, "grad_norm": 0.4973876178264618, "learning_rate": 5e-05, "loss": 6.206646919250488, "memory(GiB)": 22.25, "step": 2, "token_acc": 0.25411334552102377, "train_speed(iter/s)": 0.024222 }, { "epoch": 0.06866952789699571, "grad_norm": 0.520351767539978, "learning_rate": 4.992664502959351e-05, "loss": 5.884594917297363, "memory(GiB)": 22.25, "step": 3, "token_acc": 0.26119402985074625, "train_speed(iter/s)": 0.027671 }, { "epoch": 0.09155937052932761, "grad_norm": 0.6917837262153625, "learning_rate": 4.970701059450872e-05, "loss": 5.813294887542725, "memory(GiB)": 22.25, "step": 4, "token_acc": 0.2789115646258503, "train_speed(iter/s)": 0.02975 }, { "epoch": 0.11444921316165951, "grad_norm": 0.8174898028373718, "learning_rate": 4.934238559694448e-05, "loss": 6.142425537109375, "memory(GiB)": 22.25, "step": 5, "token_acc": 0.20984455958549222, "train_speed(iter/s)": 0.031187 }, { "epoch": 0.13733905579399142, "grad_norm": 0.5081659555435181, "learning_rate": 4.8834909801373264e-05, "loss": 5.509262561798096, "memory(GiB)": 22.25, "step": 6, "token_acc": 0.29264214046822745, "train_speed(iter/s)": 0.032136 }, { "epoch": 0.16022889842632332, "grad_norm": 0.5285544395446777, "learning_rate": 4.8187561277552374e-05, "loss": 5.453015327453613, "memory(GiB)": 22.25, "step": 7, "token_acc": 0.33134328358208953, "train_speed(iter/s)": 0.032853 }, { "epoch": 0.18311874105865522, "grad_norm": 0.6126793026924133, "learning_rate": 4.740413892402639e-05, "loss": 5.514800071716309, "memory(GiB)": 22.25, "step": 8, "token_acc": 0.24347826086956523, "train_speed(iter/s)": 0.033481 }, { "epoch": 0.20600858369098712, "grad_norm": 0.5079677104949951, "learning_rate": 4.648924017468003e-05, "loss": 5.397139549255371, "memory(GiB)": 22.25, "step": 9, "token_acc": 0.2693069306930693, "train_speed(iter/s)": 0.033962 }, { "epoch": 0.22889842632331903, "grad_norm": 0.5848721861839294, "learning_rate": 4.5448234019167945e-05, "loss": 5.021652698516846, "memory(GiB)": 22.25, "step": 10, "token_acc": 0.32229965156794427, "train_speed(iter/s)": 0.034354 }, { "epoch": 0.25178826895565093, "grad_norm": 0.4369657635688782, "learning_rate": 4.428722949554857e-05, "loss": 5.207980155944824, "memory(GiB)": 22.25, "step": 11, "token_acc": 0.34467455621301774, "train_speed(iter/s)": 0.034653 }, { "epoch": 0.27467811158798283, "grad_norm": 0.7269682884216309, "learning_rate": 4.301303984001967e-05, "loss": 5.160121917724609, "memory(GiB)": 22.25, "step": 12, "token_acc": 0.34941763727121466, "train_speed(iter/s)": 0.034904 }, { "epoch": 0.29756795422031473, "grad_norm": 0.829106867313385, "learning_rate": 4.163314250413913e-05, "loss": 4.662051200866699, "memory(GiB)": 22.25, "step": 13, "token_acc": 0.32751091703056767, "train_speed(iter/s)": 0.035138 }, { "epoch": 0.32045779685264664, "grad_norm": 1.1529988050460815, "learning_rate": 4.015563527416595e-05, "loss": 5.173630237579346, "memory(GiB)": 22.25, "step": 14, "token_acc": 0.28865979381443296, "train_speed(iter/s)": 0.035337 }, { "epoch": 0.34334763948497854, "grad_norm": 0.7239392995834351, "learning_rate": 3.858918875003053e-05, "loss": 4.88520622253418, "memory(GiB)": 22.25, "step": 15, "token_acc": 0.332089552238806, "train_speed(iter/s)": 0.035497 }, { "epoch": 0.36623748211731044, "grad_norm": 0.5255656838417053, "learning_rate": 3.694299546280657e-05, "loss": 4.789463043212891, "memory(GiB)": 22.25, "step": 16, "token_acc": 0.36082474226804123, "train_speed(iter/s)": 0.035644 }, { "epoch": 0.38912732474964234, "grad_norm": 0.527284562587738, "learning_rate": 3.5226715929283506e-05, "loss": 5.008277416229248, "memory(GiB)": 22.25, "step": 17, "token_acc": 0.3034188034188034, "train_speed(iter/s)": 0.035798 }, { "epoch": 0.41201716738197425, "grad_norm": 0.6423527002334595, "learning_rate": 3.3450421960212566e-05, "loss": 4.778470039367676, "memory(GiB)": 22.25, "step": 18, "token_acc": 0.3488372093023256, "train_speed(iter/s)": 0.035907 }, { "epoch": 0.43490701001430615, "grad_norm": 0.4906652867794037, "learning_rate": 3.162453755491655e-05, "loss": 4.682660102844238, "memory(GiB)": 22.25, "step": 19, "token_acc": 0.35555555555555557, "train_speed(iter/s)": 0.036025 }, { "epoch": 0.45779685264663805, "grad_norm": 0.9560534358024597, "learning_rate": 2.975977772911671e-05, "loss": 4.940546989440918, "memory(GiB)": 22.25, "step": 20, "token_acc": 0.36923076923076925, "train_speed(iter/s)": 0.036093 }, { "epoch": 0.48068669527896996, "grad_norm": 0.5544789433479309, "learning_rate": 2.7867085634960016e-05, "loss": 4.366146087646484, "memory(GiB)": 22.25, "step": 21, "token_acc": 0.3649906890130354, "train_speed(iter/s)": 0.036168 }, { "epoch": 0.5035765379113019, "grad_norm": 0.4951302111148834, "learning_rate": 2.595756834225089e-05, "loss": 4.866259574890137, "memory(GiB)": 22.25, "step": 22, "token_acc": 0.34402852049910876, "train_speed(iter/s)": 0.036268 }, { "epoch": 0.5264663805436338, "grad_norm": 1.56654953956604, "learning_rate": 2.4042431657749117e-05, "loss": 4.790994644165039, "memory(GiB)": 22.25, "step": 23, "token_acc": 0.3361522198731501, "train_speed(iter/s)": 0.03635 }, { "epoch": 0.5493562231759657, "grad_norm": 0.529353678226471, "learning_rate": 2.2132914365039993e-05, "loss": 4.498373985290527, "memory(GiB)": 22.25, "step": 24, "token_acc": 0.38278388278388276, "train_speed(iter/s)": 0.036412 }, { "epoch": 0.5722460658082976, "grad_norm": 0.5923216342926025, "learning_rate": 2.0240222270883288e-05, "loss": 4.431886672973633, "memory(GiB)": 22.25, "step": 25, "token_acc": 0.3901345291479821, "train_speed(iter/s)": 0.036468 }, { "epoch": 0.5951359084406295, "grad_norm": 0.5044678449630737, "learning_rate": 1.8375462445083464e-05, "loss": 4.577709674835205, "memory(GiB)": 22.25, "step": 26, "token_acc": 0.3509803921568627, "train_speed(iter/s)": 0.036523 }, { "epoch": 0.6180257510729614, "grad_norm": 0.8515617251396179, "learning_rate": 1.6549578039787436e-05, "loss": 3.797635555267334, "memory(GiB)": 22.25, "step": 27, "token_acc": 0.40134907251264756, "train_speed(iter/s)": 0.036566 }, { "epoch": 0.6409155937052933, "grad_norm": 0.9012308120727539, "learning_rate": 1.4773284070716503e-05, "loss": 4.415590286254883, "memory(GiB)": 22.25, "step": 28, "token_acc": 0.38589981447124305, "train_speed(iter/s)": 0.036597 }, { "epoch": 0.6638054363376252, "grad_norm": 0.5051128268241882, "learning_rate": 1.3057004537193423e-05, "loss": 4.514218330383301, "memory(GiB)": 22.25, "step": 29, "token_acc": 0.3765541740674956, "train_speed(iter/s)": 0.036643 }, { "epoch": 0.6866952789699571, "grad_norm": 0.8118892908096313, "learning_rate": 1.1410811249969475e-05, "loss": 4.161840915679932, "memory(GiB)": 22.25, "step": 30, "token_acc": 0.35412474849094566, "train_speed(iter/s)": 0.036683 }, { "epoch": 0.709585121602289, "grad_norm": 0.7509729266166687, "learning_rate": 9.844364725834057e-06, "loss": 4.108524799346924, "memory(GiB)": 22.25, "step": 31, "token_acc": 0.4240924092409241, "train_speed(iter/s)": 0.036725 }, { "epoch": 0.7324749642346209, "grad_norm": 0.6745265126228333, "learning_rate": 8.36685749586087e-06, "loss": 4.507699489593506, "memory(GiB)": 22.25, "step": 32, "token_acc": 0.35660377358490564, "train_speed(iter/s)": 0.036768 }, { "epoch": 0.7553648068669528, "grad_norm": 0.5046018958091736, "learning_rate": 6.986960159980327e-06, "loss": 4.469419479370117, "memory(GiB)": 22.25, "step": 33, "token_acc": 0.41550387596899224, "train_speed(iter/s)": 0.036795 }, { "epoch": 0.7782546494992847, "grad_norm": 0.6278886198997498, "learning_rate": 5.712770504451426e-06, "loss": 4.4875640869140625, "memory(GiB)": 22.25, "step": 34, "token_acc": 0.386411889596603, "train_speed(iter/s)": 0.036831 }, { "epoch": 0.8011444921316166, "grad_norm": 1.2817845344543457, "learning_rate": 4.551765980832059e-06, "loss": 4.035043239593506, "memory(GiB)": 22.25, "step": 35, "token_acc": 0.39222042139384117, "train_speed(iter/s)": 0.036858 }, { "epoch": 0.8240343347639485, "grad_norm": 0.6294739246368408, "learning_rate": 3.5107598253199758e-06, "loss": 3.905367612838745, "memory(GiB)": 22.25, "step": 36, "token_acc": 0.4039301310043668, "train_speed(iter/s)": 0.036893 }, { "epoch": 0.8469241773962805, "grad_norm": 0.5308797359466553, "learning_rate": 2.595861075973613e-06, "loss": 3.832357883453369, "memory(GiB)": 22.25, "step": 37, "token_acc": 0.37555555555555553, "train_speed(iter/s)": 0.03692 }, { "epoch": 0.8698140200286123, "grad_norm": 0.613280177116394, "learning_rate": 1.8124387224476347e-06, "loss": 3.510023832321167, "memory(GiB)": 22.25, "step": 38, "token_acc": 0.41849529780564265, "train_speed(iter/s)": 0.036949 }, { "epoch": 0.8927038626609443, "grad_norm": 0.5897545218467712, "learning_rate": 1.1650901986267365e-06, "loss": 4.297924041748047, "memory(GiB)": 22.25, "step": 39, "token_acc": 0.38562091503267976, "train_speed(iter/s)": 0.036961 }, { "epoch": 0.9155937052932761, "grad_norm": 0.5033223032951355, "learning_rate": 6.576144030555259e-07, "loss": 3.912318229675293, "memory(GiB)": 22.25, "step": 40, "token_acc": 0.3920792079207921, "train_speed(iter/s)": 0.036991 }, { "epoch": 0.9384835479256081, "grad_norm": 0.44826453924179077, "learning_rate": 2.9298940549128964e-07, "loss": 3.7790920734405518, "memory(GiB)": 22.25, "step": 41, "token_acc": 0.4283464566929134, "train_speed(iter/s)": 0.03701 }, { "epoch": 0.9613733905579399, "grad_norm": 0.8731946349143982, "learning_rate": 7.335497040648898e-08, "loss": 4.0045576095581055, "memory(GiB)": 22.25, "step": 42, "token_acc": 0.3923076923076923, "train_speed(iter/s)": 0.03704 }, { "epoch": 0.9842632331902719, "grad_norm": 1.097395420074463, "learning_rate": 0.0, "loss": 4.415482521057129, "memory(GiB)": 22.25, "step": 43, "token_acc": 0.4146341463414634, "train_speed(iter/s)": 0.037059 }, { "epoch": 0.9842632331902719, "eval_loss": 3.7959742546081543, "eval_runtime": 29.3121, "eval_samples_per_second": 9.996, "eval_steps_per_second": 1.262, "eval_token_acc": 0.4216255442670537, "step": 43 } ], "logging_steps": 1, "max_steps": 43, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.042062092776243e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }