{ "best_metric": 0.7058091506789402, "best_model_checkpoint": "/data/ephemeral/home/level2-nlp-datacentric-nlp-15/models/train_aug_filtered_data_8315.csv_20241107_223341/checkpoint-400", "epoch": 2.0, "eval_steps": 100, "global_step": 492, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04065040650406504, "grad_norm": 5.844911098480225, "learning_rate": 1.959349593495935e-05, "loss": 1.9107, "step": 10 }, { "epoch": 0.08130081300813008, "grad_norm": 4.806931495666504, "learning_rate": 1.91869918699187e-05, "loss": 1.793, "step": 20 }, { "epoch": 0.12195121951219512, "grad_norm": 5.768178939819336, "learning_rate": 1.878048780487805e-05, "loss": 1.6268, "step": 30 }, { "epoch": 0.16260162601626016, "grad_norm": 6.002701759338379, "learning_rate": 1.83739837398374e-05, "loss": 1.5186, "step": 40 }, { "epoch": 0.2032520325203252, "grad_norm": 5.8298115730285645, "learning_rate": 1.796747967479675e-05, "loss": 1.3802, "step": 50 }, { "epoch": 0.24390243902439024, "grad_norm": 5.422698020935059, "learning_rate": 1.75609756097561e-05, "loss": 1.3176, "step": 60 }, { "epoch": 0.2845528455284553, "grad_norm": 7.105832099914551, "learning_rate": 1.7154471544715447e-05, "loss": 1.1553, "step": 70 }, { "epoch": 0.3252032520325203, "grad_norm": 6.3010101318359375, "learning_rate": 1.6747967479674798e-05, "loss": 1.209, "step": 80 }, { "epoch": 0.36585365853658536, "grad_norm": 5.355374813079834, "learning_rate": 1.6341463414634145e-05, "loss": 1.1115, "step": 90 }, { "epoch": 0.4065040650406504, "grad_norm": 8.652898788452148, "learning_rate": 1.5934959349593496e-05, "loss": 1.2678, "step": 100 }, { "epoch": 0.4065040650406504, "eval_f1": 0.6317862490710497, "eval_loss": 1.1097809076309204, "eval_runtime": 30.8532, "eval_samples_per_second": 109.097, "eval_steps_per_second": 3.436, "step": 100 }, { "epoch": 0.44715447154471544, "grad_norm": 5.56029748916626, "learning_rate": 1.5528455284552847e-05, "loss": 1.2245, "step": 110 }, { "epoch": 0.4878048780487805, "grad_norm": 5.553459644317627, "learning_rate": 1.5121951219512196e-05, "loss": 1.0607, "step": 120 }, { "epoch": 0.5284552845528455, "grad_norm": 6.536402702331543, "learning_rate": 1.4715447154471545e-05, "loss": 1.1759, "step": 130 }, { "epoch": 0.5691056910569106, "grad_norm": 5.6078200340271, "learning_rate": 1.4308943089430896e-05, "loss": 1.0878, "step": 140 }, { "epoch": 0.6097560975609756, "grad_norm": 7.2565789222717285, "learning_rate": 1.3902439024390244e-05, "loss": 1.1789, "step": 150 }, { "epoch": 0.6504065040650406, "grad_norm": 6.567378520965576, "learning_rate": 1.3495934959349594e-05, "loss": 1.1621, "step": 160 }, { "epoch": 0.6910569105691057, "grad_norm": 6.680911064147949, "learning_rate": 1.3089430894308943e-05, "loss": 1.0317, "step": 170 }, { "epoch": 0.7317073170731707, "grad_norm": 5.503971576690674, "learning_rate": 1.2682926829268294e-05, "loss": 1.2348, "step": 180 }, { "epoch": 0.7723577235772358, "grad_norm": 5.242718696594238, "learning_rate": 1.2276422764227642e-05, "loss": 1.0482, "step": 190 }, { "epoch": 0.8130081300813008, "grad_norm": 5.709949016571045, "learning_rate": 1.1869918699186992e-05, "loss": 1.0018, "step": 200 }, { "epoch": 0.8130081300813008, "eval_f1": 0.6788360267939961, "eval_loss": 0.9953919649124146, "eval_runtime": 30.8775, "eval_samples_per_second": 109.011, "eval_steps_per_second": 3.433, "step": 200 }, { "epoch": 0.8536585365853658, "grad_norm": 6.952666759490967, "learning_rate": 1.1463414634146342e-05, "loss": 1.0277, "step": 210 }, { "epoch": 0.8943089430894309, "grad_norm": 8.080061912536621, "learning_rate": 1.1056910569105692e-05, "loss": 0.9702, "step": 220 }, { "epoch": 0.9349593495934959, "grad_norm": 6.3944549560546875, "learning_rate": 1.065040650406504e-05, "loss": 1.0117, "step": 230 }, { "epoch": 0.975609756097561, "grad_norm": 6.3046488761901855, "learning_rate": 1.024390243902439e-05, "loss": 1.1426, "step": 240 }, { "epoch": 1.016260162601626, "grad_norm": 5.248263835906982, "learning_rate": 9.837398373983741e-06, "loss": 0.9627, "step": 250 }, { "epoch": 1.056910569105691, "grad_norm": 7.71372127532959, "learning_rate": 9.43089430894309e-06, "loss": 0.9348, "step": 260 }, { "epoch": 1.0975609756097562, "grad_norm": 9.51766586303711, "learning_rate": 9.02439024390244e-06, "loss": 0.9111, "step": 270 }, { "epoch": 1.1382113821138211, "grad_norm": 5.655633926391602, "learning_rate": 8.617886178861789e-06, "loss": 0.8609, "step": 280 }, { "epoch": 1.1788617886178863, "grad_norm": 7.609259128570557, "learning_rate": 8.21138211382114e-06, "loss": 0.8905, "step": 290 }, { "epoch": 1.2195121951219512, "grad_norm": 5.490593433380127, "learning_rate": 7.804878048780489e-06, "loss": 0.7554, "step": 300 }, { "epoch": 1.2195121951219512, "eval_f1": 0.6839676640367311, "eval_loss": 0.9506719708442688, "eval_runtime": 30.8566, "eval_samples_per_second": 109.085, "eval_steps_per_second": 3.435, "step": 300 }, { "epoch": 1.2601626016260163, "grad_norm": 10.015968322753906, "learning_rate": 7.398373983739838e-06, "loss": 0.9043, "step": 310 }, { "epoch": 1.3008130081300813, "grad_norm": 7.634036064147949, "learning_rate": 6.991869918699188e-06, "loss": 0.8676, "step": 320 }, { "epoch": 1.3414634146341464, "grad_norm": 5.928556442260742, "learning_rate": 6.585365853658538e-06, "loss": 0.9127, "step": 330 }, { "epoch": 1.3821138211382114, "grad_norm": 6.368280410766602, "learning_rate": 6.178861788617887e-06, "loss": 0.9316, "step": 340 }, { "epoch": 1.4227642276422765, "grad_norm": 6.322418212890625, "learning_rate": 5.772357723577237e-06, "loss": 0.8273, "step": 350 }, { "epoch": 1.4634146341463414, "grad_norm": 7.215012550354004, "learning_rate": 5.365853658536586e-06, "loss": 0.8814, "step": 360 }, { "epoch": 1.5040650406504064, "grad_norm": 5.894520282745361, "learning_rate": 4.959349593495935e-06, "loss": 0.8295, "step": 370 }, { "epoch": 1.5447154471544715, "grad_norm": 6.685314178466797, "learning_rate": 4.552845528455285e-06, "loss": 0.8727, "step": 380 }, { "epoch": 1.5853658536585367, "grad_norm": 8.929649353027344, "learning_rate": 4.146341463414634e-06, "loss": 0.8012, "step": 390 }, { "epoch": 1.6260162601626016, "grad_norm": 8.644834518432617, "learning_rate": 3.7398373983739838e-06, "loss": 0.8151, "step": 400 }, { "epoch": 1.6260162601626016, "eval_f1": 0.7058091506789402, "eval_loss": 0.9088509678840637, "eval_runtime": 30.8562, "eval_samples_per_second": 109.087, "eval_steps_per_second": 3.435, "step": 400 }, { "epoch": 1.6666666666666665, "grad_norm": 7.286980628967285, "learning_rate": 3.3333333333333333e-06, "loss": 0.8603, "step": 410 }, { "epoch": 1.7073170731707317, "grad_norm": 7.062027454376221, "learning_rate": 2.926829268292683e-06, "loss": 0.8243, "step": 420 }, { "epoch": 1.7479674796747968, "grad_norm": 6.453483581542969, "learning_rate": 2.5203252032520324e-06, "loss": 0.8328, "step": 430 }, { "epoch": 1.7886178861788617, "grad_norm": 6.350010395050049, "learning_rate": 2.1138211382113824e-06, "loss": 0.8307, "step": 440 }, { "epoch": 1.8292682926829267, "grad_norm": 11.967658042907715, "learning_rate": 1.707317073170732e-06, "loss": 0.8335, "step": 450 }, { "epoch": 1.8699186991869918, "grad_norm": 7.773582935333252, "learning_rate": 1.3008130081300815e-06, "loss": 0.7627, "step": 460 }, { "epoch": 1.910569105691057, "grad_norm": 8.230002403259277, "learning_rate": 8.94308943089431e-07, "loss": 0.7758, "step": 470 }, { "epoch": 1.951219512195122, "grad_norm": 5.672370910644531, "learning_rate": 4.878048780487805e-07, "loss": 0.8621, "step": 480 }, { "epoch": 1.9918699186991868, "grad_norm": 9.773723602294922, "learning_rate": 8.130081300813009e-08, "loss": 0.8322, "step": 490 } ], "logging_steps": 10, "max_steps": 492, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4131555260774400.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }