|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9865871833084947, |
|
"eval_steps": 500, |
|
"global_step": 501, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05961251862891207, |
|
"grad_norm": 9.858894874210412, |
|
"learning_rate": 5e-06, |
|
"loss": 0.9953, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.11922503725782414, |
|
"grad_norm": 3.200951587553443, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8872, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.17883755588673622, |
|
"grad_norm": 0.8709047390143161, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8521, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.23845007451564829, |
|
"grad_norm": 0.7917099907436183, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8247, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.29806259314456035, |
|
"grad_norm": 0.7435704731717787, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8138, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.35767511177347244, |
|
"grad_norm": 0.6069787155672357, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8091, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.4172876304023845, |
|
"grad_norm": 0.7240817518375863, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7952, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.47690014903129657, |
|
"grad_norm": 0.7043281338985534, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7931, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5365126676602087, |
|
"grad_norm": 0.586854693193699, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7883, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5961251862891207, |
|
"grad_norm": 0.5823407135149267, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7853, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6557377049180327, |
|
"grad_norm": 0.6283773137272283, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7847, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7153502235469449, |
|
"grad_norm": 0.6299087806837431, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7847, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7749627421758569, |
|
"grad_norm": 0.6905339934914414, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7716, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.834575260804769, |
|
"grad_norm": 0.5764098315055344, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7722, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.8941877794336811, |
|
"grad_norm": 0.596630510361194, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7723, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9538002980625931, |
|
"grad_norm": 0.6393363259269436, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7727, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.9955290611028316, |
|
"eval_loss": 0.7674793601036072, |
|
"eval_runtime": 178.4077, |
|
"eval_samples_per_second": 25.33, |
|
"eval_steps_per_second": 0.398, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.0134128166915053, |
|
"grad_norm": 1.028243455897246, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7832, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.0730253353204173, |
|
"grad_norm": 0.6463791732368608, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7342, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.1326378539493294, |
|
"grad_norm": 0.7099545246983331, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7264, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.1922503725782414, |
|
"grad_norm": 0.6643420655401983, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7249, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.2518628912071534, |
|
"grad_norm": 0.7310409955228447, |
|
"learning_rate": 5e-06, |
|
"loss": 0.733, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.3114754098360657, |
|
"grad_norm": 0.6511777722980082, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7219, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.3710879284649775, |
|
"grad_norm": 0.7827849824805997, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7208, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.4307004470938898, |
|
"grad_norm": 0.6318650254279009, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7188, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.4903129657228018, |
|
"grad_norm": 0.6677002292570327, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7192, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.5499254843517138, |
|
"grad_norm": 0.564078834227423, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7245, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.6095380029806259, |
|
"grad_norm": 0.5237353037646948, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7263, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.669150521609538, |
|
"grad_norm": 0.6003861550477204, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7203, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.7287630402384502, |
|
"grad_norm": 0.7033677185903798, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7217, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.788375558867362, |
|
"grad_norm": 0.6820739163112765, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7267, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.8479880774962743, |
|
"grad_norm": 0.6594996515164985, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7179, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.9076005961251863, |
|
"grad_norm": 0.6309878288237989, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7196, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.9672131147540983, |
|
"grad_norm": 0.5108818281911819, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7274, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.9970193740685542, |
|
"eval_loss": 0.7537275552749634, |
|
"eval_runtime": 178.6454, |
|
"eval_samples_per_second": 25.296, |
|
"eval_steps_per_second": 0.397, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.0268256333830106, |
|
"grad_norm": 0.919905069790943, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7213, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.0864381520119224, |
|
"grad_norm": 0.5627127586056083, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6748, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.1460506706408347, |
|
"grad_norm": 0.698070822337305, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6786, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.2056631892697465, |
|
"grad_norm": 0.7188866615898121, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6768, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.2652757078986587, |
|
"grad_norm": 0.7308617732721838, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6779, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.3248882265275705, |
|
"grad_norm": 0.6553661010694801, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6764, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.384500745156483, |
|
"grad_norm": 0.5825389739879382, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6782, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.444113263785395, |
|
"grad_norm": 0.5934721290598023, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6723, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.503725782414307, |
|
"grad_norm": 0.6213580858455807, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6771, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.563338301043219, |
|
"grad_norm": 0.5958772497068647, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6772, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.6229508196721314, |
|
"grad_norm": 0.6075566821845071, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6771, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.682563338301043, |
|
"grad_norm": 0.6500795583419717, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6777, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.742175856929955, |
|
"grad_norm": 0.5645816490580728, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6745, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.8017883755588673, |
|
"grad_norm": 0.593606888596975, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6781, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.8614008941877795, |
|
"grad_norm": 0.7862396669753284, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6794, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.9210134128166914, |
|
"grad_norm": 0.5741353341999686, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6742, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.9806259314456036, |
|
"grad_norm": 0.6882181948210808, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6854, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.9865871833084947, |
|
"eval_loss": 0.7531510591506958, |
|
"eval_runtime": 177.5218, |
|
"eval_samples_per_second": 25.456, |
|
"eval_steps_per_second": 0.4, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 2.9865871833084947, |
|
"step": 501, |
|
"total_flos": 838984280309760.0, |
|
"train_loss": 0.7383992823060164, |
|
"train_runtime": 29820.4569, |
|
"train_samples_per_second": 8.636, |
|
"train_steps_per_second": 0.017 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 501, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 838984280309760.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|