{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.6716417910447765,
  "eval_steps": 500,
  "global_step": 65,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.07462686567164178,
      "grad_norm": 11.831602096557617,
      "learning_rate": 0.00019692307692307696,
      "loss": 9.5753,
      "step": 1
    },
    {
      "epoch": 0.14925373134328357,
      "grad_norm": 10.040313720703125,
      "learning_rate": 0.00019384615384615385,
      "loss": 9.3879,
      "step": 2
    },
    {
      "epoch": 0.22388059701492538,
      "grad_norm": 11.504229545593262,
      "learning_rate": 0.0001907692307692308,
      "loss": 9.2222,
      "step": 3
    },
    {
      "epoch": 0.29850746268656714,
      "grad_norm": 13.777145385742188,
      "learning_rate": 0.0001876923076923077,
      "loss": 9.0187,
      "step": 4
    },
    {
      "epoch": 0.373134328358209,
      "grad_norm": 16.53196907043457,
      "learning_rate": 0.00018461538461538463,
      "loss": 8.8251,
      "step": 5
    },
    {
      "epoch": 0.44776119402985076,
      "grad_norm": 23.02943229675293,
      "learning_rate": 0.00018153846153846155,
      "loss": 8.4652,
      "step": 6
    },
    {
      "epoch": 0.5223880597014925,
      "grad_norm": 20.641206741333008,
      "learning_rate": 0.00017846153846153847,
      "loss": 8.1413,
      "step": 7
    },
    {
      "epoch": 0.5970149253731343,
      "grad_norm": 16.40256690979004,
      "learning_rate": 0.0001753846153846154,
      "loss": 7.9036,
      "step": 8
    },
    {
      "epoch": 0.6716417910447762,
      "grad_norm": 19.79137420654297,
      "learning_rate": 0.00017230769230769234,
      "loss": 7.8958,
      "step": 9
    },
    {
      "epoch": 0.746268656716418,
      "grad_norm": 10.174325942993164,
      "learning_rate": 0.00016923076923076923,
      "loss": 7.6606,
      "step": 10
    },
    {
      "epoch": 0.8208955223880597,
      "grad_norm": 11.052704811096191,
      "learning_rate": 0.00016615384615384617,
      "loss": 7.6561,
      "step": 11
    },
    {
      "epoch": 0.8955223880597015,
      "grad_norm": 12.233122825622559,
      "learning_rate": 0.0001630769230769231,
      "loss": 7.5328,
      "step": 12
    },
    {
      "epoch": 0.9701492537313433,
      "grad_norm": 10.275497436523438,
      "learning_rate": 0.00016,
      "loss": 7.4978,
      "step": 13
    },
    {
      "epoch": 1.0,
      "grad_norm": 7.3966779708862305,
      "learning_rate": 0.00015692307692307693,
      "loss": 3.0154,
      "step": 14
    },
    {
      "epoch": 1.0746268656716418,
      "grad_norm": 13.597101211547852,
      "learning_rate": 0.00015384615384615385,
      "loss": 7.3022,
      "step": 15
    },
    {
      "epoch": 1.1492537313432836,
      "grad_norm": 12.541572570800781,
      "learning_rate": 0.00015076923076923077,
      "loss": 7.4243,
      "step": 16
    },
    {
      "epoch": 1.2238805970149254,
      "grad_norm": 7.609414577484131,
      "learning_rate": 0.00014769230769230772,
      "loss": 7.3352,
      "step": 17
    },
    {
      "epoch": 1.2985074626865671,
      "grad_norm": 8.79019546508789,
      "learning_rate": 0.0001446153846153846,
      "loss": 7.3403,
      "step": 18
    },
    {
      "epoch": 1.373134328358209,
      "grad_norm": 7.513161659240723,
      "learning_rate": 0.00014153846153846156,
      "loss": 7.3048,
      "step": 19
    },
    {
      "epoch": 1.4477611940298507,
      "grad_norm": Infinity,
      "learning_rate": 0.00014153846153846156,
      "loss": 7.3934,
      "step": 20
    },
    {
      "epoch": 1.5223880597014925,
      "grad_norm": 11.386406898498535,
      "learning_rate": 0.00013846153846153847,
      "loss": 7.2708,
      "step": 21
    },
    {
      "epoch": 1.5970149253731343,
      "grad_norm": NaN,
      "learning_rate": 0.00013846153846153847,
      "loss": 7.2549,
      "step": 22
    },
    {
      "epoch": 1.671641791044776,
      "grad_norm": 9.923229217529297,
      "learning_rate": 0.0001353846153846154,
      "loss": 7.3161,
      "step": 23
    },
    {
      "epoch": 1.7462686567164178,
      "grad_norm": 7.624272346496582,
      "learning_rate": 0.0001323076923076923,
      "loss": 7.2844,
      "step": 24
    },
    {
      "epoch": 1.8208955223880596,
      "grad_norm": 6.796629905700684,
      "learning_rate": 0.00012923076923076923,
      "loss": 7.2594,
      "step": 25
    },
    {
      "epoch": 1.8955223880597014,
      "grad_norm": 5.520105838775635,
      "learning_rate": 0.00012615384615384615,
      "loss": 7.2639,
      "step": 26
    },
    {
      "epoch": 1.9701492537313432,
      "grad_norm": 5.706660270690918,
      "learning_rate": 0.0001230769230769231,
      "loss": 7.2271,
      "step": 27
    },
    {
      "epoch": 2.0,
      "grad_norm": 4.667247295379639,
      "learning_rate": 0.00012,
      "loss": 2.942,
      "step": 28
    },
    {
      "epoch": 2.074626865671642,
      "grad_norm": 8.385282516479492,
      "learning_rate": 0.00011692307692307694,
      "loss": 7.1126,
      "step": 29
    },
    {
      "epoch": 2.1492537313432836,
      "grad_norm": 4.17704963684082,
      "learning_rate": 0.00011384615384615384,
      "loss": 7.2177,
      "step": 30
    },
    {
      "epoch": 2.2238805970149254,
      "grad_norm": 7.29147481918335,
      "learning_rate": 0.00011076923076923077,
      "loss": 7.2731,
      "step": 31
    },
    {
      "epoch": 2.298507462686567,
      "grad_norm": 5.086247444152832,
      "learning_rate": 0.0001076923076923077,
      "loss": 7.1494,
      "step": 32
    },
    {
      "epoch": 2.373134328358209,
      "grad_norm": 4.92710542678833,
      "learning_rate": 0.00010461538461538463,
      "loss": 7.1479,
      "step": 33
    },
    {
      "epoch": 2.4477611940298507,
      "grad_norm": 5.310170650482178,
      "learning_rate": 0.00010153846153846153,
      "loss": 7.1172,
      "step": 34
    },
    {
      "epoch": 2.5223880597014925,
      "grad_norm": 5.681138515472412,
      "learning_rate": 9.846153846153848e-05,
      "loss": 7.1665,
      "step": 35
    },
    {
      "epoch": 2.5970149253731343,
      "grad_norm": 6.48416805267334,
      "learning_rate": 9.53846153846154e-05,
      "loss": 7.229,
      "step": 36
    },
    {
      "epoch": 2.671641791044776,
      "grad_norm": 7.22155237197876,
      "learning_rate": 9.230769230769232e-05,
      "loss": 7.2443,
      "step": 37
    },
    {
      "epoch": 2.746268656716418,
      "grad_norm": 5.244325160980225,
      "learning_rate": 8.923076923076924e-05,
      "loss": 7.1864,
      "step": 38
    },
    {
      "epoch": 2.8208955223880596,
      "grad_norm": 6.491042613983154,
      "learning_rate": 8.615384615384617e-05,
      "loss": 7.1403,
      "step": 39
    },
    {
      "epoch": 2.8955223880597014,
      "grad_norm": 4.748079299926758,
      "learning_rate": 8.307692307692309e-05,
      "loss": 7.2464,
      "step": 40
    },
    {
      "epoch": 2.970149253731343,
      "grad_norm": 4.683705806732178,
      "learning_rate": 8e-05,
      "loss": 7.1566,
      "step": 41
    },
    {
      "epoch": 3.0,
      "grad_norm": 8.315367698669434,
      "learning_rate": 7.692307692307693e-05,
      "loss": 2.7335,
      "step": 42
    },
    {
      "epoch": 3.074626865671642,
      "grad_norm": 8.71499252319336,
      "learning_rate": 7.384615384615386e-05,
      "loss": 7.0353,
      "step": 43
    },
    {
      "epoch": 3.1492537313432836,
      "grad_norm": 4.421390056610107,
      "learning_rate": 7.076923076923078e-05,
      "loss": 7.1684,
      "step": 44
    },
    {
      "epoch": 3.2238805970149254,
      "grad_norm": 4.962438106536865,
      "learning_rate": 6.76923076923077e-05,
      "loss": 7.0518,
      "step": 45
    },
    {
      "epoch": 3.298507462686567,
      "grad_norm": 4.061994552612305,
      "learning_rate": 6.461538461538462e-05,
      "loss": 7.0753,
      "step": 46
    },
    {
      "epoch": 3.373134328358209,
      "grad_norm": 10.23737621307373,
      "learning_rate": 6.153846153846155e-05,
      "loss": 7.2593,
      "step": 47
    },
    {
      "epoch": 3.4477611940298507,
      "grad_norm": 5.402864456176758,
      "learning_rate": 5.846153846153847e-05,
      "loss": 7.1255,
      "step": 48
    },
    {
      "epoch": 3.5223880597014925,
      "grad_norm": 7.442513465881348,
      "learning_rate": 5.538461538461539e-05,
      "loss": 6.9086,
      "step": 49
    },
    {
      "epoch": 3.5970149253731343,
      "grad_norm": 12.119452476501465,
      "learning_rate": 5.230769230769231e-05,
      "loss": 7.2655,
      "step": 50
    },
    {
      "epoch": 3.671641791044776,
      "grad_norm": 4.1253156661987305,
      "learning_rate": 4.923076923076924e-05,
      "loss": 7.129,
      "step": 51
    },
    {
      "epoch": 3.746268656716418,
      "grad_norm": 7.582332611083984,
      "learning_rate": 4.615384615384616e-05,
      "loss": 7.1866,
      "step": 52
    },
    {
      "epoch": 3.8208955223880596,
      "grad_norm": 9.213349342346191,
      "learning_rate": 4.3076923076923084e-05,
      "loss": 7.2262,
      "step": 53
    },
    {
      "epoch": 3.8955223880597014,
      "grad_norm": 4.230329513549805,
      "learning_rate": 4e-05,
      "loss": 7.0612,
      "step": 54
    },
    {
      "epoch": 3.970149253731343,
      "grad_norm": 3.959320545196533,
      "learning_rate": 3.692307692307693e-05,
      "loss": 7.095,
      "step": 55
    },
    {
      "epoch": 4.0,
      "grad_norm": 2.782017469406128,
      "learning_rate": 3.384615384615385e-05,
      "loss": 2.8985,
      "step": 56
    },
    {
      "epoch": 4.074626865671641,
      "grad_norm": 8.111074447631836,
      "learning_rate": 3.0769230769230774e-05,
      "loss": 7.0008,
      "step": 57
    },
    {
      "epoch": 4.149253731343284,
      "grad_norm": 3.9246532917022705,
      "learning_rate": 2.7692307692307694e-05,
      "loss": 7.1253,
      "step": 58
    },
    {
      "epoch": 4.223880597014926,
      "grad_norm": 11.857646942138672,
      "learning_rate": 2.461538461538462e-05,
      "loss": 6.9411,
      "step": 59
    },
    {
      "epoch": 4.298507462686567,
      "grad_norm": 2.8319931030273438,
      "learning_rate": 2.1538461538461542e-05,
      "loss": 7.1616,
      "step": 60
    },
    {
      "epoch": 4.373134328358209,
      "grad_norm": 3.061171293258667,
      "learning_rate": 1.8461538461538465e-05,
      "loss": 7.1291,
      "step": 61
    },
    {
      "epoch": 4.447761194029851,
      "grad_norm": 5.19327449798584,
      "learning_rate": 1.5384615384615387e-05,
      "loss": 7.1867,
      "step": 62
    },
    {
      "epoch": 4.522388059701493,
      "grad_norm": 3.0203020572662354,
      "learning_rate": 1.230769230769231e-05,
      "loss": 7.1364,
      "step": 63
    },
    {
      "epoch": 4.597014925373134,
      "grad_norm": 3.957735300064087,
      "learning_rate": 9.230769230769232e-06,
      "loss": 7.139,
      "step": 64
    },
    {
      "epoch": 4.6716417910447765,
      "grad_norm": 4.099056243896484,
      "learning_rate": 6.153846153846155e-06,
      "loss": 7.1104,
      "step": 65
    },
    {
      "epoch": 4.6716417910447765,
      "step": 65,
      "total_flos": 46267156615872.0,
      "train_loss": 7.157758657748882,
      "train_runtime": 2311.3066,
      "train_samples_per_second": 0.58,
      "train_steps_per_second": 0.028
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 65,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 46267156615872.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}