{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 4149,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.024102193299590263,
      "grad_norm": 0.10564957559108734,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 2.4343,
      "step": 100
    },
    {
      "epoch": 0.048204386599180526,
      "grad_norm": 0.12436462193727493,
      "learning_rate": 4.000000000000001e-06,
      "loss": 2.4331,
      "step": 200
    },
    {
      "epoch": 0.07230657989877079,
      "grad_norm": 0.17909838259220123,
      "learning_rate": 6e-06,
      "loss": 2.4138,
      "step": 300
    },
    {
      "epoch": 0.09640877319836105,
      "grad_norm": 0.2627304196357727,
      "learning_rate": 8.000000000000001e-06,
      "loss": 2.4152,
      "step": 400
    },
    {
      "epoch": 0.12051096649795132,
      "grad_norm": 0.3407520651817322,
      "learning_rate": 1e-05,
      "loss": 2.3356,
      "step": 500
    },
    {
      "epoch": 0.14461315979754158,
      "grad_norm": 0.4051746428012848,
      "learning_rate": 1.2e-05,
      "loss": 2.2944,
      "step": 600
    },
    {
      "epoch": 0.16871535309713184,
      "grad_norm": 0.4393845200538635,
      "learning_rate": 1.4e-05,
      "loss": 2.3165,
      "step": 700
    },
    {
      "epoch": 0.1928175463967221,
      "grad_norm": 0.49629464745521545,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 2.3161,
      "step": 800
    },
    {
      "epoch": 0.21691973969631237,
      "grad_norm": 0.5297220945358276,
      "learning_rate": 1.8e-05,
      "loss": 2.2482,
      "step": 900
    },
    {
      "epoch": 0.24102193299590263,
      "grad_norm": 0.586216390132904,
      "learning_rate": 2e-05,
      "loss": 2.2301,
      "step": 1000
    },
    {
      "epoch": 0.26512412629549287,
      "grad_norm": 0.600564181804657,
      "learning_rate": 1.9950276214176393e-05,
      "loss": 2.1974,
      "step": 1100
    },
    {
      "epoch": 0.28922631959508316,
      "grad_norm": 0.6726705431938171,
      "learning_rate": 1.9801599347680883e-05,
      "loss": 2.187,
      "step": 1200
    },
    {
      "epoch": 0.3133285128946734,
      "grad_norm": 0.618769109249115,
      "learning_rate": 1.9555447955846792e-05,
      "loss": 2.1867,
      "step": 1300
    },
    {
      "epoch": 0.3374307061942637,
      "grad_norm": 0.7493981719017029,
      "learning_rate": 1.921426995449166e-05,
      "loss": 2.1705,
      "step": 1400
    },
    {
      "epoch": 0.3615328994938539,
      "grad_norm": 0.8110669255256653,
      "learning_rate": 1.8781458275988913e-05,
      "loss": 2.174,
      "step": 1500
    },
    {
      "epoch": 0.3856350927934442,
      "grad_norm": 0.7075093388557434,
      "learning_rate": 1.826131712737932e-05,
      "loss": 2.1311,
      "step": 1600
    },
    {
      "epoch": 0.40973728609303445,
      "grad_norm": 0.7727652192115784,
      "learning_rate": 1.7659019186077174e-05,
      "loss": 2.1228,
      "step": 1700
    },
    {
      "epoch": 0.43383947939262474,
      "grad_norm": 0.84642493724823,
      "learning_rate": 1.6980554158849546e-05,
      "loss": 2.1254,
      "step": 1800
    },
    {
      "epoch": 0.457941672692215,
      "grad_norm": 0.855783998966217,
      "learning_rate": 1.6232669215636963e-05,
      "loss": 2.1079,
      "step": 1900
    },
    {
      "epoch": 0.48204386599180526,
      "grad_norm": 0.7568260431289673,
      "learning_rate": 1.5422801890586833e-05,
      "loss": 2.0675,
      "step": 2000
    },
    {
      "epoch": 0.5061460592913956,
      "grad_norm": 1.03830885887146,
      "learning_rate": 1.4559006117582424e-05,
      "loss": 2.1163,
      "step": 2100
    },
    {
      "epoch": 0.5302482525909857,
      "grad_norm": 1.3009450435638428,
      "learning_rate": 1.3649872135826173e-05,
      "loss": 2.1031,
      "step": 2200
    },
    {
      "epoch": 0.554350445890576,
      "grad_norm": 0.9386299252510071,
      "learning_rate": 1.2704441061996842e-05,
      "loss": 2.0527,
      "step": 2300
    },
    {
      "epoch": 0.5784526391901663,
      "grad_norm": 0.8045169115066528,
      "learning_rate": 1.1732114978539648e-05,
      "loss": 2.0772,
      "step": 2400
    },
    {
      "epoch": 0.6025548324897566,
      "grad_norm": 0.8329111933708191,
      "learning_rate": 1.0742563432239503e-05,
      "loss": 2.0947,
      "step": 2500
    },
    {
      "epoch": 0.6266570257893468,
      "grad_norm": 0.9610249996185303,
      "learning_rate": 9.745627272926332e-06,
      "loss": 2.0799,
      "step": 2600
    },
    {
      "epoch": 0.6507592190889371,
      "grad_norm": 0.8086296319961548,
      "learning_rate": 8.751220788613237e-06,
      "loss": 2.0669,
      "step": 2700
    },
    {
      "epoch": 0.6748614123885274,
      "grad_norm": 0.8295447826385498,
      "learning_rate": 7.769233110309735e-06,
      "loss": 2.0666,
      "step": 2800
    },
    {
      "epoch": 0.6989636056881177,
      "grad_norm": 0.796343982219696,
      "learning_rate": 6.809429867015307e-06,
      "loss": 2.125,
      "step": 2900
    },
    {
      "epoch": 0.7230657989877078,
      "grad_norm": 0.9749905467033386,
      "learning_rate": 5.881356068910429e-06,
      "loss": 2.0724,
      "step": 3000
    },
    {
      "epoch": 0.7471679922872981,
      "grad_norm": 1.090606689453125,
      "learning_rate": 4.994241184548192e-06,
      "loss": 2.0116,
      "step": 3100
    },
    {
      "epoch": 0.7712701855868884,
      "grad_norm": 1.000171422958374,
      "learning_rate": 4.1569073560307905e-06,
      "loss": 2.0562,
      "step": 3200
    },
    {
      "epoch": 0.7953723788864787,
      "grad_norm": 0.9586558938026428,
      "learning_rate": 3.3776816649486378e-06,
      "loss": 2.0545,
      "step": 3300
    },
    {
      "epoch": 0.8194745721860689,
      "grad_norm": 0.949877917766571,
      "learning_rate": 2.6643133215760586e-06,
      "loss": 2.0571,
      "step": 3400
    },
    {
      "epoch": 0.8435767654856592,
      "grad_norm": 0.9553411602973938,
      "learning_rate": 2.0238966008568905e-06,
      "loss": 2.027,
      "step": 3500
    },
    {
      "epoch": 0.8676789587852495,
      "grad_norm": 0.9195176959037781,
      "learning_rate": 1.4628002915629202e-06,
      "loss": 2.0751,
      "step": 3600
    },
    {
      "epoch": 0.8917811520848398,
      "grad_norm": 0.9022210836410522,
      "learning_rate": 9.866043602360909e-07,
      "loss": 2.0573,
      "step": 3700
    },
    {
      "epoch": 0.91588334538443,
      "grad_norm": 0.7705771923065186,
      "learning_rate": 6.000444597762811e-07,
      "loss": 2.06,
      "step": 3800
    },
    {
      "epoch": 0.9399855386840202,
      "grad_norm": 1.009355902671814,
      "learning_rate": 3.069648345231813e-07,
      "loss": 2.0263,
      "step": 3900
    },
    {
      "epoch": 0.9640877319836105,
      "grad_norm": 0.9817578196525574,
      "learning_rate": 1.1028009017986174e-07,
      "loss": 2.0777,
      "step": 4000
    },
    {
      "epoch": 0.9881899252832008,
      "grad_norm": 1.0477696657180786,
      "learning_rate": 1.1946208766822066e-08,
      "loss": 2.0764,
      "step": 4100
    },
    {
      "epoch": 1.0,
      "step": 4149,
      "total_flos": 7.54868493877248e+16,
      "train_loss": 2.153304632613963,
      "train_runtime": 1112.8562,
      "train_samples_per_second": 7.456,
      "train_steps_per_second": 3.728
    }
  ],
  "logging_steps": 100,
  "max_steps": 4149,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7.54868493877248e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}