|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1261.8181818181818, |
|
"eval_steps": 500, |
|
"global_step": 3470, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 36.36363636363637, |
|
"grad_norm": 3.9305503368377686, |
|
"learning_rate": 0.00019992595626374085, |
|
"loss": 3.5508, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 72.72727272727273, |
|
"grad_norm": 7.143374919891357, |
|
"learning_rate": 0.00019969786478821292, |
|
"loss": 1.0142, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 109.0909090909091, |
|
"grad_norm": 4.481863498687744, |
|
"learning_rate": 0.00019932062382607466, |
|
"loss": 0.5034, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 145.45454545454547, |
|
"grad_norm": 5.447987079620361, |
|
"learning_rate": 0.00019878719501520854, |
|
"loss": 0.3536, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 181.8181818181818, |
|
"grad_norm": 6.030304431915283, |
|
"learning_rate": 0.00019810144350986773, |
|
"loss": 0.2281, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 218.1818181818182, |
|
"grad_norm": 3.293713331222534, |
|
"learning_rate": 0.0001972644266891692, |
|
"loss": 0.1648, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 254.54545454545453, |
|
"grad_norm": 3.844909191131592, |
|
"learning_rate": 0.0001962774351723822, |
|
"loss": 0.1083, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 290.90909090909093, |
|
"grad_norm": 6.67781400680542, |
|
"learning_rate": 0.00019514199082888708, |
|
"loss": 0.0708, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 327.27272727272725, |
|
"grad_norm": 0.4285012483596802, |
|
"learning_rate": 0.00019385984443156292, |
|
"loss": 0.0539, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 363.6363636363636, |
|
"grad_norm": 9.999969482421875, |
|
"learning_rate": 0.00019243297295722252, |
|
"loss": 0.0471, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 400.0, |
|
"grad_norm": 0.11577145010232925, |
|
"learning_rate": 0.00019086357653825758, |
|
"loss": 0.0361, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 436.3636363636364, |
|
"grad_norm": 0.04236361011862755, |
|
"learning_rate": 0.00018915407507019406, |
|
"loss": 0.0202, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 472.72727272727275, |
|
"grad_norm": 0.09598010778427124, |
|
"learning_rate": 0.0001873071044803886, |
|
"loss": 0.0204, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 509.09090909090907, |
|
"grad_norm": 0.04872431233525276, |
|
"learning_rate": 0.00018532551266361953, |
|
"loss": 0.0127, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 545.4545454545455, |
|
"grad_norm": 1.3734139204025269, |
|
"learning_rate": 0.00018321235509083966, |
|
"loss": 0.0125, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 581.8181818181819, |
|
"grad_norm": 0.058050643652677536, |
|
"learning_rate": 0.00018097089009786154, |
|
"loss": 0.0109, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 618.1818181818181, |
|
"grad_norm": 0.2065199464559555, |
|
"learning_rate": 0.0001786045738612397, |
|
"loss": 0.0054, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 654.5454545454545, |
|
"grad_norm": 0.011252381838858128, |
|
"learning_rate": 0.000176117055069097, |
|
"loss": 0.0092, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 690.9090909090909, |
|
"grad_norm": 0.2677134573459625, |
|
"learning_rate": 0.00017351216929511202, |
|
"loss": 0.0069, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 727.2727272727273, |
|
"grad_norm": 0.19302548468112946, |
|
"learning_rate": 0.00017079393308434222, |
|
"loss": 0.0052, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 763.6363636363636, |
|
"grad_norm": 0.09640073031187057, |
|
"learning_rate": 0.000167966537760003, |
|
"loss": 0.0056, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 800.0, |
|
"grad_norm": 0.05480790510773659, |
|
"learning_rate": 0.00016503434296075077, |
|
"loss": 0.0041, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 836.3636363636364, |
|
"grad_norm": 0.10284140706062317, |
|
"learning_rate": 0.00016200186991843633, |
|
"loss": 0.0026, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 872.7272727272727, |
|
"grad_norm": 0.03779178485274315, |
|
"learning_rate": 0.0001588737944866928, |
|
"loss": 0.0077, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 909.0909090909091, |
|
"grad_norm": 0.040165312588214874, |
|
"learning_rate": 0.00015565493993110856, |
|
"loss": 0.0068, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 945.4545454545455, |
|
"grad_norm": 0.0260086078196764, |
|
"learning_rate": 0.00015235026949210102, |
|
"loss": 0.004, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 981.8181818181819, |
|
"grad_norm": 0.0010075848549604416, |
|
"learning_rate": 0.0001489648787319599, |
|
"loss": 0.0029, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1018.1818181818181, |
|
"grad_norm": 0.1857517808675766, |
|
"learning_rate": 0.000145538952885112, |
|
"loss": 0.0025, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1054.5454545454545, |
|
"grad_norm": 0.009264905005693436, |
|
"learning_rate": 0.00014200857284118066, |
|
"loss": 0.0032, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1090.909090909091, |
|
"grad_norm": 0.005081487353891134, |
|
"learning_rate": 0.00013841341862277026, |
|
"loss": 0.0023, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1127.2727272727273, |
|
"grad_norm": 0.32677099108695984, |
|
"learning_rate": 0.0001347590336971037, |
|
"loss": 0.0055, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1163.6363636363637, |
|
"grad_norm": 0.014215439558029175, |
|
"learning_rate": 0.00013105105286086123, |
|
"loss": 0.0005, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1200.0, |
|
"grad_norm": 0.015997236594557762, |
|
"learning_rate": 0.00012729519355173254, |
|
"loss": 0.0009, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1236.3636363636363, |
|
"grad_norm": 0.668640673160553, |
|
"learning_rate": 0.00012349724703254215, |
|
"loss": 0.0024, |
|
"step": 3400 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 8000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4000, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.0175353452509184e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|