|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.01949769498511016, |
|
"global_step": 1120, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 2e-05, |
|
"loss": 4.1497, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 4e-05, |
|
"loss": 3.7626, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 6e-05, |
|
"loss": 3.3202, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 8e-05, |
|
"loss": 3.1832, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.0001, |
|
"loss": 3.0717, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00012, |
|
"loss": 2.9981, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00014000000000000001, |
|
"loss": 2.9461, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00016, |
|
"loss": 2.9218, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00017999999999999998, |
|
"loss": 2.9574, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.0002, |
|
"loss": 2.9917, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00022, |
|
"loss": 2.9242, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00024, |
|
"loss": 2.9225, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00026000000000000003, |
|
"loss": 2.9253, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00028000000000000003, |
|
"loss": 2.8171, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.0003, |
|
"loss": 2.9023, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00032, |
|
"loss": 2.809, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00034, |
|
"loss": 2.9079, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00035999999999999997, |
|
"loss": 2.7811, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00038, |
|
"loss": 2.8214, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.0004, |
|
"loss": 2.821, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00042, |
|
"loss": 2.7647, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00044, |
|
"loss": 2.776, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00046, |
|
"loss": 2.8615, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00048, |
|
"loss": 2.7913, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.0005, |
|
"loss": 2.7515, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.0005200000000000001, |
|
"loss": 2.7199, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00054, |
|
"loss": 2.7464, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.0005600000000000001, |
|
"loss": 2.7036, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00058, |
|
"loss": 2.8226, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.0006, |
|
"loss": 2.6855, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00062, |
|
"loss": 2.7442, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00064, |
|
"loss": 2.7517, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00066, |
|
"loss": 2.6921, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00068, |
|
"loss": 2.7034, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.0007, |
|
"loss": 2.7489, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.0007199999999999999, |
|
"loss": 2.6452, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00074, |
|
"loss": 2.7452, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00076, |
|
"loss": 2.6834, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.0007800000000000001, |
|
"loss": 2.6984, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.0008, |
|
"loss": 2.6716, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00082, |
|
"loss": 2.7164, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00084, |
|
"loss": 2.634, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00086, |
|
"loss": 2.6932, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00088, |
|
"loss": 2.623, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.0009000000000000001, |
|
"loss": 2.6827, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00092, |
|
"loss": 2.6558, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00094, |
|
"loss": 2.7282, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00096, |
|
"loss": 2.7123, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00098, |
|
"loss": 2.5858, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 2.7641, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00102, |
|
"loss": 2.6606, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.0010400000000000001, |
|
"loss": 2.7237, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.0010600000000000002, |
|
"loss": 2.66, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00108, |
|
"loss": 2.6041, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.0011, |
|
"loss": 2.608, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.0011200000000000001, |
|
"loss": 2.63, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00114, |
|
"loss": 2.6255, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00116, |
|
"loss": 2.6602, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00118, |
|
"loss": 2.6522, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.0012, |
|
"loss": 2.7137, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00122, |
|
"loss": 2.6914, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00124, |
|
"loss": 2.6888, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00126, |
|
"loss": 2.6398, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00128, |
|
"loss": 2.6058, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.0013000000000000002, |
|
"loss": 2.7215, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00132, |
|
"loss": 2.5945, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00134, |
|
"loss": 2.6325, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00136, |
|
"loss": 2.6191, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00138, |
|
"loss": 2.5328, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0014, |
|
"loss": 2.562, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00142, |
|
"loss": 2.5454, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0014399999999999999, |
|
"loss": 2.5328, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00146, |
|
"loss": 2.5141, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00148, |
|
"loss": 2.5019, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0015, |
|
"loss": 2.4906, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00152, |
|
"loss": 2.5153, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0015400000000000001, |
|
"loss": 2.4857, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0015600000000000002, |
|
"loss": 2.5265, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00158, |
|
"loss": 2.4994, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0016, |
|
"loss": 2.4538, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0016200000000000001, |
|
"loss": 2.5823, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00164, |
|
"loss": 2.4527, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00166, |
|
"loss": 2.4914, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00168, |
|
"loss": 2.4307, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0017, |
|
"loss": 2.4795, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00172, |
|
"loss": 2.4245, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00174, |
|
"loss": 2.4751, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00176, |
|
"loss": 2.4705, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0017800000000000001, |
|
"loss": 2.4574, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0018000000000000002, |
|
"loss": 2.4271, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00182, |
|
"loss": 2.4609, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00184, |
|
"loss": 2.4255, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00186, |
|
"loss": 2.4151, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00188, |
|
"loss": 2.4331, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0019, |
|
"loss": 2.4576, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00192, |
|
"loss": 2.4156, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0019399999999999999, |
|
"loss": 2.417, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00196, |
|
"loss": 2.444, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00198, |
|
"loss": 2.5019, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.002, |
|
"loss": 2.4522, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00202, |
|
"loss": 2.4106, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00204, |
|
"loss": 2.4295, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00206, |
|
"loss": 2.4683, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0020800000000000003, |
|
"loss": 2.4315, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0021000000000000003, |
|
"loss": 2.41, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0021200000000000004, |
|
"loss": 2.4551, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00214, |
|
"loss": 2.4642, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00216, |
|
"loss": 2.4526, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00218, |
|
"loss": 2.3934, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0022, |
|
"loss": 2.3918, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00222, |
|
"loss": 2.4356, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0022400000000000002, |
|
"loss": 2.4506, |
|
"step": 1120 |
|
} |
|
], |
|
"max_steps": 172326, |
|
"num_train_epochs": 3, |
|
"total_flos": 5.5330613854470144e+17, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|