{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 1000,
  "global_step": 1317,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0007593014426727411,
      "eval_accuracy": 0.14927598308927809,
      "eval_loss": 7.863234519958496,
      "eval_runtime": 368.109,
      "eval_samples_per_second": 192.592,
      "eval_steps_per_second": 0.752,
      "step": 1
    },
    {
      "epoch": 0.007593014426727411,
      "grad_norm": 9.6875,
      "learning_rate": 1e-05,
      "loss": 7.8621,
      "step": 10
    },
    {
      "epoch": 0.015186028853454821,
      "grad_norm": 9.625,
      "learning_rate": 2e-05,
      "loss": 7.8394,
      "step": 20
    },
    {
      "epoch": 0.022779043280182234,
      "grad_norm": 9.4375,
      "learning_rate": 3e-05,
      "loss": 7.7726,
      "step": 30
    },
    {
      "epoch": 0.030372057706909643,
      "grad_norm": 9.0625,
      "learning_rate": 4e-05,
      "loss": 7.6201,
      "step": 40
    },
    {
      "epoch": 0.037965072133637055,
      "grad_norm": 8.375,
      "learning_rate": 5e-05,
      "loss": 7.3344,
      "step": 50
    },
    {
      "epoch": 0.04555808656036447,
      "grad_norm": 7.3125,
      "learning_rate": 6e-05,
      "loss": 7.0089,
      "step": 60
    },
    {
      "epoch": 0.05315110098709187,
      "grad_norm": 5.0,
      "learning_rate": 7e-05,
      "loss": 6.5946,
      "step": 70
    },
    {
      "epoch": 0.060744115413819286,
      "grad_norm": 2.234375,
      "learning_rate": 8e-05,
      "loss": 6.0679,
      "step": 80
    },
    {
      "epoch": 0.0683371298405467,
      "grad_norm": 0.9296875,
      "learning_rate": 9e-05,
      "loss": 5.6963,
      "step": 90
    },
    {
      "epoch": 0.07593014426727411,
      "grad_norm": 0.55859375,
      "learning_rate": 0.0001,
      "loss": 5.4276,
      "step": 100
    },
    {
      "epoch": 0.08352315869400152,
      "grad_norm": 0.43359375,
      "learning_rate": 9.998334155418425e-05,
      "loss": 5.2216,
      "step": 110
    },
    {
      "epoch": 0.09111617312072894,
      "grad_norm": 0.33203125,
      "learning_rate": 9.993337731688963e-05,
      "loss": 5.0464,
      "step": 120
    },
    {
      "epoch": 0.09870918754745633,
      "grad_norm": 0.298828125,
      "learning_rate": 9.985014058117778e-05,
      "loss": 4.9161,
      "step": 130
    },
    {
      "epoch": 0.10630220197418375,
      "grad_norm": 0.275390625,
      "learning_rate": 9.973368681083473e-05,
      "loss": 4.8156,
      "step": 140
    },
    {
      "epoch": 0.11389521640091116,
      "grad_norm": 0.255859375,
      "learning_rate": 9.958409360341343e-05,
      "loss": 4.735,
      "step": 150
    },
    {
      "epoch": 0.12148823082763857,
      "grad_norm": 0.251953125,
      "learning_rate": 9.940146063852748e-05,
      "loss": 4.6663,
      "step": 160
    },
    {
      "epoch": 0.12908124525436598,
      "grad_norm": 0.240234375,
      "learning_rate": 9.918590961143088e-05,
      "loss": 4.6082,
      "step": 170
    },
    {
      "epoch": 0.1366742596810934,
      "grad_norm": 0.259765625,
      "learning_rate": 9.893758415192783e-05,
      "loss": 4.5533,
      "step": 180
    },
    {
      "epoch": 0.1442672741078208,
      "grad_norm": 0.236328125,
      "learning_rate": 9.865664972866682e-05,
      "loss": 4.5055,
      "step": 190
    },
    {
      "epoch": 0.15186028853454822,
      "grad_norm": 0.2314453125,
      "learning_rate": 9.834329353888254e-05,
      "loss": 4.4668,
      "step": 200
    },
    {
      "epoch": 0.15945330296127563,
      "grad_norm": 0.25390625,
      "learning_rate": 9.799772438365936e-05,
      "loss": 4.4333,
      "step": 210
    },
    {
      "epoch": 0.16704631738800305,
      "grad_norm": 0.259765625,
      "learning_rate": 9.762017252879917e-05,
      "loss": 4.4072,
      "step": 220
    },
    {
      "epoch": 0.17463933181473046,
      "grad_norm": 0.2333984375,
      "learning_rate": 9.721088955138664e-05,
      "loss": 4.3759,
      "step": 230
    },
    {
      "epoch": 0.18223234624145787,
      "grad_norm": 0.255859375,
      "learning_rate": 9.67701481721539e-05,
      "loss": 4.3542,
      "step": 240
    },
    {
      "epoch": 0.18982536066818528,
      "grad_norm": 0.220703125,
      "learning_rate": 9.629824207375632e-05,
      "loss": 4.3318,
      "step": 250
    },
    {
      "epoch": 0.19741837509491267,
      "grad_norm": 0.2216796875,
      "learning_rate": 9.57954857050807e-05,
      "loss": 4.3142,
      "step": 260
    },
    {
      "epoch": 0.20501138952164008,
      "grad_norm": 0.23828125,
      "learning_rate": 9.52622140717161e-05,
      "loss": 4.2948,
      "step": 270
    },
    {
      "epoch": 0.2126044039483675,
      "grad_norm": 0.21875,
      "learning_rate": 9.46987825127269e-05,
      "loss": 4.282,
      "step": 280
    },
    {
      "epoch": 0.2201974183750949,
      "grad_norm": 0.2109375,
      "learning_rate": 9.410556646387694e-05,
      "loss": 4.2621,
      "step": 290
    },
    {
      "epoch": 0.22779043280182232,
      "grad_norm": 0.2216796875,
      "learning_rate": 9.348296120746251e-05,
      "loss": 4.2483,
      "step": 300
    },
    {
      "epoch": 0.23538344722854973,
      "grad_norm": 0.205078125,
      "learning_rate": 9.283138160892074e-05,
      "loss": 4.2307,
      "step": 310
    },
    {
      "epoch": 0.24297646165527714,
      "grad_norm": 0.2060546875,
      "learning_rate": 9.215126184038911e-05,
      "loss": 4.2256,
      "step": 320
    },
    {
      "epoch": 0.2505694760820046,
      "grad_norm": 0.232421875,
      "learning_rate": 9.144305509140012e-05,
      "loss": 4.2123,
      "step": 330
    },
    {
      "epoch": 0.25816249050873197,
      "grad_norm": 0.228515625,
      "learning_rate": 9.070723326690396e-05,
      "loss": 4.1984,
      "step": 340
    },
    {
      "epoch": 0.26575550493545935,
      "grad_norm": 0.21484375,
      "learning_rate": 8.994428667282032e-05,
      "loss": 4.1908,
      "step": 350
    },
    {
      "epoch": 0.2733485193621868,
      "grad_norm": 0.2041015625,
      "learning_rate": 8.915472368932916e-05,
      "loss": 4.1878,
      "step": 360
    },
    {
      "epoch": 0.2809415337889142,
      "grad_norm": 0.205078125,
      "learning_rate": 8.83390704321176e-05,
      "loss": 4.1766,
      "step": 370
    },
    {
      "epoch": 0.2885345482156416,
      "grad_norm": 0.2119140625,
      "learning_rate": 8.749787040180923e-05,
      "loss": 4.1719,
      "step": 380
    },
    {
      "epoch": 0.296127562642369,
      "grad_norm": 0.203125,
      "learning_rate": 8.663168412180907e-05,
      "loss": 4.1615,
      "step": 390
    },
    {
      "epoch": 0.30372057706909644,
      "grad_norm": 0.2041015625,
      "learning_rate": 8.574108876480557e-05,
      "loss": 4.1505,
      "step": 400
    },
    {
      "epoch": 0.3113135914958238,
      "grad_norm": 0.22265625,
      "learning_rate": 8.482667776817868e-05,
      "loss": 4.1422,
      "step": 410
    },
    {
      "epoch": 0.31890660592255127,
      "grad_norm": 0.2236328125,
      "learning_rate": 8.388906043857001e-05,
      "loss": 4.1427,
      "step": 420
    },
    {
      "epoch": 0.32649962034927865,
      "grad_norm": 0.2060546875,
      "learning_rate": 8.292886154587884e-05,
      "loss": 4.14,
      "step": 430
    },
    {
      "epoch": 0.3340926347760061,
      "grad_norm": 0.2216796875,
      "learning_rate": 8.194672090695419e-05,
      "loss": 4.127,
      "step": 440
    },
    {
      "epoch": 0.3416856492027335,
      "grad_norm": 0.232421875,
      "learning_rate": 8.094329295926075e-05,
      "loss": 4.1251,
      "step": 450
    },
    {
      "epoch": 0.3492786636294609,
      "grad_norm": 0.2197265625,
      "learning_rate": 7.991924632480238e-05,
      "loss": 4.1234,
      "step": 460
    },
    {
      "epoch": 0.3568716780561883,
      "grad_norm": 0.205078125,
      "learning_rate": 7.887526336459401e-05,
      "loss": 4.1151,
      "step": 470
    },
    {
      "epoch": 0.36446469248291574,
      "grad_norm": 0.201171875,
      "learning_rate": 7.781203972397866e-05,
      "loss": 4.1102,
      "step": 480
    },
    {
      "epoch": 0.3720577069096431,
      "grad_norm": 0.20703125,
      "learning_rate": 7.673028386909259e-05,
      "loss": 4.1107,
      "step": 490
    },
    {
      "epoch": 0.37965072133637057,
      "grad_norm": 0.201171875,
      "learning_rate": 7.563071661478758e-05,
      "loss": 4.1017,
      "step": 500
    },
    {
      "epoch": 0.38724373576309795,
      "grad_norm": 0.2177734375,
      "learning_rate": 7.451407064432471e-05,
      "loss": 4.096,
      "step": 510
    },
    {
      "epoch": 0.39483675018982534,
      "grad_norm": 0.21484375,
      "learning_rate": 7.338109002115976e-05,
      "loss": 4.0982,
      "step": 520
    },
    {
      "epoch": 0.4024297646165528,
      "grad_norm": 0.1962890625,
      "learning_rate": 7.223252969314557e-05,
      "loss": 4.0925,
      "step": 530
    },
    {
      "epoch": 0.41002277904328016,
      "grad_norm": 0.201171875,
      "learning_rate": 7.106915498948175e-05,
      "loss": 4.0917,
      "step": 540
    },
    {
      "epoch": 0.4176157934700076,
      "grad_norm": 0.208984375,
      "learning_rate": 6.989174111074688e-05,
      "loss": 4.0865,
      "step": 550
    },
    {
      "epoch": 0.425208807896735,
      "grad_norm": 0.2060546875,
      "learning_rate": 6.870107261235304e-05,
      "loss": 4.0831,
      "step": 560
    },
    {
      "epoch": 0.4328018223234624,
      "grad_norm": 0.1845703125,
      "learning_rate": 6.749794288176681e-05,
      "loss": 4.0833,
      "step": 570
    },
    {
      "epoch": 0.4403948367501898,
      "grad_norm": 0.1962890625,
      "learning_rate": 6.628315360984526e-05,
      "loss": 4.0813,
      "step": 580
    },
    {
      "epoch": 0.44798785117691725,
      "grad_norm": 0.2021484375,
      "learning_rate": 6.505751425663892e-05,
      "loss": 4.0757,
      "step": 590
    },
    {
      "epoch": 0.45558086560364464,
      "grad_norm": 0.19140625,
      "learning_rate": 6.382184151201804e-05,
      "loss": 4.0782,
      "step": 600
    },
    {
      "epoch": 0.4631738800303721,
      "grad_norm": 0.189453125,
      "learning_rate": 6.257695875148106e-05,
      "loss": 4.0763,
      "step": 610
    },
    {
      "epoch": 0.47076689445709946,
      "grad_norm": 0.1904296875,
      "learning_rate": 6.132369548750851e-05,
      "loss": 4.0728,
      "step": 620
    },
    {
      "epoch": 0.4783599088838269,
      "grad_norm": 0.1904296875,
      "learning_rate": 6.0062886816827494e-05,
      "loss": 4.0696,
      "step": 630
    },
    {
      "epoch": 0.4859529233105543,
      "grad_norm": 0.19921875,
      "learning_rate": 5.8795372863954924e-05,
      "loss": 4.0634,
      "step": 640
    },
    {
      "epoch": 0.4935459377372817,
      "grad_norm": 0.189453125,
      "learning_rate": 5.7521998221391004e-05,
      "loss": 4.0658,
      "step": 650
    },
    {
      "epoch": 0.5011389521640092,
      "grad_norm": 0.19921875,
      "learning_rate": 5.6243611386835205e-05,
      "loss": 4.0631,
      "step": 660
    },
    {
      "epoch": 0.5087319665907365,
      "grad_norm": 0.185546875,
      "learning_rate": 5.49610641978001e-05,
      "loss": 4.0639,
      "step": 670
    },
    {
      "epoch": 0.5163249810174639,
      "grad_norm": 0.1962890625,
      "learning_rate": 5.36752112639999e-05,
      "loss": 4.0631,
      "step": 680
    },
    {
      "epoch": 0.5239179954441914,
      "grad_norm": 0.205078125,
      "learning_rate": 5.2386909397891595e-05,
      "loss": 4.0592,
      "step": 690
    },
    {
      "epoch": 0.5315110098709187,
      "grad_norm": 0.1943359375,
      "learning_rate": 5.109701704374841e-05,
      "loss": 4.0625,
      "step": 700
    },
    {
      "epoch": 0.5391040242976461,
      "grad_norm": 0.1884765625,
      "learning_rate": 4.980639370564593e-05,
      "loss": 4.0632,
      "step": 710
    },
    {
      "epoch": 0.5466970387243736,
      "grad_norm": 0.185546875,
      "learning_rate": 4.851589937474202e-05,
      "loss": 4.0557,
      "step": 720
    },
    {
      "epoch": 0.554290053151101,
      "grad_norm": 0.19140625,
      "learning_rate": 4.722639395623215e-05,
      "loss": 4.0654,
      "step": 730
    },
    {
      "epoch": 0.5618830675778284,
      "grad_norm": 0.1875,
      "learning_rate": 4.593873669636204e-05,
      "loss": 4.0542,
      "step": 740
    },
    {
      "epoch": 0.5694760820045558,
      "grad_norm": 0.185546875,
      "learning_rate": 4.465378560987944e-05,
      "loss": 4.0584,
      "step": 750
    },
    {
      "epoch": 0.5770690964312832,
      "grad_norm": 0.181640625,
      "learning_rate": 4.337239690830632e-05,
      "loss": 4.058,
      "step": 760
    },
    {
      "epoch": 0.5846621108580107,
      "grad_norm": 0.181640625,
      "learning_rate": 4.209542442941284e-05,
      "loss": 4.0574,
      "step": 770
    },
    {
      "epoch": 0.592255125284738,
      "grad_norm": 0.20703125,
      "learning_rate": 4.082371906827295e-05,
      "loss": 4.0569,
      "step": 780
    },
    {
      "epoch": 0.5998481397114654,
      "grad_norm": 0.18359375,
      "learning_rate": 3.9558128210280706e-05,
      "loss": 4.0512,
      "step": 790
    },
    {
      "epoch": 0.6074411541381929,
      "grad_norm": 0.1875,
      "learning_rate": 3.829949516650543e-05,
      "loss": 4.054,
      "step": 800
    },
    {
      "epoch": 0.6150341685649203,
      "grad_norm": 0.1904296875,
      "learning_rate": 3.704865861176159e-05,
      "loss": 4.0514,
      "step": 810
    },
    {
      "epoch": 0.6226271829916477,
      "grad_norm": 0.1865234375,
      "learning_rate": 3.5806452025768056e-05,
      "loss": 4.0564,
      "step": 820
    },
    {
      "epoch": 0.6302201974183751,
      "grad_norm": 0.1767578125,
      "learning_rate": 3.457370313776897e-05,
      "loss": 4.0529,
      "step": 830
    },
    {
      "epoch": 0.6378132118451025,
      "grad_norm": 0.1796875,
      "learning_rate": 3.335123337498662e-05,
      "loss": 4.055,
      "step": 840
    },
    {
      "epoch": 0.6454062262718299,
      "grad_norm": 0.1943359375,
      "learning_rate": 3.213985731527313e-05,
      "loss": 4.053,
      "step": 850
    },
    {
      "epoch": 0.6529992406985573,
      "grad_norm": 0.189453125,
      "learning_rate": 3.0940382144326666e-05,
      "loss": 4.0494,
      "step": 860
    },
    {
      "epoch": 0.6605922551252847,
      "grad_norm": 0.197265625,
      "learning_rate": 2.9753607117832903e-05,
      "loss": 4.0488,
      "step": 870
    },
    {
      "epoch": 0.6681852695520122,
      "grad_norm": 0.19140625,
      "learning_rate": 2.858032302889084e-05,
      "loss": 4.0548,
      "step": 880
    },
    {
      "epoch": 0.6757782839787395,
      "grad_norm": 0.189453125,
      "learning_rate": 2.7421311681077344e-05,
      "loss": 4.0507,
      "step": 890
    },
    {
      "epoch": 0.683371298405467,
      "grad_norm": 0.189453125,
      "learning_rate": 2.627734536750191e-05,
      "loss": 4.0524,
      "step": 900
    },
    {
      "epoch": 0.6909643128321944,
      "grad_norm": 0.1884765625,
      "learning_rate": 2.5149186356198562e-05,
      "loss": 4.0494,
      "step": 910
    },
    {
      "epoch": 0.6985573272589218,
      "grad_norm": 0.189453125,
      "learning_rate": 2.4037586382197685e-05,
      "loss": 4.0516,
      "step": 920
    },
    {
      "epoch": 0.7061503416856492,
      "grad_norm": 0.197265625,
      "learning_rate": 2.2943286146616788e-05,
      "loss": 4.0507,
      "step": 930
    },
    {
      "epoch": 0.7137433561123766,
      "grad_norm": 0.1943359375,
      "learning_rate": 2.1867014823103044e-05,
      "loss": 4.0507,
      "step": 940
    },
    {
      "epoch": 0.721336370539104,
      "grad_norm": 0.189453125,
      "learning_rate": 2.080948957195749e-05,
      "loss": 4.0506,
      "step": 950
    },
    {
      "epoch": 0.7289293849658315,
      "grad_norm": 0.1796875,
      "learning_rate": 1.9771415062263965e-05,
      "loss": 4.0502,
      "step": 960
    },
    {
      "epoch": 0.7365223993925588,
      "grad_norm": 0.189453125,
      "learning_rate": 1.875348300234131e-05,
      "loss": 4.0503,
      "step": 970
    },
    {
      "epoch": 0.7441154138192863,
      "grad_norm": 0.185546875,
      "learning_rate": 1.7756371678832145e-05,
      "loss": 4.049,
      "step": 980
    },
    {
      "epoch": 0.7517084282460137,
      "grad_norm": 0.18359375,
      "learning_rate": 1.6780745504734617e-05,
      "loss": 4.0465,
      "step": 990
    },
    {
      "epoch": 0.7593014426727411,
      "grad_norm": 0.189453125,
      "learning_rate": 1.58272545766791e-05,
      "loss": 4.0503,
      "step": 1000
    },
    {
      "epoch": 0.7593014426727411,
      "eval_accuracy": 0.30700041923136506,
      "eval_loss": 4.048835277557373,
      "eval_runtime": 369.3769,
      "eval_samples_per_second": 191.931,
      "eval_steps_per_second": 0.75,
      "step": 1000
    },
    {
      "epoch": 0.7668944570994685,
      "grad_norm": 0.1865234375,
      "learning_rate": 1.4896534241743988e-05,
      "loss": 4.0501,
      "step": 1010
    },
    {
      "epoch": 0.7744874715261959,
      "grad_norm": 0.1982421875,
      "learning_rate": 1.398920467410002e-05,
      "loss": 4.0469,
      "step": 1020
    },
    {
      "epoch": 0.7820804859529233,
      "grad_norm": 0.1884765625,
      "learning_rate": 1.3105870461764847e-05,
      "loss": 4.0512,
      "step": 1030
    },
    {
      "epoch": 0.7896735003796507,
      "grad_norm": 0.185546875,
      "learning_rate": 1.2247120203742945e-05,
      "loss": 4.0463,
      "step": 1040
    },
    {
      "epoch": 0.7972665148063781,
      "grad_norm": 0.19140625,
      "learning_rate": 1.141352611782005e-05,
      "loss": 4.0457,
      "step": 1050
    },
    {
      "epoch": 0.8048595292331056,
      "grad_norm": 0.189453125,
      "learning_rate": 1.0605643659272668e-05,
      "loss": 4.0511,
      "step": 1060
    },
    {
      "epoch": 0.812452543659833,
      "grad_norm": 0.1875,
      "learning_rate": 9.824011150747226e-06,
      "loss": 4.0533,
      "step": 1070
    },
    {
      "epoch": 0.8200455580865603,
      "grad_norm": 0.1884765625,
      "learning_rate": 9.069149423555378e-06,
      "loss": 4.0541,
      "step": 1080
    },
    {
      "epoch": 0.8276385725132878,
      "grad_norm": 0.181640625,
      "learning_rate": 8.341561470624365e-06,
      "loss": 4.0464,
      "step": 1090
    },
    {
      "epoch": 0.8352315869400152,
      "grad_norm": 0.181640625,
      "learning_rate": 7.641732111333793e-06,
      "loss": 4.0518,
      "step": 1100
    },
    {
      "epoch": 0.8428246013667426,
      "grad_norm": 0.1875,
      "learning_rate": 6.970127668462129e-06,
      "loss": 4.0492,
      "step": 1110
    },
    {
      "epoch": 0.85041761579347,
      "grad_norm": 0.1806640625,
      "learning_rate": 6.3271956574582296e-06,
      "loss": 4.0489,
      "step": 1120
    },
    {
      "epoch": 0.8580106302201974,
      "grad_norm": 0.1884765625,
      "learning_rate": 5.7133644882448376e-06,
      "loss": 4.0462,
      "step": 1130
    },
    {
      "epoch": 0.8656036446469249,
      "grad_norm": 0.1806640625,
      "learning_rate": 5.12904317975283e-06,
      "loss": 4.0466,
      "step": 1140
    },
    {
      "epoch": 0.8731966590736523,
      "grad_norm": 0.181640625,
      "learning_rate": 4.574621087376501e-06,
      "loss": 4.0496,
      "step": 1150
    },
    {
      "epoch": 0.8807896735003796,
      "grad_norm": 0.18359375,
      "learning_rate": 4.050467643531214e-06,
      "loss": 4.0478,
      "step": 1160
    },
    {
      "epoch": 0.8883826879271071,
      "grad_norm": 0.18359375,
      "learning_rate": 3.556932111486727e-06,
      "loss": 4.0524,
      "step": 1170
    },
    {
      "epoch": 0.8959757023538345,
      "grad_norm": 0.1806640625,
      "learning_rate": 3.0943433526397838e-06,
      "loss": 4.0526,
      "step": 1180
    },
    {
      "epoch": 0.9035687167805618,
      "grad_norm": 0.18359375,
      "learning_rate": 2.663009607381345e-06,
      "loss": 4.0484,
      "step": 1190
    },
    {
      "epoch": 0.9111617312072893,
      "grad_norm": 0.177734375,
      "learning_rate": 2.2632182897043773e-06,
      "loss": 4.0489,
      "step": 1200
    },
    {
      "epoch": 0.9187547456340167,
      "grad_norm": 0.1767578125,
      "learning_rate": 1.8952357956889988e-06,
      "loss": 4.0477,
      "step": 1210
    },
    {
      "epoch": 0.9263477600607442,
      "grad_norm": 0.1748046875,
      "learning_rate": 1.559307325992726e-06,
      "loss": 4.0498,
      "step": 1220
    },
    {
      "epoch": 0.9339407744874715,
      "grad_norm": 0.181640625,
      "learning_rate": 1.2556567224639714e-06,
      "loss": 4.0451,
      "step": 1230
    },
    {
      "epoch": 0.9415337889141989,
      "grad_norm": 0.181640625,
      "learning_rate": 9.844863189877596e-07,
      "loss": 4.0513,
      "step": 1240
    },
    {
      "epoch": 0.9491268033409264,
      "grad_norm": 0.177734375,
      "learning_rate": 7.459768066630291e-07,
      "loss": 4.0487,
      "step": 1250
    },
    {
      "epoch": 0.9567198177676538,
      "grad_norm": 0.177734375,
      "learning_rate": 5.402871134012666e-07,
      "loss": 4.0509,
      "step": 1260
    },
    {
      "epoch": 0.9643128321943811,
      "grad_norm": 0.2001953125,
      "learning_rate": 3.6755429802690087e-07,
      "loss": 4.048,
      "step": 1270
    },
    {
      "epoch": 0.9719058466211086,
      "grad_norm": 0.1787109375,
      "learning_rate": 2.2789345894972945e-07,
      "loss": 4.0524,
      "step": 1280
    },
    {
      "epoch": 0.979498861047836,
      "grad_norm": 0.1806640625,
      "learning_rate": 1.213976574705733e-07,
      "loss": 4.0501,
      "step": 1290
    },
    {
      "epoch": 0.9870918754745635,
      "grad_norm": 0.1875,
      "learning_rate": 4.813785577097996e-08,
      "loss": 4.0455,
      "step": 1300
    },
    {
      "epoch": 0.9946848899012908,
      "grad_norm": 0.1845703125,
      "learning_rate": 8.162869628425274e-09,
      "loss": 4.0489,
      "step": 1310
    },
    {
      "epoch": 1.0,
      "step": 1317,
      "total_flos": 1.0857282238836376e+19,
      "train_loss": 4.36365500488513,
      "train_runtime": 5657.2998,
      "train_samples_per_second": 238.343,
      "train_steps_per_second": 0.233
    }
  ],
  "logging_steps": 10,
  "max_steps": 1317,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 10000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.0857282238836376e+19,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}