|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.87109375, |
|
"eval_steps": 500, |
|
"global_step": 1470, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01953125, |
|
"grad_norm": 7.15625, |
|
"learning_rate": 1e-05, |
|
"loss": 12.1665, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0390625, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 2e-05, |
|
"loss": 11.9295, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05859375, |
|
"grad_norm": 7.28125, |
|
"learning_rate": 3e-05, |
|
"loss": 11.3886, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.078125, |
|
"grad_norm": 6.90625, |
|
"learning_rate": 4e-05, |
|
"loss": 10.1535, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.09765625, |
|
"grad_norm": 6.375, |
|
"learning_rate": 5e-05, |
|
"loss": 8.626, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1171875, |
|
"grad_norm": 6.75, |
|
"learning_rate": 6e-05, |
|
"loss": 7.467, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.13671875, |
|
"grad_norm": 9.375, |
|
"learning_rate": 7e-05, |
|
"loss": 6.5431, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"grad_norm": 12.8125, |
|
"learning_rate": 8e-05, |
|
"loss": 5.5603, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.17578125, |
|
"grad_norm": 14.1875, |
|
"learning_rate": 9e-05, |
|
"loss": 4.1738, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1953125, |
|
"grad_norm": 14.8125, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5115, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.21484375, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 9.930362116991644e-05, |
|
"loss": 0.7972, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.234375, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 9.860724233983287e-05, |
|
"loss": 0.3554, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.25390625, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 9.79108635097493e-05, |
|
"loss": 0.2982, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.2734375, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 9.721448467966574e-05, |
|
"loss": 0.3548, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.29296875, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 9.651810584958218e-05, |
|
"loss": 0.2728, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 9.58217270194986e-05, |
|
"loss": 0.275, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.33203125, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 9.512534818941504e-05, |
|
"loss": 0.2812, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.3515625, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 9.442896935933148e-05, |
|
"loss": 0.2561, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.37109375, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 9.373259052924791e-05, |
|
"loss": 0.2348, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.390625, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 9.303621169916435e-05, |
|
"loss": 0.2322, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.41015625, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 9.233983286908079e-05, |
|
"loss": 0.2403, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.4296875, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 9.164345403899723e-05, |
|
"loss": 0.2299, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.44921875, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 9.094707520891366e-05, |
|
"loss": 0.2166, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 9.025069637883009e-05, |
|
"loss": 0.2118, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.48828125, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 8.955431754874652e-05, |
|
"loss": 0.2317, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5078125, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 8.885793871866296e-05, |
|
"loss": 0.2111, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.52734375, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 8.81615598885794e-05, |
|
"loss": 0.2349, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.546875, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 8.746518105849582e-05, |
|
"loss": 0.2059, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.56640625, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 8.676880222841226e-05, |
|
"loss": 0.1923, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5859375, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 8.60724233983287e-05, |
|
"loss": 0.1994, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.60546875, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 8.537604456824512e-05, |
|
"loss": 0.2052, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 30.125, |
|
"learning_rate": 8.467966573816156e-05, |
|
"loss": 0.1987, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.64453125, |
|
"grad_norm": 1.125, |
|
"learning_rate": 8.3983286908078e-05, |
|
"loss": 0.2071, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6640625, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 8.328690807799443e-05, |
|
"loss": 0.1882, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.68359375, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 8.259052924791086e-05, |
|
"loss": 0.1953, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.703125, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 8.18941504178273e-05, |
|
"loss": 0.1923, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.72265625, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 8.119777158774373e-05, |
|
"loss": 0.1866, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.7421875, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 8.050139275766017e-05, |
|
"loss": 0.1754, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.76171875, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 7.980501392757661e-05, |
|
"loss": 0.1762, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 7.910863509749304e-05, |
|
"loss": 0.1818, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.80078125, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 7.841225626740948e-05, |
|
"loss": 0.1762, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.8203125, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 7.771587743732592e-05, |
|
"loss": 0.1752, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.83984375, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 7.701949860724234e-05, |
|
"loss": 0.163, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.859375, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 7.632311977715878e-05, |
|
"loss": 0.1531, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.87890625, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 7.562674094707522e-05, |
|
"loss": 0.1657, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.8984375, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 7.493036211699165e-05, |
|
"loss": 0.1326, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.91796875, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 7.423398328690808e-05, |
|
"loss": 0.1369, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 7.353760445682452e-05, |
|
"loss": 0.1431, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.95703125, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 7.284122562674095e-05, |
|
"loss": 0.1341, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.9765625, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 7.214484679665738e-05, |
|
"loss": 0.1319, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.99609375, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 7.144846796657381e-05, |
|
"loss": 0.1243, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.015625, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 7.075208913649025e-05, |
|
"loss": 0.1269, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.03515625, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 7.005571030640669e-05, |
|
"loss": 0.1208, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.0546875, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 6.935933147632311e-05, |
|
"loss": 0.114, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.07421875, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 6.866295264623955e-05, |
|
"loss": 0.1116, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.09375, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 6.796657381615599e-05, |
|
"loss": 0.1146, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.11328125, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 6.727019498607243e-05, |
|
"loss": 0.1156, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.1328125, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 6.657381615598886e-05, |
|
"loss": 0.1068, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.15234375, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 6.58774373259053e-05, |
|
"loss": 0.1211, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.171875, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 6.518105849582174e-05, |
|
"loss": 0.1099, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.19140625, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 6.448467966573817e-05, |
|
"loss": 0.1008, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.2109375, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 6.37883008356546e-05, |
|
"loss": 0.1115, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.23046875, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 6.309192200557104e-05, |
|
"loss": 0.1067, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 6.239554317548747e-05, |
|
"loss": 0.1081, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.26953125, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 6.169916434540391e-05, |
|
"loss": 0.0991, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.2890625, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 6.100278551532034e-05, |
|
"loss": 0.1027, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.30859375, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 6.030640668523677e-05, |
|
"loss": 0.102, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.328125, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 5.96100278551532e-05, |
|
"loss": 0.1012, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.34765625, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 5.891364902506964e-05, |
|
"loss": 0.0978, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.3671875, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 5.821727019498607e-05, |
|
"loss": 0.1196, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.38671875, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 5.752089136490251e-05, |
|
"loss": 0.1047, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.40625, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 5.682451253481894e-05, |
|
"loss": 0.1071, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.42578125, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 5.6128133704735375e-05, |
|
"loss": 0.105, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.4453125, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 5.5431754874651806e-05, |
|
"loss": 0.1121, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.46484375, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 5.473537604456824e-05, |
|
"loss": 0.1016, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.484375, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 5.4038997214484674e-05, |
|
"loss": 0.1079, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.50390625, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 5.3342618384401125e-05, |
|
"loss": 0.1079, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.5234375, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 5.2646239554317555e-05, |
|
"loss": 0.1076, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.54296875, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 5.194986072423399e-05, |
|
"loss": 0.1192, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 5.125348189415042e-05, |
|
"loss": 0.1, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.58203125, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 5.055710306406686e-05, |
|
"loss": 0.1075, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.6015625, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 4.986072423398329e-05, |
|
"loss": 0.1036, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.62109375, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 4.916434540389973e-05, |
|
"loss": 0.1016, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.640625, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 4.846796657381616e-05, |
|
"loss": 0.1041, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.66015625, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 4.7771587743732597e-05, |
|
"loss": 0.108, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.6796875, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 4.707520891364903e-05, |
|
"loss": 0.1063, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.69921875, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 4.637883008356546e-05, |
|
"loss": 0.1048, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.71875, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 4.5682451253481895e-05, |
|
"loss": 0.0985, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.73828125, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 4.4986072423398326e-05, |
|
"loss": 0.111, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.7578125, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 4.428969359331476e-05, |
|
"loss": 0.1007, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.77734375, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 4.35933147632312e-05, |
|
"loss": 0.0981, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.796875, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 4.289693593314764e-05, |
|
"loss": 0.1016, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.81640625, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 4.220055710306407e-05, |
|
"loss": 0.1045, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.8359375, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 4.1504178272980506e-05, |
|
"loss": 0.1019, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.85546875, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 4.0807799442896936e-05, |
|
"loss": 0.1061, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 4.0111420612813374e-05, |
|
"loss": 0.1098, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.89453125, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 3.9415041782729804e-05, |
|
"loss": 0.0954, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.9140625, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 3.871866295264624e-05, |
|
"loss": 0.1012, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.93359375, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 3.802228412256267e-05, |
|
"loss": 0.099, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.953125, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 3.7325905292479116e-05, |
|
"loss": 0.1046, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.97265625, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 3.662952646239555e-05, |
|
"loss": 0.097, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.9921875, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 3.5933147632311984e-05, |
|
"loss": 0.0964, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.01171875, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 3.5236768802228415e-05, |
|
"loss": 0.0986, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.03125, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 3.454038997214485e-05, |
|
"loss": 0.1056, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.05078125, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 3.384401114206128e-05, |
|
"loss": 0.1071, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.0703125, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 3.314763231197771e-05, |
|
"loss": 0.1053, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.08984375, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 3.245125348189415e-05, |
|
"loss": 0.1018, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.109375, |
|
"grad_norm": 5.125, |
|
"learning_rate": 3.175487465181058e-05, |
|
"loss": 0.093, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.12890625, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 3.105849582172702e-05, |
|
"loss": 0.102, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.1484375, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 3.036211699164346e-05, |
|
"loss": 0.1046, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.16796875, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 2.9665738161559893e-05, |
|
"loss": 0.1016, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 2.8969359331476327e-05, |
|
"loss": 0.0973, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.20703125, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 2.827298050139276e-05, |
|
"loss": 0.0925, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.2265625, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 2.7576601671309192e-05, |
|
"loss": 0.1027, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.24609375, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 2.6880222841225626e-05, |
|
"loss": 0.0926, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.265625, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 2.618384401114206e-05, |
|
"loss": 0.0987, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.28515625, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 2.5487465181058494e-05, |
|
"loss": 0.0975, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.3046875, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 2.479108635097493e-05, |
|
"loss": 0.1016, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.32421875, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 2.4094707520891365e-05, |
|
"loss": 0.1005, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.34375, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 2.33983286908078e-05, |
|
"loss": 0.0971, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.36328125, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 2.2701949860724233e-05, |
|
"loss": 0.096, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.3828125, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 2.200557103064067e-05, |
|
"loss": 0.0988, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.40234375, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 2.1309192200557104e-05, |
|
"loss": 0.1, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.421875, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 2.0612813370473538e-05, |
|
"loss": 0.0996, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.44140625, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 1.9916434540389972e-05, |
|
"loss": 0.1018, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.4609375, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 1.922005571030641e-05, |
|
"loss": 0.0939, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.48046875, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 1.8523676880222844e-05, |
|
"loss": 0.0981, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 1.7827298050139278e-05, |
|
"loss": 0.1044, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.51953125, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 1.713091922005571e-05, |
|
"loss": 0.1002, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.5390625, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 1.6434540389972145e-05, |
|
"loss": 0.0922, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.55859375, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 1.5738161559888583e-05, |
|
"loss": 0.0959, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.578125, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 1.5041782729805015e-05, |
|
"loss": 0.0932, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.59765625, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 1.4345403899721449e-05, |
|
"loss": 0.0982, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.6171875, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 1.3649025069637883e-05, |
|
"loss": 0.0994, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.63671875, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 1.2952646239554317e-05, |
|
"loss": 0.0995, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.65625, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 1.2256267409470753e-05, |
|
"loss": 0.0947, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.67578125, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 1.1559888579387188e-05, |
|
"loss": 0.0923, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.6953125, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 1.086350974930362e-05, |
|
"loss": 0.1004, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.71484375, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 1.0167130919220056e-05, |
|
"loss": 0.095, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.734375, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 9.47075208913649e-06, |
|
"loss": 0.098, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.75390625, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 8.774373259052924e-06, |
|
"loss": 0.1082, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.7734375, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 8.07799442896936e-06, |
|
"loss": 0.0969, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.79296875, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 7.381615598885794e-06, |
|
"loss": 0.0922, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 6.6852367688022295e-06, |
|
"loss": 0.0951, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.83203125, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 5.988857938718663e-06, |
|
"loss": 0.0947, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.8515625, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 5.2924791086350974e-06, |
|
"loss": 0.1013, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.87109375, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 4.596100278551532e-06, |
|
"loss": 0.0897, |
|
"step": 1470 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1536, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 10, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5658272704198656.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|