{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.87109375, "eval_steps": 500, "global_step": 1470, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01953125, "grad_norm": 7.15625, "learning_rate": 1e-05, "loss": 12.1665, "step": 10 }, { "epoch": 0.0390625, "grad_norm": 5.6875, "learning_rate": 2e-05, "loss": 11.9295, "step": 20 }, { "epoch": 0.05859375, "grad_norm": 7.28125, "learning_rate": 3e-05, "loss": 11.3886, "step": 30 }, { "epoch": 0.078125, "grad_norm": 6.90625, "learning_rate": 4e-05, "loss": 10.1535, "step": 40 }, { "epoch": 0.09765625, "grad_norm": 6.375, "learning_rate": 5e-05, "loss": 8.626, "step": 50 }, { "epoch": 0.1171875, "grad_norm": 6.75, "learning_rate": 6e-05, "loss": 7.467, "step": 60 }, { "epoch": 0.13671875, "grad_norm": 9.375, "learning_rate": 7e-05, "loss": 6.5431, "step": 70 }, { "epoch": 0.15625, "grad_norm": 12.8125, "learning_rate": 8e-05, "loss": 5.5603, "step": 80 }, { "epoch": 0.17578125, "grad_norm": 14.1875, "learning_rate": 9e-05, "loss": 4.1738, "step": 90 }, { "epoch": 0.1953125, "grad_norm": 14.8125, "learning_rate": 0.0001, "loss": 2.5115, "step": 100 }, { "epoch": 0.21484375, "grad_norm": 4.34375, "learning_rate": 9.930362116991644e-05, "loss": 0.7972, "step": 110 }, { "epoch": 0.234375, "grad_norm": 2.28125, "learning_rate": 9.860724233983287e-05, "loss": 0.3554, "step": 120 }, { "epoch": 0.25390625, "grad_norm": 0.9375, "learning_rate": 9.79108635097493e-05, "loss": 0.2982, "step": 130 }, { "epoch": 0.2734375, "grad_norm": 3.515625, "learning_rate": 9.721448467966574e-05, "loss": 0.3548, "step": 140 }, { "epoch": 0.29296875, "grad_norm": 1.40625, "learning_rate": 9.651810584958218e-05, "loss": 0.2728, "step": 150 }, { "epoch": 0.3125, "grad_norm": 1.046875, "learning_rate": 9.58217270194986e-05, "loss": 0.275, "step": 160 }, { "epoch": 0.33203125, "grad_norm": 2.6875, "learning_rate": 9.512534818941504e-05, "loss": 0.2812, "step": 170 }, { "epoch": 0.3515625, "grad_norm": 1.171875, "learning_rate": 9.442896935933148e-05, "loss": 0.2561, "step": 180 }, { "epoch": 0.37109375, "grad_norm": 1.4453125, "learning_rate": 9.373259052924791e-05, "loss": 0.2348, "step": 190 }, { "epoch": 0.390625, "grad_norm": 2.03125, "learning_rate": 9.303621169916435e-05, "loss": 0.2322, "step": 200 }, { "epoch": 0.41015625, "grad_norm": 1.2890625, "learning_rate": 9.233983286908079e-05, "loss": 0.2403, "step": 210 }, { "epoch": 0.4296875, "grad_norm": 0.78125, "learning_rate": 9.164345403899723e-05, "loss": 0.2299, "step": 220 }, { "epoch": 0.44921875, "grad_norm": 1.0625, "learning_rate": 9.094707520891366e-05, "loss": 0.2166, "step": 230 }, { "epoch": 0.46875, "grad_norm": 0.79296875, "learning_rate": 9.025069637883009e-05, "loss": 0.2118, "step": 240 }, { "epoch": 0.48828125, "grad_norm": 0.94921875, "learning_rate": 8.955431754874652e-05, "loss": 0.2317, "step": 250 }, { "epoch": 0.5078125, "grad_norm": 1.2578125, "learning_rate": 8.885793871866296e-05, "loss": 0.2111, "step": 260 }, { "epoch": 0.52734375, "grad_norm": 0.984375, "learning_rate": 8.81615598885794e-05, "loss": 0.2349, "step": 270 }, { "epoch": 0.546875, "grad_norm": 0.5859375, "learning_rate": 8.746518105849582e-05, "loss": 0.2059, "step": 280 }, { "epoch": 0.56640625, "grad_norm": 0.79296875, "learning_rate": 8.676880222841226e-05, "loss": 0.1923, "step": 290 }, { "epoch": 0.5859375, "grad_norm": 0.90625, "learning_rate": 8.60724233983287e-05, "loss": 0.1994, "step": 300 }, { "epoch": 0.60546875, "grad_norm": 0.77734375, "learning_rate": 8.537604456824512e-05, "loss": 0.2052, "step": 310 }, { "epoch": 0.625, "grad_norm": 30.125, "learning_rate": 8.467966573816156e-05, "loss": 0.1987, "step": 320 }, { "epoch": 0.64453125, "grad_norm": 1.125, "learning_rate": 8.3983286908078e-05, "loss": 0.2071, "step": 330 }, { "epoch": 0.6640625, "grad_norm": 0.58203125, "learning_rate": 8.328690807799443e-05, "loss": 0.1882, "step": 340 }, { "epoch": 0.68359375, "grad_norm": 1.046875, "learning_rate": 8.259052924791086e-05, "loss": 0.1953, "step": 350 }, { "epoch": 0.703125, "grad_norm": 0.7265625, "learning_rate": 8.18941504178273e-05, "loss": 0.1923, "step": 360 }, { "epoch": 0.72265625, "grad_norm": 0.66796875, "learning_rate": 8.119777158774373e-05, "loss": 0.1866, "step": 370 }, { "epoch": 0.7421875, "grad_norm": 0.5859375, "learning_rate": 8.050139275766017e-05, "loss": 0.1754, "step": 380 }, { "epoch": 0.76171875, "grad_norm": 0.83203125, "learning_rate": 7.980501392757661e-05, "loss": 0.1762, "step": 390 }, { "epoch": 0.78125, "grad_norm": 0.64453125, "learning_rate": 7.910863509749304e-05, "loss": 0.1818, "step": 400 }, { "epoch": 0.80078125, "grad_norm": 0.62109375, "learning_rate": 7.841225626740948e-05, "loss": 0.1762, "step": 410 }, { "epoch": 0.8203125, "grad_norm": 0.64453125, "learning_rate": 7.771587743732592e-05, "loss": 0.1752, "step": 420 }, { "epoch": 0.83984375, "grad_norm": 0.55859375, "learning_rate": 7.701949860724234e-05, "loss": 0.163, "step": 430 }, { "epoch": 0.859375, "grad_norm": 0.58984375, "learning_rate": 7.632311977715878e-05, "loss": 0.1531, "step": 440 }, { "epoch": 0.87890625, "grad_norm": 0.9296875, "learning_rate": 7.562674094707522e-05, "loss": 0.1657, "step": 450 }, { "epoch": 0.8984375, "grad_norm": 0.6953125, "learning_rate": 7.493036211699165e-05, "loss": 0.1326, "step": 460 }, { "epoch": 0.91796875, "grad_norm": 0.69921875, "learning_rate": 7.423398328690808e-05, "loss": 0.1369, "step": 470 }, { "epoch": 0.9375, "grad_norm": 0.82421875, "learning_rate": 7.353760445682452e-05, "loss": 0.1431, "step": 480 }, { "epoch": 0.95703125, "grad_norm": 0.59375, "learning_rate": 7.284122562674095e-05, "loss": 0.1341, "step": 490 }, { "epoch": 0.9765625, "grad_norm": 0.8515625, "learning_rate": 7.214484679665738e-05, "loss": 0.1319, "step": 500 }, { "epoch": 0.99609375, "grad_norm": 0.94140625, "learning_rate": 7.144846796657381e-05, "loss": 0.1243, "step": 510 }, { "epoch": 1.015625, "grad_norm": 0.84765625, "learning_rate": 7.075208913649025e-05, "loss": 0.1269, "step": 520 }, { "epoch": 1.03515625, "grad_norm": 0.8203125, "learning_rate": 7.005571030640669e-05, "loss": 0.1208, "step": 530 }, { "epoch": 1.0546875, "grad_norm": 0.78515625, "learning_rate": 6.935933147632311e-05, "loss": 0.114, "step": 540 }, { "epoch": 1.07421875, "grad_norm": 0.83203125, "learning_rate": 6.866295264623955e-05, "loss": 0.1116, "step": 550 }, { "epoch": 1.09375, "grad_norm": 0.65625, "learning_rate": 6.796657381615599e-05, "loss": 0.1146, "step": 560 }, { "epoch": 1.11328125, "grad_norm": 0.953125, "learning_rate": 6.727019498607243e-05, "loss": 0.1156, "step": 570 }, { "epoch": 1.1328125, "grad_norm": 0.9765625, "learning_rate": 6.657381615598886e-05, "loss": 0.1068, "step": 580 }, { "epoch": 1.15234375, "grad_norm": 0.6953125, "learning_rate": 6.58774373259053e-05, "loss": 0.1211, "step": 590 }, { "epoch": 1.171875, "grad_norm": 1.6328125, "learning_rate": 6.518105849582174e-05, "loss": 0.1099, "step": 600 }, { "epoch": 1.19140625, "grad_norm": 1.1484375, "learning_rate": 6.448467966573817e-05, "loss": 0.1008, "step": 610 }, { "epoch": 1.2109375, "grad_norm": 0.58203125, "learning_rate": 6.37883008356546e-05, "loss": 0.1115, "step": 620 }, { "epoch": 1.23046875, "grad_norm": 0.77734375, "learning_rate": 6.309192200557104e-05, "loss": 0.1067, "step": 630 }, { "epoch": 1.25, "grad_norm": 1.1171875, "learning_rate": 6.239554317548747e-05, "loss": 0.1081, "step": 640 }, { "epoch": 1.26953125, "grad_norm": 0.515625, "learning_rate": 6.169916434540391e-05, "loss": 0.0991, "step": 650 }, { "epoch": 1.2890625, "grad_norm": 0.58203125, "learning_rate": 6.100278551532034e-05, "loss": 0.1027, "step": 660 }, { "epoch": 1.30859375, "grad_norm": 0.65234375, "learning_rate": 6.030640668523677e-05, "loss": 0.102, "step": 670 }, { "epoch": 1.328125, "grad_norm": 0.41796875, "learning_rate": 5.96100278551532e-05, "loss": 0.1012, "step": 680 }, { "epoch": 1.34765625, "grad_norm": 0.443359375, "learning_rate": 5.891364902506964e-05, "loss": 0.0978, "step": 690 }, { "epoch": 1.3671875, "grad_norm": 0.51953125, "learning_rate": 5.821727019498607e-05, "loss": 0.1196, "step": 700 }, { "epoch": 1.38671875, "grad_norm": 0.58984375, "learning_rate": 5.752089136490251e-05, "loss": 0.1047, "step": 710 }, { "epoch": 1.40625, "grad_norm": 0.56640625, "learning_rate": 5.682451253481894e-05, "loss": 0.1071, "step": 720 }, { "epoch": 1.42578125, "grad_norm": 0.71875, "learning_rate": 5.6128133704735375e-05, "loss": 0.105, "step": 730 }, { "epoch": 1.4453125, "grad_norm": 0.57421875, "learning_rate": 5.5431754874651806e-05, "loss": 0.1121, "step": 740 }, { "epoch": 1.46484375, "grad_norm": 0.515625, "learning_rate": 5.473537604456824e-05, "loss": 0.1016, "step": 750 }, { "epoch": 1.484375, "grad_norm": 1.734375, "learning_rate": 5.4038997214484674e-05, "loss": 0.1079, "step": 760 }, { "epoch": 1.50390625, "grad_norm": 0.7109375, "learning_rate": 5.3342618384401125e-05, "loss": 0.1079, "step": 770 }, { "epoch": 1.5234375, "grad_norm": 0.62890625, "learning_rate": 5.2646239554317555e-05, "loss": 0.1076, "step": 780 }, { "epoch": 1.54296875, "grad_norm": 0.55078125, "learning_rate": 5.194986072423399e-05, "loss": 0.1192, "step": 790 }, { "epoch": 1.5625, "grad_norm": 0.7109375, "learning_rate": 5.125348189415042e-05, "loss": 0.1, "step": 800 }, { "epoch": 1.58203125, "grad_norm": 1.0546875, "learning_rate": 5.055710306406686e-05, "loss": 0.1075, "step": 810 }, { "epoch": 1.6015625, "grad_norm": 0.43359375, "learning_rate": 4.986072423398329e-05, "loss": 0.1036, "step": 820 }, { "epoch": 1.62109375, "grad_norm": 1.1875, "learning_rate": 4.916434540389973e-05, "loss": 0.1016, "step": 830 }, { "epoch": 1.640625, "grad_norm": 0.76171875, "learning_rate": 4.846796657381616e-05, "loss": 0.1041, "step": 840 }, { "epoch": 1.66015625, "grad_norm": 1.1484375, "learning_rate": 4.7771587743732597e-05, "loss": 0.108, "step": 850 }, { "epoch": 1.6796875, "grad_norm": 0.66796875, "learning_rate": 4.707520891364903e-05, "loss": 0.1063, "step": 860 }, { "epoch": 1.69921875, "grad_norm": 0.419921875, "learning_rate": 4.637883008356546e-05, "loss": 0.1048, "step": 870 }, { "epoch": 1.71875, "grad_norm": 0.6328125, "learning_rate": 4.5682451253481895e-05, "loss": 0.0985, "step": 880 }, { "epoch": 1.73828125, "grad_norm": 0.55078125, "learning_rate": 4.4986072423398326e-05, "loss": 0.111, "step": 890 }, { "epoch": 1.7578125, "grad_norm": 0.5859375, "learning_rate": 4.428969359331476e-05, "loss": 0.1007, "step": 900 }, { "epoch": 1.77734375, "grad_norm": 0.5546875, "learning_rate": 4.35933147632312e-05, "loss": 0.0981, "step": 910 }, { "epoch": 1.796875, "grad_norm": 0.56640625, "learning_rate": 4.289693593314764e-05, "loss": 0.1016, "step": 920 }, { "epoch": 1.81640625, "grad_norm": 0.44140625, "learning_rate": 4.220055710306407e-05, "loss": 0.1045, "step": 930 }, { "epoch": 1.8359375, "grad_norm": 0.9609375, "learning_rate": 4.1504178272980506e-05, "loss": 0.1019, "step": 940 }, { "epoch": 1.85546875, "grad_norm": 0.53125, "learning_rate": 4.0807799442896936e-05, "loss": 0.1061, "step": 950 }, { "epoch": 1.875, "grad_norm": 0.80078125, "learning_rate": 4.0111420612813374e-05, "loss": 0.1098, "step": 960 }, { "epoch": 1.89453125, "grad_norm": 0.53125, "learning_rate": 3.9415041782729804e-05, "loss": 0.0954, "step": 970 }, { "epoch": 1.9140625, "grad_norm": 0.65625, "learning_rate": 3.871866295264624e-05, "loss": 0.1012, "step": 980 }, { "epoch": 1.93359375, "grad_norm": 0.4765625, "learning_rate": 3.802228412256267e-05, "loss": 0.099, "step": 990 }, { "epoch": 1.953125, "grad_norm": 0.498046875, "learning_rate": 3.7325905292479116e-05, "loss": 0.1046, "step": 1000 }, { "epoch": 1.97265625, "grad_norm": 0.46875, "learning_rate": 3.662952646239555e-05, "loss": 0.097, "step": 1010 }, { "epoch": 1.9921875, "grad_norm": 0.6953125, "learning_rate": 3.5933147632311984e-05, "loss": 0.0964, "step": 1020 }, { "epoch": 2.01171875, "grad_norm": 0.59375, "learning_rate": 3.5236768802228415e-05, "loss": 0.0986, "step": 1030 }, { "epoch": 2.03125, "grad_norm": 0.484375, "learning_rate": 3.454038997214485e-05, "loss": 0.1056, "step": 1040 }, { "epoch": 2.05078125, "grad_norm": 0.78125, "learning_rate": 3.384401114206128e-05, "loss": 0.1071, "step": 1050 }, { "epoch": 2.0703125, "grad_norm": 0.78125, "learning_rate": 3.314763231197771e-05, "loss": 0.1053, "step": 1060 }, { "epoch": 2.08984375, "grad_norm": 0.45703125, "learning_rate": 3.245125348189415e-05, "loss": 0.1018, "step": 1070 }, { "epoch": 2.109375, "grad_norm": 5.125, "learning_rate": 3.175487465181058e-05, "loss": 0.093, "step": 1080 }, { "epoch": 2.12890625, "grad_norm": 0.71484375, "learning_rate": 3.105849582172702e-05, "loss": 0.102, "step": 1090 }, { "epoch": 2.1484375, "grad_norm": 0.734375, "learning_rate": 3.036211699164346e-05, "loss": 0.1046, "step": 1100 }, { "epoch": 2.16796875, "grad_norm": 0.6640625, "learning_rate": 2.9665738161559893e-05, "loss": 0.1016, "step": 1110 }, { "epoch": 2.1875, "grad_norm": 0.482421875, "learning_rate": 2.8969359331476327e-05, "loss": 0.0973, "step": 1120 }, { "epoch": 2.20703125, "grad_norm": 0.53125, "learning_rate": 2.827298050139276e-05, "loss": 0.0925, "step": 1130 }, { "epoch": 2.2265625, "grad_norm": 0.58203125, "learning_rate": 2.7576601671309192e-05, "loss": 0.1027, "step": 1140 }, { "epoch": 2.24609375, "grad_norm": 0.5078125, "learning_rate": 2.6880222841225626e-05, "loss": 0.0926, "step": 1150 }, { "epoch": 2.265625, "grad_norm": 0.5078125, "learning_rate": 2.618384401114206e-05, "loss": 0.0987, "step": 1160 }, { "epoch": 2.28515625, "grad_norm": 0.65234375, "learning_rate": 2.5487465181058494e-05, "loss": 0.0975, "step": 1170 }, { "epoch": 2.3046875, "grad_norm": 0.66796875, "learning_rate": 2.479108635097493e-05, "loss": 0.1016, "step": 1180 }, { "epoch": 2.32421875, "grad_norm": 0.490234375, "learning_rate": 2.4094707520891365e-05, "loss": 0.1005, "step": 1190 }, { "epoch": 2.34375, "grad_norm": 1.4921875, "learning_rate": 2.33983286908078e-05, "loss": 0.0971, "step": 1200 }, { "epoch": 2.36328125, "grad_norm": 0.421875, "learning_rate": 2.2701949860724233e-05, "loss": 0.096, "step": 1210 }, { "epoch": 2.3828125, "grad_norm": 0.59765625, "learning_rate": 2.200557103064067e-05, "loss": 0.0988, "step": 1220 }, { "epoch": 2.40234375, "grad_norm": 0.578125, "learning_rate": 2.1309192200557104e-05, "loss": 0.1, "step": 1230 }, { "epoch": 2.421875, "grad_norm": 0.390625, "learning_rate": 2.0612813370473538e-05, "loss": 0.0996, "step": 1240 }, { "epoch": 2.44140625, "grad_norm": 0.7578125, "learning_rate": 1.9916434540389972e-05, "loss": 0.1018, "step": 1250 }, { "epoch": 2.4609375, "grad_norm": 0.462890625, "learning_rate": 1.922005571030641e-05, "loss": 0.0939, "step": 1260 }, { "epoch": 2.48046875, "grad_norm": 0.5078125, "learning_rate": 1.8523676880222844e-05, "loss": 0.0981, "step": 1270 }, { "epoch": 2.5, "grad_norm": 0.5390625, "learning_rate": 1.7827298050139278e-05, "loss": 0.1044, "step": 1280 }, { "epoch": 2.51953125, "grad_norm": 0.54296875, "learning_rate": 1.713091922005571e-05, "loss": 0.1002, "step": 1290 }, { "epoch": 2.5390625, "grad_norm": 0.53125, "learning_rate": 1.6434540389972145e-05, "loss": 0.0922, "step": 1300 }, { "epoch": 2.55859375, "grad_norm": 0.609375, "learning_rate": 1.5738161559888583e-05, "loss": 0.0959, "step": 1310 }, { "epoch": 2.578125, "grad_norm": 0.60546875, "learning_rate": 1.5041782729805015e-05, "loss": 0.0932, "step": 1320 }, { "epoch": 2.59765625, "grad_norm": 0.5390625, "learning_rate": 1.4345403899721449e-05, "loss": 0.0982, "step": 1330 }, { "epoch": 2.6171875, "grad_norm": 0.458984375, "learning_rate": 1.3649025069637883e-05, "loss": 0.0994, "step": 1340 }, { "epoch": 2.63671875, "grad_norm": 0.5625, "learning_rate": 1.2952646239554317e-05, "loss": 0.0995, "step": 1350 }, { "epoch": 2.65625, "grad_norm": 0.51953125, "learning_rate": 1.2256267409470753e-05, "loss": 0.0947, "step": 1360 }, { "epoch": 2.67578125, "grad_norm": 0.49609375, "learning_rate": 1.1559888579387188e-05, "loss": 0.0923, "step": 1370 }, { "epoch": 2.6953125, "grad_norm": 1.2734375, "learning_rate": 1.086350974930362e-05, "loss": 0.1004, "step": 1380 }, { "epoch": 2.71484375, "grad_norm": 0.6953125, "learning_rate": 1.0167130919220056e-05, "loss": 0.095, "step": 1390 }, { "epoch": 2.734375, "grad_norm": 0.71484375, "learning_rate": 9.47075208913649e-06, "loss": 0.098, "step": 1400 }, { "epoch": 2.75390625, "grad_norm": 1.4296875, "learning_rate": 8.774373259052924e-06, "loss": 0.1082, "step": 1410 }, { "epoch": 2.7734375, "grad_norm": 0.6015625, "learning_rate": 8.07799442896936e-06, "loss": 0.0969, "step": 1420 }, { "epoch": 2.79296875, "grad_norm": 0.46484375, "learning_rate": 7.381615598885794e-06, "loss": 0.0922, "step": 1430 }, { "epoch": 2.8125, "grad_norm": 0.62109375, "learning_rate": 6.6852367688022295e-06, "loss": 0.0951, "step": 1440 }, { "epoch": 2.83203125, "grad_norm": 0.61328125, "learning_rate": 5.988857938718663e-06, "loss": 0.0947, "step": 1450 }, { "epoch": 2.8515625, "grad_norm": 0.458984375, "learning_rate": 5.2924791086350974e-06, "loss": 0.1013, "step": 1460 }, { "epoch": 2.87109375, "grad_norm": 0.6171875, "learning_rate": 4.596100278551532e-06, "loss": 0.0897, "step": 1470 } ], "logging_steps": 10, "max_steps": 1536, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5658272704198656.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }