{ "best_metric": 0.05755883455276489, "best_model_checkpoint": "./beans_outputs/checkpoint-1040", "epoch": 10.0, "eval_steps": 500, "global_step": 1300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "grad_norm": 8.837075233459473, "learning_rate": 1.9846153846153847e-05, "loss": 1.1327, "step": 10 }, { "epoch": 0.15, "grad_norm": 10.297589302062988, "learning_rate": 1.9692307692307696e-05, "loss": 1.0169, "step": 20 }, { "epoch": 0.23, "grad_norm": 11.187561988830566, "learning_rate": 1.953846153846154e-05, "loss": 0.8387, "step": 30 }, { "epoch": 0.31, "grad_norm": 15.935965538024902, "learning_rate": 1.9384615384615386e-05, "loss": 0.6578, "step": 40 }, { "epoch": 0.38, "grad_norm": 20.925371170043945, "learning_rate": 1.923076923076923e-05, "loss": 0.6827, "step": 50 }, { "epoch": 0.46, "grad_norm": 14.081457138061523, "learning_rate": 1.907692307692308e-05, "loss": 0.6062, "step": 60 }, { "epoch": 0.54, "grad_norm": 14.699126243591309, "learning_rate": 1.8923076923076925e-05, "loss": 0.4653, "step": 70 }, { "epoch": 0.62, "grad_norm": 11.159515380859375, "learning_rate": 1.876923076923077e-05, "loss": 0.2846, "step": 80 }, { "epoch": 0.69, "grad_norm": 22.39027976989746, "learning_rate": 1.8615384615384616e-05, "loss": 0.4281, "step": 90 }, { "epoch": 0.77, "grad_norm": 21.893550872802734, "learning_rate": 1.8461538461538465e-05, "loss": 0.4518, "step": 100 }, { "epoch": 0.85, "grad_norm": 21.904132843017578, "learning_rate": 1.830769230769231e-05, "loss": 0.3664, "step": 110 }, { "epoch": 0.92, "grad_norm": 16.900848388671875, "learning_rate": 1.8153846153846155e-05, "loss": 0.2882, "step": 120 }, { "epoch": 1.0, "grad_norm": 14.193367004394531, "learning_rate": 1.8e-05, "loss": 0.3159, "step": 130 }, { "epoch": 1.0, "eval_accuracy": 0.9172932330827067, "eval_loss": 0.2388302981853485, "eval_runtime": 0.7788, "eval_samples_per_second": 170.783, "eval_steps_per_second": 21.829, "step": 130 }, { "epoch": 1.08, "grad_norm": 4.1739726066589355, "learning_rate": 1.784615384615385e-05, "loss": 0.2402, "step": 140 }, { "epoch": 1.15, "grad_norm": 22.700687408447266, "learning_rate": 1.7692307692307694e-05, "loss": 0.2028, "step": 150 }, { "epoch": 1.23, "grad_norm": 14.141243934631348, "learning_rate": 1.753846153846154e-05, "loss": 0.2432, "step": 160 }, { "epoch": 1.31, "grad_norm": 3.9268712997436523, "learning_rate": 1.7384615384615385e-05, "loss": 0.1768, "step": 170 }, { "epoch": 1.38, "grad_norm": 2.51151442527771, "learning_rate": 1.7230769230769234e-05, "loss": 0.3026, "step": 180 }, { "epoch": 1.46, "grad_norm": 22.92229652404785, "learning_rate": 1.707692307692308e-05, "loss": 0.1529, "step": 190 }, { "epoch": 1.54, "grad_norm": 4.511898517608643, "learning_rate": 1.6923076923076924e-05, "loss": 0.263, "step": 200 }, { "epoch": 1.62, "grad_norm": 2.9937801361083984, "learning_rate": 1.676923076923077e-05, "loss": 0.279, "step": 210 }, { "epoch": 1.69, "grad_norm": 15.286409378051758, "learning_rate": 1.6615384615384618e-05, "loss": 0.2082, "step": 220 }, { "epoch": 1.77, "grad_norm": 8.706442832946777, "learning_rate": 1.6461538461538463e-05, "loss": 0.0985, "step": 230 }, { "epoch": 1.85, "grad_norm": 2.0332553386688232, "learning_rate": 1.630769230769231e-05, "loss": 0.3507, "step": 240 }, { "epoch": 1.92, "grad_norm": 6.008168697357178, "learning_rate": 1.6153846153846154e-05, "loss": 0.3121, "step": 250 }, { "epoch": 2.0, "grad_norm": 1.6274163722991943, "learning_rate": 1.6000000000000003e-05, "loss": 0.2644, "step": 260 }, { "epoch": 2.0, "eval_accuracy": 0.9323308270676691, "eval_loss": 0.19859685003757477, "eval_runtime": 0.7472, "eval_samples_per_second": 177.998, "eval_steps_per_second": 22.752, "step": 260 }, { "epoch": 2.08, "grad_norm": 25.121997833251953, "learning_rate": 1.5846153846153848e-05, "loss": 0.1375, "step": 270 }, { "epoch": 2.15, "grad_norm": 1.5816584825515747, "learning_rate": 1.5692307692307693e-05, "loss": 0.0942, "step": 280 }, { "epoch": 2.23, "grad_norm": 19.77696990966797, "learning_rate": 1.553846153846154e-05, "loss": 0.3347, "step": 290 }, { "epoch": 2.31, "grad_norm": 53.547080993652344, "learning_rate": 1.5384615384615387e-05, "loss": 0.1328, "step": 300 }, { "epoch": 2.38, "grad_norm": 10.436347007751465, "learning_rate": 1.523076923076923e-05, "loss": 0.3129, "step": 310 }, { "epoch": 2.46, "grad_norm": 1.7305067777633667, "learning_rate": 1.5076923076923078e-05, "loss": 0.208, "step": 320 }, { "epoch": 2.54, "grad_norm": 6.365098476409912, "learning_rate": 1.4923076923076925e-05, "loss": 0.1819, "step": 330 }, { "epoch": 2.62, "grad_norm": 35.983497619628906, "learning_rate": 1.4769230769230772e-05, "loss": 0.1696, "step": 340 }, { "epoch": 2.69, "grad_norm": 21.715662002563477, "learning_rate": 1.4615384615384615e-05, "loss": 0.1796, "step": 350 }, { "epoch": 2.77, "grad_norm": 16.103130340576172, "learning_rate": 1.4461538461538462e-05, "loss": 0.1686, "step": 360 }, { "epoch": 2.85, "grad_norm": 4.200051307678223, "learning_rate": 1.430769230769231e-05, "loss": 0.1856, "step": 370 }, { "epoch": 2.92, "grad_norm": 15.16735553741455, "learning_rate": 1.4153846153846156e-05, "loss": 0.106, "step": 380 }, { "epoch": 3.0, "grad_norm": 0.15394054353237152, "learning_rate": 1.4e-05, "loss": 0.18, "step": 390 }, { "epoch": 3.0, "eval_accuracy": 0.9774436090225563, "eval_loss": 0.08752014487981796, "eval_runtime": 0.7768, "eval_samples_per_second": 171.206, "eval_steps_per_second": 21.883, "step": 390 }, { "epoch": 3.08, "grad_norm": 26.86464500427246, "learning_rate": 1.3846153846153847e-05, "loss": 0.3284, "step": 400 }, { "epoch": 3.15, "grad_norm": 39.36838912963867, "learning_rate": 1.3692307692307694e-05, "loss": 0.2683, "step": 410 }, { "epoch": 3.23, "grad_norm": 13.922553062438965, "learning_rate": 1.353846153846154e-05, "loss": 0.1447, "step": 420 }, { "epoch": 3.31, "grad_norm": 0.15522202849388123, "learning_rate": 1.3384615384615384e-05, "loss": 0.0803, "step": 430 }, { "epoch": 3.38, "grad_norm": 17.914121627807617, "learning_rate": 1.3230769230769231e-05, "loss": 0.1794, "step": 440 }, { "epoch": 3.46, "grad_norm": 39.1897087097168, "learning_rate": 1.3076923076923078e-05, "loss": 0.0426, "step": 450 }, { "epoch": 3.54, "grad_norm": 8.533951759338379, "learning_rate": 1.2923076923076925e-05, "loss": 0.159, "step": 460 }, { "epoch": 3.62, "grad_norm": 59.198516845703125, "learning_rate": 1.2769230769230769e-05, "loss": 0.1513, "step": 470 }, { "epoch": 3.69, "grad_norm": 8.653518676757812, "learning_rate": 1.2615384615384616e-05, "loss": 0.1033, "step": 480 }, { "epoch": 3.77, "grad_norm": 0.08433680981397629, "learning_rate": 1.2461538461538463e-05, "loss": 0.0467, "step": 490 }, { "epoch": 3.85, "grad_norm": 7.614084243774414, "learning_rate": 1.230769230769231e-05, "loss": 0.193, "step": 500 }, { "epoch": 3.92, "grad_norm": 0.27582940459251404, "learning_rate": 1.2153846153846153e-05, "loss": 0.0759, "step": 510 }, { "epoch": 4.0, "grad_norm": 0.06643696129322052, "learning_rate": 1.2e-05, "loss": 0.1356, "step": 520 }, { "epoch": 4.0, "eval_accuracy": 0.9548872180451128, "eval_loss": 0.13278333842754364, "eval_runtime": 0.7809, "eval_samples_per_second": 170.307, "eval_steps_per_second": 21.769, "step": 520 }, { "epoch": 4.08, "grad_norm": 46.570106506347656, "learning_rate": 1.1846153846153847e-05, "loss": 0.0815, "step": 530 }, { "epoch": 4.15, "grad_norm": 69.75978088378906, "learning_rate": 1.1692307692307694e-05, "loss": 0.0594, "step": 540 }, { "epoch": 4.23, "grad_norm": 4.658773899078369, "learning_rate": 1.1538461538461538e-05, "loss": 0.2327, "step": 550 }, { "epoch": 4.31, "grad_norm": 0.7817876935005188, "learning_rate": 1.1384615384615385e-05, "loss": 0.007, "step": 560 }, { "epoch": 4.38, "grad_norm": 0.08630599081516266, "learning_rate": 1.1230769230769232e-05, "loss": 0.2592, "step": 570 }, { "epoch": 4.46, "grad_norm": 0.20543892681598663, "learning_rate": 1.1076923076923079e-05, "loss": 0.0271, "step": 580 }, { "epoch": 4.54, "grad_norm": 66.41576385498047, "learning_rate": 1.0923076923076922e-05, "loss": 0.0285, "step": 590 }, { "epoch": 4.62, "grad_norm": 1.2515759468078613, "learning_rate": 1.076923076923077e-05, "loss": 0.2033, "step": 600 }, { "epoch": 4.69, "grad_norm": 30.381885528564453, "learning_rate": 1.0615384615384616e-05, "loss": 0.1545, "step": 610 }, { "epoch": 4.77, "grad_norm": 0.04296184331178665, "learning_rate": 1.0461538461538463e-05, "loss": 0.1813, "step": 620 }, { "epoch": 4.85, "grad_norm": 0.4676840603351593, "learning_rate": 1.0307692307692307e-05, "loss": 0.0166, "step": 630 }, { "epoch": 4.92, "grad_norm": 0.2990114688873291, "learning_rate": 1.0153846153846154e-05, "loss": 0.1085, "step": 640 }, { "epoch": 5.0, "grad_norm": 0.4609494209289551, "learning_rate": 1e-05, "loss": 0.2743, "step": 650 }, { "epoch": 5.0, "eval_accuracy": 0.9699248120300752, "eval_loss": 0.08440900593996048, "eval_runtime": 0.7719, "eval_samples_per_second": 172.306, "eval_steps_per_second": 22.024, "step": 650 }, { "epoch": 5.08, "grad_norm": 0.06812762469053268, "learning_rate": 9.846153846153848e-06, "loss": 0.0564, "step": 660 }, { "epoch": 5.15, "grad_norm": 27.386592864990234, "learning_rate": 9.692307692307693e-06, "loss": 0.0583, "step": 670 }, { "epoch": 5.23, "grad_norm": 0.35092148184776306, "learning_rate": 9.53846153846154e-06, "loss": 0.2522, "step": 680 }, { "epoch": 5.31, "grad_norm": 4.415610313415527, "learning_rate": 9.384615384615385e-06, "loss": 0.2159, "step": 690 }, { "epoch": 5.38, "grad_norm": 0.2573728859424591, "learning_rate": 9.230769230769232e-06, "loss": 0.0882, "step": 700 }, { "epoch": 5.46, "grad_norm": 51.18489074707031, "learning_rate": 9.076923076923078e-06, "loss": 0.0612, "step": 710 }, { "epoch": 5.54, "grad_norm": 0.20563609898090363, "learning_rate": 8.923076923076925e-06, "loss": 0.1178, "step": 720 }, { "epoch": 5.62, "grad_norm": 64.889892578125, "learning_rate": 8.76923076923077e-06, "loss": 0.1949, "step": 730 }, { "epoch": 5.69, "grad_norm": 0.058016762137413025, "learning_rate": 8.615384615384617e-06, "loss": 0.2591, "step": 740 }, { "epoch": 5.77, "grad_norm": 0.6193475723266602, "learning_rate": 8.461538461538462e-06, "loss": 0.047, "step": 750 }, { "epoch": 5.85, "grad_norm": 0.057584960013628006, "learning_rate": 8.307692307692309e-06, "loss": 0.0409, "step": 760 }, { "epoch": 5.92, "grad_norm": 65.12732696533203, "learning_rate": 8.153846153846154e-06, "loss": 0.0481, "step": 770 }, { "epoch": 6.0, "grad_norm": 0.03365050256252289, "learning_rate": 8.000000000000001e-06, "loss": 0.0601, "step": 780 }, { "epoch": 6.0, "eval_accuracy": 0.9774436090225563, "eval_loss": 0.07215330749750137, "eval_runtime": 0.8003, "eval_samples_per_second": 166.181, "eval_steps_per_second": 21.241, "step": 780 }, { "epoch": 6.08, "grad_norm": 0.029437165707349777, "learning_rate": 7.846153846153847e-06, "loss": 0.074, "step": 790 }, { "epoch": 6.15, "grad_norm": 19.134477615356445, "learning_rate": 7.692307692307694e-06, "loss": 0.2048, "step": 800 }, { "epoch": 6.23, "grad_norm": 46.81254577636719, "learning_rate": 7.538461538461539e-06, "loss": 0.1062, "step": 810 }, { "epoch": 6.31, "grad_norm": 0.6008255481719971, "learning_rate": 7.384615384615386e-06, "loss": 0.1212, "step": 820 }, { "epoch": 6.38, "grad_norm": 47.47008514404297, "learning_rate": 7.230769230769231e-06, "loss": 0.2855, "step": 830 }, { "epoch": 6.46, "grad_norm": 0.06452729552984238, "learning_rate": 7.076923076923078e-06, "loss": 0.1477, "step": 840 }, { "epoch": 6.54, "grad_norm": 10.194902420043945, "learning_rate": 6.923076923076923e-06, "loss": 0.1517, "step": 850 }, { "epoch": 6.62, "grad_norm": 0.16757723689079285, "learning_rate": 6.76923076923077e-06, "loss": 0.1035, "step": 860 }, { "epoch": 6.69, "grad_norm": 60.36661911010742, "learning_rate": 6.615384615384616e-06, "loss": 0.0993, "step": 870 }, { "epoch": 6.77, "grad_norm": 0.21039122343063354, "learning_rate": 6.461538461538463e-06, "loss": 0.0477, "step": 880 }, { "epoch": 6.85, "grad_norm": 70.45368194580078, "learning_rate": 6.307692307692308e-06, "loss": 0.2678, "step": 890 }, { "epoch": 6.92, "grad_norm": 0.43550992012023926, "learning_rate": 6.153846153846155e-06, "loss": 0.0305, "step": 900 }, { "epoch": 7.0, "grad_norm": 0.05162184685468674, "learning_rate": 6e-06, "loss": 0.0679, "step": 910 }, { "epoch": 7.0, "eval_accuracy": 0.9699248120300752, "eval_loss": 0.0805138349533081, "eval_runtime": 0.7531, "eval_samples_per_second": 176.594, "eval_steps_per_second": 22.572, "step": 910 }, { "epoch": 7.08, "grad_norm": 0.08115831762552261, "learning_rate": 5.846153846153847e-06, "loss": 0.0424, "step": 920 }, { "epoch": 7.15, "grad_norm": 0.6763426661491394, "learning_rate": 5.692307692307692e-06, "loss": 0.0958, "step": 930 }, { "epoch": 7.23, "grad_norm": 9.94186782836914, "learning_rate": 5.538461538461539e-06, "loss": 0.1781, "step": 940 }, { "epoch": 7.31, "grad_norm": 0.8364148736000061, "learning_rate": 5.384615384615385e-06, "loss": 0.0789, "step": 950 }, { "epoch": 7.38, "grad_norm": 70.52117919921875, "learning_rate": 5.230769230769232e-06, "loss": 0.0671, "step": 960 }, { "epoch": 7.46, "grad_norm": 31.207950592041016, "learning_rate": 5.076923076923077e-06, "loss": 0.0266, "step": 970 }, { "epoch": 7.54, "grad_norm": 0.03295217826962471, "learning_rate": 4.923076923076924e-06, "loss": 0.038, "step": 980 }, { "epoch": 7.62, "grad_norm": 0.6278024911880493, "learning_rate": 4.76923076923077e-06, "loss": 0.1286, "step": 990 }, { "epoch": 7.69, "grad_norm": 0.09250018000602722, "learning_rate": 4.615384615384616e-06, "loss": 0.0345, "step": 1000 }, { "epoch": 7.77, "grad_norm": 0.041673317551612854, "learning_rate": 4.461538461538462e-06, "loss": 0.1463, "step": 1010 }, { "epoch": 7.85, "grad_norm": 5.301642894744873, "learning_rate": 4.307692307692308e-06, "loss": 0.1582, "step": 1020 }, { "epoch": 7.92, "grad_norm": 24.586471557617188, "learning_rate": 4.1538461538461545e-06, "loss": 0.0457, "step": 1030 }, { "epoch": 8.0, "grad_norm": 0.19031739234924316, "learning_rate": 4.000000000000001e-06, "loss": 0.1452, "step": 1040 }, { "epoch": 8.0, "eval_accuracy": 0.9774436090225563, "eval_loss": 0.05755883455276489, "eval_runtime": 0.8129, "eval_samples_per_second": 163.609, "eval_steps_per_second": 20.912, "step": 1040 }, { "epoch": 8.08, "grad_norm": 0.30499914288520813, "learning_rate": 3.846153846153847e-06, "loss": 0.0491, "step": 1050 }, { "epoch": 8.15, "grad_norm": 0.05551747977733612, "learning_rate": 3.692307692307693e-06, "loss": 0.11, "step": 1060 }, { "epoch": 8.23, "grad_norm": 0.039759561419487, "learning_rate": 3.538461538461539e-06, "loss": 0.0279, "step": 1070 }, { "epoch": 8.31, "grad_norm": 29.442441940307617, "learning_rate": 3.384615384615385e-06, "loss": 0.0955, "step": 1080 }, { "epoch": 8.38, "grad_norm": 0.036451034247875214, "learning_rate": 3.2307692307692313e-06, "loss": 0.1197, "step": 1090 }, { "epoch": 8.46, "grad_norm": 0.6850001811981201, "learning_rate": 3.0769230769230774e-06, "loss": 0.1451, "step": 1100 }, { "epoch": 8.54, "grad_norm": 0.020315758883953094, "learning_rate": 2.9230769230769236e-06, "loss": 0.0885, "step": 1110 }, { "epoch": 8.62, "grad_norm": 1.7580561637878418, "learning_rate": 2.7692307692307697e-06, "loss": 0.0126, "step": 1120 }, { "epoch": 8.69, "grad_norm": 0.2630916237831116, "learning_rate": 2.615384615384616e-06, "loss": 0.1777, "step": 1130 }, { "epoch": 8.77, "grad_norm": 0.032109491527080536, "learning_rate": 2.461538461538462e-06, "loss": 0.1485, "step": 1140 }, { "epoch": 8.85, "grad_norm": 0.209160715341568, "learning_rate": 2.307692307692308e-06, "loss": 0.0801, "step": 1150 }, { "epoch": 8.92, "grad_norm": 0.021766025573015213, "learning_rate": 2.153846153846154e-06, "loss": 0.0184, "step": 1160 }, { "epoch": 9.0, "grad_norm": 0.31575343012809753, "learning_rate": 2.0000000000000003e-06, "loss": 0.0451, "step": 1170 }, { "epoch": 9.0, "eval_accuracy": 0.9774436090225563, "eval_loss": 0.07056861370801926, "eval_runtime": 0.7606, "eval_samples_per_second": 174.861, "eval_steps_per_second": 22.351, "step": 1170 }, { "epoch": 9.08, "grad_norm": 0.14251826703548431, "learning_rate": 1.8461538461538465e-06, "loss": 0.1575, "step": 1180 }, { "epoch": 9.15, "grad_norm": 9.808426856994629, "learning_rate": 1.6923076923076926e-06, "loss": 0.1674, "step": 1190 }, { "epoch": 9.23, "grad_norm": 50.211734771728516, "learning_rate": 1.5384615384615387e-06, "loss": 0.0556, "step": 1200 }, { "epoch": 9.31, "grad_norm": 6.145327091217041, "learning_rate": 1.3846153846153848e-06, "loss": 0.1331, "step": 1210 }, { "epoch": 9.38, "grad_norm": 4.349958896636963, "learning_rate": 1.230769230769231e-06, "loss": 0.0421, "step": 1220 }, { "epoch": 9.46, "grad_norm": 0.05844373255968094, "learning_rate": 1.076923076923077e-06, "loss": 0.1256, "step": 1230 }, { "epoch": 9.54, "grad_norm": 0.09147713333368301, "learning_rate": 9.230769230769232e-07, "loss": 0.0715, "step": 1240 }, { "epoch": 9.62, "grad_norm": 1.6623743772506714, "learning_rate": 7.692307692307694e-07, "loss": 0.1119, "step": 1250 }, { "epoch": 9.69, "grad_norm": 21.137853622436523, "learning_rate": 6.153846153846155e-07, "loss": 0.0591, "step": 1260 }, { "epoch": 9.77, "grad_norm": 0.023551329970359802, "learning_rate": 4.615384615384616e-07, "loss": 0.0028, "step": 1270 }, { "epoch": 9.85, "grad_norm": 2.5063748359680176, "learning_rate": 3.0769230769230774e-07, "loss": 0.056, "step": 1280 }, { "epoch": 9.92, "grad_norm": 45.495296478271484, "learning_rate": 1.5384615384615387e-07, "loss": 0.0664, "step": 1290 }, { "epoch": 10.0, "grad_norm": 0.04642321541905403, "learning_rate": 0.0, "loss": 0.1342, "step": 1300 }, { "epoch": 10.0, "eval_accuracy": 0.9774436090225563, "eval_loss": 0.07238258421421051, "eval_runtime": 0.7484, "eval_samples_per_second": 177.701, "eval_steps_per_second": 22.714, "step": 1300 }, { "epoch": 10.0, "step": 1300, "total_flos": 5.15965863942144e+16, "train_loss": 0.17891026951515904, "train_runtime": 94.0369, "train_samples_per_second": 109.957, "train_steps_per_second": 13.824 } ], "logging_steps": 10, "max_steps": 1300, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 5.15965863942144e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }