{ "best_metric": 0.986103355884552, "best_model_checkpoint": "runs/deepseek_lora_20240422-095359/checkpoint-2500", "epoch": 0.05788142411455891, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 11.34415054321289, "learning_rate": 4.0000000000000003e-07, "loss": 1.2669, "step": 10 }, { "epoch": 0.0, "grad_norm": 7.290144443511963, "learning_rate": 8.000000000000001e-07, "loss": 1.3878, "step": 20 }, { "epoch": 0.0, "grad_norm": 5.349151134490967, "learning_rate": 1.2000000000000002e-06, "loss": 1.1855, "step": 30 }, { "epoch": 0.0, "grad_norm": 14.503132820129395, "learning_rate": 1.6000000000000001e-06, "loss": 1.0839, "step": 40 }, { "epoch": 0.0, "grad_norm": 10.504097938537598, "learning_rate": 2.0000000000000003e-06, "loss": 1.3778, "step": 50 }, { "epoch": 0.0, "grad_norm": 10.759912490844727, "learning_rate": 2.4000000000000003e-06, "loss": 1.2921, "step": 60 }, { "epoch": 0.0, "grad_norm": 10.052595138549805, "learning_rate": 2.8000000000000003e-06, "loss": 1.2835, "step": 70 }, { "epoch": 0.0, "grad_norm": 4.416482925415039, "learning_rate": 3.2000000000000003e-06, "loss": 1.1876, "step": 80 }, { "epoch": 0.0, "grad_norm": 8.467272758483887, "learning_rate": 3.6000000000000003e-06, "loss": 1.0513, "step": 90 }, { "epoch": 0.0, "grad_norm": 7.2040510177612305, "learning_rate": 4.000000000000001e-06, "loss": 1.1429, "step": 100 }, { "epoch": 0.0, "grad_norm": 22.72906494140625, "learning_rate": 4.4e-06, "loss": 1.344, "step": 110 }, { "epoch": 0.0, "grad_norm": 2.6358766555786133, "learning_rate": 4.800000000000001e-06, "loss": 1.1027, "step": 120 }, { "epoch": 0.0, "grad_norm": 2.7037551403045654, "learning_rate": 5.2e-06, "loss": 1.1483, "step": 130 }, { "epoch": 0.0, "grad_norm": 7.809167861938477, "learning_rate": 5.600000000000001e-06, "loss": 1.2556, "step": 140 }, { "epoch": 0.0, "grad_norm": 10.522435188293457, "learning_rate": 6e-06, "loss": 1.3272, "step": 150 }, { "epoch": 0.0, "grad_norm": 12.723409652709961, "learning_rate": 6.4000000000000006e-06, "loss": 1.1342, "step": 160 }, { "epoch": 0.0, "grad_norm": 9.65882682800293, "learning_rate": 6.800000000000001e-06, "loss": 1.1509, "step": 170 }, { "epoch": 0.0, "grad_norm": 7.01952600479126, "learning_rate": 7.2000000000000005e-06, "loss": 1.3069, "step": 180 }, { "epoch": 0.0, "grad_norm": 3.8367834091186523, "learning_rate": 7.600000000000001e-06, "loss": 1.1551, "step": 190 }, { "epoch": 0.0, "grad_norm": 6.172680854797363, "learning_rate": 8.000000000000001e-06, "loss": 1.4156, "step": 200 }, { "epoch": 0.0, "grad_norm": 13.904732704162598, "learning_rate": 8.400000000000001e-06, "loss": 1.2325, "step": 210 }, { "epoch": 0.01, "grad_norm": 7.113356590270996, "learning_rate": 8.8e-06, "loss": 1.142, "step": 220 }, { "epoch": 0.01, "grad_norm": 14.426301956176758, "learning_rate": 9.200000000000002e-06, "loss": 1.2648, "step": 230 }, { "epoch": 0.01, "grad_norm": 11.00722885131836, "learning_rate": 9.600000000000001e-06, "loss": 1.3083, "step": 240 }, { "epoch": 0.01, "grad_norm": 5.72845983505249, "learning_rate": 1e-05, "loss": 1.1742, "step": 250 }, { "epoch": 0.01, "grad_norm": 2.7809460163116455, "learning_rate": 1.04e-05, "loss": 1.2009, "step": 260 }, { "epoch": 0.01, "grad_norm": 11.464003562927246, "learning_rate": 1.0800000000000002e-05, "loss": 1.1236, "step": 270 }, { "epoch": 0.01, "grad_norm": 4.139829158782959, "learning_rate": 1.1200000000000001e-05, "loss": 1.412, "step": 
280 }, { "epoch": 0.01, "grad_norm": 11.483891487121582, "learning_rate": 1.16e-05, "loss": 1.2858, "step": 290 }, { "epoch": 0.01, "grad_norm": 5.432833671569824, "learning_rate": 1.2e-05, "loss": 1.2636, "step": 300 }, { "epoch": 0.01, "grad_norm": 8.610489845275879, "learning_rate": 1.2400000000000002e-05, "loss": 1.3628, "step": 310 }, { "epoch": 0.01, "grad_norm": 5.597244739532471, "learning_rate": 1.2800000000000001e-05, "loss": 1.3585, "step": 320 }, { "epoch": 0.01, "grad_norm": 11.667290687561035, "learning_rate": 1.3200000000000002e-05, "loss": 1.4112, "step": 330 }, { "epoch": 0.01, "grad_norm": 4.102590084075928, "learning_rate": 1.3600000000000002e-05, "loss": 1.2255, "step": 340 }, { "epoch": 0.01, "grad_norm": 15.342700958251953, "learning_rate": 1.4e-05, "loss": 1.2915, "step": 350 }, { "epoch": 0.01, "grad_norm": 13.78890323638916, "learning_rate": 1.4400000000000001e-05, "loss": 1.5371, "step": 360 }, { "epoch": 0.01, "grad_norm": 13.456836700439453, "learning_rate": 1.48e-05, "loss": 1.0586, "step": 370 }, { "epoch": 0.01, "grad_norm": 2.98771071434021, "learning_rate": 1.5200000000000002e-05, "loss": 1.3475, "step": 380 }, { "epoch": 0.01, "grad_norm": 6.61796236038208, "learning_rate": 1.5600000000000003e-05, "loss": 1.1562, "step": 390 }, { "epoch": 0.01, "grad_norm": 5.871325492858887, "learning_rate": 1.6000000000000003e-05, "loss": 1.3575, "step": 400 }, { "epoch": 0.01, "grad_norm": 6.282336235046387, "learning_rate": 1.64e-05, "loss": 1.3261, "step": 410 }, { "epoch": 0.01, "grad_norm": 5.444363594055176, "learning_rate": 1.6800000000000002e-05, "loss": 1.0091, "step": 420 }, { "epoch": 0.01, "grad_norm": 1.7336666584014893, "learning_rate": 1.72e-05, "loss": 1.3492, "step": 430 }, { "epoch": 0.01, "grad_norm": 5.654507637023926, "learning_rate": 1.76e-05, "loss": 1.2701, "step": 440 }, { "epoch": 0.01, "grad_norm": 3.83292555809021, "learning_rate": 1.8e-05, "loss": 1.1073, "step": 450 }, { "epoch": 0.01, "grad_norm": 5.379516124725342, "learning_rate": 1.8400000000000003e-05, "loss": 1.2015, "step": 460 }, { "epoch": 0.01, "grad_norm": 3.365577220916748, "learning_rate": 1.88e-05, "loss": 1.1174, "step": 470 }, { "epoch": 0.01, "grad_norm": 11.620206832885742, "learning_rate": 1.9200000000000003e-05, "loss": 1.3091, "step": 480 }, { "epoch": 0.01, "grad_norm": 10.400999069213867, "learning_rate": 1.9600000000000002e-05, "loss": 0.98, "step": 490 }, { "epoch": 0.01, "grad_norm": 5.732550144195557, "learning_rate": 2e-05, "loss": 1.3638, "step": 500 }, { "epoch": 0.01, "eval_loss": 1.0330229997634888, "eval_runtime": 66.9668, "eval_samples_per_second": 14.933, "eval_steps_per_second": 14.933, "step": 500 }, { "epoch": 0.01, "grad_norm": 3.0168514251708984, "learning_rate": 1.9955555555555557e-05, "loss": 1.2762, "step": 510 }, { "epoch": 0.01, "grad_norm": 3.4872841835021973, "learning_rate": 1.9911111111111112e-05, "loss": 1.3287, "step": 520 }, { "epoch": 0.01, "grad_norm": 5.1740336418151855, "learning_rate": 1.9866666666666667e-05, "loss": 1.2577, "step": 530 }, { "epoch": 0.01, "grad_norm": 2.3551251888275146, "learning_rate": 1.9822222222222226e-05, "loss": 1.1244, "step": 540 }, { "epoch": 0.01, "grad_norm": 2.230109691619873, "learning_rate": 1.977777777777778e-05, "loss": 1.2021, "step": 550 }, { "epoch": 0.01, "grad_norm": 2.284055709838867, "learning_rate": 1.9733333333333336e-05, "loss": 1.2919, "step": 560 }, { "epoch": 0.01, "grad_norm": 2.5889816284179688, "learning_rate": 1.968888888888889e-05, "loss": 1.3648, "step": 570 }, { "epoch": 0.01, 
"grad_norm": 4.799360752105713, "learning_rate": 1.9644444444444447e-05, "loss": 1.396, "step": 580 }, { "epoch": 0.01, "grad_norm": 2.2582507133483887, "learning_rate": 1.9600000000000002e-05, "loss": 1.2263, "step": 590 }, { "epoch": 0.01, "grad_norm": 9.519213676452637, "learning_rate": 1.9555555555555557e-05, "loss": 1.1475, "step": 600 }, { "epoch": 0.01, "grad_norm": 1.0547032356262207, "learning_rate": 1.9511111111111113e-05, "loss": 1.4278, "step": 610 }, { "epoch": 0.01, "grad_norm": 9.448994636535645, "learning_rate": 1.9466666666666668e-05, "loss": 1.3394, "step": 620 }, { "epoch": 0.01, "grad_norm": 6.21259069442749, "learning_rate": 1.9422222222222223e-05, "loss": 1.206, "step": 630 }, { "epoch": 0.01, "grad_norm": 3.4361753463745117, "learning_rate": 1.9377777777777778e-05, "loss": 1.2472, "step": 640 }, { "epoch": 0.02, "grad_norm": 2.0469722747802734, "learning_rate": 1.9333333333333333e-05, "loss": 1.1283, "step": 650 }, { "epoch": 0.02, "grad_norm": 3.071639060974121, "learning_rate": 1.928888888888889e-05, "loss": 1.2191, "step": 660 }, { "epoch": 0.02, "grad_norm": 3.428431272506714, "learning_rate": 1.9244444444444444e-05, "loss": 1.3863, "step": 670 }, { "epoch": 0.02, "grad_norm": 7.4005656242370605, "learning_rate": 1.9200000000000003e-05, "loss": 1.0794, "step": 680 }, { "epoch": 0.02, "grad_norm": 2.636923313140869, "learning_rate": 1.9155555555555558e-05, "loss": 1.3761, "step": 690 }, { "epoch": 0.02, "grad_norm": 5.060346603393555, "learning_rate": 1.9111111111111113e-05, "loss": 1.4087, "step": 700 }, { "epoch": 0.02, "grad_norm": 6.77576208114624, "learning_rate": 1.9066666666666668e-05, "loss": 1.2442, "step": 710 }, { "epoch": 0.02, "grad_norm": 3.3169615268707275, "learning_rate": 1.9022222222222223e-05, "loss": 1.2498, "step": 720 }, { "epoch": 0.02, "grad_norm": 3.9390623569488525, "learning_rate": 1.897777777777778e-05, "loss": 1.4098, "step": 730 }, { "epoch": 0.02, "grad_norm": 5.928336143493652, "learning_rate": 1.8933333333333334e-05, "loss": 1.235, "step": 740 }, { "epoch": 0.02, "grad_norm": 9.329615592956543, "learning_rate": 1.888888888888889e-05, "loss": 1.2166, "step": 750 }, { "epoch": 0.02, "grad_norm": 6.106197357177734, "learning_rate": 1.8844444444444444e-05, "loss": 1.3728, "step": 760 }, { "epoch": 0.02, "grad_norm": 4.729337215423584, "learning_rate": 1.88e-05, "loss": 1.2131, "step": 770 }, { "epoch": 0.02, "grad_norm": 3.116116762161255, "learning_rate": 1.8755555555555558e-05, "loss": 1.4169, "step": 780 }, { "epoch": 0.02, "grad_norm": 8.869202613830566, "learning_rate": 1.8711111111111113e-05, "loss": 1.2377, "step": 790 }, { "epoch": 0.02, "grad_norm": 4.858852863311768, "learning_rate": 1.866666666666667e-05, "loss": 1.3863, "step": 800 }, { "epoch": 0.02, "grad_norm": 10.197395324707031, "learning_rate": 1.8622222222222224e-05, "loss": 1.1053, "step": 810 }, { "epoch": 0.02, "grad_norm": 2.7740931510925293, "learning_rate": 1.857777777777778e-05, "loss": 1.41, "step": 820 }, { "epoch": 0.02, "grad_norm": 8.306866645812988, "learning_rate": 1.8533333333333334e-05, "loss": 1.2661, "step": 830 }, { "epoch": 0.02, "grad_norm": 5.462616920471191, "learning_rate": 1.848888888888889e-05, "loss": 1.1217, "step": 840 }, { "epoch": 0.02, "grad_norm": 1.6351518630981445, "learning_rate": 1.8444444444444448e-05, "loss": 1.2512, "step": 850 }, { "epoch": 0.02, "grad_norm": 4.930731773376465, "learning_rate": 1.8400000000000003e-05, "loss": 1.3136, "step": 860 }, { "epoch": 0.02, "grad_norm": 6.907737731933594, "learning_rate": 
1.835555555555556e-05, "loss": 1.1728, "step": 870 }, { "epoch": 0.02, "grad_norm": 2.2834455966949463, "learning_rate": 1.8311111111111114e-05, "loss": 1.3566, "step": 880 }, { "epoch": 0.02, "grad_norm": 6.938192367553711, "learning_rate": 1.826666666666667e-05, "loss": 1.2879, "step": 890 }, { "epoch": 0.02, "grad_norm": 3.4376509189605713, "learning_rate": 1.8222222222222224e-05, "loss": 1.1971, "step": 900 }, { "epoch": 0.02, "grad_norm": 4.437848091125488, "learning_rate": 1.817777777777778e-05, "loss": 1.3583, "step": 910 }, { "epoch": 0.02, "grad_norm": 6.843893051147461, "learning_rate": 1.8133333333333335e-05, "loss": 1.2323, "step": 920 }, { "epoch": 0.02, "grad_norm": 3.2527034282684326, "learning_rate": 1.808888888888889e-05, "loss": 1.2691, "step": 930 }, { "epoch": 0.02, "grad_norm": 2.6528022289276123, "learning_rate": 1.8044444444444445e-05, "loss": 1.2525, "step": 940 }, { "epoch": 0.02, "grad_norm": 2.8804101943969727, "learning_rate": 1.8e-05, "loss": 1.3387, "step": 950 }, { "epoch": 0.02, "grad_norm": 9.754573822021484, "learning_rate": 1.7955555555555556e-05, "loss": 1.265, "step": 960 }, { "epoch": 0.02, "grad_norm": 2.54309344291687, "learning_rate": 1.791111111111111e-05, "loss": 1.0998, "step": 970 }, { "epoch": 0.02, "grad_norm": 9.926447868347168, "learning_rate": 1.7866666666666666e-05, "loss": 1.1927, "step": 980 }, { "epoch": 0.02, "grad_norm": 6.4870805740356445, "learning_rate": 1.782222222222222e-05, "loss": 1.0565, "step": 990 }, { "epoch": 0.02, "grad_norm": 3.2183382511138916, "learning_rate": 1.7777777777777777e-05, "loss": 1.2572, "step": 1000 }, { "epoch": 0.02, "eval_loss": 1.019097089767456, "eval_runtime": 66.8787, "eval_samples_per_second": 14.952, "eval_steps_per_second": 14.952, "step": 1000 }, { "epoch": 0.02, "grad_norm": 9.61516284942627, "learning_rate": 1.7733333333333335e-05, "loss": 1.4306, "step": 1010 }, { "epoch": 0.02, "grad_norm": 10.755435943603516, "learning_rate": 1.768888888888889e-05, "loss": 1.3042, "step": 1020 }, { "epoch": 0.02, "grad_norm": 11.427221298217773, "learning_rate": 1.7644444444444446e-05, "loss": 1.3201, "step": 1030 }, { "epoch": 0.02, "grad_norm": 9.656847953796387, "learning_rate": 1.76e-05, "loss": 1.2579, "step": 1040 }, { "epoch": 0.02, "grad_norm": 4.030166149139404, "learning_rate": 1.7555555555555556e-05, "loss": 1.4462, "step": 1050 }, { "epoch": 0.02, "grad_norm": 6.84343957901001, "learning_rate": 1.751111111111111e-05, "loss": 1.4213, "step": 1060 }, { "epoch": 0.02, "grad_norm": 2.4579355716705322, "learning_rate": 1.7466666666666667e-05, "loss": 1.2673, "step": 1070 }, { "epoch": 0.03, "grad_norm": 6.692587375640869, "learning_rate": 1.7422222222222222e-05, "loss": 1.1579, "step": 1080 }, { "epoch": 0.03, "grad_norm": 4.579308032989502, "learning_rate": 1.737777777777778e-05, "loss": 1.3278, "step": 1090 }, { "epoch": 0.03, "grad_norm": 7.755845069885254, "learning_rate": 1.7333333333333336e-05, "loss": 1.3471, "step": 1100 }, { "epoch": 0.03, "grad_norm": 2.065462112426758, "learning_rate": 1.728888888888889e-05, "loss": 1.2811, "step": 1110 }, { "epoch": 0.03, "grad_norm": 1.6691714525222778, "learning_rate": 1.7244444444444446e-05, "loss": 1.4433, "step": 1120 }, { "epoch": 0.03, "grad_norm": 6.87007474899292, "learning_rate": 1.72e-05, "loss": 1.1916, "step": 1130 }, { "epoch": 0.03, "grad_norm": 1.663476586341858, "learning_rate": 1.7155555555555557e-05, "loss": 1.3006, "step": 1140 }, { "epoch": 0.03, "grad_norm": 2.493046522140503, "learning_rate": 1.7111111111111112e-05, "loss": 1.3099, 
"step": 1150 }, { "epoch": 0.03, "grad_norm": 3.2659084796905518, "learning_rate": 1.706666666666667e-05, "loss": 1.1553, "step": 1160 }, { "epoch": 0.03, "grad_norm": 3.07832670211792, "learning_rate": 1.7022222222222226e-05, "loss": 1.2604, "step": 1170 }, { "epoch": 0.03, "grad_norm": 2.0068790912628174, "learning_rate": 1.697777777777778e-05, "loss": 0.9679, "step": 1180 }, { "epoch": 0.03, "grad_norm": 14.066789627075195, "learning_rate": 1.6933333333333336e-05, "loss": 1.2291, "step": 1190 }, { "epoch": 0.03, "grad_norm": 5.058719158172607, "learning_rate": 1.688888888888889e-05, "loss": 1.3224, "step": 1200 }, { "epoch": 0.03, "grad_norm": 6.309176445007324, "learning_rate": 1.6844444444444447e-05, "loss": 1.1184, "step": 1210 }, { "epoch": 0.03, "grad_norm": 9.05258846282959, "learning_rate": 1.6800000000000002e-05, "loss": 1.345, "step": 1220 }, { "epoch": 0.03, "grad_norm": 4.345346927642822, "learning_rate": 1.6755555555555557e-05, "loss": 1.2663, "step": 1230 }, { "epoch": 0.03, "grad_norm": 4.119368076324463, "learning_rate": 1.6711111111111112e-05, "loss": 1.1441, "step": 1240 }, { "epoch": 0.03, "grad_norm": 5.0422563552856445, "learning_rate": 1.6666666666666667e-05, "loss": 1.2199, "step": 1250 }, { "epoch": 0.03, "grad_norm": 11.27535343170166, "learning_rate": 1.6622222222222223e-05, "loss": 1.1876, "step": 1260 }, { "epoch": 0.03, "grad_norm": 6.767408847808838, "learning_rate": 1.6577777777777778e-05, "loss": 1.3928, "step": 1270 }, { "epoch": 0.03, "grad_norm": 6.1706862449646, "learning_rate": 1.6533333333333333e-05, "loss": 1.2268, "step": 1280 }, { "epoch": 0.03, "grad_norm": 6.185644149780273, "learning_rate": 1.648888888888889e-05, "loss": 1.3086, "step": 1290 }, { "epoch": 0.03, "grad_norm": 9.57487678527832, "learning_rate": 1.6444444444444444e-05, "loss": 1.2079, "step": 1300 }, { "epoch": 0.03, "grad_norm": 4.104099273681641, "learning_rate": 1.64e-05, "loss": 1.2012, "step": 1310 }, { "epoch": 0.03, "grad_norm": 6.703191757202148, "learning_rate": 1.6355555555555557e-05, "loss": 1.2567, "step": 1320 }, { "epoch": 0.03, "grad_norm": 4.194169521331787, "learning_rate": 1.6311111111111113e-05, "loss": 1.3687, "step": 1330 }, { "epoch": 0.03, "grad_norm": 4.2333269119262695, "learning_rate": 1.6266666666666668e-05, "loss": 1.1547, "step": 1340 }, { "epoch": 0.03, "grad_norm": 3.007706880569458, "learning_rate": 1.6222222222222223e-05, "loss": 1.2031, "step": 1350 }, { "epoch": 0.03, "grad_norm": 22.12256622314453, "learning_rate": 1.617777777777778e-05, "loss": 1.1117, "step": 1360 }, { "epoch": 0.03, "grad_norm": 5.559662818908691, "learning_rate": 1.6133333333333334e-05, "loss": 1.0796, "step": 1370 }, { "epoch": 0.03, "grad_norm": 4.618852615356445, "learning_rate": 1.608888888888889e-05, "loss": 1.3308, "step": 1380 }, { "epoch": 0.03, "grad_norm": 3.6262142658233643, "learning_rate": 1.6044444444444444e-05, "loss": 1.1913, "step": 1390 }, { "epoch": 0.03, "grad_norm": 8.0167875289917, "learning_rate": 1.6000000000000003e-05, "loss": 1.341, "step": 1400 }, { "epoch": 0.03, "grad_norm": 5.142825603485107, "learning_rate": 1.5955555555555558e-05, "loss": 1.03, "step": 1410 }, { "epoch": 0.03, "grad_norm": 3.186448335647583, "learning_rate": 1.5911111111111113e-05, "loss": 1.2788, "step": 1420 }, { "epoch": 0.03, "grad_norm": 8.11291217803955, "learning_rate": 1.586666666666667e-05, "loss": 1.4419, "step": 1430 }, { "epoch": 0.03, "grad_norm": 6.920192718505859, "learning_rate": 1.5822222222222224e-05, "loss": 1.3127, "step": 1440 }, { "epoch": 0.03, 
"grad_norm": 4.357940196990967, "learning_rate": 1.577777777777778e-05, "loss": 1.3184, "step": 1450 }, { "epoch": 0.03, "grad_norm": 8.83785343170166, "learning_rate": 1.5733333333333334e-05, "loss": 1.382, "step": 1460 }, { "epoch": 0.03, "grad_norm": 2.565369129180908, "learning_rate": 1.5688888888888893e-05, "loss": 1.4148, "step": 1470 }, { "epoch": 0.03, "grad_norm": 3.819150686264038, "learning_rate": 1.5644444444444448e-05, "loss": 1.4428, "step": 1480 }, { "epoch": 0.03, "grad_norm": 7.107534408569336, "learning_rate": 1.5600000000000003e-05, "loss": 1.2188, "step": 1490 }, { "epoch": 0.03, "grad_norm": 9.566027641296387, "learning_rate": 1.555555555555556e-05, "loss": 1.2406, "step": 1500 }, { "epoch": 0.03, "eval_loss": 1.0252078771591187, "eval_runtime": 66.9753, "eval_samples_per_second": 14.931, "eval_steps_per_second": 14.931, "step": 1500 }, { "epoch": 0.03, "grad_norm": 8.054425239562988, "learning_rate": 1.5511111111111114e-05, "loss": 1.2767, "step": 1510 }, { "epoch": 0.04, "grad_norm": 2.8494319915771484, "learning_rate": 1.546666666666667e-05, "loss": 1.1632, "step": 1520 }, { "epoch": 0.04, "grad_norm": 2.9305810928344727, "learning_rate": 1.5422222222222224e-05, "loss": 1.2075, "step": 1530 }, { "epoch": 0.04, "grad_norm": 11.442655563354492, "learning_rate": 1.537777777777778e-05, "loss": 1.0231, "step": 1540 }, { "epoch": 0.04, "grad_norm": 1.735372543334961, "learning_rate": 1.5333333333333334e-05, "loss": 1.282, "step": 1550 }, { "epoch": 0.04, "grad_norm": 11.126168251037598, "learning_rate": 1.528888888888889e-05, "loss": 1.2106, "step": 1560 }, { "epoch": 0.04, "grad_norm": 5.569930076599121, "learning_rate": 1.5244444444444447e-05, "loss": 1.2016, "step": 1570 }, { "epoch": 0.04, "grad_norm": 4.002272605895996, "learning_rate": 1.5200000000000002e-05, "loss": 1.206, "step": 1580 }, { "epoch": 0.04, "grad_norm": 11.26425838470459, "learning_rate": 1.5155555555555557e-05, "loss": 1.3145, "step": 1590 }, { "epoch": 0.04, "grad_norm": 6.265772819519043, "learning_rate": 1.5111111111111112e-05, "loss": 1.105, "step": 1600 }, { "epoch": 0.04, "grad_norm": 6.139275550842285, "learning_rate": 1.5066666666666668e-05, "loss": 1.2016, "step": 1610 }, { "epoch": 0.04, "grad_norm": 4.753066539764404, "learning_rate": 1.5022222222222223e-05, "loss": 1.3433, "step": 1620 }, { "epoch": 0.04, "grad_norm": 8.761942863464355, "learning_rate": 1.497777777777778e-05, "loss": 1.3153, "step": 1630 }, { "epoch": 0.04, "grad_norm": 3.3448381423950195, "learning_rate": 1.4933333333333335e-05, "loss": 1.24, "step": 1640 }, { "epoch": 0.04, "grad_norm": 2.818711519241333, "learning_rate": 1.488888888888889e-05, "loss": 1.2067, "step": 1650 }, { "epoch": 0.04, "grad_norm": 11.795276641845703, "learning_rate": 1.4844444444444445e-05, "loss": 1.3926, "step": 1660 }, { "epoch": 0.04, "grad_norm": 4.47786283493042, "learning_rate": 1.48e-05, "loss": 1.5349, "step": 1670 }, { "epoch": 0.04, "grad_norm": 4.560647010803223, "learning_rate": 1.4755555555555556e-05, "loss": 1.2318, "step": 1680 }, { "epoch": 0.04, "grad_norm": 5.564818382263184, "learning_rate": 1.4711111111111111e-05, "loss": 1.2044, "step": 1690 }, { "epoch": 0.04, "grad_norm": 1.4382556676864624, "learning_rate": 1.4666666666666666e-05, "loss": 1.3367, "step": 1700 }, { "epoch": 0.04, "grad_norm": 2.7976467609405518, "learning_rate": 1.4622222222222225e-05, "loss": 1.2304, "step": 1710 }, { "epoch": 0.04, "grad_norm": 3.021933078765869, "learning_rate": 1.457777777777778e-05, "loss": 1.1339, "step": 1720 }, { "epoch": 0.04, 
"grad_norm": 1.9026000499725342, "learning_rate": 1.4533333333333335e-05, "loss": 1.3733, "step": 1730 }, { "epoch": 0.04, "grad_norm": 4.237542152404785, "learning_rate": 1.448888888888889e-05, "loss": 1.0956, "step": 1740 }, { "epoch": 0.04, "grad_norm": 5.288679599761963, "learning_rate": 1.4444444444444446e-05, "loss": 1.0542, "step": 1750 }, { "epoch": 0.04, "grad_norm": 4.956302642822266, "learning_rate": 1.4400000000000001e-05, "loss": 1.326, "step": 1760 }, { "epoch": 0.04, "grad_norm": 4.096738338470459, "learning_rate": 1.4355555555555556e-05, "loss": 1.1285, "step": 1770 }, { "epoch": 0.04, "grad_norm": 7.757137298583984, "learning_rate": 1.4311111111111111e-05, "loss": 1.3607, "step": 1780 }, { "epoch": 0.04, "grad_norm": 5.415702819824219, "learning_rate": 1.4266666666666668e-05, "loss": 1.0, "step": 1790 }, { "epoch": 0.04, "grad_norm": 9.445133209228516, "learning_rate": 1.4222222222222224e-05, "loss": 1.2187, "step": 1800 }, { "epoch": 0.04, "grad_norm": 5.406456470489502, "learning_rate": 1.4177777777777779e-05, "loss": 1.3459, "step": 1810 }, { "epoch": 0.04, "grad_norm": 1.619770884513855, "learning_rate": 1.4133333333333334e-05, "loss": 1.2368, "step": 1820 }, { "epoch": 0.04, "grad_norm": 4.307522296905518, "learning_rate": 1.408888888888889e-05, "loss": 1.0995, "step": 1830 }, { "epoch": 0.04, "grad_norm": 6.3472185134887695, "learning_rate": 1.4044444444444445e-05, "loss": 1.2563, "step": 1840 }, { "epoch": 0.04, "grad_norm": 2.04168701171875, "learning_rate": 1.4e-05, "loss": 1.2858, "step": 1850 }, { "epoch": 0.04, "grad_norm": 2.180267810821533, "learning_rate": 1.3955555555555558e-05, "loss": 1.123, "step": 1860 }, { "epoch": 0.04, "grad_norm": 2.560042381286621, "learning_rate": 1.3911111111111114e-05, "loss": 1.3962, "step": 1870 }, { "epoch": 0.04, "grad_norm": 5.683982849121094, "learning_rate": 1.3866666666666669e-05, "loss": 1.0984, "step": 1880 }, { "epoch": 0.04, "grad_norm": 3.1994190216064453, "learning_rate": 1.3822222222222224e-05, "loss": 1.1746, "step": 1890 }, { "epoch": 0.04, "grad_norm": 8.851926803588867, "learning_rate": 1.377777777777778e-05, "loss": 1.4551, "step": 1900 }, { "epoch": 0.04, "grad_norm": 2.670786142349243, "learning_rate": 1.3733333333333335e-05, "loss": 1.2446, "step": 1910 }, { "epoch": 0.04, "grad_norm": 5.134795188903809, "learning_rate": 1.368888888888889e-05, "loss": 1.2128, "step": 1920 }, { "epoch": 0.04, "grad_norm": 4.486630916595459, "learning_rate": 1.3644444444444445e-05, "loss": 1.1951, "step": 1930 }, { "epoch": 0.04, "grad_norm": 4.025615215301514, "learning_rate": 1.3600000000000002e-05, "loss": 1.3628, "step": 1940 }, { "epoch": 0.05, "grad_norm": 4.232900619506836, "learning_rate": 1.3555555555555557e-05, "loss": 1.3367, "step": 1950 }, { "epoch": 0.05, "grad_norm": 1.6469148397445679, "learning_rate": 1.3511111111111112e-05, "loss": 1.2144, "step": 1960 }, { "epoch": 0.05, "grad_norm": 8.560945510864258, "learning_rate": 1.3466666666666668e-05, "loss": 1.4125, "step": 1970 }, { "epoch": 0.05, "grad_norm": 3.06697416305542, "learning_rate": 1.3422222222222223e-05, "loss": 1.1769, "step": 1980 }, { "epoch": 0.05, "grad_norm": 2.721186399459839, "learning_rate": 1.3377777777777778e-05, "loss": 1.3266, "step": 1990 }, { "epoch": 0.05, "grad_norm": 4.427524566650391, "learning_rate": 1.3333333333333333e-05, "loss": 1.1697, "step": 2000 }, { "epoch": 0.05, "eval_loss": 1.042366623878479, "eval_runtime": 67.0143, "eval_samples_per_second": 14.922, "eval_steps_per_second": 14.922, "step": 2000 }, { "epoch": 0.05, 
"grad_norm": 1.322222113609314, "learning_rate": 1.3288888888888889e-05, "loss": 1.3173, "step": 2010 }, { "epoch": 0.05, "grad_norm": 5.731060028076172, "learning_rate": 1.3244444444444447e-05, "loss": 1.2578, "step": 2020 }, { "epoch": 0.05, "grad_norm": 0.8411041498184204, "learning_rate": 1.3200000000000002e-05, "loss": 1.1796, "step": 2030 }, { "epoch": 0.05, "grad_norm": 5.4170026779174805, "learning_rate": 1.3155555555555558e-05, "loss": 1.3608, "step": 2040 }, { "epoch": 0.05, "grad_norm": 4.58616304397583, "learning_rate": 1.3111111111111113e-05, "loss": 1.2726, "step": 2050 }, { "epoch": 0.05, "grad_norm": 3.936751365661621, "learning_rate": 1.3066666666666668e-05, "loss": 1.3327, "step": 2060 }, { "epoch": 0.05, "grad_norm": 4.1074042320251465, "learning_rate": 1.3022222222222223e-05, "loss": 1.2446, "step": 2070 }, { "epoch": 0.05, "grad_norm": 2.657953977584839, "learning_rate": 1.2977777777777779e-05, "loss": 0.9989, "step": 2080 }, { "epoch": 0.05, "grad_norm": 5.181591987609863, "learning_rate": 1.2933333333333334e-05, "loss": 1.1732, "step": 2090 }, { "epoch": 0.05, "grad_norm": 5.981390953063965, "learning_rate": 1.288888888888889e-05, "loss": 1.1463, "step": 2100 }, { "epoch": 0.05, "grad_norm": 6.05696964263916, "learning_rate": 1.2844444444444446e-05, "loss": 1.2061, "step": 2110 }, { "epoch": 0.05, "grad_norm": 6.697308540344238, "learning_rate": 1.2800000000000001e-05, "loss": 1.3052, "step": 2120 }, { "epoch": 0.05, "grad_norm": 14.833478927612305, "learning_rate": 1.2755555555555556e-05, "loss": 1.1987, "step": 2130 }, { "epoch": 0.05, "grad_norm": 4.479732990264893, "learning_rate": 1.2711111111111112e-05, "loss": 1.337, "step": 2140 }, { "epoch": 0.05, "grad_norm": 3.746943235397339, "learning_rate": 1.2666666666666667e-05, "loss": 1.3082, "step": 2150 }, { "epoch": 0.05, "grad_norm": 4.7201828956604, "learning_rate": 1.2622222222222222e-05, "loss": 1.1393, "step": 2160 }, { "epoch": 0.05, "grad_norm": 5.082671642303467, "learning_rate": 1.257777777777778e-05, "loss": 1.3047, "step": 2170 }, { "epoch": 0.05, "grad_norm": 3.5268049240112305, "learning_rate": 1.2533333333333336e-05, "loss": 1.1255, "step": 2180 }, { "epoch": 0.05, "grad_norm": 3.8856544494628906, "learning_rate": 1.2488888888888891e-05, "loss": 1.4022, "step": 2190 }, { "epoch": 0.05, "grad_norm": 5.552175521850586, "learning_rate": 1.2444444444444446e-05, "loss": 1.3444, "step": 2200 }, { "epoch": 0.05, "grad_norm": 2.772660255432129, "learning_rate": 1.2400000000000002e-05, "loss": 1.4274, "step": 2210 }, { "epoch": 0.05, "grad_norm": 6.9137773513793945, "learning_rate": 1.2355555555555557e-05, "loss": 1.1356, "step": 2220 }, { "epoch": 0.05, "grad_norm": 2.5068724155426025, "learning_rate": 1.2311111111111112e-05, "loss": 1.25, "step": 2230 }, { "epoch": 0.05, "grad_norm": 0.8587338924407959, "learning_rate": 1.2266666666666667e-05, "loss": 0.9182, "step": 2240 }, { "epoch": 0.05, "grad_norm": 5.356514930725098, "learning_rate": 1.2222222222222224e-05, "loss": 1.2882, "step": 2250 }, { "epoch": 0.05, "grad_norm": 8.728981018066406, "learning_rate": 1.217777777777778e-05, "loss": 1.2974, "step": 2260 }, { "epoch": 0.05, "grad_norm": 2.5229454040527344, "learning_rate": 1.2133333333333335e-05, "loss": 1.2368, "step": 2270 }, { "epoch": 0.05, "grad_norm": 2.760233163833618, "learning_rate": 1.208888888888889e-05, "loss": 1.0504, "step": 2280 }, { "epoch": 0.05, "grad_norm": 1.7355753183364868, "learning_rate": 1.2044444444444445e-05, "loss": 1.3547, "step": 2290 }, { "epoch": 0.05, "grad_norm": 
2.2375967502593994, "learning_rate": 1.2e-05, "loss": 1.3058, "step": 2300 }, { "epoch": 0.05, "grad_norm": 4.5727386474609375, "learning_rate": 1.1955555555555556e-05, "loss": 1.3097, "step": 2310 }, { "epoch": 0.05, "grad_norm": 7.387458801269531, "learning_rate": 1.191111111111111e-05, "loss": 1.2058, "step": 2320 }, { "epoch": 0.05, "grad_norm": 3.593174934387207, "learning_rate": 1.186666666666667e-05, "loss": 1.1711, "step": 2330 }, { "epoch": 0.05, "grad_norm": 3.3077690601348877, "learning_rate": 1.1822222222222225e-05, "loss": 1.1175, "step": 2340 }, { "epoch": 0.05, "grad_norm": 4.785140514373779, "learning_rate": 1.177777777777778e-05, "loss": 1.2618, "step": 2350 }, { "epoch": 0.05, "grad_norm": 1.8957252502441406, "learning_rate": 1.1733333333333335e-05, "loss": 1.1641, "step": 2360 }, { "epoch": 0.05, "grad_norm": 7.033701419830322, "learning_rate": 1.168888888888889e-05, "loss": 1.3563, "step": 2370 }, { "epoch": 0.06, "grad_norm": 2.6566953659057617, "learning_rate": 1.1644444444444446e-05, "loss": 1.3673, "step": 2380 }, { "epoch": 0.06, "grad_norm": 10.329083442687988, "learning_rate": 1.16e-05, "loss": 1.2953, "step": 2390 }, { "epoch": 0.06, "grad_norm": 3.5499980449676514, "learning_rate": 1.1555555555555556e-05, "loss": 1.3136, "step": 2400 }, { "epoch": 0.06, "grad_norm": 2.051661968231201, "learning_rate": 1.1511111111111113e-05, "loss": 1.3235, "step": 2410 }, { "epoch": 0.06, "grad_norm": 2.9961462020874023, "learning_rate": 1.1466666666666668e-05, "loss": 1.1637, "step": 2420 }, { "epoch": 0.06, "grad_norm": 8.53834056854248, "learning_rate": 1.1422222222222223e-05, "loss": 1.1066, "step": 2430 }, { "epoch": 0.06, "grad_norm": 5.684910297393799, "learning_rate": 1.1377777777777779e-05, "loss": 1.4476, "step": 2440 }, { "epoch": 0.06, "grad_norm": 16.79145622253418, "learning_rate": 1.1333333333333334e-05, "loss": 1.3978, "step": 2450 }, { "epoch": 0.06, "grad_norm": 3.3043346405029297, "learning_rate": 1.1288888888888889e-05, "loss": 1.1598, "step": 2460 }, { "epoch": 0.06, "grad_norm": 3.7516391277313232, "learning_rate": 1.1244444444444444e-05, "loss": 1.3534, "step": 2470 }, { "epoch": 0.06, "grad_norm": 2.0643539428710938, "learning_rate": 1.1200000000000001e-05, "loss": 1.2924, "step": 2480 }, { "epoch": 0.06, "grad_norm": 4.207937717437744, "learning_rate": 1.1155555555555556e-05, "loss": 1.2699, "step": 2490 }, { "epoch": 0.06, "grad_norm": 3.3195173740386963, "learning_rate": 1.1111111111111113e-05, "loss": 1.379, "step": 2500 }, { "epoch": 0.06, "eval_loss": 0.986103355884552, "eval_runtime": 67.0933, "eval_samples_per_second": 14.905, "eval_steps_per_second": 14.905, "step": 2500 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2500, "total_flos": 4.025531498496e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }
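
The object above is the standard Hugging Face Trainer `trainer_state.json` layout: `log_history` holds one entry per logging step (with `loss`) and one per evaluation (with `eval_loss`), while `best_metric` and `best_model_checkpoint` record the best evaluation seen so far. A minimal sketch of pulling the loss curves back out of this file; the path below is illustrative (it assumes the file sits inside the `checkpoint-2500` directory named in `best_model_checkpoint`), not something stated by the log itself:

```python
# Sketch, not a canonical tool: parse the trainer_state.json above and list its
# training-loss and eval-loss series. The file path is an assumption.
import json

path = "runs/deepseek_lora_20240422-095359/checkpoint-2500/trainer_state.json"
with open(path) as f:
    state = json.load(f)

# Training log entries carry "loss"; evaluation entries carry "eval_loss".
train_points = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_points = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"best eval loss {state['best_metric']:.4f} at {state['best_model_checkpoint']}")
print(f"{len(train_points)} training-loss points, {len(eval_points)} eval points")
for step, loss in eval_points:
    print(f"step {step:>5}: eval_loss = {loss:.4f}")
```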