{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.581755593803787, "eval_steps": 500, "global_step": 12000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 1.281562089920044, "learning_rate": 4e-05, "loss": 0.5044, "step": 100 }, { "epoch": 0.04, "grad_norm": 1.3552560806274414, "learning_rate": 8e-05, "loss": 0.4867, "step": 200 }, { "epoch": 0.06, "grad_norm": 0.8318453431129456, "learning_rate": 0.00012, "loss": 0.4359, "step": 300 }, { "epoch": 0.09, "grad_norm": 1.3478361368179321, "learning_rate": 0.00016, "loss": 0.4475, "step": 400 }, { "epoch": 0.11, "grad_norm": 1.1437219381332397, "learning_rate": 0.0002, "loss": 0.4524, "step": 500 }, { "epoch": 0.11, "eval_loss": 0.3725915253162384, "eval_runtime": 90.4733, "eval_samples_per_second": 11.053, "eval_steps_per_second": 2.763, "step": 500 }, { "epoch": 0.13, "grad_norm": 0.8374431133270264, "learning_rate": 0.00019897435897435898, "loss": 0.4569, "step": 600 }, { "epoch": 0.15, "grad_norm": 1.6386727094650269, "learning_rate": 0.00019794871794871796, "loss": 0.4584, "step": 700 }, { "epoch": 0.17, "grad_norm": 1.4089267253875732, "learning_rate": 0.00019692307692307696, "loss": 0.4625, "step": 800 }, { "epoch": 0.19, "grad_norm": 2.269509792327881, "learning_rate": 0.0001958974358974359, "loss": 0.464, "step": 900 }, { "epoch": 0.22, "grad_norm": 1.0896354913711548, "learning_rate": 0.00019487179487179487, "loss": 0.4807, "step": 1000 }, { "epoch": 0.22, "eval_loss": 0.3314511179924011, "eval_runtime": 90.3867, "eval_samples_per_second": 11.064, "eval_steps_per_second": 2.766, "step": 1000 }, { "epoch": 0.24, "grad_norm": 1.0933148860931396, "learning_rate": 0.00019384615384615385, "loss": 0.4032, "step": 1100 }, { "epoch": 0.26, "grad_norm": 2.555145740509033, "learning_rate": 0.00019282051282051282, "loss": 0.4353, "step": 1200 }, { "epoch": 0.28, "grad_norm": 2.245523452758789, "learning_rate": 0.00019179487179487182, "loss": 0.409, "step": 1300 }, { "epoch": 0.3, "grad_norm": 2.084887981414795, "learning_rate": 0.0001907692307692308, "loss": 0.4214, "step": 1400 }, { "epoch": 0.32, "grad_norm": 1.2279912233352661, "learning_rate": 0.00018974358974358974, "loss": 0.4507, "step": 1500 }, { "epoch": 0.32, "eval_loss": 0.312484472990036, "eval_runtime": 90.4149, "eval_samples_per_second": 11.06, "eval_steps_per_second": 2.765, "step": 1500 }, { "epoch": 0.34, "grad_norm": 4.321805477142334, "learning_rate": 0.0001887179487179487, "loss": 0.4523, "step": 1600 }, { "epoch": 0.37, "grad_norm": 1.376891016960144, "learning_rate": 0.0001876923076923077, "loss": 0.441, "step": 1700 }, { "epoch": 0.39, "grad_norm": 2.077620267868042, "learning_rate": 0.0001866666666666667, "loss": 0.4392, "step": 1800 }, { "epoch": 0.41, "grad_norm": 2.546964168548584, "learning_rate": 0.00018564102564102566, "loss": 0.4749, "step": 1900 }, { "epoch": 0.43, "grad_norm": 2.035524845123291, "learning_rate": 0.00018461538461538463, "loss": 0.4116, "step": 2000 }, { "epoch": 0.43, "eval_loss": 0.2960582375526428, "eval_runtime": 90.8091, "eval_samples_per_second": 11.012, "eval_steps_per_second": 2.753, "step": 2000 }, { "epoch": 0.45, "grad_norm": 1.3167320489883423, "learning_rate": 0.00018358974358974358, "loss": 0.4135, "step": 2100 }, { "epoch": 0.47, "grad_norm": 2.908482074737549, "learning_rate": 0.00018256410256410258, "loss": 0.4092, "step": 2200 }, { "epoch": 0.49, "grad_norm": 2.8161234855651855, "learning_rate": 0.00018153846153846155, "loss": 0.4364, "step": 2300 }, { "epoch": 0.52, "grad_norm": 1.0435675382614136, "learning_rate": 0.00018051282051282052, "loss": 0.4258, "step": 2400 }, { "epoch": 0.54, "grad_norm": 0.7152374982833862, "learning_rate": 0.0001794871794871795, "loss": 0.4208, "step": 2500 }, { "epoch": 0.54, "eval_loss": 0.3006269633769989, "eval_runtime": 90.5246, "eval_samples_per_second": 11.047, "eval_steps_per_second": 2.762, "step": 2500 }, { "epoch": 0.56, "grad_norm": 1.79283607006073, "learning_rate": 0.00017846153846153847, "loss": 0.4209, "step": 2600 }, { "epoch": 0.58, "grad_norm": 1.5817480087280273, "learning_rate": 0.00017743589743589744, "loss": 0.415, "step": 2700 }, { "epoch": 0.6, "grad_norm": 2.306807279586792, "learning_rate": 0.00017641025641025642, "loss": 0.4231, "step": 2800 }, { "epoch": 0.62, "grad_norm": 1.0432254076004028, "learning_rate": 0.0001753846153846154, "loss": 0.38, "step": 2900 }, { "epoch": 0.65, "grad_norm": 1.0641353130340576, "learning_rate": 0.00017435897435897436, "loss": 0.4671, "step": 3000 }, { "epoch": 0.65, "eval_loss": 0.3091895282268524, "eval_runtime": 90.8377, "eval_samples_per_second": 11.009, "eval_steps_per_second": 2.752, "step": 3000 }, { "epoch": 0.67, "grad_norm": 2.6519484519958496, "learning_rate": 0.00017333333333333334, "loss": 0.4148, "step": 3100 }, { "epoch": 0.69, "grad_norm": 0.851606547832489, "learning_rate": 0.00017230769230769234, "loss": 0.37, "step": 3200 }, { "epoch": 0.71, "grad_norm": 1.1799293756484985, "learning_rate": 0.0001712923076923077, "loss": 0.3687, "step": 3300 }, { "epoch": 0.73, "grad_norm": 1.2787513732910156, "learning_rate": 0.0001702666666666667, "loss": 0.3997, "step": 3400 }, { "epoch": 0.75, "grad_norm": 1.2793489694595337, "learning_rate": 0.00016924102564102564, "loss": 0.3936, "step": 3500 }, { "epoch": 0.75, "eval_loss": 0.2866547703742981, "eval_runtime": 90.8477, "eval_samples_per_second": 11.007, "eval_steps_per_second": 2.752, "step": 3500 }, { "epoch": 0.77, "grad_norm": 0.7189435362815857, "learning_rate": 0.0001682153846153846, "loss": 0.4191, "step": 3600 }, { "epoch": 0.8, "grad_norm": 0.9801158308982849, "learning_rate": 0.0001671897435897436, "loss": 0.3978, "step": 3700 }, { "epoch": 0.82, "grad_norm": 1.41176438331604, "learning_rate": 0.0001661641025641026, "loss": 0.3762, "step": 3800 }, { "epoch": 0.84, "grad_norm": 0.9064908027648926, "learning_rate": 0.00016513846153846156, "loss": 0.3746, "step": 3900 }, { "epoch": 0.86, "grad_norm": 1.3439726829528809, "learning_rate": 0.00016411282051282053, "loss": 0.3823, "step": 4000 }, { "epoch": 0.86, "eval_loss": 0.2893534004688263, "eval_runtime": 90.8714, "eval_samples_per_second": 11.005, "eval_steps_per_second": 2.751, "step": 4000 }, { "epoch": 0.88, "grad_norm": 1.6646331548690796, "learning_rate": 0.00016308717948717948, "loss": 0.4029, "step": 4100 }, { "epoch": 0.9, "grad_norm": 0.9874048829078674, "learning_rate": 0.00016206153846153845, "loss": 0.3933, "step": 4200 }, { "epoch": 0.93, "grad_norm": 1.0010713338851929, "learning_rate": 0.00016103589743589745, "loss": 0.41, "step": 4300 }, { "epoch": 0.95, "grad_norm": 0.9607357978820801, "learning_rate": 0.00016001025641025642, "loss": 0.3659, "step": 4400 }, { "epoch": 0.97, "grad_norm": 1.3652217388153076, "learning_rate": 0.0001589846153846154, "loss": 0.3794, "step": 4500 }, { "epoch": 0.97, "eval_loss": 0.2667659819126129, "eval_runtime": 90.9201, "eval_samples_per_second": 10.999, "eval_steps_per_second": 2.75, "step": 4500 }, { "epoch": 0.99, "grad_norm": 1.056810736656189, "learning_rate": 0.00015795897435897437, "loss": 0.3878, "step": 4600 }, { "epoch": 1.01, "grad_norm": 0.7765600085258484, "learning_rate": 0.00015693333333333334, "loss": 0.3396, "step": 4700 }, { "epoch": 1.03, "grad_norm": 1.0664465427398682, "learning_rate": 0.00015590769230769232, "loss": 0.2813, "step": 4800 }, { "epoch": 1.05, "grad_norm": 0.6342141628265381, "learning_rate": 0.0001548820512820513, "loss": 0.309, "step": 4900 }, { "epoch": 1.08, "grad_norm": 1.1380507946014404, "learning_rate": 0.00015385641025641026, "loss": 0.2888, "step": 5000 }, { "epoch": 1.08, "eval_loss": 0.27630186080932617, "eval_runtime": 90.9762, "eval_samples_per_second": 10.992, "eval_steps_per_second": 2.748, "step": 5000 }, { "epoch": 1.1, "grad_norm": 1.6043438911437988, "learning_rate": 0.00015283076923076924, "loss": 0.3836, "step": 5100 }, { "epoch": 1.12, "grad_norm": 0.2961331307888031, "learning_rate": 0.0001518051282051282, "loss": 0.2964, "step": 5200 }, { "epoch": 1.14, "grad_norm": 1.0106085538864136, "learning_rate": 0.00015077948717948718, "loss": 0.2837, "step": 5300 }, { "epoch": 1.16, "grad_norm": 1.2187350988388062, "learning_rate": 0.00014975384615384616, "loss": 0.3093, "step": 5400 }, { "epoch": 1.18, "grad_norm": 1.4018324613571167, "learning_rate": 0.00014872820512820513, "loss": 0.3029, "step": 5500 }, { "epoch": 1.18, "eval_loss": 0.2620677351951599, "eval_runtime": 90.8592, "eval_samples_per_second": 11.006, "eval_steps_per_second": 2.752, "step": 5500 }, { "epoch": 1.2, "grad_norm": 0.8634820580482483, "learning_rate": 0.00014771282051282051, "loss": 0.3046, "step": 5600 }, { "epoch": 1.23, "grad_norm": 0.6309552788734436, "learning_rate": 0.0001466871794871795, "loss": 0.3041, "step": 5700 }, { "epoch": 1.25, "grad_norm": 1.2985124588012695, "learning_rate": 0.00014566153846153846, "loss": 0.312, "step": 5800 }, { "epoch": 1.27, "grad_norm": 0.9580160975456238, "learning_rate": 0.00014463589743589746, "loss": 0.3306, "step": 5900 }, { "epoch": 1.29, "grad_norm": 1.0316152572631836, "learning_rate": 0.00014361025641025643, "loss": 0.2979, "step": 6000 }, { "epoch": 1.29, "eval_loss": 0.26443469524383545, "eval_runtime": 90.9026, "eval_samples_per_second": 11.001, "eval_steps_per_second": 2.75, "step": 6000 }, { "epoch": 1.31, "grad_norm": 1.9922102689743042, "learning_rate": 0.00014258461538461538, "loss": 0.3105, "step": 6100 }, { "epoch": 1.33, "grad_norm": 1.274574875831604, "learning_rate": 0.00014155897435897435, "loss": 0.3174, "step": 6200 }, { "epoch": 1.36, "grad_norm": 1.1152174472808838, "learning_rate": 0.00014053333333333335, "loss": 0.3094, "step": 6300 }, { "epoch": 1.38, "grad_norm": 1.4428844451904297, "learning_rate": 0.00013950769230769233, "loss": 0.3136, "step": 6400 }, { "epoch": 1.4, "grad_norm": 1.418609380722046, "learning_rate": 0.0001384820512820513, "loss": 0.3425, "step": 6500 }, { "epoch": 1.4, "eval_loss": 0.299164742231369, "eval_runtime": 90.9837, "eval_samples_per_second": 10.991, "eval_steps_per_second": 2.748, "step": 6500 }, { "epoch": 1.42, "grad_norm": 1.309278130531311, "learning_rate": 0.00013745641025641027, "loss": 0.3318, "step": 6600 }, { "epoch": 1.44, "grad_norm": 1.0984652042388916, "learning_rate": 0.00013643076923076922, "loss": 0.3221, "step": 6700 }, { "epoch": 1.46, "grad_norm": 1.1776598691940308, "learning_rate": 0.00013540512820512822, "loss": 0.3151, "step": 6800 }, { "epoch": 1.48, "grad_norm": 1.1536751985549927, "learning_rate": 0.0001343794871794872, "loss": 0.3471, "step": 6900 }, { "epoch": 1.51, "grad_norm": 1.5731265544891357, "learning_rate": 0.00013335384615384616, "loss": 0.3342, "step": 7000 }, { "epoch": 1.51, "eval_loss": 0.25467580556869507, "eval_runtime": 90.9083, "eval_samples_per_second": 11.0, "eval_steps_per_second": 2.75, "step": 7000 }, { "epoch": 1.53, "grad_norm": 1.5424981117248535, "learning_rate": 0.00013233846153846155, "loss": 0.3337, "step": 7100 }, { "epoch": 1.55, "grad_norm": 1.2443273067474365, "learning_rate": 0.00013131282051282052, "loss": 0.2955, "step": 7200 }, { "epoch": 1.57, "grad_norm": 0.7886701822280884, "learning_rate": 0.0001302871794871795, "loss": 0.3017, "step": 7300 }, { "epoch": 1.59, "grad_norm": 1.6013621091842651, "learning_rate": 0.00012926153846153847, "loss": 0.2976, "step": 7400 }, { "epoch": 1.61, "grad_norm": 1.495753288269043, "learning_rate": 0.00012823589743589744, "loss": 0.3086, "step": 7500 }, { "epoch": 1.61, "eval_loss": 0.25845155119895935, "eval_runtime": 90.8943, "eval_samples_per_second": 11.002, "eval_steps_per_second": 2.75, "step": 7500 }, { "epoch": 1.64, "grad_norm": 1.181015968322754, "learning_rate": 0.00012721025641025641, "loss": 0.2657, "step": 7600 }, { "epoch": 1.66, "grad_norm": 1.2074403762817383, "learning_rate": 0.0001261846153846154, "loss": 0.3149, "step": 7700 }, { "epoch": 1.68, "grad_norm": 1.216676950454712, "learning_rate": 0.00012515897435897436, "loss": 0.2999, "step": 7800 }, { "epoch": 1.7, "grad_norm": 1.8093730211257935, "learning_rate": 0.00012413333333333333, "loss": 0.3144, "step": 7900 }, { "epoch": 1.72, "grad_norm": 3.7325637340545654, "learning_rate": 0.00012310769230769233, "loss": 0.3326, "step": 8000 }, { "epoch": 1.72, "eval_loss": 0.23783066868782043, "eval_runtime": 90.8877, "eval_samples_per_second": 11.003, "eval_steps_per_second": 2.751, "step": 8000 }, { "epoch": 1.74, "grad_norm": 0.6969020366668701, "learning_rate": 0.00012208205128205128, "loss": 0.2935, "step": 8100 }, { "epoch": 1.76, "grad_norm": 2.1927125453948975, "learning_rate": 0.00012105641025641025, "loss": 0.3091, "step": 8200 }, { "epoch": 1.79, "grad_norm": 1.8521186113357544, "learning_rate": 0.00012003076923076924, "loss": 0.2517, "step": 8300 }, { "epoch": 1.81, "grad_norm": 1.5349504947662354, "learning_rate": 0.00011900512820512821, "loss": 0.2794, "step": 8400 }, { "epoch": 1.83, "grad_norm": 0.6325456500053406, "learning_rate": 0.00011797948717948718, "loss": 0.2912, "step": 8500 }, { "epoch": 1.83, "eval_loss": 0.23375801742076874, "eval_runtime": 90.8478, "eval_samples_per_second": 11.007, "eval_steps_per_second": 2.752, "step": 8500 }, { "epoch": 1.85, "grad_norm": 1.615515112876892, "learning_rate": 0.00011695384615384617, "loss": 0.2953, "step": 8600 }, { "epoch": 1.87, "grad_norm": 1.2424674034118652, "learning_rate": 0.00011592820512820513, "loss": 0.3047, "step": 8700 }, { "epoch": 1.89, "grad_norm": 1.2125675678253174, "learning_rate": 0.0001149025641025641, "loss": 0.3209, "step": 8800 }, { "epoch": 1.91, "grad_norm": 1.6464908123016357, "learning_rate": 0.00011387692307692308, "loss": 0.3214, "step": 8900 }, { "epoch": 1.94, "grad_norm": 1.300310730934143, "learning_rate": 0.00011285128205128206, "loss": 0.2965, "step": 9000 }, { "epoch": 1.94, "eval_loss": 0.2334287166595459, "eval_runtime": 90.9451, "eval_samples_per_second": 10.996, "eval_steps_per_second": 2.749, "step": 9000 }, { "epoch": 1.96, "grad_norm": 1.7211843729019165, "learning_rate": 0.00011182564102564104, "loss": 0.2813, "step": 9100 }, { "epoch": 1.98, "grad_norm": 1.5777404308319092, "learning_rate": 0.00011080000000000001, "loss": 0.2951, "step": 9200 }, { "epoch": 2.0, "grad_norm": 1.3048077821731567, "learning_rate": 0.00010977435897435897, "loss": 0.301, "step": 9300 }, { "epoch": 2.02, "grad_norm": 1.1753878593444824, "learning_rate": 0.00010874871794871794, "loss": 0.1944, "step": 9400 }, { "epoch": 2.04, "grad_norm": 0.8378590941429138, "learning_rate": 0.00010772307692307693, "loss": 0.2041, "step": 9500 }, { "epoch": 2.04, "eval_loss": 0.2733120024204254, "eval_runtime": 90.9249, "eval_samples_per_second": 10.998, "eval_steps_per_second": 2.75, "step": 9500 }, { "epoch": 2.07, "grad_norm": 1.4123268127441406, "learning_rate": 0.0001066974358974359, "loss": 0.2206, "step": 9600 }, { "epoch": 2.09, "grad_norm": 1.5923714637756348, "learning_rate": 0.00010567179487179489, "loss": 0.2331, "step": 9700 }, { "epoch": 2.11, "grad_norm": 2.31000018119812, "learning_rate": 0.00010465641025641026, "loss": 0.2564, "step": 9800 }, { "epoch": 2.13, "grad_norm": 1.4770272970199585, "learning_rate": 0.00010363076923076925, "loss": 0.2009, "step": 9900 }, { "epoch": 2.15, "grad_norm": 1.5393586158752441, "learning_rate": 0.00010260512820512822, "loss": 0.2168, "step": 10000 }, { "epoch": 2.15, "eval_loss": 0.24773281812667847, "eval_runtime": 90.901, "eval_samples_per_second": 11.001, "eval_steps_per_second": 2.75, "step": 10000 }, { "epoch": 2.17, "grad_norm": 2.3399605751037598, "learning_rate": 0.00010157948717948718, "loss": 0.2275, "step": 10100 }, { "epoch": 2.19, "grad_norm": 1.417143702507019, "learning_rate": 0.00010055384615384615, "loss": 0.2093, "step": 10200 }, { "epoch": 2.22, "grad_norm": 1.7041810750961304, "learning_rate": 9.952820512820513e-05, "loss": 0.2105, "step": 10300 }, { "epoch": 2.24, "grad_norm": 3.7192060947418213, "learning_rate": 9.850256410256411e-05, "loss": 0.2078, "step": 10400 }, { "epoch": 2.26, "grad_norm": 0.7868184447288513, "learning_rate": 9.747692307692307e-05, "loss": 0.2058, "step": 10500 }, { "epoch": 2.26, "eval_loss": 0.22978341579437256, "eval_runtime": 90.9187, "eval_samples_per_second": 10.999, "eval_steps_per_second": 2.75, "step": 10500 }, { "epoch": 2.28, "grad_norm": 2.572187662124634, "learning_rate": 9.645128205128206e-05, "loss": 0.2304, "step": 10600 }, { "epoch": 2.3, "grad_norm": 1.3671247959136963, "learning_rate": 9.542564102564103e-05, "loss": 0.1962, "step": 10700 }, { "epoch": 2.32, "grad_norm": 2.6237735748291016, "learning_rate": 9.44e-05, "loss": 0.2201, "step": 10800 }, { "epoch": 2.35, "grad_norm": 1.1776219606399536, "learning_rate": 9.337435897435898e-05, "loss": 0.1972, "step": 10900 }, { "epoch": 2.37, "grad_norm": 1.236425757408142, "learning_rate": 9.234871794871795e-05, "loss": 0.2126, "step": 11000 }, { "epoch": 2.37, "eval_loss": 0.24023191630840302, "eval_runtime": 90.9406, "eval_samples_per_second": 10.996, "eval_steps_per_second": 2.749, "step": 11000 }, { "epoch": 2.39, "grad_norm": 1.0826618671417236, "learning_rate": 9.132307692307692e-05, "loss": 0.2168, "step": 11100 }, { "epoch": 2.41, "grad_norm": 0.8385189771652222, "learning_rate": 9.02974358974359e-05, "loss": 0.2029, "step": 11200 }, { "epoch": 2.43, "grad_norm": 0.7595863342285156, "learning_rate": 8.927179487179488e-05, "loss": 0.1902, "step": 11300 }, { "epoch": 2.45, "grad_norm": 2.0246148109436035, "learning_rate": 8.824615384615384e-05, "loss": 0.2276, "step": 11400 }, { "epoch": 2.47, "grad_norm": 1.2247196435928345, "learning_rate": 8.722051282051283e-05, "loss": 0.2, "step": 11500 }, { "epoch": 2.47, "eval_loss": 0.25648975372314453, "eval_runtime": 90.8579, "eval_samples_per_second": 11.006, "eval_steps_per_second": 2.752, "step": 11500 }, { "epoch": 2.5, "grad_norm": 1.0143483877182007, "learning_rate": 8.61948717948718e-05, "loss": 0.2179, "step": 11600 }, { "epoch": 2.52, "grad_norm": 2.7020885944366455, "learning_rate": 8.516923076923076e-05, "loss": 0.2214, "step": 11700 }, { "epoch": 2.54, "grad_norm": 1.8533117771148682, "learning_rate": 8.414358974358975e-05, "loss": 0.2074, "step": 11800 }, { "epoch": 2.56, "grad_norm": 1.7365753650665283, "learning_rate": 8.311794871794872e-05, "loss": 0.2396, "step": 11900 }, { "epoch": 2.58, "grad_norm": 0.8160982131958008, "learning_rate": 8.209230769230771e-05, "loss": 0.1786, "step": 12000 }, { "epoch": 2.58, "eval_loss": 0.2251870185136795, "eval_runtime": 90.8167, "eval_samples_per_second": 11.011, "eval_steps_per_second": 2.753, "step": 12000 } ], "logging_steps": 100, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 3.86418819603628e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }