{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.5625e-07, "logits/chosen": -2.001559019088745, "logits/rejected": -2.052518367767334, "logps/chosen": -2030.77099609375, "logps/rejected": -2033.691650390625, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.5625e-06, "logits/chosen": -2.146657705307007, "logits/rejected": -2.073806047439575, "logps/chosen": -1147.4302978515625, "logps/rejected": -2748.2314453125, "loss": 0.4979, "rewards/accuracies": 0.4236111044883728, "rewards/chosen": -0.028285933658480644, "rewards/margins": 0.02654173970222473, "rewards/rejected": -0.054827671498060226, "step": 10 }, { "epoch": 0.06, "learning_rate": 3.125e-06, "logits/chosen": -2.2760894298553467, "logits/rejected": -2.1618006229400635, "logps/chosen": -1438.9384765625, "logps/rejected": -3313.58837890625, "loss": 0.4847, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.2823066711425781, "rewards/margins": 0.31389349699020386, "rewards/rejected": -0.596200168132782, "step": 20 }, { "epoch": 0.1, "learning_rate": 4.6875000000000004e-06, "logits/chosen": -2.164494752883911, "logits/rejected": -2.0667026042938232, "logps/chosen": -2890.416015625, "logps/rejected": -4221.6728515625, "loss": 0.4841, "rewards/accuracies": 0.5, "rewards/chosen": -1.3330113887786865, "rewards/margins": 0.03289594128727913, "rewards/rejected": -1.3659074306488037, "step": 30 }, { "epoch": 0.13, "learning_rate": 4.989935734988098e-06, "logits/chosen": -2.1548264026641846, "logits/rejected": -2.0702128410339355, "logps/chosen": -3225.27685546875, "logps/rejected": -4589.10107421875, "loss": 0.4908, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.890707015991211, "rewards/margins": 0.1443764716386795, "rewards/rejected": -2.035083293914795, "step": 40 }, { "epoch": 0.16, "learning_rate": 4.949188496058089e-06, "logits/chosen": -2.221364974975586, "logits/rejected": -2.0859131813049316, "logps/chosen": -2741.037353515625, "logps/rejected": -4698.10302734375, "loss": 0.4781, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1827226877212524, "rewards/margins": 0.47015246748924255, "rewards/rejected": -1.6528751850128174, "step": 50 }, { "epoch": 0.19, "learning_rate": 4.8776412907378845e-06, "logits/chosen": -2.3111491203308105, "logits/rejected": -2.244699001312256, "logps/chosen": -2379.827392578125, "logps/rejected": -4096.80712890625, "loss": 0.4761, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.8030478358268738, "rewards/margins": 0.2678872048854828, "rewards/rejected": -1.0709350109100342, "step": 60 }, { "epoch": 0.22, "learning_rate": 4.7761938666470405e-06, "logits/chosen": -2.369278907775879, "logits/rejected": -2.2874133586883545, "logps/chosen": -2155.563720703125, "logps/rejected": -3842.12109375, "loss": 0.4766, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.7144564986228943, "rewards/margins": 0.4151995778083801, "rewards/rejected": -1.1296560764312744, "step": 70 }, { "epoch": 0.26, "learning_rate": 4.646121984004666e-06, "logits/chosen": -2.3896870613098145, "logits/rejected": -2.330310583114624, "logps/chosen": -2313.40380859375, "logps/rejected": -4401.6201171875, "loss": 0.4816, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.8072439432144165, "rewards/margins": 0.2879006266593933, "rewards/rejected": -1.0951446294784546, "step": 80 }, { "epoch": 0.29, "learning_rate": 4.4890613722044526e-06, "logits/chosen": -2.4341788291931152, "logits/rejected": -2.4094738960266113, "logps/chosen": -2331.335205078125, "logps/rejected": -3481.241455078125, "loss": 0.4816, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.9047203063964844, "rewards/margins": 0.22198553383350372, "rewards/rejected": -1.1267058849334717, "step": 90 }, { "epoch": 0.32, "learning_rate": 4.3069871595684795e-06, "logits/chosen": -2.4191980361938477, "logits/rejected": -2.3792288303375244, "logps/chosen": -2510.74267578125, "logps/rejected": -3801.65771484375, "loss": 0.4795, "rewards/accuracies": 0.53125, "rewards/chosen": -0.9170597791671753, "rewards/margins": 0.13030724227428436, "rewards/rejected": -1.0473670959472656, "step": 100 }, { "epoch": 0.35, "learning_rate": 4.102189034962561e-06, "logits/chosen": -2.4569015502929688, "logits/rejected": -2.4183101654052734, "logps/chosen": -2428.78369140625, "logps/rejected": -5115.49267578125, "loss": 0.4677, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.9490839242935181, "rewards/margins": 0.8531773686408997, "rewards/rejected": -1.8022613525390625, "step": 110 }, { "epoch": 0.38, "learning_rate": 3.8772424536302565e-06, "logits/chosen": -2.5435500144958496, "logits/rejected": -2.540524482727051, "logps/chosen": -2736.87158203125, "logps/rejected": -4036.8984375, "loss": 0.4729, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.0938019752502441, "rewards/margins": 0.23258857429027557, "rewards/rejected": -1.3263905048370361, "step": 120 }, { "epoch": 0.42, "learning_rate": 3.634976249348867e-06, "logits/chosen": -2.5451807975769043, "logits/rejected": -2.5353500843048096, "logps/chosen": -2400.534423828125, "logps/rejected": -3994.819091796875, "loss": 0.4725, "rewards/accuracies": 0.5, "rewards/chosen": -1.0581142902374268, "rewards/margins": 0.3071446716785431, "rewards/rejected": -1.3652589321136475, "step": 130 }, { "epoch": 0.45, "learning_rate": 3.3784370602033572e-06, "logits/chosen": -2.502204179763794, "logits/rejected": -2.5256900787353516, "logps/chosen": -2380.7509765625, "logps/rejected": -4508.63818359375, "loss": 0.4723, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.9423351287841797, "rewards/margins": 0.6326543092727661, "rewards/rejected": -1.5749894380569458, "step": 140 }, { "epoch": 0.48, "learning_rate": 3.1108510153447352e-06, "logits/chosen": -2.583056688308716, "logits/rejected": -2.599966049194336, "logps/chosen": -1865.611328125, "logps/rejected": -4491.51806640625, "loss": 0.4669, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6541633009910583, "rewards/margins": 0.6614173054695129, "rewards/rejected": -1.3155806064605713, "step": 150 }, { "epoch": 0.51, "learning_rate": 2.835583164544139e-06, "logits/chosen": -2.5848755836486816, "logits/rejected": -2.6103827953338623, "logps/chosen": -2029.941162109375, "logps/rejected": -3397.024169921875, "loss": 0.4691, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.725816011428833, "rewards/margins": 0.292774498462677, "rewards/rejected": -1.0185905694961548, "step": 160 }, { "epoch": 0.54, "learning_rate": 2.556095160739513e-06, "logits/chosen": -2.5292410850524902, "logits/rejected": -2.5528335571289062, "logps/chosen": -1871.8792724609375, "logps/rejected": -3524.819091796875, "loss": 0.4714, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6304308176040649, "rewards/margins": 0.3158620595932007, "rewards/rejected": -0.9462926983833313, "step": 170 }, { "epoch": 0.58, "learning_rate": 2.2759017277414165e-06, "logits/chosen": -2.618015766143799, "logits/rejected": -2.658297061920166, "logps/chosen": -2715.969970703125, "logps/rejected": -4858.7861328125, "loss": 0.4621, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.088996171951294, "rewards/margins": 0.6818682551383972, "rewards/rejected": -1.770864486694336, "step": 180 }, { "epoch": 0.61, "learning_rate": 1.9985264605418185e-06, "logits/chosen": -2.663578748703003, "logits/rejected": -2.7316274642944336, "logps/chosen": -2506.532958984375, "logps/rejected": -5105.27001953125, "loss": 0.4641, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2244542837142944, "rewards/margins": 0.8673825263977051, "rewards/rejected": -2.091836929321289, "step": 190 }, { "epoch": 0.64, "learning_rate": 1.7274575140626318e-06, "logits/chosen": -2.6198081970214844, "logits/rejected": -2.7005550861358643, "logps/chosen": -2381.856201171875, "logps/rejected": -4935.6943359375, "loss": 0.4612, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1094557046890259, "rewards/margins": 1.003557562828064, "rewards/rejected": -2.113013505935669, "step": 200 }, { "epoch": 0.67, "learning_rate": 1.466103737583699e-06, "logits/chosen": -2.535006046295166, "logits/rejected": -2.603625774383545, "logps/chosen": -1250.8673095703125, "logps/rejected": -3654.595703125, "loss": 0.4603, "rewards/accuracies": 0.59375, "rewards/chosen": -0.42699164152145386, "rewards/margins": 0.7609654664993286, "rewards/rejected": -1.1879570484161377, "step": 210 }, { "epoch": 0.7, "learning_rate": 1.217751806485235e-06, "logits/chosen": -2.5781049728393555, "logits/rejected": -2.6412928104400635, "logps/chosen": -1635.0478515625, "logps/rejected": -4128.06884765625, "loss": 0.4629, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.655454695224762, "rewards/margins": 0.8185914754867554, "rewards/rejected": -1.4740461111068726, "step": 220 }, { "epoch": 0.74, "learning_rate": 9.855248903979505e-07, "logits/chosen": -2.583189010620117, "logits/rejected": -2.722602605819702, "logps/chosen": -2002.4896240234375, "logps/rejected": -4570.92041015625, "loss": 0.4644, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8033896684646606, "rewards/margins": 0.8625272512435913, "rewards/rejected": -1.6659168004989624, "step": 230 }, { "epoch": 0.77, "learning_rate": 7.723433775328385e-07, "logits/chosen": -2.6383585929870605, "logits/rejected": -2.7192063331604004, "logps/chosen": -2607.29638671875, "logps/rejected": -4494.58349609375, "loss": 0.4638, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.9701069593429565, "rewards/margins": 0.6917791962623596, "rewards/rejected": -1.661886215209961, "step": 240 }, { "epoch": 0.8, "learning_rate": 5.808881491049723e-07, "logits/chosen": -2.616407871246338, "logits/rejected": -2.639394760131836, "logps/chosen": -2655.154296875, "logps/rejected": -3849.689453125, "loss": 0.4711, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.885340690612793, "rewards/margins": 0.4874494969844818, "rewards/rejected": -1.3727902173995972, "step": 250 }, { "epoch": 0.83, "learning_rate": 4.1356686569674344e-07, "logits/chosen": -2.6079611778259277, "logits/rejected": -2.7340540885925293, "logps/chosen": -2142.09814453125, "logps/rejected": -4976.4326171875, "loss": 0.4604, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.790375828742981, "rewards/margins": 0.7060346603393555, "rewards/rejected": -1.496410608291626, "step": 260 }, { "epoch": 0.86, "learning_rate": 2.7248368952908055e-07, "logits/chosen": -2.6824259757995605, "logits/rejected": -2.7371983528137207, "logps/chosen": -2655.15673828125, "logps/rejected": -4562.94091796875, "loss": 0.4754, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.0050941705703735, "rewards/margins": 0.3014536201953888, "rewards/rejected": -1.3065478801727295, "step": 270 }, { "epoch": 0.9, "learning_rate": 1.59412823400657e-07, "logits/chosen": -2.636775493621826, "logits/rejected": -2.734402656555176, "logps/chosen": -2063.6806640625, "logps/rejected": -4341.87109375, "loss": 0.4669, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.7057104706764221, "rewards/margins": 0.690462589263916, "rewards/rejected": -1.3961730003356934, "step": 280 }, { "epoch": 0.93, "learning_rate": 7.577619905828281e-08, "logits/chosen": -2.631303310394287, "logits/rejected": -2.7183616161346436, "logps/chosen": -2191.196044921875, "logps/rejected": -4163.2451171875, "loss": 0.4691, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.7605796456336975, "rewards/margins": 0.5482227206230164, "rewards/rejected": -1.3088023662567139, "step": 290 }, { "epoch": 0.96, "learning_rate": 2.262559558016325e-08, "logits/chosen": -2.61507511138916, "logits/rejected": -2.646120548248291, "logps/chosen": -1883.6669921875, "logps/rejected": -3444.8046875, "loss": 0.466, "rewards/accuracies": 0.53125, "rewards/chosen": -0.6400176286697388, "rewards/margins": 0.3954530656337738, "rewards/rejected": -1.035470724105835, "step": 300 }, { "epoch": 0.99, "learning_rate": 6.294126437336734e-10, "logits/chosen": -2.6119272708892822, "logits/rejected": -2.7076706886291504, "logps/chosen": -2384.146240234375, "logps/rejected": -4828.73095703125, "loss": 0.4701, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.8787592053413391, "rewards/margins": 0.646748960018158, "rewards/rejected": -1.525508165359497, "step": 310 }, { "epoch": 1.0, "step": 312, "total_flos": 0.0, "train_loss": 0.47231930035811204, "train_runtime": 7349.4695, "train_samples_per_second": 2.721, "train_steps_per_second": 0.042 } ], "logging_steps": 10, "max_steps": 312, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }