{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 262.0, "learning_rate": 1.282051282051282e-07, "logits/chosen": 88.18099975585938, "logits/rejected": 88.25153350830078, "logps/chosen": -29.073104858398438, "logps/rejected": -26.25731658935547, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 282.0, "learning_rate": 1.282051282051282e-06, "logits/chosen": 81.07872009277344, "logits/rejected": 80.78690338134766, "logps/chosen": -34.225643157958984, "logps/rejected": -32.97146987915039, "loss": 1.4705, "rewards/accuracies": 0.4027777910232544, "rewards/chosen": 0.00803940836340189, "rewards/margins": 0.01457094494253397, "rewards/rejected": -0.00653153657913208, "step": 10 }, { "epoch": 0.05, "grad_norm": 316.0, "learning_rate": 2.564102564102564e-06, "logits/chosen": 80.64378356933594, "logits/rejected": 80.53114318847656, "logps/chosen": -33.554771423339844, "logps/rejected": -30.747882843017578, "loss": 1.6639, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.06756959110498428, "rewards/margins": 0.05276557803153992, "rewards/rejected": 0.014804007485508919, "step": 20 }, { "epoch": 0.08, "grad_norm": 209.0, "learning_rate": 3.846153846153847e-06, "logits/chosen": 82.3618392944336, "logits/rejected": 82.39045715332031, "logps/chosen": -33.964210510253906, "logps/rejected": -31.346271514892578, "loss": 1.9077, "rewards/accuracies": 0.375, "rewards/chosen": 0.02430456317961216, "rewards/margins": -0.01447761058807373, "rewards/rejected": 0.03878217190504074, "step": 30 }, { "epoch": 0.1, "grad_norm": 708.0, "learning_rate": 4.999896948438434e-06, "logits/chosen": 80.66716003417969, "logits/rejected": 80.66474151611328, "logps/chosen": -33.04372787475586, "logps/rejected": -33.376953125, "loss": 2.3531, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.06516679376363754, "rewards/margins": 0.06938251852989197, "rewards/rejected": -0.004215720109641552, "step": 40 }, { "epoch": 0.13, "grad_norm": 684.0, "learning_rate": 4.987541037542187e-06, "logits/chosen": 78.12054443359375, "logits/rejected": 78.13239288330078, "logps/chosen": -31.36956787109375, "logps/rejected": -31.187658309936523, "loss": 2.5686, "rewards/accuracies": 0.5, "rewards/chosen": -0.08215981721878052, "rewards/margins": -0.024972762912511826, "rewards/rejected": -0.057187050580978394, "step": 50 }, { "epoch": 0.16, "grad_norm": 326.0, "learning_rate": 4.954691471941119e-06, "logits/chosen": 82.62169647216797, "logits/rejected": 82.67021179199219, "logps/chosen": -31.33014488220215, "logps/rejected": -29.82718849182129, "loss": 2.2332, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0774935632944107, "rewards/margins": 0.04084175080060959, "rewards/rejected": -0.11833532154560089, "step": 60 }, { "epoch": 0.18, "grad_norm": 796.0, "learning_rate": 4.901618883413549e-06, "logits/chosen": 83.52657318115234, "logits/rejected": 83.54931640625, "logps/chosen": -30.717477798461914, "logps/rejected": -32.99928665161133, "loss": 2.7176, "rewards/accuracies": 0.375, "rewards/chosen": -0.04422692582011223, "rewards/margins": -0.06343810260295868, "rewards/rejected": 0.019211167469620705, "step": 70 }, { "epoch": 0.21, "grad_norm": 342.0, "learning_rate": 4.828760511501322e-06, "logits/chosen": 81.1966781616211, "logits/rejected": 81.19418334960938, "logps/chosen": -31.369304656982422, "logps/rejected": -30.800960540771484, "loss": 2.4943, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.06781856715679169, "rewards/margins": 0.08947588503360748, "rewards/rejected": -0.021657321602106094, "step": 80 }, { "epoch": 0.23, "grad_norm": 502.0, "learning_rate": 4.7367166013034295e-06, "logits/chosen": 78.3118896484375, "logits/rejected": 78.27383422851562, "logps/chosen": -32.46575164794922, "logps/rejected": -31.248449325561523, "loss": 2.2351, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.08621449768543243, "rewards/margins": 0.2347041368484497, "rewards/rejected": -0.14848963916301727, "step": 90 }, { "epoch": 0.26, "grad_norm": 296.0, "learning_rate": 4.626245458345211e-06, "logits/chosen": 83.64895629882812, "logits/rejected": 83.68805694580078, "logps/chosen": -34.173667907714844, "logps/rejected": -31.847728729248047, "loss": 2.7048, "rewards/accuracies": 0.5625, "rewards/chosen": 0.06566362082958221, "rewards/margins": 0.11425628513097763, "rewards/rejected": -0.04859266057610512, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": 98.86346435546875, "eval_logits/rejected": 98.85167694091797, "eval_logps/chosen": -32.50230407714844, "eval_logps/rejected": -36.137577056884766, "eval_loss": 2.2451484203338623, "eval_rewards/accuracies": 0.5336378812789917, "eval_rewards/chosen": -0.02956419251859188, "eval_rewards/margins": 0.05596020445227623, "eval_rewards/rejected": -0.08552439510822296, "eval_runtime": 104.0782, "eval_samples_per_second": 3.296, "eval_steps_per_second": 0.413, "step": 100 }, { "epoch": 0.29, "grad_norm": 492.0, "learning_rate": 4.498257201263691e-06, "logits/chosen": 83.73783874511719, "logits/rejected": 83.64616394042969, "logps/chosen": -32.79869842529297, "logps/rejected": -32.73226547241211, "loss": 2.701, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.11205414682626724, "rewards/margins": 0.15024258196353912, "rewards/rejected": -0.03818843513727188, "step": 110 }, { "epoch": 0.31, "grad_norm": 324.0, "learning_rate": 4.353806263777678e-06, "logits/chosen": 84.05355072021484, "logits/rejected": 84.1574478149414, "logps/chosen": -28.724782943725586, "logps/rejected": -35.51475143432617, "loss": 2.094, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.10240070521831512, "rewards/margins": 0.13020212948322296, "rewards/rejected": -0.027801427990198135, "step": 120 }, { "epoch": 0.34, "grad_norm": 185.0, "learning_rate": 4.1940827077152755e-06, "logits/chosen": 81.37618255615234, "logits/rejected": 81.40374755859375, "logps/chosen": -30.55348777770996, "logps/rejected": -32.026222229003906, "loss": 2.2639, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.17693254351615906, "rewards/margins": 0.22423109412193298, "rewards/rejected": -0.04729856550693512, "step": 130 }, { "epoch": 0.36, "grad_norm": 356.0, "learning_rate": 4.0204024186666215e-06, "logits/chosen": 82.8284912109375, "logits/rejected": 82.82528686523438, "logps/chosen": -27.208200454711914, "logps/rejected": -32.791831970214844, "loss": 2.1202, "rewards/accuracies": 0.5625, "rewards/chosen": 0.11956217139959335, "rewards/margins": 0.3037911057472229, "rewards/rejected": -0.18422891199588776, "step": 140 }, { "epoch": 0.39, "grad_norm": 376.0, "learning_rate": 3.834196265035119e-06, "logits/chosen": 81.57420349121094, "logits/rejected": 81.54073333740234, "logps/chosen": -29.004501342773438, "logps/rejected": -32.630916595458984, "loss": 2.179, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.19230429828166962, "rewards/margins": 0.22451286017894745, "rewards/rejected": -0.032208558171987534, "step": 150 }, { "epoch": 0.42, "grad_norm": 464.0, "learning_rate": 3.636998309800573e-06, "logits/chosen": 83.5596923828125, "logits/rejected": 83.56761169433594, "logps/chosen": -33.89311981201172, "logps/rejected": -29.929208755493164, "loss": 3.3447, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.12181256711483002, "rewards/margins": 0.11014753580093384, "rewards/rejected": 0.011665038764476776, "step": 160 }, { "epoch": 0.44, "grad_norm": 296.0, "learning_rate": 3.4304331721118078e-06, "logits/chosen": 84.21192169189453, "logits/rejected": 84.15568542480469, "logps/chosen": -30.989816665649414, "logps/rejected": -32.166038513183594, "loss": 2.6535, "rewards/accuracies": 0.625, "rewards/chosen": 0.13473577797412872, "rewards/margins": 0.20772309601306915, "rewards/rejected": -0.07298731058835983, "step": 170 }, { "epoch": 0.47, "grad_norm": 520.0, "learning_rate": 3.2162026428305436e-06, "logits/chosen": 81.9034423828125, "logits/rejected": 81.88355255126953, "logps/chosen": -30.792713165283203, "logps/rejected": -31.40665626525879, "loss": 2.561, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.20221999287605286, "rewards/margins": 0.3269447684288025, "rewards/rejected": -0.12472478300333023, "step": 180 }, { "epoch": 0.49, "grad_norm": 181.0, "learning_rate": 2.996071664294641e-06, "logits/chosen": 83.61729431152344, "logits/rejected": 83.60554504394531, "logps/chosen": -30.603130340576172, "logps/rejected": -30.67363929748535, "loss": 2.4852, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.08164240419864655, "rewards/margins": 0.1844494342803955, "rewards/rejected": -0.10280702263116837, "step": 190 }, { "epoch": 0.52, "grad_norm": 384.0, "learning_rate": 2.7718537898066833e-06, "logits/chosen": 79.16850280761719, "logits/rejected": 79.1117935180664, "logps/chosen": -34.15711212158203, "logps/rejected": -32.46302032470703, "loss": 3.9434, "rewards/accuracies": 0.625, "rewards/chosen": 0.2978156805038452, "rewards/margins": 0.28262609243392944, "rewards/rejected": 0.015189537778496742, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": 98.95487213134766, "eval_logits/rejected": 98.95005798339844, "eval_logps/chosen": -32.4478645324707, "eval_logps/rejected": -36.15636444091797, "eval_loss": 1.9716209173202515, "eval_rewards/accuracies": 0.5930232405662537, "eval_rewards/chosen": -0.002344276988878846, "eval_rewards/margins": 0.09257492423057556, "eval_rewards/rejected": -0.09491920471191406, "eval_runtime": 104.0484, "eval_samples_per_second": 3.297, "eval_steps_per_second": 0.413, "step": 200 }, { "epoch": 0.55, "grad_norm": 756.0, "learning_rate": 2.5453962426402006e-06, "logits/chosen": 81.85539245605469, "logits/rejected": 81.76030731201172, "logps/chosen": -33.80899429321289, "logps/rejected": -35.018226623535156, "loss": 2.8449, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.06836456805467606, "rewards/margins": 0.08442962169647217, "rewards/rejected": -0.01606505550444126, "step": 210 }, { "epoch": 0.57, "grad_norm": 290.0, "learning_rate": 2.3185646976551794e-06, "logits/chosen": 84.03166198730469, "logits/rejected": 84.112060546875, "logps/chosen": -31.503955841064453, "logps/rejected": -30.661306381225586, "loss": 3.4983, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.2224985659122467, "rewards/margins": 0.1886742264032364, "rewards/rejected": 0.03382434695959091, "step": 220 }, { "epoch": 0.6, "grad_norm": 354.0, "learning_rate": 2.0932279108998323e-06, "logits/chosen": 81.38267517089844, "logits/rejected": 81.43292999267578, "logps/chosen": -32.516029357910156, "logps/rejected": -33.79094314575195, "loss": 2.6207, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.1448572874069214, "rewards/margins": 0.034253668040037155, "rewards/rejected": 0.11060361564159393, "step": 230 }, { "epoch": 0.62, "grad_norm": 364.0, "learning_rate": 1.8712423238279358e-06, "logits/chosen": 83.76815795898438, "logits/rejected": 84.01641082763672, "logps/chosen": -31.310298919677734, "logps/rejected": -31.66178321838379, "loss": 2.0842, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.18888285756111145, "rewards/margins": 0.25375330448150635, "rewards/rejected": -0.06487040966749191, "step": 240 }, { "epoch": 0.65, "grad_norm": 298.0, "learning_rate": 1.6544367689701824e-06, "logits/chosen": 82.50977325439453, "logits/rejected": 82.58284759521484, "logps/chosen": -27.186697006225586, "logps/rejected": -29.944438934326172, "loss": 2.3496, "rewards/accuracies": 0.625, "rewards/chosen": 0.20459958910942078, "rewards/margins": 0.22644969820976257, "rewards/rejected": -0.02185012958943844, "step": 250 }, { "epoch": 0.68, "grad_norm": 976.0, "learning_rate": 1.4445974030621963e-06, "logits/chosen": 79.92189025878906, "logits/rejected": 80.03739166259766, "logps/chosen": -30.968799591064453, "logps/rejected": -36.08448028564453, "loss": 2.944, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.29609745740890503, "rewards/margins": 0.28752270340919495, "rewards/rejected": 0.008574736304581165, "step": 260 }, { "epoch": 0.7, "grad_norm": 197.0, "learning_rate": 1.243452991757889e-06, "logits/chosen": 79.33562469482422, "logits/rejected": 79.36061096191406, "logps/chosen": -30.985218048095703, "logps/rejected": -31.533437728881836, "loss": 2.6828, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.39685314893722534, "rewards/margins": 0.41086119413375854, "rewards/rejected": -0.014008039608597755, "step": 270 }, { "epoch": 0.73, "grad_norm": 388.0, "learning_rate": 1.0526606671603523e-06, "logits/chosen": 81.81623840332031, "logits/rejected": 81.6318588256836, "logps/chosen": -31.43568992614746, "logps/rejected": -29.53765296936035, "loss": 2.8578, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.1871013194322586, "rewards/margins": 0.12242940813302994, "rewards/rejected": 0.06467190384864807, "step": 280 }, { "epoch": 0.75, "grad_norm": 270.0, "learning_rate": 8.737922755071455e-07, "logits/chosen": 82.0903549194336, "logits/rejected": 82.0211181640625, "logps/chosen": -33.648677825927734, "logps/rejected": -31.893665313720703, "loss": 2.8329, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.22778387367725372, "rewards/margins": 0.1661832630634308, "rewards/rejected": 0.061600614339113235, "step": 290 }, { "epoch": 0.78, "grad_norm": 230.0, "learning_rate": 7.08321427484816e-07, "logits/chosen": 77.76834869384766, "logits/rejected": 77.82737731933594, "logps/chosen": -32.740943908691406, "logps/rejected": -28.960865020751953, "loss": 2.0991, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.33238038420677185, "rewards/margins": 0.26180073618888855, "rewards/rejected": 0.07057970762252808, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": 99.00546264648438, "eval_logits/rejected": 98.99657440185547, "eval_logps/chosen": -32.35263442993164, "eval_logps/rejected": -35.902122497558594, "eval_loss": 2.260390281677246, "eval_rewards/accuracies": 0.48795682191848755, "eval_rewards/chosen": 0.045268867164850235, "eval_rewards/margins": 0.013066344894468784, "eval_rewards/rejected": 0.03220251947641373, "eval_runtime": 104.0306, "eval_samples_per_second": 3.297, "eval_steps_per_second": 0.413, "step": 300 }, { "epoch": 0.81, "grad_norm": 640.0, "learning_rate": 5.576113578589035e-07, "logits/chosen": 84.65354919433594, "logits/rejected": 84.689208984375, "logps/chosen": -30.26632308959961, "logps/rejected": -32.187259674072266, "loss": 2.4257, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.2986040413379669, "rewards/margins": 0.2923537790775299, "rewards/rejected": 0.0062502650544047356, "step": 310 }, { "epoch": 0.83, "grad_norm": 438.0, "learning_rate": 4.229036944380913e-07, "logits/chosen": 82.1827621459961, "logits/rejected": 82.17810821533203, "logps/chosen": -30.906885147094727, "logps/rejected": -28.876495361328125, "loss": 4.1182, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3458386957645416, "rewards/margins": 0.3243403136730194, "rewards/rejected": 0.021498391404747963, "step": 320 }, { "epoch": 0.86, "grad_norm": 336.0, "learning_rate": 3.053082288996112e-07, "logits/chosen": 79.60859680175781, "logits/rejected": 79.64774322509766, "logps/chosen": -29.61593246459961, "logps/rejected": -32.607086181640625, "loss": 2.7354, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.3683364987373352, "rewards/margins": 0.3544464707374573, "rewards/rejected": 0.013890000991523266, "step": 330 }, { "epoch": 0.88, "grad_norm": 552.0, "learning_rate": 2.0579377374915805e-07, "logits/chosen": 83.55511474609375, "logits/rejected": 83.56120300292969, "logps/chosen": -32.71057891845703, "logps/rejected": -33.46552276611328, "loss": 2.9727, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.2507854104042053, "rewards/margins": 0.2991998791694641, "rewards/rejected": -0.048414476215839386, "step": 340 }, { "epoch": 0.91, "grad_norm": 416.0, "learning_rate": 1.2518018074041684e-07, "logits/chosen": 82.80843353271484, "logits/rejected": 82.7990951538086, "logps/chosen": -32.887977600097656, "logps/rejected": -33.06817626953125, "loss": 2.7683, "rewards/accuracies": 0.625, "rewards/chosen": 0.37787994742393494, "rewards/margins": 0.3660680055618286, "rewards/rejected": 0.011811929754912853, "step": 350 }, { "epoch": 0.94, "grad_norm": 141.0, "learning_rate": 6.41315865106129e-08, "logits/chosen": 84.11600494384766, "logits/rejected": 84.12767791748047, "logps/chosen": -28.876087188720703, "logps/rejected": -31.597909927368164, "loss": 1.7283, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.3380628228187561, "rewards/margins": 0.3120931088924408, "rewards/rejected": 0.025969672948122025, "step": 360 }, { "epoch": 0.96, "grad_norm": 402.0, "learning_rate": 2.3150941078050325e-08, "logits/chosen": 83.5467529296875, "logits/rejected": 83.55477142333984, "logps/chosen": -32.44352340698242, "logps/rejected": -35.13766860961914, "loss": 3.1738, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.2163194715976715, "rewards/margins": 0.23046875, "rewards/rejected": -0.014149266295135021, "step": 370 }, { "epoch": 0.99, "grad_norm": 322.0, "learning_rate": 2.575864278703266e-09, "logits/chosen": 77.60735321044922, "logits/rejected": 77.49246215820312, "logps/chosen": -30.158939361572266, "logps/rejected": -28.004974365234375, "loss": 2.7343, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.20859412848949432, "rewards/margins": 0.1307946741580963, "rewards/rejected": 0.07779943943023682, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 2.572123973400562, "train_runtime": 2556.3668, "train_samples_per_second": 1.204, "train_steps_per_second": 0.151 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }