{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9945, "eval_steps": 500, "global_step": 153, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 13.312925409718954, "learning_rate": 3.125e-08, "logits/chosen": -2.1492395401000977, "logits/rejected": -2.139173746109009, "logps/chosen": -189.41439819335938, "logps/rejected": -184.15049743652344, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.07, "grad_norm": 15.630000847331686, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.3999834060668945, "logits/rejected": -2.346851348876953, "logps/chosen": -178.99545288085938, "logps/rejected": -177.0459747314453, "loss": 0.6934, "rewards/accuracies": 0.42307692766189575, "rewards/chosen": -0.0037847168277949095, "rewards/margins": 0.0002747862017713487, "rewards/rejected": -0.004059503320604563, "step": 10 }, { "epoch": 0.13, "grad_norm": 14.549393173612225, "learning_rate": 4.989490450759331e-07, "logits/chosen": -2.4151172637939453, "logits/rejected": -2.356534004211426, "logps/chosen": -179.75003051757812, "logps/rejected": -179.4581756591797, "loss": 0.6909, "rewards/accuracies": 0.5461538434028625, "rewards/chosen": -0.055207282304763794, "rewards/margins": 0.006152572110295296, "rewards/rejected": -0.06135985627770424, "step": 20 }, { "epoch": 0.2, "grad_norm": 22.390116207007786, "learning_rate": 4.872270441827174e-07, "logits/chosen": -2.312279224395752, "logits/rejected": -2.211397886276245, "logps/chosen": -206.32656860351562, "logps/rejected": -211.81321716308594, "loss": 0.6929, "rewards/accuracies": 0.557692289352417, "rewards/chosen": -0.3904457688331604, "rewards/margins": 0.03509727492928505, "rewards/rejected": -0.42554304003715515, "step": 30 }, { "epoch": 0.26, "grad_norm": 15.933088854619298, "learning_rate": 4.6308512113530063e-07, "logits/chosen": -2.2958626747131348, "logits/rejected": -2.3168814182281494, "logps/chosen": -236.7042999267578, "logps/rejected": -244.78851318359375, "loss": 0.6981, "rewards/accuracies": 0.5461538434028625, "rewards/chosen": -0.6312862038612366, "rewards/margins": 0.015706488862633705, "rewards/rejected": -0.6469926238059998, "step": 40 }, { "epoch": 0.33, "grad_norm": 14.014878007482002, "learning_rate": 4.277872161641681e-07, "logits/chosen": -2.368952512741089, "logits/rejected": -2.4042294025421143, "logps/chosen": -214.369384765625, "logps/rejected": -220.7718505859375, "loss": 0.6913, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.2600650191307068, "rewards/margins": 0.018586795777082443, "rewards/rejected": -0.2786518335342407, "step": 50 }, { "epoch": 0.39, "grad_norm": 14.767139513110513, "learning_rate": 3.8318133624280046e-07, "logits/chosen": -2.4127275943756104, "logits/rejected": -2.434305191040039, "logps/chosen": -217.94210815429688, "logps/rejected": -227.94302368164062, "loss": 0.6919, "rewards/accuracies": 0.5384615659713745, "rewards/chosen": -0.2718888223171234, "rewards/margins": 0.015998326241970062, "rewards/rejected": -0.2878871560096741, "step": 60 }, { "epoch": 0.46, "grad_norm": 15.91144067203442, "learning_rate": 3.316028034595861e-07, "logits/chosen": -2.264232635498047, "logits/rejected": -2.299992322921753, "logps/chosen": -194.38172912597656, "logps/rejected": -205.9635009765625, "loss": 0.6833, "rewards/accuracies": 0.5884615182876587, "rewards/chosen": -0.17818714678287506, "rewards/margins": 0.02437894232571125, "rewards/rejected": -0.20256608724594116, "step": 70 }, { "epoch": 0.52, "grad_norm": 18.074689046967872, "learning_rate": 2.7575199021178855e-07, "logits/chosen": -2.299180746078491, "logits/rejected": -2.182999610900879, "logps/chosen": -231.85098266601562, "logps/rejected": -236.9989776611328, "loss": 0.6842, "rewards/accuracies": 0.5730769038200378, "rewards/chosen": -0.3959502577781677, "rewards/margins": 0.03195538371801376, "rewards/rejected": -0.4279056191444397, "step": 80 }, { "epoch": 0.58, "grad_norm": 19.634321191048826, "learning_rate": 2.1855294234408068e-07, "logits/chosen": -2.232875347137451, "logits/rejected": -2.2362263202667236, "logps/chosen": -208.51087951660156, "logps/rejected": -207.45663452148438, "loss": 0.689, "rewards/accuracies": 0.5461538434028625, "rewards/chosen": -0.22500069439411163, "rewards/margins": 0.003552414011210203, "rewards/rejected": -0.22855311632156372, "step": 90 }, { "epoch": 0.65, "grad_norm": 17.473494481507956, "learning_rate": 1.6300029195778453e-07, "logits/chosen": -2.236097812652588, "logits/rejected": -2.0412774085998535, "logps/chosen": -213.67514038085938, "logps/rejected": -206.89111328125, "loss": 0.6881, "rewards/accuracies": 0.5038461685180664, "rewards/chosen": -0.2356816679239273, "rewards/margins": 0.003031224012374878, "rewards/rejected": -0.23871289193630219, "step": 100 }, { "epoch": 0.71, "grad_norm": 15.544936822002546, "learning_rate": 1.1200247470632392e-07, "logits/chosen": -2.103285789489746, "logits/rejected": -2.1786677837371826, "logps/chosen": -224.00047302246094, "logps/rejected": -220.13726806640625, "loss": 0.6848, "rewards/accuracies": 0.5615384578704834, "rewards/chosen": -0.3865113854408264, "rewards/margins": 0.03568296507000923, "rewards/rejected": -0.42219436168670654, "step": 110 }, { "epoch": 0.78, "grad_norm": 17.169881927493602, "learning_rate": 6.822945986946385e-08, "logits/chosen": -1.9218517541885376, "logits/rejected": -2.109549045562744, "logps/chosen": -220.54318237304688, "logps/rejected": -231.7896270751953, "loss": 0.6813, "rewards/accuracies": 0.5769230723381042, "rewards/chosen": -0.4736253619194031, "rewards/margins": 0.03084597922861576, "rewards/rejected": -0.5044713020324707, "step": 120 }, { "epoch": 0.84, "grad_norm": 17.60589291870986, "learning_rate": 3.397296523427806e-08, "logits/chosen": -2.146359920501709, "logits/rejected": -2.1425552368164062, "logps/chosen": -221.13165283203125, "logps/rejected": -225.94419860839844, "loss": 0.6816, "rewards/accuracies": 0.5615384578704834, "rewards/chosen": -0.4886237382888794, "rewards/margins": 0.03550608828663826, "rewards/rejected": -0.5241298675537109, "step": 130 }, { "epoch": 0.91, "grad_norm": 18.707751355883822, "learning_rate": 1.1026475173977978e-08, "logits/chosen": -2.1278481483459473, "logits/rejected": -2.0320982933044434, "logps/chosen": -220.7178192138672, "logps/rejected": -217.0054931640625, "loss": 0.6837, "rewards/accuracies": 0.5923076868057251, "rewards/chosen": -0.3798917829990387, "rewards/margins": 0.05050484091043472, "rewards/rejected": -0.4303966164588928, "step": 140 }, { "epoch": 0.97, "grad_norm": 18.697426009812567, "learning_rate": 5.913435276374834e-10, "logits/chosen": -2.186318874359131, "logits/rejected": -2.1368911266326904, "logps/chosen": -221.08029174804688, "logps/rejected": -230.6654052734375, "loss": 0.6744, "rewards/accuracies": 0.5961538553237915, "rewards/chosen": -0.38067081570625305, "rewards/margins": 0.07161368429660797, "rewards/rejected": -0.4522845447063446, "step": 150 }, { "epoch": 0.99, "step": 153, "total_flos": 0.0, "train_loss": 0.6871888306405809, "train_runtime": 39835.0539, "train_samples_per_second": 0.502, "train_steps_per_second": 0.004 } ], "logging_steps": 10, "max_steps": 153, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }