{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 156, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 30.20068910303907, "learning_rate": 6.25e-09, "logits/chosen": 0.05829288810491562, "logits/rejected": 0.12195920199155807, "logps/chosen": -235.51776123046875, "logps/rejected": -252.3660888671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06, "grad_norm": 35.659220970788716, "learning_rate": 6.25e-08, "logits/chosen": -0.1302209049463272, "logits/rejected": -0.3692338764667511, "logps/chosen": -247.50286865234375, "logps/rejected": -264.7320556640625, "loss": 0.6932, "rewards/accuracies": 0.4340277910232544, "rewards/chosen": -0.0013414309360086918, "rewards/margins": -7.45424404158257e-05, "rewards/rejected": -0.001266888459213078, "step": 10 }, { "epoch": 0.13, "grad_norm": 29.89211126085747, "learning_rate": 9.979871469976195e-08, "logits/chosen": -0.259229838848114, "logits/rejected": -0.35706019401550293, "logps/chosen": -241.0030517578125, "logps/rejected": -251.1970977783203, "loss": 0.6931, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -0.02032109722495079, "rewards/margins": -0.0005441303364932537, "rewards/rejected": -0.01977696642279625, "step": 20 }, { "epoch": 0.19, "grad_norm": 38.00242034346191, "learning_rate": 9.755282581475768e-08, "logits/chosen": -0.37507274746894836, "logits/rejected": -0.30858054757118225, "logps/chosen": -250.5952911376953, "logps/rejected": -267.4319152832031, "loss": 0.6913, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -0.04578831046819687, "rewards/margins": 0.00868249125778675, "rewards/rejected": -0.05447079613804817, "step": 30 }, { "epoch": 0.26, "grad_norm": 44.38122948328188, "learning_rate": 9.29224396800933e-08, "logits/chosen": -0.30581027269363403, "logits/rejected": -0.25420406460762024, "logps/chosen": -245.3369598388672, "logps/rejected": -265.40728759765625, "loss": 0.6889, "rewards/accuracies": 0.528124988079071, "rewards/chosen": -0.0466889962553978, "rewards/margins": 0.00311011029407382, "rewards/rejected": -0.049799107015132904, "step": 40 }, { "epoch": 0.32, "grad_norm": 33.66958666071777, "learning_rate": 8.613974319136957e-08, "logits/chosen": -0.3118899464607239, "logits/rejected": -0.2555062770843506, "logps/chosen": -246.3278045654297, "logps/rejected": -265.2810974121094, "loss": 0.6874, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.04047214612364769, "rewards/margins": 0.024090787395834923, "rewards/rejected": -0.06456293165683746, "step": 50 }, { "epoch": 0.38, "grad_norm": 39.42735209702952, "learning_rate": 7.754484907260513e-08, "logits/chosen": -0.26858726143836975, "logits/rejected": -0.28768494725227356, "logps/chosen": -255.31631469726562, "logps/rejected": -269.5321960449219, "loss": 0.6875, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -0.06594480574131012, "rewards/margins": 0.02973010577261448, "rewards/rejected": -0.09567491710186005, "step": 60 }, { "epoch": 0.45, "grad_norm": 38.57810394961094, "learning_rate": 6.756874120406714e-08, "logits/chosen": -0.3371458649635315, "logits/rejected": -0.1996055543422699, "logps/chosen": -242.06527709960938, "logps/rejected": -267.87457275390625, "loss": 0.685, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0042762779630720615, "rewards/margins": 0.029838895425200462, "rewards/rejected": -0.03411517292261124, "step": 70 }, { "epoch": 0.51, "grad_norm": 37.950575567812315, "learning_rate": 5.6711663290882774e-08, "logits/chosen": -0.19361785054206848, "logits/rejected": -0.26724424958229065, "logps/chosen": -230.7703399658203, "logps/rejected": -257.036376953125, "loss": 0.6904, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.005543149076402187, "rewards/margins": 0.030957844108343124, "rewards/rejected": -0.03650099039077759, "step": 80 }, { "epoch": 0.58, "grad_norm": 50.50742600928591, "learning_rate": 4.551803455482833e-08, "logits/chosen": -0.23545122146606445, "logits/rejected": -0.22761189937591553, "logps/chosen": -248.32565307617188, "logps/rejected": -266.6417541503906, "loss": 0.6884, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.016439877450466156, "rewards/margins": 0.02168435789644718, "rewards/rejected": -0.03812423720955849, "step": 90 }, { "epoch": 0.64, "grad_norm": 45.90060699086576, "learning_rate": 3.4549150281252633e-08, "logits/chosen": -0.24451108276844025, "logits/rejected": -0.11925282329320908, "logps/chosen": -253.621826171875, "logps/rejected": -265.9607849121094, "loss": 0.6907, "rewards/accuracies": 0.534375011920929, "rewards/chosen": -0.05188063532114029, "rewards/margins": 0.025425013154745102, "rewards/rejected": -0.07730564475059509, "step": 100 }, { "epoch": 0.7, "grad_norm": 38.076986954965236, "learning_rate": 2.43550361297047e-08, "logits/chosen": -0.12951107323169708, "logits/rejected": -0.1342063993215561, "logps/chosen": -243.32846069335938, "logps/rejected": -262.29034423828125, "loss": 0.6895, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07932275533676147, "rewards/margins": 0.019449030980467796, "rewards/rejected": -0.09877178817987442, "step": 110 }, { "epoch": 0.77, "grad_norm": 37.05649918470721, "learning_rate": 1.5446867550656767e-08, "logits/chosen": -0.2998012900352478, "logits/rejected": -0.23906514048576355, "logps/chosen": -253.05178833007812, "logps/rejected": -268.27691650390625, "loss": 0.6859, "rewards/accuracies": 0.53125, "rewards/chosen": -0.08057109266519547, "rewards/margins": 0.011484967544674873, "rewards/rejected": -0.09205605834722519, "step": 120 }, { "epoch": 0.83, "grad_norm": 37.629166998074275, "learning_rate": 8.271337313934867e-09, "logits/chosen": 0.0005074322107248008, "logits/rejected": -0.1076931357383728, "logps/chosen": -243.9499053955078, "logps/rejected": -260.7448425292969, "loss": 0.6848, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08455522358417511, "rewards/margins": 0.022291380912065506, "rewards/rejected": -0.10684660822153091, "step": 130 }, { "epoch": 0.9, "grad_norm": 62.22968620333278, "learning_rate": 3.1882564680131396e-09, "logits/chosen": -0.28716200590133667, "logits/rejected": -0.3127291798591614, "logps/chosen": -243.41116333007812, "logps/rejected": -268.4090270996094, "loss": 0.687, "rewards/accuracies": 0.578125, "rewards/chosen": -0.06137162446975708, "rewards/margins": 0.038829904049634933, "rewards/rejected": -0.10020153224468231, "step": 140 }, { "epoch": 0.96, "grad_norm": 37.82208076011054, "learning_rate": 4.52511911603265e-10, "logits/chosen": -0.08432894945144653, "logits/rejected": -0.23105964064598083, "logps/chosen": -239.08517456054688, "logps/rejected": -257.0472106933594, "loss": 0.6819, "rewards/accuracies": 0.53125, "rewards/chosen": -0.09014703333377838, "rewards/margins": 0.016562053933739662, "rewards/rejected": -0.1067090854048729, "step": 150 }, { "epoch": 1.0, "step": 156, "total_flos": 0.0, "train_loss": 0.6882889736921359, "train_runtime": 18117.9848, "train_samples_per_second": 1.104, "train_steps_per_second": 0.009 } ], "logging_steps": 10, "max_steps": 156, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }