{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9945, "eval_steps": 500, "global_step": 153, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 12.825809580615244, "learning_rate": 3.125e-08, "logits/chosen": -2.23366379737854, "logits/rejected": -2.0032992362976074, "logps/chosen": -196.23782348632812, "logps/rejected": -174.6262969970703, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.07, "grad_norm": 17.370258695749197, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.343287944793701, "logits/rejected": -2.2950587272644043, "logps/chosen": -179.1259002685547, "logps/rejected": -178.35891723632812, "loss": 0.6927, "rewards/accuracies": 0.4145299196243286, "rewards/chosen": -0.001734515419229865, "rewards/margins": 0.001407344127073884, "rewards/rejected": -0.003141859546303749, "step": 10 }, { "epoch": 0.13, "grad_norm": 15.379940853087401, "learning_rate": 4.989490450759331e-07, "logits/chosen": -2.3157341480255127, "logits/rejected": -2.2556896209716797, "logps/chosen": -182.99658203125, "logps/rejected": -181.58053588867188, "loss": 0.691, "rewards/accuracies": 0.4961538314819336, "rewards/chosen": -0.12090444564819336, "rewards/margins": 0.01673085428774357, "rewards/rejected": -0.13763530552387238, "step": 20 }, { "epoch": 0.2, "grad_norm": 16.658033176727283, "learning_rate": 4.872270441827174e-07, "logits/chosen": -2.240647077560425, "logits/rejected": -2.139284133911133, "logps/chosen": -191.006591796875, "logps/rejected": -186.6674041748047, "loss": 0.6929, "rewards/accuracies": 0.4961538314819336, "rewards/chosen": -0.23701129853725433, "rewards/margins": -5.6074215535772964e-05, "rewards/rejected": -0.23695524036884308, "step": 30 }, { "epoch": 0.26, "grad_norm": 13.482187340190304, "learning_rate": 4.6308512113530063e-07, "logits/chosen": -2.2772746086120605, "logits/rejected": -2.310314893722534, "logps/chosen": -174.7858428955078, "logps/rejected": -184.82554626464844, "loss": 0.6912, "rewards/accuracies": 0.557692289352417, "rewards/chosen": -0.03568955510854721, "rewards/margins": 0.0036310250870883465, "rewards/rejected": -0.03932058438658714, "step": 40 }, { "epoch": 0.33, "grad_norm": 17.865521715649642, "learning_rate": 4.277872161641681e-07, "logits/chosen": -2.2564620971679688, "logits/rejected": -2.278386116027832, "logps/chosen": -192.4917755126953, "logps/rejected": -193.97947692871094, "loss": 0.6886, "rewards/accuracies": 0.48076921701431274, "rewards/chosen": -0.042664218693971634, "rewards/margins": 0.010273917578160763, "rewards/rejected": -0.05293813720345497, "step": 50 }, { "epoch": 0.39, "grad_norm": 22.19136114113986, "learning_rate": 3.8318133624280046e-07, "logits/chosen": -2.145270824432373, "logits/rejected": -2.073192596435547, "logps/chosen": -224.1717529296875, "logps/rejected": -225.2917022705078, "loss": 0.6932, "rewards/accuracies": 0.5384615659713745, "rewards/chosen": -0.3118091821670532, "rewards/margins": 0.0030472425278276205, "rewards/rejected": -0.3148564398288727, "step": 60 }, { "epoch": 0.46, "grad_norm": 15.131244681651243, "learning_rate": 3.316028034595861e-07, "logits/chosen": -1.976121187210083, "logits/rejected": -1.939582347869873, "logps/chosen": -199.67727661132812, "logps/rejected": -203.1947021484375, "loss": 0.6892, "rewards/accuracies": 0.5807692408561707, "rewards/chosen": -0.179255411028862, "rewards/margins": 0.009526830166578293, "rewards/rejected": -0.1887822449207306, "step": 70 }, { "epoch": 0.52, "grad_norm": 16.3790838983982, "learning_rate": 2.7575199021178855e-07, "logits/chosen": -2.0032222270965576, "logits/rejected": -1.8593822717666626, "logps/chosen": -202.15196228027344, "logps/rejected": -215.4062042236328, "loss": 0.6848, "rewards/accuracies": 0.5846154093742371, "rewards/chosen": -0.20821429789066315, "rewards/margins": 0.04003766551613808, "rewards/rejected": -0.24825195968151093, "step": 80 }, { "epoch": 0.58, "grad_norm": 20.64519176920963, "learning_rate": 2.1855294234408068e-07, "logits/chosen": -1.5563592910766602, "logits/rejected": -1.8040456771850586, "logps/chosen": -219.68856811523438, "logps/rejected": -231.39486694335938, "loss": 0.6812, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.41656777262687683, "rewards/margins": 0.060058608651161194, "rewards/rejected": -0.47662636637687683, "step": 90 }, { "epoch": 0.65, "grad_norm": 20.02675570793956, "learning_rate": 1.6300029195778453e-07, "logits/chosen": -1.4402265548706055, "logits/rejected": -1.5615330934524536, "logps/chosen": -229.6835174560547, "logps/rejected": -234.88693237304688, "loss": 0.6791, "rewards/accuracies": 0.5076923370361328, "rewards/chosen": -0.511164665222168, "rewards/margins": 0.028713112697005272, "rewards/rejected": -0.5398777723312378, "step": 100 }, { "epoch": 0.71, "grad_norm": 17.88209150198185, "learning_rate": 1.1200247470632392e-07, "logits/chosen": -1.8932424783706665, "logits/rejected": -1.7129985094070435, "logps/chosen": -198.45960998535156, "logps/rejected": -198.94027709960938, "loss": 0.6797, "rewards/accuracies": 0.6230769157409668, "rewards/chosen": -0.1736122965812683, "rewards/margins": 0.07991237938404083, "rewards/rejected": -0.25352466106414795, "step": 110 }, { "epoch": 0.78, "grad_norm": 18.1137509515123, "learning_rate": 6.822945986946385e-08, "logits/chosen": -1.6426665782928467, "logits/rejected": -1.8510866165161133, "logps/chosen": -188.033203125, "logps/rejected": -202.1238250732422, "loss": 0.6814, "rewards/accuracies": 0.5769230723381042, "rewards/chosen": -0.1463213562965393, "rewards/margins": 0.04249217361211777, "rewards/rejected": -0.18881353735923767, "step": 120 }, { "epoch": 0.84, "grad_norm": 19.900932973122792, "learning_rate": 3.397296523427806e-08, "logits/chosen": -1.7662078142166138, "logits/rejected": -1.7134820222854614, "logps/chosen": -197.23963928222656, "logps/rejected": -202.8521270751953, "loss": 0.6918, "rewards/accuracies": 0.5653846263885498, "rewards/chosen": -0.2858903706073761, "rewards/margins": 0.0338900126516819, "rewards/rejected": -0.3197803497314453, "step": 130 }, { "epoch": 0.91, "grad_norm": 19.787769013168933, "learning_rate": 1.1026475173977978e-08, "logits/chosen": -1.6619917154312134, "logits/rejected": -1.7303296327590942, "logps/chosen": -194.7332305908203, "logps/rejected": -205.8466796875, "loss": 0.68, "rewards/accuracies": 0.6230769157409668, "rewards/chosen": -0.21614637970924377, "rewards/margins": 0.07540787756443024, "rewards/rejected": -0.2915542721748352, "step": 140 }, { "epoch": 0.97, "grad_norm": 19.04982739845647, "learning_rate": 5.913435276374834e-10, "logits/chosen": -1.928228497505188, "logits/rejected": -1.5072005987167358, "logps/chosen": -211.20840454101562, "logps/rejected": -211.61215209960938, "loss": 0.6772, "rewards/accuracies": 0.5884615182876587, "rewards/chosen": -0.28447794914245605, "rewards/margins": 0.0762421116232872, "rewards/rejected": -0.36072006821632385, "step": 150 }, { "epoch": 0.99, "step": 153, "total_flos": 0.0, "train_loss": 0.6859961845516379, "train_runtime": 39900.7608, "train_samples_per_second": 0.501, "train_steps_per_second": 0.004 } ], "logging_steps": 10, "max_steps": 153, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }