{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9945, "eval_steps": 500, "global_step": 153, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "eta": 0.0010000000474974513, "grad_norm": 18.026980953188083, "learning_rate": 3.125e-08, "logits/chosen": -1.4277913570404053, "logits/rejected": -1.5556963682174683, "logps/chosen": -139.25473022460938, "logps/pi_response": -223.89479064941406, "logps/ref_response": -223.89479064941406, "logps/rejected": -126.51285552978516, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.07, "eta": 0.0009999999310821295, "grad_norm": 16.443972323199848, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -1.7425695657730103, "logits/rejected": -1.763687014579773, "logps/chosen": -179.83900451660156, "logps/pi_response": -274.66912841796875, "logps/ref_response": -272.7638854980469, "logps/rejected": -183.200927734375, "loss": 0.6931, "rewards/accuracies": 0.47863247990608215, "rewards/chosen": -0.009404909797012806, "rewards/margins": 0.0002608599897939712, "rewards/rejected": -0.009665770456194878, "step": 10 }, { "epoch": 0.13, "eta": 0.0010000000474974513, "grad_norm": 20.67365935289489, "learning_rate": 4.989490450759331e-07, "logits/chosen": -1.660647988319397, "logits/rejected": -1.5326403379440308, "logps/chosen": -199.7965087890625, "logps/pi_response": -299.9808654785156, "logps/ref_response": -270.13507080078125, "logps/rejected": -201.1602783203125, "loss": 0.6905, "rewards/accuracies": 0.5730769038200378, "rewards/chosen": -0.12711657583713531, "rewards/margins": 0.027275390923023224, "rewards/rejected": -0.15439198911190033, "step": 20 }, { "epoch": 0.2, "eta": 0.0010000000474974513, "grad_norm": 16.68985237393775, "learning_rate": 4.872270441827174e-07, "logits/chosen": -1.4958996772766113, "logits/rejected": -1.5024709701538086, "logps/chosen": -192.6185302734375, "logps/pi_response": -322.8236389160156, "logps/ref_response": -270.1427917480469, "logps/rejected": -197.94105529785156, "loss": 0.6956, "rewards/accuracies": 0.5192307829856873, "rewards/chosen": -0.22080166637897491, "rewards/margins": 0.01529843732714653, "rewards/rejected": -0.23610009253025055, "step": 30 }, { "epoch": 0.26, "eta": 0.0010000000474974513, "grad_norm": 18.609452469800672, "learning_rate": 4.6308512113530063e-07, "logits/chosen": -1.47615385055542, "logits/rejected": -1.4204249382019043, "logps/chosen": -221.22259521484375, "logps/pi_response": -322.4052429199219, "logps/ref_response": -275.0693359375, "logps/rejected": -216.37770080566406, "loss": 0.6913, "rewards/accuracies": 0.5192307829856873, "rewards/chosen": -0.20735466480255127, "rewards/margins": -0.0017751154955476522, "rewards/rejected": -0.20557956397533417, "step": 40 }, { "epoch": 0.33, "eta": 0.0010000000474974513, "grad_norm": 16.469711313950462, "learning_rate": 4.277872161641681e-07, "logits/chosen": -1.3499563932418823, "logits/rejected": -1.2568999528884888, "logps/chosen": -196.1409149169922, "logps/pi_response": -306.9660949707031, "logps/ref_response": -266.49676513671875, "logps/rejected": -198.00282287597656, "loss": 0.693, "rewards/accuracies": 0.5269230604171753, "rewards/chosen": -0.19123147428035736, "rewards/margins": 0.02352394536137581, "rewards/rejected": -0.21475543081760406, "step": 50 }, { "epoch": 0.39, "eta": 0.0010000000474974513, "grad_norm": 14.511218616556945, "learning_rate": 3.8318133624280046e-07, "logits/chosen": -1.4744631052017212, "logits/rejected": -1.430633306503296, "logps/chosen": -225.62750244140625, "logps/pi_response": -312.2039489746094, "logps/ref_response": -265.6466064453125, "logps/rejected": -226.8577880859375, "loss": 0.6857, "rewards/accuracies": 0.5307692289352417, "rewards/chosen": -0.27648842334747314, "rewards/margins": 0.02260260097682476, "rewards/rejected": -0.29909104108810425, "step": 60 }, { "epoch": 0.46, "eta": 0.0010000000474974513, "grad_norm": 16.09750795949007, "learning_rate": 3.316028034595861e-07, "logits/chosen": -1.326416015625, "logits/rejected": -1.3301128149032593, "logps/chosen": -239.2198028564453, "logps/pi_response": -351.46148681640625, "logps/ref_response": -276.6808166503906, "logps/rejected": -242.2137908935547, "loss": 0.6913, "rewards/accuracies": 0.48076921701431274, "rewards/chosen": -0.5429643988609314, "rewards/margins": -0.012777900323271751, "rewards/rejected": -0.5301865339279175, "step": 70 }, { "epoch": 0.52, "eta": 0.0010000000474974513, "grad_norm": 17.876259342764257, "learning_rate": 2.7575199021178855e-07, "logits/chosen": -1.2329574823379517, "logits/rejected": -1.1573874950408936, "logps/chosen": -236.77549743652344, "logps/pi_response": -343.8861389160156, "logps/ref_response": -271.09393310546875, "logps/rejected": -233.1557159423828, "loss": 0.6869, "rewards/accuracies": 0.5115384459495544, "rewards/chosen": -0.48039481043815613, "rewards/margins": 0.0135263130068779, "rewards/rejected": -0.4939211308956146, "step": 80 }, { "epoch": 0.58, "eta": 0.0010000000474974513, "grad_norm": 16.681056784028165, "learning_rate": 2.1855294234408068e-07, "logits/chosen": -1.3373737335205078, "logits/rejected": -1.2385435104370117, "logps/chosen": -223.1448211669922, "logps/pi_response": -334.9044494628906, "logps/ref_response": -266.6827697753906, "logps/rejected": -225.2432403564453, "loss": 0.6904, "rewards/accuracies": 0.5769230723381042, "rewards/chosen": -0.39324137568473816, "rewards/margins": 0.031876321882009506, "rewards/rejected": -0.42511770129203796, "step": 90 }, { "epoch": 0.65, "eta": 0.0010000000474974513, "grad_norm": 69.55993387837812, "learning_rate": 1.6300029195778453e-07, "logits/chosen": -1.1172312498092651, "logits/rejected": -1.0658454895019531, "logps/chosen": -242.79571533203125, "logps/pi_response": -359.3255310058594, "logps/ref_response": -268.1200256347656, "logps/rejected": -244.661865234375, "loss": 0.6854, "rewards/accuracies": 0.5769230723381042, "rewards/chosen": -0.6469340920448303, "rewards/margins": 0.03381125256419182, "rewards/rejected": -0.6807453632354736, "step": 100 }, { "epoch": 0.71, "eta": 0.0010000000474974513, "grad_norm": 17.847360893927068, "learning_rate": 1.1200247470632392e-07, "logits/chosen": -1.0057731866836548, "logits/rejected": -1.0688775777816772, "logps/chosen": -267.8451232910156, "logps/pi_response": -402.1688537597656, "logps/ref_response": -288.839599609375, "logps/rejected": -276.8705139160156, "loss": 0.68, "rewards/accuracies": 0.5346153974533081, "rewards/chosen": -0.8569359183311462, "rewards/margins": 0.06452393531799316, "rewards/rejected": -0.9214598536491394, "step": 110 }, { "epoch": 0.78, "eta": 0.0010000000474974513, "grad_norm": 15.803343162533332, "learning_rate": 6.822945986946385e-08, "logits/chosen": -0.9977442622184753, "logits/rejected": -1.0390866994857788, "logps/chosen": -259.6402587890625, "logps/pi_response": -377.9400939941406, "logps/ref_response": -266.2973937988281, "logps/rejected": -268.2873840332031, "loss": 0.6756, "rewards/accuracies": 0.5269230604171753, "rewards/chosen": -0.7336382269859314, "rewards/margins": 0.044992994517087936, "rewards/rejected": -0.7786312103271484, "step": 120 }, { "epoch": 0.84, "eta": 0.0010000000474974513, "grad_norm": 22.692522756797395, "learning_rate": 3.397296523427806e-08, "logits/chosen": -0.895865797996521, "logits/rejected": -0.7990739345550537, "logps/chosen": -237.3929443359375, "logps/pi_response": -364.76470947265625, "logps/ref_response": -257.66021728515625, "logps/rejected": -240.6884307861328, "loss": 0.6773, "rewards/accuracies": 0.5653846263885498, "rewards/chosen": -0.688443124294281, "rewards/margins": 0.04434271529316902, "rewards/rejected": -0.7327858209609985, "step": 130 }, { "epoch": 0.91, "eta": 0.0010000000474974513, "grad_norm": 19.97244033218429, "learning_rate": 1.1026475173977978e-08, "logits/chosen": -1.052285075187683, "logits/rejected": -1.1165075302124023, "logps/chosen": -247.59332275390625, "logps/pi_response": -372.91534423828125, "logps/ref_response": -265.1993408203125, "logps/rejected": -252.52793884277344, "loss": 0.6715, "rewards/accuracies": 0.5923076868057251, "rewards/chosen": -0.640332043170929, "rewards/margins": 0.03581571578979492, "rewards/rejected": -0.6761477589607239, "step": 140 }, { "epoch": 0.97, "eta": 0.0010000000474974513, "grad_norm": 17.943905253705193, "learning_rate": 5.913435276374834e-10, "logits/chosen": -1.0871530771255493, "logits/rejected": -1.1508903503417969, "logps/chosen": -246.49159240722656, "logps/pi_response": -384.3344421386719, "logps/ref_response": -275.81292724609375, "logps/rejected": -250.1259307861328, "loss": 0.6818, "rewards/accuracies": 0.5692307949066162, "rewards/chosen": -0.6599874496459961, "rewards/margins": 0.061169885098934174, "rewards/rejected": -0.721157431602478, "step": 150 }, { "epoch": 0.99, "step": 153, "total_flos": 0.0, "train_loss": 0.685623891213361, "train_runtime": 23340.5225, "train_samples_per_second": 0.857, "train_steps_per_second": 0.007 } ], "logging_steps": 10, "max_steps": 153, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }