{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984301412872841, "eval_steps": 500, "global_step": 159, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006279434850863423, "grad_norm": 5.603575760096964, "learning_rate": 3.125e-08, "logits/chosen": 0.18015038967132568, "logits/rejected": 0.2519298493862152, "logps/chosen": -297.10906982421875, "logps/pi_response": -130.58929443359375, "logps/ref_response": -130.58929443359375, "logps/rejected": -316.44769287109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06279434850863422, "grad_norm": 5.927013620517721, "learning_rate": 3.1249999999999997e-07, "logits/chosen": 0.16517189145088196, "logits/rejected": 0.31397953629493713, "logps/chosen": -243.69070434570312, "logps/pi_response": -120.14439392089844, "logps/ref_response": -120.15902709960938, "logps/rejected": -281.0929870605469, "loss": 0.6929, "rewards/accuracies": 0.5069444179534912, "rewards/chosen": 9.301294630859047e-05, "rewards/margins": 0.0011727283708751202, "rewards/rejected": -0.001079715322703123, "step": 10 }, { "epoch": 0.12558869701726844, "grad_norm": 6.040197286277514, "learning_rate": 4.990353313429303e-07, "logits/chosen": 0.13175079226493835, "logits/rejected": 0.32193881273269653, "logps/chosen": -244.1867218017578, "logps/pi_response": -121.63621520996094, "logps/ref_response": -121.85536193847656, "logps/rejected": -266.6852722167969, "loss": 0.6886, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.01275191456079483, "rewards/margins": 0.0108140604570508, "rewards/rejected": -0.023565974086523056, "step": 20 }, { "epoch": 0.18838304552590268, "grad_norm": 6.198937762628843, "learning_rate": 4.882681251368548e-07, "logits/chosen": 0.1714327037334442, "logits/rejected": 0.3029390871524811, "logps/chosen": -244.3697509765625, "logps/pi_response": -109.6906967163086, "logps/ref_response": -110.8894271850586, "logps/rejected": -290.1263732910156, "loss": 0.6686, "rewards/accuracies": 0.6875, "rewards/chosen": -0.042444389313459396, "rewards/margins": 0.055226124823093414, "rewards/rejected": -0.09767051041126251, "step": 30 }, { "epoch": 0.25117739403453687, "grad_norm": 5.780662683878471, "learning_rate": 4.6604720940421207e-07, "logits/chosen": 0.2108462154865265, "logits/rejected": 0.3984522521495819, "logps/chosen": -287.33648681640625, "logps/pi_response": -125.24674224853516, "logps/ref_response": -129.86325073242188, "logps/rejected": -316.37408447265625, "loss": 0.6347, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.08861993253231049, "rewards/margins": 0.1344626396894455, "rewards/rejected": -0.22308258712291718, "step": 40 }, { "epoch": 0.3139717425431711, "grad_norm": 5.870253990249712, "learning_rate": 4.3344075855595097e-07, "logits/chosen": 0.37238794565200806, "logits/rejected": 0.5105336308479309, "logps/chosen": -246.9188232421875, "logps/pi_response": -108.8661117553711, "logps/ref_response": -116.5090560913086, "logps/rejected": -310.613037109375, "loss": 0.606, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.1104748398065567, "rewards/margins": 0.24330809712409973, "rewards/rejected": -0.3537829518318176, "step": 50 }, { "epoch": 0.37676609105180536, "grad_norm": 9.335664801309742, "learning_rate": 3.920161866827889e-07, "logits/chosen": 0.5169380903244019, "logits/rejected": 0.6794592142105103, "logps/chosen": -267.8638000488281, "logps/pi_response": -116.31705474853516, "logps/ref_response": -119.4989242553711, "logps/rejected": -347.69482421875, "loss": 0.5813, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.19944007694721222, "rewards/margins": 0.4112206995487213, "rewards/rejected": -0.6106608510017395, "step": 60 }, { "epoch": 0.43956043956043955, "grad_norm": 7.251270500863869, "learning_rate": 3.4376480090239047e-07, "logits/chosen": 0.6026707887649536, "logits/rejected": 0.7696335911750793, "logps/chosen": -236.616455078125, "logps/pi_response": -114.6134262084961, "logps/ref_response": -116.70068359375, "logps/rejected": -368.1206359863281, "loss": 0.5567, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.262630432844162, "rewards/margins": 0.5536423921585083, "rewards/rejected": -0.8162728548049927, "step": 70 }, { "epoch": 0.5023547880690737, "grad_norm": 7.652733380703419, "learning_rate": 2.910060778827554e-07, "logits/chosen": 0.4738255441188812, "logits/rejected": 0.8158077001571655, "logps/chosen": -325.28857421875, "logps/pi_response": -128.89028930664062, "logps/ref_response": -127.53900146484375, "logps/rejected": -348.1357116699219, "loss": 0.5848, "rewards/accuracies": 0.65625, "rewards/chosen": -0.36525511741638184, "rewards/margins": 0.3771258294582367, "rewards/rejected": -0.7423809766769409, "step": 80 }, { "epoch": 0.565149136577708, "grad_norm": 6.2380923795979655, "learning_rate": 2.3627616503391812e-07, "logits/chosen": 0.43513059616088867, "logits/rejected": 0.8127248883247375, "logps/chosen": -301.32684326171875, "logps/pi_response": -135.54782104492188, "logps/ref_response": -129.38760375976562, "logps/rejected": -420.8568420410156, "loss": 0.5435, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.29517459869384766, "rewards/margins": 0.6168515086174011, "rewards/rejected": -0.9120261073112488, "step": 90 }, { "epoch": 0.6279434850863422, "grad_norm": 7.580754321821852, "learning_rate": 1.8220596619089573e-07, "logits/chosen": 0.6774718165397644, "logits/rejected": 0.8504177331924438, "logps/chosen": -283.39569091796875, "logps/pi_response": -121.60555267333984, "logps/ref_response": -114.0061264038086, "logps/rejected": -304.47467041015625, "loss": 0.5426, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.3073732852935791, "rewards/margins": 0.3342594504356384, "rewards/rejected": -0.6416326761245728, "step": 100 }, { "epoch": 0.6907378335949764, "grad_norm": 7.058762821080314, "learning_rate": 1.3139467229135998e-07, "logits/chosen": 0.6427871584892273, "logits/rejected": 0.9476824998855591, "logps/chosen": -297.91217041015625, "logps/pi_response": -136.62130737304688, "logps/ref_response": -125.8144760131836, "logps/rejected": -383.70025634765625, "loss": 0.5314, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.33735179901123047, "rewards/margins": 0.5089818835258484, "rewards/rejected": -0.8463336825370789, "step": 110 }, { "epoch": 0.7535321821036107, "grad_norm": 7.301058922446218, "learning_rate": 8.628481651367875e-08, "logits/chosen": 0.5713673830032349, "logits/rejected": 0.8841819763183594, "logps/chosen": -320.47723388671875, "logps/pi_response": -131.65252685546875, "logps/ref_response": -120.58707427978516, "logps/rejected": -369.21661376953125, "loss": 0.5542, "rewards/accuracies": 0.75, "rewards/chosen": -0.39976996183395386, "rewards/margins": 0.5199070572853088, "rewards/rejected": -0.9196769595146179, "step": 120 }, { "epoch": 0.8163265306122449, "grad_norm": 6.728836632952595, "learning_rate": 4.904486005914027e-08, "logits/chosen": 0.5993797779083252, "logits/rejected": 0.8266558647155762, "logps/chosen": -276.78338623046875, "logps/pi_response": -135.93252563476562, "logps/ref_response": -123.1449966430664, "logps/rejected": -372.402587890625, "loss": 0.5271, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3279780447483063, "rewards/margins": 0.5935764908790588, "rewards/rejected": -0.921554446220398, "step": 130 }, { "epoch": 0.8791208791208791, "grad_norm": 7.263275222680605, "learning_rate": 2.1464952759020856e-08, "logits/chosen": 0.5662352442741394, "logits/rejected": 0.8682255744934082, "logps/chosen": -283.9330139160156, "logps/pi_response": -130.8988037109375, "logps/ref_response": -121.63087463378906, "logps/rejected": -404.0865173339844, "loss": 0.5299, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3388724625110626, "rewards/margins": 0.7475099563598633, "rewards/rejected": -1.0863823890686035, "step": 140 }, { "epoch": 0.9419152276295133, "grad_norm": 7.996654815605861, "learning_rate": 4.8708793644441086e-09, "logits/chosen": 0.5043578147888184, "logits/rejected": 0.8957873582839966, "logps/chosen": -317.9568786621094, "logps/pi_response": -145.6123504638672, "logps/ref_response": -132.86119079589844, "logps/rejected": -398.7013854980469, "loss": 0.5193, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.3905355930328369, "rewards/margins": 0.5843077898025513, "rewards/rejected": -0.9748433232307434, "step": 150 }, { "epoch": 0.9984301412872841, "step": 159, "total_flos": 0.0, "train_loss": 0.5789255676029613, "train_runtime": 4418.8006, "train_samples_per_second": 4.612, "train_steps_per_second": 0.036 } ], "logging_steps": 10, "max_steps": 159, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }