{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 1000, "global_step": 239, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0041841004184100415, "grad_norm": 2.3349975667313307, "learning_rate": 2.083333333333333e-08, "logits/chosen": -1.168128490447998, "logits/rejected": -1.1001684665679932, "logps/chosen": -345.7468566894531, "logps/rejected": -388.03436279296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.04184100418410042, "grad_norm": 2.2444991754758727, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -1.1528964042663574, "logits/rejected": -1.2175092697143555, "logps/chosen": -327.3994445800781, "logps/rejected": -330.5547790527344, "loss": 0.6932, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": 0.0001323310425505042, "rewards/margins": 0.0002724311489146203, "rewards/rejected": -0.00014010007726028562, "step": 10 }, { "epoch": 0.08368200836820083, "grad_norm": 3.1970801165125993, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -1.1212791204452515, "logits/rejected": -1.1984214782714844, "logps/chosen": -337.3634033203125, "logps/rejected": -310.3741455078125, "loss": 0.6931, "rewards/accuracies": 0.503125011920929, "rewards/chosen": 0.0006343486020341516, "rewards/margins": 0.00016590832092333585, "rewards/rejected": 0.00046844035387039185, "step": 20 }, { "epoch": 0.12552301255230125, "grad_norm": 2.033994682703014, "learning_rate": 4.990398100856366e-07, "logits/chosen": -1.1630992889404297, "logits/rejected": -1.1824390888214111, "logps/chosen": -327.43975830078125, "logps/rejected": -347.76409912109375, "loss": 0.6922, "rewards/accuracies": 0.546875, "rewards/chosen": 0.004818023648113012, "rewards/margins": 0.0015862248837947845, "rewards/rejected": 0.003231799229979515, "step": 30 }, { "epoch": 0.16736401673640167, "grad_norm": 2.045259811639636, "learning_rate": 4.931986719649298e-07, "logits/chosen": -1.2456501722335815, "logits/rejected": -1.2868419885635376, "logps/chosen": -352.53753662109375, "logps/rejected": -314.9628601074219, "loss": 0.6901, "rewards/accuracies": 0.640625, "rewards/chosen": 0.013885138556361198, "rewards/margins": 0.00753264594823122, "rewards/rejected": 0.006352494470775127, "step": 40 }, { "epoch": 0.20920502092050208, "grad_norm": 1.9514164939530987, "learning_rate": 4.821741763807186e-07, "logits/chosen": -1.2360882759094238, "logits/rejected": -1.3141974210739136, "logps/chosen": -316.40191650390625, "logps/rejected": -312.9485168457031, "loss": 0.6873, "rewards/accuracies": 0.671875, "rewards/chosen": 0.020323345437645912, "rewards/margins": 0.012846221216022968, "rewards/rejected": 0.007477124221622944, "step": 50 }, { "epoch": 0.2510460251046025, "grad_norm": 2.209686138841628, "learning_rate": 4.662012913161997e-07, "logits/chosen": -1.2470484972000122, "logits/rejected": -1.358208417892456, "logps/chosen": -348.89031982421875, "logps/rejected": -317.46673583984375, "loss": 0.685, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": 0.03178168088197708, "rewards/margins": 0.01709566079080105, "rewards/rejected": 0.014686022885143757, "step": 60 }, { "epoch": 0.2928870292887029, "grad_norm": 1.8450350094954893, "learning_rate": 4.456204510851956e-07, "logits/chosen": -1.2555049657821655, "logits/rejected": -1.3169441223144531, "logps/chosen": -344.35125732421875, "logps/rejected": -338.9962158203125, "loss": 0.6816, "rewards/accuracies": 0.640625, "rewards/chosen": 0.040135230869054794, "rewards/margins": 0.024550432339310646, "rewards/rejected": 0.015584799461066723, "step": 70 }, { "epoch": 0.33472803347280333, "grad_norm": 1.8585724869830902, "learning_rate": 4.2087030056579986e-07, "logits/chosen": -1.2849452495574951, "logits/rejected": -1.3933959007263184, "logps/chosen": -328.14727783203125, "logps/rejected": -317.79901123046875, "loss": 0.6802, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.03799828886985779, "rewards/margins": 0.022710641846060753, "rewards/rejected": 0.015287647023797035, "step": 80 }, { "epoch": 0.37656903765690375, "grad_norm": 1.9259218744474744, "learning_rate": 3.9247834624635404e-07, "logits/chosen": -1.2762553691864014, "logits/rejected": -1.367308497428894, "logps/chosen": -303.9913024902344, "logps/rejected": -290.3238830566406, "loss": 0.6758, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.04959210380911827, "rewards/margins": 0.03906155750155449, "rewards/rejected": 0.010530550964176655, "step": 90 }, { "epoch": 0.41841004184100417, "grad_norm": 2.007278325554356, "learning_rate": 3.610497133404795e-07, "logits/chosen": -1.3190323114395142, "logits/rejected": -1.3636301755905151, "logps/chosen": -319.6354064941406, "logps/rejected": -312.177490234375, "loss": 0.6759, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": 0.053187936544418335, "rewards/margins": 0.04710903763771057, "rewards/rejected": 0.0060789017006754875, "step": 100 }, { "epoch": 0.4602510460251046, "grad_norm": 1.8893655511763634, "learning_rate": 3.272542485937368e-07, "logits/chosen": -1.2609952688217163, "logits/rejected": -1.3663299083709717, "logps/chosen": -330.7979736328125, "logps/rejected": -304.4183654785156, "loss": 0.6735, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.04585585743188858, "rewards/margins": 0.048381414264440536, "rewards/rejected": -0.0025255570653826, "step": 110 }, { "epoch": 0.502092050209205, "grad_norm": 15.033556125532698, "learning_rate": 2.9181224366319943e-07, "logits/chosen": -1.2779179811477661, "logits/rejected": -1.3179519176483154, "logps/chosen": -308.0982666015625, "logps/rejected": -307.01239013671875, "loss": 0.6698, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": 0.05374608188867569, "rewards/margins": 0.05363842099905014, "rewards/rejected": 0.00010766675404738635, "step": 120 }, { "epoch": 0.5439330543933054, "grad_norm": 1.7668289559235293, "learning_rate": 2.55479083351317e-07, "logits/chosen": -1.2395732402801514, "logits/rejected": -1.33193039894104, "logps/chosen": -348.57562255859375, "logps/rejected": -316.04498291015625, "loss": 0.6689, "rewards/accuracies": 0.71875, "rewards/chosen": 0.06320692598819733, "rewards/margins": 0.07001270353794098, "rewards/rejected": -0.006805785000324249, "step": 130 }, { "epoch": 0.5857740585774058, "grad_norm": 2.0024361354347793, "learning_rate": 2.19029145890313e-07, "logits/chosen": -1.2449181079864502, "logits/rejected": -1.329097867012024, "logps/chosen": -326.33868408203125, "logps/rejected": -311.33966064453125, "loss": 0.6678, "rewards/accuracies": 0.653124988079071, "rewards/chosen": 0.054462842643260956, "rewards/margins": 0.05262676998972893, "rewards/rejected": 0.0018360689282417297, "step": 140 }, { "epoch": 0.6276150627615062, "grad_norm": 1.8787432387858691, "learning_rate": 1.8323929841460178e-07, "logits/chosen": -1.2301727533340454, "logits/rejected": -1.332716703414917, "logps/chosen": -326.64910888671875, "logps/rejected": -308.0601501464844, "loss": 0.6659, "rewards/accuracies": 0.65625, "rewards/chosen": 0.035623423755168915, "rewards/margins": 0.05233610421419144, "rewards/rejected": -0.016712676733732224, "step": 150 }, { "epoch": 0.6694560669456067, "grad_norm": 1.9176367647768078, "learning_rate": 1.488723393865766e-07, "logits/chosen": -1.27090322971344, "logits/rejected": -1.314906358718872, "logps/chosen": -344.8522644042969, "logps/rejected": -312.78271484375, "loss": 0.6612, "rewards/accuracies": 0.6875, "rewards/chosen": 0.045879464596509933, "rewards/margins": 0.0723222941160202, "rewards/rejected": -0.02644282951951027, "step": 160 }, { "epoch": 0.7112970711297071, "grad_norm": 2.0586908284329564, "learning_rate": 1.1666074087171627e-07, "logits/chosen": -1.2575898170471191, "logits/rejected": -1.2899234294891357, "logps/chosen": -332.93658447265625, "logps/rejected": -329.26226806640625, "loss": 0.6644, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.03310205787420273, "rewards/margins": 0.057156242430210114, "rewards/rejected": -0.024054180830717087, "step": 170 }, { "epoch": 0.7531380753138075, "grad_norm": 1.8213789779768046, "learning_rate": 8.729103716819111e-08, "logits/chosen": -1.2486387491226196, "logits/rejected": -1.2856649160385132, "logps/chosen": -332.7528076171875, "logps/rejected": -310.4466857910156, "loss": 0.6616, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.047333668917417526, "rewards/margins": 0.07636959850788116, "rewards/rejected": -0.029035940766334534, "step": 180 }, { "epoch": 0.7949790794979079, "grad_norm": 1.863301500159963, "learning_rate": 6.138919252022435e-08, "logits/chosen": -1.2751197814941406, "logits/rejected": -1.3302768468856812, "logps/chosen": -313.1865539550781, "logps/rejected": -310.4251708984375, "loss": 0.6592, "rewards/accuracies": 0.653124988079071, "rewards/chosen": 0.020702695474028587, "rewards/margins": 0.06860536336898804, "rewards/rejected": -0.0479026660323143, "step": 190 }, { "epoch": 0.8368200836820083, "grad_norm": 4.872667925392408, "learning_rate": 3.9507259776993954e-08, "logits/chosen": -1.2594833374023438, "logits/rejected": -1.2656358480453491, "logps/chosen": -339.21563720703125, "logps/rejected": -337.2076416015625, "loss": 0.6604, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.039656586945056915, "rewards/margins": 0.0757637619972229, "rewards/rejected": -0.036107175052165985, "step": 200 }, { "epoch": 0.8786610878661087, "grad_norm": 1.9793278923515518, "learning_rate": 2.2111614344599684e-08, "logits/chosen": -1.1997952461242676, "logits/rejected": -1.318477988243103, "logps/chosen": -347.9354553222656, "logps/rejected": -335.8204040527344, "loss": 0.6613, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.03624551743268967, "rewards/margins": 0.0588444285094738, "rewards/rejected": -0.022598912939429283, "step": 210 }, { "epoch": 0.9205020920502092, "grad_norm": 1.9125182215206764, "learning_rate": 9.57301420397924e-09, "logits/chosen": -1.2063685655593872, "logits/rejected": -1.248859167098999, "logps/chosen": -338.15325927734375, "logps/rejected": -323.5889892578125, "loss": 0.6584, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.04115135595202446, "rewards/margins": 0.07347537577152252, "rewards/rejected": -0.03232400864362717, "step": 220 }, { "epoch": 0.9623430962343096, "grad_norm": 1.9573083804318165, "learning_rate": 2.158697848236607e-09, "logits/chosen": -1.2343095541000366, "logits/rejected": -1.3073015213012695, "logps/chosen": -334.5377197265625, "logps/rejected": -336.1081237792969, "loss": 0.6611, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": 0.028930548578500748, "rewards/margins": 0.054905764758586884, "rewards/rejected": -0.025975216180086136, "step": 230 }, { "epoch": 1.0, "step": 239, "total_flos": 0.0, "train_loss": 0.6731121569996599, "train_runtime": 6739.8654, "train_samples_per_second": 9.071, "train_steps_per_second": 0.035 } ], "logging_steps": 10, "max_steps": 239, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }