{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984942278735152, "eval_steps": 1000, "global_step": 373, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.6315789473684208e-08, "logits/chosen": -2.4137167930603027, "logits/rejected": -2.3947949409484863, "logps/chosen": -426.3388671875, "logps/rejected": -370.72119140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/mix_margin": 8.940696716308594e-08, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 2.631578947368421e-07, "logits/chosen": -2.392120599746704, "logits/rejected": -2.363373041152954, "logps/chosen": -433.6437683105469, "logps/rejected": -373.4803161621094, "loss": 0.6931, "rewards/accuracies": 0.4166666567325592, "rewards/chosen": 0.0020564268343150616, "rewards/confidence": -0.015508468262851238, "rewards/confidence_mean_diff": 0.015508468262851238, "rewards/confidence_moving_diff": 0.0011424963595345616, "rewards/margins": 0.0006688511930406094, "rewards/mix_margin": 0.0004061816434841603, "rewards/real_percentage": 13.142857551574707, "rewards/rejected": 0.0013875758741050959, "step": 10 }, { "epoch": 0.05, "learning_rate": 5.263157894736842e-07, "logits/chosen": -2.468066930770874, "logits/rejected": -2.4569571018218994, "logps/chosen": -417.98712158203125, "logps/rejected": -369.22174072265625, "loss": 0.684, "rewards/accuracies": 0.625, "rewards/chosen": 0.07105853408575058, "rewards/confidence": -0.015660269185900688, "rewards/confidence_mean_diff": 0.015660269185900688, "rewards/confidence_moving_diff": 0.0015042915474623442, "rewards/margins": 0.017939578741788864, "rewards/mix_margin": 0.00918654352426529, "rewards/real_percentage": 7.199999809265137, "rewards/rejected": 0.05311895161867142, "step": 20 }, { "epoch": 0.08, "learning_rate": 7.894736842105263e-07, "logits/chosen": -2.3964927196502686, "logits/rejected": -2.36332368850708, "logps/chosen": -378.4501953125, "logps/rejected": -373.8832702636719, "loss": 0.6549, "rewards/accuracies": 0.625, "rewards/chosen": 0.26502424478530884, "rewards/confidence": -0.07313869893550873, "rewards/confidence_mean_diff": 0.07313869893550873, "rewards/confidence_moving_diff": 0.006530501879751682, "rewards/margins": 0.06049920991063118, "rewards/mix_margin": 0.03238716349005699, "rewards/real_percentage": 7.199999809265137, "rewards/rejected": 0.20452503859996796, "step": 30 }, { "epoch": 0.11, "learning_rate": 9.99912057785006e-07, "logits/chosen": -2.3308424949645996, "logits/rejected": -2.3119282722473145, "logps/chosen": -391.1062927246094, "logps/rejected": -353.74322509765625, "loss": 0.6275, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.41230279207229614, "rewards/confidence": -0.18921701610088348, "rewards/confidence_mean_diff": 0.18921701610088348, "rewards/confidence_moving_diff": 0.015285460278391838, "rewards/margins": 0.16258221864700317, "rewards/mix_margin": 0.08998984843492508, "rewards/real_percentage": 6.800000190734863, "rewards/rejected": 0.24972060322761536, "step": 40 }, { "epoch": 0.13, "learning_rate": 9.96837327251561e-07, "logits/chosen": -2.093959093093872, "logits/rejected": -2.0577762126922607, "logps/chosen": -372.08795166015625, "logps/rejected": -342.28985595703125, "loss": 0.5908, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5377326607704163, "rewards/confidence": -0.2739863395690918, "rewards/confidence_mean_diff": 0.2739863395690918, "rewards/confidence_moving_diff": -0.00014854370965622365, "rewards/margins": 0.255431205034256, "rewards/mix_margin": 0.15421859920024872, "rewards/real_percentage": 5.400000095367432, "rewards/rejected": 0.2823014557361603, "step": 50 }, { "epoch": 0.16, "learning_rate": 9.893963724218455e-07, "logits/chosen": -2.016526222229004, "logits/rejected": -1.9890506267547607, "logps/chosen": -398.5238342285156, "logps/rejected": -394.8996887207031, "loss": 0.5858, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.7251685857772827, "rewards/confidence": -0.3129069209098816, "rewards/confidence_mean_diff": 0.3129069209098816, "rewards/confidence_moving_diff": 0.03315434604883194, "rewards/margins": 0.24435606598854065, "rewards/mix_margin": 0.1545059084892273, "rewards/real_percentage": 6.0, "rewards/rejected": 0.4808124899864197, "step": 60 }, { "epoch": 0.19, "learning_rate": 9.776545846744508e-07, "logits/chosen": -2.0126547813415527, "logits/rejected": -1.9728899002075195, "logps/chosen": -415.135009765625, "logps/rejected": -401.66015625, "loss": 0.5708, "rewards/accuracies": 0.75, "rewards/chosen": 0.6391752362251282, "rewards/confidence": -0.3218304216861725, "rewards/confidence_mean_diff": 0.3218304216861725, "rewards/confidence_moving_diff": -0.009388929232954979, "rewards/margins": 0.45264172554016113, "rewards/mix_margin": 0.2859700620174408, "rewards/real_percentage": 6.0, "rewards/rejected": 0.18653348088264465, "step": 70 }, { "epoch": 0.21, "learning_rate": 9.617151512622916e-07, "logits/chosen": -2.166015863418579, "logits/rejected": -2.138554811477661, "logps/chosen": -388.05712890625, "logps/rejected": -370.957275390625, "loss": 0.5711, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.5867148041725159, "rewards/confidence": -0.36461326479911804, "rewards/confidence_mean_diff": 0.36461326479911804, "rewards/confidence_moving_diff": -0.02922860160470009, "rewards/margins": 0.4257655739784241, "rewards/mix_margin": 0.2721550762653351, "rewards/real_percentage": 6.0, "rewards/rejected": 0.16094925999641418, "step": 80 }, { "epoch": 0.24, "learning_rate": 9.417181484993153e-07, "logits/chosen": -2.1650993824005127, "logits/rejected": -2.115447521209717, "logps/chosen": -439.16943359375, "logps/rejected": -413.0699157714844, "loss": 0.5203, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.4852641224861145, "rewards/confidence": -0.28467947244644165, "rewards/confidence_mean_diff": 0.28467947244644165, "rewards/confidence_moving_diff": 0.02202179655432701, "rewards/margins": 0.4944303035736084, "rewards/mix_margin": 0.3211807608604431, "rewards/real_percentage": 7.400000095367432, "rewards/rejected": -0.009166148491203785, "step": 90 }, { "epoch": 0.27, "learning_rate": 9.178393107648192e-07, "logits/chosen": -1.87009596824646, "logits/rejected": -1.8090906143188477, "logps/chosen": -472.92431640625, "logps/rejected": -443.81890869140625, "loss": 0.5228, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.16720852255821228, "rewards/confidence": -0.3673645853996277, "rewards/confidence_mean_diff": 0.3673645853996277, "rewards/confidence_moving_diff": -0.01578204333782196, "rewards/margins": 0.4659162163734436, "rewards/mix_margin": 0.31855860352516174, "rewards/real_percentage": 6.800000190734863, "rewards/rejected": -0.6331247091293335, "step": 100 }, { "epoch": 0.29, "learning_rate": 8.902884861434064e-07, "logits/chosen": -1.9227062463760376, "logits/rejected": -1.8704681396484375, "logps/chosen": -456.46038818359375, "logps/rejected": -427.544677734375, "loss": 0.5155, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.3435952365398407, "rewards/confidence": -0.3272072374820709, "rewards/confidence_mean_diff": 0.3272072374820709, "rewards/confidence_moving_diff": 0.029508760198950768, "rewards/margins": 0.6112080812454224, "rewards/mix_margin": 0.4010244905948639, "rewards/real_percentage": 6.400000095367432, "rewards/rejected": -0.9548032879829407, "step": 110 }, { "epoch": 0.32, "learning_rate": 8.593077922724732e-07, "logits/chosen": -2.068099021911621, "logits/rejected": -2.0194125175476074, "logps/chosen": -418.9891052246094, "logps/rejected": -398.68817138671875, "loss": 0.518, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.024905536323785782, "rewards/confidence": -0.35911375284194946, "rewards/confidence_mean_diff": 0.35911375284194946, "rewards/confidence_moving_diff": -0.02757805585861206, "rewards/margins": 0.589150071144104, "rewards/mix_margin": 0.36011582612991333, "rewards/real_percentage": 5.0, "rewards/rejected": -0.5642445087432861, "step": 120 }, { "epoch": 0.35, "learning_rate": 8.251694886037051e-07, "logits/chosen": -2.033480167388916, "logits/rejected": -2.021919012069702, "logps/chosen": -435.73822021484375, "logps/rejected": -440.695068359375, "loss": 0.5357, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.17932221293449402, "rewards/confidence": -0.3874027132987976, "rewards/confidence_mean_diff": 0.3874027132987976, "rewards/confidence_moving_diff": 0.001680800342001021, "rewards/margins": 0.5359073877334595, "rewards/mix_margin": 0.2893298268318176, "rewards/real_percentage": 6.400000095367432, "rewards/rejected": -0.35658517479896545, "step": 130 }, { "epoch": 0.37, "learning_rate": 7.881735837772273e-07, "logits/chosen": -2.053534984588623, "logits/rejected": -2.0388946533203125, "logps/chosen": -462.8492736816406, "logps/rejected": -420.775634765625, "loss": 0.5225, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.02652936615049839, "rewards/confidence": -0.37694039940834045, "rewards/confidence_mean_diff": 0.37694039940834045, "rewards/confidence_moving_diff": 0.013669964857399464, "rewards/margins": 0.5515505075454712, "rewards/mix_margin": 0.36244601011276245, "rewards/real_percentage": 7.400000095367432, "rewards/rejected": -0.578079879283905, "step": 140 }, { "epoch": 0.4, "learning_rate": 7.486451991348871e-07, "logits/chosen": -1.9849458932876587, "logits/rejected": -1.9663019180297852, "logps/chosen": -419.9559631347656, "logps/rejected": -412.6697692871094, "loss": 0.5085, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.40688568353652954, "rewards/confidence": -0.42670464515686035, "rewards/confidence_mean_diff": 0.42670464515686035, "rewards/confidence_moving_diff": 0.004930758383125067, "rewards/margins": 0.526603102684021, "rewards/mix_margin": 0.3023320734500885, "rewards/real_percentage": 6.199999809265137, "rewards/rejected": -0.9334887266159058, "step": 150 }, { "epoch": 0.43, "learning_rate": 7.069317115422119e-07, "logits/chosen": -1.9053707122802734, "logits/rejected": -1.8682701587677002, "logps/chosen": -479.7124938964844, "logps/rejected": -441.03643798828125, "loss": 0.5223, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.5245426893234253, "rewards/confidence": -0.20899562537670135, "rewards/confidence_mean_diff": 0.20899562537670135, "rewards/confidence_moving_diff": -0.03813329339027405, "rewards/margins": 0.6602107286453247, "rewards/mix_margin": 0.4546773433685303, "rewards/real_percentage": 6.0, "rewards/rejected": -1.1847535371780396, "step": 160 }, { "epoch": 0.46, "learning_rate": 6.633997006280252e-07, "logits/chosen": -1.8678109645843506, "logits/rejected": -1.818471908569336, "logps/chosen": -467.0858459472656, "logps/rejected": -463.423583984375, "loss": 0.4881, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.3335951268672943, "rewards/confidence": -0.19945284724235535, "rewards/confidence_mean_diff": 0.19945284724235535, "rewards/confidence_moving_diff": 0.0005248263478279114, "rewards/margins": 0.659382700920105, "rewards/mix_margin": 0.4060254991054535, "rewards/real_percentage": 6.199999809265137, "rewards/rejected": -0.9929777979850769, "step": 170 }, { "epoch": 0.48, "learning_rate": 6.184317272694866e-07, "logits/chosen": -1.9066665172576904, "logits/rejected": -1.8793361186981201, "logps/chosen": -473.1067810058594, "logps/rejected": -460.03759765625, "loss": 0.5252, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.5049529671669006, "rewards/confidence": -0.2724582552909851, "rewards/confidence_mean_diff": 0.2724582552909851, "rewards/confidence_moving_diff": 0.012605907395482063, "rewards/margins": 0.5843724608421326, "rewards/mix_margin": 0.37383323907852173, "rewards/real_percentage": 5.599999904632568, "rewards/rejected": -1.0893253087997437, "step": 180 }, { "epoch": 0.51, "learning_rate": 5.724229716333479e-07, "logits/chosen": -1.9442212581634521, "logits/rejected": -1.938481330871582, "logps/chosen": -475.08673095703125, "logps/rejected": -469.3644104003906, "loss": 0.504, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.728175938129425, "rewards/confidence": -0.38360652327537537, "rewards/confidence_mean_diff": 0.38360652327537537, "rewards/confidence_moving_diff": 0.03283499926328659, "rewards/margins": 0.6667336225509644, "rewards/mix_margin": 0.3667552173137665, "rewards/real_percentage": 6.400000095367432, "rewards/rejected": -1.3949092626571655, "step": 190 }, { "epoch": 0.54, "learning_rate": 5.257777603184407e-07, "logits/chosen": -2.007739543914795, "logits/rejected": -1.9801855087280273, "logps/chosen": -495.10406494140625, "logps/rejected": -476.86932373046875, "loss": 0.4823, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6766387224197388, "rewards/confidence": -0.3196393549442291, "rewards/confidence_mean_diff": 0.3196393549442291, "rewards/confidence_moving_diff": -0.0256502665579319, "rewards/margins": 0.7569822072982788, "rewards/mix_margin": 0.48049411177635193, "rewards/real_percentage": 5.800000190734863, "rewards/rejected": -1.433620810508728, "step": 200 }, { "epoch": 0.56, "learning_rate": 4.789060131189845e-07, "logits/chosen": -1.912851095199585, "logits/rejected": -1.8825013637542725, "logps/chosen": -509.876220703125, "logps/rejected": -508.9344177246094, "loss": 0.4879, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.584080696105957, "rewards/confidence": -0.3971852660179138, "rewards/confidence_mean_diff": 0.3971852660179138, "rewards/confidence_moving_diff": 0.014336923137307167, "rewards/margins": 0.6768587231636047, "rewards/mix_margin": 0.35885554552078247, "rewards/real_percentage": 6.599999904632568, "rewards/rejected": -1.260939359664917, "step": 210 }, { "epoch": 0.59, "learning_rate": 4.322196406346984e-07, "logits/chosen": -1.889007329940796, "logits/rejected": -1.8577247858047485, "logps/chosen": -529.1480712890625, "logps/rejected": -514.1619873046875, "loss": 0.5224, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.9949491620063782, "rewards/confidence": -0.36616700887680054, "rewards/confidence_mean_diff": 0.36616700887680054, "rewards/confidence_moving_diff": -0.02095675840973854, "rewards/margins": 0.6964308023452759, "rewards/mix_margin": 0.40404224395751953, "rewards/real_percentage": 5.400000095367432, "rewards/rejected": -1.6913799047470093, "step": 220 }, { "epoch": 0.62, "learning_rate": 3.8612892438563874e-07, "logits/chosen": -1.8933786153793335, "logits/rejected": -1.8514988422393799, "logps/chosen": -497.08837890625, "logps/rejected": -488.1795959472656, "loss": 0.5032, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.31867650151252747, "rewards/confidence": -0.3651738166809082, "rewards/confidence_mean_diff": 0.3651738166809082, "rewards/confidence_moving_diff": 0.02553940750658512, "rewards/margins": 0.6790135502815247, "rewards/mix_margin": 0.4231902062892914, "rewards/real_percentage": 6.800000190734863, "rewards/rejected": -0.9976900219917297, "step": 230 }, { "epoch": 0.64, "learning_rate": 3.410389112434499e-07, "logits/chosen": -1.9019248485565186, "logits/rejected": -1.8891223669052124, "logps/chosen": -448.92529296875, "logps/rejected": -434.49310302734375, "loss": 0.5052, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.28666627407073975, "rewards/confidence": -0.39491331577301025, "rewards/confidence_mean_diff": 0.39491331577301025, "rewards/confidence_moving_diff": -0.022241462022066116, "rewards/margins": 0.7103021740913391, "rewards/mix_margin": 0.4018441140651703, "rewards/real_percentage": 5.599999904632568, "rewards/rejected": -0.9969684481620789, "step": 240 }, { "epoch": 0.67, "learning_rate": 2.9734585386489093e-07, "logits/chosen": -1.9463539123535156, "logits/rejected": -1.9137918949127197, "logps/chosen": -465.1922912597656, "logps/rejected": -452.851318359375, "loss": 0.4894, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.2856825292110443, "rewards/confidence": -0.36342883110046387, "rewards/confidence_mean_diff": 0.36342883110046387, "rewards/confidence_moving_diff": -0.0016809016233310103, "rewards/margins": 0.7171444892883301, "rewards/mix_margin": 0.44476184248924255, "rewards/real_percentage": 6.0, "rewards/rejected": -1.0028270483016968, "step": 250 }, { "epoch": 0.7, "learning_rate": 2.5543372840924103e-07, "logits/chosen": -1.8962440490722656, "logits/rejected": -1.8909025192260742, "logps/chosen": -458.53973388671875, "logps/rejected": -445.6280822753906, "loss": 0.5161, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.5167608261108398, "rewards/confidence": -0.35129833221435547, "rewards/confidence_mean_diff": 0.35129833221435547, "rewards/confidence_moving_diff": 0.0023426979314535856, "rewards/margins": 0.6272808313369751, "rewards/mix_margin": 0.40769854187965393, "rewards/real_percentage": 5.800000190734863, "rewards/rejected": -1.144041657447815, "step": 260 }, { "epoch": 0.72, "learning_rate": 2.156708601420053e-07, "logits/chosen": -1.9512195587158203, "logits/rejected": -1.9191925525665283, "logps/chosen": -465.26800537109375, "logps/rejected": -461.1874084472656, "loss": 0.4685, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4135049283504486, "rewards/confidence": -0.3663369417190552, "rewards/confidence_mean_diff": 0.3663369417190552, "rewards/confidence_moving_diff": 0.027239680290222168, "rewards/margins": 0.7379857301712036, "rewards/mix_margin": 0.429837167263031, "rewards/real_percentage": 7.0, "rewards/rejected": -1.1514906883239746, "step": 270 }, { "epoch": 0.75, "learning_rate": 1.7840668657923836e-07, "logits/chosen": -1.874678373336792, "logits/rejected": -1.8355417251586914, "logps/chosen": -480.41778564453125, "logps/rejected": -482.2197265625, "loss": 0.5016, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.5271696448326111, "rewards/confidence": -0.48199835419654846, "rewards/confidence_mean_diff": 0.48199835419654846, "rewards/confidence_moving_diff": 0.011687606573104858, "rewards/margins": 0.7221012711524963, "rewards/mix_margin": 0.4206576347351074, "rewards/real_percentage": 6.400000095367432, "rewards/rejected": -1.2492707967758179, "step": 280 }, { "epoch": 0.78, "learning_rate": 1.4396868661808776e-07, "logits/chosen": -1.8965253829956055, "logits/rejected": -1.8730520009994507, "logps/chosen": -481.58349609375, "logps/rejected": -476.36224365234375, "loss": 0.5154, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.45735082030296326, "rewards/confidence": -0.503350555896759, "rewards/confidence_mean_diff": 0.503350555896759, "rewards/confidence_moving_diff": -0.010494095273315907, "rewards/margins": 0.6847480535507202, "rewards/mix_margin": 0.40963077545166016, "rewards/real_percentage": 6.400000095367432, "rewards/rejected": -1.1420990228652954, "step": 290 }, { "epoch": 0.8, "learning_rate": 1.1265950264047169e-07, "logits/chosen": -1.8776991367340088, "logits/rejected": -1.838903784751892, "logps/chosen": -494.5914001464844, "logps/rejected": -468.793212890625, "loss": 0.4821, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.5077553987503052, "rewards/confidence": -0.37904122471809387, "rewards/confidence_mean_diff": 0.37904122471809387, "rewards/confidence_moving_diff": -0.0364709310233593, "rewards/margins": 0.8465288281440735, "rewards/mix_margin": 0.5325880646705627, "rewards/real_percentage": 5.199999809265137, "rewards/rejected": -1.3542842864990234, "step": 300 }, { "epoch": 0.83, "learning_rate": 8.475428088094516e-08, "logits/chosen": -1.8273499011993408, "logits/rejected": -1.7930196523666382, "logps/chosen": -467.7921447753906, "logps/rejected": -441.1510314941406, "loss": 0.4678, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.3718186020851135, "rewards/confidence": -0.2409948855638504, "rewards/confidence_mean_diff": 0.2409948855638504, "rewards/confidence_moving_diff": 0.0065197511576116085, "rewards/margins": 0.8056579828262329, "rewards/mix_margin": 0.502047061920166, "rewards/real_percentage": 6.199999809265137, "rewards/rejected": -1.1774766445159912, "step": 310 }, { "epoch": 0.86, "learning_rate": 6.049825343169652e-08, "logits/chosen": -1.8773448467254639, "logits/rejected": -1.8380086421966553, "logps/chosen": -480.5743103027344, "logps/rejected": -496.74774169921875, "loss": 0.5112, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8989919424057007, "rewards/confidence": -0.4660707116127014, "rewards/confidence_mean_diff": 0.4660707116127014, "rewards/confidence_moving_diff": 0.02156786061823368, "rewards/margins": 0.5795319080352783, "rewards/mix_margin": 0.32589632272720337, "rewards/real_percentage": 6.400000095367432, "rewards/rejected": -1.478523850440979, "step": 320 }, { "epoch": 0.88, "learning_rate": 4.010458313410459e-08, "logits/chosen": -1.8770291805267334, "logits/rejected": -1.8352234363555908, "logps/chosen": -449.00286865234375, "logps/rejected": -464.95880126953125, "loss": 0.4886, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.32519200444221497, "rewards/confidence": -0.37203681468963623, "rewards/confidence_mean_diff": 0.37203681468963623, "rewards/confidence_moving_diff": -0.037306807935237885, "rewards/margins": 0.7662613391876221, "rewards/mix_margin": 0.4235256612300873, "rewards/real_percentage": 6.0, "rewards/rejected": -1.0914533138275146, "step": 330 }, { "epoch": 0.91, "learning_rate": 2.3752490296023388e-08, "logits/chosen": -1.890636682510376, "logits/rejected": -1.8676478862762451, "logps/chosen": -493.1441955566406, "logps/rejected": -471.08038330078125, "loss": 0.4736, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5223912000656128, "rewards/confidence": -0.3331066071987152, "rewards/confidence_mean_diff": 0.3331066071987152, "rewards/confidence_moving_diff": 0.037895917892456055, "rewards/margins": 0.8462656736373901, "rewards/mix_margin": 0.5043337941169739, "rewards/real_percentage": 7.400000095367432, "rewards/rejected": -1.368657112121582, "step": 340 }, { "epoch": 0.94, "learning_rate": 1.158567769727331e-08, "logits/chosen": -1.8919689655303955, "logits/rejected": -1.8817275762557983, "logps/chosen": -454.0144958496094, "logps/rejected": -446.63787841796875, "loss": 0.4802, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5384470224380493, "rewards/confidence": -0.4307991862297058, "rewards/confidence_mean_diff": 0.4307991862297058, "rewards/confidence_moving_diff": -0.021659499034285545, "rewards/margins": 0.6522639989852905, "rewards/mix_margin": 0.39593905210494995, "rewards/real_percentage": 5.0, "rewards/rejected": -1.1907110214233398, "step": 350 }, { "epoch": 0.96, "learning_rate": 3.7110677244445166e-09, "logits/chosen": -1.825492262840271, "logits/rejected": -1.782222032546997, "logps/chosen": -459.6104431152344, "logps/rejected": -475.7046813964844, "loss": 0.4753, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5981888175010681, "rewards/confidence": -0.25743329524993896, "rewards/confidence_mean_diff": 0.25743329524993896, "rewards/confidence_moving_diff": 0.007767287082970142, "rewards/margins": 0.8224313855171204, "rewards/mix_margin": 0.4543371796607971, "rewards/real_percentage": 6.599999904632568, "rewards/rejected": -1.4206202030181885, "step": 360 }, { "epoch": 0.99, "learning_rate": 1.9786273311928058e-10, "logits/chosen": -1.8085676431655884, "logits/rejected": -1.8041887283325195, "logps/chosen": -472.97760009765625, "logps/rejected": -481.787109375, "loss": 0.4946, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.23779034614562988, "rewards/confidence": -0.37813520431518555, "rewards/confidence_mean_diff": 0.37813520431518555, "rewards/confidence_moving_diff": -0.012357574887573719, "rewards/margins": 0.7020485401153564, "rewards/mix_margin": 0.39195528626441956, "rewards/real_percentage": 5.800000190734863, "rewards/rejected": -0.9398388862609863, "step": 370 }, { "epoch": 1.0, "step": 373, "total_flos": 0.0, "train_loss": 0.5273514372413983, "train_runtime": 28310.8993, "train_samples_per_second": 0.844, "train_steps_per_second": 0.013 } ], "logging_steps": 10, "max_steps": 373, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }