{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 1563, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 269.7842833735451, "learning_rate": 3.1847133757961784e-09, "logits/chosen": -2.8781068325042725, "logits/rejected": -3.18619966506958, "logps/chosen": -132.49632263183594, "logps/rejected": -97.49405670166016, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "grad_norm": 225.840088756475, "learning_rate": 3.184713375796178e-08, "logits/chosen": -2.7182540893554688, "logits/rejected": -3.014042854309082, "logps/chosen": -114.23727416992188, "logps/rejected": -89.29263305664062, "loss": 0.6761, "rewards/accuracies": 0.7361111044883728, "rewards/chosen": 0.005112478509545326, "rewards/margins": 0.04120528697967529, "rewards/rejected": -0.03609280660748482, "step": 10 }, { "epoch": 0.01, "grad_norm": 139.5446217816026, "learning_rate": 6.369426751592356e-08, "logits/chosen": -2.7113776206970215, "logits/rejected": -2.9822986125946045, "logps/chosen": -120.40122985839844, "logps/rejected": -87.35284423828125, "loss": 0.4887, "rewards/accuracies": 1.0, "rewards/chosen": 0.11869080364704132, "rewards/margins": 0.47487640380859375, "rewards/rejected": -0.3561856150627136, "step": 20 }, { "epoch": 0.02, "grad_norm": 52.502479723007994, "learning_rate": 9.554140127388536e-08, "logits/chosen": -2.7034497261047363, "logits/rejected": -2.912091016769409, "logps/chosen": -112.85099792480469, "logps/rejected": -92.99781036376953, "loss": 0.239, "rewards/accuracies": 1.0, "rewards/chosen": 0.27735695242881775, "rewards/margins": 1.6260454654693604, "rewards/rejected": -1.3486883640289307, "step": 30 }, { "epoch": 0.03, "grad_norm": 24.342254507057344, "learning_rate": 1.2738853503184713e-07, "logits/chosen": -2.6779820919036865, "logits/rejected": -2.9538276195526123, "logps/chosen": -114.9598617553711, "logps/rejected": -109.4762191772461, "loss": 0.1007, "rewards/accuracies": 1.0, "rewards/chosen": 0.3945399522781372, "rewards/margins": 2.8680927753448486, "rewards/rejected": -2.47355318069458, "step": 40 }, { "epoch": 0.03, "grad_norm": 16.84971880044338, "learning_rate": 1.592356687898089e-07, "logits/chosen": -2.63224720954895, "logits/rejected": -2.869575023651123, "logps/chosen": -120.35627746582031, "logps/rejected": -114.70686340332031, "loss": 0.0746, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.559775173664093, "rewards/margins": 3.929781436920166, "rewards/rejected": -3.3700060844421387, "step": 50 }, { "epoch": 0.04, "grad_norm": 22.345296735538415, "learning_rate": 1.9108280254777072e-07, "logits/chosen": -2.649770975112915, "logits/rejected": -2.9390110969543457, "logps/chosen": -118.53907775878906, "logps/rejected": -128.2223358154297, "loss": 0.049, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.25389423966407776, "rewards/margins": 4.323164939880371, "rewards/rejected": -4.069270610809326, "step": 60 }, { "epoch": 0.04, "grad_norm": 20.651969023179603, "learning_rate": 2.2292993630573247e-07, "logits/chosen": -2.620734453201294, "logits/rejected": -2.904691696166992, "logps/chosen": -118.86378479003906, "logps/rejected": -142.30442810058594, "loss": 0.0448, "rewards/accuracies": 1.0, "rewards/chosen": 0.2997106611728668, "rewards/margins": 5.346270561218262, "rewards/rejected": -5.046560287475586, "step": 70 }, { "epoch": 0.05, "grad_norm": 18.756699820611576, "learning_rate": 2.5477707006369425e-07, "logits/chosen": -2.644515037536621, "logits/rejected": -2.8679375648498535, "logps/chosen": -141.45867919921875, "logps/rejected": -139.5041961669922, "loss": 0.0338, "rewards/accuracies": 1.0, "rewards/chosen": 0.016187960281968117, "rewards/margins": 5.637503147125244, "rewards/rejected": -5.6213154792785645, "step": 80 }, { "epoch": 0.06, "grad_norm": 17.304042934264913, "learning_rate": 2.86624203821656e-07, "logits/chosen": -2.6533827781677246, "logits/rejected": -2.8769655227661133, "logps/chosen": -131.0616455078125, "logps/rejected": -137.1123046875, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": 0.2292163372039795, "rewards/margins": 5.93363094329834, "rewards/rejected": -5.704414367675781, "step": 90 }, { "epoch": 0.06, "grad_norm": 17.418215277278303, "learning_rate": 3.184713375796178e-07, "logits/chosen": -2.5998005867004395, "logits/rejected": -2.8436856269836426, "logps/chosen": -118.36405181884766, "logps/rejected": -150.16636657714844, "loss": 0.0283, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.1456707864999771, "rewards/margins": 6.479343414306641, "rewards/rejected": -6.625013828277588, "step": 100 }, { "epoch": 0.07, "grad_norm": 12.48057297022973, "learning_rate": 3.5031847133757957e-07, "logits/chosen": -2.5239391326904297, "logits/rejected": -2.7482004165649414, "logps/chosen": -124.32521057128906, "logps/rejected": -143.4300537109375, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": 0.06046473979949951, "rewards/margins": 6.917716026306152, "rewards/rejected": -6.8572516441345215, "step": 110 }, { "epoch": 0.08, "grad_norm": 11.25165220079385, "learning_rate": 3.8216560509554143e-07, "logits/chosen": -2.6064770221710205, "logits/rejected": -2.8154265880584717, "logps/chosen": -127.79212951660156, "logps/rejected": -166.4564971923828, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": 0.18089526891708374, "rewards/margins": 8.502016067504883, "rewards/rejected": -8.321121215820312, "step": 120 }, { "epoch": 0.08, "grad_norm": 9.982779910930713, "learning_rate": 4.140127388535032e-07, "logits/chosen": -2.629976749420166, "logits/rejected": -2.829049587249756, "logps/chosen": -119.15159606933594, "logps/rejected": -171.82168579101562, "loss": 0.0242, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.10970219224691391, "rewards/margins": 8.503213882446289, "rewards/rejected": -8.393510818481445, "step": 130 }, { "epoch": 0.09, "grad_norm": 1.068441186553867, "learning_rate": 4.4585987261146494e-07, "logits/chosen": -2.4788918495178223, "logits/rejected": -2.7960944175720215, "logps/chosen": -122.7088623046875, "logps/rejected": -180.638427734375, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -0.3950616419315338, "rewards/margins": 8.885313987731934, "rewards/rejected": -9.280375480651855, "step": 140 }, { "epoch": 0.1, "grad_norm": 1.6980178715972738, "learning_rate": 4.777070063694267e-07, "logits/chosen": -2.533334732055664, "logits/rejected": -2.7179126739501953, "logps/chosen": -114.43672943115234, "logps/rejected": -182.99908447265625, "loss": 0.0115, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3883129954338074, "rewards/margins": 10.101364135742188, "rewards/rejected": -9.713052749633789, "step": 150 }, { "epoch": 0.1, "grad_norm": 5.589702998473681, "learning_rate": 4.989331436699858e-07, "logits/chosen": -2.5194196701049805, "logits/rejected": -2.7274131774902344, "logps/chosen": -146.7877655029297, "logps/rejected": -185.3484649658203, "loss": 0.0171, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.5904136896133423, "rewards/margins": 8.3977632522583, "rewards/rejected": -9.988175392150879, "step": 160 }, { "epoch": 0.11, "grad_norm": 1.0748044802435313, "learning_rate": 4.953769559032717e-07, "logits/chosen": -2.478966474533081, "logits/rejected": -2.706465005874634, "logps/chosen": -123.1487045288086, "logps/rejected": -187.28634643554688, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -0.411639928817749, "rewards/margins": 9.865466117858887, "rewards/rejected": -10.277105331420898, "step": 170 }, { "epoch": 0.12, "grad_norm": 1.303203442204381, "learning_rate": 4.918207681365576e-07, "logits/chosen": -2.5271456241607666, "logits/rejected": -2.7161953449249268, "logps/chosen": -139.56097412109375, "logps/rejected": -192.1598358154297, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -1.4142029285430908, "rewards/margins": 9.24979019165039, "rewards/rejected": -10.663992881774902, "step": 180 }, { "epoch": 0.12, "grad_norm": 0.8568137300390842, "learning_rate": 4.882645803698435e-07, "logits/chosen": -2.4333901405334473, "logits/rejected": -2.620060682296753, "logps/chosen": -140.07345581054688, "logps/rejected": -180.193603515625, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -1.630637526512146, "rewards/margins": 8.6264066696167, "rewards/rejected": -10.257043838500977, "step": 190 }, { "epoch": 0.13, "grad_norm": 13.412950576813955, "learning_rate": 4.847083926031294e-07, "logits/chosen": -2.461732864379883, "logits/rejected": -2.675943374633789, "logps/chosen": -137.39089965820312, "logps/rejected": -195.60218811035156, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -0.9913924336433411, "rewards/margins": 10.286381721496582, "rewards/rejected": -11.277772903442383, "step": 200 }, { "epoch": 0.13, "grad_norm": 21.7222990738424, "learning_rate": 4.811522048364154e-07, "logits/chosen": -2.5159268379211426, "logits/rejected": -2.7606940269470215, "logps/chosen": -162.57406616210938, "logps/rejected": -213.39517211914062, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.11065955460071564, "rewards/margins": 12.143023490905762, "rewards/rejected": -12.253683090209961, "step": 210 }, { "epoch": 0.14, "grad_norm": 0.046180032897506273, "learning_rate": 4.775960170697012e-07, "logits/chosen": -2.4487884044647217, "logits/rejected": -2.664316415786743, "logps/chosen": -129.77688598632812, "logps/rejected": -209.32772827148438, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.11752267926931381, "rewards/margins": 12.149419784545898, "rewards/rejected": -12.266942977905273, "step": 220 }, { "epoch": 0.15, "grad_norm": 1.4824700240745254, "learning_rate": 4.7403982930298717e-07, "logits/chosen": -2.4436874389648438, "logits/rejected": -2.694324016571045, "logps/chosen": -135.8723602294922, "logps/rejected": -216.75387573242188, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -1.2215015888214111, "rewards/margins": 11.901634216308594, "rewards/rejected": -13.123135566711426, "step": 230 }, { "epoch": 0.15, "grad_norm": 17.463403344491365, "learning_rate": 4.7048364153627306e-07, "logits/chosen": -2.4271080493927, "logits/rejected": -2.691582441329956, "logps/chosen": -156.45681762695312, "logps/rejected": -219.70529174804688, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -2.2694718837738037, "rewards/margins": 10.988931655883789, "rewards/rejected": -13.258402824401855, "step": 240 }, { "epoch": 0.16, "grad_norm": 2.8943147376404292, "learning_rate": 4.66927453769559e-07, "logits/chosen": -2.4071226119995117, "logits/rejected": -2.6235671043395996, "logps/chosen": -136.54635620117188, "logps/rejected": -195.32444763183594, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.8205227851867676, "rewards/margins": 10.77901840209961, "rewards/rejected": -11.599540710449219, "step": 250 }, { "epoch": 0.17, "grad_norm": 1.3617461125493031, "learning_rate": 4.633712660028449e-07, "logits/chosen": -2.439023971557617, "logits/rejected": -2.676661729812622, "logps/chosen": -132.08621215820312, "logps/rejected": -209.95077514648438, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.34371912479400635, "rewards/margins": 12.549942970275879, "rewards/rejected": -12.893661499023438, "step": 260 }, { "epoch": 0.17, "grad_norm": 1.0505376091417922, "learning_rate": 4.5981507823613085e-07, "logits/chosen": -2.4203948974609375, "logits/rejected": -2.6439831256866455, "logps/chosen": -143.63368225097656, "logps/rejected": -215.7870330810547, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -1.3733088970184326, "rewards/margins": 12.261556625366211, "rewards/rejected": -13.634866714477539, "step": 270 }, { "epoch": 0.18, "grad_norm": 0.30085985079958266, "learning_rate": 4.562588904694168e-07, "logits/chosen": -2.4349520206451416, "logits/rejected": -2.671058177947998, "logps/chosen": -138.3717498779297, "logps/rejected": -223.06948852539062, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -0.9617056846618652, "rewards/margins": 13.077425003051758, "rewards/rejected": -14.039129257202148, "step": 280 }, { "epoch": 0.19, "grad_norm": 1.459881483419176, "learning_rate": 4.5270270270270264e-07, "logits/chosen": -2.5198299884796143, "logits/rejected": -2.6836562156677246, "logps/chosen": -146.54750061035156, "logps/rejected": -236.26171875, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -1.627685308456421, "rewards/margins": 13.237665176391602, "rewards/rejected": -14.865351676940918, "step": 290 }, { "epoch": 0.19, "grad_norm": 0.10587938886796702, "learning_rate": 4.491465149359886e-07, "logits/chosen": -2.531820774078369, "logits/rejected": -2.704437017440796, "logps/chosen": -152.57876586914062, "logps/rejected": -237.67868041992188, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -2.0694496631622314, "rewards/margins": 12.964448928833008, "rewards/rejected": -15.033899307250977, "step": 300 }, { "epoch": 0.2, "grad_norm": 3.1066767259351065, "learning_rate": 4.4559032716927454e-07, "logits/chosen": -2.4913744926452637, "logits/rejected": -2.7113704681396484, "logps/chosen": -150.282958984375, "logps/rejected": -228.7188262939453, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -2.2832634449005127, "rewards/margins": 12.166834831237793, "rewards/rejected": -14.450098991394043, "step": 310 }, { "epoch": 0.2, "grad_norm": 0.28682507599353846, "learning_rate": 4.420341394025605e-07, "logits/chosen": -2.3985180854797363, "logits/rejected": -2.6870627403259277, "logps/chosen": -160.65838623046875, "logps/rejected": -225.05581665039062, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -3.4125454425811768, "rewards/margins": 10.527777671813965, "rewards/rejected": -13.940322875976562, "step": 320 }, { "epoch": 0.21, "grad_norm": 5.225802985518444, "learning_rate": 4.384779516358463e-07, "logits/chosen": -2.39935040473938, "logits/rejected": -2.6781036853790283, "logps/chosen": -170.07534790039062, "logps/rejected": -235.141357421875, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -3.992198944091797, "rewards/margins": 11.101171493530273, "rewards/rejected": -15.09337043762207, "step": 330 }, { "epoch": 0.22, "grad_norm": 3.405010658068852, "learning_rate": 4.3492176386913227e-07, "logits/chosen": -2.3850109577178955, "logits/rejected": -2.697495937347412, "logps/chosen": -162.45657348632812, "logps/rejected": -248.86428833007812, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -3.1875181198120117, "rewards/margins": 13.499272346496582, "rewards/rejected": -16.686786651611328, "step": 340 }, { "epoch": 0.22, "grad_norm": 1.533659428300562, "learning_rate": 4.313655761024182e-07, "logits/chosen": -2.2804832458496094, "logits/rejected": -2.6101315021514893, "logps/chosen": -167.80911254882812, "logps/rejected": -265.25482177734375, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -5.248920440673828, "rewards/margins": 12.490852355957031, "rewards/rejected": -17.73977279663086, "step": 350 }, { "epoch": 0.23, "grad_norm": 1.8069977925253333, "learning_rate": 4.278093883357041e-07, "logits/chosen": -2.348785400390625, "logits/rejected": -2.618760824203491, "logps/chosen": -191.90760803222656, "logps/rejected": -262.60369873046875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -4.235640048980713, "rewards/margins": 13.658535957336426, "rewards/rejected": -17.894176483154297, "step": 360 }, { "epoch": 0.24, "grad_norm": 1.1519087510764392, "learning_rate": 4.2425320056899e-07, "logits/chosen": -2.3367717266082764, "logits/rejected": -2.6459882259368896, "logps/chosen": -157.1217041015625, "logps/rejected": -270.79559326171875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -3.419424057006836, "rewards/margins": 14.605476379394531, "rewards/rejected": -18.024898529052734, "step": 370 }, { "epoch": 0.24, "grad_norm": 0.12503486691431573, "learning_rate": 4.2069701280227595e-07, "logits/chosen": -2.2905983924865723, "logits/rejected": -2.6188652515411377, "logps/chosen": -161.6908416748047, "logps/rejected": -277.34710693359375, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -4.108786582946777, "rewards/margins": 14.594747543334961, "rewards/rejected": -18.703533172607422, "step": 380 }, { "epoch": 0.25, "grad_norm": 1.5481281251493877, "learning_rate": 4.1714082503556185e-07, "logits/chosen": -2.266719341278076, "logits/rejected": -2.5488734245300293, "logps/chosen": -176.82908630371094, "logps/rejected": -274.1158447265625, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -6.371943950653076, "rewards/margins": 12.869401931762695, "rewards/rejected": -19.24134635925293, "step": 390 }, { "epoch": 0.26, "grad_norm": 0.5072108881730257, "learning_rate": 4.135846372688478e-07, "logits/chosen": -2.29506254196167, "logits/rejected": -2.585667133331299, "logps/chosen": -166.04281616210938, "logps/rejected": -287.45941162109375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.006166934967041, "rewards/margins": 16.814212799072266, "rewards/rejected": -19.820377349853516, "step": 400 }, { "epoch": 0.26, "grad_norm": 14.57194509259927, "learning_rate": 4.100284495021337e-07, "logits/chosen": -2.1163532733917236, "logits/rejected": -2.5131845474243164, "logps/chosen": -142.27735900878906, "logps/rejected": -264.2358703613281, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -2.5816712379455566, "rewards/margins": 15.512285232543945, "rewards/rejected": -18.093957901000977, "step": 410 }, { "epoch": 0.27, "grad_norm": 14.91037080313429, "learning_rate": 4.064722617354196e-07, "logits/chosen": -2.1361889839172363, "logits/rejected": -2.529857635498047, "logps/chosen": -167.25698852539062, "logps/rejected": -278.71826171875, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -2.816511631011963, "rewards/margins": 16.648120880126953, "rewards/rejected": -19.46463394165039, "step": 420 }, { "epoch": 0.28, "grad_norm": 0.5762867478889705, "learning_rate": 4.0291607396870553e-07, "logits/chosen": -2.1066999435424805, "logits/rejected": -2.456259250640869, "logps/chosen": -163.7415008544922, "logps/rejected": -305.5865783691406, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -5.017763137817383, "rewards/margins": 17.317230224609375, "rewards/rejected": -22.33499526977539, "step": 430 }, { "epoch": 0.28, "grad_norm": 3.706715513864746, "learning_rate": 3.993598862019915e-07, "logits/chosen": -2.2640228271484375, "logits/rejected": -2.544032096862793, "logps/chosen": -191.6356964111328, "logps/rejected": -306.35443115234375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -5.6402459144592285, "rewards/margins": 16.29918670654297, "rewards/rejected": -21.939434051513672, "step": 440 }, { "epoch": 0.29, "grad_norm": 0.4884505823214583, "learning_rate": 3.9580369843527737e-07, "logits/chosen": -2.258357048034668, "logits/rejected": -2.5526187419891357, "logps/chosen": -217.8997039794922, "logps/rejected": -372.7241516113281, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -9.196186065673828, "rewards/margins": 19.7209529876709, "rewards/rejected": -28.917139053344727, "step": 450 }, { "epoch": 0.29, "grad_norm": 0.045591560325134, "learning_rate": 3.9224751066856327e-07, "logits/chosen": -1.883296251296997, "logits/rejected": -2.3051819801330566, "logps/chosen": -215.2701416015625, "logps/rejected": -373.6488342285156, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -8.807767868041992, "rewards/margins": 20.2095947265625, "rewards/rejected": -29.017364501953125, "step": 460 }, { "epoch": 0.3, "grad_norm": 0.12470490893983527, "learning_rate": 3.886913229018492e-07, "logits/chosen": -1.5328924655914307, "logits/rejected": -2.0810158252716064, "logps/chosen": -184.50637817382812, "logps/rejected": -393.2000732421875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -5.6133012771606445, "rewards/margins": 24.581329345703125, "rewards/rejected": -30.194625854492188, "step": 470 }, { "epoch": 0.31, "grad_norm": 3.452822600566543, "learning_rate": 3.851351351351351e-07, "logits/chosen": -1.7886974811553955, "logits/rejected": -2.2504382133483887, "logps/chosen": -184.6663360595703, "logps/rejected": -360.86578369140625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -4.6061811447143555, "rewards/margins": 22.705333709716797, "rewards/rejected": -27.311513900756836, "step": 480 }, { "epoch": 0.31, "grad_norm": 0.08494215992540369, "learning_rate": 3.8157894736842105e-07, "logits/chosen": -1.8703672885894775, "logits/rejected": -2.233332872390747, "logps/chosen": -182.80052185058594, "logps/rejected": -391.44683837890625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.9865665435791016, "rewards/margins": 26.53458023071289, "rewards/rejected": -30.521148681640625, "step": 490 }, { "epoch": 0.32, "grad_norm": 0.4084224053393073, "learning_rate": 3.7802275960170695e-07, "logits/chosen": -1.906399130821228, "logits/rejected": -2.272561550140381, "logps/chosen": -149.4335479736328, "logps/rejected": -365.5450439453125, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -2.793308734893799, "rewards/margins": 25.208282470703125, "rewards/rejected": -28.0015926361084, "step": 500 }, { "epoch": 0.33, "grad_norm": 0.8910605143175322, "learning_rate": 3.7446657183499284e-07, "logits/chosen": -1.7485355138778687, "logits/rejected": -2.077558755874634, "logps/chosen": -166.2713623046875, "logps/rejected": -369.4629821777344, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -3.770512342453003, "rewards/margins": 24.904991149902344, "rewards/rejected": -28.67550277709961, "step": 510 }, { "epoch": 0.33, "grad_norm": 1.7685911190779986, "learning_rate": 3.709103840682788e-07, "logits/chosen": -1.7549102306365967, "logits/rejected": -2.1138863563537598, "logps/chosen": -170.63426208496094, "logps/rejected": -404.83123779296875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.2706170082092285, "rewards/margins": 27.674823760986328, "rewards/rejected": -31.9454402923584, "step": 520 }, { "epoch": 0.34, "grad_norm": 4.243301477338939, "learning_rate": 3.6735419630156474e-07, "logits/chosen": -1.8727014064788818, "logits/rejected": -2.1714062690734863, "logps/chosen": -178.07815551757812, "logps/rejected": -372.9120788574219, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.480609893798828, "rewards/margins": 25.141685485839844, "rewards/rejected": -28.622295379638672, "step": 530 }, { "epoch": 0.35, "grad_norm": 1.7633763298373188, "learning_rate": 3.637980085348506e-07, "logits/chosen": -2.259653091430664, "logits/rejected": -2.5058536529541016, "logps/chosen": -166.86276245117188, "logps/rejected": -310.83026123046875, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -3.225189685821533, "rewards/margins": 19.310260772705078, "rewards/rejected": -22.535449981689453, "step": 540 }, { "epoch": 0.35, "grad_norm": 0.5027205143527536, "learning_rate": 3.602418207681365e-07, "logits/chosen": -2.3244869709014893, "logits/rejected": -2.5319793224334717, "logps/chosen": -170.75979614257812, "logps/rejected": -300.6156311035156, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -4.17783260345459, "rewards/margins": 17.45343017578125, "rewards/rejected": -21.631261825561523, "step": 550 }, { "epoch": 0.36, "grad_norm": 15.947782616592031, "learning_rate": 3.5668563300142247e-07, "logits/chosen": -2.2984859943389893, "logits/rejected": -2.4519248008728027, "logps/chosen": -189.85989379882812, "logps/rejected": -333.6773681640625, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -5.824889183044434, "rewards/margins": 19.22607421875, "rewards/rejected": -25.050960540771484, "step": 560 }, { "epoch": 0.36, "grad_norm": 6.26719943945485, "learning_rate": 3.5312944523470837e-07, "logits/chosen": -2.2542061805725098, "logits/rejected": -2.432546615600586, "logps/chosen": -136.48843383789062, "logps/rejected": -301.9817810058594, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -2.9576592445373535, "rewards/margins": 19.037410736083984, "rewards/rejected": -21.995067596435547, "step": 570 }, { "epoch": 0.37, "grad_norm": 0.11415543320354572, "learning_rate": 3.495732574679943e-07, "logits/chosen": -2.215407609939575, "logits/rejected": -2.4386544227600098, "logps/chosen": -152.66677856445312, "logps/rejected": -342.9513244628906, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.211902618408203, "rewards/margins": 23.44979476928711, "rewards/rejected": -25.661701202392578, "step": 580 }, { "epoch": 0.38, "grad_norm": 0.22610570945942704, "learning_rate": 3.460170697012802e-07, "logits/chosen": -2.1291861534118652, "logits/rejected": -2.409412384033203, "logps/chosen": -160.59091186523438, "logps/rejected": -361.2604064941406, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.499822616577148, "rewards/margins": 22.70589828491211, "rewards/rejected": -27.205724716186523, "step": 590 }, { "epoch": 0.38, "grad_norm": 0.7406158964206918, "learning_rate": 3.424608819345661e-07, "logits/chosen": -2.1191792488098145, "logits/rejected": -2.3479321002960205, "logps/chosen": -197.30201721191406, "logps/rejected": -371.0838317871094, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -6.3380818367004395, "rewards/margins": 21.930988311767578, "rewards/rejected": -28.269067764282227, "step": 600 }, { "epoch": 0.39, "grad_norm": 0.32062082605886466, "learning_rate": 3.3890469416785205e-07, "logits/chosen": -2.0577409267425537, "logits/rejected": -2.3198914527893066, "logps/chosen": -195.90158081054688, "logps/rejected": -384.8281555175781, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -5.974644660949707, "rewards/margins": 23.86060905456543, "rewards/rejected": -29.835254669189453, "step": 610 }, { "epoch": 0.4, "grad_norm": 1.6177298432916143, "learning_rate": 3.35348506401138e-07, "logits/chosen": -1.9579404592514038, "logits/rejected": -2.269507646560669, "logps/chosen": -160.80618286132812, "logps/rejected": -329.6241455078125, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -4.555537223815918, "rewards/margins": 20.43609619140625, "rewards/rejected": -24.99163246154785, "step": 620 }, { "epoch": 0.4, "grad_norm": 0.187358584161895, "learning_rate": 3.3179231863442384e-07, "logits/chosen": -2.113788366317749, "logits/rejected": -2.3290324211120605, "logps/chosen": -185.5271759033203, "logps/rejected": -375.39453125, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -4.738511085510254, "rewards/margins": 24.537811279296875, "rewards/rejected": -29.276325225830078, "step": 630 }, { "epoch": 0.41, "grad_norm": 0.010996930190111638, "learning_rate": 3.282361308677098e-07, "logits/chosen": -2.0769455432891846, "logits/rejected": -2.347015142440796, "logps/chosen": -175.67599487304688, "logps/rejected": -431.5035705566406, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -4.6663079261779785, "rewards/margins": 29.519245147705078, "rewards/rejected": -34.18555450439453, "step": 640 }, { "epoch": 0.42, "grad_norm": 0.21277301345388175, "learning_rate": 3.2467994310099573e-07, "logits/chosen": -2.198702335357666, "logits/rejected": -2.4985084533691406, "logps/chosen": -149.29171752929688, "logps/rejected": -294.216552734375, "loss": 0.0169, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.3964667320251465, "rewards/margins": 17.87206268310547, "rewards/rejected": -21.26852798461914, "step": 650 }, { "epoch": 0.42, "grad_norm": 0.02451780480832141, "learning_rate": 3.211237553342817e-07, "logits/chosen": -2.237781047821045, "logits/rejected": -2.509354591369629, "logps/chosen": -160.78524780273438, "logps/rejected": -333.3505554199219, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.4144351482391357, "rewards/margins": 20.91831398010254, "rewards/rejected": -24.332752227783203, "step": 660 }, { "epoch": 0.43, "grad_norm": 0.5799365179235819, "learning_rate": 3.175675675675675e-07, "logits/chosen": -2.2056496143341064, "logits/rejected": -2.4211344718933105, "logps/chosen": -162.71096801757812, "logps/rejected": -334.28765869140625, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -3.9299492835998535, "rewards/margins": 21.33687973022461, "rewards/rejected": -25.266826629638672, "step": 670 }, { "epoch": 0.44, "grad_norm": 0.1914899774779342, "learning_rate": 3.1401137980085347e-07, "logits/chosen": -2.2409019470214844, "logits/rejected": -2.518660068511963, "logps/chosen": -164.3127899169922, "logps/rejected": -349.6018371582031, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -3.266920566558838, "rewards/margins": 22.99898338317871, "rewards/rejected": -26.265905380249023, "step": 680 }, { "epoch": 0.44, "grad_norm": 1.3949951852806233, "learning_rate": 3.104551920341394e-07, "logits/chosen": -2.3294060230255127, "logits/rejected": -2.461803674697876, "logps/chosen": -169.67840576171875, "logps/rejected": -362.62786865234375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.1314125061035156, "rewards/margins": 23.983592987060547, "rewards/rejected": -27.115009307861328, "step": 690 }, { "epoch": 0.45, "grad_norm": 0.5539714398553305, "learning_rate": 3.068990042674253e-07, "logits/chosen": -2.197174072265625, "logits/rejected": -2.44199275970459, "logps/chosen": -160.1177520751953, "logps/rejected": -375.48504638671875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.5987319946289062, "rewards/margins": 25.49329948425293, "rewards/rejected": -29.0920352935791, "step": 700 }, { "epoch": 0.45, "grad_norm": 0.06116625860123878, "learning_rate": 3.033428165007112e-07, "logits/chosen": -2.296790361404419, "logits/rejected": -2.4758524894714355, "logps/chosen": -194.8965301513672, "logps/rejected": -405.22930908203125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.60103178024292, "rewards/margins": 27.618661880493164, "rewards/rejected": -31.219696044921875, "step": 710 }, { "epoch": 0.46, "grad_norm": 0.101166393207417, "learning_rate": 2.9978662873399715e-07, "logits/chosen": -2.209062099456787, "logits/rejected": -2.4515223503112793, "logps/chosen": -161.6198272705078, "logps/rejected": -389.5179138183594, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.203439235687256, "rewards/margins": 26.09065818786621, "rewards/rejected": -30.29409408569336, "step": 720 }, { "epoch": 0.47, "grad_norm": 0.12317613401454625, "learning_rate": 2.9623044096728305e-07, "logits/chosen": -2.3108105659484863, "logits/rejected": -2.4695842266082764, "logps/chosen": -167.56552124023438, "logps/rejected": -386.9646301269531, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -3.244081974029541, "rewards/margins": 26.8917236328125, "rewards/rejected": -30.135807037353516, "step": 730 }, { "epoch": 0.47, "grad_norm": 0.16193920318148997, "learning_rate": 2.92674253200569e-07, "logits/chosen": -2.24593186378479, "logits/rejected": -2.3808069229125977, "logps/chosen": -166.65890502929688, "logps/rejected": -371.7357482910156, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.039949417114258, "rewards/margins": 25.134775161743164, "rewards/rejected": -29.174724578857422, "step": 740 }, { "epoch": 0.48, "grad_norm": 0.24139035561225006, "learning_rate": 2.8911806543385494e-07, "logits/chosen": -2.2541756629943848, "logits/rejected": -2.4730725288391113, "logps/chosen": -153.8241729736328, "logps/rejected": -393.72454833984375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.2770607471466064, "rewards/margins": 27.69826889038086, "rewards/rejected": -30.975330352783203, "step": 750 }, { "epoch": 0.49, "grad_norm": 0.2273306730321036, "learning_rate": 2.855618776671408e-07, "logits/chosen": -2.2953999042510986, "logits/rejected": -2.4938910007476807, "logps/chosen": -156.7337646484375, "logps/rejected": -422.956298828125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.969212055206299, "rewards/margins": 31.034259796142578, "rewards/rejected": -34.00347137451172, "step": 760 }, { "epoch": 0.49, "grad_norm": 0.14372879524614457, "learning_rate": 2.8200568990042673e-07, "logits/chosen": -2.2638182640075684, "logits/rejected": -2.4787094593048096, "logps/chosen": -156.54861450195312, "logps/rejected": -413.94390869140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.366678237915039, "rewards/margins": 29.10895347595215, "rewards/rejected": -32.47563552856445, "step": 770 }, { "epoch": 0.5, "grad_norm": 0.7791877496674509, "learning_rate": 2.784495021337127e-07, "logits/chosen": -2.0938005447387695, "logits/rejected": -2.353400468826294, "logps/chosen": -185.4242401123047, "logps/rejected": -443.72674560546875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -4.7681474685668945, "rewards/margins": 31.02960777282715, "rewards/rejected": -35.79775619506836, "step": 780 }, { "epoch": 0.51, "grad_norm": 0.03127157807702757, "learning_rate": 2.7489331436699857e-07, "logits/chosen": -1.8011245727539062, "logits/rejected": -2.1883206367492676, "logps/chosen": -211.3428192138672, "logps/rejected": -450.826904296875, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -8.674280166625977, "rewards/margins": 28.067846298217773, "rewards/rejected": -36.74212646484375, "step": 790 }, { "epoch": 0.51, "grad_norm": 0.7457419151516147, "learning_rate": 2.7133712660028446e-07, "logits/chosen": -1.7626945972442627, "logits/rejected": -2.1461925506591797, "logps/chosen": -196.3053741455078, "logps/rejected": -414.73797607421875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -8.334815979003906, "rewards/margins": 25.075695037841797, "rewards/rejected": -33.41051483154297, "step": 800 }, { "epoch": 0.52, "grad_norm": 1.5620477401101704, "learning_rate": 2.677809388335704e-07, "logits/chosen": -1.8202106952667236, "logits/rejected": -2.1248602867126465, "logps/chosen": -214.88455200195312, "logps/rejected": -442.2652893066406, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -8.491216659545898, "rewards/margins": 26.589365005493164, "rewards/rejected": -35.08058547973633, "step": 810 }, { "epoch": 0.52, "grad_norm": 0.08613623540760612, "learning_rate": 2.642247510668563e-07, "logits/chosen": -1.7793203592300415, "logits/rejected": -2.0444858074188232, "logps/chosen": -193.2935333251953, "logps/rejected": -417.64971923828125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -7.92196798324585, "rewards/margins": 26.353992462158203, "rewards/rejected": -34.275962829589844, "step": 820 }, { "epoch": 0.53, "grad_norm": 0.17621240034584026, "learning_rate": 2.6066856330014225e-07, "logits/chosen": -1.7755126953125, "logits/rejected": -2.158407688140869, "logps/chosen": -204.4994354248047, "logps/rejected": -403.656494140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.456037521362305, "rewards/margins": 23.214176177978516, "rewards/rejected": -31.670211791992188, "step": 830 }, { "epoch": 0.54, "grad_norm": 0.2755963701991524, "learning_rate": 2.5711237553342815e-07, "logits/chosen": -1.8789972066879272, "logits/rejected": -2.1819803714752197, "logps/chosen": -228.1376190185547, "logps/rejected": -361.107421875, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -8.826920509338379, "rewards/margins": 18.567317962646484, "rewards/rejected": -27.394237518310547, "step": 840 }, { "epoch": 0.54, "grad_norm": 0.3567356300171107, "learning_rate": 2.5355618776671404e-07, "logits/chosen": -1.8468620777130127, "logits/rejected": -2.1529593467712402, "logps/chosen": -211.9832000732422, "logps/rejected": -359.4861755371094, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -8.586137771606445, "rewards/margins": 18.8839054107666, "rewards/rejected": -27.470043182373047, "step": 850 }, { "epoch": 0.55, "grad_norm": 0.8886436285985593, "learning_rate": 2.5e-07, "logits/chosen": -1.8835853338241577, "logits/rejected": -2.2427430152893066, "logps/chosen": -199.7351531982422, "logps/rejected": -357.59173583984375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -8.047845840454102, "rewards/margins": 18.773927688598633, "rewards/rejected": -26.8217716217041, "step": 860 }, { "epoch": 0.56, "grad_norm": 9.219078186901797, "learning_rate": 2.4644381223328594e-07, "logits/chosen": -1.9670299291610718, "logits/rejected": -2.3183913230895996, "logps/chosen": -194.19821166992188, "logps/rejected": -351.28363037109375, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -7.383183479309082, "rewards/margins": 18.614337921142578, "rewards/rejected": -25.997522354125977, "step": 870 }, { "epoch": 0.56, "grad_norm": 1.4090438099252132, "learning_rate": 2.4288762446657183e-07, "logits/chosen": -2.102022647857666, "logits/rejected": -2.330972194671631, "logps/chosen": -205.2625274658203, "logps/rejected": -360.0517578125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -7.618918418884277, "rewards/margins": 19.44923973083496, "rewards/rejected": -27.068157196044922, "step": 880 }, { "epoch": 0.57, "grad_norm": 0.18211470434359084, "learning_rate": 2.393314366998578e-07, "logits/chosen": -2.056673765182495, "logits/rejected": -2.251537322998047, "logps/chosen": -206.20236206054688, "logps/rejected": -347.0545349121094, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -7.2250471115112305, "rewards/margins": 19.100948333740234, "rewards/rejected": -26.325998306274414, "step": 890 }, { "epoch": 0.58, "grad_norm": 0.07707203124683878, "learning_rate": 2.3577524893314365e-07, "logits/chosen": -2.017874002456665, "logits/rejected": -2.231964588165283, "logps/chosen": -202.12701416015625, "logps/rejected": -368.251953125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.979673862457275, "rewards/margins": 21.422286987304688, "rewards/rejected": -28.401958465576172, "step": 900 }, { "epoch": 0.58, "grad_norm": 0.001873680906220021, "learning_rate": 2.322190611664296e-07, "logits/chosen": -2.009934902191162, "logits/rejected": -2.166846752166748, "logps/chosen": -194.57254028320312, "logps/rejected": -352.115234375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.7056145668029785, "rewards/margins": 20.501380920410156, "rewards/rejected": -27.206995010375977, "step": 910 }, { "epoch": 0.59, "grad_norm": 0.08432204531455857, "learning_rate": 2.2866287339971549e-07, "logits/chosen": -2.018400192260742, "logits/rejected": -2.2199556827545166, "logps/chosen": -194.8164520263672, "logps/rejected": -383.21234130859375, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -6.944446563720703, "rewards/margins": 22.65452003479004, "rewards/rejected": -29.598968505859375, "step": 920 }, { "epoch": 0.6, "grad_norm": 0.018002114098924, "learning_rate": 2.251066856330014e-07, "logits/chosen": -1.9886525869369507, "logits/rejected": -2.1241865158081055, "logps/chosen": -203.24600219726562, "logps/rejected": -367.25384521484375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.765786170959473, "rewards/margins": 22.038408279418945, "rewards/rejected": -28.8041934967041, "step": 930 }, { "epoch": 0.6, "grad_norm": 0.17969240498985034, "learning_rate": 2.2155049786628733e-07, "logits/chosen": -1.9080575704574585, "logits/rejected": -2.2484161853790283, "logps/chosen": -193.9615478515625, "logps/rejected": -385.5549011230469, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -6.898443698883057, "rewards/margins": 22.909486770629883, "rewards/rejected": -29.807926177978516, "step": 940 }, { "epoch": 0.61, "grad_norm": 0.44454200648221737, "learning_rate": 2.1799431009957325e-07, "logits/chosen": -1.9811222553253174, "logits/rejected": -2.146716833114624, "logps/chosen": -183.6280975341797, "logps/rejected": -362.95428466796875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.245885372161865, "rewards/margins": 21.720762252807617, "rewards/rejected": -27.96664810180664, "step": 950 }, { "epoch": 0.61, "grad_norm": 0.040533997029042736, "learning_rate": 2.1443812233285914e-07, "logits/chosen": -1.913124680519104, "logits/rejected": -2.1230170726776123, "logps/chosen": -185.01473999023438, "logps/rejected": -370.7552490234375, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -4.70156192779541, "rewards/margins": 23.934677124023438, "rewards/rejected": -28.636241912841797, "step": 960 }, { "epoch": 0.62, "grad_norm": 0.008272842079264974, "learning_rate": 2.108819345661451e-07, "logits/chosen": -1.9076120853424072, "logits/rejected": -2.0281999111175537, "logps/chosen": -170.41903686523438, "logps/rejected": -355.3038024902344, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.106360912322998, "rewards/margins": 23.382038116455078, "rewards/rejected": -27.488399505615234, "step": 970 }, { "epoch": 0.63, "grad_norm": 0.12698409042008263, "learning_rate": 2.0732574679943098e-07, "logits/chosen": -1.7689754962921143, "logits/rejected": -2.0027101039886475, "logps/chosen": -151.05068969726562, "logps/rejected": -353.2601623535156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.083472728729248, "rewards/margins": 23.301097869873047, "rewards/rejected": -27.384571075439453, "step": 980 }, { "epoch": 0.63, "grad_norm": 0.526750803695656, "learning_rate": 2.0376955903271693e-07, "logits/chosen": -1.8657863140106201, "logits/rejected": -2.1982533931732178, "logps/chosen": -149.32810974121094, "logps/rejected": -375.81890869140625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.5236129760742188, "rewards/margins": 25.390033721923828, "rewards/rejected": -28.913650512695312, "step": 990 }, { "epoch": 0.64, "grad_norm": 0.4630571760419437, "learning_rate": 2.0021337126600283e-07, "logits/chosen": -1.8830486536026, "logits/rejected": -2.1774730682373047, "logps/chosen": -171.07569885253906, "logps/rejected": -375.9671630859375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.6209583282470703, "rewards/margins": 24.981908798217773, "rewards/rejected": -28.60286521911621, "step": 1000 }, { "epoch": 0.65, "grad_norm": 0.9404509604551488, "learning_rate": 1.9665718349928875e-07, "logits/chosen": -1.9226410388946533, "logits/rejected": -2.0701773166656494, "logps/chosen": -169.32447814941406, "logps/rejected": -370.9498596191406, "loss": 0.0061, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.240044593811035, "rewards/margins": 24.28924560546875, "rewards/rejected": -28.5292911529541, "step": 1010 }, { "epoch": 0.65, "grad_norm": 0.372566229137264, "learning_rate": 1.931009957325747e-07, "logits/chosen": -1.871727705001831, "logits/rejected": -2.173356533050537, "logps/chosen": -175.205810546875, "logps/rejected": -347.47344970703125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.738024711608887, "rewards/margins": 21.8662052154541, "rewards/rejected": -26.604228973388672, "step": 1020 }, { "epoch": 0.66, "grad_norm": 0.33200076373892806, "learning_rate": 1.895448079658606e-07, "logits/chosen": -1.9309930801391602, "logits/rejected": -2.196063280105591, "logps/chosen": -179.72796630859375, "logps/rejected": -327.9047546386719, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -4.597779273986816, "rewards/margins": 19.917034149169922, "rewards/rejected": -24.514814376831055, "step": 1030 }, { "epoch": 0.67, "grad_norm": 0.06316375390092686, "learning_rate": 1.859886201991465e-07, "logits/chosen": -1.8018014430999756, "logits/rejected": -2.151124954223633, "logps/chosen": -157.43798828125, "logps/rejected": -364.316162109375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.867424011230469, "rewards/margins": 23.382282257080078, "rewards/rejected": -28.249704360961914, "step": 1040 }, { "epoch": 0.67, "grad_norm": 0.017817721309851955, "learning_rate": 1.8243243243243243e-07, "logits/chosen": -1.8452457189559937, "logits/rejected": -2.215014696121216, "logps/chosen": -171.06100463867188, "logps/rejected": -375.04791259765625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.973246097564697, "rewards/margins": 23.576169967651367, "rewards/rejected": -28.549413681030273, "step": 1050 }, { "epoch": 0.68, "grad_norm": 0.06108555524800413, "learning_rate": 1.7887624466571835e-07, "logits/chosen": -1.908624291419983, "logits/rejected": -2.1829276084899902, "logps/chosen": -194.8101348876953, "logps/rejected": -388.7918395996094, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -6.008497714996338, "rewards/margins": 24.203271865844727, "rewards/rejected": -30.21177101135254, "step": 1060 }, { "epoch": 0.68, "grad_norm": 0.18272607275595262, "learning_rate": 1.7532005689900424e-07, "logits/chosen": -1.9170128107070923, "logits/rejected": -2.2122740745544434, "logps/chosen": -179.13600158691406, "logps/rejected": -377.26361083984375, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -6.440054416656494, "rewards/margins": 22.63031768798828, "rewards/rejected": -29.07037353515625, "step": 1070 }, { "epoch": 0.69, "grad_norm": 0.05453926680821035, "learning_rate": 1.717638691322902e-07, "logits/chosen": -1.9766355752944946, "logits/rejected": -2.228825569152832, "logps/chosen": -205.33908081054688, "logps/rejected": -388.71685791015625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.267792701721191, "rewards/margins": 23.473896026611328, "rewards/rejected": -29.741689682006836, "step": 1080 }, { "epoch": 0.7, "grad_norm": 8.450138043857905, "learning_rate": 1.6820768136557609e-07, "logits/chosen": -1.929616928100586, "logits/rejected": -2.1833229064941406, "logps/chosen": -177.30699157714844, "logps/rejected": -369.02569580078125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -5.677231311798096, "rewards/margins": 23.399320602416992, "rewards/rejected": -29.076553344726562, "step": 1090 }, { "epoch": 0.7, "grad_norm": 0.2629017265227386, "learning_rate": 1.64651493598862e-07, "logits/chosen": -1.8782516717910767, "logits/rejected": -2.1519296169281006, "logps/chosen": -171.37185668945312, "logps/rejected": -345.8654479980469, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.002556800842285, "rewards/margins": 21.3153018951416, "rewards/rejected": -26.317859649658203, "step": 1100 }, { "epoch": 0.71, "grad_norm": 0.08681759669643914, "learning_rate": 1.6109530583214793e-07, "logits/chosen": -1.944676160812378, "logits/rejected": -2.2825026512145996, "logps/chosen": -172.86749267578125, "logps/rejected": -348.4911804199219, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -5.514270305633545, "rewards/margins": 20.45355987548828, "rewards/rejected": -25.967830657958984, "step": 1110 }, { "epoch": 0.72, "grad_norm": 0.14772131857638982, "learning_rate": 1.5753911806543385e-07, "logits/chosen": -2.047938823699951, "logits/rejected": -2.198514938354492, "logps/chosen": -188.20437622070312, "logps/rejected": -351.10565185546875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -5.5845184326171875, "rewards/margins": 21.684741973876953, "rewards/rejected": -27.26926040649414, "step": 1120 }, { "epoch": 0.72, "grad_norm": 0.020583492505652848, "learning_rate": 1.5398293029871974e-07, "logits/chosen": -2.0079150199890137, "logits/rejected": -2.2482612133026123, "logps/chosen": -177.460205078125, "logps/rejected": -346.9248046875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.606606960296631, "rewards/margins": 20.772077560424805, "rewards/rejected": -26.378686904907227, "step": 1130 }, { "epoch": 0.73, "grad_norm": 1.7059121905673134, "learning_rate": 1.504267425320057e-07, "logits/chosen": -1.9664523601531982, "logits/rejected": -2.1823642253875732, "logps/chosen": -174.95205688476562, "logps/rejected": -388.5026550292969, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -6.427298069000244, "rewards/margins": 23.900110244750977, "rewards/rejected": -30.327407836914062, "step": 1140 }, { "epoch": 0.74, "grad_norm": 0.9977733695628042, "learning_rate": 1.4687055476529158e-07, "logits/chosen": -2.050489664077759, "logits/rejected": -2.270904064178467, "logps/chosen": -186.78663635253906, "logps/rejected": -372.7840270996094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.426581382751465, "rewards/margins": 23.374040603637695, "rewards/rejected": -28.800622940063477, "step": 1150 }, { "epoch": 0.74, "grad_norm": 0.002404789660626006, "learning_rate": 1.4331436699857753e-07, "logits/chosen": -1.9210751056671143, "logits/rejected": -2.240872859954834, "logps/chosen": -171.84713745117188, "logps/rejected": -396.109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.953668117523193, "rewards/margins": 25.2002010345459, "rewards/rejected": -31.15386962890625, "step": 1160 }, { "epoch": 0.75, "grad_norm": 0.018292107983041178, "learning_rate": 1.3975817923186345e-07, "logits/chosen": -2.0512757301330566, "logits/rejected": -2.261383533477783, "logps/chosen": -198.47787475585938, "logps/rejected": -409.4540710449219, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.596243858337402, "rewards/margins": 26.373615264892578, "rewards/rejected": -31.969860076904297, "step": 1170 }, { "epoch": 0.75, "grad_norm": 8.705213421112179, "learning_rate": 1.3620199146514935e-07, "logits/chosen": -1.9703617095947266, "logits/rejected": -2.2260239124298096, "logps/chosen": -185.51535034179688, "logps/rejected": -381.4412536621094, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -5.784358978271484, "rewards/margins": 23.76874542236328, "rewards/rejected": -29.5531063079834, "step": 1180 }, { "epoch": 0.76, "grad_norm": 1.1752034294380853, "learning_rate": 1.326458036984353e-07, "logits/chosen": -2.005053758621216, "logits/rejected": -2.2459633350372314, "logps/chosen": -178.1356964111328, "logps/rejected": -355.98211669921875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -5.596381664276123, "rewards/margins": 21.09072494506836, "rewards/rejected": -26.687108993530273, "step": 1190 }, { "epoch": 0.77, "grad_norm": 0.05160111703860194, "learning_rate": 1.290896159317212e-07, "logits/chosen": -2.0035533905029297, "logits/rejected": -2.2457115650177, "logps/chosen": -178.15768432617188, "logps/rejected": -336.3076171875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.5091423988342285, "rewards/margins": 19.467090606689453, "rewards/rejected": -24.97623634338379, "step": 1200 }, { "epoch": 0.77, "grad_norm": 0.061931926760717695, "learning_rate": 1.255334281650071e-07, "logits/chosen": -2.0695555210113525, "logits/rejected": -2.266425132751465, "logps/chosen": -189.8621826171875, "logps/rejected": -356.81158447265625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.535902500152588, "rewards/margins": 21.406814575195312, "rewards/rejected": -26.94271469116211, "step": 1210 }, { "epoch": 0.78, "grad_norm": 0.13621183527581743, "learning_rate": 1.2197724039829303e-07, "logits/chosen": -2.019291877746582, "logits/rejected": -2.1981494426727295, "logps/chosen": -187.91079711914062, "logps/rejected": -356.6788330078125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.211094856262207, "rewards/margins": 21.122692108154297, "rewards/rejected": -27.333786010742188, "step": 1220 }, { "epoch": 0.79, "grad_norm": 0.013930629258428272, "learning_rate": 1.1842105263157894e-07, "logits/chosen": -1.9782531261444092, "logits/rejected": -2.2405457496643066, "logps/chosen": -169.53475952148438, "logps/rejected": -377.62298583984375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.386349678039551, "rewards/margins": 23.819812774658203, "rewards/rejected": -29.206165313720703, "step": 1230 }, { "epoch": 0.79, "grad_norm": 0.63025336913527, "learning_rate": 1.1486486486486487e-07, "logits/chosen": -1.9963802099227905, "logits/rejected": -2.239816427230835, "logps/chosen": -163.4903106689453, "logps/rejected": -334.11529541015625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.6800830364227295, "rewards/margins": 21.3232479095459, "rewards/rejected": -25.003332138061523, "step": 1240 }, { "epoch": 0.8, "grad_norm": 0.055495248029239844, "learning_rate": 1.1130867709815078e-07, "logits/chosen": -1.9950904846191406, "logits/rejected": -2.1991913318634033, "logps/chosen": -180.9514923095703, "logps/rejected": -378.4940185546875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.002464771270752, "rewards/margins": 24.19596290588379, "rewards/rejected": -29.19842529296875, "step": 1250 }, { "epoch": 0.81, "grad_norm": 0.23651663696164235, "learning_rate": 1.077524893314367e-07, "logits/chosen": -2.0281214714050293, "logits/rejected": -2.173586368560791, "logps/chosen": -199.83065795898438, "logps/rejected": -370.74591064453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.204631328582764, "rewards/margins": 23.164012908935547, "rewards/rejected": -29.3686466217041, "step": 1260 }, { "epoch": 0.81, "grad_norm": 87.17809979841759, "learning_rate": 1.0419630156472262e-07, "logits/chosen": -1.8399674892425537, "logits/rejected": -2.0879006385803223, "logps/chosen": -224.9874267578125, "logps/rejected": -438.8709411621094, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -10.09913444519043, "rewards/margins": 24.833473205566406, "rewards/rejected": -34.9326057434082, "step": 1270 }, { "epoch": 0.82, "grad_norm": 0.03529290404850911, "learning_rate": 1.0064011379800854e-07, "logits/chosen": -2.116236686706543, "logits/rejected": -2.2769484519958496, "logps/chosen": -199.0480499267578, "logps/rejected": -381.0927734375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.298292636871338, "rewards/margins": 23.004474639892578, "rewards/rejected": -29.30276870727539, "step": 1280 }, { "epoch": 0.83, "grad_norm": 0.01781110705857936, "learning_rate": 9.708392603129445e-08, "logits/chosen": -2.098355770111084, "logits/rejected": -2.3084280490875244, "logps/chosen": -185.13162231445312, "logps/rejected": -364.68902587890625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -6.411550998687744, "rewards/margins": 21.45414924621582, "rewards/rejected": -27.865697860717773, "step": 1290 }, { "epoch": 0.83, "grad_norm": 0.21380541746917897, "learning_rate": 9.352773826458037e-08, "logits/chosen": -2.1513562202453613, "logits/rejected": -2.315798282623291, "logps/chosen": -192.1603546142578, "logps/rejected": -370.1746520996094, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.151523590087891, "rewards/margins": 22.305927276611328, "rewards/rejected": -28.457448959350586, "step": 1300 }, { "epoch": 0.84, "grad_norm": 0.053698061298588354, "learning_rate": 8.997155049786629e-08, "logits/chosen": -2.152535915374756, "logits/rejected": -2.2665324211120605, "logps/chosen": -185.61489868164062, "logps/rejected": -373.6748962402344, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.086173057556152, "rewards/margins": 22.81212615966797, "rewards/rejected": -28.898296356201172, "step": 1310 }, { "epoch": 0.84, "grad_norm": 0.21920871800240746, "learning_rate": 8.64153627311522e-08, "logits/chosen": -2.11517333984375, "logits/rejected": -2.210675001144409, "logps/chosen": -184.41207885742188, "logps/rejected": -339.87030029296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.820789337158203, "rewards/margins": 20.895233154296875, "rewards/rejected": -26.71602439880371, "step": 1320 }, { "epoch": 0.85, "grad_norm": 0.09090747369415979, "learning_rate": 8.285917496443812e-08, "logits/chosen": -2.1557207107543945, "logits/rejected": -2.3053669929504395, "logps/chosen": -193.05734252929688, "logps/rejected": -388.4326171875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -5.772280216217041, "rewards/margins": 23.938325881958008, "rewards/rejected": -29.71060562133789, "step": 1330 }, { "epoch": 0.86, "grad_norm": 0.018865859918979065, "learning_rate": 7.930298719772404e-08, "logits/chosen": -2.1138229370117188, "logits/rejected": -2.251180648803711, "logps/chosen": -168.01907348632812, "logps/rejected": -391.6059265136719, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -5.083653450012207, "rewards/margins": 25.798690795898438, "rewards/rejected": -30.882349014282227, "step": 1340 }, { "epoch": 0.86, "grad_norm": 0.015984656139045435, "learning_rate": 7.574679943100994e-08, "logits/chosen": -2.174654722213745, "logits/rejected": -2.2935032844543457, "logps/chosen": -176.84854125976562, "logps/rejected": -357.89007568359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.099386215209961, "rewards/margins": 22.505481719970703, "rewards/rejected": -27.604867935180664, "step": 1350 }, { "epoch": 0.87, "grad_norm": 0.5463133360618999, "learning_rate": 7.219061166429587e-08, "logits/chosen": -2.146613359451294, "logits/rejected": -2.340137004852295, "logps/chosen": -194.63243103027344, "logps/rejected": -398.6822204589844, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.568980693817139, "rewards/margins": 25.14784812927246, "rewards/rejected": -30.71683120727539, "step": 1360 }, { "epoch": 0.88, "grad_norm": 0.5871369349452629, "learning_rate": 6.863442389758179e-08, "logits/chosen": -2.031893491744995, "logits/rejected": -2.307396411895752, "logps/chosen": -166.64334106445312, "logps/rejected": -362.17724609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.520142555236816, "rewards/margins": 21.91988754272461, "rewards/rejected": -27.440032958984375, "step": 1370 }, { "epoch": 0.88, "grad_norm": 0.001151297675766669, "learning_rate": 6.507823613086771e-08, "logits/chosen": -2.0853652954101562, "logits/rejected": -2.2990689277648926, "logps/chosen": -166.94778442382812, "logps/rejected": -373.9269104003906, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.408658981323242, "rewards/margins": 23.76215362548828, "rewards/rejected": -29.170812606811523, "step": 1380 }, { "epoch": 0.89, "grad_norm": 0.09711906493171378, "learning_rate": 6.152204836415363e-08, "logits/chosen": -2.073453426361084, "logits/rejected": -2.21116304397583, "logps/chosen": -171.67530822753906, "logps/rejected": -370.3066101074219, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.4291462898254395, "rewards/margins": 23.211612701416016, "rewards/rejected": -28.640756607055664, "step": 1390 }, { "epoch": 0.9, "grad_norm": 0.12505838228275154, "learning_rate": 5.796586059743954e-08, "logits/chosen": -2.0895724296569824, "logits/rejected": -2.207096576690674, "logps/chosen": -163.5934600830078, "logps/rejected": -365.29693603515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.118988990783691, "rewards/margins": 23.092578887939453, "rewards/rejected": -28.21156883239746, "step": 1400 }, { "epoch": 0.9, "grad_norm": 2.6436825298723967, "learning_rate": 5.4409672830725456e-08, "logits/chosen": -2.142705202102661, "logits/rejected": -2.2424240112304688, "logps/chosen": -190.54376220703125, "logps/rejected": -385.0908203125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.370065689086914, "rewards/margins": 24.37637710571289, "rewards/rejected": -29.746444702148438, "step": 1410 }, { "epoch": 0.91, "grad_norm": 0.5795739537460435, "learning_rate": 5.0853485064011376e-08, "logits/chosen": -2.0973267555236816, "logits/rejected": -2.2426836490631104, "logps/chosen": -177.80783081054688, "logps/rejected": -375.2071228027344, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.52896785736084, "rewards/margins": 23.480552673339844, "rewards/rejected": -29.009521484375, "step": 1420 }, { "epoch": 0.91, "grad_norm": 2.1741346744880965, "learning_rate": 4.72972972972973e-08, "logits/chosen": -2.207512617111206, "logits/rejected": -2.280388832092285, "logps/chosen": -201.04510498046875, "logps/rejected": -363.89398193359375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.225054740905762, "rewards/margins": 22.87972068786621, "rewards/rejected": -28.10477638244629, "step": 1430 }, { "epoch": 0.92, "grad_norm": 0.20540781104484357, "learning_rate": 4.374110953058322e-08, "logits/chosen": -2.1444334983825684, "logits/rejected": -2.261331081390381, "logps/chosen": -170.37319946289062, "logps/rejected": -368.539794921875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.832165718078613, "rewards/margins": 23.566085815429688, "rewards/rejected": -28.398250579833984, "step": 1440 }, { "epoch": 0.93, "grad_norm": 0.05727460316662975, "learning_rate": 4.018492176386913e-08, "logits/chosen": -2.1009721755981445, "logits/rejected": -2.2630536556243896, "logps/chosen": -166.5227508544922, "logps/rejected": -385.5596923828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.496529579162598, "rewards/margins": 25.6341552734375, "rewards/rejected": -30.130685806274414, "step": 1450 }, { "epoch": 0.93, "grad_norm": 0.013882423497417175, "learning_rate": 3.6628733997155046e-08, "logits/chosen": -2.139456272125244, "logits/rejected": -2.292842149734497, "logps/chosen": -169.7640838623047, "logps/rejected": -355.3850402832031, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -4.837841987609863, "rewards/margins": 22.442852020263672, "rewards/rejected": -27.28069496154785, "step": 1460 }, { "epoch": 0.94, "grad_norm": 0.4937457805533542, "learning_rate": 3.3072546230440967e-08, "logits/chosen": -2.09645414352417, "logits/rejected": -2.3198351860046387, "logps/chosen": -166.16136169433594, "logps/rejected": -372.30316162109375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.320296764373779, "rewards/margins": 23.720216751098633, "rewards/rejected": -28.040512084960938, "step": 1470 }, { "epoch": 0.95, "grad_norm": 0.00827921495481582, "learning_rate": 2.9516358463726884e-08, "logits/chosen": -2.13936185836792, "logits/rejected": -2.291224241256714, "logps/chosen": -171.7252960205078, "logps/rejected": -388.84356689453125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.682975769042969, "rewards/margins": 25.828012466430664, "rewards/rejected": -30.5109920501709, "step": 1480 }, { "epoch": 0.95, "grad_norm": 0.004370788618868951, "learning_rate": 2.59601706970128e-08, "logits/chosen": -2.1469688415527344, "logits/rejected": -2.289008617401123, "logps/chosen": -170.2058868408203, "logps/rejected": -381.54656982421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.3197221755981445, "rewards/margins": 25.29262924194336, "rewards/rejected": -29.612350463867188, "step": 1490 }, { "epoch": 0.96, "grad_norm": 0.19726476194562312, "learning_rate": 2.240398293029872e-08, "logits/chosen": -2.1768240928649902, "logits/rejected": -2.3078079223632812, "logps/chosen": -173.7339630126953, "logps/rejected": -385.33099365234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.993574619293213, "rewards/margins": 25.351411819458008, "rewards/rejected": -29.344989776611328, "step": 1500 }, { "epoch": 0.97, "grad_norm": 0.00858791476540018, "learning_rate": 1.8847795163584636e-08, "logits/chosen": -2.18503737449646, "logits/rejected": -2.308922529220581, "logps/chosen": -176.88064575195312, "logps/rejected": -373.804443359375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.562438011169434, "rewards/margins": 24.36642837524414, "rewards/rejected": -28.928863525390625, "step": 1510 }, { "epoch": 0.97, "grad_norm": 3.138354599232436, "learning_rate": 1.5291607396870554e-08, "logits/chosen": -2.116145610809326, "logits/rejected": -2.2737691402435303, "logps/chosen": -175.29978942871094, "logps/rejected": -384.5267639160156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.154788017272949, "rewards/margins": 24.504108428955078, "rewards/rejected": -29.658893585205078, "step": 1520 }, { "epoch": 0.98, "grad_norm": 0.03067506561400629, "learning_rate": 1.1735419630156473e-08, "logits/chosen": -2.0927538871765137, "logits/rejected": -2.2137911319732666, "logps/chosen": -163.97552490234375, "logps/rejected": -366.1586608886719, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -4.887188911437988, "rewards/margins": 23.64038848876953, "rewards/rejected": -28.527576446533203, "step": 1530 }, { "epoch": 0.99, "grad_norm": 0.44417234512279563, "learning_rate": 8.179231863442388e-09, "logits/chosen": -2.15065860748291, "logits/rejected": -2.25085711479187, "logps/chosen": -174.73416137695312, "logps/rejected": -349.6435241699219, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.77908992767334, "rewards/margins": 21.8181209564209, "rewards/rejected": -26.597209930419922, "step": 1540 }, { "epoch": 0.99, "grad_norm": 0.02198506573420274, "learning_rate": 4.623044096728307e-09, "logits/chosen": -2.1306891441345215, "logits/rejected": -2.3441672325134277, "logps/chosen": -183.6136016845703, "logps/rejected": -383.9253234863281, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.792498588562012, "rewards/margins": 24.779081344604492, "rewards/rejected": -29.571582794189453, "step": 1550 }, { "epoch": 1.0, "grad_norm": 0.01071067202668103, "learning_rate": 1.0668563300142248e-09, "logits/chosen": -2.1825332641601562, "logits/rejected": -2.2997748851776123, "logps/chosen": -168.47274780273438, "logps/rejected": -385.79119873046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.786942481994629, "rewards/margins": 25.29857635498047, "rewards/rejected": -30.085519790649414, "step": 1560 }, { "epoch": 1.0, "step": 1563, "total_flos": 0.0, "train_loss": 0.013828172848911992, "train_runtime": 8967.2368, "train_samples_per_second": 5.576, "train_steps_per_second": 0.174 } ], "logging_steps": 10, "max_steps": 1563, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }