diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10762 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9990680335507922, + "eval_steps": 500, + "global_step": 670, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0014911463187325257, + "grad_norm": 131.6420531261408, + "learning_rate": 2.941176470588235e-08, + "logits/chosen": 1.860435962677002, + "logits/rejected": 2.0789663791656494, + "logps/chosen": -1.3781263828277588, + "logps/rejected": -1.480776309967041, + "loss": 4.9287, + "nll_loss": 1.3781262636184692, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.78126335144043, + "rewards/margins": 1.0264991521835327, + "rewards/rejected": -14.80776309967041, + "step": 1 + }, + { + "epoch": 0.0029822926374650513, + "grad_norm": 103.40519385579142, + "learning_rate": 5.88235294117647e-08, + "logits/chosen": 3.1205239295959473, + "logits/rejected": 2.894362211227417, + "logps/chosen": -0.8937948346138, + "logps/rejected": -1.622621774673462, + "loss": 3.7891, + "nll_loss": 0.8937948346138, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.937948226928711, + "rewards/margins": 7.28826904296875, + "rewards/rejected": -16.226219177246094, + "step": 2 + }, + { + "epoch": 0.004473438956197577, + "grad_norm": 134.5616836750816, + "learning_rate": 8.823529411764706e-08, + "logits/chosen": 3.2682483196258545, + "logits/rejected": 3.209494113922119, + "logps/chosen": -0.8724318742752075, + "logps/rejected": -1.301709532737732, + "loss": 5.7572, + "nll_loss": 0.8724318742752075, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.724318504333496, + "rewards/margins": 4.292776584625244, + "rewards/rejected": -13.017093658447266, + "step": 3 + }, + { + "epoch": 0.005964585274930103, + "grad_norm": 60.52582088050329, + "learning_rate": 1.176470588235294e-07, + "logits/chosen": 3.037123918533325, + "logits/rejected": 3.441870927810669, + "logps/chosen": -1.0890567302703857, + "logps/rejected": -1.773290753364563, + "loss": 3.3092, + "nll_loss": 1.0890568494796753, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.890567779541016, + "rewards/margins": 6.842340469360352, + "rewards/rejected": -17.732908248901367, + "step": 4 + }, + { + "epoch": 0.007455731593662628, + "grad_norm": 299.54504667805105, + "learning_rate": 1.4705882352941175e-07, + "logits/chosen": 1.6548478603363037, + "logits/rejected": 1.8445292711257935, + "logps/chosen": -0.7503100037574768, + "logps/rejected": -1.4727164506912231, + "loss": 3.4786, + "nll_loss": 0.7503100633621216, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.503100395202637, + "rewards/margins": 7.224064826965332, + "rewards/rejected": -14.727165222167969, + "step": 5 + }, + { + "epoch": 0.008946877912395153, + "grad_norm": 325.1467810043402, + "learning_rate": 1.764705882352941e-07, + "logits/chosen": 1.5452815294265747, + "logits/rejected": 1.7093256711959839, + "logps/chosen": -0.9170266389846802, + "logps/rejected": -1.4084343910217285, + "loss": 7.1976, + "nll_loss": 0.9170266389846802, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.170266151428223, + "rewards/margins": 4.914077281951904, + "rewards/rejected": -14.084342956542969, + "step": 6 + }, + { + "epoch": 0.01043802423112768, + "grad_norm": 77.8378923449896, + "learning_rate": 2.0588235294117645e-07, + "logits/chosen": 1.6482737064361572, + "logits/rejected": 2.253169059753418, + "logps/chosen": -0.9095208644866943, + "logps/rejected": -1.4046701192855835, + "loss": 3.2881, + "nll_loss": 0.9095209240913391, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.095209121704102, + "rewards/margins": 4.951492786407471, + "rewards/rejected": -14.046701431274414, + "step": 7 + }, + { + "epoch": 0.011929170549860205, + "grad_norm": 211.71750009391528, + "learning_rate": 2.352941176470588e-07, + "logits/chosen": 2.1762900352478027, + "logits/rejected": 1.8468989133834839, + "logps/chosen": -0.9643529057502747, + "logps/rejected": -1.0850187540054321, + "loss": 4.9795, + "nll_loss": 0.9643529057502747, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.643528938293457, + "rewards/margins": 1.2066583633422852, + "rewards/rejected": -10.850187301635742, + "step": 8 + }, + { + "epoch": 0.01342031686859273, + "grad_norm": 89.30493536935535, + "learning_rate": 2.6470588235294114e-07, + "logits/chosen": 2.073772430419922, + "logits/rejected": 2.0000739097595215, + "logps/chosen": -1.0350722074508667, + "logps/rejected": -1.282395839691162, + "loss": 3.8779, + "nll_loss": 1.0350722074508667, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.35072135925293, + "rewards/margins": 2.4732351303100586, + "rewards/rejected": -12.823957443237305, + "step": 9 + }, + { + "epoch": 0.014911463187325256, + "grad_norm": 448.01455210884984, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": 2.8771941661834717, + "logits/rejected": 2.724689483642578, + "logps/chosen": -0.9448939561843872, + "logps/rejected": -2.669597625732422, + "loss": 5.7577, + "nll_loss": 0.9448938369750977, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.448939323425293, + "rewards/margins": 17.24703598022461, + "rewards/rejected": -26.69597625732422, + "step": 10 + }, + { + "epoch": 0.01640260950605778, + "grad_norm": 52.029727877884525, + "learning_rate": 3.2352941176470586e-07, + "logits/chosen": 2.3021833896636963, + "logits/rejected": 2.522644281387329, + "logps/chosen": -0.7436463832855225, + "logps/rejected": -0.9411755800247192, + "loss": 3.1168, + "nll_loss": 0.7436463832855225, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.436463832855225, + "rewards/margins": 1.9752916097640991, + "rewards/rejected": -9.411755561828613, + "step": 11 + }, + { + "epoch": 0.017893755824790306, + "grad_norm": 82.59162493133492, + "learning_rate": 3.529411764705882e-07, + "logits/chosen": 1.856669306755066, + "logits/rejected": 2.4997129440307617, + "logps/chosen": -0.7935373187065125, + "logps/rejected": -1.27372145652771, + "loss": 2.2473, + "nll_loss": 0.7935372591018677, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.935372829437256, + "rewards/margins": 4.801842212677002, + "rewards/rejected": -12.737215042114258, + "step": 12 + }, + { + "epoch": 0.01938490214352283, + "grad_norm": 374.9421894888616, + "learning_rate": 3.8235294117647053e-07, + "logits/chosen": 2.017672061920166, + "logits/rejected": 2.787160873413086, + "logps/chosen": -0.9906985759735107, + "logps/rejected": -1.5864818096160889, + "loss": 6.4503, + "nll_loss": 0.9906984567642212, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.90698528289795, + "rewards/margins": 5.9578328132629395, + "rewards/rejected": -15.86481761932373, + "step": 13 + }, + { + "epoch": 0.02087604846225536, + "grad_norm": 43.83168786420967, + "learning_rate": 4.117647058823529e-07, + "logits/chosen": 2.2166688442230225, + "logits/rejected": 2.6085205078125, + "logps/chosen": -0.7322432994842529, + "logps/rejected": -1.4798029661178589, + "loss": 3.4529, + "nll_loss": 0.7322432398796082, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.322432994842529, + "rewards/margins": 7.475597381591797, + "rewards/rejected": -14.798028945922852, + "step": 14 + }, + { + "epoch": 0.022367194780987885, + "grad_norm": 44.227628229595126, + "learning_rate": 4.4117647058823526e-07, + "logits/chosen": 3.3875794410705566, + "logits/rejected": 3.3296778202056885, + "logps/chosen": -1.0347462892532349, + "logps/rejected": -3.000295877456665, + "loss": 2.4859, + "nll_loss": 1.0347462892532349, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.34746265411377, + "rewards/margins": 19.65549087524414, + "rewards/rejected": -30.002954483032227, + "step": 15 + }, + { + "epoch": 0.02385834109972041, + "grad_norm": 325.8346322242626, + "learning_rate": 4.705882352941176e-07, + "logits/chosen": 3.1263082027435303, + "logits/rejected": 3.1564300060272217, + "logps/chosen": -0.895110011100769, + "logps/rejected": -1.4188649654388428, + "loss": 6.0086, + "nll_loss": 0.895110011100769, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.951099395751953, + "rewards/margins": 5.237548828125, + "rewards/rejected": -14.188648223876953, + "step": 16 + }, + { + "epoch": 0.025349487418452936, + "grad_norm": 587.6736588822685, + "learning_rate": 5e-07, + "logits/chosen": 1.43706214427948, + "logits/rejected": 1.386991262435913, + "logps/chosen": -1.0890893936157227, + "logps/rejected": -2.1828713417053223, + "loss": 4.7301, + "nll_loss": 1.0890893936157227, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.890893936157227, + "rewards/margins": 10.93781852722168, + "rewards/rejected": -21.828712463378906, + "step": 17 + }, + { + "epoch": 0.02684063373718546, + "grad_norm": 48.89890995560396, + "learning_rate": 5.294117647058823e-07, + "logits/chosen": 2.488058090209961, + "logits/rejected": 2.8513152599334717, + "logps/chosen": -0.8674882650375366, + "logps/rejected": -2.094695568084717, + "loss": 3.4653, + "nll_loss": 0.8674882650375366, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.674882888793945, + "rewards/margins": 12.272073745727539, + "rewards/rejected": -20.946956634521484, + "step": 18 + }, + { + "epoch": 0.028331780055917986, + "grad_norm": 70.05145049092668, + "learning_rate": 5.588235294117647e-07, + "logits/chosen": 1.3571797609329224, + "logits/rejected": 1.412168025970459, + "logps/chosen": -0.5889593958854675, + "logps/rejected": -0.9319955110549927, + "loss": 3.0801, + "nll_loss": 0.5889593362808228, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.889594078063965, + "rewards/margins": 3.430360794067383, + "rewards/rejected": -9.319953918457031, + "step": 19 + }, + { + "epoch": 0.02982292637465051, + "grad_norm": 451.3908655974561, + "learning_rate": 5.88235294117647e-07, + "logits/chosen": 2.2031502723693848, + "logits/rejected": 2.382431983947754, + "logps/chosen": -1.0798367261886597, + "logps/rejected": -1.9304254055023193, + "loss": 7.9661, + "nll_loss": 1.0798367261886597, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.79836654663086, + "rewards/margins": 8.505887985229492, + "rewards/rejected": -19.30425453186035, + "step": 20 + }, + { + "epoch": 0.03131407269338304, + "grad_norm": 52.30093423337889, + "learning_rate": 6.176470588235294e-07, + "logits/chosen": 2.417792797088623, + "logits/rejected": 2.771883487701416, + "logps/chosen": -1.1893094778060913, + "logps/rejected": -1.3052068948745728, + "loss": 3.2391, + "nll_loss": 1.1893094778060913, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.893095016479492, + "rewards/margins": 1.1589728593826294, + "rewards/rejected": -13.052067756652832, + "step": 21 + }, + { + "epoch": 0.03280521901211556, + "grad_norm": 308.86340481320207, + "learning_rate": 6.470588235294117e-07, + "logits/chosen": 2.9378466606140137, + "logits/rejected": 2.937854528427124, + "logps/chosen": -3.167470693588257, + "logps/rejected": -1.3397622108459473, + "loss": 6.9591, + "nll_loss": 3.167470693588257, + "rewards/accuracies": 0.5, + "rewards/chosen": -31.67470932006836, + "rewards/margins": -18.277084350585938, + "rewards/rejected": -13.397623062133789, + "step": 22 + }, + { + "epoch": 0.03429636533084809, + "grad_norm": 98.96765350390677, + "learning_rate": 6.764705882352941e-07, + "logits/chosen": 2.7766013145446777, + "logits/rejected": 2.894148111343384, + "logps/chosen": -1.078629970550537, + "logps/rejected": -2.440201997756958, + "loss": 3.5107, + "nll_loss": 1.0786300897598267, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.786300659179688, + "rewards/margins": 13.61571979522705, + "rewards/rejected": -24.402023315429688, + "step": 23 + }, + { + "epoch": 0.03578751164958061, + "grad_norm": 396.6066334256457, + "learning_rate": 7.058823529411765e-07, + "logits/chosen": 2.2593963146209717, + "logits/rejected": 2.1909866333007812, + "logps/chosen": -0.9177834987640381, + "logps/rejected": -1.28047513961792, + "loss": 5.0049, + "nll_loss": 0.9177834987640381, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.177834510803223, + "rewards/margins": 3.6269168853759766, + "rewards/rejected": -12.804752349853516, + "step": 24 + }, + { + "epoch": 0.03727865796831314, + "grad_norm": 408.09262397804815, + "learning_rate": 7.352941176470589e-07, + "logits/chosen": 1.8168671131134033, + "logits/rejected": 2.147278070449829, + "logps/chosen": -0.9170699119567871, + "logps/rejected": -1.5485506057739258, + "loss": 6.9351, + "nll_loss": 0.9170699119567871, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.170699119567871, + "rewards/margins": 6.314807415008545, + "rewards/rejected": -15.485507011413574, + "step": 25 + }, + { + "epoch": 0.03876980428704566, + "grad_norm": 48.476267730529635, + "learning_rate": 7.647058823529411e-07, + "logits/chosen": 2.122601270675659, + "logits/rejected": 2.130781888961792, + "logps/chosen": -0.8701527714729309, + "logps/rejected": -4.8353047370910645, + "loss": 2.8474, + "nll_loss": 0.8701527714729309, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.701526641845703, + "rewards/margins": 39.651519775390625, + "rewards/rejected": -48.353050231933594, + "step": 26 + }, + { + "epoch": 0.040260950605778195, + "grad_norm": 60.6168649217365, + "learning_rate": 7.941176470588235e-07, + "logits/chosen": 2.607726812362671, + "logits/rejected": 2.7943997383117676, + "logps/chosen": -0.9930699467658997, + "logps/rejected": -2.903250217437744, + "loss": 1.8576, + "nll_loss": 0.9930700063705444, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.930700302124023, + "rewards/margins": 19.101806640625, + "rewards/rejected": -29.03250503540039, + "step": 27 + }, + { + "epoch": 0.04175209692451072, + "grad_norm": 323.1578672926767, + "learning_rate": 8.235294117647058e-07, + "logits/chosen": 1.7018678188323975, + "logits/rejected": 2.3391165733337402, + "logps/chosen": -0.7163273096084595, + "logps/rejected": -1.9551293849945068, + "loss": 4.318, + "nll_loss": 0.7163272500038147, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.163272857666016, + "rewards/margins": 12.388022422790527, + "rewards/rejected": -19.55129623413086, + "step": 28 + }, + { + "epoch": 0.043243243243243246, + "grad_norm": 648.8657305986615, + "learning_rate": 8.529411764705882e-07, + "logits/chosen": 2.8854246139526367, + "logits/rejected": 2.9412496089935303, + "logps/chosen": -0.7824481129646301, + "logps/rejected": -3.6611781120300293, + "loss": 7.4867, + "nll_loss": 0.7824481129646301, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.824481010437012, + "rewards/margins": 28.787302017211914, + "rewards/rejected": -36.611785888671875, + "step": 29 + }, + { + "epoch": 0.04473438956197577, + "grad_norm": 201.44323353397405, + "learning_rate": 8.823529411764705e-07, + "logits/chosen": 2.1763527393341064, + "logits/rejected": 2.4524543285369873, + "logps/chosen": -0.9366781115531921, + "logps/rejected": -1.4384242296218872, + "loss": 6.2664, + "nll_loss": 0.9366780519485474, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.366782188415527, + "rewards/margins": 5.017460346221924, + "rewards/rejected": -14.384241104125977, + "step": 30 + }, + { + "epoch": 0.046225535880708296, + "grad_norm": 58.18949616340279, + "learning_rate": 9.117647058823529e-07, + "logits/chosen": 1.7160942554473877, + "logits/rejected": 1.9639906883239746, + "logps/chosen": -0.7702720761299133, + "logps/rejected": -1.9634369611740112, + "loss": 2.8173, + "nll_loss": 0.7702720761299133, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.702720642089844, + "rewards/margins": 11.931649208068848, + "rewards/rejected": -19.634368896484375, + "step": 31 + }, + { + "epoch": 0.04771668219944082, + "grad_norm": 53.2747982513516, + "learning_rate": 9.411764705882352e-07, + "logits/chosen": 1.9022067785263062, + "logits/rejected": 2.0756680965423584, + "logps/chosen": -0.7626714706420898, + "logps/rejected": -1.3213756084442139, + "loss": 3.8545, + "nll_loss": 0.7626714706420898, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.62671422958374, + "rewards/margins": 5.587039947509766, + "rewards/rejected": -13.213754653930664, + "step": 32 + }, + { + "epoch": 0.04920782851817335, + "grad_norm": 123.13299849078159, + "learning_rate": 9.705882352941176e-07, + "logits/chosen": 2.2254092693328857, + "logits/rejected": 2.159079074859619, + "logps/chosen": -0.6334936618804932, + "logps/rejected": -1.3932712078094482, + "loss": 2.7757, + "nll_loss": 0.6334936022758484, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.334936618804932, + "rewards/margins": 7.597774505615234, + "rewards/rejected": -13.932710647583008, + "step": 33 + }, + { + "epoch": 0.05069897483690587, + "grad_norm": 514.3589700810505, + "learning_rate": 1e-06, + "logits/chosen": 2.442638874053955, + "logits/rejected": 2.389988422393799, + "logps/chosen": -0.9179913997650146, + "logps/rejected": -2.180492877960205, + "loss": 5.3914, + "nll_loss": 0.9179913997650146, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.179913520812988, + "rewards/margins": 12.625017166137695, + "rewards/rejected": -21.804927825927734, + "step": 34 + }, + { + "epoch": 0.0521901211556384, + "grad_norm": 351.36582028421606, + "learning_rate": 9.999939000729715e-07, + "logits/chosen": 3.020676374435425, + "logits/rejected": 2.6687097549438477, + "logps/chosen": -1.1960551738739014, + "logps/rejected": -1.4366477727890015, + "loss": 5.0978, + "nll_loss": 1.1960551738739014, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.960551261901855, + "rewards/margins": 2.4059267044067383, + "rewards/rejected": -14.366477966308594, + "step": 35 + }, + { + "epoch": 0.05368126747437092, + "grad_norm": 33.27397343776287, + "learning_rate": 9.999756004407228e-07, + "logits/chosen": 2.4607324600219727, + "logits/rejected": 2.1372451782226562, + "logps/chosen": -0.7419140338897705, + "logps/rejected": -1.294709324836731, + "loss": 2.6714, + "nll_loss": 0.7419140338897705, + "rewards/accuracies": 0.625, + "rewards/chosen": -7.419139862060547, + "rewards/margins": 5.527953147888184, + "rewards/rejected": -12.94709300994873, + "step": 36 + }, + { + "epoch": 0.05517241379310345, + "grad_norm": 481.47428057363646, + "learning_rate": 9.999451015497595e-07, + "logits/chosen": 1.1678117513656616, + "logits/rejected": 1.1830850839614868, + "logps/chosen": -0.8966320753097534, + "logps/rejected": -1.7006372213363647, + "loss": 7.4877, + "nll_loss": 0.8966320157051086, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.966320991516113, + "rewards/margins": 8.04005241394043, + "rewards/rejected": -17.00637435913086, + "step": 37 + }, + { + "epoch": 0.05666356011183597, + "grad_norm": 93.58445404338936, + "learning_rate": 9.999024041442455e-07, + "logits/chosen": 2.5838398933410645, + "logits/rejected": 2.9061648845672607, + "logps/chosen": -1.2142046689987183, + "logps/rejected": -1.276504397392273, + "loss": 4.1691, + "nll_loss": 1.2142047882080078, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.142046928405762, + "rewards/margins": 0.6229971051216125, + "rewards/rejected": -12.765044212341309, + "step": 38 + }, + { + "epoch": 0.0581547064305685, + "grad_norm": 40.82279691083783, + "learning_rate": 9.998475092659849e-07, + "logits/chosen": 1.7935913801193237, + "logits/rejected": 2.12984299659729, + "logps/chosen": -0.8291258215904236, + "logps/rejected": -1.4954478740692139, + "loss": 2.2373, + "nll_loss": 0.8291257619857788, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.291257858276367, + "rewards/margins": 6.66322135925293, + "rewards/rejected": -14.954479217529297, + "step": 39 + }, + { + "epoch": 0.05964585274930102, + "grad_norm": 53.81500379033792, + "learning_rate": 9.99780418254397e-07, + "logits/chosen": 1.685670018196106, + "logits/rejected": 1.92978036403656, + "logps/chosen": -0.5348352789878845, + "logps/rejected": -0.830666720867157, + "loss": 3.8797, + "nll_loss": 0.5348352789878845, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.348352909088135, + "rewards/margins": 2.9583141803741455, + "rewards/rejected": -8.30666732788086, + "step": 40 + }, + { + "epoch": 0.06113699906803355, + "grad_norm": 32.317751201915385, + "learning_rate": 9.99701132746483e-07, + "logits/chosen": 1.0136024951934814, + "logits/rejected": 1.234411597251892, + "logps/chosen": -0.6656249761581421, + "logps/rejected": -1.2858957052230835, + "loss": 2.9056, + "nll_loss": 0.6656249165534973, + "rewards/accuracies": 0.75, + "rewards/chosen": -6.656249046325684, + "rewards/margins": 6.202707290649414, + "rewards/rejected": -12.858956336975098, + "step": 41 + }, + { + "epoch": 0.06262814538676607, + "grad_norm": 37.920752742703854, + "learning_rate": 9.996096546767859e-07, + "logits/chosen": 1.643129825592041, + "logits/rejected": 1.9104418754577637, + "logps/chosen": -0.8929121494293213, + "logps/rejected": -3.583488702774048, + "loss": 2.6743, + "nll_loss": 0.8929121494293213, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.929121017456055, + "rewards/margins": 26.905765533447266, + "rewards/rejected": -35.83489227294922, + "step": 42 + }, + { + "epoch": 0.0641192917054986, + "grad_norm": 220.01417151470528, + "learning_rate": 9.995059862773438e-07, + "logits/chosen": 2.98364520072937, + "logits/rejected": 3.2048516273498535, + "logps/chosen": -0.9894806146621704, + "logps/rejected": -1.338758945465088, + "loss": 4.432, + "nll_loss": 0.9894806742668152, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.894806861877441, + "rewards/margins": 3.4927821159362793, + "rewards/rejected": -13.387588500976562, + "step": 43 + }, + { + "epoch": 0.06561043802423112, + "grad_norm": 129.84779630155947, + "learning_rate": 9.993901300776358e-07, + "logits/chosen": 2.7791249752044678, + "logits/rejected": 2.397531509399414, + "logps/chosen": -0.9826548099517822, + "logps/rejected": -1.3197441101074219, + "loss": 3.2615, + "nll_loss": 0.982654869556427, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.826549530029297, + "rewards/margins": 3.3708925247192383, + "rewards/rejected": -13.197441101074219, + "step": 44 + }, + { + "epoch": 0.06710158434296365, + "grad_norm": 39.440699831745064, + "learning_rate": 9.99262088904519e-07, + "logits/chosen": 2.2303521633148193, + "logits/rejected": 2.4076573848724365, + "logps/chosen": -0.8233699202537537, + "logps/rejected": -2.383819818496704, + "loss": 2.8246, + "nll_loss": 0.8233699202537537, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.233698844909668, + "rewards/margins": 15.604498863220215, + "rewards/rejected": -23.838197708129883, + "step": 45 + }, + { + "epoch": 0.06859273066169617, + "grad_norm": 61.5149468898277, + "learning_rate": 9.991218658821608e-07, + "logits/chosen": 1.7433161735534668, + "logits/rejected": 1.4342732429504395, + "logps/chosen": -0.9544675946235657, + "logps/rejected": -1.520774245262146, + "loss": 3.262, + "nll_loss": 0.9544676542282104, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.544675827026367, + "rewards/margins": 5.6630659103393555, + "rewards/rejected": -15.207741737365723, + "step": 46 + }, + { + "epoch": 0.0700838769804287, + "grad_norm": 46.130144503376954, + "learning_rate": 9.989694644319617e-07, + "logits/chosen": 2.038182497024536, + "logits/rejected": 1.2625888586044312, + "logps/chosen": -1.2647312879562378, + "logps/rejected": -4.061408042907715, + "loss": 3.1697, + "nll_loss": 1.2647314071655273, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.64731216430664, + "rewards/margins": 27.96677017211914, + "rewards/rejected": -40.61408996582031, + "step": 47 + }, + { + "epoch": 0.07157502329916123, + "grad_norm": 59.34170299379562, + "learning_rate": 9.988048882724732e-07, + "logits/chosen": 2.8762378692626953, + "logits/rejected": 2.991826295852661, + "logps/chosen": -0.9993945360183716, + "logps/rejected": -1.4654295444488525, + "loss": 3.4738, + "nll_loss": 0.999394416809082, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.993945121765137, + "rewards/margins": 4.6603498458862305, + "rewards/rejected": -14.654294967651367, + "step": 48 + }, + { + "epoch": 0.07306616961789375, + "grad_norm": 102.94683203629245, + "learning_rate": 9.98628141419305e-07, + "logits/chosen": 2.6454639434814453, + "logits/rejected": 2.2062056064605713, + "logps/chosen": -0.9753769636154175, + "logps/rejected": -1.7307155132293701, + "loss": 3.7492, + "nll_loss": 0.9753769636154175, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.753769874572754, + "rewards/margins": 7.553386211395264, + "rewards/rejected": -17.30715560913086, + "step": 49 + }, + { + "epoch": 0.07455731593662628, + "grad_norm": 100.46810904952117, + "learning_rate": 9.98439228185029e-07, + "logits/chosen": 1.7063933610916138, + "logits/rejected": 2.1597023010253906, + "logps/chosen": -0.791731595993042, + "logps/rejected": -1.4047749042510986, + "loss": 2.9333, + "nll_loss": 0.7917314767837524, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.917316436767578, + "rewards/margins": 6.13043212890625, + "rewards/rejected": -14.047748565673828, + "step": 50 + }, + { + "epoch": 0.0760484622553588, + "grad_norm": 47.125555335318396, + "learning_rate": 9.982381531790732e-07, + "logits/chosen": 2.3178887367248535, + "logits/rejected": 2.353886842727661, + "logps/chosen": -0.9494035840034485, + "logps/rejected": -3.0622718334198, + "loss": 2.6833, + "nll_loss": 0.9494035840034485, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.494034767150879, + "rewards/margins": 21.12868309020996, + "rewards/rejected": -30.622718811035156, + "step": 51 + }, + { + "epoch": 0.07753960857409133, + "grad_norm": 27.564523433846276, + "learning_rate": 9.980249213076084e-07, + "logits/chosen": 2.2474324703216553, + "logits/rejected": 2.361985206604004, + "logps/chosen": -1.0604387521743774, + "logps/rejected": -1.6388243436813354, + "loss": 2.8513, + "nll_loss": 1.0604385137557983, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.604386329650879, + "rewards/margins": 5.783858776092529, + "rewards/rejected": -16.38824462890625, + "step": 52 + }, + { + "epoch": 0.07903075489282387, + "grad_norm": 47.86126354521992, + "learning_rate": 9.977995377734306e-07, + "logits/chosen": 2.8253977298736572, + "logits/rejected": 2.822557210922241, + "logps/chosen": -0.9413918852806091, + "logps/rejected": -1.1508779525756836, + "loss": 3.4441, + "nll_loss": 0.9413918256759644, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.413918495178223, + "rewards/margins": 2.0948617458343506, + "rewards/rejected": -11.508780479431152, + "step": 53 + }, + { + "epoch": 0.08052190121155639, + "grad_norm": 75.25519513038266, + "learning_rate": 9.97562008075832e-07, + "logits/chosen": 2.55771541595459, + "logits/rejected": 2.859192371368408, + "logps/chosen": -1.1559171676635742, + "logps/rejected": -1.4593706130981445, + "loss": 2.9522, + "nll_loss": 1.1559171676635742, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.559171676635742, + "rewards/margins": 3.034536361694336, + "rewards/rejected": -14.593707084655762, + "step": 54 + }, + { + "epoch": 0.08201304753028892, + "grad_norm": 198.24815027946113, + "learning_rate": 9.97312338010468e-07, + "logits/chosen": 1.073845386505127, + "logits/rejected": 0.7879983186721802, + "logps/chosen": -0.825156033039093, + "logps/rejected": -1.5017766952514648, + "loss": 3.7043, + "nll_loss": 0.8251559734344482, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.25156021118164, + "rewards/margins": 6.766207695007324, + "rewards/rejected": -15.017766952514648, + "step": 55 + }, + { + "epoch": 0.08350419384902144, + "grad_norm": 104.23207197532834, + "learning_rate": 9.970505336692153e-07, + "logits/chosen": 1.5102717876434326, + "logits/rejected": 1.6282591819763184, + "logps/chosen": -0.8507825136184692, + "logps/rejected": -1.504204273223877, + "loss": 2.2505, + "nll_loss": 0.850782573223114, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.50782585144043, + "rewards/margins": 6.534218788146973, + "rewards/rejected": -15.042045593261719, + "step": 56 + }, + { + "epoch": 0.08499534016775397, + "grad_norm": 71.0325316202818, + "learning_rate": 9.96776601440023e-07, + "logits/chosen": 1.9039965867996216, + "logits/rejected": 2.3082637786865234, + "logps/chosen": -1.021749496459961, + "logps/rejected": -1.833884596824646, + "loss": 2.796, + "nll_loss": 1.021749496459961, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.21749496459961, + "rewards/margins": 8.121349334716797, + "rewards/rejected": -18.338844299316406, + "step": 57 + }, + { + "epoch": 0.08648648648648649, + "grad_norm": 36.642253230458486, + "learning_rate": 9.964905480067584e-07, + "logits/chosen": 1.909174919128418, + "logits/rejected": 2.0987725257873535, + "logps/chosen": -0.8566492199897766, + "logps/rejected": -1.418030023574829, + "loss": 3.3146, + "nll_loss": 0.8566492795944214, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.566493034362793, + "rewards/margins": 5.6138081550598145, + "rewards/rejected": -14.18030071258545, + "step": 58 + }, + { + "epoch": 0.08797763280521902, + "grad_norm": 61.18925563704483, + "learning_rate": 9.96192380349041e-07, + "logits/chosen": 0.8996385335922241, + "logits/rejected": 0.7567811012268066, + "logps/chosen": -0.71656334400177, + "logps/rejected": -2.003258466720581, + "loss": 2.4045, + "nll_loss": 0.71656334400177, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.165633201599121, + "rewards/margins": 12.866951942443848, + "rewards/rejected": -20.03258514404297, + "step": 59 + }, + { + "epoch": 0.08946877912395154, + "grad_norm": 71.5541363297922, + "learning_rate": 9.958821057420752e-07, + "logits/chosen": 3.4364728927612305, + "logits/rejected": 2.817261219024658, + "logps/chosen": -1.2784249782562256, + "logps/rejected": -1.459514856338501, + "loss": 3.4693, + "nll_loss": 1.2784249782562256, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.784249305725098, + "rewards/margins": 1.8109009265899658, + "rewards/rejected": -14.595149993896484, + "step": 60 + }, + { + "epoch": 0.09095992544268407, + "grad_norm": 39.528470047855876, + "learning_rate": 9.955597317564703e-07, + "logits/chosen": 2.21244215965271, + "logits/rejected": 2.1904029846191406, + "logps/chosen": -0.8074604868888855, + "logps/rejected": -1.7122533321380615, + "loss": 3.0047, + "nll_loss": 0.8074605464935303, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.074604034423828, + "rewards/margins": 9.047929763793945, + "rewards/rejected": -17.122533798217773, + "step": 61 + }, + { + "epoch": 0.09245107176141659, + "grad_norm": 78.73411061437659, + "learning_rate": 9.952252662580579e-07, + "logits/chosen": 2.613969326019287, + "logits/rejected": 2.271270513534546, + "logps/chosen": -0.8780649900436401, + "logps/rejected": -1.5495920181274414, + "loss": 2.8028, + "nll_loss": 0.8780649900436401, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.780649185180664, + "rewards/margins": 6.715270519256592, + "rewards/rejected": -15.49592113494873, + "step": 62 + }, + { + "epoch": 0.09394221808014912, + "grad_norm": 76.89341482591242, + "learning_rate": 9.948787174076981e-07, + "logits/chosen": 2.1724722385406494, + "logits/rejected": 2.4555037021636963, + "logps/chosen": -1.1089688539505005, + "logps/rejected": -1.4283640384674072, + "loss": 2.669, + "nll_loss": 1.1089688539505005, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.08968734741211, + "rewards/margins": 3.193953037261963, + "rewards/rejected": -14.28364086151123, + "step": 63 + }, + { + "epoch": 0.09543336439888164, + "grad_norm": 43.861657538915026, + "learning_rate": 9.94520093661082e-07, + "logits/chosen": 1.7243608236312866, + "logits/rejected": 2.012627601623535, + "logps/chosen": -1.3577117919921875, + "logps/rejected": -1.9268991947174072, + "loss": 3.2175, + "nll_loss": 1.3577117919921875, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.577116966247559, + "rewards/margins": 5.691873550415039, + "rewards/rejected": -19.268991470336914, + "step": 64 + }, + { + "epoch": 0.09692451071761417, + "grad_norm": 87.80753078470336, + "learning_rate": 9.941494037685243e-07, + "logits/chosen": 1.7657297849655151, + "logits/rejected": 1.7299094200134277, + "logps/chosen": -0.83369380235672, + "logps/rejected": -1.3283755779266357, + "loss": 3.6815, + "nll_loss": 0.83369380235672, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.336938858032227, + "rewards/margins": 4.946816444396973, + "rewards/rejected": -13.283754348754883, + "step": 65 + }, + { + "epoch": 0.0984156570363467, + "grad_norm": 37.44360560489009, + "learning_rate": 9.9376665677475e-07, + "logits/chosen": 2.8668999671936035, + "logits/rejected": 2.786085844039917, + "logps/chosen": -0.8269742131233215, + "logps/rejected": -1.4122259616851807, + "loss": 3.2433, + "nll_loss": 0.8269742131233215, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.269742965698242, + "rewards/margins": 5.852518081665039, + "rewards/rejected": -14.122259140014648, + "step": 66 + }, + { + "epoch": 0.09990680335507922, + "grad_norm": 75.06063398922664, + "learning_rate": 9.933718620186744e-07, + "logits/chosen": 2.2985568046569824, + "logits/rejected": 2.2154862880706787, + "logps/chosen": -0.9047732949256897, + "logps/rejected": -2.214324712753296, + "loss": 2.4105, + "nll_loss": 0.9047732353210449, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.04773235321045, + "rewards/margins": 13.095513343811035, + "rewards/rejected": -22.143245697021484, + "step": 67 + }, + { + "epoch": 0.10139794967381174, + "grad_norm": 40.87038075692728, + "learning_rate": 9.929650291331739e-07, + "logits/chosen": 2.396892547607422, + "logits/rejected": 2.3966078758239746, + "logps/chosen": -0.9977419376373291, + "logps/rejected": -3.083773374557495, + "loss": 2.5921, + "nll_loss": 0.9977419376373291, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.977418899536133, + "rewards/margins": 20.86031723022461, + "rewards/rejected": -30.837732315063477, + "step": 68 + }, + { + "epoch": 0.10288909599254427, + "grad_norm": 65.73975701903078, + "learning_rate": 9.925461680448525e-07, + "logits/chosen": 2.516754150390625, + "logits/rejected": 2.5938100814819336, + "logps/chosen": -0.8624992370605469, + "logps/rejected": -1.3350712060928345, + "loss": 2.9874, + "nll_loss": 0.8624992966651917, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.624992370605469, + "rewards/margins": 4.725719451904297, + "rewards/rejected": -13.350711822509766, + "step": 69 + }, + { + "epoch": 0.1043802423112768, + "grad_norm": 40.804175507450815, + "learning_rate": 9.921152889737984e-07, + "logits/chosen": 2.944434404373169, + "logits/rejected": 3.123687744140625, + "logps/chosen": -1.1898823976516724, + "logps/rejected": -2.2927870750427246, + "loss": 2.6063, + "nll_loss": 1.1898823976516724, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.898823738098145, + "rewards/margins": 11.029047966003418, + "rewards/rejected": -22.927871704101562, + "step": 70 + }, + { + "epoch": 0.10587138863000932, + "grad_norm": 43.42248899213977, + "learning_rate": 9.916724024333349e-07, + "logits/chosen": 1.6870605945587158, + "logits/rejected": 2.1503679752349854, + "logps/chosen": -0.8132368326187134, + "logps/rejected": -2.27059268951416, + "loss": 2.3886, + "nll_loss": 0.8132367730140686, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.132368087768555, + "rewards/margins": 14.57356071472168, + "rewards/rejected": -22.705928802490234, + "step": 71 + }, + { + "epoch": 0.10736253494874184, + "grad_norm": 43.24943223571458, + "learning_rate": 9.912175192297647e-07, + "logits/chosen": 2.581845283508301, + "logits/rejected": 2.648552894592285, + "logps/chosen": -1.249975562095642, + "logps/rejected": -3.19691801071167, + "loss": 3.2368, + "nll_loss": 1.249975562095642, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.499755859375, + "rewards/margins": 19.469423294067383, + "rewards/rejected": -31.969181060791016, + "step": 72 + }, + { + "epoch": 0.10885368126747437, + "grad_norm": 86.67815176112049, + "learning_rate": 9.90750650462105e-07, + "logits/chosen": 2.0326600074768066, + "logits/rejected": 1.7139393091201782, + "logps/chosen": -0.8729643821716309, + "logps/rejected": -2.466681718826294, + "loss": 3.1837, + "nll_loss": 0.8729644417762756, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.729644775390625, + "rewards/margins": 15.937172889709473, + "rewards/rejected": -24.66681671142578, + "step": 73 + }, + { + "epoch": 0.1103448275862069, + "grad_norm": 85.54815320638102, + "learning_rate": 9.902718075218176e-07, + "logits/chosen": 2.492297649383545, + "logits/rejected": 2.0416009426116943, + "logps/chosen": -0.8708876371383667, + "logps/rejected": -3.123574733734131, + "loss": 2.7176, + "nll_loss": 0.8708876371383667, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.708876609802246, + "rewards/margins": 22.526870727539062, + "rewards/rejected": -31.235746383666992, + "step": 74 + }, + { + "epoch": 0.11183597390493942, + "grad_norm": 175.12315126357532, + "learning_rate": 9.8978100209253e-07, + "logits/chosen": 2.0305941104888916, + "logits/rejected": 2.296379566192627, + "logps/chosen": -0.9669104814529419, + "logps/rejected": -2.073458194732666, + "loss": 3.1995, + "nll_loss": 0.9669104814529419, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.669103622436523, + "rewards/margins": 11.065479278564453, + "rewards/rejected": -20.73458480834961, + "step": 75 + }, + { + "epoch": 0.11332712022367195, + "grad_norm": 46.256680217091585, + "learning_rate": 9.89278246149752e-07, + "logits/chosen": 3.132906913757324, + "logits/rejected": 3.1125741004943848, + "logps/chosen": -0.998193621635437, + "logps/rejected": -1.794511079788208, + "loss": 2.4239, + "nll_loss": 0.9981937408447266, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.981935501098633, + "rewards/margins": 7.963172912597656, + "rewards/rejected": -17.945110321044922, + "step": 76 + }, + { + "epoch": 0.11481826654240447, + "grad_norm": 96.89440127224765, + "learning_rate": 9.887635519605815e-07, + "logits/chosen": 1.6895731687545776, + "logits/rejected": 2.142848014831543, + "logps/chosen": -1.096168875694275, + "logps/rejected": -1.9435949325561523, + "loss": 3.3364, + "nll_loss": 1.096168875694275, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.961688995361328, + "rewards/margins": 8.474259376525879, + "rewards/rejected": -19.435949325561523, + "step": 77 + }, + { + "epoch": 0.116309412861137, + "grad_norm": 44.44278994953829, + "learning_rate": 9.882369320834068e-07, + "logits/chosen": 2.2599291801452637, + "logits/rejected": 1.5205597877502441, + "logps/chosen": -0.9418594837188721, + "logps/rejected": -1.2209088802337646, + "loss": 2.9725, + "nll_loss": 0.9418594837188721, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.418594360351562, + "rewards/margins": 2.7904937267303467, + "rewards/rejected": -12.209087371826172, + "step": 78 + }, + { + "epoch": 0.11780055917986952, + "grad_norm": 107.08151395089075, + "learning_rate": 9.876983993675989e-07, + "logits/chosen": 2.659163236618042, + "logits/rejected": 2.7966227531433105, + "logps/chosen": -0.8889027833938599, + "logps/rejected": -1.249839425086975, + "loss": 3.018, + "nll_loss": 0.8889028429985046, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.88902759552002, + "rewards/margins": 3.609365940093994, + "rewards/rejected": -12.498394966125488, + "step": 79 + }, + { + "epoch": 0.11929170549860205, + "grad_norm": 22.34809477010669, + "learning_rate": 9.871479669531988e-07, + "logits/chosen": 2.099395513534546, + "logits/rejected": 2.030531406402588, + "logps/chosen": -0.9590896368026733, + "logps/rejected": -1.314590334892273, + "loss": 2.559, + "nll_loss": 0.9590896368026733, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.590896606445312, + "rewards/margins": 3.555006742477417, + "rewards/rejected": -13.145903587341309, + "step": 80 + }, + { + "epoch": 0.12078285181733457, + "grad_norm": 50.15274973664094, + "learning_rate": 9.865856482705972e-07, + "logits/chosen": 3.312157154083252, + "logits/rejected": 3.159273386001587, + "logps/chosen": -1.3754456043243408, + "logps/rejected": -2.9079883098602295, + "loss": 2.8137, + "nll_loss": 1.3754454851150513, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.75445556640625, + "rewards/margins": 15.325429916381836, + "rewards/rejected": -29.079885482788086, + "step": 81 + }, + { + "epoch": 0.1222739981360671, + "grad_norm": 43.46345081289328, + "learning_rate": 9.860114570402054e-07, + "logits/chosen": 2.0022189617156982, + "logits/rejected": 1.6277714967727661, + "logps/chosen": -1.0034466981887817, + "logps/rejected": -2.133042335510254, + "loss": 3.1481, + "nll_loss": 1.0034466981887817, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.034467697143555, + "rewards/margins": 11.295957565307617, + "rewards/rejected": -21.330425262451172, + "step": 82 + }, + { + "epoch": 0.12376514445479962, + "grad_norm": 58.38128560109899, + "learning_rate": 9.85425407272122e-07, + "logits/chosen": 2.490449905395508, + "logits/rejected": 2.7972893714904785, + "logps/chosen": -1.2935540676116943, + "logps/rejected": -2.6080098152160645, + "loss": 2.5246, + "nll_loss": 1.2935540676116943, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.935540199279785, + "rewards/margins": 13.14455795288086, + "rewards/rejected": -26.08009910583496, + "step": 83 + }, + { + "epoch": 0.12525629077353215, + "grad_norm": 178.3311223173836, + "learning_rate": 9.8482751326579e-07, + "logits/chosen": 1.821961760520935, + "logits/rejected": 2.0110504627227783, + "logps/chosen": -0.774723470211029, + "logps/rejected": -2.1148810386657715, + "loss": 2.3433, + "nll_loss": 0.774723470211029, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.747235298156738, + "rewards/margins": 13.401576042175293, + "rewards/rejected": -21.1488094329834, + "step": 84 + }, + { + "epoch": 0.1267474370922647, + "grad_norm": 51.77425803563619, + "learning_rate": 9.842177896096493e-07, + "logits/chosen": 2.007399082183838, + "logits/rejected": 2.0587520599365234, + "logps/chosen": -0.9214052557945251, + "logps/rejected": -2.382662534713745, + "loss": 2.3806, + "nll_loss": 0.9214051961898804, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.214052200317383, + "rewards/margins": 14.612571716308594, + "rewards/rejected": -23.82662582397461, + "step": 85 + }, + { + "epoch": 0.1282385834109972, + "grad_norm": 136.05280033355075, + "learning_rate": 9.835962511807785e-07, + "logits/chosen": 2.5353775024414062, + "logits/rejected": 2.7205231189727783, + "logps/chosen": -0.8614702224731445, + "logps/rejected": -1.641845703125, + "loss": 3.0362, + "nll_loss": 0.8614702820777893, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.614703178405762, + "rewards/margins": 7.8037543296813965, + "rewards/rejected": -16.41845703125, + "step": 86 + }, + { + "epoch": 0.12972972972972974, + "grad_norm": 30.05683499780679, + "learning_rate": 9.82962913144534e-07, + "logits/chosen": 2.634047746658325, + "logits/rejected": 2.6122305393218994, + "logps/chosen": -0.9235073328018188, + "logps/rejected": -1.4619425535202026, + "loss": 2.434, + "nll_loss": 0.9235073328018188, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.235074043273926, + "rewards/margins": 5.384352684020996, + "rewards/rejected": -14.619426727294922, + "step": 87 + }, + { + "epoch": 0.13122087604846225, + "grad_norm": 50.14650415819595, + "learning_rate": 9.823177909541793e-07, + "logits/chosen": 2.015709638595581, + "logits/rejected": 1.9986441135406494, + "logps/chosen": -0.8479939699172974, + "logps/rejected": -1.2945144176483154, + "loss": 2.7979, + "nll_loss": 0.8479939699172974, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.479939460754395, + "rewards/margins": 4.46520471572876, + "rewards/rejected": -12.945144653320312, + "step": 88 + }, + { + "epoch": 0.1327120223671948, + "grad_norm": 52.862158002098944, + "learning_rate": 9.816609003505072e-07, + "logits/chosen": 3.347060203552246, + "logits/rejected": 3.233002185821533, + "logps/chosen": -1.1397117376327515, + "logps/rejected": -1.4866154193878174, + "loss": 2.6573, + "nll_loss": 1.1397117376327515, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.397117614746094, + "rewards/margins": 3.4690380096435547, + "rewards/rejected": -14.866154670715332, + "step": 89 + }, + { + "epoch": 0.1342031686859273, + "grad_norm": 48.28420889494256, + "learning_rate": 9.809922573614569e-07, + "logits/chosen": 2.897589921951294, + "logits/rejected": 2.594212532043457, + "logps/chosen": -1.155824899673462, + "logps/rejected": -1.7766919136047363, + "loss": 2.6429, + "nll_loss": 1.1558247804641724, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.558248519897461, + "rewards/margins": 6.208670616149902, + "rewards/rejected": -17.76692008972168, + "step": 90 + }, + { + "epoch": 0.13569431500465984, + "grad_norm": 44.09837920772329, + "learning_rate": 9.80311878301722e-07, + "logits/chosen": 2.5982978343963623, + "logits/rejected": 2.4047904014587402, + "logps/chosen": -1.1400195360183716, + "logps/rejected": -1.6405586004257202, + "loss": 2.7758, + "nll_loss": 1.1400195360183716, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.400195121765137, + "rewards/margins": 5.005392074584961, + "rewards/rejected": -16.40558624267578, + "step": 91 + }, + { + "epoch": 0.13718546132339235, + "grad_norm": 35.373221853287035, + "learning_rate": 9.796197797723532e-07, + "logits/chosen": 1.8639134168624878, + "logits/rejected": 1.8991543054580688, + "logps/chosen": -0.9351028203964233, + "logps/rejected": -2.294522523880005, + "loss": 2.4379, + "nll_loss": 0.9351028800010681, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.351028442382812, + "rewards/margins": 13.594196319580078, + "rewards/rejected": -22.94522476196289, + "step": 92 + }, + { + "epoch": 0.1386766076421249, + "grad_norm": 139.38985986452, + "learning_rate": 9.789159786603522e-07, + "logits/chosen": 1.8421365022659302, + "logits/rejected": 2.1121838092803955, + "logps/chosen": -0.7293067574501038, + "logps/rejected": -1.8387272357940674, + "loss": 3.2568, + "nll_loss": 0.729306697845459, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.293067932128906, + "rewards/margins": 11.094205856323242, + "rewards/rejected": -18.387271881103516, + "step": 93 + }, + { + "epoch": 0.1401677539608574, + "grad_norm": 66.73824264608913, + "learning_rate": 9.78200492138261e-07, + "logits/chosen": 2.5073771476745605, + "logits/rejected": 2.620734453201294, + "logps/chosen": -1.0046569108963013, + "logps/rejected": -2.5618834495544434, + "loss": 2.5804, + "nll_loss": 1.0046569108963013, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.046568870544434, + "rewards/margins": 15.57226848602295, + "rewards/rejected": -25.61883544921875, + "step": 94 + }, + { + "epoch": 0.14165890027958994, + "grad_norm": 60.19100070878113, + "learning_rate": 9.774733376637421e-07, + "logits/chosen": 2.4931178092956543, + "logits/rejected": 2.3655691146850586, + "logps/chosen": -0.9876638054847717, + "logps/rejected": -1.5048593282699585, + "loss": 3.3979, + "nll_loss": 0.9876636266708374, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.87663745880127, + "rewards/margins": 5.1719560623168945, + "rewards/rejected": -15.048593521118164, + "step": 95 + }, + { + "epoch": 0.14315004659832245, + "grad_norm": 93.58142504504913, + "learning_rate": 9.76734532979152e-07, + "logits/chosen": 1.8486552238464355, + "logits/rejected": 2.0405972003936768, + "logps/chosen": -0.7252814769744873, + "logps/rejected": -1.593367099761963, + "loss": 1.4345, + "nll_loss": 0.7252814769744873, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.252814292907715, + "rewards/margins": 8.680856704711914, + "rewards/rejected": -15.933670997619629, + "step": 96 + }, + { + "epoch": 0.144641192917055, + "grad_norm": 103.20755248724876, + "learning_rate": 9.759840961111097e-07, + "logits/chosen": 2.693235397338867, + "logits/rejected": 2.6906933784484863, + "logps/chosen": -0.8279076218605042, + "logps/rejected": -1.965848445892334, + "loss": 1.9372, + "nll_loss": 0.8279076218605042, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.27907657623291, + "rewards/margins": 11.37940788269043, + "rewards/rejected": -19.658483505249023, + "step": 97 + }, + { + "epoch": 0.1461323392357875, + "grad_norm": 88.41840643452079, + "learning_rate": 9.752220453700554e-07, + "logits/chosen": 2.36478328704834, + "logits/rejected": 2.2650041580200195, + "logps/chosen": -0.9894752502441406, + "logps/rejected": -1.866193413734436, + "loss": 2.8837, + "nll_loss": 0.9894753694534302, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.894752502441406, + "rewards/margins": 8.767179489135742, + "rewards/rejected": -18.66193199157715, + "step": 98 + }, + { + "epoch": 0.14762348555452004, + "grad_norm": 86.18987498941965, + "learning_rate": 9.744483993498052e-07, + "logits/chosen": 1.8514745235443115, + "logits/rejected": 2.3874871730804443, + "logps/chosen": -0.9877501726150513, + "logps/rejected": -2.1680359840393066, + "loss": 2.5626, + "nll_loss": 0.987750232219696, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.87750244140625, + "rewards/margins": 11.802858352661133, + "rewards/rejected": -21.68035888671875, + "step": 99 + }, + { + "epoch": 0.14911463187325255, + "grad_norm": 63.73675084474543, + "learning_rate": 9.736631769270957e-07, + "logits/chosen": 2.287912368774414, + "logits/rejected": 2.1225051879882812, + "logps/chosen": -0.9378949403762817, + "logps/rejected": -1.3148002624511719, + "loss": 2.2336, + "nll_loss": 0.9378949403762817, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.378949165344238, + "rewards/margins": 3.769054412841797, + "rewards/rejected": -13.148003578186035, + "step": 100 + }, + { + "epoch": 0.1506057781919851, + "grad_norm": 106.232957474972, + "learning_rate": 9.72866397261125e-07, + "logits/chosen": 1.9837908744812012, + "logits/rejected": 2.2438063621520996, + "logps/chosen": -0.9606898427009583, + "logps/rejected": -1.9379208087921143, + "loss": 2.9115, + "nll_loss": 0.960689902305603, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.60689926147461, + "rewards/margins": 9.772309303283691, + "rewards/rejected": -19.37920570373535, + "step": 101 + }, + { + "epoch": 0.1520969245107176, + "grad_norm": 38.68435341536512, + "learning_rate": 9.720580797930844e-07, + "logits/chosen": 2.5899598598480225, + "logits/rejected": 2.651418685913086, + "logps/chosen": -0.9334679841995239, + "logps/rejected": -1.3885936737060547, + "loss": 2.4316, + "nll_loss": 0.9334679841995239, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.334678649902344, + "rewards/margins": 4.551258087158203, + "rewards/rejected": -13.885936737060547, + "step": 102 + }, + { + "epoch": 0.15358807082945014, + "grad_norm": 68.61885464106034, + "learning_rate": 9.712382442456844e-07, + "logits/chosen": 2.4430806636810303, + "logits/rejected": 2.515192985534668, + "logps/chosen": -1.4942792654037476, + "logps/rejected": -1.9828426837921143, + "loss": 2.8353, + "nll_loss": 1.4942792654037476, + "rewards/accuracies": 0.75, + "rewards/chosen": -14.942790985107422, + "rewards/margins": 4.885636329650879, + "rewards/rejected": -19.828428268432617, + "step": 103 + }, + { + "epoch": 0.15507921714818265, + "grad_norm": 33.861836411287534, + "learning_rate": 9.704069106226727e-07, + "logits/chosen": 0.8605862855911255, + "logits/rejected": 0.6118173599243164, + "logps/chosen": -0.7482974529266357, + "logps/rejected": -1.6876130104064941, + "loss": 2.861, + "nll_loss": 0.7482973337173462, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.482974052429199, + "rewards/margins": 9.393156051635742, + "rewards/rejected": -16.876129150390625, + "step": 104 + }, + { + "epoch": 0.1565703634669152, + "grad_norm": 31.203010053391782, + "learning_rate": 9.695640992083471e-07, + "logits/chosen": 2.2685840129852295, + "logits/rejected": 2.6703624725341797, + "logps/chosen": -1.1785123348236084, + "logps/rejected": -1.6574785709381104, + "loss": 2.4841, + "nll_loss": 1.1785123348236084, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.785123825073242, + "rewards/margins": 4.789661407470703, + "rewards/rejected": -16.574787139892578, + "step": 105 + }, + { + "epoch": 0.15806150978564773, + "grad_norm": 70.20320807885953, + "learning_rate": 9.687098305670604e-07, + "logits/chosen": 1.7209910154342651, + "logits/rejected": 2.1076905727386475, + "logps/chosen": -0.9404205083847046, + "logps/rejected": -1.0514883995056152, + "loss": 2.579, + "nll_loss": 0.9404205083847046, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.404204368591309, + "rewards/margins": 1.1106791496276855, + "rewards/rejected": -10.514884948730469, + "step": 106 + }, + { + "epoch": 0.15955265610438024, + "grad_norm": 48.677672121591414, + "learning_rate": 9.678441255427179e-07, + "logits/chosen": 2.7841222286224365, + "logits/rejected": 2.7015132904052734, + "logps/chosen": -1.0275628566741943, + "logps/rejected": -2.475385904312134, + "loss": 2.7087, + "nll_loss": 1.0275628566741943, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.275627136230469, + "rewards/margins": 14.478230476379395, + "rewards/rejected": -24.753860473632812, + "step": 107 + }, + { + "epoch": 0.16104380242311278, + "grad_norm": 36.30484378835504, + "learning_rate": 9.669670052582693e-07, + "logits/chosen": 0.9016488790512085, + "logits/rejected": 0.7409514784812927, + "logps/chosen": -0.7989850640296936, + "logps/rejected": -2.0533151626586914, + "loss": 2.3533, + "nll_loss": 0.7989850640296936, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.9898505210876465, + "rewards/margins": 12.543301582336426, + "rewards/rejected": -20.533153533935547, + "step": 108 + }, + { + "epoch": 0.1625349487418453, + "grad_norm": 90.07954865659732, + "learning_rate": 9.66078491115194e-07, + "logits/chosen": 1.88589608669281, + "logits/rejected": 1.5732561349868774, + "logps/chosen": -1.0318375825881958, + "logps/rejected": -1.31565523147583, + "loss": 2.8364, + "nll_loss": 1.0318375825881958, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.318375587463379, + "rewards/margins": 2.838176727294922, + "rewards/rejected": -13.156554222106934, + "step": 109 + }, + { + "epoch": 0.16402609506057783, + "grad_norm": 93.22012698020275, + "learning_rate": 9.651786047929772e-07, + "logits/chosen": 1.7846213579177856, + "logits/rejected": 2.1781160831451416, + "logps/chosen": -0.945646345615387, + "logps/rejected": -1.7606041431427002, + "loss": 2.0061, + "nll_loss": 0.945646345615387, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.456462860107422, + "rewards/margins": 8.149579048156738, + "rewards/rejected": -17.606042861938477, + "step": 110 + }, + { + "epoch": 0.16551724137931034, + "grad_norm": 161.0227005909337, + "learning_rate": 9.642673682485828e-07, + "logits/chosen": 1.544893741607666, + "logits/rejected": 1.7132470607757568, + "logps/chosen": -0.7085322737693787, + "logps/rejected": -2.0612924098968506, + "loss": 3.0642, + "nll_loss": 0.7085322737693787, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.085322380065918, + "rewards/margins": 13.52760124206543, + "rewards/rejected": -20.612924575805664, + "step": 111 + }, + { + "epoch": 0.16700838769804288, + "grad_norm": 86.38909626151015, + "learning_rate": 9.633448037159166e-07, + "logits/chosen": 1.8809916973114014, + "logits/rejected": 1.7090436220169067, + "logps/chosen": -0.9259803891181946, + "logps/rejected": -2.25235915184021, + "loss": 2.541, + "nll_loss": 0.9259804487228394, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.259803771972656, + "rewards/margins": 13.263786315917969, + "rewards/rejected": -22.523590087890625, + "step": 112 + }, + { + "epoch": 0.1684995340167754, + "grad_norm": 40.13582518270976, + "learning_rate": 9.624109337052837e-07, + "logits/chosen": 2.2280895709991455, + "logits/rejected": 2.281741142272949, + "logps/chosen": -0.8172340393066406, + "logps/rejected": -1.4414923191070557, + "loss": 1.9405, + "nll_loss": 0.8172340393066406, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.172340393066406, + "rewards/margins": 6.24258279800415, + "rewards/rejected": -14.414921760559082, + "step": 113 + }, + { + "epoch": 0.16999068033550793, + "grad_norm": 77.81905751277964, + "learning_rate": 9.6146578100284e-07, + "logits/chosen": 1.5080493688583374, + "logits/rejected": 1.7396886348724365, + "logps/chosen": -0.8940697312355042, + "logps/rejected": -1.4884928464889526, + "loss": 2.6538, + "nll_loss": 0.8940697908401489, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.94069766998291, + "rewards/margins": 5.944231986999512, + "rewards/rejected": -14.884929656982422, + "step": 114 + }, + { + "epoch": 0.17148182665424044, + "grad_norm": 40.75714524289799, + "learning_rate": 9.605093686700353e-07, + "logits/chosen": 1.5538138151168823, + "logits/rejected": 1.5067213773727417, + "logps/chosen": -0.7099705338478088, + "logps/rejected": -1.2894647121429443, + "loss": 2.1382, + "nll_loss": 0.7099704742431641, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.099704742431641, + "rewards/margins": 5.794943332672119, + "rewards/rejected": -12.894647598266602, + "step": 115 + }, + { + "epoch": 0.17297297297297298, + "grad_norm": 39.36609870883376, + "learning_rate": 9.595417200430515e-07, + "logits/chosen": 1.8860256671905518, + "logits/rejected": 2.049478054046631, + "logps/chosen": -0.7657876014709473, + "logps/rejected": -2.3453309535980225, + "loss": 2.6743, + "nll_loss": 0.7657876014709473, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.657876014709473, + "rewards/margins": 15.795431137084961, + "rewards/rejected": -23.453306198120117, + "step": 116 + }, + { + "epoch": 0.1744641192917055, + "grad_norm": 55.96194893684382, + "learning_rate": 9.585628587322328e-07, + "logits/chosen": 1.9524157047271729, + "logits/rejected": 1.797798991203308, + "logps/chosen": -1.1159601211547852, + "logps/rejected": -1.6060389280319214, + "loss": 3.3782, + "nll_loss": 1.1159600019454956, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.159601211547852, + "rewards/margins": 4.9007887840271, + "rewards/rejected": -16.06039047241211, + "step": 117 + }, + { + "epoch": 0.17595526561043803, + "grad_norm": 41.101151414498915, + "learning_rate": 9.575728086215091e-07, + "logits/chosen": 1.20786714553833, + "logits/rejected": 1.257465124130249, + "logps/chosen": -1.0939463376998901, + "logps/rejected": -1.5144948959350586, + "loss": 2.4688, + "nll_loss": 1.0939463376998901, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.93946361541748, + "rewards/margins": 4.20548677444458, + "rewards/rejected": -15.144948959350586, + "step": 118 + }, + { + "epoch": 0.17744641192917054, + "grad_norm": 35.493847514643186, + "learning_rate": 9.565715938678145e-07, + "logits/chosen": 2.0380167961120605, + "logits/rejected": 2.095473527908325, + "logps/chosen": -0.7855863571166992, + "logps/rejected": -1.56964910030365, + "loss": 2.261, + "nll_loss": 0.7855863571166992, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.855864524841309, + "rewards/margins": 7.840627670288086, + "rewards/rejected": -15.696491241455078, + "step": 119 + }, + { + "epoch": 0.17893755824790308, + "grad_norm": 57.163210188312476, + "learning_rate": 9.555592389004966e-07, + "logits/chosen": 2.790797710418701, + "logits/rejected": 1.9768295288085938, + "logps/chosen": -1.2539488077163696, + "logps/rejected": -3.159113883972168, + "loss": 1.935, + "nll_loss": 1.2539489269256592, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.539488792419434, + "rewards/margins": 19.051651000976562, + "rewards/rejected": -31.591140747070312, + "step": 120 + }, + { + "epoch": 0.1804287045666356, + "grad_norm": 79.11517785089488, + "learning_rate": 9.54535768420721e-07, + "logits/chosen": 2.376458168029785, + "logits/rejected": 2.3259100914001465, + "logps/chosen": -1.0639259815216064, + "logps/rejected": -2.951509952545166, + "loss": 2.1214, + "nll_loss": 1.0639259815216064, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.639259338378906, + "rewards/margins": 18.875837326049805, + "rewards/rejected": -29.51509666442871, + "step": 121 + }, + { + "epoch": 0.18191985088536813, + "grad_norm": 161.6550146595864, + "learning_rate": 9.535012074008686e-07, + "logits/chosen": 2.5242788791656494, + "logits/rejected": 2.7374792098999023, + "logps/chosen": -1.1811103820800781, + "logps/rejected": -1.8748825788497925, + "loss": 3.2394, + "nll_loss": 1.1811102628707886, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.811103820800781, + "rewards/margins": 6.937723159790039, + "rewards/rejected": -18.74882698059082, + "step": 122 + }, + { + "epoch": 0.18341099720410065, + "grad_norm": 68.86205628275955, + "learning_rate": 9.524555810839266e-07, + "logits/chosen": 1.5950171947479248, + "logits/rejected": 1.648951768875122, + "logps/chosen": -1.2424349784851074, + "logps/rejected": -2.0438003540039062, + "loss": 2.3309, + "nll_loss": 1.2424349784851074, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.424349784851074, + "rewards/margins": 8.013655662536621, + "rewards/rejected": -20.438003540039062, + "step": 123 + }, + { + "epoch": 0.18490214352283318, + "grad_norm": 66.32682008465957, + "learning_rate": 9.513989149828717e-07, + "logits/chosen": 0.9972255229949951, + "logits/rejected": 2.005828619003296, + "logps/chosen": -0.8773561120033264, + "logps/rejected": -1.890647530555725, + "loss": 2.728, + "nll_loss": 0.8773560523986816, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.773560523986816, + "rewards/margins": 10.132913589477539, + "rewards/rejected": -18.906475067138672, + "step": 124 + }, + { + "epoch": 0.1863932898415657, + "grad_norm": 45.06477535770408, + "learning_rate": 9.503312348800485e-07, + "logits/chosen": 2.1520888805389404, + "logits/rejected": 2.2559258937835693, + "logps/chosen": -1.0796295404434204, + "logps/rejected": -2.325199842453003, + "loss": 2.3406, + "nll_loss": 1.0796295404434204, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.796295166015625, + "rewards/margins": 12.455702781677246, + "rewards/rejected": -23.251996994018555, + "step": 125 + }, + { + "epoch": 0.18788443616029823, + "grad_norm": 76.37524825807895, + "learning_rate": 9.492525668265399e-07, + "logits/chosen": 2.432969093322754, + "logits/rejected": 2.440464496612549, + "logps/chosen": -0.8307008743286133, + "logps/rejected": -1.703589916229248, + "loss": 2.1258, + "nll_loss": 0.8307008147239685, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.307008743286133, + "rewards/margins": 8.728890419006348, + "rewards/rejected": -17.035898208618164, + "step": 126 + }, + { + "epoch": 0.18937558247903075, + "grad_norm": 106.49269707447871, + "learning_rate": 9.481629371415313e-07, + "logits/chosen": 2.3593316078186035, + "logits/rejected": 2.433490753173828, + "logps/chosen": -0.9862598180770874, + "logps/rejected": -1.6820356845855713, + "loss": 2.0768, + "nll_loss": 0.9862598180770874, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.862598419189453, + "rewards/margins": 6.957759380340576, + "rewards/rejected": -16.820358276367188, + "step": 127 + }, + { + "epoch": 0.19086672879776329, + "grad_norm": 119.96635126252004, + "learning_rate": 9.470623724116692e-07, + "logits/chosen": 1.5657812356948853, + "logits/rejected": 2.199969530105591, + "logps/chosen": -0.754319965839386, + "logps/rejected": -1.9645689725875854, + "loss": 2.2599, + "nll_loss": 0.754319965839386, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.543199062347412, + "rewards/margins": 12.102490425109863, + "rewards/rejected": -19.645689010620117, + "step": 128 + }, + { + "epoch": 0.1923578751164958, + "grad_norm": 27.764965593166103, + "learning_rate": 9.459508994904117e-07, + "logits/chosen": 1.3861720561981201, + "logits/rejected": 1.846404790878296, + "logps/chosen": -0.7986534833908081, + "logps/rejected": -1.7672803401947021, + "loss": 1.6964, + "nll_loss": 0.7986533641815186, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.986534118652344, + "rewards/margins": 9.686269760131836, + "rewards/rejected": -17.67280387878418, + "step": 129 + }, + { + "epoch": 0.19384902143522834, + "grad_norm": 51.67532936315, + "learning_rate": 9.448285454973737e-07, + "logits/chosen": 1.0668275356292725, + "logits/rejected": 1.1291849613189697, + "logps/chosen": -0.7701537609100342, + "logps/rejected": -2.3240244388580322, + "loss": 2.1264, + "nll_loss": 0.7701537013053894, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.7015380859375, + "rewards/margins": 15.53870677947998, + "rewards/rejected": -23.240245819091797, + "step": 130 + }, + { + "epoch": 0.19534016775396085, + "grad_norm": 93.9771989772975, + "learning_rate": 9.436953378176648e-07, + "logits/chosen": 1.8385086059570312, + "logits/rejected": 1.6848714351654053, + "logps/chosen": -0.7946656942367554, + "logps/rejected": -2.1132001876831055, + "loss": 2.6497, + "nll_loss": 0.7946656942367554, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.946656703948975, + "rewards/margins": 13.185344696044922, + "rewards/rejected": -21.132001876831055, + "step": 131 + }, + { + "epoch": 0.1968313140726934, + "grad_norm": 61.220728520327334, + "learning_rate": 9.425513041012219e-07, + "logits/chosen": 1.8041369915008545, + "logits/rejected": 2.2424862384796143, + "logps/chosen": -0.901249885559082, + "logps/rejected": -2.4530515670776367, + "loss": 2.2458, + "nll_loss": 0.9012499451637268, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.01249885559082, + "rewards/margins": 15.518016815185547, + "rewards/rejected": -24.530517578125, + "step": 132 + }, + { + "epoch": 0.1983224603914259, + "grad_norm": 61.87691602410989, + "learning_rate": 9.413964722621337e-07, + "logits/chosen": 1.863405704498291, + "logits/rejected": 1.7748808860778809, + "logps/chosen": -0.8791462779045105, + "logps/rejected": -1.6963310241699219, + "loss": 2.7868, + "nll_loss": 0.8791462779045105, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.791460990905762, + "rewards/margins": 8.17184829711914, + "rewards/rejected": -16.96331024169922, + "step": 133 + }, + { + "epoch": 0.19981360671015844, + "grad_norm": 60.53050864085932, + "learning_rate": 9.402308704779598e-07, + "logits/chosen": 2.4300382137298584, + "logits/rejected": 2.6305856704711914, + "logps/chosen": -1.0600104331970215, + "logps/rejected": -1.7147343158721924, + "loss": 2.2858, + "nll_loss": 1.060010552406311, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.600104331970215, + "rewards/margins": 6.547240257263184, + "rewards/rejected": -17.147342681884766, + "step": 134 + }, + { + "epoch": 0.20130475302889095, + "grad_norm": 91.17746842885462, + "learning_rate": 9.390545271890437e-07, + "logits/chosen": 2.2293026447296143, + "logits/rejected": 2.311601161956787, + "logps/chosen": -0.912305474281311, + "logps/rejected": -1.4468194246292114, + "loss": 2.9031, + "nll_loss": 0.912305474281311, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.123055458068848, + "rewards/margins": 5.3451385498046875, + "rewards/rejected": -14.468194007873535, + "step": 135 + }, + { + "epoch": 0.2027958993476235, + "grad_norm": 37.83215752166125, + "learning_rate": 9.378674710978183e-07, + "logits/chosen": 2.585904359817505, + "logits/rejected": 2.4605495929718018, + "logps/chosen": -1.084753394126892, + "logps/rejected": -1.461085319519043, + "loss": 2.405, + "nll_loss": 1.084753394126892, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.847535133361816, + "rewards/margins": 3.7633185386657715, + "rewards/rejected": -14.610852241516113, + "step": 136 + }, + { + "epoch": 0.204287045666356, + "grad_norm": 109.12252125543667, + "learning_rate": 9.366697311681057e-07, + "logits/chosen": 2.2206501960754395, + "logits/rejected": 1.6608248949050903, + "logps/chosen": -0.8300536274909973, + "logps/rejected": -1.4110511541366577, + "loss": 2.315, + "nll_loss": 0.8300537467002869, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.300537109375, + "rewards/margins": 5.8099751472473145, + "rewards/rejected": -14.110511779785156, + "step": 137 + }, + { + "epoch": 0.20577819198508854, + "grad_norm": 33.452774130013616, + "learning_rate": 9.354613366244106e-07, + "logits/chosen": 1.9580575227737427, + "logits/rejected": 2.0505290031433105, + "logps/chosen": -0.8890818953514099, + "logps/rejected": -1.7066799402236938, + "loss": 2.4832, + "nll_loss": 0.8890818953514099, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.890819549560547, + "rewards/margins": 8.175981521606445, + "rewards/rejected": -17.06679916381836, + "step": 138 + }, + { + "epoch": 0.20726933830382105, + "grad_norm": 63.238180246927655, + "learning_rate": 9.342423169512071e-07, + "logits/chosen": 1.7694742679595947, + "logits/rejected": 1.70145845413208, + "logps/chosen": -1.2566676139831543, + "logps/rejected": -1.3205902576446533, + "loss": 3.0226, + "nll_loss": 1.2566676139831543, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.566676139831543, + "rewards/margins": 0.6392264366149902, + "rewards/rejected": -13.205903053283691, + "step": 139 + }, + { + "epoch": 0.2087604846225536, + "grad_norm": 243.72222558177097, + "learning_rate": 9.330127018922193e-07, + "logits/chosen": 2.3753468990325928, + "logits/rejected": 1.9688851833343506, + "logps/chosen": -1.1052511930465698, + "logps/rejected": -2.9732117652893066, + "loss": 3.8775, + "nll_loss": 1.1052511930465698, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.052511215209961, + "rewards/margins": 18.67960548400879, + "rewards/rejected": -29.73211669921875, + "step": 140 + }, + { + "epoch": 0.21025163094128613, + "grad_norm": 85.72150014359558, + "learning_rate": 9.317725214496959e-07, + "logits/chosen": 1.732937216758728, + "logits/rejected": 1.5855662822723389, + "logps/chosen": -0.9783846139907837, + "logps/rejected": -2.407928943634033, + "loss": 3.3785, + "nll_loss": 0.9783846139907837, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.783845901489258, + "rewards/margins": 14.295442581176758, + "rewards/rejected": -24.07929039001465, + "step": 141 + }, + { + "epoch": 0.21174277726001864, + "grad_norm": 76.65538553179297, + "learning_rate": 9.305218058836776e-07, + "logits/chosen": 2.091996669769287, + "logits/rejected": 2.419956684112549, + "logps/chosen": -0.8971315622329712, + "logps/rejected": -1.9470024108886719, + "loss": 2.8238, + "nll_loss": 0.897131621837616, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.97131633758545, + "rewards/margins": 10.498709678649902, + "rewards/rejected": -19.47002601623535, + "step": 142 + }, + { + "epoch": 0.21323392357875118, + "grad_norm": 27.340783406736662, + "learning_rate": 9.292605857112594e-07, + "logits/chosen": 2.6284255981445312, + "logits/rejected": 2.7718348503112793, + "logps/chosen": -1.089355230331421, + "logps/rejected": -1.6409265995025635, + "loss": 1.8538, + "nll_loss": 1.089355230331421, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.893552780151367, + "rewards/margins": 5.515713214874268, + "rewards/rejected": -16.409265518188477, + "step": 143 + }, + { + "epoch": 0.2147250698974837, + "grad_norm": 58.23543041434007, + "learning_rate": 9.279888917058451e-07, + "logits/chosen": 2.169576406478882, + "logits/rejected": 2.6294941902160645, + "logps/chosen": -1.123268723487854, + "logps/rejected": -2.1478488445281982, + "loss": 2.3351, + "nll_loss": 1.123268723487854, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.232686996459961, + "rewards/margins": 10.245802879333496, + "rewards/rejected": -21.478490829467773, + "step": 144 + }, + { + "epoch": 0.21621621621621623, + "grad_norm": 101.4240161071547, + "learning_rate": 9.267067548963974e-07, + "logits/chosen": 1.8942837715148926, + "logits/rejected": 2.2905285358428955, + "logps/chosen": -1.0335276126861572, + "logps/rejected": -2.0474987030029297, + "loss": 3.0506, + "nll_loss": 1.0335276126861572, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.335275650024414, + "rewards/margins": 10.13970947265625, + "rewards/rejected": -20.474985122680664, + "step": 145 + }, + { + "epoch": 0.21770736253494874, + "grad_norm": 164.47407171109947, + "learning_rate": 9.2541420656668e-07, + "logits/chosen": 2.1074047088623047, + "logits/rejected": 2.3331003189086914, + "logps/chosen": -0.8820992112159729, + "logps/rejected": -1.657348394393921, + "loss": 3.3369, + "nll_loss": 0.8820992112159729, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.820991516113281, + "rewards/margins": 7.752492427825928, + "rewards/rejected": -16.573484420776367, + "step": 146 + }, + { + "epoch": 0.21919850885368128, + "grad_norm": 86.30863915055022, + "learning_rate": 9.241112782544951e-07, + "logits/chosen": 1.4786248207092285, + "logits/rejected": 1.909964680671692, + "logps/chosen": -1.0103991031646729, + "logps/rejected": -2.397908926010132, + "loss": 2.5863, + "nll_loss": 1.0103992223739624, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.10399055480957, + "rewards/margins": 13.875099182128906, + "rewards/rejected": -23.97909164428711, + "step": 147 + }, + { + "epoch": 0.2206896551724138, + "grad_norm": 77.89270460808487, + "learning_rate": 9.22798001750913e-07, + "logits/chosen": 1.9113011360168457, + "logits/rejected": 1.9127446413040161, + "logps/chosen": -1.0667080879211426, + "logps/rejected": -2.3992092609405518, + "loss": 2.0689, + "nll_loss": 1.0667080879211426, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.667080879211426, + "rewards/margins": 13.325010299682617, + "rewards/rejected": -23.99209213256836, + "step": 148 + }, + { + "epoch": 0.22218080149114633, + "grad_norm": 30.33940939627606, + "learning_rate": 9.214744090994973e-07, + "logits/chosen": 1.8804258108139038, + "logits/rejected": 1.7602787017822266, + "logps/chosen": -0.7917377352714539, + "logps/rejected": -1.8364883661270142, + "loss": 1.9412, + "nll_loss": 0.7917377948760986, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.91737699508667, + "rewards/margins": 10.447505950927734, + "rewards/rejected": -18.364883422851562, + "step": 149 + }, + { + "epoch": 0.22367194780987884, + "grad_norm": 26.804555421773678, + "learning_rate": 9.20140532595522e-07, + "logits/chosen": 2.0434508323669434, + "logits/rejected": 2.001006841659546, + "logps/chosen": -0.8837192058563232, + "logps/rejected": -2.5913500785827637, + "loss": 2.1711, + "nll_loss": 0.8837192058563232, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.83719253540039, + "rewards/margins": 17.07630729675293, + "rewards/rejected": -25.913501739501953, + "step": 150 + }, + { + "epoch": 0.22516309412861138, + "grad_norm": 56.03717385018212, + "learning_rate": 9.18796404785185e-07, + "logits/chosen": 1.435347318649292, + "logits/rejected": 1.2728326320648193, + "logps/chosen": -1.0722644329071045, + "logps/rejected": -1.2300254106521606, + "loss": 3.2513, + "nll_loss": 1.072264552116394, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.722643852233887, + "rewards/margins": 1.5776088237762451, + "rewards/rejected": -12.300253868103027, + "step": 151 + }, + { + "epoch": 0.2266542404473439, + "grad_norm": 33.26179033555356, + "learning_rate": 9.174420584648122e-07, + "logits/chosen": 1.7228583097457886, + "logits/rejected": 1.9539413452148438, + "logps/chosen": -0.999208927154541, + "logps/rejected": -2.489034652709961, + "loss": 2.3513, + "nll_loss": 0.9992088675498962, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.99208927154541, + "rewards/margins": 14.8982572555542, + "rewards/rejected": -24.89034652709961, + "step": 152 + }, + { + "epoch": 0.22814538676607643, + "grad_norm": 272.4402787764153, + "learning_rate": 9.160775266800582e-07, + "logits/chosen": 1.1529669761657715, + "logits/rejected": 1.5829405784606934, + "logps/chosen": -0.8998862504959106, + "logps/rejected": -1.6488455533981323, + "loss": 3.4049, + "nll_loss": 0.8998862504959106, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.998862266540527, + "rewards/margins": 7.489593505859375, + "rewards/rejected": -16.48845672607422, + "step": 153 + }, + { + "epoch": 0.22963653308480894, + "grad_norm": 100.54421248357787, + "learning_rate": 9.147028427251009e-07, + "logits/chosen": 1.2050219774246216, + "logits/rejected": 1.6227113008499146, + "logps/chosen": -0.6130104064941406, + "logps/rejected": -1.9992823600769043, + "loss": 2.1776, + "nll_loss": 0.6130104064941406, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.130103588104248, + "rewards/margins": 13.862720489501953, + "rewards/rejected": -19.99282455444336, + "step": 154 + }, + { + "epoch": 0.23112767940354148, + "grad_norm": 102.84523025559395, + "learning_rate": 9.13318040141827e-07, + "logits/chosen": 1.593552589416504, + "logits/rejected": 1.550580620765686, + "logps/chosen": -0.819354236125946, + "logps/rejected": -1.9264180660247803, + "loss": 2.7251, + "nll_loss": 0.8193541169166565, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.19354248046875, + "rewards/margins": 11.070638656616211, + "rewards/rejected": -19.264179229736328, + "step": 155 + }, + { + "epoch": 0.232618825722274, + "grad_norm": 79.66896245906493, + "learning_rate": 9.119231527190158e-07, + "logits/chosen": 1.7341417074203491, + "logits/rejected": 1.8601384162902832, + "logps/chosen": -0.9296043515205383, + "logps/rejected": -1.9038300514221191, + "loss": 2.6692, + "nll_loss": 0.9296042919158936, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.296043395996094, + "rewards/margins": 9.742258071899414, + "rewards/rejected": -19.038301467895508, + "step": 156 + }, + { + "epoch": 0.23410997204100653, + "grad_norm": 61.386657530764126, + "learning_rate": 9.105182144915129e-07, + "logits/chosen": 1.5324293375015259, + "logits/rejected": 1.4971590042114258, + "logps/chosen": -1.050441861152649, + "logps/rejected": -2.616759777069092, + "loss": 2.0364, + "nll_loss": 1.0504419803619385, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.504420280456543, + "rewards/margins": 15.663180351257324, + "rewards/rejected": -26.167598724365234, + "step": 157 + }, + { + "epoch": 0.23560111835973904, + "grad_norm": 31.59053647001086, + "learning_rate": 9.091032597394012e-07, + "logits/chosen": 1.8084895610809326, + "logits/rejected": 1.6436604261398315, + "logps/chosen": -1.0257327556610107, + "logps/rejected": -1.6942673921585083, + "loss": 2.1228, + "nll_loss": 1.0257327556610107, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.257328033447266, + "rewards/margins": 6.6853461265563965, + "rewards/rejected": -16.94267463684082, + "step": 158 + }, + { + "epoch": 0.23709226467847158, + "grad_norm": 99.7544542434043, + "learning_rate": 9.076783229871634e-07, + "logits/chosen": 1.39780855178833, + "logits/rejected": 1.226241111755371, + "logps/chosen": -1.3455320596694946, + "logps/rejected": -2.6359424591064453, + "loss": 2.157, + "nll_loss": 1.3455320596694946, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.455320358276367, + "rewards/margins": 12.904104232788086, + "rewards/rejected": -26.359424591064453, + "step": 159 + }, + { + "epoch": 0.2385834109972041, + "grad_norm": 65.81757474306599, + "learning_rate": 9.062434390028407e-07, + "logits/chosen": 2.117692232131958, + "logits/rejected": 2.501654624938965, + "logps/chosen": -1.0490020513534546, + "logps/rejected": -2.2091197967529297, + "loss": 2.1304, + "nll_loss": 1.0490020513534546, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.490020751953125, + "rewards/margins": 11.601176261901855, + "rewards/rejected": -22.091196060180664, + "step": 160 + }, + { + "epoch": 0.24007455731593663, + "grad_norm": 60.48173676184412, + "learning_rate": 9.04798642797183e-07, + "logits/chosen": 2.0881049633026123, + "logits/rejected": 2.2217414379119873, + "logps/chosen": -1.2211053371429443, + "logps/rejected": -2.209552764892578, + "loss": 2.9786, + "nll_loss": 1.2211053371429443, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.211054801940918, + "rewards/margins": 9.884474754333496, + "rewards/rejected": -22.09552764892578, + "step": 161 + }, + { + "epoch": 0.24156570363466914, + "grad_norm": 63.745333924082374, + "learning_rate": 9.033439696227965e-07, + "logits/chosen": 1.9421632289886475, + "logits/rejected": 2.154196262359619, + "logps/chosen": -0.9993748068809509, + "logps/rejected": -1.5965303182601929, + "loss": 1.8593, + "nll_loss": 0.9993748068809509, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.993748664855957, + "rewards/margins": 5.971555709838867, + "rewards/rejected": -15.965304374694824, + "step": 162 + }, + { + "epoch": 0.24305684995340168, + "grad_norm": 84.14922727194168, + "learning_rate": 9.018794549732817e-07, + "logits/chosen": 1.134750485420227, + "logits/rejected": 1.1239275932312012, + "logps/chosen": -0.850483238697052, + "logps/rejected": -2.0938446521759033, + "loss": 1.8786, + "nll_loss": 0.850483238697052, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.50483226776123, + "rewards/margins": 12.433614730834961, + "rewards/rejected": -20.938446044921875, + "step": 163 + }, + { + "epoch": 0.2445479962721342, + "grad_norm": 37.840829532481365, + "learning_rate": 9.004051345823688e-07, + "logits/chosen": 2.2150847911834717, + "logits/rejected": 2.0349390506744385, + "logps/chosen": -0.9850893020629883, + "logps/rejected": -2.0254619121551514, + "loss": 1.7743, + "nll_loss": 0.9850894212722778, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.850893020629883, + "rewards/margins": 10.40372371673584, + "rewards/rejected": -20.25461769104004, + "step": 164 + }, + { + "epoch": 0.24603914259086673, + "grad_norm": 25.344809556394146, + "learning_rate": 8.989210444230449e-07, + "logits/chosen": 1.4311250448226929, + "logits/rejected": 1.9220129251480103, + "logps/chosen": -0.9075038433074951, + "logps/rejected": -3.189478635787964, + "loss": 1.9771, + "nll_loss": 0.9075037240982056, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.075037956237793, + "rewards/margins": 22.819747924804688, + "rewards/rejected": -31.89478302001953, + "step": 165 + }, + { + "epoch": 0.24753028890959924, + "grad_norm": 111.42978654994121, + "learning_rate": 8.974272207066767e-07, + "logits/chosen": 1.41557776927948, + "logits/rejected": 1.558440089225769, + "logps/chosen": -1.0430129766464233, + "logps/rejected": -1.9753856658935547, + "loss": 2.8302, + "nll_loss": 1.0430129766464233, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.430130958557129, + "rewards/margins": 9.323726654052734, + "rewards/rejected": -19.753856658935547, + "step": 166 + }, + { + "epoch": 0.24902143522833178, + "grad_norm": 108.79421551199331, + "learning_rate": 8.959236998821266e-07, + "logits/chosen": 1.4906010627746582, + "logits/rejected": 1.4089393615722656, + "logps/chosen": -1.2226719856262207, + "logps/rejected": -2.027992010116577, + "loss": 2.139, + "nll_loss": 1.2226719856262207, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.226719856262207, + "rewards/margins": 8.05319881439209, + "rewards/rejected": -20.279918670654297, + "step": 167 + }, + { + "epoch": 0.2505125815470643, + "grad_norm": 124.37069299295982, + "learning_rate": 8.944105186348645e-07, + "logits/chosen": 2.3309714794158936, + "logits/rejected": 2.2218449115753174, + "logps/chosen": -0.8178737163543701, + "logps/rejected": -1.4837485551834106, + "loss": 1.8927, + "nll_loss": 0.8178737163543701, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.178736686706543, + "rewards/margins": 6.658747673034668, + "rewards/rejected": -14.837486267089844, + "step": 168 + }, + { + "epoch": 0.25200372786579683, + "grad_norm": 49.07982484344473, + "learning_rate": 8.928877138860706e-07, + "logits/chosen": 2.5806398391723633, + "logits/rejected": 2.6362061500549316, + "logps/chosen": -1.1920853853225708, + "logps/rejected": -1.460356593132019, + "loss": 2.77, + "nll_loss": 1.1920853853225708, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.920853614807129, + "rewards/margins": 2.6827123165130615, + "rewards/rejected": -14.603567123413086, + "step": 169 + }, + { + "epoch": 0.2534948741845294, + "grad_norm": 57.895474258549186, + "learning_rate": 8.913553227917365e-07, + "logits/chosen": 1.51754891872406, + "logits/rejected": 1.5921522378921509, + "logps/chosen": -1.219523310661316, + "logps/rejected": -2.685457944869995, + "loss": 1.996, + "nll_loss": 1.219523310661316, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.195232391357422, + "rewards/margins": 14.659347534179688, + "rewards/rejected": -26.85457992553711, + "step": 170 + }, + { + "epoch": 0.25498602050326186, + "grad_norm": 50.35318504402622, + "learning_rate": 8.898133827417577e-07, + "logits/chosen": 1.492168664932251, + "logits/rejected": 1.3493200540542603, + "logps/chosen": -0.917851984500885, + "logps/rejected": -1.3204575777053833, + "loss": 2.4887, + "nll_loss": 0.917851984500885, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.178519248962402, + "rewards/margins": 4.026056289672852, + "rewards/rejected": -13.204574584960938, + "step": 171 + }, + { + "epoch": 0.2564771668219944, + "grad_norm": 123.00885834314533, + "learning_rate": 8.882619313590212e-07, + "logits/chosen": 2.1078224182128906, + "logits/rejected": 1.8869191408157349, + "logps/chosen": -1.1606742143630981, + "logps/rejected": -1.7969461679458618, + "loss": 2.2829, + "nll_loss": 1.1606740951538086, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.606741905212402, + "rewards/margins": 6.36271858215332, + "rewards/rejected": -17.96946144104004, + "step": 172 + }, + { + "epoch": 0.25796831314072693, + "grad_norm": 87.34749076746039, + "learning_rate": 8.867010064984879e-07, + "logits/chosen": 1.897976279258728, + "logits/rejected": 1.5353047847747803, + "logps/chosen": -1.3330408334732056, + "logps/rejected": -2.919948101043701, + "loss": 2.1636, + "nll_loss": 1.3330408334732056, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.33040714263916, + "rewards/margins": 15.869071006774902, + "rewards/rejected": -29.199480056762695, + "step": 173 + }, + { + "epoch": 0.2594594594594595, + "grad_norm": 90.01313232214396, + "learning_rate": 8.851306462462688e-07, + "logits/chosen": 1.750410556793213, + "logits/rejected": 1.5374597311019897, + "logps/chosen": -1.1078706979751587, + "logps/rejected": -2.269902229309082, + "loss": 2.1, + "nll_loss": 1.1078706979751587, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.078706741333008, + "rewards/margins": 11.620316505432129, + "rewards/rejected": -22.69902229309082, + "step": 174 + }, + { + "epoch": 0.26095060577819196, + "grad_norm": 76.91976422919461, + "learning_rate": 8.835508889186956e-07, + "logits/chosen": 1.9853813648223877, + "logits/rejected": 2.1543500423431396, + "logps/chosen": -1.3369462490081787, + "logps/rejected": -1.8868308067321777, + "loss": 2.8227, + "nll_loss": 1.3369462490081787, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.369462013244629, + "rewards/margins": 5.498845100402832, + "rewards/rejected": -18.86830711364746, + "step": 175 + }, + { + "epoch": 0.2624417520969245, + "grad_norm": 55.13369487383775, + "learning_rate": 8.819617730613862e-07, + "logits/chosen": 1.2609195709228516, + "logits/rejected": 1.3708417415618896, + "logps/chosen": -1.2357456684112549, + "logps/rejected": -2.014441728591919, + "loss": 2.5874, + "nll_loss": 1.2357456684112549, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.35745620727539, + "rewards/margins": 7.786960601806641, + "rewards/rejected": -20.144418716430664, + "step": 176 + }, + { + "epoch": 0.26393289841565704, + "grad_norm": 76.22181618820312, + "learning_rate": 8.803633374483035e-07, + "logits/chosen": 1.3541204929351807, + "logits/rejected": 1.1606299877166748, + "logps/chosen": -0.838552713394165, + "logps/rejected": -2.6401939392089844, + "loss": 1.0984, + "nll_loss": 0.838552713394165, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.385526657104492, + "rewards/margins": 18.01641273498535, + "rewards/rejected": -26.40193748474121, + "step": 177 + }, + { + "epoch": 0.2654240447343896, + "grad_norm": 64.42198540747405, + "learning_rate": 8.7875562108081e-07, + "logits/chosen": 0.4180157482624054, + "logits/rejected": 0.13569970428943634, + "logps/chosen": -0.8393802046775818, + "logps/rejected": -2.059353828430176, + "loss": 2.2279, + "nll_loss": 0.8393802642822266, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.39380168914795, + "rewards/margins": 12.199737548828125, + "rewards/rejected": -20.59354019165039, + "step": 178 + }, + { + "epoch": 0.2669151910531221, + "grad_norm": 65.33071806962461, + "learning_rate": 8.771386631867157e-07, + "logits/chosen": 2.04560923576355, + "logits/rejected": 2.5156006813049316, + "logps/chosen": -0.8595657348632812, + "logps/rejected": -1.5896443128585815, + "loss": 2.4389, + "nll_loss": 0.8595657348632812, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.595657348632812, + "rewards/margins": 7.300787925720215, + "rewards/rejected": -15.896444320678711, + "step": 179 + }, + { + "epoch": 0.2684063373718546, + "grad_norm": 56.757214632206704, + "learning_rate": 8.755125032193214e-07, + "logits/chosen": 2.0090067386627197, + "logits/rejected": 1.9684314727783203, + "logps/chosen": -1.0932655334472656, + "logps/rejected": -1.6011093854904175, + "loss": 2.2746, + "nll_loss": 1.0932655334472656, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.932655334472656, + "rewards/margins": 5.078439235687256, + "rewards/rejected": -16.01109504699707, + "step": 180 + }, + { + "epoch": 0.26989748369058714, + "grad_norm": 57.80690014465061, + "learning_rate": 8.738771808564555e-07, + "logits/chosen": 1.186753749847412, + "logits/rejected": 1.113171935081482, + "logps/chosen": -0.7738239765167236, + "logps/rejected": -3.2991132736206055, + "loss": 2.059, + "nll_loss": 0.7738240361213684, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.738240718841553, + "rewards/margins": 25.25288963317871, + "rewards/rejected": -32.99113082885742, + "step": 181 + }, + { + "epoch": 0.2713886300093197, + "grad_norm": 100.06676793231738, + "learning_rate": 8.722327359995063e-07, + "logits/chosen": 1.3211489915847778, + "logits/rejected": 1.5033185482025146, + "logps/chosen": -0.6754621267318726, + "logps/rejected": -4.53987979888916, + "loss": 1.9102, + "nll_loss": 0.6754621267318726, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.7546210289001465, + "rewards/margins": 38.6441764831543, + "rewards/rejected": -45.398799896240234, + "step": 182 + }, + { + "epoch": 0.2728797763280522, + "grad_norm": 75.50764406211981, + "learning_rate": 8.705792087724484e-07, + "logits/chosen": 2.040464401245117, + "logits/rejected": 2.061976909637451, + "logps/chosen": -0.8497739434242249, + "logps/rejected": -1.8095765113830566, + "loss": 2.4733, + "nll_loss": 0.8497739434242249, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.497739791870117, + "rewards/margins": 9.598026275634766, + "rewards/rejected": -18.095766067504883, + "step": 183 + }, + { + "epoch": 0.2743709226467847, + "grad_norm": 100.4492023921661, + "learning_rate": 8.689166395208636e-07, + "logits/chosen": 1.2215818166732788, + "logits/rejected": 1.020734190940857, + "logps/chosen": -0.8507159948348999, + "logps/rejected": -2.4126875400543213, + "loss": 2.3262, + "nll_loss": 0.8507159948348999, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.507159233093262, + "rewards/margins": 15.619712829589844, + "rewards/rejected": -24.126873016357422, + "step": 184 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 180.94476563933884, + "learning_rate": 8.672450688109563e-07, + "logits/chosen": 2.0847837924957275, + "logits/rejected": 2.215726613998413, + "logps/chosen": -1.1926288604736328, + "logps/rejected": -1.9761557579040527, + "loss": 1.5812, + "nll_loss": 1.1926288604736328, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.926289558410645, + "rewards/margins": 7.835270881652832, + "rewards/rejected": -19.761560440063477, + "step": 185 + }, + { + "epoch": 0.2773532152842498, + "grad_norm": 82.03593596024302, + "learning_rate": 8.655645374285636e-07, + "logits/chosen": 2.012111186981201, + "logits/rejected": 2.043102264404297, + "logps/chosen": -1.3665342330932617, + "logps/rejected": -2.8659491539001465, + "loss": 2.409, + "nll_loss": 1.3665341138839722, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.665343284606934, + "rewards/margins": 14.994148254394531, + "rewards/rejected": -28.65949249267578, + "step": 186 + }, + { + "epoch": 0.2788443616029823, + "grad_norm": 79.23210472745285, + "learning_rate": 8.638750863781612e-07, + "logits/chosen": 1.6436148881912231, + "logits/rejected": 2.1180176734924316, + "logps/chosen": -0.9192164540290833, + "logps/rejected": -1.9463462829589844, + "loss": 3.5592, + "nll_loss": 0.9192165732383728, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.192164421081543, + "rewards/margins": 10.271299362182617, + "rewards/rejected": -19.463462829589844, + "step": 187 + }, + { + "epoch": 0.2803355079217148, + "grad_norm": 26.47667976709633, + "learning_rate": 8.621767568818612e-07, + "logits/chosen": 0.6154603958129883, + "logits/rejected": 0.6709473133087158, + "logps/chosen": -0.9064656496047974, + "logps/rejected": -2.3845648765563965, + "loss": 1.662, + "nll_loss": 0.9064657092094421, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.064657211303711, + "rewards/margins": 14.780990600585938, + "rewards/rejected": -23.84564781188965, + "step": 188 + }, + { + "epoch": 0.28182665424044734, + "grad_norm": 101.4282206916804, + "learning_rate": 8.604695903784079e-07, + "logits/chosen": 0.42705780267715454, + "logits/rejected": 0.9927361607551575, + "logps/chosen": -0.9533852934837341, + "logps/rejected": -2.294440746307373, + "loss": 1.7952, + "nll_loss": 0.9533852338790894, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.533852577209473, + "rewards/margins": 13.410554885864258, + "rewards/rejected": -22.944406509399414, + "step": 189 + }, + { + "epoch": 0.2833178005591799, + "grad_norm": 78.40187913745498, + "learning_rate": 8.587536285221655e-07, + "logits/chosen": 1.032325267791748, + "logits/rejected": 1.2811267375946045, + "logps/chosen": -0.676677942276001, + "logps/rejected": -1.7353984117507935, + "loss": 2.2404, + "nll_loss": 0.6766780018806458, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.766780853271484, + "rewards/margins": 10.587203979492188, + "rewards/rejected": -17.353984832763672, + "step": 190 + }, + { + "epoch": 0.2848089468779124, + "grad_norm": 45.624929829577816, + "learning_rate": 8.570289131821024e-07, + "logits/chosen": 1.9201204776763916, + "logits/rejected": 2.1733713150024414, + "logps/chosen": -1.3790674209594727, + "logps/rejected": -2.599050521850586, + "loss": 2.4758, + "nll_loss": 1.3790674209594727, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.790674209594727, + "rewards/margins": 12.19983196258545, + "rewards/rejected": -25.990507125854492, + "step": 191 + }, + { + "epoch": 0.2863000931966449, + "grad_norm": 92.43704748026303, + "learning_rate": 8.552954864407697e-07, + "logits/chosen": 1.352588415145874, + "logits/rejected": 1.5578041076660156, + "logps/chosen": -0.861832857131958, + "logps/rejected": -1.6976982355117798, + "loss": 2.9659, + "nll_loss": 0.861832857131958, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.618329048156738, + "rewards/margins": 8.358652114868164, + "rewards/rejected": -16.97698211669922, + "step": 192 + }, + { + "epoch": 0.28779123951537744, + "grad_norm": 51.642031676003405, + "learning_rate": 8.535533905932737e-07, + "logits/chosen": 1.9250706434249878, + "logits/rejected": 2.259392023086548, + "logps/chosen": -1.432119369506836, + "logps/rejected": -2.4200923442840576, + "loss": 2.3788, + "nll_loss": 1.432119369506836, + "rewards/accuracies": 0.625, + "rewards/chosen": -14.321192741394043, + "rewards/margins": 9.879730224609375, + "rewards/rejected": -24.200923919677734, + "step": 193 + }, + { + "epoch": 0.28928238583411, + "grad_norm": 57.855469319584095, + "learning_rate": 8.518026681462447e-07, + "logits/chosen": 0.6635885834693909, + "logits/rejected": 0.5963816046714783, + "logps/chosen": -0.943480908870697, + "logps/rejected": -1.6345998048782349, + "loss": 2.6174, + "nll_loss": 0.9434809684753418, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.434808731079102, + "rewards/margins": 6.911189079284668, + "rewards/rejected": -16.345998764038086, + "step": 194 + }, + { + "epoch": 0.2907735321528425, + "grad_norm": 252.85548256416274, + "learning_rate": 8.500433618167992e-07, + "logits/chosen": 1.7494518756866455, + "logits/rejected": 1.9818410873413086, + "logps/chosen": -1.2326170206069946, + "logps/rejected": -2.1006078720092773, + "loss": 2.0705, + "nll_loss": 1.2326171398162842, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.326169967651367, + "rewards/margins": 8.679910659790039, + "rewards/rejected": -21.00608253479004, + "step": 195 + }, + { + "epoch": 0.292264678471575, + "grad_norm": 44.05241942856872, + "learning_rate": 8.482755145314985e-07, + "logits/chosen": 2.0492377281188965, + "logits/rejected": 2.0539886951446533, + "logps/chosen": -1.2818440198898315, + "logps/rejected": -3.063652753829956, + "loss": 2.1861, + "nll_loss": 1.281843900680542, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.818438529968262, + "rewards/margins": 17.818090438842773, + "rewards/rejected": -30.63652992248535, + "step": 196 + }, + { + "epoch": 0.29375582479030754, + "grad_norm": 135.40620235088878, + "learning_rate": 8.464991694253e-07, + "logits/chosen": 1.953153133392334, + "logits/rejected": 1.931667685508728, + "logps/chosen": -1.0608192682266235, + "logps/rejected": -2.6127781867980957, + "loss": 3.1345, + "nll_loss": 1.060819387435913, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.60819149017334, + "rewards/margins": 15.519588470458984, + "rewards/rejected": -26.12778091430664, + "step": 197 + }, + { + "epoch": 0.2952469711090401, + "grad_norm": 63.912791933616546, + "learning_rate": 8.447143698405059e-07, + "logits/chosen": 1.5860276222229004, + "logits/rejected": 1.4829771518707275, + "logps/chosen": -1.001695156097412, + "logps/rejected": -1.9609891176223755, + "loss": 2.4446, + "nll_loss": 1.001695156097412, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.016950607299805, + "rewards/margins": 9.592939376831055, + "rewards/rejected": -19.60988998413086, + "step": 198 + }, + { + "epoch": 0.2967381174277726, + "grad_norm": 185.48348945131482, + "learning_rate": 8.429211593257052e-07, + "logits/chosen": 2.0849227905273438, + "logits/rejected": 2.0906858444213867, + "logps/chosen": -1.3137246370315552, + "logps/rejected": -2.2139267921447754, + "loss": 4.0443, + "nll_loss": 1.3137246370315552, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.137248039245605, + "rewards/margins": 9.002019882202148, + "rewards/rejected": -22.139266967773438, + "step": 199 + }, + { + "epoch": 0.2982292637465051, + "grad_norm": 39.150692300992226, + "learning_rate": 8.41119581634711e-07, + "logits/chosen": 2.822572946548462, + "logits/rejected": 2.72379469871521, + "logps/chosen": -1.3302345275878906, + "logps/rejected": -2.1766927242279053, + "loss": 1.6867, + "nll_loss": 1.3302347660064697, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.302345275878906, + "rewards/margins": 8.464580535888672, + "rewards/rejected": -21.766925811767578, + "step": 200 + }, + { + "epoch": 0.29972041006523764, + "grad_norm": 72.1674290233523, + "learning_rate": 8.393096807254931e-07, + "logits/chosen": 1.5731306076049805, + "logits/rejected": 1.7772272825241089, + "logps/chosen": -0.8957565426826477, + "logps/rejected": -1.3099387884140015, + "loss": 2.6164, + "nll_loss": 0.8957564830780029, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.957565307617188, + "rewards/margins": 4.141822814941406, + "rewards/rejected": -13.099388122558594, + "step": 201 + }, + { + "epoch": 0.3012115563839702, + "grad_norm": 44.80807964315525, + "learning_rate": 8.374915007591052e-07, + "logits/chosen": 1.3576855659484863, + "logits/rejected": 1.9479650259017944, + "logps/chosen": -1.4070185422897339, + "logps/rejected": -2.286010265350342, + "loss": 1.6984, + "nll_loss": 1.4070186614990234, + "rewards/accuracies": 0.75, + "rewards/chosen": -14.070186614990234, + "rewards/margins": 8.78991413116455, + "rewards/rejected": -22.86009979248047, + "step": 202 + }, + { + "epoch": 0.3027027027027027, + "grad_norm": 36.876274463209846, + "learning_rate": 8.356650860986081e-07, + "logits/chosen": 0.7637724280357361, + "logits/rejected": 0.8034124970436096, + "logps/chosen": -1.2442846298217773, + "logps/rejected": -2.19854474067688, + "loss": 2.6261, + "nll_loss": 1.244284749031067, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.44284725189209, + "rewards/margins": 9.542598724365234, + "rewards/rejected": -21.985445022583008, + "step": 203 + }, + { + "epoch": 0.3041938490214352, + "grad_norm": 67.41135362002561, + "learning_rate": 8.338304813079864e-07, + "logits/chosen": 1.3719521760940552, + "logits/rejected": 1.5598344802856445, + "logps/chosen": -1.2198079824447632, + "logps/rejected": -1.4364782571792603, + "loss": 2.5946, + "nll_loss": 1.2198078632354736, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.198080062866211, + "rewards/margins": 2.1667017936706543, + "rewards/rejected": -14.364782333374023, + "step": 204 + }, + { + "epoch": 0.30568499534016774, + "grad_norm": 44.739442068978555, + "learning_rate": 8.319877311510612e-07, + "logits/chosen": 2.3804569244384766, + "logits/rejected": 2.126559257507324, + "logps/chosen": -0.942017674446106, + "logps/rejected": -1.4884874820709229, + "loss": 2.0004, + "nll_loss": 0.9420175552368164, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.42017650604248, + "rewards/margins": 5.464698314666748, + "rewards/rejected": -14.88487434387207, + "step": 205 + }, + { + "epoch": 0.3071761416589003, + "grad_norm": 56.20523602273714, + "learning_rate": 8.301368805903986e-07, + "logits/chosen": 1.1620374917984009, + "logits/rejected": 1.2464760541915894, + "logps/chosen": -1.3252227306365967, + "logps/rejected": -2.126899003982544, + "loss": 2.0718, + "nll_loss": 1.3252228498458862, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.252228736877441, + "rewards/margins": 8.016761779785156, + "rewards/rejected": -21.26898956298828, + "step": 206 + }, + { + "epoch": 0.3086672879776328, + "grad_norm": 53.096782115478575, + "learning_rate": 8.282779747862121e-07, + "logits/chosen": 0.9776243567466736, + "logits/rejected": 1.0243405103683472, + "logps/chosen": -0.9001644849777222, + "logps/rejected": -2.3409206867218018, + "loss": 1.6615, + "nll_loss": 0.9001644849777222, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.0016450881958, + "rewards/margins": 14.407562255859375, + "rewards/rejected": -23.409208297729492, + "step": 207 + }, + { + "epoch": 0.3101584342963653, + "grad_norm": 104.49663966984495, + "learning_rate": 8.264110590952607e-07, + "logits/chosen": 1.2132370471954346, + "logits/rejected": 1.3872023820877075, + "logps/chosen": -0.9424193501472473, + "logps/rejected": -1.9429956674575806, + "loss": 2.2228, + "nll_loss": 0.9424192905426025, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.424193382263184, + "rewards/margins": 10.005763053894043, + "rewards/rejected": -19.429956436157227, + "step": 208 + }, + { + "epoch": 0.31164958061509784, + "grad_norm": 46.95221324695929, + "learning_rate": 8.245361790697425e-07, + "logits/chosen": 1.2756640911102295, + "logits/rejected": 1.76149320602417, + "logps/chosen": -1.1788560152053833, + "logps/rejected": -2.207589864730835, + "loss": 1.8589, + "nll_loss": 1.1788560152053833, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.788559913635254, + "rewards/margins": 10.287336349487305, + "rewards/rejected": -22.075897216796875, + "step": 209 + }, + { + "epoch": 0.3131407269338304, + "grad_norm": 62.19023946080797, + "learning_rate": 8.226533804561826e-07, + "logits/chosen": 1.648912787437439, + "logits/rejected": 1.7056903839111328, + "logps/chosen": -1.1090244054794312, + "logps/rejected": -1.929490566253662, + "loss": 2.3942, + "nll_loss": 1.1090245246887207, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.09024429321289, + "rewards/margins": 8.204662322998047, + "rewards/rejected": -19.294906616210938, + "step": 210 + }, + { + "epoch": 0.3146318732525629, + "grad_norm": 39.42181196509823, + "learning_rate": 8.207627091943177e-07, + "logits/chosen": 2.0703964233398438, + "logits/rejected": 2.2637991905212402, + "logps/chosen": -1.0555453300476074, + "logps/rejected": -1.8380850553512573, + "loss": 1.8003, + "nll_loss": 1.0555452108383179, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.555453300476074, + "rewards/margins": 7.8253960609436035, + "rewards/rejected": -18.38085174560547, + "step": 211 + }, + { + "epoch": 0.31612301957129546, + "grad_norm": 54.07476460664365, + "learning_rate": 8.188642114159746e-07, + "logits/chosen": 1.6686798334121704, + "logits/rejected": 1.6440613269805908, + "logps/chosen": -1.2204382419586182, + "logps/rejected": -1.252202033996582, + "loss": 2.4999, + "nll_loss": 1.2204382419586182, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.204381942749023, + "rewards/margins": 0.31763792037963867, + "rewards/rejected": -12.52202033996582, + "step": 212 + }, + { + "epoch": 0.31761416589002794, + "grad_norm": 108.18112958443436, + "learning_rate": 8.169579334439452e-07, + "logits/chosen": 1.827782154083252, + "logits/rejected": 2.365203857421875, + "logps/chosen": -0.9755070805549622, + "logps/rejected": -2.1315362453460693, + "loss": 1.8175, + "nll_loss": 0.9755070805549622, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.755070686340332, + "rewards/margins": 11.56029224395752, + "rewards/rejected": -21.31536293029785, + "step": 213 + }, + { + "epoch": 0.3191053122087605, + "grad_norm": 139.8642654166928, + "learning_rate": 8.150439217908556e-07, + "logits/chosen": 1.1471678018569946, + "logits/rejected": 1.8254046440124512, + "logps/chosen": -1.2145636081695557, + "logps/rejected": -2.123124122619629, + "loss": 2.0822, + "nll_loss": 1.2145636081695557, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.145635604858398, + "rewards/margins": 9.085602760314941, + "rewards/rejected": -21.231239318847656, + "step": 214 + }, + { + "epoch": 0.320596458527493, + "grad_norm": 56.175736770434895, + "learning_rate": 8.131222231580313e-07, + "logits/chosen": 1.517196536064148, + "logits/rejected": 1.686833381652832, + "logps/chosen": -0.9890428781509399, + "logps/rejected": -2.2462382316589355, + "loss": 1.8137, + "nll_loss": 0.9890428185462952, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.89042854309082, + "rewards/margins": 12.571954727172852, + "rewards/rejected": -22.46238136291504, + "step": 215 + }, + { + "epoch": 0.32208760484622556, + "grad_norm": 115.21812937694328, + "learning_rate": 8.111928844343578e-07, + "logits/chosen": 1.493722915649414, + "logits/rejected": 1.4394516944885254, + "logps/chosen": -0.7523890733718872, + "logps/rejected": -2.181933641433716, + "loss": 2.421, + "nll_loss": 0.752389132976532, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.523890018463135, + "rewards/margins": 14.295445442199707, + "rewards/rejected": -21.819337844848633, + "step": 216 + }, + { + "epoch": 0.32357875116495805, + "grad_norm": 33.129629195390294, + "learning_rate": 8.092559526951374e-07, + "logits/chosen": 2.051661729812622, + "logits/rejected": 2.359527349472046, + "logps/chosen": -1.0224583148956299, + "logps/rejected": -2.7590489387512207, + "loss": 1.7781, + "nll_loss": 1.0224583148956299, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.224583625793457, + "rewards/margins": 17.36590576171875, + "rewards/rejected": -27.590490341186523, + "step": 217 + }, + { + "epoch": 0.3250698974836906, + "grad_norm": 49.585571265516876, + "learning_rate": 8.073114752009387e-07, + "logits/chosen": 0.7824015617370605, + "logits/rejected": 0.8636319637298584, + "logps/chosen": -1.0705488920211792, + "logps/rejected": -1.5145641565322876, + "loss": 2.0008, + "nll_loss": 1.0705488920211792, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.705488204956055, + "rewards/margins": 4.440152645111084, + "rewards/rejected": -15.14564037322998, + "step": 218 + }, + { + "epoch": 0.3265610438024231, + "grad_norm": 78.79188629369138, + "learning_rate": 8.053594993964452e-07, + "logits/chosen": 0.21954438090324402, + "logits/rejected": 0.705274760723114, + "logps/chosen": -1.100860595703125, + "logps/rejected": -2.3908841609954834, + "loss": 1.687, + "nll_loss": 1.100860595703125, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.00860595703125, + "rewards/margins": 12.90023422241211, + "rewards/rejected": -23.90884017944336, + "step": 219 + }, + { + "epoch": 0.32805219012115566, + "grad_norm": 188.92539787768402, + "learning_rate": 8.034000729092967e-07, + "logits/chosen": 1.0372661352157593, + "logits/rejected": 1.5985794067382812, + "logps/chosen": -0.9695538282394409, + "logps/rejected": -2.930168628692627, + "loss": 2.9507, + "nll_loss": 0.9695538282394409, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.695537567138672, + "rewards/margins": 19.60614585876465, + "rewards/rejected": -29.301685333251953, + "step": 220 + }, + { + "epoch": 0.32954333643988815, + "grad_norm": 94.92786446919038, + "learning_rate": 8.014332435489275e-07, + "logits/chosen": 1.1338255405426025, + "logits/rejected": 1.1260263919830322, + "logps/chosen": -1.053155779838562, + "logps/rejected": -2.1766085624694824, + "loss": 2.4868, + "nll_loss": 1.053155779838562, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.531557083129883, + "rewards/margins": 11.234529495239258, + "rewards/rejected": -21.76608657836914, + "step": 221 + }, + { + "epoch": 0.3310344827586207, + "grad_norm": 60.85719317292509, + "learning_rate": 7.994590593054e-07, + "logits/chosen": 1.7554562091827393, + "logits/rejected": 1.8874410390853882, + "logps/chosen": -1.1214702129364014, + "logps/rejected": -1.3658537864685059, + "loss": 2.3518, + "nll_loss": 1.1214702129364014, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.214702606201172, + "rewards/margins": 2.4438343048095703, + "rewards/rejected": -13.658536911010742, + "step": 222 + }, + { + "epoch": 0.3325256290773532, + "grad_norm": 87.43032722800943, + "learning_rate": 7.974775683482337e-07, + "logits/chosen": 0.4400970935821533, + "logits/rejected": 0.8527993559837341, + "logps/chosen": -0.97307950258255, + "logps/rejected": -1.959856390953064, + "loss": 1.2906, + "nll_loss": 0.9730795621871948, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.730794906616211, + "rewards/margins": 9.867768287658691, + "rewards/rejected": -19.59856414794922, + "step": 223 + }, + { + "epoch": 0.33401677539608576, + "grad_norm": 98.45314305769313, + "learning_rate": 7.954888190252291e-07, + "logits/chosen": 1.3810157775878906, + "logits/rejected": 0.5474785566329956, + "logps/chosen": -1.309791922569275, + "logps/rejected": -2.2794816493988037, + "loss": 2.1317, + "nll_loss": 1.309791922569275, + "rewards/accuracies": 0.75, + "rewards/chosen": -13.097918510437012, + "rewards/margins": 9.696897506713867, + "rewards/rejected": -22.794815063476562, + "step": 224 + }, + { + "epoch": 0.33550792171481825, + "grad_norm": 35.120986400927954, + "learning_rate": 7.934928598612895e-07, + "logits/chosen": 1.025597095489502, + "logits/rejected": 1.417877435684204, + "logps/chosen": -1.4049136638641357, + "logps/rejected": -3.658029317855835, + "loss": 1.7105, + "nll_loss": 1.4049136638641357, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.049138069152832, + "rewards/margins": 22.53115463256836, + "rewards/rejected": -36.580291748046875, + "step": 225 + }, + { + "epoch": 0.3369990680335508, + "grad_norm": 81.14812842799493, + "learning_rate": 7.91489739557236e-07, + "logits/chosen": 0.44910019636154175, + "logits/rejected": 0.7166433334350586, + "logps/chosen": -1.2824578285217285, + "logps/rejected": -2.48989200592041, + "loss": 2.5273, + "nll_loss": 1.2824578285217285, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.824578285217285, + "rewards/margins": 12.074341773986816, + "rewards/rejected": -24.8989200592041, + "step": 226 + }, + { + "epoch": 0.3384902143522833, + "grad_norm": 39.55482261548164, + "learning_rate": 7.894795069886191e-07, + "logits/chosen": 0.6071157455444336, + "logits/rejected": 1.4952878952026367, + "logps/chosen": -0.8678624629974365, + "logps/rejected": -1.9564037322998047, + "loss": 2.135, + "nll_loss": 0.8678624629974365, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.678624153137207, + "rewards/margins": 10.885414123535156, + "rewards/rejected": -19.56403923034668, + "step": 227 + }, + { + "epoch": 0.33998136067101586, + "grad_norm": 47.63663070972851, + "learning_rate": 7.874622112045268e-07, + "logits/chosen": 2.552133321762085, + "logits/rejected": 2.604572296142578, + "logps/chosen": -1.0520918369293213, + "logps/rejected": -1.1822909116744995, + "loss": 2.2303, + "nll_loss": 1.0520917177200317, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.520917892456055, + "rewards/margins": 1.3019917011260986, + "rewards/rejected": -11.82291030883789, + "step": 228 + }, + { + "epoch": 0.34147250698974835, + "grad_norm": 147.2009343892087, + "learning_rate": 7.854379014263876e-07, + "logits/chosen": 1.5173033475875854, + "logits/rejected": 1.780479073524475, + "logps/chosen": -1.1946444511413574, + "logps/rejected": -2.0428507328033447, + "loss": 2.6133, + "nll_loss": 1.1946444511413574, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.946443557739258, + "rewards/margins": 8.482065200805664, + "rewards/rejected": -20.428508758544922, + "step": 229 + }, + { + "epoch": 0.3429636533084809, + "grad_norm": 68.3722920532662, + "learning_rate": 7.834066270467689e-07, + "logits/chosen": 2.3850624561309814, + "logits/rejected": 2.422222137451172, + "logps/chosen": -1.7031844854354858, + "logps/rejected": -2.2421417236328125, + "loss": 2.4329, + "nll_loss": 1.7031843662261963, + "rewards/accuracies": 0.75, + "rewards/chosen": -17.031845092773438, + "rewards/margins": 5.3895721435546875, + "rewards/rejected": -22.421419143676758, + "step": 230 + }, + { + "epoch": 0.3444547996272134, + "grad_norm": 263.55673959692865, + "learning_rate": 7.813684376281729e-07, + "logits/chosen": 2.1059210300445557, + "logits/rejected": 2.218613862991333, + "logps/chosen": -1.3147671222686768, + "logps/rejected": -2.3974826335906982, + "loss": 2.335, + "nll_loss": 1.3147673606872559, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.147672653198242, + "rewards/margins": 10.827152252197266, + "rewards/rejected": -23.974824905395508, + "step": 231 + }, + { + "epoch": 0.34594594594594597, + "grad_norm": 103.1795350178365, + "learning_rate": 7.793233829018262e-07, + "logits/chosen": 2.139861583709717, + "logits/rejected": 2.9385435581207275, + "logps/chosen": -1.1145626306533813, + "logps/rejected": -2.2230255603790283, + "loss": 2.1556, + "nll_loss": 1.1145626306533813, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.14562702178955, + "rewards/margins": 11.08462905883789, + "rewards/rejected": -22.230255126953125, + "step": 232 + }, + { + "epoch": 0.34743709226467845, + "grad_norm": 60.679522428681075, + "learning_rate": 7.772715127664676e-07, + "logits/chosen": 1.7296453714370728, + "logits/rejected": 1.732195496559143, + "logps/chosen": -1.1248666048049927, + "logps/rejected": -2.177182197570801, + "loss": 2.0203, + "nll_loss": 1.1248664855957031, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.248664855957031, + "rewards/margins": 10.52315616607666, + "rewards/rejected": -21.771820068359375, + "step": 233 + }, + { + "epoch": 0.348928238583411, + "grad_norm": 99.81297830742463, + "learning_rate": 7.752128772871292e-07, + "logits/chosen": 0.9700308442115784, + "logits/rejected": 1.4567511081695557, + "logps/chosen": -1.3183832168579102, + "logps/rejected": -2.960069417953491, + "loss": 2.8233, + "nll_loss": 1.3183832168579102, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.183832168579102, + "rewards/margins": 16.416860580444336, + "rewards/rejected": -29.600690841674805, + "step": 234 + }, + { + "epoch": 0.3504193849021435, + "grad_norm": 96.12368102104242, + "learning_rate": 7.731475266939158e-07, + "logits/chosen": 1.5304874181747437, + "logits/rejected": 0.8800415396690369, + "logps/chosen": -0.9192510843276978, + "logps/rejected": -2.304888963699341, + "loss": 2.0082, + "nll_loss": 0.9192511439323425, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.192510604858398, + "rewards/margins": 13.856379508972168, + "rewards/rejected": -23.04888916015625, + "step": 235 + }, + { + "epoch": 0.35191053122087607, + "grad_norm": 44.76418279154148, + "learning_rate": 7.710755113807793e-07, + "logits/chosen": 1.342360496520996, + "logits/rejected": 1.1625380516052246, + "logps/chosen": -0.9219427108764648, + "logps/rejected": -2.1155216693878174, + "loss": 2.2282, + "nll_loss": 0.9219425916671753, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.219427108764648, + "rewards/margins": 11.935790061950684, + "rewards/rejected": -21.155216217041016, + "step": 236 + }, + { + "epoch": 0.35340167753960855, + "grad_norm": 52.81015880959803, + "learning_rate": 7.689968819042882e-07, + "logits/chosen": 1.2148919105529785, + "logits/rejected": 1.383809208869934, + "logps/chosen": -1.007696270942688, + "logps/rejected": -2.194352626800537, + "loss": 1.9773, + "nll_loss": 1.007696270942688, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.0769624710083, + "rewards/margins": 11.866562843322754, + "rewards/rejected": -21.943525314331055, + "step": 237 + }, + { + "epoch": 0.3548928238583411, + "grad_norm": 28.186399074668028, + "learning_rate": 7.669116889823954e-07, + "logits/chosen": 0.8694225549697876, + "logits/rejected": 1.2528514862060547, + "logps/chosen": -1.0248780250549316, + "logps/rejected": -2.271354913711548, + "loss": 1.7403, + "nll_loss": 1.0248781442642212, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.248781204223633, + "rewards/margins": 12.464767456054688, + "rewards/rejected": -22.713546752929688, + "step": 238 + }, + { + "epoch": 0.35638397017707363, + "grad_norm": 100.15322759268103, + "learning_rate": 7.648199834931992e-07, + "logits/chosen": 0.634692907333374, + "logits/rejected": 1.1396393775939941, + "logps/chosen": -1.1581244468688965, + "logps/rejected": -5.170987129211426, + "loss": 1.8909, + "nll_loss": 1.158124566078186, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.581245422363281, + "rewards/margins": 40.12862777709961, + "rewards/rejected": -51.709869384765625, + "step": 239 + }, + { + "epoch": 0.35787511649580617, + "grad_norm": 300.44269554022554, + "learning_rate": 7.62721816473703e-07, + "logits/chosen": 2.2034199237823486, + "logits/rejected": 2.4888501167297363, + "logps/chosen": -1.7597814798355103, + "logps/rejected": -2.8892478942871094, + "loss": 2.6985, + "nll_loss": 1.7597814798355103, + "rewards/accuracies": 0.75, + "rewards/chosen": -17.59781265258789, + "rewards/margins": 11.29466438293457, + "rewards/rejected": -28.892480850219727, + "step": 240 + }, + { + "epoch": 0.35936626281453865, + "grad_norm": 84.61056841792454, + "learning_rate": 7.606172391185699e-07, + "logits/chosen": 0.9357536435127258, + "logits/rejected": 1.495705485343933, + "logps/chosen": -0.7502763867378235, + "logps/rejected": -2.4145474433898926, + "loss": 1.6242, + "nll_loss": 0.7502763867378235, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.502763748168945, + "rewards/margins": 16.64270782470703, + "rewards/rejected": -24.14547348022461, + "step": 241 + }, + { + "epoch": 0.3608574091332712, + "grad_norm": 30.01462048225669, + "learning_rate": 7.58506302778873e-07, + "logits/chosen": 1.139107584953308, + "logits/rejected": 1.5516914129257202, + "logps/chosen": -0.9279584884643555, + "logps/rejected": -2.4357917308807373, + "loss": 1.9693, + "nll_loss": 0.927958607673645, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.279584884643555, + "rewards/margins": 15.07833480834961, + "rewards/rejected": -24.357919692993164, + "step": 242 + }, + { + "epoch": 0.36234855545200373, + "grad_norm": 49.87160158067065, + "learning_rate": 7.563890589608426e-07, + "logits/chosen": 0.7547241449356079, + "logits/rejected": 1.018823266029358, + "logps/chosen": -0.6867074966430664, + "logps/rejected": -2.4492809772491455, + "loss": 1.627, + "nll_loss": 0.6867074966430664, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.867075443267822, + "rewards/margins": 17.625732421875, + "rewards/rejected": -24.492809295654297, + "step": 243 + }, + { + "epoch": 0.36383970177073627, + "grad_norm": 79.18909399401014, + "learning_rate": 7.542655593246103e-07, + "logits/chosen": 2.3590550422668457, + "logits/rejected": 2.4431886672973633, + "logps/chosen": -1.115024447441101, + "logps/rejected": -2.00443172454834, + "loss": 2.179, + "nll_loss": 1.1150243282318115, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.150243759155273, + "rewards/margins": 8.894072532653809, + "rewards/rejected": -20.04431915283203, + "step": 244 + }, + { + "epoch": 0.36533084808946875, + "grad_norm": 76.09626746873882, + "learning_rate": 7.521358556829469e-07, + "logits/chosen": 2.7354531288146973, + "logits/rejected": 2.9832749366760254, + "logps/chosen": -1.0923782587051392, + "logps/rejected": -2.1333441734313965, + "loss": 2.8052, + "nll_loss": 1.0923781394958496, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.923782348632812, + "rewards/margins": 10.409658432006836, + "rewards/rejected": -21.33344078063965, + "step": 245 + }, + { + "epoch": 0.3668219944082013, + "grad_norm": 51.58831925276444, + "learning_rate": 7.5e-07, + "logits/chosen": 0.9712691307067871, + "logits/rejected": 2.3313350677490234, + "logps/chosen": -1.1223640441894531, + "logps/rejected": -3.0363516807556152, + "loss": 1.9817, + "nll_loss": 1.1223642826080322, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.223642349243164, + "rewards/margins": 19.139873504638672, + "rewards/rejected": -30.36351203918457, + "step": 246 + }, + { + "epoch": 0.36831314072693383, + "grad_norm": 72.27660793379593, + "learning_rate": 7.478580443900246e-07, + "logits/chosen": 1.3063576221466064, + "logits/rejected": 1.4398159980773926, + "logps/chosen": -1.2569646835327148, + "logps/rejected": -2.022681951522827, + "loss": 2.9092, + "nll_loss": 1.2569645643234253, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.569645881652832, + "rewards/margins": 7.657173156738281, + "rewards/rejected": -20.226818084716797, + "step": 247 + }, + { + "epoch": 0.36980428704566637, + "grad_norm": 69.93348511338252, + "learning_rate": 7.457100411161127e-07, + "logits/chosen": 1.797006368637085, + "logits/rejected": 1.4189502000808716, + "logps/chosen": -0.7586382031440735, + "logps/rejected": -1.9290591478347778, + "loss": 2.2318, + "nll_loss": 0.7586381435394287, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.5863823890686035, + "rewards/margins": 11.70421028137207, + "rewards/rejected": -19.290592193603516, + "step": 248 + }, + { + "epoch": 0.3712954333643989, + "grad_norm": 31.343399697142512, + "learning_rate": 7.435560425889168e-07, + "logits/chosen": 0.6109998822212219, + "logits/rejected": 0.8668403625488281, + "logps/chosen": -0.9712101221084595, + "logps/rejected": -2.0562825202941895, + "loss": 1.4127, + "nll_loss": 0.9712100625038147, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.712100982666016, + "rewards/margins": 10.850724220275879, + "rewards/rejected": -20.562824249267578, + "step": 249 + }, + { + "epoch": 0.3727865796831314, + "grad_norm": 59.38203502577897, + "learning_rate": 7.413961013653725e-07, + "logits/chosen": 1.6231962442398071, + "logits/rejected": 1.9316778182983398, + "logps/chosen": -0.8256513476371765, + "logps/rejected": -1.3191328048706055, + "loss": 2.3647, + "nll_loss": 0.8256514072418213, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.256513595581055, + "rewards/margins": 4.934813976287842, + "rewards/rejected": -13.191327095031738, + "step": 250 + }, + { + "epoch": 0.37427772600186393, + "grad_norm": 39.973759798819756, + "learning_rate": 7.39230270147415e-07, + "logits/chosen": 1.6450904607772827, + "logits/rejected": 1.9193065166473389, + "logps/chosen": -1.230831265449524, + "logps/rejected": -2.295335054397583, + "loss": 2.7592, + "nll_loss": 1.230831265449524, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.308311462402344, + "rewards/margins": 10.645038604736328, + "rewards/rejected": -22.953350067138672, + "step": 251 + }, + { + "epoch": 0.37576887232059647, + "grad_norm": 67.97110261391505, + "learning_rate": 7.370586017806941e-07, + "logits/chosen": 2.092968463897705, + "logits/rejected": 2.5929911136627197, + "logps/chosen": -0.9929396510124207, + "logps/rejected": -4.780749797821045, + "loss": 2.546, + "nll_loss": 0.9929396510124207, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.929396629333496, + "rewards/margins": 37.87810134887695, + "rewards/rejected": -47.8074951171875, + "step": 252 + }, + { + "epoch": 0.377260018639329, + "grad_norm": 37.78048177829846, + "learning_rate": 7.348811492532839e-07, + "logits/chosen": 1.4211300611495972, + "logits/rejected": 1.9608873128890991, + "logps/chosen": -1.005782961845398, + "logps/rejected": -1.8782293796539307, + "loss": 2.26, + "nll_loss": 1.005782961845398, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.057829856872559, + "rewards/margins": 8.724465370178223, + "rewards/rejected": -18.78229522705078, + "step": 253 + }, + { + "epoch": 0.3787511649580615, + "grad_norm": 99.38104653847641, + "learning_rate": 7.326979656943905e-07, + "logits/chosen": 2.002239227294922, + "logits/rejected": 2.198601722717285, + "logps/chosen": -1.406559944152832, + "logps/rejected": -2.0653531551361084, + "loss": 2.4987, + "nll_loss": 1.4065600633621216, + "rewards/accuracies": 0.75, + "rewards/chosen": -14.06559944152832, + "rewards/margins": 6.587931156158447, + "rewards/rejected": -20.653532028198242, + "step": 254 + }, + { + "epoch": 0.38024231127679403, + "grad_norm": 60.106932026337496, + "learning_rate": 7.305091043730557e-07, + "logits/chosen": 2.4402213096618652, + "logits/rejected": 2.591193675994873, + "logps/chosen": -1.0940014123916626, + "logps/rejected": -1.9400519132614136, + "loss": 1.4045, + "nll_loss": 1.0940014123916626, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.940013885498047, + "rewards/margins": 8.460504531860352, + "rewards/rejected": -19.40052032470703, + "step": 255 + }, + { + "epoch": 0.38173345759552657, + "grad_norm": 122.9602377946072, + "learning_rate": 7.283146186968565e-07, + "logits/chosen": 2.069875478744507, + "logits/rejected": 2.4555001258850098, + "logps/chosen": -1.2773873805999756, + "logps/rejected": -3.3619141578674316, + "loss": 1.7268, + "nll_loss": 1.277387261390686, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.773874282836914, + "rewards/margins": 20.845266342163086, + "rewards/rejected": -33.619140625, + "step": 256 + }, + { + "epoch": 0.3832246039142591, + "grad_norm": 52.28679278358108, + "learning_rate": 7.261145622106032e-07, + "logits/chosen": 0.6450112462043762, + "logits/rejected": 0.8822189569473267, + "logps/chosen": -1.0086485147476196, + "logps/rejected": -2.0962486267089844, + "loss": 2.059, + "nll_loss": 1.0086486339569092, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.086485862731934, + "rewards/margins": 10.87600040435791, + "rewards/rejected": -20.962486267089844, + "step": 257 + }, + { + "epoch": 0.3847157502329916, + "grad_norm": 77.09312372429817, + "learning_rate": 7.239089885950316e-07, + "logits/chosen": 1.278009057044983, + "logits/rejected": 1.4298620223999023, + "logps/chosen": -0.978814423084259, + "logps/rejected": -2.0365781784057617, + "loss": 2.7127, + "nll_loss": 0.978814423084259, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.7881441116333, + "rewards/margins": 10.57763671875, + "rewards/rejected": -20.36578369140625, + "step": 258 + }, + { + "epoch": 0.38620689655172413, + "grad_norm": 55.887201759176676, + "learning_rate": 7.216979516654943e-07, + "logits/chosen": 1.4817012548446655, + "logits/rejected": 1.557416319847107, + "logps/chosen": -0.8139100074768066, + "logps/rejected": -2.3971123695373535, + "loss": 1.4846, + "nll_loss": 0.8139100670814514, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.139101028442383, + "rewards/margins": 15.832023620605469, + "rewards/rejected": -23.97112464904785, + "step": 259 + }, + { + "epoch": 0.38769804287045667, + "grad_norm": 88.12892241472093, + "learning_rate": 7.19481505370647e-07, + "logits/chosen": 2.003685712814331, + "logits/rejected": 1.614970326423645, + "logps/chosen": -1.1081788539886475, + "logps/rejected": -2.8416566848754883, + "loss": 1.35, + "nll_loss": 1.108178973197937, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.081789016723633, + "rewards/margins": 17.334775924682617, + "rewards/rejected": -28.41656494140625, + "step": 260 + }, + { + "epoch": 0.3891891891891892, + "grad_norm": 99.391692996486, + "learning_rate": 7.172597037911322e-07, + "logits/chosen": 0.43143507838249207, + "logits/rejected": 0.20857657492160797, + "logps/chosen": -1.3084512948989868, + "logps/rejected": -3.066157579421997, + "loss": 2.148, + "nll_loss": 1.3084512948989868, + "rewards/accuracies": 0.75, + "rewards/chosen": -13.084512710571289, + "rewards/margins": 17.577064514160156, + "rewards/rejected": -30.661577224731445, + "step": 261 + }, + { + "epoch": 0.3906803355079217, + "grad_norm": 97.93389248759458, + "learning_rate": 7.150326011382603e-07, + "logits/chosen": 0.6526418924331665, + "logits/rejected": 1.205736517906189, + "logps/chosen": -0.9462600946426392, + "logps/rejected": -2.2558352947235107, + "loss": 1.8139, + "nll_loss": 0.9462600946426392, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.462600708007812, + "rewards/margins": 13.09575080871582, + "rewards/rejected": -22.558353424072266, + "step": 262 + }, + { + "epoch": 0.39217148182665423, + "grad_norm": 53.98492289271101, + "learning_rate": 7.128002517526856e-07, + "logits/chosen": 0.9504834413528442, + "logits/rejected": 0.7977679967880249, + "logps/chosen": -0.8659110069274902, + "logps/rejected": -1.5257205963134766, + "loss": 2.0547, + "nll_loss": 0.8659110069274902, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.659110069274902, + "rewards/margins": 6.598095893859863, + "rewards/rejected": -15.257207870483398, + "step": 263 + }, + { + "epoch": 0.3936626281453868, + "grad_norm": 53.028756273355576, + "learning_rate": 7.105627101030815e-07, + "logits/chosen": 1.035836935043335, + "logits/rejected": 1.1337536573410034, + "logps/chosen": -0.9609843492507935, + "logps/rejected": -2.066173791885376, + "loss": 2.0787, + "nll_loss": 0.9609844088554382, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.609843254089355, + "rewards/margins": 11.051895141601562, + "rewards/rejected": -20.6617374420166, + "step": 264 + }, + { + "epoch": 0.3951537744641193, + "grad_norm": 70.03179421250289, + "learning_rate": 7.083200307848115e-07, + "logits/chosen": 2.2480337619781494, + "logits/rejected": 2.4453728199005127, + "logps/chosen": -1.1059800386428833, + "logps/rejected": -2.8152523040771484, + "loss": 1.693, + "nll_loss": 1.1059800386428833, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.059800148010254, + "rewards/margins": 17.092721939086914, + "rewards/rejected": -28.152523040771484, + "step": 265 + }, + { + "epoch": 0.3966449207828518, + "grad_norm": 65.05167358830555, + "learning_rate": 7.06072268518596e-07, + "logits/chosen": 1.054613471031189, + "logits/rejected": 1.4466171264648438, + "logps/chosen": -1.3642425537109375, + "logps/rejected": -1.3795865774154663, + "loss": 1.9083, + "nll_loss": 1.364242434501648, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.642424583435059, + "rewards/margins": 0.15344035625457764, + "rewards/rejected": -13.795865058898926, + "step": 266 + }, + { + "epoch": 0.39813606710158433, + "grad_norm": 115.63895045724882, + "learning_rate": 7.038194781491785e-07, + "logits/chosen": 1.7098942995071411, + "logits/rejected": 2.203080892562866, + "logps/chosen": -1.1696326732635498, + "logps/rejected": -2.4187185764312744, + "loss": 2.6433, + "nll_loss": 1.1696325540542603, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.69632625579834, + "rewards/margins": 12.490857124328613, + "rewards/rejected": -24.18718147277832, + "step": 267 + }, + { + "epoch": 0.3996272134203169, + "grad_norm": 67.04895942633188, + "learning_rate": 7.015617146439861e-07, + "logits/chosen": 1.6581642627716064, + "logits/rejected": 1.669770359992981, + "logps/chosen": -1.2857623100280762, + "logps/rejected": -2.7904677391052246, + "loss": 1.8274, + "nll_loss": 1.2857623100280762, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.857623100280762, + "rewards/margins": 15.047052383422852, + "rewards/rejected": -27.904674530029297, + "step": 268 + }, + { + "epoch": 0.4011183597390494, + "grad_norm": 91.36427605731544, + "learning_rate": 6.992990330917896e-07, + "logits/chosen": 0.932449996471405, + "logits/rejected": 1.0148789882659912, + "logps/chosen": -0.8048295974731445, + "logps/rejected": -2.076087474822998, + "loss": 2.27, + "nll_loss": 0.8048295974731445, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.048295974731445, + "rewards/margins": 12.712578773498535, + "rewards/rejected": -20.760875701904297, + "step": 269 + }, + { + "epoch": 0.4026095060577819, + "grad_norm": 41.60383549655561, + "learning_rate": 6.970314887013585e-07, + "logits/chosen": 1.052612066268921, + "logits/rejected": 1.0198369026184082, + "logps/chosen": -0.9521026611328125, + "logps/rejected": -3.0467047691345215, + "loss": 1.3033, + "nll_loss": 0.9521026611328125, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.521026611328125, + "rewards/margins": 20.946022033691406, + "rewards/rejected": -30.46704864501953, + "step": 270 + }, + { + "epoch": 0.40410065237651444, + "grad_norm": 37.2054556810954, + "learning_rate": 6.947591368001137e-07, + "logits/chosen": 1.8026392459869385, + "logits/rejected": 1.524202585220337, + "logps/chosen": -1.2173160314559937, + "logps/rejected": -3.043490409851074, + "loss": 2.0298, + "nll_loss": 1.2173161506652832, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.173162460327148, + "rewards/margins": 18.261743545532227, + "rewards/rejected": -30.434906005859375, + "step": 271 + }, + { + "epoch": 0.405591798695247, + "grad_norm": 40.791654458384144, + "learning_rate": 6.924820328327785e-07, + "logits/chosen": 0.8878618478775024, + "logits/rejected": 0.8154944181442261, + "logps/chosen": -1.2944376468658447, + "logps/rejected": -1.636922001838684, + "loss": 2.34, + "nll_loss": 1.2944377660751343, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.944376945495605, + "rewards/margins": 3.4248437881469727, + "rewards/rejected": -16.369220733642578, + "step": 272 + }, + { + "epoch": 0.4070829450139795, + "grad_norm": 87.21918058543856, + "learning_rate": 6.902002323600251e-07, + "logits/chosen": 1.4907180070877075, + "logits/rejected": 1.9472870826721191, + "logps/chosen": -1.0754882097244263, + "logps/rejected": -3.0603857040405273, + "loss": 2.6077, + "nll_loss": 1.0754882097244263, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.754881858825684, + "rewards/margins": 19.848976135253906, + "rewards/rejected": -30.60385513305664, + "step": 273 + }, + { + "epoch": 0.408574091332712, + "grad_norm": 54.02250862515129, + "learning_rate": 6.87913791057119e-07, + "logits/chosen": 1.0437284708023071, + "logits/rejected": 1.4450937509536743, + "logps/chosen": -1.167959451675415, + "logps/rejected": -2.6578826904296875, + "loss": 1.453, + "nll_loss": 1.1679595708847046, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.679594039916992, + "rewards/margins": 14.89923095703125, + "rewards/rejected": -26.578826904296875, + "step": 274 + }, + { + "epoch": 0.41006523765144454, + "grad_norm": 75.6400790003386, + "learning_rate": 6.856227647125607e-07, + "logits/chosen": 0.8017367720603943, + "logits/rejected": 1.474280834197998, + "logps/chosen": -1.1050833463668823, + "logps/rejected": -1.9618220329284668, + "loss": 2.6651, + "nll_loss": 1.1050833463668823, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.050833702087402, + "rewards/margins": 8.56738567352295, + "rewards/rejected": -19.61821937561035, + "step": 275 + }, + { + "epoch": 0.4115563839701771, + "grad_norm": 30.388626869364693, + "learning_rate": 6.83327209226724e-07, + "logits/chosen": 1.4652159214019775, + "logits/rejected": 1.2830939292907715, + "logps/chosen": -0.8472062349319458, + "logps/rejected": -1.9603122472763062, + "loss": 1.5787, + "nll_loss": 0.8472062945365906, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.472063064575195, + "rewards/margins": 11.131060600280762, + "rewards/rejected": -19.60312271118164, + "step": 276 + }, + { + "epoch": 0.4130475302889096, + "grad_norm": 39.216919718833466, + "learning_rate": 6.81027180610493e-07, + "logits/chosen": 1.3558590412139893, + "logits/rejected": 1.70036780834198, + "logps/chosen": -1.2558890581130981, + "logps/rejected": -2.966331958770752, + "loss": 2.0647, + "nll_loss": 1.2558890581130981, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.558890342712402, + "rewards/margins": 17.104429244995117, + "rewards/rejected": -29.66331672668457, + "step": 277 + }, + { + "epoch": 0.4145386766076421, + "grad_norm": 58.789675286827766, + "learning_rate": 6.787227349838946e-07, + "logits/chosen": 0.9922891855239868, + "logits/rejected": 1.273958683013916, + "logps/chosen": -1.1365571022033691, + "logps/rejected": -1.5940685272216797, + "loss": 2.0423, + "nll_loss": 1.1365571022033691, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.365571022033691, + "rewards/margins": 4.575113773345947, + "rewards/rejected": -15.940685272216797, + "step": 278 + }, + { + "epoch": 0.41602982292637464, + "grad_norm": 127.19344910907691, + "learning_rate": 6.764139285747291e-07, + "logits/chosen": 0.5256351232528687, + "logits/rejected": 0.7551167011260986, + "logps/chosen": -0.8241851329803467, + "logps/rejected": -1.77034592628479, + "loss": 1.9205, + "nll_loss": 0.8241850733757019, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.241851806640625, + "rewards/margins": 9.461607933044434, + "rewards/rejected": -17.703458786010742, + "step": 279 + }, + { + "epoch": 0.4175209692451072, + "grad_norm": 52.297853670482816, + "learning_rate": 6.741008177171993e-07, + "logits/chosen": 0.844875693321228, + "logits/rejected": 0.7142741084098816, + "logps/chosen": -1.081954836845398, + "logps/rejected": -1.9283109903335571, + "loss": 1.703, + "nll_loss": 1.081954836845398, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.819547653198242, + "rewards/margins": 8.463563919067383, + "rewards/rejected": -19.283111572265625, + "step": 280 + }, + { + "epoch": 0.4190121155638397, + "grad_norm": 41.30444921997634, + "learning_rate": 6.717834588505349e-07, + "logits/chosen": 0.35103124380111694, + "logits/rejected": 0.7616575956344604, + "logps/chosen": -1.1927512884140015, + "logps/rejected": -2.2596030235290527, + "loss": 1.7112, + "nll_loss": 1.192751407623291, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.927513122558594, + "rewards/margins": 10.668517112731934, + "rewards/rejected": -22.596031188964844, + "step": 281 + }, + { + "epoch": 0.42050326188257225, + "grad_norm": 55.48858060729688, + "learning_rate": 6.694619085176159e-07, + "logits/chosen": 1.6717503070831299, + "logits/rejected": 1.8445909023284912, + "logps/chosen": -1.1160863637924194, + "logps/rejected": -1.5369952917099, + "loss": 1.5962, + "nll_loss": 1.1160863637924194, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.16086483001709, + "rewards/margins": 4.2090888023376465, + "rewards/rejected": -15.369952201843262, + "step": 282 + }, + { + "epoch": 0.42199440820130474, + "grad_norm": 57.240320712688224, + "learning_rate": 6.671362233635925e-07, + "logits/chosen": 2.156038522720337, + "logits/rejected": 2.2529256343841553, + "logps/chosen": -1.3289601802825928, + "logps/rejected": -2.3087949752807617, + "loss": 2.1229, + "nll_loss": 1.3289601802825928, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.289603233337402, + "rewards/margins": 9.798346519470215, + "rewards/rejected": -23.087949752807617, + "step": 283 + }, + { + "epoch": 0.4234855545200373, + "grad_norm": 74.80923783541773, + "learning_rate": 6.64806460134504e-07, + "logits/chosen": 2.358487129211426, + "logits/rejected": 2.6080195903778076, + "logps/chosen": -1.087075114250183, + "logps/rejected": -3.3152096271514893, + "loss": 1.7529, + "nll_loss": 1.087075114250183, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.870750427246094, + "rewards/margins": 22.281347274780273, + "rewards/rejected": -33.152099609375, + "step": 284 + }, + { + "epoch": 0.4249767008387698, + "grad_norm": 27.565727625360296, + "learning_rate": 6.624726756758927e-07, + "logits/chosen": 1.5838301181793213, + "logits/rejected": 1.3991775512695312, + "logps/chosen": -1.7774771451950073, + "logps/rejected": -2.480024814605713, + "loss": 1.3078, + "nll_loss": 1.7774772644042969, + "rewards/accuracies": 0.75, + "rewards/chosen": -17.77477264404297, + "rewards/margins": 7.025475025177002, + "rewards/rejected": -24.800247192382812, + "step": 285 + }, + { + "epoch": 0.42646784715750236, + "grad_norm": 154.74744398137636, + "learning_rate": 6.601349269314187e-07, + "logits/chosen": 1.8415088653564453, + "logits/rejected": 1.869084358215332, + "logps/chosen": -1.4172766208648682, + "logps/rejected": -2.5456197261810303, + "loss": 2.3037, + "nll_loss": 1.4172766208648682, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.172767639160156, + "rewards/margins": 11.283432006835938, + "rewards/rejected": -25.45619773864746, + "step": 286 + }, + { + "epoch": 0.42795899347623484, + "grad_norm": 33.654264848772605, + "learning_rate": 6.577932709414689e-07, + "logits/chosen": 0.06796303391456604, + "logits/rejected": 0.024106621742248535, + "logps/chosen": -1.2476297616958618, + "logps/rejected": -2.5938475131988525, + "loss": 1.9129, + "nll_loss": 1.2476297616958618, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.476297378540039, + "rewards/margins": 13.462179183959961, + "rewards/rejected": -25.938474655151367, + "step": 287 + }, + { + "epoch": 0.4294501397949674, + "grad_norm": 61.274908342374445, + "learning_rate": 6.554477648417655e-07, + "logits/chosen": 1.084038496017456, + "logits/rejected": 0.7969342470169067, + "logps/chosen": -0.8771347999572754, + "logps/rejected": -1.9453235864639282, + "loss": 2.2445, + "nll_loss": 0.8771347999572754, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.771347999572754, + "rewards/margins": 10.68188762664795, + "rewards/rejected": -19.453235626220703, + "step": 288 + }, + { + "epoch": 0.4309412861136999, + "grad_norm": 33.92864029856102, + "learning_rate": 6.530984658619733e-07, + "logits/chosen": 0.6454198956489563, + "logits/rejected": 0.6794722080230713, + "logps/chosen": -0.6796860694885254, + "logps/rejected": -2.1921310424804688, + "loss": 1.4368, + "nll_loss": 0.6796860098838806, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.796860694885254, + "rewards/margins": 15.124448776245117, + "rewards/rejected": -21.921310424804688, + "step": 289 + }, + { + "epoch": 0.43243243243243246, + "grad_norm": 102.89530504035014, + "learning_rate": 6.507454313243015e-07, + "logits/chosen": 0.5887856483459473, + "logits/rejected": 0.8982241749763489, + "logps/chosen": -1.2342543601989746, + "logps/rejected": -3.1363353729248047, + "loss": 1.4046, + "nll_loss": 1.2342543601989746, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.342544555664062, + "rewards/margins": 19.020811080932617, + "rewards/rejected": -31.363353729248047, + "step": 290 + }, + { + "epoch": 0.43392357875116494, + "grad_norm": 49.68076761184129, + "learning_rate": 6.483887186421058e-07, + "logits/chosen": 0.8749809265136719, + "logits/rejected": 0.6832801103591919, + "logps/chosen": -0.9976881742477417, + "logps/rejected": -1.4613938331604004, + "loss": 2.2379, + "nll_loss": 0.9976882338523865, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.976881980895996, + "rewards/margins": 4.63705587387085, + "rewards/rejected": -14.613937377929688, + "step": 291 + }, + { + "epoch": 0.4354147250698975, + "grad_norm": 39.12656003901273, + "learning_rate": 6.460283853184879e-07, + "logits/chosen": 1.0188599824905396, + "logits/rejected": 1.0585620403289795, + "logps/chosen": -0.8566121459007263, + "logps/rejected": -2.6598193645477295, + "loss": 2.3505, + "nll_loss": 0.8566122055053711, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.566122055053711, + "rewards/margins": 18.032073974609375, + "rewards/rejected": -26.598194122314453, + "step": 292 + }, + { + "epoch": 0.43690587138863, + "grad_norm": 123.87555388902, + "learning_rate": 6.436644889448919e-07, + "logits/chosen": 2.1357312202453613, + "logits/rejected": 2.082622766494751, + "logps/chosen": -1.5212548971176147, + "logps/rejected": -2.3258817195892334, + "loss": 2.7392, + "nll_loss": 1.5212547779083252, + "rewards/accuracies": 0.625, + "rewards/chosen": -15.21254825592041, + "rewards/margins": 8.046268463134766, + "rewards/rejected": -23.25881576538086, + "step": 293 + }, + { + "epoch": 0.43839701770736256, + "grad_norm": 44.15702706121298, + "learning_rate": 6.412970871996995e-07, + "logits/chosen": 2.1622915267944336, + "logits/rejected": 2.064239263534546, + "logps/chosen": -1.0755740404129028, + "logps/rejected": -3.1556620597839355, + "loss": 1.7613, + "nll_loss": 1.0755740404129028, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.755739212036133, + "rewards/margins": 20.800884246826172, + "rewards/rejected": -31.556623458862305, + "step": 294 + }, + { + "epoch": 0.43988816402609504, + "grad_norm": 63.85906467207744, + "learning_rate": 6.389262378468219e-07, + "logits/chosen": 0.337302565574646, + "logits/rejected": 0.3216743469238281, + "logps/chosen": -1.0648306608200073, + "logps/rejected": -1.7549852132797241, + "loss": 2.2155, + "nll_loss": 1.0648306608200073, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.648306846618652, + "rewards/margins": 6.901545524597168, + "rewards/rejected": -17.54985237121582, + "step": 295 + }, + { + "epoch": 0.4413793103448276, + "grad_norm": 63.02949408369288, + "learning_rate": 6.365519987342915e-07, + "logits/chosen": 1.4290008544921875, + "logits/rejected": 2.151444435119629, + "logps/chosen": -1.3096098899841309, + "logps/rejected": -3.7374768257141113, + "loss": 2.3925, + "nll_loss": 1.3096097707748413, + "rewards/accuracies": 0.75, + "rewards/chosen": -13.096098899841309, + "rewards/margins": 24.278675079345703, + "rewards/rejected": -37.37477111816406, + "step": 296 + }, + { + "epoch": 0.4428704566635601, + "grad_norm": 103.14840083443293, + "learning_rate": 6.341744277928499e-07, + "logits/chosen": -0.02471376582980156, + "logits/rejected": 0.1734105348587036, + "logps/chosen": -1.2056688070297241, + "logps/rejected": -1.9607056379318237, + "loss": 2.3567, + "nll_loss": 1.2056688070297241, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.056687355041504, + "rewards/margins": 7.550368785858154, + "rewards/rejected": -19.6070556640625, + "step": 297 + }, + { + "epoch": 0.44436160298229266, + "grad_norm": 60.70508187697226, + "learning_rate": 6.317935830345338e-07, + "logits/chosen": 1.4392168521881104, + "logits/rejected": 1.5099271535873413, + "logps/chosen": -1.2347393035888672, + "logps/rejected": -1.8257811069488525, + "loss": 2.9378, + "nll_loss": 1.2347395420074463, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.347393989562988, + "rewards/margins": 5.910416126251221, + "rewards/rejected": -18.257810592651367, + "step": 298 + }, + { + "epoch": 0.44585274930102514, + "grad_norm": 196.96211764289563, + "learning_rate": 6.294095225512604e-07, + "logits/chosen": 0.2620046138763428, + "logits/rejected": 0.18626247346401215, + "logps/chosen": -1.016269564628601, + "logps/rejected": -2.0889391899108887, + "loss": 1.6622, + "nll_loss": 1.016269564628601, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.162696838378906, + "rewards/margins": 10.72669792175293, + "rewards/rejected": -20.889392852783203, + "step": 299 + }, + { + "epoch": 0.4473438956197577, + "grad_norm": 57.732026030259775, + "learning_rate": 6.270223045134095e-07, + "logits/chosen": 1.0570734739303589, + "logits/rejected": 1.2986547946929932, + "logps/chosen": -1.048396348953247, + "logps/rejected": -2.0673375129699707, + "loss": 2.7017, + "nll_loss": 1.048396348953247, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.483963966369629, + "rewards/margins": 10.189414978027344, + "rewards/rejected": -20.67337989807129, + "step": 300 + }, + { + "epoch": 0.4488350419384902, + "grad_norm": 33.69710577119018, + "learning_rate": 6.246319871684047e-07, + "logits/chosen": 0.7944878339767456, + "logits/rejected": 1.3021888732910156, + "logps/chosen": -1.0949335098266602, + "logps/rejected": -3.1596837043762207, + "loss": 1.4899, + "nll_loss": 1.0949335098266602, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.949335098266602, + "rewards/margins": 20.64750099182129, + "rewards/rejected": -31.59683609008789, + "step": 301 + }, + { + "epoch": 0.45032618825722276, + "grad_norm": 49.16607074744664, + "learning_rate": 6.222386288392914e-07, + "logits/chosen": 1.3668736219406128, + "logits/rejected": 1.658337116241455, + "logps/chosen": -0.9744982123374939, + "logps/rejected": -2.594623327255249, + "loss": 2.4226, + "nll_loss": 0.9744983315467834, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.74498176574707, + "rewards/margins": 16.201250076293945, + "rewards/rejected": -25.946231842041016, + "step": 302 + }, + { + "epoch": 0.45181733457595524, + "grad_norm": 67.63665525502225, + "learning_rate": 6.19842287923314e-07, + "logits/chosen": 0.8895013332366943, + "logits/rejected": 0.7559343576431274, + "logps/chosen": -0.8952937126159668, + "logps/rejected": -3.781191110610962, + "loss": 1.7441, + "nll_loss": 0.8952935934066772, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.952936172485352, + "rewards/margins": 28.858976364135742, + "rewards/rejected": -37.81190872192383, + "step": 303 + }, + { + "epoch": 0.4533084808946878, + "grad_norm": 84.23259859109938, + "learning_rate": 6.174430228904919e-07, + "logits/chosen": 0.8855783343315125, + "logits/rejected": 0.9021013975143433, + "logps/chosen": -0.9263310432434082, + "logps/rejected": -2.732335329055786, + "loss": 1.809, + "nll_loss": 0.9263309836387634, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.263310432434082, + "rewards/margins": 18.060047149658203, + "rewards/rejected": -27.32335662841797, + "step": 304 + }, + { + "epoch": 0.4547996272134203, + "grad_norm": 95.50345921648152, + "learning_rate": 6.150408922821911e-07, + "logits/chosen": 0.833683431148529, + "logits/rejected": 0.7572767734527588, + "logps/chosen": -0.8070122003555298, + "logps/rejected": -1.7145060300827026, + "loss": 2.7335, + "nll_loss": 0.8070122599601746, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.070121765136719, + "rewards/margins": 9.07493782043457, + "rewards/rejected": -17.14505958557129, + "step": 305 + }, + { + "epoch": 0.45629077353215286, + "grad_norm": 47.04220627181392, + "learning_rate": 6.126359547096974e-07, + "logits/chosen": 0.17184323072433472, + "logits/rejected": 0.23235543072223663, + "logps/chosen": -0.9182405471801758, + "logps/rejected": -2.4804115295410156, + "loss": 1.858, + "nll_loss": 0.9182405471801758, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.182406425476074, + "rewards/margins": 15.621709823608398, + "rewards/rejected": -24.804115295410156, + "step": 306 + }, + { + "epoch": 0.45778191985088534, + "grad_norm": 89.25209929634599, + "learning_rate": 6.102282688527859e-07, + "logits/chosen": 1.0285824537277222, + "logits/rejected": 0.3567197620868683, + "logps/chosen": -1.3488802909851074, + "logps/rejected": -2.2083587646484375, + "loss": 2.0896, + "nll_loss": 1.3488802909851074, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.488801956176758, + "rewards/margins": 8.594785690307617, + "rewards/rejected": -22.083589553833008, + "step": 307 + }, + { + "epoch": 0.4592730661696179, + "grad_norm": 47.26521425033619, + "learning_rate": 6.078178934582885e-07, + "logits/chosen": 0.8061944246292114, + "logits/rejected": 1.3660072088241577, + "logps/chosen": -1.383995771408081, + "logps/rejected": -3.055185317993164, + "loss": 2.1618, + "nll_loss": 1.383995771408081, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.839958190917969, + "rewards/margins": 16.711894989013672, + "rewards/rejected": -30.55185317993164, + "step": 308 + }, + { + "epoch": 0.4607642124883504, + "grad_norm": 44.101062459563245, + "learning_rate": 6.054048873386612e-07, + "logits/chosen": 1.240386962890625, + "logits/rejected": 1.2306119203567505, + "logps/chosen": -1.1784520149230957, + "logps/rejected": -3.2531423568725586, + "loss": 1.1368, + "nll_loss": 1.1784520149230957, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.78451919555664, + "rewards/margins": 20.746902465820312, + "rewards/rejected": -32.53142547607422, + "step": 309 + }, + { + "epoch": 0.46225535880708296, + "grad_norm": 147.43759399932563, + "learning_rate": 6.029893093705491e-07, + "logits/chosen": 0.9249796271324158, + "logits/rejected": 0.8558982610702515, + "logps/chosen": -1.0307352542877197, + "logps/rejected": -2.5594286918640137, + "loss": 2.2389, + "nll_loss": 1.0307352542877197, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.307353019714355, + "rewards/margins": 15.286933898925781, + "rewards/rejected": -25.594287872314453, + "step": 310 + }, + { + "epoch": 0.46374650512581544, + "grad_norm": 56.960912871190224, + "learning_rate": 6.005712184933497e-07, + "logits/chosen": 1.5723234415054321, + "logits/rejected": 2.2183189392089844, + "logps/chosen": -1.2421478033065796, + "logps/rejected": -2.286712408065796, + "loss": 1.4243, + "nll_loss": 1.2421478033065796, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.421477317810059, + "rewards/margins": 10.445649147033691, + "rewards/rejected": -22.86712646484375, + "step": 311 + }, + { + "epoch": 0.465237651444548, + "grad_norm": 45.47868786092567, + "learning_rate": 5.981506737077743e-07, + "logits/chosen": 1.2714695930480957, + "logits/rejected": 1.4802864789962769, + "logps/chosen": -0.8951252698898315, + "logps/rejected": -2.4004831314086914, + "loss": 2.2213, + "nll_loss": 0.8951252698898315, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.951252937316895, + "rewards/margins": 15.053577423095703, + "rewards/rejected": -24.00482940673828, + "step": 312 + }, + { + "epoch": 0.4667287977632805, + "grad_norm": 40.21728414149163, + "learning_rate": 5.957277340744094e-07, + "logits/chosen": 0.9764309525489807, + "logits/rejected": 0.9029860496520996, + "logps/chosen": -1.137838363647461, + "logps/rejected": -1.8395271301269531, + "loss": 2.2428, + "nll_loss": 1.137838363647461, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.37838363647461, + "rewards/margins": 7.016888618469238, + "rewards/rejected": -18.39527130126953, + "step": 313 + }, + { + "epoch": 0.46821994408201306, + "grad_norm": 185.20061412223453, + "learning_rate": 5.933024587122745e-07, + "logits/chosen": 1.2819750308990479, + "logits/rejected": 0.6815961003303528, + "logps/chosen": -1.3975375890731812, + "logps/rejected": -2.166456699371338, + "loss": 1.5696, + "nll_loss": 1.3975378274917603, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.97537612915039, + "rewards/margins": 7.689189910888672, + "rewards/rejected": -21.664566040039062, + "step": 314 + }, + { + "epoch": 0.46971109040074555, + "grad_norm": 73.99447495932938, + "learning_rate": 5.908749067973809e-07, + "logits/chosen": 1.1808252334594727, + "logits/rejected": 1.8961601257324219, + "logps/chosen": -1.1919835805892944, + "logps/rejected": -1.9030534029006958, + "loss": 2.9395, + "nll_loss": 1.1919835805892944, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.919836044311523, + "rewards/margins": 7.11069917678833, + "rewards/rejected": -19.030534744262695, + "step": 315 + }, + { + "epoch": 0.4712022367194781, + "grad_norm": 41.91892232926434, + "learning_rate": 5.884451375612865e-07, + "logits/chosen": 0.43265220522880554, + "logits/rejected": 0.669975221157074, + "logps/chosen": -1.172359585762024, + "logps/rejected": -2.715709686279297, + "loss": 2.1396, + "nll_loss": 1.172359585762024, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.723597526550293, + "rewards/margins": 15.43349838256836, + "rewards/rejected": -27.157094955444336, + "step": 316 + }, + { + "epoch": 0.4726933830382106, + "grad_norm": 67.47008277225329, + "learning_rate": 5.860132102896515e-07, + "logits/chosen": 1.0299711227416992, + "logits/rejected": 1.513453483581543, + "logps/chosen": -1.4263699054718018, + "logps/rejected": -3.114851236343384, + "loss": 2.4576, + "nll_loss": 1.4263699054718018, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.26369857788086, + "rewards/margins": 16.884815216064453, + "rewards/rejected": -31.148513793945312, + "step": 317 + }, + { + "epoch": 0.47418452935694316, + "grad_norm": 106.55390409463216, + "learning_rate": 5.835791843207916e-07, + "logits/chosen": 0.5274050235748291, + "logits/rejected": 1.4777504205703735, + "logps/chosen": -1.3521366119384766, + "logps/rejected": -5.964361667633057, + "loss": 2.3519, + "nll_loss": 1.3521366119384766, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.521367073059082, + "rewards/margins": 46.122249603271484, + "rewards/rejected": -59.64361572265625, + "step": 318 + }, + { + "epoch": 0.4756756756756757, + "grad_norm": 52.47560817081044, + "learning_rate": 5.8114311904423e-07, + "logits/chosen": 0.8356415033340454, + "logits/rejected": 1.1021226644515991, + "logps/chosen": -1.2969446182250977, + "logps/rejected": -2.987818956375122, + "loss": 2.0389, + "nll_loss": 1.296944499015808, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.96944808959961, + "rewards/margins": 16.908740997314453, + "rewards/rejected": -29.878189086914062, + "step": 319 + }, + { + "epoch": 0.4771668219944082, + "grad_norm": 44.58696731411979, + "learning_rate": 5.787050738992481e-07, + "logits/chosen": 0.8062883615493774, + "logits/rejected": 1.2388557195663452, + "logps/chosen": -0.9122974276542664, + "logps/rejected": -3.280360221862793, + "loss": 2.3128, + "nll_loss": 0.9122973680496216, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.122974395751953, + "rewards/margins": 23.680631637573242, + "rewards/rejected": -32.80360412597656, + "step": 320 + }, + { + "epoch": 0.4786579683131407, + "grad_norm": 47.624989755164464, + "learning_rate": 5.762651083734362e-07, + "logits/chosen": 1.5454519987106323, + "logits/rejected": 1.7682874202728271, + "logps/chosen": -1.2734403610229492, + "logps/rejected": -2.9317433834075928, + "loss": 1.8263, + "nll_loss": 1.2734405994415283, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.734405517578125, + "rewards/margins": 16.583026885986328, + "rewards/rejected": -29.317432403564453, + "step": 321 + }, + { + "epoch": 0.48014911463187326, + "grad_norm": 62.34214654651794, + "learning_rate": 5.738232820012407e-07, + "logits/chosen": 1.1003084182739258, + "logits/rejected": 1.1037921905517578, + "logps/chosen": -1.0845376253128052, + "logps/rejected": -2.8120718002319336, + "loss": 2.0593, + "nll_loss": 1.0845376253128052, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.845376968383789, + "rewards/margins": 17.275341033935547, + "rewards/rejected": -28.120716094970703, + "step": 322 + }, + { + "epoch": 0.4816402609506058, + "grad_norm": 36.0212815292935, + "learning_rate": 5.713796543625122e-07, + "logits/chosen": 1.975404143333435, + "logits/rejected": 1.3179008960723877, + "logps/chosen": -1.2851284742355347, + "logps/rejected": -2.0051522254943848, + "loss": 1.4676, + "nll_loss": 1.2851283550262451, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.85128402709961, + "rewards/margins": 7.200236797332764, + "rewards/rejected": -20.05152130126953, + "step": 323 + }, + { + "epoch": 0.4831314072693383, + "grad_norm": 87.18844716839915, + "learning_rate": 5.689342850810522e-07, + "logits/chosen": 1.2656850814819336, + "logits/rejected": 1.8935495615005493, + "logps/chosen": -1.0454943180084229, + "logps/rejected": -3.235139846801758, + "loss": 2.5662, + "nll_loss": 1.0454943180084229, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.454943656921387, + "rewards/margins": 21.896453857421875, + "rewards/rejected": -32.35139846801758, + "step": 324 + }, + { + "epoch": 0.4846225535880708, + "grad_norm": 84.9837736807849, + "learning_rate": 5.664872338231571e-07, + "logits/chosen": 1.088675856590271, + "logits/rejected": 1.2341116666793823, + "logps/chosen": -1.3650341033935547, + "logps/rejected": -2.4809322357177734, + "loss": 2.9029, + "nll_loss": 1.3650341033935547, + "rewards/accuracies": 0.75, + "rewards/chosen": -13.650341987609863, + "rewards/margins": 11.158980369567871, + "rewards/rejected": -24.809322357177734, + "step": 325 + }, + { + "epoch": 0.48611369990680336, + "grad_norm": 60.076091304902064, + "learning_rate": 5.640385602961634e-07, + "logits/chosen": 0.5524032115936279, + "logits/rejected": 0.778762936592102, + "logps/chosen": -1.2645853757858276, + "logps/rejected": -4.169212341308594, + "loss": 1.5392, + "nll_loss": 1.264585256576538, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.645853042602539, + "rewards/margins": 29.0462703704834, + "rewards/rejected": -41.69211959838867, + "step": 326 + }, + { + "epoch": 0.4876048462255359, + "grad_norm": 45.87994623387452, + "learning_rate": 5.615883242469905e-07, + "logits/chosen": 0.7510640621185303, + "logits/rejected": 0.7140066623687744, + "logps/chosen": -1.442211627960205, + "logps/rejected": -2.5686025619506836, + "loss": 2.3796, + "nll_loss": 1.4422115087509155, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.422115325927734, + "rewards/margins": 11.263908386230469, + "rewards/rejected": -25.686025619506836, + "step": 327 + }, + { + "epoch": 0.4890959925442684, + "grad_norm": 63.29029676996501, + "learning_rate": 5.591365854606829e-07, + "logits/chosen": 2.1231472492218018, + "logits/rejected": 2.1009750366210938, + "logps/chosen": -0.9882567524909973, + "logps/rejected": -1.7455428838729858, + "loss": 2.2137, + "nll_loss": 0.9882567524909973, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.882567405700684, + "rewards/margins": 7.572861671447754, + "rewards/rejected": -17.455429077148438, + "step": 328 + }, + { + "epoch": 0.4905871388630009, + "grad_norm": 304.2743901322164, + "learning_rate": 5.566834037589511e-07, + "logits/chosen": 2.063725709915161, + "logits/rejected": 2.0055620670318604, + "logps/chosen": -1.1700128316879272, + "logps/rejected": -2.869445323944092, + "loss": 2.0797, + "nll_loss": 1.1700127124786377, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.700127601623535, + "rewards/margins": 16.994325637817383, + "rewards/rejected": -28.694454193115234, + "step": 329 + }, + { + "epoch": 0.49207828518173347, + "grad_norm": 45.53013522811574, + "learning_rate": 5.542288389987128e-07, + "logits/chosen": 0.43271785974502563, + "logits/rejected": 1.0451266765594482, + "logps/chosen": -0.6082872748374939, + "logps/rejected": -2.1641836166381836, + "loss": 2.4139, + "nll_loss": 0.6082872152328491, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.08287239074707, + "rewards/margins": 15.558965682983398, + "rewards/rejected": -21.6418399810791, + "step": 330 + }, + { + "epoch": 0.493569431500466, + "grad_norm": 43.98817680932417, + "learning_rate": 5.517729510706315e-07, + "logits/chosen": 1.076559066772461, + "logits/rejected": 1.4049584865570068, + "logps/chosen": -1.6192044019699097, + "logps/rejected": -2.2700164318084717, + "loss": 2.5434, + "nll_loss": 1.6192045211791992, + "rewards/accuracies": 0.625, + "rewards/chosen": -16.192045211791992, + "rewards/margins": 6.508121013641357, + "rewards/rejected": -22.700164794921875, + "step": 331 + }, + { + "epoch": 0.4950605778191985, + "grad_norm": 55.50922181112932, + "learning_rate": 5.493157998976559e-07, + "logits/chosen": 1.8846677541732788, + "logits/rejected": 2.510406017303467, + "logps/chosen": -1.385533094406128, + "logps/rejected": -3.4347152709960938, + "loss": 1.1139, + "nll_loss": 1.385533094406128, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.855329513549805, + "rewards/margins": 20.491819381713867, + "rewards/rejected": -34.34715270996094, + "step": 332 + }, + { + "epoch": 0.496551724137931, + "grad_norm": 86.0791136849411, + "learning_rate": 5.468574454335574e-07, + "logits/chosen": 1.4486123323440552, + "logits/rejected": 1.5563056468963623, + "logps/chosen": -1.481582760810852, + "logps/rejected": -7.358170509338379, + "loss": 2.5366, + "nll_loss": 1.4815826416015625, + "rewards/accuracies": 0.75, + "rewards/chosen": -14.815827369689941, + "rewards/margins": 58.765869140625, + "rewards/rejected": -73.58169555664062, + "step": 333 + }, + { + "epoch": 0.49804287045666357, + "grad_norm": 275.8773072828461, + "learning_rate": 5.443979476614674e-07, + "logits/chosen": 0.7218424081802368, + "logits/rejected": 1.135964274406433, + "logps/chosen": -1.110957145690918, + "logps/rejected": -3.2192254066467285, + "loss": 2.6062, + "nll_loss": 1.1109572649002075, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.10957145690918, + "rewards/margins": 21.082679748535156, + "rewards/rejected": -32.19225311279297, + "step": 334 + }, + { + "epoch": 0.4995340167753961, + "grad_norm": 70.54195359782109, + "learning_rate": 5.419373665924136e-07, + "logits/chosen": 1.0993138551712036, + "logits/rejected": 1.5738513469696045, + "logps/chosen": -1.3446593284606934, + "logps/rejected": -2.769869327545166, + "loss": 1.7894, + "nll_loss": 1.3446592092514038, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.446593284606934, + "rewards/margins": 14.252100944519043, + "rewards/rejected": -27.698692321777344, + "step": 335 + }, + { + "epoch": 0.5010251630941286, + "grad_norm": 67.12861397994634, + "learning_rate": 5.394757622638559e-07, + "logits/chosen": 1.9011707305908203, + "logits/rejected": 2.1747989654541016, + "logps/chosen": -1.1282011270523071, + "logps/rejected": -1.4653615951538086, + "loss": 2.6, + "nll_loss": 1.1282011270523071, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.282011985778809, + "rewards/margins": 3.3716037273406982, + "rewards/rejected": -14.653615951538086, + "step": 336 + }, + { + "epoch": 0.5025163094128612, + "grad_norm": 51.2055086895284, + "learning_rate": 5.370131947382214e-07, + "logits/chosen": 1.7519341707229614, + "logits/rejected": 1.9194046258926392, + "logps/chosen": -1.4238516092300415, + "logps/rejected": -3.2271335124969482, + "loss": 1.6988, + "nll_loss": 1.423851490020752, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.238516807556152, + "rewards/margins": 18.032821655273438, + "rewards/rejected": -32.27133560180664, + "step": 337 + }, + { + "epoch": 0.5040074557315937, + "grad_norm": 325.67543178955356, + "learning_rate": 5.34549724101439e-07, + "logits/chosen": 1.4702489376068115, + "logits/rejected": 1.1438722610473633, + "logps/chosen": -1.0996648073196411, + "logps/rejected": -1.7886468172073364, + "loss": 1.822, + "nll_loss": 1.0996649265289307, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.996648788452148, + "rewards/margins": 6.889819145202637, + "rewards/rejected": -17.8864688873291, + "step": 338 + }, + { + "epoch": 0.5054986020503262, + "grad_norm": 280.19664588162436, + "learning_rate": 5.32085410461473e-07, + "logits/chosen": 1.8264939785003662, + "logits/rejected": 1.9685840606689453, + "logps/chosen": -1.5246539115905762, + "logps/rejected": -2.096618413925171, + "loss": 3.1637, + "nll_loss": 1.5246539115905762, + "rewards/accuracies": 0.875, + "rewards/chosen": -15.246540069580078, + "rewards/margins": 5.719644546508789, + "rewards/rejected": -20.966184616088867, + "step": 339 + }, + { + "epoch": 0.5069897483690587, + "grad_norm": 76.58759054404746, + "learning_rate": 5.296203139468571e-07, + "logits/chosen": 0.6917173266410828, + "logits/rejected": 1.098940134048462, + "logps/chosen": -1.4546515941619873, + "logps/rejected": -3.8073456287384033, + "loss": 2.1558, + "nll_loss": 1.4546515941619873, + "rewards/accuracies": 0.625, + "rewards/chosen": -14.546515464782715, + "rewards/margins": 23.526941299438477, + "rewards/rejected": -38.073455810546875, + "step": 340 + }, + { + "epoch": 0.5084808946877912, + "grad_norm": 104.9310864210115, + "learning_rate": 5.271544947052266e-07, + "logits/chosen": 1.0520612001419067, + "logits/rejected": 1.0490987300872803, + "logps/chosen": -1.0005167722702026, + "logps/rejected": -1.7621972560882568, + "loss": 2.0453, + "nll_loss": 1.000516653060913, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.005167961120605, + "rewards/margins": 7.616805553436279, + "rewards/rejected": -17.621973037719727, + "step": 341 + }, + { + "epoch": 0.5099720410065237, + "grad_norm": 53.97625383827924, + "learning_rate": 5.246880129018515e-07, + "logits/chosen": 2.2423133850097656, + "logits/rejected": 2.537630558013916, + "logps/chosen": -1.5444772243499756, + "logps/rejected": -2.222283363342285, + "loss": 1.6948, + "nll_loss": 1.5444772243499756, + "rewards/accuracies": 0.625, + "rewards/chosen": -15.444771766662598, + "rewards/margins": 6.778061389923096, + "rewards/rejected": -22.22283363342285, + "step": 342 + }, + { + "epoch": 0.5114631873252563, + "grad_norm": 112.33294253398827, + "learning_rate": 5.222209287181676e-07, + "logits/chosen": 0.6487884521484375, + "logits/rejected": 0.6867713332176208, + "logps/chosen": -1.3492956161499023, + "logps/rejected": -2.9653830528259277, + "loss": 2.5893, + "nll_loss": 1.3492956161499023, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.492956161499023, + "rewards/margins": 16.160873413085938, + "rewards/rejected": -29.653831481933594, + "step": 343 + }, + { + "epoch": 0.5129543336439888, + "grad_norm": 107.60256343611687, + "learning_rate": 5.197533023503089e-07, + "logits/chosen": 0.4045717716217041, + "logits/rejected": 0.7231236696243286, + "logps/chosen": -0.9761478900909424, + "logps/rejected": -2.7345666885375977, + "loss": 2.312, + "nll_loss": 0.9761478900909424, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.761479377746582, + "rewards/margins": 17.584186553955078, + "rewards/rejected": -27.345664978027344, + "step": 344 + }, + { + "epoch": 0.5144454799627214, + "grad_norm": 156.63547061998838, + "learning_rate": 5.172851940076387e-07, + "logits/chosen": 0.7327659130096436, + "logits/rejected": 0.9826483726501465, + "logps/chosen": -1.4324758052825928, + "logps/rejected": -2.6677839756011963, + "loss": 2.8925, + "nll_loss": 1.4324758052825928, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.324756622314453, + "rewards/margins": 12.353079795837402, + "rewards/rejected": -26.677839279174805, + "step": 345 + }, + { + "epoch": 0.5159366262814539, + "grad_norm": 129.0147341187787, + "learning_rate": 5.148166639112799e-07, + "logits/chosen": 0.9722475409507751, + "logits/rejected": 0.9801483154296875, + "logps/chosen": -1.194012999534607, + "logps/rejected": -2.0182580947875977, + "loss": 2.5496, + "nll_loss": 1.1940131187438965, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.940130233764648, + "rewards/margins": 8.242452621459961, + "rewards/rejected": -20.18258285522461, + "step": 346 + }, + { + "epoch": 0.5174277726001864, + "grad_norm": 41.688227996168365, + "learning_rate": 5.123477722926464e-07, + "logits/chosen": 1.3067666292190552, + "logits/rejected": 2.00052809715271, + "logps/chosen": -1.4092761278152466, + "logps/rejected": -3.3266148567199707, + "loss": 2.2764, + "nll_loss": 1.4092758893966675, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.092761993408203, + "rewards/margins": 19.17338752746582, + "rewards/rejected": -33.266151428222656, + "step": 347 + }, + { + "epoch": 0.518918918918919, + "grad_norm": 40.993761190079624, + "learning_rate": 5.098785793919732e-07, + "logits/chosen": 1.0896856784820557, + "logits/rejected": 1.4323949813842773, + "logps/chosen": -1.5167326927185059, + "logps/rejected": -2.3191676139831543, + "loss": 2.6912, + "nll_loss": 1.5167325735092163, + "rewards/accuracies": 0.75, + "rewards/chosen": -15.167327880859375, + "rewards/margins": 8.024349212646484, + "rewards/rejected": -23.19167709350586, + "step": 348 + }, + { + "epoch": 0.5204100652376514, + "grad_norm": 25.50126534337896, + "learning_rate": 5.074091454568463e-07, + "logits/chosen": 1.669668436050415, + "logits/rejected": 2.283931255340576, + "logps/chosen": -1.0836777687072754, + "logps/rejected": -2.233471632003784, + "loss": 1.4233, + "nll_loss": 1.0836777687072754, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.836777687072754, + "rewards/margins": 11.497940063476562, + "rewards/rejected": -22.334716796875, + "step": 349 + }, + { + "epoch": 0.5219012115563839, + "grad_norm": 50.976717957780835, + "learning_rate": 5.049395307407328e-07, + "logits/chosen": 0.9089032411575317, + "logits/rejected": 0.991895854473114, + "logps/chosen": -1.0036165714263916, + "logps/rejected": -1.5741000175476074, + "loss": 1.6741, + "nll_loss": 1.0036165714263916, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.036165237426758, + "rewards/margins": 5.704835414886475, + "rewards/rejected": -15.741000175476074, + "step": 350 + }, + { + "epoch": 0.5233923578751165, + "grad_norm": 56.51327778810148, + "learning_rate": 5.024697955015111e-07, + "logits/chosen": 1.0180648565292358, + "logits/rejected": 1.2536265850067139, + "logps/chosen": -1.1623241901397705, + "logps/rejected": -1.6183818578720093, + "loss": 2.5333, + "nll_loss": 1.162324070930481, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.623241424560547, + "rewards/margins": 4.560578346252441, + "rewards/rejected": -16.183818817138672, + "step": 351 + }, + { + "epoch": 0.524883504193849, + "grad_norm": 49.29582711152062, + "learning_rate": 5e-07, + "logits/chosen": 0.0769793838262558, + "logits/rejected": 1.3817670345306396, + "logps/chosen": -1.8828178644180298, + "logps/rejected": -3.151564121246338, + "loss": 2.696, + "nll_loss": 1.8828177452087402, + "rewards/accuracies": 0.75, + "rewards/chosen": -18.828176498413086, + "rewards/margins": 12.687463760375977, + "rewards/rejected": -31.515642166137695, + "step": 352 + }, + { + "epoch": 0.5263746505125816, + "grad_norm": 127.45507991470484, + "learning_rate": 4.975302044984888e-07, + "logits/chosen": 1.6833350658416748, + "logits/rejected": 1.887645959854126, + "logps/chosen": -0.7672575116157532, + "logps/rejected": -1.969806432723999, + "loss": 3.2517, + "nll_loss": 0.7672575116157532, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.6725754737854, + "rewards/margins": 12.025486946105957, + "rewards/rejected": -19.698062896728516, + "step": 353 + }, + { + "epoch": 0.5278657968313141, + "grad_norm": 43.674234447941004, + "learning_rate": 4.950604692592673e-07, + "logits/chosen": 1.0634629726409912, + "logits/rejected": 1.2298520803451538, + "logps/chosen": -1.300775170326233, + "logps/rejected": -2.43308424949646, + "loss": 1.3793, + "nll_loss": 1.300775170326233, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.007752418518066, + "rewards/margins": 11.323091506958008, + "rewards/rejected": -24.330841064453125, + "step": 354 + }, + { + "epoch": 0.5293569431500466, + "grad_norm": 52.989957684512696, + "learning_rate": 4.925908545431537e-07, + "logits/chosen": 1.6194576025009155, + "logits/rejected": 2.248783588409424, + "logps/chosen": -1.4075263738632202, + "logps/rejected": -2.3023507595062256, + "loss": 1.6829, + "nll_loss": 1.4075263738632202, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.075263977050781, + "rewards/margins": 8.94824504852295, + "rewards/rejected": -23.023508071899414, + "step": 355 + }, + { + "epoch": 0.5308480894687791, + "grad_norm": 73.36737835982937, + "learning_rate": 4.901214206080268e-07, + "logits/chosen": 1.199830174446106, + "logits/rejected": 1.2315750122070312, + "logps/chosen": -1.0550888776779175, + "logps/rejected": -2.9333043098449707, + "loss": 1.3172, + "nll_loss": 1.055088758468628, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.550888061523438, + "rewards/margins": 18.782155990600586, + "rewards/rejected": -29.333044052124023, + "step": 356 + }, + { + "epoch": 0.5323392357875116, + "grad_norm": 122.00666595480716, + "learning_rate": 4.876522277073534e-07, + "logits/chosen": 0.8202993869781494, + "logits/rejected": 1.3249539136886597, + "logps/chosen": -1.5087026357650757, + "logps/rejected": -2.674197196960449, + "loss": 2.4716, + "nll_loss": 1.5087026357650757, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.087026596069336, + "rewards/margins": 11.654947280883789, + "rewards/rejected": -26.741973876953125, + "step": 357 + }, + { + "epoch": 0.5338303821062442, + "grad_norm": 246.76773763488845, + "learning_rate": 4.851833360887201e-07, + "logits/chosen": 0.8156963586807251, + "logits/rejected": 1.3810663223266602, + "logps/chosen": -1.6010940074920654, + "logps/rejected": -2.600064754486084, + "loss": 2.0726, + "nll_loss": 1.6010942459106445, + "rewards/accuracies": 0.75, + "rewards/chosen": -16.010940551757812, + "rewards/margins": 9.989707946777344, + "rewards/rejected": -26.00065040588379, + "step": 358 + }, + { + "epoch": 0.5353215284249767, + "grad_norm": 35.91044102378384, + "learning_rate": 4.827148059923613e-07, + "logits/chosen": 0.42476320266723633, + "logits/rejected": 0.9761238694190979, + "logps/chosen": -1.138462781906128, + "logps/rejected": -2.1033501625061035, + "loss": 1.8047, + "nll_loss": 1.138462781906128, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.384627342224121, + "rewards/margins": 9.648874282836914, + "rewards/rejected": -21.03350067138672, + "step": 359 + }, + { + "epoch": 0.5368126747437092, + "grad_norm": 95.29465790699732, + "learning_rate": 4.802466976496911e-07, + "logits/chosen": 0.7101988196372986, + "logits/rejected": 0.693535566329956, + "logps/chosen": -1.7004222869873047, + "logps/rejected": -4.368194103240967, + "loss": 1.8352, + "nll_loss": 1.7004221677780151, + "rewards/accuracies": 0.75, + "rewards/chosen": -17.004222869873047, + "rewards/margins": 26.677719116210938, + "rewards/rejected": -43.681941986083984, + "step": 360 + }, + { + "epoch": 0.5383038210624418, + "grad_norm": 76.17525593551915, + "learning_rate": 4.777790712818323e-07, + "logits/chosen": 1.3881099224090576, + "logits/rejected": 1.3718510866165161, + "logps/chosen": -1.3567442893981934, + "logps/rejected": -1.6875207424163818, + "loss": 2.2402, + "nll_loss": 1.356744408607483, + "rewards/accuracies": 0.75, + "rewards/chosen": -13.56744384765625, + "rewards/margins": 3.3077638149261475, + "rewards/rejected": -16.875205993652344, + "step": 361 + }, + { + "epoch": 0.5397949673811743, + "grad_norm": 89.06386872504564, + "learning_rate": 4.753119870981485e-07, + "logits/chosen": 1.8389939069747925, + "logits/rejected": 1.5736289024353027, + "logps/chosen": -1.0070418119430542, + "logps/rejected": -2.262326240539551, + "loss": 2.3572, + "nll_loss": 1.0070418119430542, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.070418357849121, + "rewards/margins": 12.55284309387207, + "rewards/rejected": -22.623260498046875, + "step": 362 + }, + { + "epoch": 0.5412861136999068, + "grad_norm": 36.98114107146971, + "learning_rate": 4.728455052947732e-07, + "logits/chosen": 0.567602813243866, + "logits/rejected": 1.1926748752593994, + "logps/chosen": -1.1497440338134766, + "logps/rejected": -3.3005409240722656, + "loss": 1.661, + "nll_loss": 1.1497440338134766, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.497440338134766, + "rewards/margins": 21.50796890258789, + "rewards/rejected": -33.005409240722656, + "step": 363 + }, + { + "epoch": 0.5427772600186394, + "grad_norm": 57.16827451171126, + "learning_rate": 4.703796860531429e-07, + "logits/chosen": 1.8566818237304688, + "logits/rejected": 1.505059838294983, + "logps/chosen": -1.3204902410507202, + "logps/rejected": -2.035156726837158, + "loss": 1.8646, + "nll_loss": 1.3204902410507202, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.204903602600098, + "rewards/margins": 7.146665096282959, + "rewards/rejected": -20.351566314697266, + "step": 364 + }, + { + "epoch": 0.5442684063373718, + "grad_norm": 62.435237140372806, + "learning_rate": 4.679145895385269e-07, + "logits/chosen": 0.8371939659118652, + "logits/rejected": 1.0040203332901, + "logps/chosen": -1.4417243003845215, + "logps/rejected": -2.8484854698181152, + "loss": 1.8472, + "nll_loss": 1.4417245388031006, + "rewards/accuracies": 0.75, + "rewards/chosen": -14.417244911193848, + "rewards/margins": 14.067611694335938, + "rewards/rejected": -28.48485565185547, + "step": 365 + }, + { + "epoch": 0.5457595526561044, + "grad_norm": 115.67435377041399, + "learning_rate": 4.6545027589856105e-07, + "logits/chosen": 1.0038310289382935, + "logits/rejected": 1.2312933206558228, + "logps/chosen": -0.9938388466835022, + "logps/rejected": -2.520301342010498, + "loss": 1.9517, + "nll_loss": 0.9938388466835022, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.93838882446289, + "rewards/margins": 15.264625549316406, + "rewards/rejected": -25.203014373779297, + "step": 366 + }, + { + "epoch": 0.5472506989748369, + "grad_norm": 59.543682048827165, + "learning_rate": 4.6298680526177855e-07, + "logits/chosen": 1.9276726245880127, + "logits/rejected": 1.532571792602539, + "logps/chosen": -1.876322865486145, + "logps/rejected": -2.3194832801818848, + "loss": 2.5512, + "nll_loss": 1.876322865486145, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.763229370117188, + "rewards/margins": 4.43160343170166, + "rewards/rejected": -23.194833755493164, + "step": 367 + }, + { + "epoch": 0.5487418452935694, + "grad_norm": 35.52641208148878, + "learning_rate": 4.60524237736144e-07, + "logits/chosen": 1.6724348068237305, + "logits/rejected": 1.3174070119857788, + "logps/chosen": -1.13054358959198, + "logps/rejected": -2.897552490234375, + "loss": 2.0898, + "nll_loss": 1.1305434703826904, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.305435180664062, + "rewards/margins": 17.670085906982422, + "rewards/rejected": -28.975521087646484, + "step": 368 + }, + { + "epoch": 0.550232991612302, + "grad_norm": 39.92830775665279, + "learning_rate": 4.5806263340758636e-07, + "logits/chosen": 2.316521167755127, + "logits/rejected": 2.480522871017456, + "logps/chosen": -1.6596410274505615, + "logps/rejected": -2.200403928756714, + "loss": 2.3369, + "nll_loss": 1.6596410274505615, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.596410751342773, + "rewards/margins": 5.407629013061523, + "rewards/rejected": -22.004037857055664, + "step": 369 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 60.985171266996204, + "learning_rate": 4.556020523385326e-07, + "logits/chosen": 1.0311435461044312, + "logits/rejected": 1.2425912618637085, + "logps/chosen": -1.3279130458831787, + "logps/rejected": -2.388808012008667, + "loss": 2.5089, + "nll_loss": 1.3279130458831787, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.279131889343262, + "rewards/margins": 10.6089506149292, + "rewards/rejected": -23.888080596923828, + "step": 370 + }, + { + "epoch": 0.553215284249767, + "grad_norm": 59.15727767457908, + "learning_rate": 4.531425545664425e-07, + "logits/chosen": 0.4395480751991272, + "logits/rejected": 0.9258232712745667, + "logps/chosen": -0.7140366435050964, + "logps/rejected": -3.106039524078369, + "loss": 2.1409, + "nll_loss": 0.7140365839004517, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.140366554260254, + "rewards/margins": 23.92003059387207, + "rewards/rejected": -31.060396194458008, + "step": 371 + }, + { + "epoch": 0.5547064305684996, + "grad_norm": 113.41652725050223, + "learning_rate": 4.5068420010234413e-07, + "logits/chosen": 1.692410945892334, + "logits/rejected": 1.9226614236831665, + "logps/chosen": -1.5776925086975098, + "logps/rejected": -2.640813112258911, + "loss": 2.5214, + "nll_loss": 1.5776923894882202, + "rewards/accuracies": 0.75, + "rewards/chosen": -15.776925086975098, + "rewards/margins": 10.631204605102539, + "rewards/rejected": -26.408130645751953, + "step": 372 + }, + { + "epoch": 0.556197576887232, + "grad_norm": 42.52993255604326, + "learning_rate": 4.482270489293685e-07, + "logits/chosen": 1.9083011150360107, + "logits/rejected": 2.202073574066162, + "logps/chosen": -1.2748469114303589, + "logps/rejected": -3.1168975830078125, + "loss": 1.9004, + "nll_loss": 1.2748469114303589, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.748468399047852, + "rewards/margins": 18.420507431030273, + "rewards/rejected": -31.168975830078125, + "step": 373 + }, + { + "epoch": 0.5576887232059646, + "grad_norm": 45.99191086143298, + "learning_rate": 4.457711610012873e-07, + "logits/chosen": 1.3463859558105469, + "logits/rejected": 2.0031678676605225, + "logps/chosen": -1.2941997051239014, + "logps/rejected": -2.7461254596710205, + "loss": 2.0979, + "nll_loss": 1.2941997051239014, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.941995620727539, + "rewards/margins": 14.519258499145508, + "rewards/rejected": -27.461254119873047, + "step": 374 + }, + { + "epoch": 0.5591798695246971, + "grad_norm": 65.97353088513727, + "learning_rate": 4.4331659624104876e-07, + "logits/chosen": 1.1032230854034424, + "logits/rejected": 0.8625024557113647, + "logps/chosen": -1.2884620428085327, + "logps/rejected": -2.2827272415161133, + "loss": 2.5707, + "nll_loss": 1.2884619235992432, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.88461971282959, + "rewards/margins": 9.94265365600586, + "rewards/rejected": -22.827274322509766, + "step": 375 + }, + { + "epoch": 0.5606710158434296, + "grad_norm": 63.093501055358686, + "learning_rate": 4.4086341453931714e-07, + "logits/chosen": 0.72505784034729, + "logits/rejected": 1.518155813217163, + "logps/chosen": -1.6150670051574707, + "logps/rejected": -2.603109359741211, + "loss": 3.561, + "nll_loss": 1.6150668859481812, + "rewards/accuracies": 0.625, + "rewards/chosen": -16.15066909790039, + "rewards/margins": 9.880425453186035, + "rewards/rejected": -26.03109359741211, + "step": 376 + }, + { + "epoch": 0.5621621621621622, + "grad_norm": 45.300199929904046, + "learning_rate": 4.3841167575300933e-07, + "logits/chosen": 0.9565849900245667, + "logits/rejected": 1.066584587097168, + "logps/chosen": -1.4316563606262207, + "logps/rejected": -1.8951045274734497, + "loss": 1.5506, + "nll_loss": 1.4316564798355103, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.316564559936523, + "rewards/margins": 4.634482383728027, + "rewards/rejected": -18.951045989990234, + "step": 377 + }, + { + "epoch": 0.5636533084808947, + "grad_norm": 67.19431471811166, + "learning_rate": 4.359614397038366e-07, + "logits/chosen": 1.5544148683547974, + "logits/rejected": 1.700253963470459, + "logps/chosen": -1.516951560974121, + "logps/rejected": -1.8979204893112183, + "loss": 2.6884, + "nll_loss": 1.516951560974121, + "rewards/accuracies": 0.875, + "rewards/chosen": -15.169515609741211, + "rewards/margins": 3.809689521789551, + "rewards/rejected": -18.979204177856445, + "step": 378 + }, + { + "epoch": 0.5651444547996272, + "grad_norm": 50.18842054638841, + "learning_rate": 4.3351276617684285e-07, + "logits/chosen": 1.82659113407135, + "logits/rejected": 2.0349600315093994, + "logps/chosen": -1.2060390710830688, + "logps/rejected": -2.184781551361084, + "loss": 1.1838, + "nll_loss": 1.2060391902923584, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.060392379760742, + "rewards/margins": 9.787421226501465, + "rewards/rejected": -21.84781265258789, + "step": 379 + }, + { + "epoch": 0.5666356011183598, + "grad_norm": 100.82150396227959, + "learning_rate": 4.310657149189478e-07, + "logits/chosen": 0.8955983519554138, + "logits/rejected": 1.8611811399459839, + "logps/chosen": -1.1403526067733765, + "logps/rejected": -2.4114925861358643, + "loss": 2.1743, + "nll_loss": 1.1403526067733765, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.403526306152344, + "rewards/margins": 12.71139907836914, + "rewards/rejected": -24.114925384521484, + "step": 380 + }, + { + "epoch": 0.5681267474370922, + "grad_norm": 49.37574167209818, + "learning_rate": 4.2862034563748765e-07, + "logits/chosen": 1.445024847984314, + "logits/rejected": 1.0723494291305542, + "logps/chosen": -1.2697113752365112, + "logps/rejected": -2.8943827152252197, + "loss": 1.8205, + "nll_loss": 1.2697113752365112, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.697113990783691, + "rewards/margins": 16.246713638305664, + "rewards/rejected": -28.94382667541504, + "step": 381 + }, + { + "epoch": 0.5696178937558248, + "grad_norm": 71.69580673521774, + "learning_rate": 4.2617671799875944e-07, + "logits/chosen": 1.3698145151138306, + "logits/rejected": 1.2572070360183716, + "logps/chosen": -1.0628803968429565, + "logps/rejected": -1.848346471786499, + "loss": 2.5224, + "nll_loss": 1.0628806352615356, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.628803253173828, + "rewards/margins": 7.854660511016846, + "rewards/rejected": -18.48346710205078, + "step": 382 + }, + { + "epoch": 0.5711090400745573, + "grad_norm": 49.57487875570872, + "learning_rate": 4.237348916265637e-07, + "logits/chosen": 0.9029265642166138, + "logits/rejected": 1.0498796701431274, + "logps/chosen": -0.828253984451294, + "logps/rejected": -2.1390380859375, + "loss": 1.2875, + "nll_loss": 0.8282539248466492, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.282539367675781, + "rewards/margins": 13.107840538024902, + "rewards/rejected": -21.390378952026367, + "step": 383 + }, + { + "epoch": 0.5726001863932898, + "grad_norm": 47.192019525125666, + "learning_rate": 4.2129492610075183e-07, + "logits/chosen": 1.824859619140625, + "logits/rejected": 1.5693551301956177, + "logps/chosen": -1.3151293992996216, + "logps/rejected": -1.891690731048584, + "loss": 2.0956, + "nll_loss": 1.3151293992996216, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.151294708251953, + "rewards/margins": 5.765612602233887, + "rewards/rejected": -18.916908264160156, + "step": 384 + }, + { + "epoch": 0.5740913327120224, + "grad_norm": 68.62274734978858, + "learning_rate": 4.1885688095577e-07, + "logits/chosen": 0.8779973387718201, + "logits/rejected": 1.3293192386627197, + "logps/chosen": -0.8747768402099609, + "logps/rejected": -2.0907740592956543, + "loss": 1.6596, + "nll_loss": 0.8747768402099609, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.74776840209961, + "rewards/margins": 12.15997314453125, + "rewards/rejected": -20.90774154663086, + "step": 385 + }, + { + "epoch": 0.5755824790307549, + "grad_norm": 27.63739304721906, + "learning_rate": 4.164208156792084e-07, + "logits/chosen": 0.765286922454834, + "logits/rejected": 1.0338056087493896, + "logps/chosen": -0.9056370258331299, + "logps/rejected": -1.8033547401428223, + "loss": 2.1928, + "nll_loss": 0.9056369066238403, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.05636978149414, + "rewards/margins": 8.977177619934082, + "rewards/rejected": -18.033546447753906, + "step": 386 + }, + { + "epoch": 0.5770736253494875, + "grad_norm": 36.881961688683596, + "learning_rate": 4.139867897103484e-07, + "logits/chosen": 0.45405313372612, + "logits/rejected": 0.6241415739059448, + "logps/chosen": -1.109076976776123, + "logps/rejected": -2.5582451820373535, + "loss": 1.6022, + "nll_loss": 1.109076976776123, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.090768814086914, + "rewards/margins": 14.491682052612305, + "rewards/rejected": -25.58245086669922, + "step": 387 + }, + { + "epoch": 0.57856477166822, + "grad_norm": 55.64832356660189, + "learning_rate": 4.1155486243871363e-07, + "logits/chosen": 1.1602576971054077, + "logits/rejected": 1.5388338565826416, + "logps/chosen": -1.3016027212142944, + "logps/rejected": -2.1892929077148438, + "loss": 2.5378, + "nll_loss": 1.3016026020050049, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.016027450561523, + "rewards/margins": 8.876900672912598, + "rewards/rejected": -21.892927169799805, + "step": 388 + }, + { + "epoch": 0.5800559179869524, + "grad_norm": 35.55429599551597, + "learning_rate": 4.091250932026191e-07, + "logits/chosen": 0.9565999507904053, + "logits/rejected": 1.5665593147277832, + "logps/chosen": -1.0186738967895508, + "logps/rejected": -2.1096742153167725, + "loss": 1.8503, + "nll_loss": 1.0186740159988403, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.186738967895508, + "rewards/margins": 10.910001754760742, + "rewards/rejected": -21.09674072265625, + "step": 389 + }, + { + "epoch": 0.581547064305685, + "grad_norm": 69.06322291465483, + "learning_rate": 4.066975412877255e-07, + "logits/chosen": 0.36940881609916687, + "logits/rejected": 0.5106832385063171, + "logps/chosen": -1.763047695159912, + "logps/rejected": -2.7701592445373535, + "loss": 1.7896, + "nll_loss": 1.763047695159912, + "rewards/accuracies": 0.875, + "rewards/chosen": -17.630477905273438, + "rewards/margins": 10.071113586425781, + "rewards/rejected": -27.701589584350586, + "step": 390 + }, + { + "epoch": 0.5830382106244175, + "grad_norm": 54.823494430910216, + "learning_rate": 4.042722659255906e-07, + "logits/chosen": 1.4899789094924927, + "logits/rejected": 1.5993945598602295, + "logps/chosen": -1.0938667058944702, + "logps/rejected": -2.578829050064087, + "loss": 2.1689, + "nll_loss": 1.0938668251037598, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.938667297363281, + "rewards/margins": 14.849624633789062, + "rewards/rejected": -25.788291931152344, + "step": 391 + }, + { + "epoch": 0.58452935694315, + "grad_norm": 59.87078190391284, + "learning_rate": 4.0184932629222574e-07, + "logits/chosen": 1.8408669233322144, + "logits/rejected": 1.8936665058135986, + "logps/chosen": -0.9872679114341736, + "logps/rejected": -5.953817367553711, + "loss": 2.0431, + "nll_loss": 0.9872679710388184, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.87267780303955, + "rewards/margins": 49.665489196777344, + "rewards/rejected": -59.538169860839844, + "step": 392 + }, + { + "epoch": 0.5860205032618826, + "grad_norm": 99.10506654477443, + "learning_rate": 3.9942878150665027e-07, + "logits/chosen": 1.2618341445922852, + "logits/rejected": 1.1964116096496582, + "logps/chosen": -1.6831088066101074, + "logps/rejected": -2.680295467376709, + "loss": 2.4551, + "nll_loss": 1.683108925819397, + "rewards/accuracies": 0.625, + "rewards/chosen": -16.83108901977539, + "rewards/margins": 9.971864700317383, + "rewards/rejected": -26.802955627441406, + "step": 393 + }, + { + "epoch": 0.5875116495806151, + "grad_norm": 69.95462863715613, + "learning_rate": 3.970106906294509e-07, + "logits/chosen": 1.3020118474960327, + "logits/rejected": 0.9566605091094971, + "logps/chosen": -1.1724271774291992, + "logps/rejected": -1.895683765411377, + "loss": 2.4444, + "nll_loss": 1.1724271774291992, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.72426986694336, + "rewards/margins": 7.232565879821777, + "rewards/rejected": -18.956836700439453, + "step": 394 + }, + { + "epoch": 0.5890027958993477, + "grad_norm": 72.28018046630525, + "learning_rate": 3.945951126613387e-07, + "logits/chosen": 1.7904846668243408, + "logits/rejected": 1.971991777420044, + "logps/chosen": -1.2904613018035889, + "logps/rejected": -2.2912511825561523, + "loss": 1.9116, + "nll_loss": 1.2904613018035889, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.904610633850098, + "rewards/margins": 10.007899284362793, + "rewards/rejected": -22.91250991821289, + "step": 395 + }, + { + "epoch": 0.5904939422180802, + "grad_norm": 64.6520711455083, + "learning_rate": 3.921821065417116e-07, + "logits/chosen": 0.5514054298400879, + "logits/rejected": 0.4734039306640625, + "logps/chosen": -1.5220355987548828, + "logps/rejected": -2.2492659091949463, + "loss": 2.0462, + "nll_loss": 1.522035837173462, + "rewards/accuracies": 0.875, + "rewards/chosen": -15.220356941223145, + "rewards/margins": 7.272302150726318, + "rewards/rejected": -22.492658615112305, + "step": 396 + }, + { + "epoch": 0.5919850885368126, + "grad_norm": 74.35865053377688, + "learning_rate": 3.89771731147214e-07, + "logits/chosen": 0.790753960609436, + "logits/rejected": 0.9726923108100891, + "logps/chosen": -1.4724862575531006, + "logps/rejected": -1.5546221733093262, + "loss": 2.1503, + "nll_loss": 1.4724860191345215, + "rewards/accuracies": 0.5, + "rewards/chosen": -14.724863052368164, + "rewards/margins": 0.8213585615158081, + "rewards/rejected": -15.546221733093262, + "step": 397 + }, + { + "epoch": 0.5934762348555452, + "grad_norm": 54.58234848469142, + "learning_rate": 3.8736404529030255e-07, + "logits/chosen": 0.807062029838562, + "logits/rejected": 1.2913286685943604, + "logps/chosen": -1.2444322109222412, + "logps/rejected": -2.2048096656799316, + "loss": 1.4848, + "nll_loss": 1.2444320917129517, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.44432258605957, + "rewards/margins": 9.60377311706543, + "rewards/rejected": -22.048095703125, + "step": 398 + }, + { + "epoch": 0.5949673811742777, + "grad_norm": 88.92242648972613, + "learning_rate": 3.8495910771780893e-07, + "logits/chosen": 1.281713843345642, + "logits/rejected": 1.7675105333328247, + "logps/chosen": -1.7222932577133179, + "logps/rejected": -2.6808714866638184, + "loss": 3.014, + "nll_loss": 1.7222931385040283, + "rewards/accuracies": 0.625, + "rewards/chosen": -17.222932815551758, + "rewards/margins": 9.585779190063477, + "rewards/rejected": -26.808712005615234, + "step": 399 + }, + { + "epoch": 0.5964585274930102, + "grad_norm": 43.757364321920235, + "learning_rate": 3.825569771095082e-07, + "logits/chosen": 1.114524483680725, + "logits/rejected": 1.375151515007019, + "logps/chosen": -1.072527527809143, + "logps/rejected": -3.058666467666626, + "loss": 1.7625, + "nll_loss": 1.0725274085998535, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.725275993347168, + "rewards/margins": 19.861387252807617, + "rewards/rejected": -30.5866641998291, + "step": 400 + }, + { + "epoch": 0.5979496738117428, + "grad_norm": 53.13995705660573, + "learning_rate": 3.801577120766859e-07, + "logits/chosen": 1.2310510873794556, + "logits/rejected": 1.5479485988616943, + "logps/chosen": -1.18758225440979, + "logps/rejected": -2.707247018814087, + "loss": 2.4506, + "nll_loss": 1.18758225440979, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.875821113586426, + "rewards/margins": 15.196648597717285, + "rewards/rejected": -27.072471618652344, + "step": 401 + }, + { + "epoch": 0.5994408201304753, + "grad_norm": 59.186371030115765, + "learning_rate": 3.777613711607087e-07, + "logits/chosen": 0.6153742074966431, + "logits/rejected": 1.1276042461395264, + "logps/chosen": -1.373226523399353, + "logps/rejected": -2.5880839824676514, + "loss": 1.3769, + "nll_loss": 1.373226523399353, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.732264518737793, + "rewards/margins": 12.148576736450195, + "rewards/rejected": -25.880840301513672, + "step": 402 + }, + { + "epoch": 0.6009319664492079, + "grad_norm": 34.158229059058655, + "learning_rate": 3.753680128315952e-07, + "logits/chosen": -0.09757497161626816, + "logits/rejected": 0.2057550698518753, + "logps/chosen": -1.454146385192871, + "logps/rejected": -3.5339365005493164, + "loss": 1.6317, + "nll_loss": 1.454146385192871, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.541465759277344, + "rewards/margins": 20.797897338867188, + "rewards/rejected": -35.33936309814453, + "step": 403 + }, + { + "epoch": 0.6024231127679404, + "grad_norm": 43.757133641560635, + "learning_rate": 3.7297769548659046e-07, + "logits/chosen": 1.29662024974823, + "logits/rejected": 1.9946157932281494, + "logps/chosen": -1.143658995628357, + "logps/rejected": -2.533445119857788, + "loss": 2.3373, + "nll_loss": 1.143658995628357, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.436590194702148, + "rewards/margins": 13.897862434387207, + "rewards/rejected": -25.334453582763672, + "step": 404 + }, + { + "epoch": 0.6039142590866728, + "grad_norm": 68.84761989778333, + "learning_rate": 3.7059047744873955e-07, + "logits/chosen": 0.7742293477058411, + "logits/rejected": 0.8148295879364014, + "logps/chosen": -0.9669423699378967, + "logps/rejected": -2.176255702972412, + "loss": 2.4883, + "nll_loss": 0.9669424295425415, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.669424057006836, + "rewards/margins": 12.093133926391602, + "rewards/rejected": -21.762556076049805, + "step": 405 + }, + { + "epoch": 0.6054054054054054, + "grad_norm": 72.63496095280603, + "learning_rate": 3.6820641696546627e-07, + "logits/chosen": 1.5729310512542725, + "logits/rejected": 1.7845934629440308, + "logps/chosen": -1.5630098581314087, + "logps/rejected": -3.274157762527466, + "loss": 1.5419, + "nll_loss": 1.5630098581314087, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.63010025024414, + "rewards/margins": 17.111478805541992, + "rewards/rejected": -32.741580963134766, + "step": 406 + }, + { + "epoch": 0.6068965517241379, + "grad_norm": 149.93946543476252, + "learning_rate": 3.6582557220714997e-07, + "logits/chosen": 0.264228880405426, + "logits/rejected": 1.0309929847717285, + "logps/chosen": -1.0704864263534546, + "logps/rejected": -3.3166284561157227, + "loss": 1.7656, + "nll_loss": 1.0704864263534546, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.704864501953125, + "rewards/margins": 22.461421966552734, + "rewards/rejected": -33.166290283203125, + "step": 407 + }, + { + "epoch": 0.6083876980428704, + "grad_norm": 70.07278408398058, + "learning_rate": 3.634480012657084e-07, + "logits/chosen": 1.3579182624816895, + "logits/rejected": 1.3562121391296387, + "logps/chosen": -1.4525789022445679, + "logps/rejected": -2.9784724712371826, + "loss": 1.872, + "nll_loss": 1.4525790214538574, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.525790214538574, + "rewards/margins": 15.25893783569336, + "rewards/rejected": -29.784727096557617, + "step": 408 + }, + { + "epoch": 0.609878844361603, + "grad_norm": 215.56266336124844, + "learning_rate": 3.610737621531781e-07, + "logits/chosen": 1.726470947265625, + "logits/rejected": 2.071410655975342, + "logps/chosen": -1.3145530223846436, + "logps/rejected": -2.4200377464294434, + "loss": 1.8869, + "nll_loss": 1.314553141593933, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.145530700683594, + "rewards/margins": 11.05484676361084, + "rewards/rejected": -24.20037841796875, + "step": 409 + }, + { + "epoch": 0.6113699906803355, + "grad_norm": 48.49475915636012, + "learning_rate": 3.587029128003006e-07, + "logits/chosen": 0.8173800110816956, + "logits/rejected": 0.923591673374176, + "logps/chosen": -1.4306600093841553, + "logps/rejected": -3.322990894317627, + "loss": 2.6237, + "nll_loss": 1.4306602478027344, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.306600570678711, + "rewards/margins": 18.92330551147461, + "rewards/rejected": -33.22990798950195, + "step": 410 + }, + { + "epoch": 0.6128611369990681, + "grad_norm": 54.00669496137651, + "learning_rate": 3.5633551105510806e-07, + "logits/chosen": 0.9530036449432373, + "logits/rejected": 0.9924337863922119, + "logps/chosen": -1.5625369548797607, + "logps/rejected": -2.6102585792541504, + "loss": 2.5014, + "nll_loss": 1.5625371932983398, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.625370025634766, + "rewards/margins": 10.477216720581055, + "rewards/rejected": -26.102584838867188, + "step": 411 + }, + { + "epoch": 0.6143522833178006, + "grad_norm": 45.648147784415954, + "learning_rate": 3.5397161468151214e-07, + "logits/chosen": 1.569969654083252, + "logits/rejected": 1.1461650133132935, + "logps/chosen": -1.364423155784607, + "logps/rejected": -2.5526044368743896, + "loss": 1.6183, + "nll_loss": 1.3644229173660278, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.644231796264648, + "rewards/margins": 11.881815910339355, + "rewards/rejected": -25.526046752929688, + "step": 412 + }, + { + "epoch": 0.615843429636533, + "grad_norm": 61.258417482495204, + "learning_rate": 3.516112813578941e-07, + "logits/chosen": 0.2635102868080139, + "logits/rejected": 0.8463178277015686, + "logps/chosen": -1.4131048917770386, + "logps/rejected": -3.087088108062744, + "loss": 2.2833, + "nll_loss": 1.4131051301956177, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.131050109863281, + "rewards/margins": 16.73983383178711, + "rewards/rejected": -30.87088394165039, + "step": 413 + }, + { + "epoch": 0.6173345759552656, + "grad_norm": 60.87896365725072, + "learning_rate": 3.492545686756986e-07, + "logits/chosen": 1.755131483078003, + "logits/rejected": 2.0312671661376953, + "logps/chosen": -1.026936411857605, + "logps/rejected": -2.6670114994049072, + "loss": 2.4555, + "nll_loss": 1.0269362926483154, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.269363403320312, + "rewards/margins": 16.400753021240234, + "rewards/rejected": -26.670114517211914, + "step": 414 + }, + { + "epoch": 0.6188257222739981, + "grad_norm": 73.6020982510389, + "learning_rate": 3.4690153413802653e-07, + "logits/chosen": 1.4790246486663818, + "logits/rejected": 1.977297306060791, + "logps/chosen": -1.4076223373413086, + "logps/rejected": -2.840873956680298, + "loss": 2.2488, + "nll_loss": 1.4076223373413086, + "rewards/accuracies": 0.75, + "rewards/chosen": -14.076223373413086, + "rewards/margins": 14.332515716552734, + "rewards/rejected": -28.408740997314453, + "step": 415 + }, + { + "epoch": 0.6203168685927306, + "grad_norm": 65.13663123971088, + "learning_rate": 3.445522351582344e-07, + "logits/chosen": 1.1710360050201416, + "logits/rejected": 1.0605045557022095, + "logps/chosen": -1.2275431156158447, + "logps/rejected": -2.114682912826538, + "loss": 2.9264, + "nll_loss": 1.2275429964065552, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.275430679321289, + "rewards/margins": 8.871397018432617, + "rewards/rejected": -21.146827697753906, + "step": 416 + }, + { + "epoch": 0.6218080149114632, + "grad_norm": 52.427745005888106, + "learning_rate": 3.4220672905853107e-07, + "logits/chosen": 1.3692402839660645, + "logits/rejected": 1.266655445098877, + "logps/chosen": -1.1131441593170166, + "logps/rejected": -2.0882177352905273, + "loss": 1.8158, + "nll_loss": 1.1131441593170166, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.131441116333008, + "rewards/margins": 9.75073528289795, + "rewards/rejected": -20.882179260253906, + "step": 417 + }, + { + "epoch": 0.6232991612301957, + "grad_norm": 170.39584633177057, + "learning_rate": 3.3986507306858125e-07, + "logits/chosen": 0.5034229755401611, + "logits/rejected": 0.8567270636558533, + "logps/chosen": -1.0039225816726685, + "logps/rejected": -2.926978588104248, + "loss": 2.5382, + "nll_loss": 1.003922462463379, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.039225578308105, + "rewards/margins": 19.230560302734375, + "rewards/rejected": -29.269784927368164, + "step": 418 + }, + { + "epoch": 0.6247903075489283, + "grad_norm": 56.76544732171944, + "learning_rate": 3.375273243241071e-07, + "logits/chosen": 1.012601613998413, + "logits/rejected": 1.2233651876449585, + "logps/chosen": -0.9432302117347717, + "logps/rejected": -3.0564088821411133, + "loss": 1.8637, + "nll_loss": 0.943230152130127, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.43230152130127, + "rewards/margins": 21.13178825378418, + "rewards/rejected": -30.5640869140625, + "step": 419 + }, + { + "epoch": 0.6262814538676608, + "grad_norm": 50.03299372561917, + "learning_rate": 3.3519353986549604e-07, + "logits/chosen": 1.4403176307678223, + "logits/rejected": 2.0863468647003174, + "logps/chosen": -1.264074444770813, + "logps/rejected": -3.074423313140869, + "loss": 1.5339, + "nll_loss": 1.264074444770813, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.640746116638184, + "rewards/margins": 18.10348892211914, + "rewards/rejected": -30.744232177734375, + "step": 420 + }, + { + "epoch": 0.6277726001863932, + "grad_norm": 83.85556547470075, + "learning_rate": 3.328637766364075e-07, + "logits/chosen": 0.2867899537086487, + "logits/rejected": 0.5098231434822083, + "logps/chosen": -1.5081285238265991, + "logps/rejected": -2.784846544265747, + "loss": 2.5833, + "nll_loss": 1.5081287622451782, + "rewards/accuracies": 0.875, + "rewards/chosen": -15.081287384033203, + "rewards/margins": 12.76717758178711, + "rewards/rejected": -27.848464965820312, + "step": 421 + }, + { + "epoch": 0.6292637465051258, + "grad_norm": 91.27366980862237, + "learning_rate": 3.305380914823842e-07, + "logits/chosen": 1.0160527229309082, + "logits/rejected": 0.8778313994407654, + "logps/chosen": -1.4328757524490356, + "logps/rejected": -1.9611214399337769, + "loss": 3.2579, + "nll_loss": 1.4328757524490356, + "rewards/accuracies": 0.625, + "rewards/chosen": -14.32875919342041, + "rewards/margins": 5.282455921173096, + "rewards/rejected": -19.6112117767334, + "step": 422 + }, + { + "epoch": 0.6307548928238583, + "grad_norm": 42.63298100880217, + "learning_rate": 3.2821654114946496e-07, + "logits/chosen": 1.3331093788146973, + "logits/rejected": 1.788509488105774, + "logps/chosen": -1.002078890800476, + "logps/rejected": -1.5614389181137085, + "loss": 1.9875, + "nll_loss": 1.0020790100097656, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.020788192749023, + "rewards/margins": 5.593601226806641, + "rewards/rejected": -15.614389419555664, + "step": 423 + }, + { + "epoch": 0.6322460391425909, + "grad_norm": 59.79902227784364, + "learning_rate": 3.2589918228280066e-07, + "logits/chosen": 2.0965588092803955, + "logits/rejected": 2.404791831970215, + "logps/chosen": -1.8881897926330566, + "logps/rejected": -3.009868621826172, + "loss": 1.9466, + "nll_loss": 1.8881897926330566, + "rewards/accuracies": 0.75, + "rewards/chosen": -18.88189697265625, + "rewards/margins": 11.216791152954102, + "rewards/rejected": -30.09868812561035, + "step": 424 + }, + { + "epoch": 0.6337371854613234, + "grad_norm": 38.641026026310826, + "learning_rate": 3.235860714252708e-07, + "logits/chosen": 0.49497750401496887, + "logits/rejected": 0.8354448676109314, + "logps/chosen": -0.9587732553482056, + "logps/rejected": -2.164642572402954, + "loss": 1.4651, + "nll_loss": 0.9587733149528503, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.587732315063477, + "rewards/margins": 12.058693885803223, + "rewards/rejected": -21.646427154541016, + "step": 425 + }, + { + "epoch": 0.6352283317800559, + "grad_norm": 49.29895906205073, + "learning_rate": 3.2127726501610554e-07, + "logits/chosen": 0.8333710432052612, + "logits/rejected": 0.6832611560821533, + "logps/chosen": -1.228257179260254, + "logps/rejected": -2.4580414295196533, + "loss": 1.424, + "nll_loss": 1.2282572984695435, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.282571792602539, + "rewards/margins": 12.297839164733887, + "rewards/rejected": -24.580411911010742, + "step": 426 + }, + { + "epoch": 0.6367194780987885, + "grad_norm": 59.09457192361289, + "learning_rate": 3.189728193895069e-07, + "logits/chosen": 1.2447106838226318, + "logits/rejected": 1.6795345544815063, + "logps/chosen": -1.2835984230041504, + "logps/rejected": -2.071232795715332, + "loss": 3.1679, + "nll_loss": 1.2835984230041504, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.835984230041504, + "rewards/margins": 7.876343727111816, + "rewards/rejected": -20.712326049804688, + "step": 427 + }, + { + "epoch": 0.638210624417521, + "grad_norm": 55.472676645539266, + "learning_rate": 3.1667279077327596e-07, + "logits/chosen": 1.4515255689620972, + "logits/rejected": 1.4992138147354126, + "logps/chosen": -1.2138221263885498, + "logps/rejected": -2.1761789321899414, + "loss": 2.4694, + "nll_loss": 1.2138221263885498, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.138221740722656, + "rewards/margins": 9.62356948852539, + "rewards/rejected": -21.761789321899414, + "step": 428 + }, + { + "epoch": 0.6397017707362534, + "grad_norm": 48.72155621735601, + "learning_rate": 3.143772352874393e-07, + "logits/chosen": 1.3959624767303467, + "logits/rejected": 1.6266913414001465, + "logps/chosen": -1.3503375053405762, + "logps/rejected": -2.3042397499084473, + "loss": 2.1414, + "nll_loss": 1.3503373861312866, + "rewards/accuracies": 0.75, + "rewards/chosen": -13.503375053405762, + "rewards/margins": 9.539024353027344, + "rewards/rejected": -23.04239845275879, + "step": 429 + }, + { + "epoch": 0.641192917054986, + "grad_norm": 35.66908163280164, + "learning_rate": 3.12086208942881e-07, + "logits/chosen": 0.44496166706085205, + "logits/rejected": 1.2202012538909912, + "logps/chosen": -1.1374258995056152, + "logps/rejected": -3.3101794719696045, + "loss": 1.208, + "nll_loss": 1.1374258995056152, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.374258995056152, + "rewards/margins": 21.727537155151367, + "rewards/rejected": -33.10179901123047, + "step": 430 + }, + { + "epoch": 0.6426840633737185, + "grad_norm": 30.10695795802461, + "learning_rate": 3.0979976763997483e-07, + "logits/chosen": 1.0399227142333984, + "logits/rejected": 0.6588888764381409, + "logps/chosen": -1.250860571861267, + "logps/rejected": -2.8929429054260254, + "loss": 1.8239, + "nll_loss": 1.250860333442688, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.5086030960083, + "rewards/margins": 16.420822143554688, + "rewards/rejected": -28.929426193237305, + "step": 431 + }, + { + "epoch": 0.6441752096924511, + "grad_norm": 47.98314828937602, + "learning_rate": 3.0751796716722154e-07, + "logits/chosen": 1.1352158784866333, + "logits/rejected": 1.410915493965149, + "logps/chosen": -1.0950578451156616, + "logps/rejected": -2.431433916091919, + "loss": 1.5171, + "nll_loss": 1.0950578451156616, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.950577735900879, + "rewards/margins": 13.363761901855469, + "rewards/rejected": -24.314340591430664, + "step": 432 + }, + { + "epoch": 0.6456663560111836, + "grad_norm": 35.63586624987481, + "learning_rate": 3.052408631998863e-07, + "logits/chosen": 1.6566545963287354, + "logits/rejected": 1.4458212852478027, + "logps/chosen": -1.2611005306243896, + "logps/rejected": -2.0897819995880127, + "loss": 1.4812, + "nll_loss": 1.2611005306243896, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.611005783081055, + "rewards/margins": 8.286813735961914, + "rewards/rejected": -20.89781951904297, + "step": 433 + }, + { + "epoch": 0.6471575023299161, + "grad_norm": 46.07313177405188, + "learning_rate": 3.0296851129864165e-07, + "logits/chosen": 1.6016666889190674, + "logits/rejected": 1.2090141773223877, + "logps/chosen": -1.4727472066879272, + "logps/rejected": -2.661288261413574, + "loss": 1.8318, + "nll_loss": 1.4727472066879272, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.727469444274902, + "rewards/margins": 11.885411262512207, + "rewards/rejected": -26.612884521484375, + "step": 434 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 74.1761189064138, + "learning_rate": 3.007009669082103e-07, + "logits/chosen": 1.45290207862854, + "logits/rejected": 1.8482708930969238, + "logps/chosen": -1.1907374858856201, + "logps/rejected": -2.357393741607666, + "loss": 2.126, + "nll_loss": 1.1907377243041992, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.90737533569336, + "rewards/margins": 11.6665620803833, + "rewards/rejected": -23.573938369750977, + "step": 435 + }, + { + "epoch": 0.6501397949673812, + "grad_norm": 64.14650524937251, + "learning_rate": 2.9843828535601397e-07, + "logits/chosen": 1.1652297973632812, + "logits/rejected": 0.9601278305053711, + "logps/chosen": -1.6680173873901367, + "logps/rejected": -3.202648878097534, + "loss": 2.3454, + "nll_loss": 1.6680173873901367, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.680173873901367, + "rewards/margins": 15.34631633758545, + "rewards/rejected": -32.0264892578125, + "step": 436 + }, + { + "epoch": 0.6516309412861137, + "grad_norm": 53.104136065409264, + "learning_rate": 2.9618052185082155e-07, + "logits/chosen": 0.9669073224067688, + "logits/rejected": 0.7368067502975464, + "logps/chosen": -1.1525717973709106, + "logps/rejected": -2.254432439804077, + "loss": 2.2193, + "nll_loss": 1.1525717973709106, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.525718688964844, + "rewards/margins": 11.018606185913086, + "rewards/rejected": -22.54432487487793, + "step": 437 + }, + { + "epoch": 0.6531220876048462, + "grad_norm": 73.70829574207959, + "learning_rate": 2.9392773148140404e-07, + "logits/chosen": 2.054089307785034, + "logits/rejected": 2.129491090774536, + "logps/chosen": -1.5031163692474365, + "logps/rejected": -3.7904915809631348, + "loss": 1.9853, + "nll_loss": 1.5031163692474365, + "rewards/accuracies": 0.875, + "rewards/chosen": -15.031164169311523, + "rewards/margins": 22.873748779296875, + "rewards/rejected": -37.904911041259766, + "step": 438 + }, + { + "epoch": 0.6546132339235787, + "grad_norm": 39.8178746230072, + "learning_rate": 2.916799692151884e-07, + "logits/chosen": 1.1287901401519775, + "logits/rejected": 0.6743125319480896, + "logps/chosen": -1.3089931011199951, + "logps/rejected": -2.152021646499634, + "loss": 1.6465, + "nll_loss": 1.3089929819107056, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.089929580688477, + "rewards/margins": 8.43028450012207, + "rewards/rejected": -21.52021598815918, + "step": 439 + }, + { + "epoch": 0.6561043802423113, + "grad_norm": 44.707402411865324, + "learning_rate": 2.8943728989691857e-07, + "logits/chosen": 1.9802640676498413, + "logits/rejected": 1.882559061050415, + "logps/chosen": -1.4435269832611084, + "logps/rejected": -2.479424238204956, + "loss": 2.0328, + "nll_loss": 1.4435269832611084, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.435270309448242, + "rewards/margins": 10.35897445678711, + "rewards/rejected": -24.79424476623535, + "step": 440 + }, + { + "epoch": 0.6575955265610438, + "grad_norm": 52.793943718785414, + "learning_rate": 2.871997482473144e-07, + "logits/chosen": 1.3309818506240845, + "logits/rejected": 1.430246114730835, + "logps/chosen": -1.0432627201080322, + "logps/rejected": -2.574901580810547, + "loss": 1.7849, + "nll_loss": 1.0432627201080322, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.43262767791748, + "rewards/margins": 15.316386222839355, + "rewards/rejected": -25.74901580810547, + "step": 441 + }, + { + "epoch": 0.6590866728797763, + "grad_norm": 40.87763052696959, + "learning_rate": 2.849673988617399e-07, + "logits/chosen": 1.6311396360397339, + "logits/rejected": 0.8815900683403015, + "logps/chosen": -0.9415394067764282, + "logps/rejected": -1.797757625579834, + "loss": 1.2666, + "nll_loss": 0.9415394067764282, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.41539478302002, + "rewards/margins": 8.56218147277832, + "rewards/rejected": -17.977575302124023, + "step": 442 + }, + { + "epoch": 0.6605778191985089, + "grad_norm": 41.007404689910715, + "learning_rate": 2.827402962088677e-07, + "logits/chosen": 1.7913353443145752, + "logits/rejected": 2.1356258392333984, + "logps/chosen": -1.023738980293274, + "logps/rejected": -3.2478387355804443, + "loss": 1.739, + "nll_loss": 1.0237390995025635, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.237390518188477, + "rewards/margins": 22.240999221801758, + "rewards/rejected": -32.478389739990234, + "step": 443 + }, + { + "epoch": 0.6620689655172414, + "grad_norm": 55.087459504774436, + "learning_rate": 2.8051849462935317e-07, + "logits/chosen": 2.3499698638916016, + "logits/rejected": 2.2420759201049805, + "logps/chosen": -1.4096755981445312, + "logps/rejected": -1.641516923904419, + "loss": 2.7251, + "nll_loss": 1.4096755981445312, + "rewards/accuracies": 0.625, + "rewards/chosen": -14.096755981445312, + "rewards/margins": 2.3184118270874023, + "rewards/rejected": -16.4151668548584, + "step": 444 + }, + { + "epoch": 0.6635601118359739, + "grad_norm": 33.19261803739409, + "learning_rate": 2.783020483345057e-07, + "logits/chosen": 0.9375135898590088, + "logits/rejected": 0.7836179733276367, + "logps/chosen": -1.0821305513381958, + "logps/rejected": -3.1024487018585205, + "loss": 2.0048, + "nll_loss": 1.0821305513381958, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.821305274963379, + "rewards/margins": 20.20318031311035, + "rewards/rejected": -31.024486541748047, + "step": 445 + }, + { + "epoch": 0.6650512581547064, + "grad_norm": 46.66424288671262, + "learning_rate": 2.760910114049686e-07, + "logits/chosen": 0.45911887288093567, + "logits/rejected": 0.7530557513237, + "logps/chosen": -1.196040153503418, + "logps/rejected": -2.101353883743286, + "loss": 2.4426, + "nll_loss": 1.1960399150848389, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.96040153503418, + "rewards/margins": 9.05313777923584, + "rewards/rejected": -21.013540267944336, + "step": 446 + }, + { + "epoch": 0.6665424044734389, + "grad_norm": 68.75236713436564, + "learning_rate": 2.738854377893969e-07, + "logits/chosen": 1.666689157485962, + "logits/rejected": 2.0555055141448975, + "logps/chosen": -1.335540533065796, + "logps/rejected": -2.899733781814575, + "loss": 2.2997, + "nll_loss": 1.3355404138565063, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.3554048538208, + "rewards/margins": 15.641935348510742, + "rewards/rejected": -28.997339248657227, + "step": 447 + }, + { + "epoch": 0.6680335507921715, + "grad_norm": 54.42062109615498, + "learning_rate": 2.7168538130314345e-07, + "logits/chosen": 0.376677930355072, + "logits/rejected": 0.675430178642273, + "logps/chosen": -0.9450101256370544, + "logps/rejected": -2.8028371334075928, + "loss": 1.9346, + "nll_loss": 0.9450101256370544, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.450100898742676, + "rewards/margins": 18.578269958496094, + "rewards/rejected": -28.028369903564453, + "step": 448 + }, + { + "epoch": 0.669524697110904, + "grad_norm": 71.23646436175397, + "learning_rate": 2.6949089562694433e-07, + "logits/chosen": 0.9078048467636108, + "logits/rejected": 1.0864940881729126, + "logps/chosen": -1.1694973707199097, + "logps/rejected": -3.034714698791504, + "loss": 2.4166, + "nll_loss": 1.1694972515106201, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.694973945617676, + "rewards/margins": 18.652170181274414, + "rewards/rejected": -30.347145080566406, + "step": 449 + }, + { + "epoch": 0.6710158434296365, + "grad_norm": 49.91139078148586, + "learning_rate": 2.673020343056094e-07, + "logits/chosen": 0.33078503608703613, + "logits/rejected": 0.979889988899231, + "logps/chosen": -1.1470595598220825, + "logps/rejected": -2.078333854675293, + "loss": 1.6967, + "nll_loss": 1.147059440612793, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.47059440612793, + "rewards/margins": 9.312743186950684, + "rewards/rejected": -20.78333854675293, + "step": 450 + }, + { + "epoch": 0.6725069897483691, + "grad_norm": 43.26217547679494, + "learning_rate": 2.651188507467161e-07, + "logits/chosen": 1.538424015045166, + "logits/rejected": 1.7342791557312012, + "logps/chosen": -1.0186197757720947, + "logps/rejected": -1.8571202754974365, + "loss": 2.0133, + "nll_loss": 1.0186196565628052, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.186197280883789, + "rewards/margins": 8.385005950927734, + "rewards/rejected": -18.57120132446289, + "step": 451 + }, + { + "epoch": 0.6739981360671016, + "grad_norm": 71.85168571075707, + "learning_rate": 2.629413982193059e-07, + "logits/chosen": 1.079938530921936, + "logits/rejected": 1.236559271812439, + "logps/chosen": -1.266383409500122, + "logps/rejected": -1.8653570413589478, + "loss": 2.9246, + "nll_loss": 1.266383409500122, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.663833618164062, + "rewards/margins": 5.989736080169678, + "rewards/rejected": -18.653568267822266, + "step": 452 + }, + { + "epoch": 0.6754892823858341, + "grad_norm": 129.63565634881414, + "learning_rate": 2.60769729852585e-07, + "logits/chosen": -0.3111036419868469, + "logits/rejected": 0.7170066833496094, + "logps/chosen": -0.6509460806846619, + "logps/rejected": -3.1134836673736572, + "loss": 2.6189, + "nll_loss": 0.6509461402893066, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.509461879730225, + "rewards/margins": 24.625377655029297, + "rewards/rejected": -31.134841918945312, + "step": 453 + }, + { + "epoch": 0.6769804287045667, + "grad_norm": 41.446021715687706, + "learning_rate": 2.5860389863462763e-07, + "logits/chosen": 0.4215829372406006, + "logits/rejected": 1.3934569358825684, + "logps/chosen": -1.2299526929855347, + "logps/rejected": -2.8610968589782715, + "loss": 1.6905, + "nll_loss": 1.2299526929855347, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.29952621459961, + "rewards/margins": 16.31144142150879, + "rewards/rejected": -28.61096954345703, + "step": 454 + }, + { + "epoch": 0.6784715750232991, + "grad_norm": 60.40564921235719, + "learning_rate": 2.564439574110833e-07, + "logits/chosen": 1.331271767616272, + "logits/rejected": 2.258624792098999, + "logps/chosen": -1.3361079692840576, + "logps/rejected": -2.4887638092041016, + "loss": 2.1492, + "nll_loss": 1.3361079692840576, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.361080169677734, + "rewards/margins": 11.526557922363281, + "rewards/rejected": -24.887638092041016, + "step": 455 + }, + { + "epoch": 0.6799627213420317, + "grad_norm": 213.62780772955298, + "learning_rate": 2.542899588838875e-07, + "logits/chosen": 1.3909834623336792, + "logits/rejected": 1.3308240175247192, + "logps/chosen": -1.6496881246566772, + "logps/rejected": -2.274871826171875, + "loss": 2.1291, + "nll_loss": 1.6496882438659668, + "rewards/accuracies": 0.75, + "rewards/chosen": -16.49688148498535, + "rewards/margins": 6.251836776733398, + "rewards/rejected": -22.748720169067383, + "step": 456 + }, + { + "epoch": 0.6814538676607642, + "grad_norm": 113.79819976533673, + "learning_rate": 2.521419556099754e-07, + "logits/chosen": 0.2305797040462494, + "logits/rejected": 0.02278699167072773, + "logps/chosen": -1.1409465074539185, + "logps/rejected": -2.5157318115234375, + "loss": 2.0458, + "nll_loss": 1.140946626663208, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.409464836120605, + "rewards/margins": 13.747852325439453, + "rewards/rejected": -25.157318115234375, + "step": 457 + }, + { + "epoch": 0.6829450139794967, + "grad_norm": 53.05205087447335, + "learning_rate": 2.500000000000001e-07, + "logits/chosen": 1.3694684505462646, + "logits/rejected": 1.2448177337646484, + "logps/chosen": -0.9474362134933472, + "logps/rejected": -2.576544761657715, + "loss": 2.4503, + "nll_loss": 0.9474362134933472, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.474361419677734, + "rewards/margins": 16.291086196899414, + "rewards/rejected": -25.76544761657715, + "step": 458 + }, + { + "epoch": 0.6844361602982293, + "grad_norm": 94.38074467414543, + "learning_rate": 2.47864144317053e-07, + "logits/chosen": 0.28457310795783997, + "logits/rejected": 0.5626116991043091, + "logps/chosen": -1.023999571800232, + "logps/rejected": -4.407548427581787, + "loss": 2.6478, + "nll_loss": 1.023999571800232, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.239995956420898, + "rewards/margins": 33.835487365722656, + "rewards/rejected": -44.07548522949219, + "step": 459 + }, + { + "epoch": 0.6859273066169618, + "grad_norm": 155.12455111533777, + "learning_rate": 2.4573444067538985e-07, + "logits/chosen": 1.1949751377105713, + "logits/rejected": 1.0052484273910522, + "logps/chosen": -1.0581481456756592, + "logps/rejected": -2.1276328563690186, + "loss": 3.0956, + "nll_loss": 1.0581481456756592, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.581480979919434, + "rewards/margins": 10.694849014282227, + "rewards/rejected": -21.27633285522461, + "step": 460 + }, + { + "epoch": 0.6874184529356944, + "grad_norm": 33.62384733106699, + "learning_rate": 2.4361094103915724e-07, + "logits/chosen": 0.48369476199150085, + "logits/rejected": 0.27421849966049194, + "logps/chosen": -1.0688875913619995, + "logps/rejected": -1.9155361652374268, + "loss": 1.6311, + "nll_loss": 1.06888747215271, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.688875198364258, + "rewards/margins": 8.466485977172852, + "rewards/rejected": -19.15536117553711, + "step": 461 + }, + { + "epoch": 0.6889095992544269, + "grad_norm": 49.979637593529766, + "learning_rate": 2.4149369722112715e-07, + "logits/chosen": 0.8130873441696167, + "logits/rejected": 1.4969799518585205, + "logps/chosen": -1.1139277219772339, + "logps/rejected": -2.584320545196533, + "loss": 1.8583, + "nll_loss": 1.1139277219772339, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.139278411865234, + "rewards/margins": 14.703924179077148, + "rewards/rejected": -25.843204498291016, + "step": 462 + }, + { + "epoch": 0.6904007455731593, + "grad_norm": 85.69499562338711, + "learning_rate": 2.3938276088143e-07, + "logits/chosen": 1.6853172779083252, + "logits/rejected": 1.7209492921829224, + "logps/chosen": -1.050376296043396, + "logps/rejected": -2.0380706787109375, + "loss": 2.5335, + "nll_loss": 1.050376296043396, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.503764152526855, + "rewards/margins": 9.876941680908203, + "rewards/rejected": -20.380706787109375, + "step": 463 + }, + { + "epoch": 0.6918918918918919, + "grad_norm": 98.42041645310131, + "learning_rate": 2.3727818352629708e-07, + "logits/chosen": 1.3016146421432495, + "logits/rejected": 1.5215528011322021, + "logps/chosen": -1.1616820096969604, + "logps/rejected": -1.9143564701080322, + "loss": 2.4056, + "nll_loss": 1.161681890487671, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.616820335388184, + "rewards/margins": 7.526744842529297, + "rewards/rejected": -19.143566131591797, + "step": 464 + }, + { + "epoch": 0.6933830382106244, + "grad_norm": 67.84600762969895, + "learning_rate": 2.351800165068008e-07, + "logits/chosen": 2.1544480323791504, + "logits/rejected": 2.2394189834594727, + "logps/chosen": -0.8722681999206543, + "logps/rejected": -3.095642328262329, + "loss": 3.2354, + "nll_loss": 0.8722682595252991, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.72268295288086, + "rewards/margins": 22.23373794555664, + "rewards/rejected": -30.9564208984375, + "step": 465 + }, + { + "epoch": 0.6948741845293569, + "grad_norm": 42.045527132294, + "learning_rate": 2.3308831101760483e-07, + "logits/chosen": 1.230238914489746, + "logits/rejected": 1.6758296489715576, + "logps/chosen": -1.7337133884429932, + "logps/rejected": -2.593168258666992, + "loss": 1.672, + "nll_loss": 1.7337136268615723, + "rewards/accuracies": 0.875, + "rewards/chosen": -17.337133407592773, + "rewards/margins": 8.594550132751465, + "rewards/rejected": -25.931682586669922, + "step": 466 + }, + { + "epoch": 0.6963653308480895, + "grad_norm": 43.5637245965348, + "learning_rate": 2.310031180957117e-07, + "logits/chosen": 0.024984115734696388, + "logits/rejected": 0.7004745006561279, + "logps/chosen": -0.9020689725875854, + "logps/rejected": -2.2720069885253906, + "loss": 1.8545, + "nll_loss": 0.9020689725875854, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.020689010620117, + "rewards/margins": 13.699378967285156, + "rewards/rejected": -22.720067977905273, + "step": 467 + }, + { + "epoch": 0.697856477166822, + "grad_norm": 511.0890454343964, + "learning_rate": 2.289244886192207e-07, + "logits/chosen": 0.222794309258461, + "logits/rejected": 0.4778207540512085, + "logps/chosen": -2.1019086837768555, + "logps/rejected": -2.340156316757202, + "loss": 3.0353, + "nll_loss": 2.1019086837768555, + "rewards/accuracies": 0.875, + "rewards/chosen": -21.01908302307129, + "rewards/margins": 2.3824777603149414, + "rewards/rejected": -23.401561737060547, + "step": 468 + }, + { + "epoch": 0.6993476234855546, + "grad_norm": 38.06885262051829, + "learning_rate": 2.2685247330608414e-07, + "logits/chosen": 0.7382382154464722, + "logits/rejected": 0.8615285158157349, + "logps/chosen": -1.1256132125854492, + "logps/rejected": -2.2803823947906494, + "loss": 1.5148, + "nll_loss": 1.1256132125854492, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.256132125854492, + "rewards/margins": 11.547691345214844, + "rewards/rejected": -22.803823471069336, + "step": 469 + }, + { + "epoch": 0.700838769804287, + "grad_norm": 123.78505103720795, + "learning_rate": 2.2478712271287087e-07, + "logits/chosen": -0.260221004486084, + "logits/rejected": 0.08943118155002594, + "logps/chosen": -1.3078944683074951, + "logps/rejected": -2.9328818321228027, + "loss": 2.8933, + "nll_loss": 1.3078944683074951, + "rewards/accuracies": 0.75, + "rewards/chosen": -13.078944206237793, + "rewards/margins": 16.24987030029297, + "rewards/rejected": -29.32881736755371, + "step": 470 + }, + { + "epoch": 0.7023299161230195, + "grad_norm": 51.30693741373788, + "learning_rate": 2.227284872335325e-07, + "logits/chosen": 0.8327363729476929, + "logits/rejected": 1.1221492290496826, + "logps/chosen": -1.0217963457107544, + "logps/rejected": -4.047585487365723, + "loss": 1.6299, + "nll_loss": 1.0217963457107544, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.217963218688965, + "rewards/margins": 30.257888793945312, + "rewards/rejected": -40.475852966308594, + "step": 471 + }, + { + "epoch": 0.7038210624417521, + "grad_norm": 28.86346583951883, + "learning_rate": 2.2067661709817382e-07, + "logits/chosen": -0.10334792733192444, + "logits/rejected": 0.23174366354942322, + "logps/chosen": -0.8516181707382202, + "logps/rejected": -2.4010567665100098, + "loss": 1.774, + "nll_loss": 0.8516180515289307, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.516180992126465, + "rewards/margins": 15.49438762664795, + "rewards/rejected": -24.010568618774414, + "step": 472 + }, + { + "epoch": 0.7053122087604846, + "grad_norm": 53.7095788706672, + "learning_rate": 2.1863156237182724e-07, + "logits/chosen": 0.23709386587142944, + "logits/rejected": 0.6802022457122803, + "logps/chosen": -1.0881783962249756, + "logps/rejected": -2.5452468395233154, + "loss": 2.0664, + "nll_loss": 1.0881783962249756, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.881784439086914, + "rewards/margins": 14.570684432983398, + "rewards/rejected": -25.452468872070312, + "step": 473 + }, + { + "epoch": 0.7068033550792171, + "grad_norm": 107.01228096480615, + "learning_rate": 2.1659337295323114e-07, + "logits/chosen": 2.4457285404205322, + "logits/rejected": 2.192387104034424, + "logps/chosen": -1.1308457851409912, + "logps/rejected": -2.3631060123443604, + "loss": 1.9536, + "nll_loss": 1.1308457851409912, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.30845832824707, + "rewards/margins": 12.322601318359375, + "rewards/rejected": -23.631059646606445, + "step": 474 + }, + { + "epoch": 0.7082945013979497, + "grad_norm": 49.156409055564126, + "learning_rate": 2.1456209857361246e-07, + "logits/chosen": 0.9657554626464844, + "logits/rejected": 1.1160908937454224, + "logps/chosen": -1.2342641353607178, + "logps/rejected": -2.2354118824005127, + "loss": 2.0943, + "nll_loss": 1.2342641353607178, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.342641830444336, + "rewards/margins": 10.011476516723633, + "rewards/rejected": -22.354116439819336, + "step": 475 + }, + { + "epoch": 0.7097856477166822, + "grad_norm": 50.310764029228174, + "learning_rate": 2.1253778879547317e-07, + "logits/chosen": 0.5516436696052551, + "logits/rejected": 1.0273289680480957, + "logps/chosen": -1.400739073753357, + "logps/rejected": -5.508915901184082, + "loss": 1.5569, + "nll_loss": 1.400739073753357, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.007391929626465, + "rewards/margins": 41.08176803588867, + "rewards/rejected": -55.08916091918945, + "step": 476 + }, + { + "epoch": 0.7112767940354148, + "grad_norm": 60.52879609068727, + "learning_rate": 2.1052049301138092e-07, + "logits/chosen": 0.475372850894928, + "logits/rejected": 0.21769298613071442, + "logps/chosen": -1.2100934982299805, + "logps/rejected": -2.600706100463867, + "loss": 1.5172, + "nll_loss": 1.2100934982299805, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.100934982299805, + "rewards/margins": 13.906126976013184, + "rewards/rejected": -26.007061004638672, + "step": 477 + }, + { + "epoch": 0.7127679403541473, + "grad_norm": 77.34557138399435, + "learning_rate": 2.0851026044276405e-07, + "logits/chosen": 0.7670489549636841, + "logits/rejected": 0.7285082340240479, + "logps/chosen": -1.1550674438476562, + "logps/rejected": -2.071988344192505, + "loss": 1.6234, + "nll_loss": 1.1550673246383667, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.550674438476562, + "rewards/margins": 9.169209480285645, + "rewards/rejected": -20.71988296508789, + "step": 478 + }, + { + "epoch": 0.7142590866728797, + "grad_norm": 36.927537934249365, + "learning_rate": 2.0650714013871045e-07, + "logits/chosen": 1.0101845264434814, + "logits/rejected": 1.308388113975525, + "logps/chosen": -1.1056818962097168, + "logps/rejected": -2.300252676010132, + "loss": 1.7636, + "nll_loss": 1.1056818962097168, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.056818008422852, + "rewards/margins": 11.945709228515625, + "rewards/rejected": -23.00252914428711, + "step": 479 + }, + { + "epoch": 0.7157502329916123, + "grad_norm": 62.772752564819186, + "learning_rate": 2.0451118097477093e-07, + "logits/chosen": 1.5709446668624878, + "logits/rejected": 1.7383079528808594, + "logps/chosen": -1.1260969638824463, + "logps/rejected": -2.729841470718384, + "loss": 1.903, + "nll_loss": 1.1260970830917358, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.260971069335938, + "rewards/margins": 16.037445068359375, + "rewards/rejected": -27.298416137695312, + "step": 480 + }, + { + "epoch": 0.7172413793103448, + "grad_norm": 32.97119318254156, + "learning_rate": 2.025224316517663e-07, + "logits/chosen": 0.7222650051116943, + "logits/rejected": 1.2684485912322998, + "logps/chosen": -1.0870883464813232, + "logps/rejected": -2.2934865951538086, + "loss": 1.6303, + "nll_loss": 1.0870884656906128, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.87088394165039, + "rewards/margins": 12.063982009887695, + "rewards/rejected": -22.934865951538086, + "step": 481 + }, + { + "epoch": 0.7187325256290773, + "grad_norm": 93.56185034910865, + "learning_rate": 2.005409406946e-07, + "logits/chosen": 0.7665479779243469, + "logits/rejected": 0.9628517627716064, + "logps/chosen": -1.3418065309524536, + "logps/rejected": -2.322467803955078, + "loss": 2.4619, + "nll_loss": 1.3418065309524536, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.41806411743164, + "rewards/margins": 9.80661392211914, + "rewards/rejected": -23.224679946899414, + "step": 482 + }, + { + "epoch": 0.7202236719478099, + "grad_norm": 83.63211918908515, + "learning_rate": 1.985667564510724e-07, + "logits/chosen": 1.6486438512802124, + "logits/rejected": 1.6210968494415283, + "logps/chosen": -1.4717875719070435, + "logps/rejected": -1.9454703330993652, + "loss": 2.1969, + "nll_loss": 1.471787691116333, + "rewards/accuracies": 0.625, + "rewards/chosen": -14.717876434326172, + "rewards/margins": 4.736826419830322, + "rewards/rejected": -19.45470428466797, + "step": 483 + }, + { + "epoch": 0.7217148182665424, + "grad_norm": 44.77487482055843, + "learning_rate": 1.9659992709070344e-07, + "logits/chosen": 1.3622450828552246, + "logits/rejected": 1.1589062213897705, + "logps/chosen": -0.8679985404014587, + "logps/rejected": -2.2884294986724854, + "loss": 2.0511, + "nll_loss": 0.8679985404014587, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.679985046386719, + "rewards/margins": 14.204309463500977, + "rewards/rejected": -22.884296417236328, + "step": 484 + }, + { + "epoch": 0.723205964585275, + "grad_norm": 59.93427920155752, + "learning_rate": 1.946405006035548e-07, + "logits/chosen": 0.12034586071968079, + "logits/rejected": 0.5588881969451904, + "logps/chosen": -1.1300389766693115, + "logps/rejected": -3.5124731063842773, + "loss": 1.6296, + "nll_loss": 1.130039095878601, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.300390243530273, + "rewards/margins": 23.8243408203125, + "rewards/rejected": -35.124732971191406, + "step": 485 + }, + { + "epoch": 0.7246971109040075, + "grad_norm": 46.34699445302191, + "learning_rate": 1.9268852479906145e-07, + "logits/chosen": 2.0305707454681396, + "logits/rejected": 2.339308261871338, + "logps/chosen": -1.8795857429504395, + "logps/rejected": -2.736845016479492, + "loss": 2.0155, + "nll_loss": 1.8795857429504395, + "rewards/accuracies": 0.75, + "rewards/chosen": -18.795856475830078, + "rewards/margins": 8.572591781616211, + "rewards/rejected": -27.368450164794922, + "step": 486 + }, + { + "epoch": 0.7261882572227399, + "grad_norm": 51.974864556365205, + "learning_rate": 1.907440473048626e-07, + "logits/chosen": 0.6879156827926636, + "logits/rejected": 0.8359827995300293, + "logps/chosen": -1.199193000793457, + "logps/rejected": -1.9355554580688477, + "loss": 1.8027, + "nll_loss": 1.199193000793457, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.99193000793457, + "rewards/margins": 7.363624572753906, + "rewards/rejected": -19.355554580688477, + "step": 487 + }, + { + "epoch": 0.7276794035414725, + "grad_norm": 44.0225250357595, + "learning_rate": 1.8880711556564212e-07, + "logits/chosen": 0.9353025555610657, + "logits/rejected": 0.5232765078544617, + "logps/chosen": -1.4775171279907227, + "logps/rejected": -2.885024070739746, + "loss": 1.8494, + "nll_loss": 1.4775171279907227, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.775171279907227, + "rewards/margins": 14.075069427490234, + "rewards/rejected": -28.85024070739746, + "step": 488 + }, + { + "epoch": 0.729170549860205, + "grad_norm": 50.69197433851532, + "learning_rate": 1.8687777684196882e-07, + "logits/chosen": 1.100256323814392, + "logits/rejected": 1.2217504978179932, + "logps/chosen": -1.3356492519378662, + "logps/rejected": -2.1796789169311523, + "loss": 2.6281, + "nll_loss": 1.3356491327285767, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.356492042541504, + "rewards/margins": 8.440298080444336, + "rewards/rejected": -21.796791076660156, + "step": 489 + }, + { + "epoch": 0.7306616961789375, + "grad_norm": 78.26649672369982, + "learning_rate": 1.849560782091445e-07, + "logits/chosen": 0.6997460126876831, + "logits/rejected": 0.25134438276290894, + "logps/chosen": -1.0516213178634644, + "logps/rejected": -1.7665810585021973, + "loss": 1.7613, + "nll_loss": 1.0516211986541748, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.516212463378906, + "rewards/margins": 7.149598598480225, + "rewards/rejected": -17.66581153869629, + "step": 490 + }, + { + "epoch": 0.7321528424976701, + "grad_norm": 50.189099136000976, + "learning_rate": 1.8304206655605474e-07, + "logits/chosen": 0.45600277185440063, + "logits/rejected": 1.139525055885315, + "logps/chosen": -1.9147918224334717, + "logps/rejected": -2.3654191493988037, + "loss": 2.2826, + "nll_loss": 1.9147917032241821, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.147918701171875, + "rewards/margins": 4.5062737464904785, + "rewards/rejected": -23.654191970825195, + "step": 491 + }, + { + "epoch": 0.7336439888164026, + "grad_norm": 41.38891098119506, + "learning_rate": 1.811357885840254e-07, + "logits/chosen": 1.0754787921905518, + "logits/rejected": 0.987869381904602, + "logps/chosen": -1.3695212602615356, + "logps/rejected": -2.920154571533203, + "loss": 1.5894, + "nll_loss": 1.369521141052246, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.695213317871094, + "rewards/margins": 15.506331443786621, + "rewards/rejected": -29.20154571533203, + "step": 492 + }, + { + "epoch": 0.7351351351351352, + "grad_norm": 81.00793367937105, + "learning_rate": 1.7923729080568239e-07, + "logits/chosen": 1.4710057973861694, + "logits/rejected": 1.6370110511779785, + "logps/chosen": -1.0914305448532104, + "logps/rejected": -1.9285211563110352, + "loss": 1.7259, + "nll_loss": 1.091430425643921, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.914305686950684, + "rewards/margins": 8.370904922485352, + "rewards/rejected": -19.28521156311035, + "step": 493 + }, + { + "epoch": 0.7366262814538677, + "grad_norm": 37.78420798456383, + "learning_rate": 1.7734661954381752e-07, + "logits/chosen": 0.871479332447052, + "logits/rejected": 0.8258588314056396, + "logps/chosen": -1.3462225198745728, + "logps/rejected": -2.5515835285186768, + "loss": 1.8386, + "nll_loss": 1.3462225198745728, + "rewards/accuracies": 0.75, + "rewards/chosen": -13.462225914001465, + "rewards/margins": 12.053609848022461, + "rewards/rejected": -25.51583480834961, + "step": 494 + }, + { + "epoch": 0.7381174277726001, + "grad_norm": 131.99066573478086, + "learning_rate": 1.7546382093025758e-07, + "logits/chosen": -0.19732213020324707, + "logits/rejected": -0.07462179660797119, + "logps/chosen": -1.0856541395187378, + "logps/rejected": -2.3993844985961914, + "loss": 1.978, + "nll_loss": 1.0856541395187378, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.85654067993164, + "rewards/margins": 13.137301445007324, + "rewards/rejected": -23.99384307861328, + "step": 495 + }, + { + "epoch": 0.7396085740913327, + "grad_norm": 75.4429379923374, + "learning_rate": 1.7358894090473924e-07, + "logits/chosen": 1.1260228157043457, + "logits/rejected": 1.5973632335662842, + "logps/chosen": -1.3908843994140625, + "logps/rejected": -2.6310038566589355, + "loss": 3.0098, + "nll_loss": 1.3908843994140625, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.908843994140625, + "rewards/margins": 12.40119457244873, + "rewards/rejected": -26.310039520263672, + "step": 496 + }, + { + "epoch": 0.7410997204100652, + "grad_norm": 57.59245498721595, + "learning_rate": 1.7172202521378793e-07, + "logits/chosen": 1.5798145532608032, + "logits/rejected": 1.379747986793518, + "logps/chosen": -1.0907506942749023, + "logps/rejected": -2.8836097717285156, + "loss": 1.5375, + "nll_loss": 1.0907506942749023, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.907506942749023, + "rewards/margins": 17.9285888671875, + "rewards/rejected": -28.836095809936523, + "step": 497 + }, + { + "epoch": 0.7425908667287978, + "grad_norm": 56.25519043444952, + "learning_rate": 1.6986311940960147e-07, + "logits/chosen": 1.462363362312317, + "logits/rejected": 1.563219666481018, + "logps/chosen": -1.2027587890625, + "logps/rejected": -1.8627557754516602, + "loss": 2.5616, + "nll_loss": 1.2027586698532104, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.027586936950684, + "rewards/margins": 6.599970817565918, + "rewards/rejected": -18.6275577545166, + "step": 498 + }, + { + "epoch": 0.7440820130475303, + "grad_norm": 58.6146470332597, + "learning_rate": 1.6801226884893893e-07, + "logits/chosen": 0.9026723504066467, + "logits/rejected": 1.0392895936965942, + "logps/chosen": -1.2807862758636475, + "logps/rejected": -2.035679578781128, + "loss": 2.2686, + "nll_loss": 1.280786395072937, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.807863235473633, + "rewards/margins": 7.548933982849121, + "rewards/rejected": -20.35679817199707, + "step": 499 + }, + { + "epoch": 0.7455731593662628, + "grad_norm": 127.17666337209936, + "learning_rate": 1.6616951869201378e-07, + "logits/chosen": 1.4067152738571167, + "logits/rejected": 1.5482068061828613, + "logps/chosen": -1.6144959926605225, + "logps/rejected": -2.988914966583252, + "loss": 2.2455, + "nll_loss": 1.6144959926605225, + "rewards/accuracies": 0.625, + "rewards/chosen": -16.144960403442383, + "rewards/margins": 13.744190216064453, + "rewards/rejected": -29.88915252685547, + "step": 500 + }, + { + "epoch": 0.7470643056849954, + "grad_norm": 22.65024245941002, + "learning_rate": 1.6433491390139176e-07, + "logits/chosen": 1.386856198310852, + "logits/rejected": 0.9873509407043457, + "logps/chosen": -0.8607431650161743, + "logps/rejected": -1.7035075426101685, + "loss": 0.846, + "nll_loss": 0.8607431054115295, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.607431411743164, + "rewards/margins": 8.427644729614258, + "rewards/rejected": -17.035076141357422, + "step": 501 + }, + { + "epoch": 0.7485554520037279, + "grad_norm": 97.32340827582183, + "learning_rate": 1.6250849924089482e-07, + "logits/chosen": 0.7162159085273743, + "logits/rejected": 1.4583920240402222, + "logps/chosen": -1.1494529247283936, + "logps/rejected": -1.890822410583496, + "loss": 2.3423, + "nll_loss": 1.1494529247283936, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.49453067779541, + "rewards/margins": 7.413692951202393, + "rewards/rejected": -18.90822410583496, + "step": 502 + }, + { + "epoch": 0.7500465983224603, + "grad_norm": 66.08646716231478, + "learning_rate": 1.6069031927450692e-07, + "logits/chosen": 2.2476208209991455, + "logits/rejected": 2.9070322513580322, + "logps/chosen": -1.1560183763504028, + "logps/rejected": -3.0961015224456787, + "loss": 2.566, + "nll_loss": 1.1560183763504028, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.56018352508545, + "rewards/margins": 19.40083122253418, + "rewards/rejected": -30.961013793945312, + "step": 503 + }, + { + "epoch": 0.7515377446411929, + "grad_norm": 37.34765381376621, + "learning_rate": 1.5888041836528914e-07, + "logits/chosen": 0.9423756003379822, + "logits/rejected": 0.8755822777748108, + "logps/chosen": -1.1919398307800293, + "logps/rejected": -2.717829942703247, + "loss": 2.0692, + "nll_loss": 1.1919398307800293, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.919397354125977, + "rewards/margins": 15.25890064239502, + "rewards/rejected": -27.178298950195312, + "step": 504 + }, + { + "epoch": 0.7530288909599254, + "grad_norm": 43.39433506785265, + "learning_rate": 1.5707884067429471e-07, + "logits/chosen": 0.8235660195350647, + "logits/rejected": 0.9912365078926086, + "logps/chosen": -1.7123976945877075, + "logps/rejected": -3.390589475631714, + "loss": 1.8105, + "nll_loss": 1.7123976945877075, + "rewards/accuracies": 0.75, + "rewards/chosen": -17.123977661132812, + "rewards/margins": 16.781919479370117, + "rewards/rejected": -33.9058952331543, + "step": 505 + }, + { + "epoch": 0.754520037278658, + "grad_norm": 47.056197370329635, + "learning_rate": 1.552856301594942e-07, + "logits/chosen": -0.24026452004909515, + "logits/rejected": 0.23408591747283936, + "logps/chosen": -1.2406717538833618, + "logps/rejected": -3.218183994293213, + "loss": 2.5921, + "nll_loss": 1.2406718730926514, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.406718254089355, + "rewards/margins": 19.775121688842773, + "rewards/rejected": -32.18183898925781, + "step": 506 + }, + { + "epoch": 0.7560111835973905, + "grad_norm": 121.73397969203478, + "learning_rate": 1.5350083057469998e-07, + "logits/chosen": 1.063665747642517, + "logits/rejected": 1.3006550073623657, + "logps/chosen": -0.9579554200172424, + "logps/rejected": -3.6276798248291016, + "loss": 2.4975, + "nll_loss": 0.9579554200172424, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.579554557800293, + "rewards/margins": 26.697248458862305, + "rewards/rejected": -36.27680206298828, + "step": 507 + }, + { + "epoch": 0.757502329916123, + "grad_norm": 74.82208165860814, + "learning_rate": 1.5172448546850163e-07, + "logits/chosen": 1.4153053760528564, + "logits/rejected": 1.931809902191162, + "logps/chosen": -1.1757323741912842, + "logps/rejected": -3.0061824321746826, + "loss": 2.2088, + "nll_loss": 1.1757322549819946, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.75732421875, + "rewards/margins": 18.30449867248535, + "rewards/rejected": -30.061824798583984, + "step": 508 + }, + { + "epoch": 0.7589934762348556, + "grad_norm": 107.8466687249151, + "learning_rate": 1.4995663818320071e-07, + "logits/chosen": 1.366182804107666, + "logits/rejected": 1.5222809314727783, + "logps/chosen": -1.8494032621383667, + "logps/rejected": -2.9303297996520996, + "loss": 2.3564, + "nll_loss": 1.8494032621383667, + "rewards/accuracies": 0.875, + "rewards/chosen": -18.494033813476562, + "rewards/margins": 10.809263229370117, + "rewards/rejected": -29.303298950195312, + "step": 509 + }, + { + "epoch": 0.7604846225535881, + "grad_norm": 61.16767852788904, + "learning_rate": 1.4819733185375531e-07, + "logits/chosen": 1.347992181777954, + "logits/rejected": 1.6785510778427124, + "logps/chosen": -0.7743812799453735, + "logps/rejected": -2.0755648612976074, + "loss": 2.1959, + "nll_loss": 0.7743812799453735, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.743812561035156, + "rewards/margins": 13.011836051940918, + "rewards/rejected": -20.75564956665039, + "step": 510 + }, + { + "epoch": 0.7619757688723205, + "grad_norm": 58.20432865668072, + "learning_rate": 1.4644660940672627e-07, + "logits/chosen": 0.6616173982620239, + "logits/rejected": 0.8984595537185669, + "logps/chosen": -1.6654181480407715, + "logps/rejected": -3.81215238571167, + "loss": 2.0691, + "nll_loss": 1.665418267250061, + "rewards/accuracies": 0.875, + "rewards/chosen": -16.6541805267334, + "rewards/margins": 21.467344284057617, + "rewards/rejected": -38.12152862548828, + "step": 511 + }, + { + "epoch": 0.7634669151910531, + "grad_norm": 52.55922504686432, + "learning_rate": 1.4470451355923024e-07, + "logits/chosen": 0.6327857971191406, + "logits/rejected": 0.07023850828409195, + "logps/chosen": -0.8465753793716431, + "logps/rejected": -2.0434751510620117, + "loss": 2.1946, + "nll_loss": 0.8465753793716431, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.465753555297852, + "rewards/margins": 11.968996047973633, + "rewards/rejected": -20.434751510620117, + "step": 512 + }, + { + "epoch": 0.7649580615097856, + "grad_norm": 49.18096490201257, + "learning_rate": 1.4297108681789749e-07, + "logits/chosen": 1.6971843242645264, + "logits/rejected": 1.4284838438034058, + "logps/chosen": -1.6984788179397583, + "logps/rejected": -1.7207996845245361, + "loss": 2.1438, + "nll_loss": 1.6984788179397583, + "rewards/accuracies": 0.375, + "rewards/chosen": -16.98478889465332, + "rewards/margins": 0.22320926189422607, + "rewards/rejected": -17.207996368408203, + "step": 513 + }, + { + "epoch": 0.7664492078285182, + "grad_norm": 63.55350906873402, + "learning_rate": 1.412463714778343e-07, + "logits/chosen": 1.1489533185958862, + "logits/rejected": 1.6662311553955078, + "logps/chosen": -1.5277798175811768, + "logps/rejected": -2.280330181121826, + "loss": 1.2412, + "nll_loss": 1.5277798175811768, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.27779769897461, + "rewards/margins": 7.525506019592285, + "rewards/rejected": -22.803302764892578, + "step": 514 + }, + { + "epoch": 0.7679403541472507, + "grad_norm": 57.944172276092736, + "learning_rate": 1.3953040962159207e-07, + "logits/chosen": 1.8766756057739258, + "logits/rejected": 1.922022819519043, + "logps/chosen": -1.529194712638855, + "logps/rejected": -2.722909450531006, + "loss": 2.0906, + "nll_loss": 1.5291945934295654, + "rewards/accuracies": 0.875, + "rewards/chosen": -15.291946411132812, + "rewards/margins": 11.937149047851562, + "rewards/rejected": -27.229095458984375, + "step": 515 + }, + { + "epoch": 0.7694315004659832, + "grad_norm": 51.39188096538205, + "learning_rate": 1.3782324311813858e-07, + "logits/chosen": 1.4132533073425293, + "logits/rejected": 0.8430649042129517, + "logps/chosen": -1.2825404405593872, + "logps/rejected": -2.1167383193969727, + "loss": 1.9975, + "nll_loss": 1.2825404405593872, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.82540512084961, + "rewards/margins": 8.3419771194458, + "rewards/rejected": -21.167383193969727, + "step": 516 + }, + { + "epoch": 0.7709226467847158, + "grad_norm": 59.228716606988534, + "learning_rate": 1.3612491362183887e-07, + "logits/chosen": 0.7483983635902405, + "logits/rejected": 0.7808694243431091, + "logps/chosen": -1.6511446237564087, + "logps/rejected": -2.752027750015259, + "loss": 2.5048, + "nll_loss": 1.6511446237564087, + "rewards/accuracies": 0.75, + "rewards/chosen": -16.511444091796875, + "rewards/margins": 11.008831024169922, + "rewards/rejected": -27.52027702331543, + "step": 517 + }, + { + "epoch": 0.7724137931034483, + "grad_norm": 66.07522781310549, + "learning_rate": 1.3443546257143623e-07, + "logits/chosen": 0.7868104577064514, + "logits/rejected": 0.8091244101524353, + "logps/chosen": -0.9046810269355774, + "logps/rejected": -5.082855224609375, + "loss": 1.9796, + "nll_loss": 0.9046810269355774, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.046810150146484, + "rewards/margins": 41.78173828125, + "rewards/rejected": -50.82854461669922, + "step": 518 + }, + { + "epoch": 0.7739049394221807, + "grad_norm": 63.93450684685768, + "learning_rate": 1.3275493118904385e-07, + "logits/chosen": 0.5779823660850525, + "logits/rejected": 0.955024003982544, + "logps/chosen": -1.513691782951355, + "logps/rejected": -2.9146623611450195, + "loss": 1.7256, + "nll_loss": 1.513691782951355, + "rewards/accuracies": 0.875, + "rewards/chosen": -15.136917114257812, + "rewards/margins": 14.009706497192383, + "rewards/rejected": -29.146621704101562, + "step": 519 + }, + { + "epoch": 0.7753960857409133, + "grad_norm": 91.77706933219835, + "learning_rate": 1.3108336047913633e-07, + "logits/chosen": 1.4290512800216675, + "logits/rejected": 1.507631540298462, + "logps/chosen": -1.4324660301208496, + "logps/rejected": -3.8189008235931396, + "loss": 2.546, + "nll_loss": 1.4324660301208496, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.324661254882812, + "rewards/margins": 23.86434555053711, + "rewards/rejected": -38.18900680541992, + "step": 520 + }, + { + "epoch": 0.7768872320596458, + "grad_norm": 106.59145513895703, + "learning_rate": 1.2942079122755163e-07, + "logits/chosen": 0.4324963688850403, + "logits/rejected": 0.6091005802154541, + "logps/chosen": -1.304144263267517, + "logps/rejected": -2.2639334201812744, + "loss": 2.1878, + "nll_loss": 1.3041443824768066, + "rewards/accuracies": 0.75, + "rewards/chosen": -13.041441917419434, + "rewards/margins": 9.597891807556152, + "rewards/rejected": -22.63933563232422, + "step": 521 + }, + { + "epoch": 0.7783783783783784, + "grad_norm": 62.71439391098405, + "learning_rate": 1.277672640004936e-07, + "logits/chosen": 1.8813598155975342, + "logits/rejected": 2.191922187805176, + "logps/chosen": -0.9609942436218262, + "logps/rejected": -2.417336940765381, + "loss": 1.5024, + "nll_loss": 0.9609941840171814, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.609942436218262, + "rewards/margins": 14.563425064086914, + "rewards/rejected": -24.173368453979492, + "step": 522 + }, + { + "epoch": 0.7798695246971109, + "grad_norm": 46.30568847741028, + "learning_rate": 1.261228191435445e-07, + "logits/chosen": 0.815243661403656, + "logits/rejected": 1.5941507816314697, + "logps/chosen": -1.1938700675964355, + "logps/rejected": -4.132516860961914, + "loss": 1.6403, + "nll_loss": 1.1938700675964355, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.938700675964355, + "rewards/margins": 29.386470794677734, + "rewards/rejected": -41.325172424316406, + "step": 523 + }, + { + "epoch": 0.7813606710158434, + "grad_norm": 58.473201038518226, + "learning_rate": 1.2448749678067855e-07, + "logits/chosen": 0.14041255414485931, + "logits/rejected": -0.08654403686523438, + "logps/chosen": -0.8794372081756592, + "logps/rejected": -2.187777519226074, + "loss": 2.5804, + "nll_loss": 0.8794372081756592, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.79437255859375, + "rewards/margins": 13.083402633666992, + "rewards/rejected": -21.877775192260742, + "step": 524 + }, + { + "epoch": 0.782851817334576, + "grad_norm": 48.892135342518635, + "learning_rate": 1.228613368132842e-07, + "logits/chosen": 0.5651916861534119, + "logits/rejected": 0.992500901222229, + "logps/chosen": -1.1246737241744995, + "logps/rejected": -2.251676321029663, + "loss": 2.0336, + "nll_loss": 1.1246737241744995, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.24673843383789, + "rewards/margins": 11.270025253295898, + "rewards/rejected": -22.51676368713379, + "step": 525 + }, + { + "epoch": 0.7843429636533085, + "grad_norm": 58.22641776546797, + "learning_rate": 1.2124437891918993e-07, + "logits/chosen": 0.8821541666984558, + "logits/rejected": 1.1546046733856201, + "logps/chosen": -1.0839579105377197, + "logps/rejected": -2.731253147125244, + "loss": 2.2118, + "nll_loss": 1.0839580297470093, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.839579582214355, + "rewards/margins": 16.47295379638672, + "rewards/rejected": -27.312532424926758, + "step": 526 + }, + { + "epoch": 0.7858341099720411, + "grad_norm": 136.31863333056407, + "learning_rate": 1.1963666255169645e-07, + "logits/chosen": 2.6750237941741943, + "logits/rejected": 2.6834774017333984, + "logps/chosen": -1.1975797414779663, + "logps/rejected": -3.423466444015503, + "loss": 2.3932, + "nll_loss": 1.1975797414779663, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.975796699523926, + "rewards/margins": 22.258865356445312, + "rewards/rejected": -34.23466110229492, + "step": 527 + }, + { + "epoch": 0.7873252562907735, + "grad_norm": 68.2415641801262, + "learning_rate": 1.1803822693861377e-07, + "logits/chosen": 0.39841076731681824, + "logits/rejected": 0.6484661102294922, + "logps/chosen": -0.8886002898216248, + "logps/rejected": -2.301114082336426, + "loss": 1.5966, + "nll_loss": 0.8886002898216248, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.886002540588379, + "rewards/margins": 14.125136375427246, + "rewards/rejected": -23.011140823364258, + "step": 528 + }, + { + "epoch": 0.788816402609506, + "grad_norm": 51.51857479336736, + "learning_rate": 1.1644911108130434e-07, + "logits/chosen": -0.4365030825138092, + "logits/rejected": -0.8091474175453186, + "logps/chosen": -1.275532603263855, + "logps/rejected": -2.5970406532287598, + "loss": 1.5767, + "nll_loss": 1.275532603263855, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.755326271057129, + "rewards/margins": 13.215079307556152, + "rewards/rejected": -25.97040557861328, + "step": 529 + }, + { + "epoch": 0.7903075489282386, + "grad_norm": 80.27794159826863, + "learning_rate": 1.1486935375373124e-07, + "logits/chosen": 1.818937063217163, + "logits/rejected": 2.0481789112091064, + "logps/chosen": -1.1100715398788452, + "logps/rejected": -1.631326675415039, + "loss": 1.4015, + "nll_loss": 1.1100716590881348, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.100716590881348, + "rewards/margins": 5.212551593780518, + "rewards/rejected": -16.313268661499023, + "step": 530 + }, + { + "epoch": 0.7917986952469711, + "grad_norm": 76.1622733301883, + "learning_rate": 1.1329899350151212e-07, + "logits/chosen": 0.8246088624000549, + "logits/rejected": 1.6595807075500488, + "logps/chosen": -1.6400606632232666, + "logps/rejected": -3.3556313514709473, + "loss": 2.8103, + "nll_loss": 1.6400606632232666, + "rewards/accuracies": 0.875, + "rewards/chosen": -16.400606155395508, + "rewards/margins": 17.15570831298828, + "rewards/rejected": -33.55632019042969, + "step": 531 + }, + { + "epoch": 0.7932898415657036, + "grad_norm": 69.79747788603979, + "learning_rate": 1.1173806864097884e-07, + "logits/chosen": 0.603188693523407, + "logits/rejected": 1.2856109142303467, + "logps/chosen": -1.375902533531189, + "logps/rejected": -2.999077081680298, + "loss": 2.2676, + "nll_loss": 1.3759024143218994, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.759024620056152, + "rewards/margins": 16.23174476623535, + "rewards/rejected": -29.99077033996582, + "step": 532 + }, + { + "epoch": 0.7947809878844362, + "grad_norm": 107.54460434268915, + "learning_rate": 1.101866172582423e-07, + "logits/chosen": -0.07562939822673798, + "logits/rejected": 0.8102388381958008, + "logps/chosen": -1.3312407732009888, + "logps/rejected": -3.5981345176696777, + "loss": 2.4654, + "nll_loss": 1.3312406539916992, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.312407493591309, + "rewards/margins": 22.66893768310547, + "rewards/rejected": -35.981346130371094, + "step": 533 + }, + { + "epoch": 0.7962721342031687, + "grad_norm": 37.98279471066673, + "learning_rate": 1.0864467720826343e-07, + "logits/chosen": 2.0333845615386963, + "logits/rejected": 1.3615199327468872, + "logps/chosen": -0.9712334275245667, + "logps/rejected": -2.4473283290863037, + "loss": 1.73, + "nll_loss": 0.9712334275245667, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.712334632873535, + "rewards/margins": 14.76095199584961, + "rewards/rejected": -24.473285675048828, + "step": 534 + }, + { + "epoch": 0.7977632805219013, + "grad_norm": 70.00952769733418, + "learning_rate": 1.0711228611392936e-07, + "logits/chosen": 1.8075847625732422, + "logits/rejected": 2.0040998458862305, + "logps/chosen": -1.8060308694839478, + "logps/rejected": -2.689011573791504, + "loss": 2.6005, + "nll_loss": 1.8060306310653687, + "rewards/accuracies": 0.875, + "rewards/chosen": -18.0603084564209, + "rewards/margins": 8.82980728149414, + "rewards/rejected": -26.890113830566406, + "step": 535 + }, + { + "epoch": 0.7992544268406337, + "grad_norm": 33.600323761305795, + "learning_rate": 1.0558948136513534e-07, + "logits/chosen": 1.3355344533920288, + "logits/rejected": 1.2286067008972168, + "logps/chosen": -1.0286279916763306, + "logps/rejected": -1.7476571798324585, + "loss": 1.7296, + "nll_loss": 1.0286281108856201, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.28628158569336, + "rewards/margins": 7.190290451049805, + "rewards/rejected": -17.476572036743164, + "step": 536 + }, + { + "epoch": 0.8007455731593662, + "grad_norm": 39.83559136796347, + "learning_rate": 1.0407630011787328e-07, + "logits/chosen": 1.4175457954406738, + "logits/rejected": 1.360532283782959, + "logps/chosen": -1.1332502365112305, + "logps/rejected": -4.271042346954346, + "loss": 1.7862, + "nll_loss": 1.1332502365112305, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.332502365112305, + "rewards/margins": 31.37792205810547, + "rewards/rejected": -42.71042251586914, + "step": 537 + }, + { + "epoch": 0.8022367194780988, + "grad_norm": 49.229915814260536, + "learning_rate": 1.0257277929332331e-07, + "logits/chosen": 0.09370435774326324, + "logits/rejected": 0.3487294912338257, + "logps/chosen": -1.0228612422943115, + "logps/rejected": -2.6441450119018555, + "loss": 1.7603, + "nll_loss": 1.0228612422943115, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.22861099243164, + "rewards/margins": 16.212839126586914, + "rewards/rejected": -26.441448211669922, + "step": 538 + }, + { + "epoch": 0.8037278657968313, + "grad_norm": 43.20291464295552, + "learning_rate": 1.0107895557695523e-07, + "logits/chosen": 1.0065107345581055, + "logits/rejected": 0.9046751856803894, + "logps/chosen": -1.1206310987472534, + "logps/rejected": -2.8322832584381104, + "loss": 2.025, + "nll_loss": 1.1206310987472534, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.20631217956543, + "rewards/margins": 17.116519927978516, + "rewards/rejected": -28.322832107543945, + "step": 539 + }, + { + "epoch": 0.8052190121155638, + "grad_norm": 87.85547545961866, + "learning_rate": 9.959486541763118e-08, + "logits/chosen": 0.5322209596633911, + "logits/rejected": 1.0989861488342285, + "logps/chosen": -1.4311912059783936, + "logps/rejected": -8.2313871383667, + "loss": 1.784, + "nll_loss": 1.4311912059783936, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.311910629272461, + "rewards/margins": 68.00196838378906, + "rewards/rejected": -82.3138656616211, + "step": 540 + }, + { + "epoch": 0.8067101584342964, + "grad_norm": 35.18616471342836, + "learning_rate": 9.812054502671834e-08, + "logits/chosen": 1.0977833271026611, + "logits/rejected": 1.6604937314987183, + "logps/chosen": -1.293630838394165, + "logps/rejected": -2.6169798374176025, + "loss": 2.5025, + "nll_loss": 1.2936309576034546, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.936308860778809, + "rewards/margins": 13.233489990234375, + "rewards/rejected": -26.1697998046875, + "step": 541 + }, + { + "epoch": 0.8082013047530289, + "grad_norm": 100.07499413913182, + "learning_rate": 9.66560303772035e-08, + "logits/chosen": 0.806122899055481, + "logits/rejected": 1.437886118888855, + "logps/chosen": -1.526921272277832, + "logps/rejected": -1.9441941976547241, + "loss": 2.3626, + "nll_loss": 1.526921272277832, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.269213676452637, + "rewards/margins": 4.172728538513184, + "rewards/rejected": -19.44194221496582, + "step": 542 + }, + { + "epoch": 0.8096924510717615, + "grad_norm": 85.39766336575707, + "learning_rate": 9.520135720281691e-08, + "logits/chosen": 1.511439323425293, + "logits/rejected": 1.9532711505889893, + "logps/chosen": -1.13179612159729, + "logps/rejected": -2.3110647201538086, + "loss": 2.6602, + "nll_loss": 1.1317960023880005, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.317960739135742, + "rewards/margins": 11.79268741607666, + "rewards/rejected": -23.110647201538086, + "step": 543 + }, + { + "epoch": 0.811183597390494, + "grad_norm": 103.46761333299857, + "learning_rate": 9.375656099715934e-08, + "logits/chosen": 1.2593380212783813, + "logits/rejected": 1.5915199518203735, + "logps/chosen": -1.6882553100585938, + "logps/rejected": -2.8256163597106934, + "loss": 1.8447, + "nll_loss": 1.6882551908493042, + "rewards/accuracies": 0.75, + "rewards/chosen": -16.882553100585938, + "rewards/margins": 11.373611450195312, + "rewards/rejected": -28.256162643432617, + "step": 544 + }, + { + "epoch": 0.8126747437092264, + "grad_norm": 55.77866402643874, + "learning_rate": 9.23216770128365e-08, + "logits/chosen": 1.0406886339187622, + "logits/rejected": 0.8361440300941467, + "logps/chosen": -1.4198107719421387, + "logps/rejected": -1.9521433115005493, + "loss": 2.1216, + "nll_loss": 1.4198107719421387, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.19810676574707, + "rewards/margins": 5.323326587677002, + "rewards/rejected": -19.521432876586914, + "step": 545 + }, + { + "epoch": 0.814165890027959, + "grad_norm": 63.57895472466941, + "learning_rate": 9.08967402605988e-08, + "logits/chosen": 0.09123604744672775, + "logits/rejected": -0.03876099735498428, + "logps/chosen": -0.7894692420959473, + "logps/rejected": -1.5298676490783691, + "loss": 2.2162, + "nll_loss": 0.7894692420959473, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.894692420959473, + "rewards/margins": 7.403984069824219, + "rewards/rejected": -15.298676490783691, + "step": 546 + }, + { + "epoch": 0.8156570363466915, + "grad_norm": 54.349500942594744, + "learning_rate": 8.9481785508487e-08, + "logits/chosen": 1.6667102575302124, + "logits/rejected": 2.203434944152832, + "logps/chosen": -1.2966077327728271, + "logps/rejected": -3.049083948135376, + "loss": 2.4018, + "nll_loss": 1.2966077327728271, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.966076850891113, + "rewards/margins": 17.52476692199707, + "rewards/rejected": -30.490842819213867, + "step": 547 + }, + { + "epoch": 0.817148182665424, + "grad_norm": 54.9356955967697, + "learning_rate": 8.807684728098419e-08, + "logits/chosen": 1.4617446660995483, + "logits/rejected": 2.135504961013794, + "logps/chosen": -1.8497101068496704, + "logps/rejected": -3.3187100887298584, + "loss": 2.004, + "nll_loss": 1.8497099876403809, + "rewards/accuracies": 0.875, + "rewards/chosen": -18.497100830078125, + "rewards/margins": 14.689998626708984, + "rewards/rejected": -33.187103271484375, + "step": 548 + }, + { + "epoch": 0.8186393289841566, + "grad_norm": 86.27572672988894, + "learning_rate": 8.668195985817289e-08, + "logits/chosen": 1.0450414419174194, + "logits/rejected": 1.7045292854309082, + "logps/chosen": -1.3397154808044434, + "logps/rejected": -2.9649925231933594, + "loss": 2.3038, + "nll_loss": 1.3397154808044434, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.397154808044434, + "rewards/margins": 16.252769470214844, + "rewards/rejected": -29.649925231933594, + "step": 549 + }, + { + "epoch": 0.8201304753028891, + "grad_norm": 56.54653825260291, + "learning_rate": 8.529715727489912e-08, + "logits/chosen": 0.3464043438434601, + "logits/rejected": 0.28210508823394775, + "logps/chosen": -1.1573177576065063, + "logps/rejected": -2.432623863220215, + "loss": 1.177, + "nll_loss": 1.1573176383972168, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.5731782913208, + "rewards/margins": 12.753060340881348, + "rewards/rejected": -24.32623863220215, + "step": 550 + }, + { + "epoch": 0.8216216216216217, + "grad_norm": 62.44280686277024, + "learning_rate": 8.392247331994173e-08, + "logits/chosen": 1.1887741088867188, + "logits/rejected": 1.7529797554016113, + "logps/chosen": -0.8768868446350098, + "logps/rejected": -2.667144775390625, + "loss": 2.5923, + "nll_loss": 0.8768867254257202, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.768867492675781, + "rewards/margins": 17.90258026123047, + "rewards/rejected": -26.671445846557617, + "step": 551 + }, + { + "epoch": 0.8231127679403542, + "grad_norm": 45.94550332790867, + "learning_rate": 8.255794153518798e-08, + "logits/chosen": 0.9571793079376221, + "logits/rejected": 1.1607004404067993, + "logps/chosen": -1.3052154779434204, + "logps/rejected": -3.089205265045166, + "loss": 2.5004, + "nll_loss": 1.3052154779434204, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.052156448364258, + "rewards/margins": 17.83989715576172, + "rewards/rejected": -30.892053604125977, + "step": 552 + }, + { + "epoch": 0.8246039142590866, + "grad_norm": 124.92358818048629, + "learning_rate": 8.120359521481501e-08, + "logits/chosen": 0.836337149143219, + "logits/rejected": 1.1571295261383057, + "logps/chosen": -1.6450185775756836, + "logps/rejected": -2.187589645385742, + "loss": 2.6545, + "nll_loss": 1.6450185775756836, + "rewards/accuracies": 0.875, + "rewards/chosen": -16.450185775756836, + "rewards/margins": 5.425711154937744, + "rewards/rejected": -21.875896453857422, + "step": 553 + }, + { + "epoch": 0.8260950605778192, + "grad_norm": 44.49060840808658, + "learning_rate": 7.985946740447791e-08, + "logits/chosen": -0.2631034255027771, + "logits/rejected": 0.145988330245018, + "logps/chosen": -1.6813266277313232, + "logps/rejected": -2.6556098461151123, + "loss": 2.0102, + "nll_loss": 1.6813266277313232, + "rewards/accuracies": 0.625, + "rewards/chosen": -16.81326675415039, + "rewards/margins": 9.742830276489258, + "rewards/rejected": -26.55609703063965, + "step": 554 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 37.85218664393302, + "learning_rate": 7.852559090050276e-08, + "logits/chosen": 1.5902546644210815, + "logits/rejected": 1.614829421043396, + "logps/chosen": -1.2583115100860596, + "logps/rejected": -2.5400302410125732, + "loss": 1.4264, + "nll_loss": 1.25831139087677, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.583114624023438, + "rewards/margins": 12.817188262939453, + "rewards/rejected": -25.40030288696289, + "step": 555 + }, + { + "epoch": 0.8290773532152842, + "grad_norm": 49.26407932935538, + "learning_rate": 7.720199824908691e-08, + "logits/chosen": 1.361399531364441, + "logits/rejected": 1.409611463546753, + "logps/chosen": -1.4241909980773926, + "logps/rejected": -3.1376218795776367, + "loss": 0.9169, + "nll_loss": 1.424190878868103, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.24190902709961, + "rewards/margins": 17.13431167602539, + "rewards/rejected": -31.376222610473633, + "step": 556 + }, + { + "epoch": 0.8305684995340168, + "grad_norm": 31.238322178644815, + "learning_rate": 7.588872174550498e-08, + "logits/chosen": 0.7303067445755005, + "logits/rejected": 1.2701952457427979, + "logps/chosen": -1.468266248703003, + "logps/rejected": -2.4732413291931152, + "loss": 1.712, + "nll_loss": 1.4682661294937134, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.682662010192871, + "rewards/margins": 10.049752235412598, + "rewards/rejected": -24.73241424560547, + "step": 557 + }, + { + "epoch": 0.8320596458527493, + "grad_norm": 38.209656730980115, + "learning_rate": 7.458579343331995e-08, + "logits/chosen": 1.2878832817077637, + "logits/rejected": 1.2985243797302246, + "logps/chosen": -1.9119257926940918, + "logps/rejected": -3.6364402770996094, + "loss": 1.634, + "nll_loss": 1.911926031112671, + "rewards/accuracies": 0.875, + "rewards/chosen": -19.119258880615234, + "rewards/margins": 17.24514389038086, + "rewards/rejected": -36.364402770996094, + "step": 558 + }, + { + "epoch": 0.8335507921714819, + "grad_norm": 66.1367570680353, + "learning_rate": 7.329324510360269e-08, + "logits/chosen": 0.8806648254394531, + "logits/rejected": 1.2991594076156616, + "logps/chosen": -1.3400239944458008, + "logps/rejected": -2.963407516479492, + "loss": 1.9285, + "nll_loss": 1.3400241136550903, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.400240898132324, + "rewards/margins": 16.233837127685547, + "rewards/rejected": -29.634078979492188, + "step": 559 + }, + { + "epoch": 0.8350419384902144, + "grad_norm": 298.9857034157112, + "learning_rate": 7.20111082941548e-08, + "logits/chosen": 1.279545783996582, + "logits/rejected": 1.127055048942566, + "logps/chosen": -1.0575612783432007, + "logps/rejected": -1.7720974683761597, + "loss": 3.8, + "nll_loss": 1.0575611591339111, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.57561206817627, + "rewards/margins": 7.145363807678223, + "rewards/rejected": -17.720977783203125, + "step": 560 + }, + { + "epoch": 0.8365330848089468, + "grad_norm": 41.34376803126447, + "learning_rate": 7.073941428874064e-08, + "logits/chosen": 1.465010643005371, + "logits/rejected": 1.6831226348876953, + "logps/chosen": -1.0001267194747925, + "logps/rejected": -2.4494314193725586, + "loss": 2.1194, + "nll_loss": 1.0001269578933716, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.00126838684082, + "rewards/margins": 14.493046760559082, + "rewards/rejected": -24.49431610107422, + "step": 561 + }, + { + "epoch": 0.8380242311276794, + "grad_norm": 64.56056727469941, + "learning_rate": 6.947819411632222e-08, + "logits/chosen": 1.2089592218399048, + "logits/rejected": 1.150889277458191, + "logps/chosen": -1.1490910053253174, + "logps/rejected": -2.86586594581604, + "loss": 1.8347, + "nll_loss": 1.1490910053253174, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.490909576416016, + "rewards/margins": 17.167747497558594, + "rewards/rejected": -28.65865707397461, + "step": 562 + }, + { + "epoch": 0.8395153774464119, + "grad_norm": 44.61651161600099, + "learning_rate": 6.822747855030414e-08, + "logits/chosen": 0.7507335543632507, + "logits/rejected": 0.36784982681274414, + "logps/chosen": -1.301298975944519, + "logps/rejected": -2.050633430480957, + "loss": 2.4966, + "nll_loss": 1.3012988567352295, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.012989044189453, + "rewards/margins": 7.493346214294434, + "rewards/rejected": -20.506336212158203, + "step": 563 + }, + { + "epoch": 0.8410065237651445, + "grad_norm": 59.12945591715341, + "learning_rate": 6.698729810778064e-08, + "logits/chosen": 1.094009518623352, + "logits/rejected": 1.172784686088562, + "logps/chosen": -1.4129308462142944, + "logps/rejected": -2.2906455993652344, + "loss": 2.0096, + "nll_loss": 1.412930965423584, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.129308700561523, + "rewards/margins": 8.777146339416504, + "rewards/rejected": -22.906455993652344, + "step": 564 + }, + { + "epoch": 0.842497670083877, + "grad_norm": 96.69133779250849, + "learning_rate": 6.575768304879292e-08, + "logits/chosen": 1.5224504470825195, + "logits/rejected": 0.9588276147842407, + "logps/chosen": -1.3630435466766357, + "logps/rejected": -1.9835121631622314, + "loss": 2.4146, + "nll_loss": 1.3630435466766357, + "rewards/accuracies": 0.75, + "rewards/chosen": -13.630434036254883, + "rewards/margins": 6.204684257507324, + "rewards/rejected": -19.835119247436523, + "step": 565 + }, + { + "epoch": 0.8439888164026095, + "grad_norm": 42.53739974720539, + "learning_rate": 6.453866337558939e-08, + "logits/chosen": 0.6651906371116638, + "logits/rejected": 0.856502890586853, + "logps/chosen": -1.211168646812439, + "logps/rejected": -2.185823440551758, + "loss": 2.0492, + "nll_loss": 1.2111684083938599, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.111686706542969, + "rewards/margins": 9.746549606323242, + "rewards/rejected": -21.858234405517578, + "step": 566 + }, + { + "epoch": 0.8454799627213421, + "grad_norm": 54.55584240804619, + "learning_rate": 6.333026883189424e-08, + "logits/chosen": 1.4041742086410522, + "logits/rejected": 1.2528077363967896, + "logps/chosen": -1.6688252687454224, + "logps/rejected": -3.5840983390808105, + "loss": 1.9207, + "nll_loss": 1.668825387954712, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.688251495361328, + "rewards/margins": 19.152727127075195, + "rewards/rejected": -35.840980529785156, + "step": 567 + }, + { + "epoch": 0.8469711090400746, + "grad_norm": 38.24669997132275, + "learning_rate": 6.213252890218162e-08, + "logits/chosen": 1.8699870109558105, + "logits/rejected": 1.9853416681289673, + "logps/chosen": -1.3241424560546875, + "logps/rejected": -1.7929151058197021, + "loss": 2.0296, + "nll_loss": 1.324142575263977, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.241425514221191, + "rewards/margins": 4.687725067138672, + "rewards/rejected": -17.929149627685547, + "step": 568 + }, + { + "epoch": 0.848462255358807, + "grad_norm": 50.15694631335723, + "learning_rate": 6.094547281095619e-08, + "logits/chosen": 1.642357349395752, + "logits/rejected": 1.6782177686691284, + "logps/chosen": -1.40309476852417, + "logps/rejected": -3.434246778488159, + "loss": 1.5013, + "nll_loss": 1.4030948877334595, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.0309476852417, + "rewards/margins": 20.311519622802734, + "rewards/rejected": -34.342464447021484, + "step": 569 + }, + { + "epoch": 0.8499534016775396, + "grad_norm": 71.49518736065667, + "learning_rate": 5.976912952204016e-08, + "logits/chosen": 1.4287097454071045, + "logits/rejected": 1.656920313835144, + "logps/chosen": -1.3490947484970093, + "logps/rejected": -2.7284812927246094, + "loss": 2.129, + "nll_loss": 1.3490948677062988, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.490946769714355, + "rewards/margins": 13.793864250183105, + "rewards/rejected": -27.284812927246094, + "step": 570 + }, + { + "epoch": 0.8514445479962721, + "grad_norm": 32.25588380634021, + "learning_rate": 5.8603527737866307e-08, + "logits/chosen": 0.8476590514183044, + "logits/rejected": 0.6341254711151123, + "logps/chosen": -1.073239803314209, + "logps/rejected": -3.493922710418701, + "loss": 1.265, + "nll_loss": 1.0732399225234985, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.73239803314209, + "rewards/margins": 24.206829071044922, + "rewards/rejected": -34.93922805786133, + "step": 571 + }, + { + "epoch": 0.8529356943150047, + "grad_norm": 53.60751501091694, + "learning_rate": 5.7448695898778097e-08, + "logits/chosen": 1.249593734741211, + "logits/rejected": 1.2364052534103394, + "logps/chosen": -1.1790369749069214, + "logps/rejected": -6.050329208374023, + "loss": 1.3328, + "nll_loss": 1.1790369749069214, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.790369987487793, + "rewards/margins": 48.71292495727539, + "rewards/rejected": -60.5032958984375, + "step": 572 + }, + { + "epoch": 0.8544268406337372, + "grad_norm": 59.88563770735995, + "learning_rate": 5.63046621823352e-08, + "logits/chosen": 0.17915304005146027, + "logits/rejected": 0.8957693576812744, + "logps/chosen": -1.2315869331359863, + "logps/rejected": -2.526779890060425, + "loss": 1.785, + "nll_loss": 1.2315869331359863, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.315869331359863, + "rewards/margins": 12.951930046081543, + "rewards/rejected": -25.267799377441406, + "step": 573 + }, + { + "epoch": 0.8559179869524697, + "grad_norm": 74.06734466100485, + "learning_rate": 5.517145450262639e-08, + "logits/chosen": 0.946685791015625, + "logits/rejected": 0.8377312421798706, + "logps/chosen": -0.8227486610412598, + "logps/rejected": -4.0714111328125, + "loss": 2.0619, + "nll_loss": 0.8227487206459045, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.227487564086914, + "rewards/margins": 32.48662185668945, + "rewards/rejected": -40.714107513427734, + "step": 574 + }, + { + "epoch": 0.8574091332712023, + "grad_norm": 79.67704275085624, + "learning_rate": 5.404910050958833e-08, + "logits/chosen": 0.5169818997383118, + "logits/rejected": 0.3395709991455078, + "logps/chosen": -1.0307010412216187, + "logps/rejected": -2.207871437072754, + "loss": 2.3389, + "nll_loss": 1.0307011604309082, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.30700969696045, + "rewards/margins": 11.771703720092773, + "rewards/rejected": -22.07871437072754, + "step": 575 + }, + { + "epoch": 0.8589002795899348, + "grad_norm": 55.25972369360697, + "learning_rate": 5.29376275883307e-08, + "logits/chosen": 1.3334778547286987, + "logits/rejected": 1.3576440811157227, + "logps/chosen": -1.3050498962402344, + "logps/rejected": -2.214493751525879, + "loss": 1.5946, + "nll_loss": 1.305050015449524, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.050498962402344, + "rewards/margins": 9.094437599182129, + "rewards/rejected": -22.144935607910156, + "step": 576 + }, + { + "epoch": 0.8603914259086672, + "grad_norm": 66.70966380503094, + "learning_rate": 5.183706285846873e-08, + "logits/chosen": 1.4294792413711548, + "logits/rejected": 1.2827816009521484, + "logps/chosen": -1.2769086360931396, + "logps/rejected": -2.087651491165161, + "loss": 1.557, + "nll_loss": 1.2769086360931396, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.769086837768555, + "rewards/margins": 8.107427597045898, + "rewards/rejected": -20.876514434814453, + "step": 577 + }, + { + "epoch": 0.8618825722273998, + "grad_norm": 98.46543978254562, + "learning_rate": 5.0747433173460086e-08, + "logits/chosen": 0.6869046688079834, + "logits/rejected": 0.4922327697277069, + "logps/chosen": -1.5533781051635742, + "logps/rejected": -2.522840738296509, + "loss": 2.2821, + "nll_loss": 1.5533778667449951, + "rewards/accuracies": 0.625, + "rewards/chosen": -15.53377914428711, + "rewards/margins": 9.694629669189453, + "rewards/rejected": -25.228408813476562, + "step": 578 + }, + { + "epoch": 0.8633737185461323, + "grad_norm": 81.29000068555673, + "learning_rate": 4.966876511995149e-08, + "logits/chosen": 1.6201094388961792, + "logits/rejected": 2.0438897609710693, + "logps/chosen": -1.243577480316162, + "logps/rejected": -2.3286705017089844, + "loss": 1.0166, + "nll_loss": 1.243577480316162, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.435773849487305, + "rewards/margins": 10.850931167602539, + "rewards/rejected": -23.286705017089844, + "step": 579 + }, + { + "epoch": 0.8648648648648649, + "grad_norm": 40.303681390891704, + "learning_rate": 4.860108501712823e-08, + "logits/chosen": 0.5733177661895752, + "logits/rejected": 0.7400199174880981, + "logps/chosen": -1.0819733142852783, + "logps/rejected": -2.0159566402435303, + "loss": 1.8131, + "nll_loss": 1.0819733142852783, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.819732666015625, + "rewards/margins": 9.339835166931152, + "rewards/rejected": -20.159568786621094, + "step": 580 + }, + { + "epoch": 0.8663560111835974, + "grad_norm": 47.867505185407104, + "learning_rate": 4.754441891607347e-08, + "logits/chosen": 0.6819798946380615, + "logits/rejected": 1.1605446338653564, + "logps/chosen": -1.33807373046875, + "logps/rejected": -3.4481253623962402, + "loss": 1.5911, + "nll_loss": 1.338073492050171, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.380736351013184, + "rewards/margins": 21.100513458251953, + "rewards/rejected": -34.48125076293945, + "step": 581 + }, + { + "epoch": 0.8678471575023299, + "grad_norm": 39.98802770089898, + "learning_rate": 4.649879259913136e-08, + "logits/chosen": 0.2189992517232895, + "logits/rejected": 0.5796717405319214, + "logps/chosen": -1.145107388496399, + "logps/rejected": -2.4502243995666504, + "loss": 2.1808, + "nll_loss": 1.145107388496399, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.451074600219727, + "rewards/margins": 13.051168441772461, + "rewards/rejected": -24.502243041992188, + "step": 582 + }, + { + "epoch": 0.8693383038210625, + "grad_norm": 61.294343400740814, + "learning_rate": 4.54642315792792e-08, + "logits/chosen": 0.9891291856765747, + "logits/rejected": 0.7852681875228882, + "logps/chosen": -1.4123867750167847, + "logps/rejected": -2.1884164810180664, + "loss": 2.0489, + "nll_loss": 1.4123867750167847, + "rewards/accuracies": 0.625, + "rewards/chosen": -14.123867988586426, + "rewards/margins": 7.7602972984313965, + "rewards/rejected": -21.884164810180664, + "step": 583 + }, + { + "epoch": 0.870829450139795, + "grad_norm": 131.9277214916375, + "learning_rate": 4.4440761099503456e-08, + "logits/chosen": 0.7502199411392212, + "logits/rejected": 1.2810550928115845, + "logps/chosen": -1.5025020837783813, + "logps/rejected": -3.7406561374664307, + "loss": 2.0273, + "nll_loss": 1.5025020837783813, + "rewards/accuracies": 0.875, + "rewards/chosen": -15.025020599365234, + "rewards/margins": 22.38153839111328, + "rewards/rejected": -37.40655517578125, + "step": 584 + }, + { + "epoch": 0.8723205964585274, + "grad_norm": 102.54915589860299, + "learning_rate": 4.342840613218546e-08, + "logits/chosen": 0.70381760597229, + "logits/rejected": 0.3464395999908447, + "logps/chosen": -1.022120714187622, + "logps/rejected": -2.0207552909851074, + "loss": 1.713, + "nll_loss": 1.0221205949783325, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.221206665039062, + "rewards/margins": 9.986349105834961, + "rewards/rejected": -20.207555770874023, + "step": 585 + }, + { + "epoch": 0.87381174277726, + "grad_norm": 172.5828711123485, + "learning_rate": 4.242719137849077e-08, + "logits/chosen": 0.3403228521347046, + "logits/rejected": 0.8271859884262085, + "logps/chosen": -1.2993659973144531, + "logps/rejected": -3.0384278297424316, + "loss": 2.1327, + "nll_loss": 1.2993658781051636, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.993660926818848, + "rewards/margins": 17.3906192779541, + "rewards/rejected": -30.384279251098633, + "step": 586 + }, + { + "epoch": 0.8753028890959925, + "grad_norm": 41.59906874697002, + "learning_rate": 4.143714126776715e-08, + "logits/chosen": 0.8385416865348816, + "logits/rejected": 0.5051766633987427, + "logps/chosen": -1.114525318145752, + "logps/rejected": -2.0112345218658447, + "loss": 2.0028, + "nll_loss": 1.114525318145752, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.145252227783203, + "rewards/margins": 8.967092514038086, + "rewards/rejected": -20.112346649169922, + "step": 587 + }, + { + "epoch": 0.8767940354147251, + "grad_norm": 44.70675834424241, + "learning_rate": 4.045827995694834e-08, + "logits/chosen": 0.10920746624469757, + "logits/rejected": 0.29233425855636597, + "logps/chosen": -1.331228494644165, + "logps/rejected": -2.7238054275512695, + "loss": 1.9673, + "nll_loss": 1.3312286138534546, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.312284469604492, + "rewards/margins": 13.92576789855957, + "rewards/rejected": -27.238056182861328, + "step": 588 + }, + { + "epoch": 0.8782851817334576, + "grad_norm": 70.80189184290701, + "learning_rate": 3.9490631329964554e-08, + "logits/chosen": 1.0415701866149902, + "logits/rejected": 1.1610839366912842, + "logps/chosen": -1.3345025777816772, + "logps/rejected": -2.035282850265503, + "loss": 2.3617, + "nll_loss": 1.3345026969909668, + "rewards/accuracies": 0.5, + "rewards/chosen": -13.345026016235352, + "rewards/margins": 7.0078020095825195, + "rewards/rejected": -20.352828979492188, + "step": 589 + }, + { + "epoch": 0.8797763280521901, + "grad_norm": 97.95046250851314, + "learning_rate": 3.853421899715992e-08, + "logits/chosen": 0.18456491827964783, + "logits/rejected": 0.5277552008628845, + "logps/chosen": -1.9759291410446167, + "logps/rejected": -2.498504161834717, + "loss": 3.3488, + "nll_loss": 1.9759293794631958, + "rewards/accuracies": 0.875, + "rewards/chosen": -19.759288787841797, + "rewards/margins": 5.2257513999938965, + "rewards/rejected": -24.985044479370117, + "step": 590 + }, + { + "epoch": 0.8812674743709227, + "grad_norm": 41.890934746252, + "learning_rate": 3.758906629471614e-08, + "logits/chosen": 0.46019110083580017, + "logits/rejected": 0.36783766746520996, + "logps/chosen": -0.7036344408988953, + "logps/rejected": -2.221921920776367, + "loss": 1.2762, + "nll_loss": 0.70363450050354, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.036343574523926, + "rewards/margins": 15.182877540588379, + "rewards/rejected": -22.219223022460938, + "step": 591 + }, + { + "epoch": 0.8827586206896552, + "grad_norm": 123.64391132718144, + "learning_rate": 3.665519628408331e-08, + "logits/chosen": 0.2561766505241394, + "logits/rejected": 0.9669838547706604, + "logps/chosen": -0.9937634468078613, + "logps/rejected": -3.312620162963867, + "loss": 2.3651, + "nll_loss": 0.9937633872032166, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.937634468078613, + "rewards/margins": 23.188566207885742, + "rewards/rejected": -33.126197814941406, + "step": 592 + }, + { + "epoch": 0.8842497670083876, + "grad_norm": 41.395426479146714, + "learning_rate": 3.5732631751417054e-08, + "logits/chosen": 1.6422443389892578, + "logits/rejected": 1.427198052406311, + "logps/chosen": -1.3927075862884521, + "logps/rejected": -2.9072377681732178, + "loss": 2.1218, + "nll_loss": 1.3927075862884521, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.92707633972168, + "rewards/margins": 15.14530086517334, + "rewards/rejected": -29.072378158569336, + "step": 593 + }, + { + "epoch": 0.8857409133271202, + "grad_norm": 51.132575782347416, + "learning_rate": 3.482139520702276e-08, + "logits/chosen": 0.4521043300628662, + "logits/rejected": 0.9026426076889038, + "logps/chosen": -0.948829174041748, + "logps/rejected": -2.1322081089019775, + "loss": 2.4032, + "nll_loss": 0.9488292336463928, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.48829174041748, + "rewards/margins": 11.833791732788086, + "rewards/rejected": -21.32208251953125, + "step": 594 + }, + { + "epoch": 0.8872320596458527, + "grad_norm": 50.644399265019686, + "learning_rate": 3.39215088848061e-08, + "logits/chosen": 0.24500666558742523, + "logits/rejected": 0.5586891174316406, + "logps/chosen": -1.1638610363006592, + "logps/rejected": -2.017951726913452, + "loss": 2.2166, + "nll_loss": 1.1638609170913696, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.638608932495117, + "rewards/margins": 8.54090690612793, + "rewards/rejected": -20.179515838623047, + "step": 595 + }, + { + "epoch": 0.8887232059645853, + "grad_norm": 36.11216268799047, + "learning_rate": 3.303299474173066e-08, + "logits/chosen": 0.936069667339325, + "logits/rejected": 1.1829633712768555, + "logps/chosen": -1.165700912475586, + "logps/rejected": -2.8161368370056152, + "loss": 1.9274, + "nll_loss": 1.1657007932662964, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.657008171081543, + "rewards/margins": 16.50436019897461, + "rewards/rejected": -28.161367416381836, + "step": 596 + }, + { + "epoch": 0.8902143522833178, + "grad_norm": 59.19934715323104, + "learning_rate": 3.2155874457282185e-08, + "logits/chosen": 0.3062177896499634, + "logits/rejected": 0.34192609786987305, + "logps/chosen": -0.7291023135185242, + "logps/rejected": -3.108163356781006, + "loss": 2.2441, + "nll_loss": 0.7291023135185242, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.291023254394531, + "rewards/margins": 23.790611267089844, + "rewards/rejected": -31.081634521484375, + "step": 597 + }, + { + "epoch": 0.8917054986020503, + "grad_norm": 42.851544166935966, + "learning_rate": 3.129016943293955e-08, + "logits/chosen": 0.6586010456085205, + "logits/rejected": 0.4904717206954956, + "logps/chosen": -1.0113714933395386, + "logps/rejected": -1.9497562646865845, + "loss": 1.3841, + "nll_loss": 1.011371374130249, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.113714218139648, + "rewards/margins": 9.383848190307617, + "rewards/rejected": -19.497562408447266, + "step": 598 + }, + { + "epoch": 0.8931966449207829, + "grad_norm": 56.49355049536372, + "learning_rate": 3.043590079165281e-08, + "logits/chosen": 0.10095011442899704, + "logits/rejected": 0.2807294726371765, + "logps/chosen": -1.176227331161499, + "logps/rejected": -2.48679518699646, + "loss": 1.7267, + "nll_loss": 1.176227331161499, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.762274742126465, + "rewards/margins": 13.105676651000977, + "rewards/rejected": -24.867952346801758, + "step": 599 + }, + { + "epoch": 0.8946877912395154, + "grad_norm": 102.1023178737433, + "learning_rate": 2.9593089377327242e-08, + "logits/chosen": 1.2022994756698608, + "logits/rejected": 1.4467451572418213, + "logps/chosen": -0.6037580966949463, + "logps/rejected": -2.4274349212646484, + "loss": 2.5207, + "nll_loss": 0.6037580966949463, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.037580966949463, + "rewards/margins": 18.236770629882812, + "rewards/rejected": -24.274349212646484, + "step": 600 + }, + { + "epoch": 0.896178937558248, + "grad_norm": 70.55892559060052, + "learning_rate": 2.8761755754315663e-08, + "logits/chosen": 1.6827621459960938, + "logits/rejected": 1.490635871887207, + "logps/chosen": -1.2804337739944458, + "logps/rejected": -2.2912468910217285, + "loss": 1.8508, + "nll_loss": 1.2804336547851562, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.804336547851562, + "rewards/margins": 10.108132362365723, + "rewards/rejected": -22.9124698638916, + "step": 601 + }, + { + "epoch": 0.8976700838769804, + "grad_norm": 91.97458303860651, + "learning_rate": 2.7941920206915436e-08, + "logits/chosen": 1.8210875988006592, + "logits/rejected": 2.193286418914795, + "logps/chosen": -1.0207717418670654, + "logps/rejected": -2.7276382446289062, + "loss": 2.1616, + "nll_loss": 1.0207717418670654, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.207717895507812, + "rewards/margins": 17.06866455078125, + "rewards/rejected": -27.276384353637695, + "step": 602 + }, + { + "epoch": 0.8991612301957129, + "grad_norm": 320.06858059071607, + "learning_rate": 2.7133602738874995e-08, + "logits/chosen": 0.7025165557861328, + "logits/rejected": 0.778084397315979, + "logps/chosen": -1.287645697593689, + "logps/rejected": -4.223053932189941, + "loss": 2.0365, + "nll_loss": 1.2876458168029785, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.876458168029785, + "rewards/margins": 29.354087829589844, + "rewards/rejected": -42.23053741455078, + "step": 603 + }, + { + "epoch": 0.9006523765144455, + "grad_norm": 99.66976447018196, + "learning_rate": 2.63368230729043e-08, + "logits/chosen": 1.1805634498596191, + "logits/rejected": 0.9838704466819763, + "logps/chosen": -1.0174989700317383, + "logps/rejected": -2.3494932651519775, + "loss": 1.7424, + "nll_loss": 1.0174988508224487, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.174989700317383, + "rewards/margins": 13.319944381713867, + "rewards/rejected": -23.49493408203125, + "step": 604 + }, + { + "epoch": 0.902143522833178, + "grad_norm": 74.51528491716375, + "learning_rate": 2.5551600650194906e-08, + "logits/chosen": 0.9548903703689575, + "logits/rejected": 1.0899478197097778, + "logps/chosen": -1.306660532951355, + "logps/rejected": -2.837043046951294, + "loss": 2.1283, + "nll_loss": 1.306660532951355, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.066605567932129, + "rewards/margins": 15.303826332092285, + "rewards/rejected": -28.37042999267578, + "step": 605 + }, + { + "epoch": 0.9036346691519105, + "grad_norm": 51.993252521508545, + "learning_rate": 2.4777954629944475e-08, + "logits/chosen": 0.8407946825027466, + "logits/rejected": 0.9599366784095764, + "logps/chosen": -1.164607286453247, + "logps/rejected": -2.8004937171936035, + "loss": 1.8249, + "nll_loss": 1.164607286453247, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.646072387695312, + "rewards/margins": 16.358863830566406, + "rewards/rejected": -28.00493812561035, + "step": 606 + }, + { + "epoch": 0.9051258154706431, + "grad_norm": 62.129272211552866, + "learning_rate": 2.4015903888890242e-08, + "logits/chosen": 1.026720643043518, + "logits/rejected": 1.730891227722168, + "logps/chosen": -1.6653600931167603, + "logps/rejected": -3.1316123008728027, + "loss": 1.8785, + "nll_loss": 1.6653600931167603, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.65359878540039, + "rewards/margins": 14.662520408630371, + "rewards/rejected": -31.316120147705078, + "step": 607 + }, + { + "epoch": 0.9066169617893756, + "grad_norm": 64.74965431651496, + "learning_rate": 2.3265467020847863e-08, + "logits/chosen": 0.856157124042511, + "logits/rejected": 1.233034372329712, + "logps/chosen": -0.943716824054718, + "logps/rejected": -1.8875681161880493, + "loss": 1.3686, + "nll_loss": 0.943716824054718, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.437169075012207, + "rewards/margins": 9.43851375579834, + "rewards/rejected": -18.875682830810547, + "step": 608 + }, + { + "epoch": 0.9081081081081082, + "grad_norm": 78.35000696177752, + "learning_rate": 2.2526662336257828e-08, + "logits/chosen": 1.7188822031021118, + "logits/rejected": 2.068807601928711, + "logps/chosen": -1.2436611652374268, + "logps/rejected": -5.058825492858887, + "loss": 2.7311, + "nll_loss": 1.2436611652374268, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.436612129211426, + "rewards/margins": 38.15163803100586, + "rewards/rejected": -50.58824920654297, + "step": 609 + }, + { + "epoch": 0.9095992544268406, + "grad_norm": 51.12329553486554, + "learning_rate": 2.1799507861738788e-08, + "logits/chosen": 1.5251386165618896, + "logits/rejected": 0.9365883469581604, + "logps/chosen": -0.9077203273773193, + "logps/rejected": -2.4619970321655273, + "loss": 1.861, + "nll_loss": 0.9077203273773193, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.077203750610352, + "rewards/margins": 15.542766571044922, + "rewards/rejected": -24.61996841430664, + "step": 610 + }, + { + "epoch": 0.9110904007455731, + "grad_norm": 61.222416516366145, + "learning_rate": 2.1084021339647707e-08, + "logits/chosen": 0.5966976881027222, + "logits/rejected": 0.9337188601493835, + "logps/chosen": -1.2190940380096436, + "logps/rejected": -3.3994054794311523, + "loss": 2.4781, + "nll_loss": 1.219094157218933, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.190940856933594, + "rewards/margins": 21.803115844726562, + "rewards/rejected": -33.994056701660156, + "step": 611 + }, + { + "epoch": 0.9125815470643057, + "grad_norm": 49.41778458085236, + "learning_rate": 2.038022022764685e-08, + "logits/chosen": 1.6303707361221313, + "logits/rejected": 1.7657480239868164, + "logps/chosen": -1.535523533821106, + "logps/rejected": -3.0815024375915527, + "loss": 2.1474, + "nll_loss": 1.5355234146118164, + "rewards/accuracies": 0.875, + "rewards/chosen": -15.355234146118164, + "rewards/margins": 15.459792137145996, + "rewards/rejected": -30.815027236938477, + "step": 612 + }, + { + "epoch": 0.9140726933830382, + "grad_norm": 61.05392998371482, + "learning_rate": 1.9688121698277993e-08, + "logits/chosen": 1.0834482908248901, + "logits/rejected": 1.8084291219711304, + "logps/chosen": -1.3302435874938965, + "logps/rejected": -3.0064194202423096, + "loss": 1.4181, + "nll_loss": 1.330243706703186, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.302435874938965, + "rewards/margins": 16.761760711669922, + "rewards/rejected": -30.06419563293457, + "step": 613 + }, + { + "epoch": 0.9155638397017707, + "grad_norm": 34.21776098764438, + "learning_rate": 1.90077426385431e-08, + "logits/chosen": 0.7245126962661743, + "logits/rejected": 0.22384954988956451, + "logps/chosen": -1.3143211603164673, + "logps/rejected": -1.861713171005249, + "loss": 1.8045, + "nll_loss": 1.3143210411071777, + "rewards/accuracies": 0.75, + "rewards/chosen": -13.143211364746094, + "rewards/margins": 5.4739227294921875, + "rewards/rejected": -18.61713218688965, + "step": 614 + }, + { + "epoch": 0.9170549860205033, + "grad_norm": 42.5324206312152, + "learning_rate": 1.8339099649492762e-08, + "logits/chosen": 0.3494272232055664, + "logits/rejected": 0.9441218376159668, + "logps/chosen": -1.5409001111984253, + "logps/rejected": -3.023975372314453, + "loss": 2.1119, + "nll_loss": 1.5409001111984253, + "rewards/accuracies": 0.75, + "rewards/chosen": -15.409000396728516, + "rewards/margins": 14.83074951171875, + "rewards/rejected": -30.239749908447266, + "step": 615 + }, + { + "epoch": 0.9185461323392358, + "grad_norm": 43.06626427761403, + "learning_rate": 1.7682209045820684e-08, + "logits/chosen": 0.9627107977867126, + "logits/rejected": 1.0145137310028076, + "logps/chosen": -0.9481117725372314, + "logps/rejected": -2.1838059425354004, + "loss": 2.1317, + "nll_loss": 0.9481117725372314, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.481118202209473, + "rewards/margins": 12.356941223144531, + "rewards/rejected": -21.838058471679688, + "step": 616 + }, + { + "epoch": 0.9200372786579684, + "grad_norm": 178.43839248334547, + "learning_rate": 1.7037086855465898e-08, + "logits/chosen": 0.6895330548286438, + "logits/rejected": 0.6384507417678833, + "logps/chosen": -1.1706783771514893, + "logps/rejected": -1.7648053169250488, + "loss": 2.2996, + "nll_loss": 1.1706783771514893, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.706781387329102, + "rewards/margins": 5.94127082824707, + "rewards/rejected": -17.648052215576172, + "step": 617 + }, + { + "epoch": 0.9215284249767008, + "grad_norm": 66.09270295798773, + "learning_rate": 1.6403748819221462e-08, + "logits/chosen": 0.9936915040016174, + "logits/rejected": 1.0178120136260986, + "logps/chosen": -1.1915603876113892, + "logps/rejected": -1.6295900344848633, + "loss": 2.3494, + "nll_loss": 1.1915605068206787, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.915602684020996, + "rewards/margins": 4.38029670715332, + "rewards/rejected": -16.295902252197266, + "step": 618 + }, + { + "epoch": 0.9230195712954333, + "grad_norm": 72.23670774806365, + "learning_rate": 1.5782210390350713e-08, + "logits/chosen": 0.9415168762207031, + "logits/rejected": 0.9538595080375671, + "logps/chosen": -1.3151698112487793, + "logps/rejected": -2.0880651473999023, + "loss": 1.289, + "nll_loss": 1.3151699304580688, + "rewards/accuracies": 0.75, + "rewards/chosen": -13.15169906616211, + "rewards/margins": 7.728951930999756, + "rewards/rejected": -20.880651473999023, + "step": 619 + }, + { + "epoch": 0.9245107176141659, + "grad_norm": 517.5186319293753, + "learning_rate": 1.5172486734209788e-08, + "logits/chosen": 1.1598143577575684, + "logits/rejected": 1.4311909675598145, + "logps/chosen": -1.2884063720703125, + "logps/rejected": -2.6638433933258057, + "loss": 3.2189, + "nll_loss": 1.2884063720703125, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.884064674377441, + "rewards/margins": 13.75437068939209, + "rewards/rejected": -26.63843536376953, + "step": 620 + }, + { + "epoch": 0.9260018639328984, + "grad_norm": 48.93138506507298, + "learning_rate": 1.4574592727878088e-08, + "logits/chosen": 1.112391471862793, + "logits/rejected": 1.7636058330535889, + "logps/chosen": -1.2476575374603271, + "logps/rejected": -2.744623899459839, + "loss": 1.6136, + "nll_loss": 1.2476575374603271, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.476574897766113, + "rewards/margins": 14.9696626663208, + "rewards/rejected": -27.446237564086914, + "step": 621 + }, + { + "epoch": 0.9274930102516309, + "grad_norm": 69.13203966426812, + "learning_rate": 1.3988542959794625e-08, + "logits/chosen": 1.5924474000930786, + "logits/rejected": 1.2295907735824585, + "logps/chosen": -1.1291077136993408, + "logps/rejected": -1.6911044120788574, + "loss": 2.1184, + "nll_loss": 1.1291077136993408, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.291078567504883, + "rewards/margins": 5.619965076446533, + "rewards/rejected": -16.911041259765625, + "step": 622 + }, + { + "epoch": 0.9289841565703635, + "grad_norm": 53.60782937532031, + "learning_rate": 1.3414351729402863e-08, + "logits/chosen": 0.5311175584793091, + "logits/rejected": 0.696600079536438, + "logps/chosen": -1.6392128467559814, + "logps/rejected": -2.6737866401672363, + "loss": 1.3688, + "nll_loss": 1.6392128467559814, + "rewards/accuracies": 0.875, + "rewards/chosen": -16.39212989807129, + "rewards/margins": 10.34573745727539, + "rewards/rejected": -26.737865447998047, + "step": 623 + }, + { + "epoch": 0.930475302889096, + "grad_norm": 66.6061457553233, + "learning_rate": 1.2852033046801104e-08, + "logits/chosen": 1.6591801643371582, + "logits/rejected": 1.7488484382629395, + "logps/chosen": -1.3164441585540771, + "logps/rejected": -3.4187371730804443, + "loss": 2.2486, + "nll_loss": 1.3164441585540771, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.164440155029297, + "rewards/margins": 21.022930145263672, + "rewards/rejected": -34.18737030029297, + "step": 624 + }, + { + "epoch": 0.9319664492078286, + "grad_norm": 76.24305169308032, + "learning_rate": 1.230160063240121e-08, + "logits/chosen": 0.9626173973083496, + "logits/rejected": 0.7872528433799744, + "logps/chosen": -1.2172050476074219, + "logps/rejected": -2.30886173248291, + "loss": 1.8232, + "nll_loss": 1.2172050476074219, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.172050476074219, + "rewards/margins": 10.916565895080566, + "rewards/rejected": -23.0886173248291, + "step": 625 + }, + { + "epoch": 0.933457595526561, + "grad_norm": 65.28519911178785, + "learning_rate": 1.176306791659326e-08, + "logits/chosen": 0.561115026473999, + "logits/rejected": 0.9786922931671143, + "logps/chosen": -1.3929322957992554, + "logps/rejected": -3.053676128387451, + "loss": 2.4585, + "nll_loss": 1.3929322957992554, + "rewards/accuracies": 0.75, + "rewards/chosen": -13.9293212890625, + "rewards/margins": 16.607440948486328, + "rewards/rejected": -30.536762237548828, + "step": 626 + }, + { + "epoch": 0.9349487418452935, + "grad_norm": 71.50712565768815, + "learning_rate": 1.1236448039418423e-08, + "logits/chosen": 0.45714130997657776, + "logits/rejected": 0.8796458840370178, + "logps/chosen": -1.5751104354858398, + "logps/rejected": -4.103085994720459, + "loss": 1.451, + "nll_loss": 1.5751101970672607, + "rewards/accuracies": 0.875, + "rewards/chosen": -15.751103401184082, + "rewards/margins": 25.279754638671875, + "rewards/rejected": -41.030860900878906, + "step": 627 + }, + { + "epoch": 0.9364398881640261, + "grad_norm": 61.241804371004626, + "learning_rate": 1.0721753850247984e-08, + "logits/chosen": 0.9310768842697144, + "logits/rejected": 0.5946216583251953, + "logps/chosen": -1.1604797840118408, + "logps/rejected": -2.7220888137817383, + "loss": 2.3107, + "nll_loss": 1.1604797840118408, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.604798316955566, + "rewards/margins": 15.6160888671875, + "rewards/rejected": -27.22088623046875, + "step": 628 + }, + { + "epoch": 0.9379310344827586, + "grad_norm": 121.50896184894009, + "learning_rate": 1.021899790746994e-08, + "logits/chosen": 0.08384272456169128, + "logits/rejected": 0.2588069438934326, + "logps/chosen": -0.8513846397399902, + "logps/rejected": -2.030687093734741, + "loss": 2.3264, + "nll_loss": 0.8513847589492798, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.513846397399902, + "rewards/margins": 11.793025016784668, + "rewards/rejected": -20.306873321533203, + "step": 629 + }, + { + "epoch": 0.9394221808014911, + "grad_norm": 35.10139854500652, + "learning_rate": 9.728192478182573e-09, + "logits/chosen": 1.4342637062072754, + "logits/rejected": 1.5611820220947266, + "logps/chosen": -1.205596685409546, + "logps/rejected": -2.7099761962890625, + "loss": 1.568, + "nll_loss": 1.205596685409546, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.055967330932617, + "rewards/margins": 15.043792724609375, + "rewards/rejected": -27.099760055541992, + "step": 630 + }, + { + "epoch": 0.9409133271202237, + "grad_norm": 29.967230346920555, + "learning_rate": 9.249349537894968e-09, + "logits/chosen": 0.3210725784301758, + "logits/rejected": 0.6524243354797363, + "logps/chosen": -0.7849727272987366, + "logps/rejected": -2.915769338607788, + "loss": 1.5934, + "nll_loss": 0.784972608089447, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.849726676940918, + "rewards/margins": 21.307964324951172, + "rewards/rejected": -29.15769386291504, + "step": 631 + }, + { + "epoch": 0.9424044734389562, + "grad_norm": 141.74255777816967, + "learning_rate": 8.782480770235246e-09, + "logits/chosen": 0.3069119453430176, + "logits/rejected": 0.485944926738739, + "logps/chosen": -1.1904934644699097, + "logps/rejected": -3.739229440689087, + "loss": 2.0276, + "nll_loss": 1.1904934644699097, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.904936790466309, + "rewards/margins": 25.487361907958984, + "rewards/rejected": -37.392295837402344, + "step": 632 + }, + { + "epoch": 0.9438956197576888, + "grad_norm": 60.527308659160475, + "learning_rate": 8.327597566665013e-09, + "logits/chosen": 1.219635009765625, + "logits/rejected": 1.583566427230835, + "logps/chosen": -1.4690210819244385, + "logps/rejected": -2.254682779312134, + "loss": 2.6834, + "nll_loss": 1.4690210819244385, + "rewards/accuracies": 0.75, + "rewards/chosen": -14.690210342407227, + "rewards/margins": 7.856616973876953, + "rewards/rejected": -22.54682731628418, + "step": 633 + }, + { + "epoch": 0.9453867660764212, + "grad_norm": 56.10853338090398, + "learning_rate": 7.884711026201584e-09, + "logits/chosen": 1.9719793796539307, + "logits/rejected": 2.1581757068634033, + "logps/chosen": -1.436596393585205, + "logps/rejected": -2.7122642993927, + "loss": 1.6313, + "nll_loss": 1.436596393585205, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.36596393585205, + "rewards/margins": 12.75667667388916, + "rewards/rejected": -27.122638702392578, + "step": 634 + }, + { + "epoch": 0.9468779123951537, + "grad_norm": 37.272532661154244, + "learning_rate": 7.453831955147428e-09, + "logits/chosen": 0.12578025460243225, + "logits/rejected": -0.2529383897781372, + "logps/chosen": -1.0211150646209717, + "logps/rejected": -2.2397446632385254, + "loss": 1.4805, + "nll_loss": 1.0211150646209717, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.211149215698242, + "rewards/margins": 12.186296463012695, + "rewards/rejected": -22.397445678710938, + "step": 635 + }, + { + "epoch": 0.9483690587138863, + "grad_norm": 56.88066251632034, + "learning_rate": 7.034970866825973e-09, + "logits/chosen": 0.7423490285873413, + "logits/rejected": 0.7912289500236511, + "logps/chosen": -1.2308918237686157, + "logps/rejected": -1.750805139541626, + "loss": 1.4598, + "nll_loss": 1.2308918237686157, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.308917999267578, + "rewards/margins": 5.19913387298584, + "rewards/rejected": -17.508052825927734, + "step": 636 + }, + { + "epoch": 0.9498602050326188, + "grad_norm": 113.66211780898051, + "learning_rate": 6.62813798132561e-09, + "logits/chosen": 1.0367634296417236, + "logits/rejected": 0.7382550835609436, + "logps/chosen": -1.2447309494018555, + "logps/rejected": -1.7634875774383545, + "loss": 3.0594, + "nll_loss": 1.2447309494018555, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.447310447692871, + "rewards/margins": 5.187565326690674, + "rewards/rejected": -17.634876251220703, + "step": 637 + }, + { + "epoch": 0.9513513513513514, + "grad_norm": 63.164427816842846, + "learning_rate": 6.233343225249932e-09, + "logits/chosen": 1.2581459283828735, + "logits/rejected": 1.399833083152771, + "logps/chosen": -0.9300872683525085, + "logps/rejected": -1.7020875215530396, + "loss": 2.3141, + "nll_loss": 0.9300872683525085, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.300872802734375, + "rewards/margins": 7.7200026512146, + "rewards/rejected": -17.0208740234375, + "step": 638 + }, + { + "epoch": 0.9528424976700839, + "grad_norm": 116.17240382010048, + "learning_rate": 5.850596231475768e-09, + "logits/chosen": 1.2323070764541626, + "logits/rejected": 1.4736135005950928, + "logps/chosen": -1.9990520477294922, + "logps/rejected": -2.0188441276550293, + "loss": 2.3635, + "nll_loss": 1.9990520477294922, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.990522384643555, + "rewards/margins": 0.19792091846466064, + "rewards/rejected": -20.18844223022461, + "step": 639 + }, + { + "epoch": 0.9543336439888164, + "grad_norm": 148.9927846719541, + "learning_rate": 5.4799063389179834e-09, + "logits/chosen": 0.46052801609039307, + "logits/rejected": 0.3371131420135498, + "logps/chosen": -1.3033381700515747, + "logps/rejected": -2.339939832687378, + "loss": 2.2474, + "nll_loss": 1.3033380508422852, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.033382415771484, + "rewards/margins": 10.36601448059082, + "rewards/rejected": -23.399396896362305, + "step": 640 + }, + { + "epoch": 0.955824790307549, + "grad_norm": 48.0116134291069, + "learning_rate": 5.1212825923019345e-09, + "logits/chosen": 0.8156238794326782, + "logits/rejected": 1.123835563659668, + "logps/chosen": -1.2078592777252197, + "logps/rejected": -4.5481648445129395, + "loss": 1.7785, + "nll_loss": 1.2078593969345093, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.078592300415039, + "rewards/margins": 33.40306091308594, + "rewards/rejected": -45.481651306152344, + "step": 641 + }, + { + "epoch": 0.9573159366262814, + "grad_norm": 40.00712354284295, + "learning_rate": 4.7747337419422054e-09, + "logits/chosen": 0.6781156063079834, + "logits/rejected": 1.3144218921661377, + "logps/chosen": -1.5253450870513916, + "logps/rejected": -3.2332403659820557, + "loss": 2.1828, + "nll_loss": 1.5253452062606812, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.253450393676758, + "rewards/margins": 17.078954696655273, + "rewards/rejected": -32.33240509033203, + "step": 642 + }, + { + "epoch": 0.9588070829450139, + "grad_norm": 80.79198208372269, + "learning_rate": 4.440268243529666e-09, + "logits/chosen": 0.9073221683502197, + "logits/rejected": 0.5689750909805298, + "logps/chosen": -0.7339849472045898, + "logps/rejected": -1.8821582794189453, + "loss": 1.6134, + "nll_loss": 0.7339848279953003, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.33984899520874, + "rewards/margins": 11.481732368469238, + "rewards/rejected": -18.82158088684082, + "step": 643 + }, + { + "epoch": 0.9602982292637465, + "grad_norm": 90.62829032832231, + "learning_rate": 4.117894257924803e-09, + "logits/chosen": 0.46462368965148926, + "logits/rejected": 0.5671396255493164, + "logps/chosen": -1.1450039148330688, + "logps/rejected": -2.218526601791382, + "loss": 2.9791, + "nll_loss": 1.1450039148330688, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.450039863586426, + "rewards/margins": 10.735227584838867, + "rewards/rejected": -22.185266494750977, + "step": 644 + }, + { + "epoch": 0.961789375582479, + "grad_norm": 53.71551572686904, + "learning_rate": 3.807619650958827e-09, + "logits/chosen": 0.6184532642364502, + "logits/rejected": 0.8686254024505615, + "logps/chosen": -1.2457393407821655, + "logps/rejected": -3.1474404335021973, + "loss": 1.738, + "nll_loss": 1.2457393407821655, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.457393646240234, + "rewards/margins": 19.01700782775879, + "rewards/rejected": -31.474403381347656, + "step": 645 + }, + { + "epoch": 0.9632805219012116, + "grad_norm": 53.90284021390508, + "learning_rate": 3.509451993241541e-09, + "logits/chosen": 0.43453526496887207, + "logits/rejected": 0.8017415404319763, + "logps/chosen": -1.391627311706543, + "logps/rejected": -2.1751885414123535, + "loss": 1.8167, + "nll_loss": 1.3916271924972534, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.916272163391113, + "rewards/margins": 7.835611343383789, + "rewards/rejected": -21.75188446044922, + "step": 646 + }, + { + "epoch": 0.9647716682199441, + "grad_norm": 52.50485960299144, + "learning_rate": 3.22339855997672e-09, + "logits/chosen": 0.2412007749080658, + "logits/rejected": 0.03859227895736694, + "logps/chosen": -1.3658912181854248, + "logps/rejected": -2.9418210983276367, + "loss": 1.9701, + "nll_loss": 1.3658912181854248, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.65891170501709, + "rewards/margins": 15.759300231933594, + "rewards/rejected": -29.418210983276367, + "step": 647 + }, + { + "epoch": 0.9662628145386766, + "grad_norm": 34.81335059745858, + "learning_rate": 2.9494663307847443e-09, + "logits/chosen": 1.0914424657821655, + "logits/rejected": 1.1791269779205322, + "logps/chosen": -1.3588266372680664, + "logps/rejected": -2.738715887069702, + "loss": 1.9192, + "nll_loss": 1.3588268756866455, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.588268280029297, + "rewards/margins": 13.798890113830566, + "rewards/rejected": -27.38715934753418, + "step": 648 + }, + { + "epoch": 0.9677539608574092, + "grad_norm": 69.14758369873154, + "learning_rate": 2.687661989531964e-09, + "logits/chosen": 0.7465240359306335, + "logits/rejected": 1.6419605016708374, + "logps/chosen": -1.1785346269607544, + "logps/rejected": -2.6736412048339844, + "loss": 2.6972, + "nll_loss": 1.1785345077514648, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.785346984863281, + "rewards/margins": 14.951066970825195, + "rewards/rejected": -26.736412048339844, + "step": 649 + }, + { + "epoch": 0.9692451071761417, + "grad_norm": 55.38444741315943, + "learning_rate": 2.437991924167937e-09, + "logits/chosen": 1.2680692672729492, + "logits/rejected": 1.549578070640564, + "logps/chosen": -0.9802628755569458, + "logps/rejected": -2.6409151554107666, + "loss": 1.8713, + "nll_loss": 0.9802627563476562, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.802628517150879, + "rewards/margins": 16.606521606445312, + "rewards/rejected": -26.40915298461914, + "step": 650 + }, + { + "epoch": 0.9707362534948741, + "grad_norm": 43.159724090253455, + "learning_rate": 2.2004622265693882e-09, + "logits/chosen": 1.2227438688278198, + "logits/rejected": 1.2243788242340088, + "logps/chosen": -1.2036027908325195, + "logps/rejected": -2.386033296585083, + "loss": 1.9318, + "nll_loss": 1.2036027908325195, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.036026000976562, + "rewards/margins": 11.82430648803711, + "rewards/rejected": -23.860336303710938, + "step": 651 + }, + { + "epoch": 0.9722273998136067, + "grad_norm": 56.44120590824913, + "learning_rate": 1.975078692391552e-09, + "logits/chosen": 0.7070285081863403, + "logits/rejected": 0.6214600801467896, + "logps/chosen": -1.1754833459854126, + "logps/rejected": -4.710268497467041, + "loss": 1.9201, + "nll_loss": 1.175483226776123, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.75483226776123, + "rewards/margins": 35.34785461425781, + "rewards/rejected": -47.102684020996094, + "step": 652 + }, + { + "epoch": 0.9737185461323392, + "grad_norm": 51.00916882401214, + "learning_rate": 1.7618468209268933e-09, + "logits/chosen": 1.83562433719635, + "logits/rejected": 1.3902753591537476, + "logps/chosen": -1.349332332611084, + "logps/rejected": -5.075465202331543, + "loss": 2.4647, + "nll_loss": 1.349332332611084, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.493322372436523, + "rewards/margins": 37.261329650878906, + "rewards/rejected": -50.75465774536133, + "step": 653 + }, + { + "epoch": 0.9752096924510718, + "grad_norm": 173.61554484377376, + "learning_rate": 1.5607718149708848e-09, + "logits/chosen": 1.0174025297164917, + "logits/rejected": 1.0746914148330688, + "logps/chosen": -1.3610864877700806, + "logps/rejected": -3.2821123600006104, + "loss": 3.1357, + "nll_loss": 1.361086368560791, + "rewards/accuracies": 0.75, + "rewards/chosen": -13.610864639282227, + "rewards/margins": 19.21026039123535, + "rewards/rejected": -32.821128845214844, + "step": 654 + }, + { + "epoch": 0.9767008387698043, + "grad_norm": 73.2726059205274, + "learning_rate": 1.37185858069494e-09, + "logits/chosen": 1.0790843963623047, + "logits/rejected": 0.8912894129753113, + "logps/chosen": -1.2312716245651245, + "logps/rejected": -2.1072559356689453, + "loss": 1.3087, + "nll_loss": 1.2312716245651245, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.31271743774414, + "rewards/margins": 8.759840965270996, + "rewards/rejected": -21.07255744934082, + "step": 655 + }, + { + "epoch": 0.9781919850885368, + "grad_norm": 37.979537721749395, + "learning_rate": 1.195111727526843e-09, + "logits/chosen": 0.765521764755249, + "logits/rejected": 1.9364503622055054, + "logps/chosen": -0.838382363319397, + "logps/rejected": -4.206418514251709, + "loss": 1.2937, + "nll_loss": 0.838382363319397, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.38382339477539, + "rewards/margins": 33.68035888671875, + "rewards/rejected": -42.064186096191406, + "step": 656 + }, + { + "epoch": 0.9796831314072694, + "grad_norm": 60.05396788153073, + "learning_rate": 1.0305355680382266e-09, + "logits/chosen": 1.1509252786636353, + "logits/rejected": 1.104540467262268, + "logps/chosen": -1.3822535276412964, + "logps/rejected": -3.314253807067871, + "loss": 1.632, + "nll_loss": 1.3822535276412964, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.822535514831543, + "rewards/margins": 19.32000160217285, + "rewards/rejected": -33.14253616333008, + "step": 657 + }, + { + "epoch": 0.9811742777260019, + "grad_norm": 60.548920303826634, + "learning_rate": 8.781341178393242e-10, + "logits/chosen": 1.27897047996521, + "logits/rejected": 1.580756664276123, + "logps/chosen": -1.4538989067077637, + "logps/rejected": -3.935889720916748, + "loss": 2.3341, + "nll_loss": 1.4538989067077637, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.538990020751953, + "rewards/margins": 24.81990623474121, + "rewards/rejected": -39.35889434814453, + "step": 658 + }, + { + "epoch": 0.9826654240447343, + "grad_norm": 45.86956571349362, + "learning_rate": 7.379110954810475e-10, + "logits/chosen": 0.560336172580719, + "logits/rejected": 0.9452404379844666, + "logps/chosen": -1.4137957096099854, + "logps/rejected": -3.3868355751037598, + "loss": 1.798, + "nll_loss": 1.4137957096099854, + "rewards/accuracies": 0.75, + "rewards/chosen": -14.137956619262695, + "rewards/margins": 19.730396270751953, + "rewards/rejected": -33.86835479736328, + "step": 659 + }, + { + "epoch": 0.9841565703634669, + "grad_norm": 57.46987230573217, + "learning_rate": 6.098699223641701e-10, + "logits/chosen": 0.6086639761924744, + "logits/rejected": 0.858557939529419, + "logps/chosen": -1.8596971035003662, + "logps/rejected": -3.0429792404174805, + "loss": 2.201, + "nll_loss": 1.8596974611282349, + "rewards/accuracies": 0.75, + "rewards/chosen": -18.596973419189453, + "rewards/margins": 11.832818031311035, + "rewards/rejected": -30.429792404174805, + "step": 660 + }, + { + "epoch": 0.9856477166821994, + "grad_norm": 62.5753873150776, + "learning_rate": 4.940137226560615e-10, + "logits/chosen": 0.6363529562950134, + "logits/rejected": 1.5825424194335938, + "logps/chosen": -1.7450621128082275, + "logps/rejected": -2.7526190280914307, + "loss": 2.2655, + "nll_loss": 1.7450621128082275, + "rewards/accuracies": 0.75, + "rewards/chosen": -17.450620651245117, + "rewards/margins": 10.075567245483398, + "rewards/rejected": -27.52618980407715, + "step": 661 + }, + { + "epoch": 0.987138863000932, + "grad_norm": 47.832905093901545, + "learning_rate": 3.903453232140808e-10, + "logits/chosen": 1.244388222694397, + "logits/rejected": 2.0133092403411865, + "logps/chosen": -1.0886551141738892, + "logps/rejected": -2.5148251056671143, + "loss": 1.9311, + "nll_loss": 1.0886549949645996, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.886550903320312, + "rewards/margins": 14.261701583862305, + "rewards/rejected": -25.148252487182617, + "step": 662 + }, + { + "epoch": 0.9886300093196645, + "grad_norm": 41.41579902475401, + "learning_rate": 2.988672535169656e-10, + "logits/chosen": 1.6264417171478271, + "logits/rejected": 1.6430389881134033, + "logps/chosen": -1.0195305347442627, + "logps/rejected": -2.44887638092041, + "loss": 1.1391, + "nll_loss": 1.0195305347442627, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.195306777954102, + "rewards/margins": 14.293455123901367, + "rewards/rejected": -24.48876190185547, + "step": 663 + }, + { + "epoch": 0.990121155638397, + "grad_norm": 61.20192914397783, + "learning_rate": 2.1958174560282594e-10, + "logits/chosen": 0.9436533451080322, + "logits/rejected": 1.2115904092788696, + "logps/chosen": -1.481966495513916, + "logps/rejected": -3.0835845470428467, + "loss": 1.5986, + "nll_loss": 1.481966495513916, + "rewards/accuracies": 0.75, + "rewards/chosen": -14.819665908813477, + "rewards/margins": 16.01618003845215, + "rewards/rejected": -30.835845947265625, + "step": 664 + }, + { + "epoch": 0.9916123019571296, + "grad_norm": 79.46932255527533, + "learning_rate": 1.5249073401502055e-10, + "logits/chosen": 1.7353107929229736, + "logits/rejected": 2.1959099769592285, + "logps/chosen": -1.3309359550476074, + "logps/rejected": -3.29795503616333, + "loss": 1.6526, + "nll_loss": 1.330936074256897, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.309359550476074, + "rewards/margins": 19.670188903808594, + "rewards/rejected": -32.979549407958984, + "step": 665 + }, + { + "epoch": 0.993103448275862, + "grad_norm": 41.19926284687701, + "learning_rate": 9.759585575458417e-11, + "logits/chosen": 0.7111138701438904, + "logits/rejected": 0.504228949546814, + "logps/chosen": -1.7196786403656006, + "logps/rejected": -2.507728099822998, + "loss": 1.775, + "nll_loss": 1.7196786403656006, + "rewards/accuracies": 0.625, + "rewards/chosen": -17.19678497314453, + "rewards/margins": 7.880496025085449, + "rewards/rejected": -25.077281951904297, + "step": 666 + }, + { + "epoch": 0.9945945945945946, + "grad_norm": 165.31221246030398, + "learning_rate": 5.4898450240536964e-11, + "logits/chosen": 1.0977013111114502, + "logits/rejected": 1.3191360235214233, + "logps/chosen": -1.375827431678772, + "logps/rejected": -2.653599262237549, + "loss": 2.869, + "nll_loss": 1.3758275508880615, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.75827407836914, + "rewards/margins": 12.777718544006348, + "rewards/rejected": -26.535995483398438, + "step": 667 + }, + { + "epoch": 0.9960857409133271, + "grad_norm": 50.15883035033017, + "learning_rate": 2.4399559277132885e-11, + "logits/chosen": 1.5955461263656616, + "logits/rejected": 1.0452600717544556, + "logps/chosen": -1.4784938097000122, + "logps/rejected": -2.4211621284484863, + "loss": 1.0848, + "nll_loss": 1.4784936904907227, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.784937858581543, + "rewards/margins": 9.426685333251953, + "rewards/rejected": -24.21162223815918, + "step": 668 + }, + { + "epoch": 0.9975768872320596, + "grad_norm": 39.829485947617805, + "learning_rate": 6.099927028380136e-12, + "logits/chosen": 1.5119438171386719, + "logits/rejected": 1.5995426177978516, + "logps/chosen": -1.1745887994766235, + "logps/rejected": -2.149003267288208, + "loss": 1.478, + "nll_loss": 1.174588918685913, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.745888710021973, + "rewards/margins": 9.744142532348633, + "rewards/rejected": -21.490032196044922, + "step": 669 + }, + { + "epoch": 0.9990680335507922, + "grad_norm": 35.06242587573981, + "learning_rate": 0.0, + "logits/chosen": 1.1288516521453857, + "logits/rejected": 1.465525507926941, + "logps/chosen": -1.4203619956970215, + "logps/rejected": -5.043858528137207, + "loss": 2.0247, + "nll_loss": 1.420362114906311, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.203620910644531, + "rewards/margins": 36.234962463378906, + "rewards/rejected": -50.4385871887207, + "step": 670 + }, + { + "epoch": 0.9990680335507922, + "step": 670, + "total_flos": 0.0, + "train_loss": 2.3420828120032353, + "train_runtime": 14192.8209, + "train_samples_per_second": 3.024, + "train_steps_per_second": 0.047 + } + ], + "logging_steps": 1, + "max_steps": 670, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}