{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9959925193694897, "eval_steps": 400, "global_step": 233, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02137323002938819, "grad_norm": 0.7561385035514832, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -1.7804944515228271, "logits/rejected": -1.6545133590698242, "logps/chosen": -0.7810468673706055, "logps/ref_chosen": -0.7813535928726196, "logps/ref_rejected": -0.8060104250907898, "logps/rejected": -0.805632472038269, "loss": 0.5, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": 0.0007667625322937965, "rewards/margins": -0.00017807073891162872, "rewards/rejected": 0.0009448332712054253, "step": 5 }, { "epoch": 0.04274646005877638, "grad_norm": 0.5518713593482971, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -1.7383073568344116, "logits/rejected": -1.7030198574066162, "logps/chosen": -0.8675562143325806, "logps/ref_chosen": -0.8662088513374329, "logps/ref_rejected": -0.9053529500961304, "logps/rejected": -0.907278835773468, "loss": 0.5, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.003368514124304056, "rewards/margins": 0.0014460685197263956, "rewards/rejected": -0.004814582876861095, "step": 10 }, { "epoch": 0.06411969008816458, "grad_norm": 0.49651747941970825, "learning_rate": 6.249999999999999e-07, "logits/chosen": -1.928145170211792, "logits/rejected": -1.8129940032958984, "logps/chosen": -0.855307400226593, "logps/ref_chosen": -0.8494969606399536, "logps/ref_rejected": -0.8662179708480835, "logps/rejected": -0.8723622560501099, "loss": 0.4998, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.014526228420436382, "rewards/margins": 0.0008344631642103195, "rewards/rejected": -0.015360690653324127, "step": 15 }, { "epoch": 0.08549292011755276, "grad_norm": 0.27864935994148254, "learning_rate": 8.333333333333333e-07, "logits/chosen": -1.7597744464874268, "logits/rejected": -1.6725934743881226, "logps/chosen": -0.9120359420776367, "logps/ref_chosen": -0.8935796618461609, "logps/ref_rejected": -0.8952409029006958, "logps/rejected": -0.9148454666137695, "loss": 0.4993, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0461406409740448, "rewards/margins": 0.0028707808814942837, "rewards/rejected": -0.04901142045855522, "step": 20 }, { "epoch": 0.10686615014694095, "grad_norm": 0.4763440787792206, "learning_rate": 9.999435142363483e-07, "logits/chosen": -1.624091386795044, "logits/rejected": -1.5722483396530151, "logps/chosen": -0.9541429281234741, "logps/ref_chosen": -0.8983734846115112, "logps/ref_rejected": -0.9594888687133789, "logps/rejected": -1.035103440284729, "loss": 0.4976, "rewards/accuracies": 0.53125, "rewards/chosen": -0.1394234150648117, "rewards/margins": 0.04961305111646652, "rewards/rejected": -0.18903647363185883, "step": 25 }, { "epoch": 0.12823938017632916, "grad_norm": 0.3390841782093048, "learning_rate": 9.97967852255038e-07, "logits/chosen": -1.6577975749969482, "logits/rejected": -1.5775320529937744, "logps/chosen": -0.8317171931266785, "logps/ref_chosen": -0.7469085454940796, "logps/ref_rejected": -0.79144287109375, "logps/rejected": -0.8856765031814575, "loss": 0.4975, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.2120215892791748, "rewards/margins": 0.023562394082546234, "rewards/rejected": -0.23558397591114044, "step": 30 }, { "epoch": 0.14961261020571734, "grad_norm": 0.3006940484046936, "learning_rate": 9.931806517013612e-07, "logits/chosen": -1.6243362426757812, "logits/rejected": -1.6471837759017944, "logps/chosen": -0.9220904111862183, "logps/ref_chosen": -0.7822158336639404, "logps/ref_rejected": -0.8102364540100098, "logps/rejected": -0.9670912027359009, "loss": 0.4963, "rewards/accuracies": 0.5, "rewards/chosen": -0.34968677163124084, "rewards/margins": 0.04244992882013321, "rewards/rejected": -0.39213672280311584, "step": 35 }, { "epoch": 0.17098584023510552, "grad_norm": 0.27084407210350037, "learning_rate": 9.856089412257604e-07, "logits/chosen": -1.6578766107559204, "logits/rejected": -1.6355148553848267, "logps/chosen": -1.05038583278656, "logps/ref_chosen": -0.8560595512390137, "logps/ref_rejected": -0.914546012878418, "logps/rejected": -1.1552728414535522, "loss": 0.4937, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.48581594228744507, "rewards/margins": 0.11600111424922943, "rewards/rejected": -0.6018170118331909, "step": 40 }, { "epoch": 0.19235907026449373, "grad_norm": 0.49249762296676636, "learning_rate": 9.752954708892377e-07, "logits/chosen": -1.5577068328857422, "logits/rejected": -1.465714931488037, "logps/chosen": -1.0501822233200073, "logps/ref_chosen": -0.8724653124809265, "logps/ref_rejected": -0.8607926368713379, "logps/rejected": -1.0398765802383423, "loss": 0.4975, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.4442923665046692, "rewards/margins": 0.003417615545913577, "rewards/rejected": -0.4477098882198334, "step": 45 }, { "epoch": 0.2137323002938819, "grad_norm": 0.32699063420295715, "learning_rate": 9.62298470795473e-07, "logits/chosen": -1.7691097259521484, "logits/rejected": -1.7416681051254272, "logps/chosen": -0.9927698969841003, "logps/ref_chosen": -0.8696678280830383, "logps/ref_rejected": -0.8965504765510559, "logps/rejected": -1.030956506729126, "loss": 0.4967, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.3077549934387207, "rewards/margins": 0.028260568156838417, "rewards/rejected": -0.33601561188697815, "step": 50 }, { "epoch": 0.2351055303232701, "grad_norm": 0.31701868772506714, "learning_rate": 9.466913223222465e-07, "logits/chosen": -1.5519920587539673, "logits/rejected": -1.4699208736419678, "logps/chosen": -0.8616452217102051, "logps/ref_chosen": -0.7731812596321106, "logps/ref_rejected": -0.7838868498802185, "logps/rejected": -0.8863974809646606, "loss": 0.4956, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.22115974128246307, "rewards/margins": 0.03511647880077362, "rewards/rejected": -0.2562762200832367, "step": 55 }, { "epoch": 0.2564787603526583, "grad_norm": 0.40317973494529724, "learning_rate": 9.285621438083997e-07, "logits/chosen": -1.601485252380371, "logits/rejected": -1.5545583963394165, "logps/chosen": -0.8779473304748535, "logps/ref_chosen": -0.7888692617416382, "logps/ref_rejected": -0.8163660168647766, "logps/rejected": -0.9202233552932739, "loss": 0.4936, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.22269515693187714, "rewards/margins": 0.03694819286465645, "rewards/rejected": -0.2596433460712433, "step": 60 }, { "epoch": 0.2778519903820465, "grad_norm": 0.5152870416641235, "learning_rate": 9.080132930355566e-07, "logits/chosen": -1.6490017175674438, "logits/rejected": -1.6716206073760986, "logps/chosen": -0.9653270840644836, "logps/ref_chosen": -0.8533055186271667, "logps/ref_rejected": -0.9036076664924622, "logps/rejected": -1.0383471250534058, "loss": 0.4942, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.28005388379096985, "rewards/margins": 0.05679459124803543, "rewards/rejected": -0.33684849739074707, "step": 65 }, { "epoch": 0.2992252204114347, "grad_norm": 0.4174951910972595, "learning_rate": 8.851607893136064e-07, "logits/chosen": -1.728899598121643, "logits/rejected": -1.6759620904922485, "logps/chosen": -0.9261114001274109, "logps/ref_chosen": -0.8132463693618774, "logps/ref_rejected": -0.8208681344985962, "logps/rejected": -0.9566439390182495, "loss": 0.4926, "rewards/accuracies": 0.5625, "rewards/chosen": -0.28216248750686646, "rewards/margins": 0.057276882231235504, "rewards/rejected": -0.33943939208984375, "step": 70 }, { "epoch": 0.32059845044082286, "grad_norm": 0.36295104026794434, "learning_rate": 8.601336584328658e-07, "logits/chosen": -1.7176015377044678, "logits/rejected": -1.7168267965316772, "logps/chosen": -0.9694639444351196, "logps/ref_chosen": -0.8283951878547668, "logps/ref_rejected": -0.8723212480545044, "logps/rejected": -1.0357882976531982, "loss": 0.4939, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.3526715338230133, "rewards/margins": 0.05599608272314072, "rewards/rejected": -0.4086676239967346, "step": 75 }, { "epoch": 0.34197168047021104, "grad_norm": 0.6786319017410278, "learning_rate": 8.330732041813366e-07, "logits/chosen": -1.8271814584732056, "logits/rejected": -1.7772512435913086, "logps/chosen": -0.8929777145385742, "logps/ref_chosen": -0.8355891108512878, "logps/ref_rejected": -0.9002590179443359, "logps/rejected": -0.9975314140319824, "loss": 0.4901, "rewards/accuracies": 0.53125, "rewards/chosen": -0.14347167313098907, "rewards/margins": 0.09970954060554504, "rewards/rejected": -0.24318119883537292, "step": 80 }, { "epoch": 0.36334491049959927, "grad_norm": 0.8407174348831177, "learning_rate": 8.041322105400921e-07, "logits/chosen": -1.706368088722229, "logits/rejected": -1.650854468345642, "logps/chosen": -0.8318307995796204, "logps/ref_chosen": -0.8256785273551941, "logps/ref_rejected": -0.8488883972167969, "logps/rejected": -0.9011926651000977, "loss": 0.4878, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.01538090594112873, "rewards/margins": 0.1153799295425415, "rewards/rejected": -0.13076083362102509, "step": 85 }, { "epoch": 0.38471814052898745, "grad_norm": 1.0095113515853882, "learning_rate": 7.734740790612136e-07, "logits/chosen": -1.8660595417022705, "logits/rejected": -1.8641777038574219, "logps/chosen": -0.8596251606941223, "logps/ref_chosen": -0.9228288531303406, "logps/ref_rejected": -0.9406684637069702, "logps/rejected": -0.9123810529708862, "loss": 0.4896, "rewards/accuracies": 0.59375, "rewards/chosen": 0.15800921618938446, "rewards/margins": 0.08729076385498047, "rewards/rejected": 0.07071846723556519, "step": 90 }, { "epoch": 0.40609137055837563, "grad_norm": 0.8767968416213989, "learning_rate": 7.412719062986631e-07, "logits/chosen": -1.9249579906463623, "logits/rejected": -1.8531602621078491, "logps/chosen": -0.8149619102478027, "logps/ref_chosen": -0.9041957855224609, "logps/ref_rejected": -0.914394736289978, "logps/rejected": -0.8845084309577942, "loss": 0.4859, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.22308464348316193, "rewards/margins": 0.14836890995502472, "rewards/rejected": 0.0747157484292984, "step": 95 }, { "epoch": 0.4274646005877638, "grad_norm": 0.6044840812683105, "learning_rate": 7.077075065009433e-07, "logits/chosen": -1.731792688369751, "logits/rejected": -1.7363353967666626, "logps/chosen": -0.7217603921890259, "logps/ref_chosen": -0.8257284164428711, "logps/ref_rejected": -0.8479409217834473, "logps/rejected": -0.784611701965332, "loss": 0.4852, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.2599199414253235, "rewards/margins": 0.10159693658351898, "rewards/rejected": 0.1583230048418045, "step": 100 }, { "epoch": 0.448837830617152, "grad_norm": 0.7751099467277527, "learning_rate": 6.72970385083438e-07, "logits/chosen": -1.9043171405792236, "logits/rejected": -1.789009690284729, "logps/chosen": -0.7011796236038208, "logps/ref_chosen": -0.8166704177856445, "logps/ref_rejected": -0.8361040949821472, "logps/rejected": -0.7531259655952454, "loss": 0.4905, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.2887269854545593, "rewards/margins": 0.0812816247344017, "rewards/rejected": 0.20744535326957703, "step": 105 }, { "epoch": 0.4702110606465402, "grad_norm": 0.7444783449172974, "learning_rate": 6.372566686762426e-07, "logits/chosen": -1.8429124355316162, "logits/rejected": -1.760053277015686, "logps/chosen": -0.7318671941757202, "logps/ref_chosen": -0.8331576585769653, "logps/ref_rejected": -0.8635438084602356, "logps/rejected": -0.7953906059265137, "loss": 0.4817, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.253226101398468, "rewards/margins": 0.08284299075603485, "rewards/rejected": 0.17038312554359436, "step": 110 }, { "epoch": 0.4915842906759284, "grad_norm": 0.8039557337760925, "learning_rate": 6.00767997788451e-07, "logits/chosen": -1.9033887386322021, "logits/rejected": -1.770939588546753, "logps/chosen": -0.7566145062446594, "logps/ref_chosen": -0.8713752627372742, "logps/ref_rejected": -0.8939735293388367, "logps/rejected": -0.8082603216171265, "loss": 0.4844, "rewards/accuracies": 0.625, "rewards/chosen": 0.28690171241760254, "rewards/margins": 0.07261888682842255, "rewards/rejected": 0.2142828404903412, "step": 115 }, { "epoch": 0.5129575207053166, "grad_norm": 1.1172226667404175, "learning_rate": 5.637103883409525e-07, "logits/chosen": -1.9406812191009521, "logits/rejected": -1.867462158203125, "logps/chosen": -0.8249004483222961, "logps/ref_chosen": -0.873686671257019, "logps/ref_rejected": -0.9026174545288086, "logps/rejected": -0.8801124691963196, "loss": 0.4836, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.1219654530286789, "rewards/margins": 0.06570279598236084, "rewards/rejected": 0.05626266077160835, "step": 120 }, { "epoch": 0.5343307507347048, "grad_norm": 0.8093484044075012, "learning_rate": 5.262930684955438e-07, "logits/chosen": -2.0165348052978516, "logits/rejected": -1.9574447870254517, "logps/chosen": -0.8015368580818176, "logps/ref_chosen": -0.815376877784729, "logps/ref_rejected": -0.8817696571350098, "logps/rejected": -0.9249275326728821, "loss": 0.4825, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.034600116312503815, "rewards/margins": 0.14249476790428162, "rewards/rejected": -0.1078946590423584, "step": 125 }, { "epoch": 0.555703980764093, "grad_norm": 0.8614518642425537, "learning_rate": 4.88727297347654e-07, "logits/chosen": -1.951319694519043, "logits/rejected": -1.933098554611206, "logps/chosen": -0.7757576107978821, "logps/ref_chosen": -0.7751168608665466, "logps/ref_rejected": -0.8734749555587769, "logps/rejected": -0.9476582407951355, "loss": 0.4794, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0016015321016311646, "rewards/margins": 0.18385668098926544, "rewards/rejected": -0.1854582130908966, "step": 130 }, { "epoch": 0.5770772107934812, "grad_norm": 1.054673194885254, "learning_rate": 4.512251721523659e-07, "logits/chosen": -2.005443811416626, "logits/rejected": -2.0154759883880615, "logps/chosen": -0.7493831515312195, "logps/ref_chosen": -0.7740285992622375, "logps/ref_rejected": -0.8138446807861328, "logps/rejected": -0.851770281791687, "loss": 0.4847, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.06161379814147949, "rewards/margins": 0.15642789006233215, "rewards/rejected": -0.09481407701969147, "step": 135 }, { "epoch": 0.5984504408228694, "grad_norm": 1.669247031211853, "learning_rate": 4.139984308181708e-07, "logits/chosen": -1.9584558010101318, "logits/rejected": -1.8885042667388916, "logps/chosen": -0.7844404578208923, "logps/ref_chosen": -0.8231161236763, "logps/ref_rejected": -0.83356112241745, "logps/rejected": -0.8187839388847351, "loss": 0.481, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.09668895602226257, "rewards/margins": 0.0597461462020874, "rewards/rejected": 0.03694281354546547, "step": 140 }, { "epoch": 0.6198236708522575, "grad_norm": 1.0050734281539917, "learning_rate": 3.772572564296004e-07, "logits/chosen": -1.8883240222930908, "logits/rejected": -1.8240330219268799, "logps/chosen": -0.7662582397460938, "logps/ref_chosen": -0.8861669301986694, "logps/ref_rejected": -0.924543023109436, "logps/rejected": -0.8522801399230957, "loss": 0.4785, "rewards/accuracies": 0.59375, "rewards/chosen": 0.29977160692214966, "rewards/margins": 0.11911455541849136, "rewards/rejected": 0.18065707385540009, "step": 145 }, { "epoch": 0.6411969008816457, "grad_norm": 0.9123177528381348, "learning_rate": 3.412090905484337e-07, "logits/chosen": -1.9726581573486328, "logits/rejected": -1.9151983261108398, "logps/chosen": -0.7370959520339966, "logps/ref_chosen": -0.866258978843689, "logps/ref_rejected": -0.8657606840133667, "logps/rejected": -0.8145822286605835, "loss": 0.4817, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.3229079246520996, "rewards/margins": 0.19496168196201324, "rewards/rejected": 0.12794628739356995, "step": 150 }, { "epoch": 0.6625701309110339, "grad_norm": 0.7277486324310303, "learning_rate": 3.060574619936075e-07, "logits/chosen": -1.8861163854599, "logits/rejected": -1.9004647731781006, "logps/chosen": -0.7181011438369751, "logps/ref_chosen": -0.8273455500602722, "logps/ref_rejected": -0.8894198536872864, "logps/rejected": -0.8557901382446289, "loss": 0.4828, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.27311110496520996, "rewards/margins": 0.18903681635856628, "rewards/rejected": 0.08407425880432129, "step": 155 }, { "epoch": 0.6839433609404221, "grad_norm": 1.0556169748306274, "learning_rate": 2.720008377125682e-07, "logits/chosen": -2.1498093605041504, "logits/rejected": -2.08402419090271, "logps/chosen": -0.7090437412261963, "logps/ref_chosen": -0.8103801012039185, "logps/ref_rejected": -0.8715206980705261, "logps/rejected": -0.7982600927352905, "loss": 0.4815, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.25334107875823975, "rewards/margins": 0.0701896995306015, "rewards/rejected": 0.18315134942531586, "step": 160 }, { "epoch": 0.7053165909698104, "grad_norm": 0.9068896770477295, "learning_rate": 2.3923150223207173e-07, "logits/chosen": -1.9448268413543701, "logits/rejected": -1.904314637184143, "logps/chosen": -0.7294695377349854, "logps/ref_chosen": -0.8327474594116211, "logps/ref_rejected": -0.9134753346443176, "logps/rejected": -0.8374517560005188, "loss": 0.4781, "rewards/accuracies": 0.59375, "rewards/chosen": 0.25819462537765503, "rewards/margins": 0.06813579052686691, "rewards/rejected": 0.1900588572025299, "step": 165 }, { "epoch": 0.7266898209991985, "grad_norm": 0.9222660660743713, "learning_rate": 2.0793447201508286e-07, "logits/chosen": -1.9369819164276123, "logits/rejected": -1.9469242095947266, "logps/chosen": -0.6714679002761841, "logps/ref_chosen": -0.7705163359642029, "logps/ref_rejected": -0.8333786129951477, "logps/rejected": -0.7605674862861633, "loss": 0.4845, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.24762126803398132, "rewards/margins": 0.06559363007545471, "rewards/rejected": 0.18202762305736542, "step": 170 }, { "epoch": 0.7480630510285867, "grad_norm": 0.7805910110473633, "learning_rate": 1.7828645085333644e-07, "logits/chosen": -1.9515256881713867, "logits/rejected": -1.9057369232177734, "logps/chosen": -0.7692245244979858, "logps/ref_chosen": -0.8767744302749634, "logps/ref_rejected": -0.8914516568183899, "logps/rejected": -0.8368504643440247, "loss": 0.4772, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.2688748240470886, "rewards/margins": 0.13237187266349792, "rewards/rejected": 0.1365029364824295, "step": 175 }, { "epoch": 0.7694362810579749, "grad_norm": 0.8946753740310669, "learning_rate": 1.5045483219344385e-07, "logits/chosen": -2.008927583694458, "logits/rejected": -1.9914191961288452, "logps/chosen": -0.7557133436203003, "logps/ref_chosen": -0.8390854597091675, "logps/ref_rejected": -0.8787837028503418, "logps/rejected": -0.853840708732605, "loss": 0.479, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.2084302008152008, "rewards/margins": 0.14607290923595428, "rewards/rejected": 0.062357254326343536, "step": 180 }, { "epoch": 0.7908095110873631, "grad_norm": 1.0278949737548828, "learning_rate": 1.2459675402943288e-07, "logits/chosen": -2.0313353538513184, "logits/rejected": -1.9398431777954102, "logps/chosen": -0.7753286361694336, "logps/ref_chosen": -0.8660305142402649, "logps/ref_rejected": -0.8604307174682617, "logps/rejected": -0.7984707951545715, "loss": 0.4765, "rewards/accuracies": 0.625, "rewards/chosen": 0.22675485908985138, "rewards/margins": 0.07185501605272293, "rewards/rejected": 0.15489983558654785, "step": 185 }, { "epoch": 0.8121827411167513, "grad_norm": 0.8263163566589355, "learning_rate": 1.0085821169782199e-07, "logits/chosen": -2.0978431701660156, "logits/rejected": -2.055168628692627, "logps/chosen": -0.7470763921737671, "logps/ref_chosen": -0.8603025674819946, "logps/ref_rejected": -0.9167188405990601, "logps/rejected": -0.8492987751960754, "loss": 0.4776, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2830653786659241, "rewards/margins": 0.11451487243175507, "rewards/rejected": 0.1685505211353302, "step": 190 }, { "epoch": 0.8335559711461394, "grad_norm": 0.9788782596588135, "learning_rate": 7.937323358440934e-08, "logits/chosen": -2.13584041595459, "logits/rejected": -2.0809855461120605, "logps/chosen": -0.7344987988471985, "logps/ref_chosen": -0.8153272867202759, "logps/ref_rejected": -0.8524506688117981, "logps/rejected": -0.8127390742301941, "loss": 0.475, "rewards/accuracies": 0.59375, "rewards/chosen": 0.20207130908966064, "rewards/margins": 0.10279206931591034, "rewards/rejected": 0.09927921742200851, "step": 195 }, { "epoch": 0.8549292011755276, "grad_norm": 1.1459167003631592, "learning_rate": 6.026312439675551e-08, "logits/chosen": -1.9530729055404663, "logits/rejected": -1.8388773202896118, "logps/chosen": -0.7524019479751587, "logps/ref_chosen": -0.832992672920227, "logps/ref_rejected": -0.8378564715385437, "logps/rejected": -0.7900499701499939, "loss": 0.4783, "rewards/accuracies": 0.59375, "rewards/chosen": 0.20147652924060822, "rewards/margins": 0.08196047693490982, "rewards/rejected": 0.11951601505279541, "step": 200 }, { "epoch": 0.8763024312049158, "grad_norm": 1.2217403650283813, "learning_rate": 4.3635780274861864e-08, "logits/chosen": -1.9760059118270874, "logits/rejected": -1.881757378578186, "logps/chosen": -0.754524827003479, "logps/ref_chosen": -0.8361877202987671, "logps/ref_rejected": -0.8636215329170227, "logps/rejected": -0.8430477380752563, "loss": 0.4761, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.20415742695331573, "rewards/margins": 0.15272292494773865, "rewards/rejected": 0.05143451690673828, "step": 205 }, { "epoch": 0.897675661234304, "grad_norm": 0.9312232732772827, "learning_rate": 2.958507960694784e-08, "logits/chosen": -1.9826923608779907, "logits/rejected": -1.963322401046753, "logps/chosen": -0.7218400239944458, "logps/ref_chosen": -0.775153636932373, "logps/ref_rejected": -0.82710200548172, "logps/rejected": -0.8231655359268188, "loss": 0.4756, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.13328400254249573, "rewards/margins": 0.12344253063201904, "rewards/rejected": 0.009841480292379856, "step": 210 }, { "epoch": 0.9190488912636923, "grad_norm": 0.9846080541610718, "learning_rate": 1.8190352989793322e-08, "logits/chosen": -1.9855095148086548, "logits/rejected": -1.907576560974121, "logps/chosen": -0.7199736833572388, "logps/ref_chosen": -0.803063690662384, "logps/ref_rejected": -0.8516971468925476, "logps/rejected": -0.8365123867988586, "loss": 0.4776, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.2077248990535736, "rewards/margins": 0.16976311802864075, "rewards/rejected": 0.03796178475022316, "step": 215 }, { "epoch": 0.9404221212930804, "grad_norm": 0.7335969805717468, "learning_rate": 9.515935326265378e-09, "logits/chosen": -2.0024361610412598, "logits/rejected": -1.9636704921722412, "logps/chosen": -0.7521845698356628, "logps/ref_chosen": -0.8253963589668274, "logps/ref_rejected": -0.849533200263977, "logps/rejected": -0.8330303430557251, "loss": 0.4757, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.1830292046070099, "rewards/margins": 0.14177197217941284, "rewards/rejected": 0.04125722497701645, "step": 220 }, { "epoch": 0.9617953513224686, "grad_norm": 1.1543854475021362, "learning_rate": 3.6108025888958447e-09, "logits/chosen": -1.9360746145248413, "logits/rejected": -1.911627173423767, "logps/chosen": -0.7084980010986328, "logps/ref_chosen": -0.7970255613327026, "logps/ref_rejected": -0.8132475018501282, "logps/rejected": -0.7666771411895752, "loss": 0.4765, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.22131893038749695, "rewards/margins": 0.10489317029714584, "rewards/rejected": 0.1164257749915123, "step": 225 }, { "epoch": 0.9831685813518568, "grad_norm": 1.1417807340621948, "learning_rate": 5.082953003528456e-10, "logits/chosen": -2.013756513595581, "logits/rejected": -2.0383520126342773, "logps/chosen": -0.8249381184577942, "logps/ref_chosen": -0.8979822993278503, "logps/ref_rejected": -0.9172071218490601, "logps/rejected": -0.9182927012443542, "loss": 0.4774, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.1826106607913971, "rewards/margins": 0.18532457947731018, "rewards/rejected": -0.0027139366138726473, "step": 230 }, { "epoch": 0.9959925193694897, "step": 233, "total_flos": 0.0, "train_loss": 0.4860667634931245, "train_runtime": 16529.3176, "train_samples_per_second": 3.622, "train_steps_per_second": 0.014 } ], "logging_steps": 5, "max_steps": 233, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }