two_agent_1_epoch_2_dpo_iter_5 / trainer_state.json
YYYYYYibo's picture
Model save
79a0033 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984,
"eval_steps": 500,
"global_step": 156,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"grad_norm": 30.20068910303907,
"learning_rate": 6.25e-09,
"logits/chosen": 0.05829288810491562,
"logits/rejected": 0.12195920199155807,
"logps/chosen": -235.51776123046875,
"logps/rejected": -252.3660888671875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.06,
"grad_norm": 35.659220970788716,
"learning_rate": 6.25e-08,
"logits/chosen": -0.1302209049463272,
"logits/rejected": -0.3692338764667511,
"logps/chosen": -247.50286865234375,
"logps/rejected": -264.7320556640625,
"loss": 0.6932,
"rewards/accuracies": 0.4340277910232544,
"rewards/chosen": -0.0013414309360086918,
"rewards/margins": -7.45424404158257e-05,
"rewards/rejected": -0.001266888459213078,
"step": 10
},
{
"epoch": 0.13,
"grad_norm": 29.89211126085747,
"learning_rate": 9.979871469976195e-08,
"logits/chosen": -0.259229838848114,
"logits/rejected": -0.35706019401550293,
"logps/chosen": -241.0030517578125,
"logps/rejected": -251.1970977783203,
"loss": 0.6931,
"rewards/accuracies": 0.503125011920929,
"rewards/chosen": -0.02032109722495079,
"rewards/margins": -0.0005441303364932537,
"rewards/rejected": -0.01977696642279625,
"step": 20
},
{
"epoch": 0.19,
"grad_norm": 38.00242034346191,
"learning_rate": 9.755282581475768e-08,
"logits/chosen": -0.37507274746894836,
"logits/rejected": -0.30858054757118225,
"logps/chosen": -250.5952911376953,
"logps/rejected": -267.4319152832031,
"loss": 0.6913,
"rewards/accuracies": 0.565625011920929,
"rewards/chosen": -0.04578831046819687,
"rewards/margins": 0.00868249125778675,
"rewards/rejected": -0.05447079613804817,
"step": 30
},
{
"epoch": 0.26,
"grad_norm": 44.38122948328188,
"learning_rate": 9.29224396800933e-08,
"logits/chosen": -0.30581027269363403,
"logits/rejected": -0.25420406460762024,
"logps/chosen": -245.3369598388672,
"logps/rejected": -265.40728759765625,
"loss": 0.6889,
"rewards/accuracies": 0.528124988079071,
"rewards/chosen": -0.0466889962553978,
"rewards/margins": 0.00311011029407382,
"rewards/rejected": -0.049799107015132904,
"step": 40
},
{
"epoch": 0.32,
"grad_norm": 33.66958666071777,
"learning_rate": 8.613974319136957e-08,
"logits/chosen": -0.3118899464607239,
"logits/rejected": -0.2555062770843506,
"logps/chosen": -246.3278045654297,
"logps/rejected": -265.2810974121094,
"loss": 0.6874,
"rewards/accuracies": 0.5843750238418579,
"rewards/chosen": -0.04047214612364769,
"rewards/margins": 0.024090787395834923,
"rewards/rejected": -0.06456293165683746,
"step": 50
},
{
"epoch": 0.38,
"grad_norm": 39.42735209702952,
"learning_rate": 7.754484907260513e-08,
"logits/chosen": -0.26858726143836975,
"logits/rejected": -0.28768494725227356,
"logps/chosen": -255.31631469726562,
"logps/rejected": -269.5321960449219,
"loss": 0.6875,
"rewards/accuracies": 0.5718749761581421,
"rewards/chosen": -0.06594480574131012,
"rewards/margins": 0.02973010577261448,
"rewards/rejected": -0.09567491710186005,
"step": 60
},
{
"epoch": 0.45,
"grad_norm": 38.57810394961094,
"learning_rate": 6.756874120406714e-08,
"logits/chosen": -0.3371458649635315,
"logits/rejected": -0.1996055543422699,
"logps/chosen": -242.06527709960938,
"logps/rejected": -267.87457275390625,
"loss": 0.685,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.0042762779630720615,
"rewards/margins": 0.029838895425200462,
"rewards/rejected": -0.03411517292261124,
"step": 70
},
{
"epoch": 0.51,
"grad_norm": 37.950575567812315,
"learning_rate": 5.6711663290882774e-08,
"logits/chosen": -0.19361785054206848,
"logits/rejected": -0.26724424958229065,
"logps/chosen": -230.7703399658203,
"logps/rejected": -257.036376953125,
"loss": 0.6904,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.005543149076402187,
"rewards/margins": 0.030957844108343124,
"rewards/rejected": -0.03650099039077759,
"step": 80
},
{
"epoch": 0.58,
"grad_norm": 50.50742600928591,
"learning_rate": 4.551803455482833e-08,
"logits/chosen": -0.23545122146606445,
"logits/rejected": -0.22761189937591553,
"logps/chosen": -248.32565307617188,
"logps/rejected": -266.6417541503906,
"loss": 0.6884,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.016439877450466156,
"rewards/margins": 0.02168435789644718,
"rewards/rejected": -0.03812423720955849,
"step": 90
},
{
"epoch": 0.64,
"grad_norm": 45.90060699086576,
"learning_rate": 3.4549150281252633e-08,
"logits/chosen": -0.24451108276844025,
"logits/rejected": -0.11925282329320908,
"logps/chosen": -253.621826171875,
"logps/rejected": -265.9607849121094,
"loss": 0.6907,
"rewards/accuracies": 0.534375011920929,
"rewards/chosen": -0.05188063532114029,
"rewards/margins": 0.025425013154745102,
"rewards/rejected": -0.07730564475059509,
"step": 100
},
{
"epoch": 0.7,
"grad_norm": 38.076986954965236,
"learning_rate": 2.43550361297047e-08,
"logits/chosen": -0.12951107323169708,
"logits/rejected": -0.1342063993215561,
"logps/chosen": -243.32846069335938,
"logps/rejected": -262.29034423828125,
"loss": 0.6895,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.07932275533676147,
"rewards/margins": 0.019449030980467796,
"rewards/rejected": -0.09877178817987442,
"step": 110
},
{
"epoch": 0.77,
"grad_norm": 37.05649918470721,
"learning_rate": 1.5446867550656767e-08,
"logits/chosen": -0.2998012900352478,
"logits/rejected": -0.23906514048576355,
"logps/chosen": -253.05178833007812,
"logps/rejected": -268.27691650390625,
"loss": 0.6859,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.08057109266519547,
"rewards/margins": 0.011484967544674873,
"rewards/rejected": -0.09205605834722519,
"step": 120
},
{
"epoch": 0.83,
"grad_norm": 37.629166998074275,
"learning_rate": 8.271337313934867e-09,
"logits/chosen": 0.0005074322107248008,
"logits/rejected": -0.1076931357383728,
"logps/chosen": -243.9499053955078,
"logps/rejected": -260.7448425292969,
"loss": 0.6848,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.08455522358417511,
"rewards/margins": 0.022291380912065506,
"rewards/rejected": -0.10684660822153091,
"step": 130
},
{
"epoch": 0.9,
"grad_norm": 62.22968620333278,
"learning_rate": 3.1882564680131396e-09,
"logits/chosen": -0.28716200590133667,
"logits/rejected": -0.3127291798591614,
"logps/chosen": -243.41116333007812,
"logps/rejected": -268.4090270996094,
"loss": 0.687,
"rewards/accuracies": 0.578125,
"rewards/chosen": -0.06137162446975708,
"rewards/margins": 0.038829904049634933,
"rewards/rejected": -0.10020153224468231,
"step": 140
},
{
"epoch": 0.96,
"grad_norm": 37.82208076011054,
"learning_rate": 4.52511911603265e-10,
"logits/chosen": -0.08432894945144653,
"logits/rejected": -0.23105964064598083,
"logps/chosen": -239.08517456054688,
"logps/rejected": -257.0472106933594,
"loss": 0.6819,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.09014703333377838,
"rewards/margins": 0.016562053933739662,
"rewards/rejected": -0.1067090854048729,
"step": 150
},
{
"epoch": 1.0,
"step": 156,
"total_flos": 0.0,
"train_loss": 0.6882889736921359,
"train_runtime": 18117.9848,
"train_samples_per_second": 1.104,
"train_steps_per_second": 0.009
}
],
"logging_steps": 10,
"max_steps": 156,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}