simple_online_epoch_2_dpo_iter_5 / trainer_state.json
YYYYYYibo's picture
Model save
4264ef2 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9945,
"eval_steps": 500,
"global_step": 153,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"grad_norm": 28.229060496303497,
"learning_rate": 3.125e-08,
"logits/chosen": 0.5326807498931885,
"logits/rejected": 0.5883637070655823,
"logps/chosen": -185.19822692871094,
"logps/rejected": -194.60989379882812,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.07,
"grad_norm": 32.45486691880899,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": 0.33694812655448914,
"logits/rejected": 0.18525859713554382,
"logps/chosen": -228.18931579589844,
"logps/rejected": -250.44186401367188,
"loss": 0.6926,
"rewards/accuracies": 0.44017094373703003,
"rewards/chosen": -0.009515076875686646,
"rewards/margins": 0.0016028096433728933,
"rewards/rejected": -0.011117885820567608,
"step": 10
},
{
"epoch": 0.13,
"grad_norm": 32.74664763693393,
"learning_rate": 4.989490450759331e-07,
"logits/chosen": 0.37088415026664734,
"logits/rejected": 0.4128836989402771,
"logps/chosen": -254.14837646484375,
"logps/rejected": -276.72271728515625,
"loss": 0.6928,
"rewards/accuracies": 0.5384615659713745,
"rewards/chosen": -0.197755366563797,
"rewards/margins": 0.020086202770471573,
"rewards/rejected": -0.21784158051013947,
"step": 20
},
{
"epoch": 0.2,
"grad_norm": 38.40726627675541,
"learning_rate": 4.872270441827174e-07,
"logits/chosen": 0.354877769947052,
"logits/rejected": 0.33263731002807617,
"logps/chosen": -253.26522827148438,
"logps/rejected": -263.25042724609375,
"loss": 0.6936,
"rewards/accuracies": 0.5192307829856873,
"rewards/chosen": -0.3028598725795746,
"rewards/margins": 0.01474391482770443,
"rewards/rejected": -0.3176037669181824,
"step": 30
},
{
"epoch": 0.26,
"grad_norm": 38.797592985526,
"learning_rate": 4.6308512113530063e-07,
"logits/chosen": 0.1634213924407959,
"logits/rejected": 0.2352660596370697,
"logps/chosen": -244.1608123779297,
"logps/rejected": -260.97369384765625,
"loss": 0.7013,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0662732720375061,
"rewards/margins": 0.021908778697252274,
"rewards/rejected": -0.08818206936120987,
"step": 40
},
{
"epoch": 0.33,
"grad_norm": 35.745848630813434,
"learning_rate": 4.277872161641681e-07,
"logits/chosen": 0.44109058380126953,
"logits/rejected": 0.4526838958263397,
"logps/chosen": -233.4575653076172,
"logps/rejected": -257.31494140625,
"loss": 0.6925,
"rewards/accuracies": 0.5423076748847961,
"rewards/chosen": -0.13867294788360596,
"rewards/margins": 0.03718903288245201,
"rewards/rejected": -0.17586196959018707,
"step": 50
},
{
"epoch": 0.39,
"grad_norm": 38.266936092602805,
"learning_rate": 3.8318133624280046e-07,
"logits/chosen": 0.3567802309989929,
"logits/rejected": 0.4483684301376343,
"logps/chosen": -273.6899719238281,
"logps/rejected": -289.3863220214844,
"loss": 0.6819,
"rewards/accuracies": 0.5769230723381042,
"rewards/chosen": -0.3489604592323303,
"rewards/margins": 0.07096390426158905,
"rewards/rejected": -0.4199243485927582,
"step": 60
},
{
"epoch": 0.46,
"grad_norm": 36.58606527761828,
"learning_rate": 3.316028034595861e-07,
"logits/chosen": 0.49843645095825195,
"logits/rejected": 0.5629610419273376,
"logps/chosen": -280.0177917480469,
"logps/rejected": -301.7088317871094,
"loss": 0.6845,
"rewards/accuracies": 0.5538461804389954,
"rewards/chosen": -0.45351117849349976,
"rewards/margins": 0.03525887802243233,
"rewards/rejected": -0.4887700378894806,
"step": 70
},
{
"epoch": 0.52,
"grad_norm": 37.11287003011496,
"learning_rate": 2.7575199021178855e-07,
"logits/chosen": 0.6807990670204163,
"logits/rejected": 0.7718464136123657,
"logps/chosen": -280.74658203125,
"logps/rejected": -311.41009521484375,
"loss": 0.676,
"rewards/accuracies": 0.5923076868057251,
"rewards/chosen": -0.41281428933143616,
"rewards/margins": 0.10995330661535263,
"rewards/rejected": -0.5227676033973694,
"step": 80
},
{
"epoch": 0.58,
"grad_norm": 50.701799655354534,
"learning_rate": 2.1855294234408068e-07,
"logits/chosen": 0.6627506613731384,
"logits/rejected": 0.6323168277740479,
"logps/chosen": -278.023681640625,
"logps/rejected": -306.6815185546875,
"loss": 0.6771,
"rewards/accuracies": 0.557692289352417,
"rewards/chosen": -0.4557516276836395,
"rewards/margins": 0.07288946956396103,
"rewards/rejected": -0.5286410450935364,
"step": 90
},
{
"epoch": 0.65,
"grad_norm": 29.961533632761956,
"learning_rate": 1.6300029195778453e-07,
"logits/chosen": 0.5369245409965515,
"logits/rejected": 0.5473312735557556,
"logps/chosen": -275.7201843261719,
"logps/rejected": -282.8805236816406,
"loss": 0.7042,
"rewards/accuracies": 0.5115384459495544,
"rewards/chosen": -0.3841624855995178,
"rewards/margins": -0.015378502197563648,
"rewards/rejected": -0.36878401041030884,
"step": 100
},
{
"epoch": 0.71,
"grad_norm": 31.11285576879039,
"learning_rate": 1.1200247470632392e-07,
"logits/chosen": 0.31445741653442383,
"logits/rejected": 0.33911341428756714,
"logps/chosen": -258.0011901855469,
"logps/rejected": -270.0567932128906,
"loss": 0.6857,
"rewards/accuracies": 0.5423076748847961,
"rewards/chosen": -0.2574421763420105,
"rewards/margins": 0.022587427869439125,
"rewards/rejected": -0.2800295948982239,
"step": 110
},
{
"epoch": 0.78,
"grad_norm": 31.866161116501196,
"learning_rate": 6.822945986946385e-08,
"logits/chosen": 0.6134840250015259,
"logits/rejected": 0.691197395324707,
"logps/chosen": -284.92510986328125,
"logps/rejected": -306.45050048828125,
"loss": 0.6837,
"rewards/accuracies": 0.5423076748847961,
"rewards/chosen": -0.5774205327033997,
"rewards/margins": 0.037515509873628616,
"rewards/rejected": -0.6149360537528992,
"step": 120
},
{
"epoch": 0.84,
"grad_norm": 41.02700179238128,
"learning_rate": 3.397296523427806e-08,
"logits/chosen": 1.0320525169372559,
"logits/rejected": 0.8667150735855103,
"logps/chosen": -293.99853515625,
"logps/rejected": -317.0647888183594,
"loss": 0.688,
"rewards/accuracies": 0.5961538553237915,
"rewards/chosen": -0.5775225162506104,
"rewards/margins": 0.07906623929738998,
"rewards/rejected": -0.6565887928009033,
"step": 130
},
{
"epoch": 0.91,
"grad_norm": 33.934196556449756,
"learning_rate": 1.1026475173977978e-08,
"logits/chosen": 0.4998157322406769,
"logits/rejected": 0.41973716020584106,
"logps/chosen": -282.9462585449219,
"logps/rejected": -299.1942443847656,
"loss": 0.6794,
"rewards/accuracies": 0.557692289352417,
"rewards/chosen": -0.4659727215766907,
"rewards/margins": 0.056435175240039825,
"rewards/rejected": -0.5224078297615051,
"step": 140
},
{
"epoch": 0.97,
"grad_norm": 38.33373569305899,
"learning_rate": 5.913435276374834e-10,
"logits/chosen": 0.42322245240211487,
"logits/rejected": 0.43874579668045044,
"logps/chosen": -272.926513671875,
"logps/rejected": -309.4190368652344,
"loss": 0.6797,
"rewards/accuracies": 0.5807692408561707,
"rewards/chosen": -0.5053122043609619,
"rewards/margins": 0.1106579527258873,
"rewards/rejected": -0.6159701943397522,
"step": 150
},
{
"epoch": 0.99,
"step": 153,
"total_flos": 0.0,
"train_loss": 0.687320882978003,
"train_runtime": 21824.845,
"train_samples_per_second": 0.916,
"train_steps_per_second": 0.007
}
],
"logging_steps": 10,
"max_steps": 153,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}