{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984301412872841,
"eval_steps": 500,
"global_step": 159,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006279434850863423,
"grad_norm": 16.698454749053152,
"learning_rate": 1.875e-08,
"logits/chosen": 0.13163629174232483,
"logits/rejected": 0.7037353515625,
"logps/chosen": -296.6709289550781,
"logps/pi_response": -123.40753173828125,
"logps/ref_response": -123.40753173828125,
"logps/rejected": -325.5771484375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.06279434850863422,
"grad_norm": 24.513917430946538,
"learning_rate": 1.875e-07,
"logits/chosen": 0.6406950354576111,
"logits/rejected": 0.8759365081787109,
"logps/chosen": -260.0070495605469,
"logps/pi_response": -114.28534698486328,
"logps/ref_response": -114.47286224365234,
"logps/rejected": -385.5276184082031,
"loss": 0.6924,
"rewards/accuracies": 0.4097222089767456,
"rewards/chosen": 0.00021976388234179467,
"rewards/margins": -2.7502783268573694e-05,
"rewards/rejected": 0.000247266492806375,
"step": 10
},
{
"epoch": 0.12558869701726844,
"grad_norm": 20.98441253721937,
"learning_rate": 2.9942119880575817e-07,
"logits/chosen": 0.5097376704216003,
"logits/rejected": 0.8540347814559937,
"logps/chosen": -268.75433349609375,
"logps/pi_response": -118.2417221069336,
"logps/ref_response": -118.39286041259766,
"logps/rejected": -400.633544921875,
"loss": 0.6749,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.03052676096558571,
"rewards/margins": 0.040752165019512177,
"rewards/rejected": -0.07127892971038818,
"step": 20
},
{
"epoch": 0.18838304552590268,
"grad_norm": 13.999591521451507,
"learning_rate": 2.929608750821129e-07,
"logits/chosen": 0.5199416875839233,
"logits/rejected": 0.992133617401123,
"logps/chosen": -295.43292236328125,
"logps/pi_response": -119.1610336303711,
"logps/ref_response": -118.39522552490234,
"logps/rejected": -421.61041259765625,
"loss": 0.6174,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.17318478226661682,
"rewards/margins": 0.18207214772701263,
"rewards/rejected": -0.35525694489479065,
"step": 30
},
{
"epoch": 0.25117739403453687,
"grad_norm": 14.705995187750815,
"learning_rate": 2.7962832564252725e-07,
"logits/chosen": 0.5350409746170044,
"logits/rejected": 0.9762212634086609,
"logps/chosen": -292.2400207519531,
"logps/pi_response": -122.87149810791016,
"logps/ref_response": -120.0985336303711,
"logps/rejected": -484.0975646972656,
"loss": 0.5739,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.30772843956947327,
"rewards/margins": 0.45096302032470703,
"rewards/rejected": -0.7586914300918579,
"step": 40
},
{
"epoch": 0.3139717425431711,
"grad_norm": 20.925472606748368,
"learning_rate": 2.6006445513357056e-07,
"logits/chosen": 0.6897233724594116,
"logits/rejected": 1.0123343467712402,
"logps/chosen": -320.56976318359375,
"logps/pi_response": -124.61143493652344,
"logps/ref_response": -115.71650695800781,
"logps/rejected": -523.7175903320312,
"loss": 0.5629,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.6995627284049988,
"rewards/margins": 0.6693333387374878,
"rewards/rejected": -1.3688960075378418,
"step": 50
},
{
"epoch": 0.37676609105180536,
"grad_norm": 13.008934683020064,
"learning_rate": 2.3520971200967334e-07,
"logits/chosen": 0.6137208938598633,
"logits/rejected": 1.0412781238555908,
"logps/chosen": -332.25738525390625,
"logps/pi_response": -126.09577941894531,
"logps/ref_response": -118.1528549194336,
"logps/rejected": -495.4088439941406,
"loss": 0.555,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.5905637145042419,
"rewards/margins": 0.6077089309692383,
"rewards/rejected": -1.198272705078125,
"step": 60
},
{
"epoch": 0.43956043956043955,
"grad_norm": 12.777487677582881,
"learning_rate": 2.0625888054143427e-07,
"logits/chosen": 0.593045175075531,
"logits/rejected": 0.9839151501655579,
"logps/chosen": -273.0267333984375,
"logps/pi_response": -126.1861801147461,
"logps/ref_response": -120.62638854980469,
"logps/rejected": -531.7461547851562,
"loss": 0.5431,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.3775605261325836,
"rewards/margins": 0.8122557401657104,
"rewards/rejected": -1.1898162364959717,
"step": 70
},
{
"epoch": 0.5023547880690737,
"grad_norm": 10.709251992827037,
"learning_rate": 1.7460364672965327e-07,
"logits/chosen": 0.6686810255050659,
"logits/rejected": 1.0736128091812134,
"logps/chosen": -280.6498107910156,
"logps/pi_response": -112.1661376953125,
"logps/ref_response": -106.67897033691406,
"logps/rejected": -489.26556396484375,
"loss": 0.5321,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.4469337463378906,
"rewards/margins": 0.6373990774154663,
"rewards/rejected": -1.084332823753357,
"step": 80
},
{
"epoch": 0.565149136577708,
"grad_norm": 11.68215452300686,
"learning_rate": 1.4176569902035086e-07,
"logits/chosen": 0.6378764510154724,
"logits/rejected": 1.0353127717971802,
"logps/chosen": -339.03973388671875,
"logps/pi_response": -119.71498107910156,
"logps/ref_response": -111.9307861328125,
"logps/rejected": -513.7333984375,
"loss": 0.5119,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.5252664685249329,
"rewards/margins": 0.7039340734481812,
"rewards/rejected": -1.2292006015777588,
"step": 90
},
{
"epoch": 0.6279434850863422,
"grad_norm": 11.96502204484806,
"learning_rate": 1.0932357971453743e-07,
"logits/chosen": 0.7395003437995911,
"logits/rejected": 1.0328724384307861,
"logps/chosen": -301.8004455566406,
"logps/pi_response": -116.29144287109375,
"logps/ref_response": -108.0909423828125,
"logps/rejected": -524.360107421875,
"loss": 0.547,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.5083015561103821,
"rewards/margins": 0.7502217888832092,
"rewards/rejected": -1.2585232257843018,
"step": 100
},
{
"epoch": 0.6907378335949764,
"grad_norm": 11.40631298798362,
"learning_rate": 7.883680337481599e-08,
"logits/chosen": 0.7460795640945435,
"logits/rejected": 0.9838323593139648,
"logps/chosen": -305.3519592285156,
"logps/pi_response": -125.8452377319336,
"logps/ref_response": -117.07008361816406,
"logps/rejected": -504.4964294433594,
"loss": 0.5349,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.4922094941139221,
"rewards/margins": 0.6827653646469116,
"rewards/rejected": -1.174974799156189,
"step": 110
},
{
"epoch": 0.7535321821036107,
"grad_norm": 10.671563097729658,
"learning_rate": 5.177088990820725e-08,
"logits/chosen": 0.5097354650497437,
"logits/rejected": 0.8302543759346008,
"logps/chosen": -327.8287353515625,
"logps/pi_response": -134.05953979492188,
"logps/ref_response": -125.61170959472656,
"logps/rejected": -554.1288452148438,
"loss": 0.524,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.5032998323440552,
"rewards/margins": 0.7790510058403015,
"rewards/rejected": -1.282350778579712,
"step": 120
},
{
"epoch": 0.8163265306122449,
"grad_norm": 13.393410138993277,
"learning_rate": 2.942691603548416e-08,
"logits/chosen": 0.5484687089920044,
"logits/rejected": 1.0104806423187256,
"logps/chosen": -329.3583984375,
"logps/pi_response": -133.8960723876953,
"logps/ref_response": -126.83935546875,
"logps/rejected": -530.1004028320312,
"loss": 0.5048,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.4904448986053467,
"rewards/margins": 0.7761750221252441,
"rewards/rejected": -1.2666199207305908,
"step": 130
},
{
"epoch": 0.8791208791208791,
"grad_norm": 12.267577779535525,
"learning_rate": 1.2878971655412513e-08,
"logits/chosen": 0.5720739364624023,
"logits/rejected": 0.9325042963027954,
"logps/chosen": -310.2190856933594,
"logps/pi_response": -136.25198364257812,
"logps/ref_response": -126.86582946777344,
"logps/rejected": -564.576171875,
"loss": 0.5164,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.481649786233902,
"rewards/margins": 0.8537376523017883,
"rewards/rejected": -1.3353874683380127,
"step": 140
},
{
"epoch": 0.9419152276295133,
"grad_norm": 11.331362077198552,
"learning_rate": 2.922527618666465e-09,
"logits/chosen": 0.5811373591423035,
"logits/rejected": 0.9567831158638,
"logps/chosen": -304.1393737792969,
"logps/pi_response": -123.76485443115234,
"logps/ref_response": -114.90129089355469,
"logps/rejected": -522.5949096679688,
"loss": 0.5227,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.5134440064430237,
"rewards/margins": 0.7166833281517029,
"rewards/rejected": -1.2301273345947266,
"step": 150
},
{
"epoch": 0.9984301412872841,
"step": 159,
"total_flos": 0.0,
"train_loss": 0.5592624436384477,
"train_runtime": 4324.4895,
"train_samples_per_second": 4.712,
"train_steps_per_second": 0.037
}
],
"logging_steps": 10,
"max_steps": 159,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}