{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3968, "eval_steps": 31, "global_step": 124, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032, "grad_norm": 309.04591823661724, "learning_rate": 5.3191489361702125e-09, "logits/generated": -3.1874351501464844, "logits/real": -2.811344623565674, "logps/generated": -277.39678955078125, "logps/real": -164.29153442382812, "loss": 0.8248, "rewards/accuracies": 0.0, "rewards/generated": 0.0, "rewards/margins": 0.0, "rewards/real": 0.0, "step": 1 }, { "epoch": 0.032, "grad_norm": 273.888694984639, "learning_rate": 5.3191489361702123e-08, "logits/generated": -2.979994058609009, "logits/real": -2.536571979522705, "logps/generated": -242.26495361328125, "logps/real": -126.36863708496094, "loss": 0.7579, "rewards/accuracies": 0.8055555820465088, "rewards/generated": -0.08589766174554825, "rewards/margins": 0.1557125300168991, "rewards/real": 0.06981485337018967, "step": 10 }, { "epoch": 0.064, "grad_norm": 6.278552838520857, "learning_rate": 1.0638297872340425e-07, "logits/generated": -3.1302971839904785, "logits/real": -2.4443600177764893, "logps/generated": -277.353759765625, "logps/real": -123.6572494506836, "loss": 0.2741, "rewards/accuracies": 1.0, "rewards/generated": -1.92562997341156, "rewards/margins": 3.013349771499634, "rewards/real": 1.0877193212509155, "step": 20 }, { "epoch": 0.096, "grad_norm": 2.184986357146384, "learning_rate": 1.5957446808510638e-07, "logits/generated": -2.7463412284851074, "logits/real": -2.1607251167297363, "logps/generated": -311.76275634765625, "logps/real": -105.89111328125, "loss": 0.1066, "rewards/accuracies": 1.0, "rewards/generated": -5.328858375549316, "rewards/margins": 8.681567192077637, "rewards/real": 3.3527092933654785, "step": 30 }, { "epoch": 0.0992, "eval_logits/generated": -2.7707955837249756, "eval_logits/real": -2.1341326236724854, "eval_logps/generated": -309.7686767578125, "eval_logps/real": -99.30474090576172, "eval_loss": 0.10212492197751999, "eval_rewards/accuracies": 1.0, "eval_rewards/generated": -5.6528496742248535, "eval_rewards/margins": 9.39120101928711, "eval_rewards/real": 3.7383503913879395, "eval_runtime": 52.8853, "eval_samples_per_second": 3.782, "eval_steps_per_second": 0.246, "step": 31 }, { "epoch": 0.128, "grad_norm": 1.4683533798363122, "learning_rate": 2.127659574468085e-07, "logits/generated": -2.6473488807678223, "logits/real": -2.092941999435425, "logps/generated": -328.85003662109375, "logps/real": -108.34922790527344, "loss": 0.108, "rewards/accuracies": 1.0, "rewards/generated": -7.4022650718688965, "rewards/margins": 11.075445175170898, "rewards/real": 3.6731820106506348, "step": 40 }, { "epoch": 0.16, "grad_norm": 1.3833819249775736, "learning_rate": 2.659574468085106e-07, "logits/generated": -2.673710346221924, "logits/real": -1.7863633632659912, "logps/generated": -341.03179931640625, "logps/real": -99.21113586425781, "loss": 0.0938, "rewards/accuracies": 1.0, "rewards/generated": -8.64827823638916, "rewards/margins": 12.339118003845215, "rewards/real": 3.6908397674560547, "step": 50 }, { "epoch": 0.192, "grad_norm": 1.4745541409589713, "learning_rate": 3.1914893617021275e-07, "logits/generated": -2.457066297531128, "logits/real": -1.9866771697998047, "logps/generated": -337.7644958496094, "logps/real": -100.87881469726562, "loss": 0.0953, "rewards/accuracies": 1.0, "rewards/generated": -9.72453784942627, "rewards/margins": 13.710101127624512, "rewards/real": 3.9855639934539795, "step": 60 }, { "epoch": 0.1984, "eval_logits/generated": -2.7848963737487793, "eval_logits/real": -2.406802177429199, "eval_logps/generated": -329.84674072265625, "eval_logps/real": -96.66284942626953, "eval_loss": 0.09936456382274628, "eval_rewards/accuracies": 1.0, "eval_rewards/generated": -7.660656452178955, "eval_rewards/margins": 11.66319751739502, "eval_rewards/real": 4.002540111541748, "eval_runtime": 57.5754, "eval_samples_per_second": 3.474, "eval_steps_per_second": 0.226, "step": 62 }, { "epoch": 0.224, "grad_norm": 1.323646793674921, "learning_rate": 3.7234042553191484e-07, "logits/generated": -2.6429388523101807, "logits/real": -2.2720017433166504, "logps/generated": -340.06341552734375, "logps/real": -93.11213684082031, "loss": 0.099, "rewards/accuracies": 1.0, "rewards/generated": -8.733041763305664, "rewards/margins": 12.913464546203613, "rewards/real": 4.180423259735107, "step": 70 }, { "epoch": 0.256, "grad_norm": 1.5638794899127106, "learning_rate": 4.25531914893617e-07, "logits/generated": -2.6645286083221436, "logits/real": -2.423600435256958, "logps/generated": -337.8456726074219, "logps/real": -98.58467102050781, "loss": 0.0932, "rewards/accuracies": 1.0, "rewards/generated": -9.53776741027832, "rewards/margins": 13.911605834960938, "rewards/real": 4.373837947845459, "step": 80 }, { "epoch": 0.288, "grad_norm": 1.3991564683587894, "learning_rate": 4.787234042553192e-07, "logits/generated": -2.6595396995544434, "logits/real": -2.3267276287078857, "logps/generated": -356.4102478027344, "logps/real": -105.5425033569336, "loss": 0.0963, "rewards/accuracies": 1.0, "rewards/generated": -9.849004745483398, "rewards/margins": 14.890844345092773, "rewards/real": 5.041840076446533, "step": 90 }, { "epoch": 0.2976, "eval_logits/generated": -2.7958528995513916, "eval_logits/real": -2.4889986515045166, "eval_logps/generated": -360.5955505371094, "eval_logps/real": -97.25410461425781, "eval_loss": 0.09956898540258408, "eval_rewards/accuracies": 1.0, "eval_rewards/generated": -10.735539436340332, "eval_rewards/margins": 14.678956985473633, "eval_rewards/real": 3.943415403366089, "eval_runtime": 58.0625, "eval_samples_per_second": 3.445, "eval_steps_per_second": 0.224, "step": 93 }, { "epoch": 0.32, "grad_norm": 1.2445906541027463, "learning_rate": 4.96437054631829e-07, "logits/generated": -2.7101943492889404, "logits/real": -2.502084493637085, "logps/generated": -350.31158447265625, "logps/real": -102.1003646850586, "loss": 0.0909, "rewards/accuracies": 1.0, "rewards/generated": -10.010136604309082, "rewards/margins": 15.269411087036133, "rewards/real": 5.259275913238525, "step": 100 }, { "epoch": 0.352, "grad_norm": 1.0986971210025431, "learning_rate": 4.904988123515439e-07, "logits/generated": -2.725268602371216, "logits/real": -2.57702374458313, "logps/generated": -361.0770263671875, "logps/real": -92.5576171875, "loss": 0.0871, "rewards/accuracies": 1.0, "rewards/generated": -11.147048950195312, "rewards/margins": 16.710220336914062, "rewards/real": 5.563170433044434, "step": 110 }, { "epoch": 0.384, "grad_norm": 1.335382395511731, "learning_rate": 4.845605700712589e-07, "logits/generated": -2.6858296394348145, "logits/real": -2.493039608001709, "logps/generated": -359.83551025390625, "logps/real": -86.32958984375, "loss": 0.0904, "rewards/accuracies": 1.0, "rewards/generated": -11.481039047241211, "rewards/margins": 17.33412742614746, "rewards/real": 5.853088855743408, "step": 120 }, { "epoch": 0.3968, "eval_logits/generated": -2.7832465171813965, "eval_logits/real": -2.51879620552063, "eval_logps/generated": -361.52301025390625, "eval_logps/real": -98.11322784423828, "eval_loss": 0.10012635588645935, "eval_rewards/accuracies": 1.0, "eval_rewards/generated": -10.828282356262207, "eval_rewards/margins": 14.685786247253418, "eval_rewards/real": 3.85750150680542, "eval_runtime": 58.1864, "eval_samples_per_second": 3.437, "eval_steps_per_second": 0.223, "step": 124 } ], "logging_steps": 10, "max_steps": 936, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 31, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }