|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.504, |
|
"eval_steps": 500, |
|
"global_step": 470, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0001, |
|
"loss": 4.2845, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 0.0001, |
|
"loss": 2.611, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 0.0001, |
|
"loss": 2.1007, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.0001, |
|
"loss": 2.0667, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.6745, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4179, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.256, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.1206, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8113, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5563, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.2945, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.1513, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.0038, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 0.11376953125, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9775, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.1376953125, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9107, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 0.13671875, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8357, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8438, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 0.1318359375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8182, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 0.1220703125, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6811, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5087, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9827, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9673, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9514, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 0.1630859375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8378, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.1220703125, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8721, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 0.1328125, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8317, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 0.1328125, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7948, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 0.1328125, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7682, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 0.107421875, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6472, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 0.0001, |
|
"loss": 0.463, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 0.16015625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8907, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 0.1337890625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8254, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8455, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 0.15625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8194, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8291, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7265, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7856, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7599, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 0.130859375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6127, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4152, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8772, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7661, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 0.173828125, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8362, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6781, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.1708984375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7479, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6598, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7109, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 0.1337890625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6603, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5983, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.1630859375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3945, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 0.173828125, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7734, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7553, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8062, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6815, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.1689453125, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7524, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 0.1572265625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6798, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7037, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6274, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"grad_norm": 0.1630859375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6103, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3983, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"grad_norm": 0.1611328125, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6683, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 0.1533203125, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6045, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.008, |
|
"grad_norm": 0.1630859375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5759, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.024, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5826, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6502, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.056, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6278, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.072, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6155, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.088, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6104, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.104, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5942, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6177, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.1360000000000001, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5307, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.152, |
|
"grad_norm": 0.1630859375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.443, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.168, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4582, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.184, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6175, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6191, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.216, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5887, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.232, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5517, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.248, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5712, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.264, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5526, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6027, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.296, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5325, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.312, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4752, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.328, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4214, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.3439999999999999, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6299, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.3599999999999999, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6215, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.376, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5869, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.392, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5448, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.408, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6038, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.424, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5647, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5564, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.456, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4994, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.472, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4244, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.488, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4652, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.504, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5929, |
|
"step": 470 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 470, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 90, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.7094162776644813e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|