{ "best_metric": 0.9142091152815014, "best_model_checkpoint": "pokemon_models\\checkpoint-1610", "epoch": 23.0, "eval_steps": 500, "global_step": 1610, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.29, "learning_rate": 5e-06, "loss": 5.0145, "step": 20 }, { "epoch": 0.57, "learning_rate": 1e-05, "loss": 5.0039, "step": 40 }, { "epoch": 0.86, "learning_rate": 1.5e-05, "loss": 4.9942, "step": 60 }, { "epoch": 1.0, "eval_accuracy": 0.01876675603217158, "eval_loss": 4.973499298095703, "eval_runtime": 102.0829, "eval_samples_per_second": 10.962, "eval_steps_per_second": 0.686, "step": 70 }, { "epoch": 1.14, "learning_rate": 2e-05, "loss": 4.97, "step": 80 }, { "epoch": 1.43, "learning_rate": 2.5e-05, "loss": 4.9313, "step": 100 }, { "epoch": 1.71, "learning_rate": 3e-05, "loss": 4.893, "step": 120 }, { "epoch": 2.0, "learning_rate": 3.5e-05, "loss": 4.8374, "step": 140 }, { "epoch": 2.0, "eval_accuracy": 0.20196604110813227, "eval_loss": 4.816006660461426, "eval_runtime": 124.2897, "eval_samples_per_second": 9.003, "eval_steps_per_second": 0.563, "step": 140 }, { "epoch": 2.29, "learning_rate": 4e-05, "loss": 4.7329, "step": 160 }, { "epoch": 2.57, "learning_rate": 4.5e-05, "loss": 4.6472, "step": 180 }, { "epoch": 2.86, "learning_rate": 5e-05, "loss": 4.541, "step": 200 }, { "epoch": 3.0, "eval_accuracy": 0.5495978552278821, "eval_loss": 4.4448018074035645, "eval_runtime": 101.0357, "eval_samples_per_second": 11.075, "eval_steps_per_second": 0.693, "step": 210 }, { "epoch": 3.14, "learning_rate": 4.9444444444444446e-05, "loss": 4.4117, "step": 220 }, { "epoch": 3.43, "learning_rate": 4.888888888888889e-05, "loss": 4.2454, "step": 240 }, { "epoch": 3.71, "learning_rate": 4.8333333333333334e-05, "loss": 4.1227, "step": 260 }, { "epoch": 4.0, "learning_rate": 4.7777777777777784e-05, "loss": 4.0198, "step": 280 }, { "epoch": 4.0, "eval_accuracy": 0.7042001787310098, "eval_loss": 4.0061211585998535, "eval_runtime": 100.1956, "eval_samples_per_second": 11.168, "eval_steps_per_second": 0.699, "step": 280 }, { "epoch": 4.29, "learning_rate": 4.722222222222222e-05, "loss": 3.84, "step": 300 }, { "epoch": 4.57, "learning_rate": 4.666666666666667e-05, "loss": 3.757, "step": 320 }, { "epoch": 4.86, "learning_rate": 4.6111111111111115e-05, "loss": 3.6626, "step": 340 }, { "epoch": 5.0, "eval_accuracy": 0.7605004468275246, "eval_loss": 3.630556106567383, "eval_runtime": 100.0509, "eval_samples_per_second": 11.184, "eval_steps_per_second": 0.7, "step": 350 }, { "epoch": 5.14, "learning_rate": 4.555555555555556e-05, "loss": 3.5477, "step": 360 }, { "epoch": 5.43, "learning_rate": 4.5e-05, "loss": 3.3914, "step": 380 }, { "epoch": 5.71, "learning_rate": 4.4444444444444447e-05, "loss": 3.3164, "step": 400 }, { "epoch": 6.0, "learning_rate": 4.388888888888889e-05, "loss": 3.2654, "step": 420 }, { "epoch": 6.0, "eval_accuracy": 0.7971403038427167, "eval_loss": 3.3061511516571045, "eval_runtime": 99.8013, "eval_samples_per_second": 11.212, "eval_steps_per_second": 0.701, "step": 420 }, { "epoch": 6.29, "learning_rate": 4.3333333333333334e-05, "loss": 3.1041, "step": 440 }, { "epoch": 6.57, "learning_rate": 4.277777777777778e-05, "loss": 3.0193, "step": 460 }, { "epoch": 6.86, "learning_rate": 4.222222222222222e-05, "loss": 2.9314, "step": 480 }, { "epoch": 7.0, "eval_accuracy": 0.8310991957104558, "eval_loss": 2.994609832763672, "eval_runtime": 106.5638, "eval_samples_per_second": 10.501, "eval_steps_per_second": 0.657, "step": 490 }, { "epoch": 7.14, "learning_rate": 4.166666666666667e-05, "loss": 2.871, "step": 500 }, { "epoch": 7.43, "learning_rate": 4.111111111111111e-05, "loss": 2.7418, "step": 520 }, { "epoch": 7.71, "learning_rate": 4.055555555555556e-05, "loss": 2.6542, "step": 540 }, { "epoch": 8.0, "learning_rate": 4e-05, "loss": 2.5893, "step": 560 }, { "epoch": 8.0, "eval_accuracy": 0.8507596067917784, "eval_loss": 2.7318336963653564, "eval_runtime": 125.4233, "eval_samples_per_second": 8.922, "eval_steps_per_second": 0.558, "step": 560 }, { "epoch": 8.29, "learning_rate": 3.944444444444445e-05, "loss": 2.5106, "step": 580 }, { "epoch": 8.57, "learning_rate": 3.888888888888889e-05, "loss": 2.4358, "step": 600 }, { "epoch": 8.86, "learning_rate": 3.8333333333333334e-05, "loss": 2.3645, "step": 620 }, { "epoch": 9.0, "eval_accuracy": 0.8579088471849866, "eval_loss": 2.4826338291168213, "eval_runtime": 121.4568, "eval_samples_per_second": 9.213, "eval_steps_per_second": 0.576, "step": 630 }, { "epoch": 9.14, "learning_rate": 3.777777777777778e-05, "loss": 2.2831, "step": 640 }, { "epoch": 9.43, "learning_rate": 3.722222222222222e-05, "loss": 2.2297, "step": 660 }, { "epoch": 9.71, "learning_rate": 3.6666666666666666e-05, "loss": 2.1367, "step": 680 }, { "epoch": 10.0, "learning_rate": 3.611111111111111e-05, "loss": 2.0793, "step": 700 }, { "epoch": 10.0, "eval_accuracy": 0.871313672922252, "eval_loss": 2.245124578475952, "eval_runtime": 122.6079, "eval_samples_per_second": 9.127, "eval_steps_per_second": 0.571, "step": 700 }, { "epoch": 10.29, "learning_rate": 3.555555555555556e-05, "loss": 1.9796, "step": 720 }, { "epoch": 10.57, "learning_rate": 3.5e-05, "loss": 1.9471, "step": 740 }, { "epoch": 10.86, "learning_rate": 3.444444444444445e-05, "loss": 1.8754, "step": 760 }, { "epoch": 11.0, "eval_accuracy": 0.871313672922252, "eval_loss": 2.060222625732422, "eval_runtime": 122.2722, "eval_samples_per_second": 9.152, "eval_steps_per_second": 0.572, "step": 770 }, { "epoch": 11.14, "learning_rate": 3.388888888888889e-05, "loss": 1.8259, "step": 780 }, { "epoch": 11.43, "learning_rate": 3.3333333333333335e-05, "loss": 1.7872, "step": 800 }, { "epoch": 11.71, "learning_rate": 3.277777777777778e-05, "loss": 1.6884, "step": 820 }, { "epoch": 12.0, "learning_rate": 3.222222222222223e-05, "loss": 1.6703, "step": 840 }, { "epoch": 12.0, "eval_accuracy": 0.8811438784629133, "eval_loss": 1.872039556503296, "eval_runtime": 98.0421, "eval_samples_per_second": 11.413, "eval_steps_per_second": 0.714, "step": 840 }, { "epoch": 12.29, "learning_rate": 3.1666666666666666e-05, "loss": 1.6003, "step": 860 }, { "epoch": 12.57, "learning_rate": 3.111111111111111e-05, "loss": 1.5433, "step": 880 }, { "epoch": 12.86, "learning_rate": 3.055555555555556e-05, "loss": 1.5198, "step": 900 }, { "epoch": 13.0, "eval_accuracy": 0.8900804289544236, "eval_loss": 1.7361352443695068, "eval_runtime": 97.0673, "eval_samples_per_second": 11.528, "eval_steps_per_second": 0.721, "step": 910 }, { "epoch": 13.14, "learning_rate": 3e-05, "loss": 1.4742, "step": 920 }, { "epoch": 13.43, "learning_rate": 2.9444444444444448e-05, "loss": 1.3876, "step": 940 }, { "epoch": 13.71, "learning_rate": 2.8888888888888888e-05, "loss": 1.3603, "step": 960 }, { "epoch": 14.0, "learning_rate": 2.8333333333333335e-05, "loss": 1.329, "step": 980 }, { "epoch": 14.0, "eval_accuracy": 0.900804289544236, "eval_loss": 1.563855528831482, "eval_runtime": 97.4399, "eval_samples_per_second": 11.484, "eval_steps_per_second": 0.718, "step": 980 }, { "epoch": 14.29, "learning_rate": 2.777777777777778e-05, "loss": 1.2523, "step": 1000 }, { "epoch": 14.57, "learning_rate": 2.7222222222222223e-05, "loss": 1.2747, "step": 1020 }, { "epoch": 14.86, "learning_rate": 2.6666666666666667e-05, "loss": 1.203, "step": 1040 }, { "epoch": 15.0, "eval_accuracy": 0.8927613941018767, "eval_loss": 1.4685680866241455, "eval_runtime": 96.9819, "eval_samples_per_second": 11.538, "eval_steps_per_second": 0.722, "step": 1050 }, { "epoch": 15.14, "learning_rate": 2.6111111111111114e-05, "loss": 1.1697, "step": 1060 }, { "epoch": 15.43, "learning_rate": 2.5555555555555554e-05, "loss": 1.0943, "step": 1080 }, { "epoch": 15.71, "learning_rate": 2.5e-05, "loss": 1.0947, "step": 1100 }, { "epoch": 16.0, "learning_rate": 2.4444444444444445e-05, "loss": 1.104, "step": 1120 }, { "epoch": 16.0, "eval_accuracy": 0.8981233243967829, "eval_loss": 1.3596620559692383, "eval_runtime": 97.1177, "eval_samples_per_second": 11.522, "eval_steps_per_second": 0.721, "step": 1120 }, { "epoch": 16.29, "learning_rate": 2.3888888888888892e-05, "loss": 1.0113, "step": 1140 }, { "epoch": 16.57, "learning_rate": 2.3333333333333336e-05, "loss": 1.0285, "step": 1160 }, { "epoch": 16.86, "learning_rate": 2.277777777777778e-05, "loss": 0.9682, "step": 1180 }, { "epoch": 17.0, "eval_accuracy": 0.8990169794459338, "eval_loss": 1.2199994325637817, "eval_runtime": 486.7671, "eval_samples_per_second": 2.299, "eval_steps_per_second": 0.144, "step": 1190 }, { "epoch": 17.14, "learning_rate": 2.2222222222222223e-05, "loss": 0.9578, "step": 1200 }, { "epoch": 17.43, "learning_rate": 2.1666666666666667e-05, "loss": 0.9403, "step": 1220 }, { "epoch": 17.71, "learning_rate": 2.111111111111111e-05, "loss": 0.8924, "step": 1240 }, { "epoch": 18.0, "learning_rate": 2.0555555555555555e-05, "loss": 0.872, "step": 1260 }, { "epoch": 18.0, "eval_accuracy": 0.903485254691689, "eval_loss": 1.1389293670654297, "eval_runtime": 110.8112, "eval_samples_per_second": 10.098, "eval_steps_per_second": 0.632, "step": 1260 }, { "epoch": 18.29, "learning_rate": 2e-05, "loss": 0.8312, "step": 1280 }, { "epoch": 18.57, "learning_rate": 1.9444444444444445e-05, "loss": 0.8201, "step": 1300 }, { "epoch": 18.86, "learning_rate": 1.888888888888889e-05, "loss": 0.844, "step": 1320 }, { "epoch": 19.0, "eval_accuracy": 0.9124218051831993, "eval_loss": 1.0643764734268188, "eval_runtime": 109.2391, "eval_samples_per_second": 10.244, "eval_steps_per_second": 0.641, "step": 1330 }, { "epoch": 19.14, "learning_rate": 1.8333333333333333e-05, "loss": 0.8116, "step": 1340 }, { "epoch": 19.43, "learning_rate": 1.777777777777778e-05, "loss": 0.7649, "step": 1360 }, { "epoch": 19.71, "learning_rate": 1.7222222222222224e-05, "loss": 0.7402, "step": 1380 }, { "epoch": 20.0, "learning_rate": 1.6666666666666667e-05, "loss": 0.7605, "step": 1400 }, { "epoch": 20.0, "eval_accuracy": 0.9088471849865952, "eval_loss": 1.0364218950271606, "eval_runtime": 108.8495, "eval_samples_per_second": 10.28, "eval_steps_per_second": 0.643, "step": 1400 }, { "epoch": 20.29, "learning_rate": 1.6111111111111115e-05, "loss": 0.7156, "step": 1420 }, { "epoch": 20.57, "learning_rate": 1.5555555555555555e-05, "loss": 0.7109, "step": 1440 }, { "epoch": 20.86, "learning_rate": 1.5e-05, "loss": 0.7244, "step": 1460 }, { "epoch": 21.0, "eval_accuracy": 0.902591599642538, "eval_loss": 0.9655722379684448, "eval_runtime": 106.989, "eval_samples_per_second": 10.459, "eval_steps_per_second": 0.654, "step": 1470 }, { "epoch": 21.14, "learning_rate": 1.4444444444444444e-05, "loss": 0.6925, "step": 1480 }, { "epoch": 21.43, "learning_rate": 1.388888888888889e-05, "loss": 0.6687, "step": 1500 }, { "epoch": 21.71, "learning_rate": 1.3333333333333333e-05, "loss": 0.658, "step": 1520 }, { "epoch": 22.0, "learning_rate": 1.2777777777777777e-05, "loss": 0.6595, "step": 1540 }, { "epoch": 22.0, "eval_accuracy": 0.9133154602323503, "eval_loss": 0.9125866889953613, "eval_runtime": 106.9609, "eval_samples_per_second": 10.462, "eval_steps_per_second": 0.654, "step": 1540 }, { "epoch": 22.29, "learning_rate": 1.2222222222222222e-05, "loss": 0.6489, "step": 1560 }, { "epoch": 22.57, "learning_rate": 1.1666666666666668e-05, "loss": 0.6666, "step": 1580 }, { "epoch": 22.86, "learning_rate": 1.1111111111111112e-05, "loss": 0.6188, "step": 1600 }, { "epoch": 23.0, "eval_accuracy": 0.9142091152815014, "eval_loss": 0.8716733455657959, "eval_runtime": 107.8489, "eval_samples_per_second": 10.376, "eval_steps_per_second": 0.649, "step": 1610 } ], "logging_steps": 20, "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 29, "save_steps": 500, "total_flos": 7.982873471516332e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }