{ "best_metric": 0.6441005802707931, "best_model_checkpoint": "/train/checkpoint-1344", "epoch": 447.87857142857143, "global_step": 1344, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.29, "learning_rate": 0.0001, "loss": 4.3257, "step": 1 }, { "epoch": 1.29, "learning_rate": 0.0001, "loss": 2.9741, "step": 4 }, { "epoch": 2.59, "learning_rate": 0.0001, "loss": 1.6232, "step": 8 }, { "epoch": 3.88, "learning_rate": 0.0001, "loss": 0.951, "step": 12 }, { "epoch": 5.29, "learning_rate": 0.0001, "loss": 0.7842, "step": 16 }, { "epoch": 6.59, "learning_rate": 0.0001, "loss": 0.5481, "step": 20 }, { "epoch": 7.88, "learning_rate": 0.0001, "loss": 0.4986, "step": 24 }, { "epoch": 9.29, "learning_rate": 0.0001, "loss": 0.4439, "step": 28 }, { "epoch": 10.59, "learning_rate": 0.0001, "loss": 0.3802, "step": 32 }, { "epoch": 11.88, "learning_rate": 0.0001, "loss": 0.3228, "step": 36 }, { "epoch": 13.29, "learning_rate": 0.0001, "loss": 0.3209, "step": 40 }, { "epoch": 14.59, "learning_rate": 0.0001, "loss": 0.2883, "step": 44 }, { "epoch": 15.88, "learning_rate": 0.0001, "loss": 0.2424, "step": 48 }, { "epoch": 17.29, "learning_rate": 0.0001, "loss": 0.2485, "step": 52 }, { "epoch": 18.59, "learning_rate": 0.0001, "loss": 0.2291, "step": 56 }, { "epoch": 19.88, "learning_rate": 0.0001, "loss": 0.2096, "step": 60 }, { "epoch": 21.29, "learning_rate": 0.0001, "loss": 0.2204, "step": 64 }, { "epoch": 21.29, "eval_exact_match": 0.4090909090909091, "eval_exec": 0.44777562862669246, "eval_loss": 0.21543213725090027, "eval_runtime": 371.8333, "eval_samples_per_second": 2.781, "step": 64 }, { "epoch": 22.59, "learning_rate": 0.0001, "loss": 0.1867, "step": 68 }, { "epoch": 23.88, "learning_rate": 0.0001, "loss": 0.1716, "step": 72 }, { "epoch": 25.29, "learning_rate": 0.0001, "loss": 0.1708, "step": 76 }, { "epoch": 26.59, "learning_rate": 0.0001, "loss": 0.1475, "step": 80 }, { "epoch": 27.88, "learning_rate": 0.0001, "loss": 0.1406, "step": 84 }, { "epoch": 29.29, "learning_rate": 0.0001, "loss": 0.1382, "step": 88 }, { "epoch": 30.59, "learning_rate": 0.0001, "loss": 0.1211, "step": 92 }, { "epoch": 31.88, "learning_rate": 0.0001, "loss": 0.1082, "step": 96 }, { "epoch": 33.29, "learning_rate": 0.0001, "loss": 0.1139, "step": 100 }, { "epoch": 34.59, "learning_rate": 0.0001, "loss": 0.1048, "step": 104 }, { "epoch": 35.88, "learning_rate": 0.0001, "loss": 0.097, "step": 108 }, { "epoch": 37.29, "learning_rate": 0.0001, "loss": 0.1101, "step": 112 }, { "epoch": 38.59, "learning_rate": 0.0001, "loss": 0.0946, "step": 116 }, { "epoch": 39.88, "learning_rate": 0.0001, "loss": 0.0852, "step": 120 }, { "epoch": 41.29, "learning_rate": 0.0001, "loss": 0.0923, "step": 124 }, { "epoch": 42.59, "learning_rate": 0.0001, "loss": 0.0736, "step": 128 }, { "epoch": 42.59, "eval_exact_match": 0.5386847195357833, "eval_exec": 0.5725338491295938, "eval_loss": 0.1859811693429947, "eval_runtime": 369.8509, "eval_samples_per_second": 2.796, "step": 128 }, { "epoch": 43.88, "learning_rate": 0.0001, "loss": 0.0688, "step": 132 }, { "epoch": 45.29, "learning_rate": 0.0001, "loss": 0.0807, "step": 136 }, { "epoch": 46.59, "learning_rate": 0.0001, "loss": 0.0676, "step": 140 }, { "epoch": 47.88, "learning_rate": 0.0001, "loss": 0.0673, "step": 144 }, { "epoch": 49.29, "learning_rate": 0.0001, "loss": 0.0663, "step": 148 }, { "epoch": 50.59, "learning_rate": 0.0001, "loss": 0.0575, "step": 152 }, { "epoch": 51.88, "learning_rate": 0.0001, "loss": 0.0599, "step": 156 }, { "epoch": 53.29, "learning_rate": 0.0001, "loss": 0.059, "step": 160 }, { "epoch": 54.59, "learning_rate": 0.0001, "loss": 0.0509, "step": 164 }, { "epoch": 55.88, "learning_rate": 0.0001, "loss": 0.0485, "step": 168 }, { "epoch": 57.29, "learning_rate": 0.0001, "loss": 0.0536, "step": 172 }, { "epoch": 58.59, "learning_rate": 0.0001, "loss": 0.0504, "step": 176 }, { "epoch": 59.88, "learning_rate": 0.0001, "loss": 0.0445, "step": 180 }, { "epoch": 61.29, "learning_rate": 0.0001, "loss": 0.0426, "step": 184 }, { "epoch": 62.59, "learning_rate": 0.0001, "loss": 0.0495, "step": 188 }, { "epoch": 63.88, "learning_rate": 0.0001, "loss": 0.0442, "step": 192 }, { "epoch": 63.88, "eval_exact_match": 0.574468085106383, "eval_exec": 0.597678916827853, "eval_loss": 0.20976035296916962, "eval_runtime": 436.2256, "eval_samples_per_second": 2.37, "step": 192 }, { "epoch": 65.29, "learning_rate": 0.0001, "loss": 0.0421, "step": 196 }, { "epoch": 66.59, "learning_rate": 0.0001, "loss": 0.0411, "step": 200 }, { "epoch": 67.88, "learning_rate": 0.0001, "loss": 0.036, "step": 204 }, { "epoch": 69.29, "learning_rate": 0.0001, "loss": 0.0346, "step": 208 }, { "epoch": 70.59, "learning_rate": 0.0001, "loss": 0.0294, "step": 212 }, { "epoch": 71.88, "learning_rate": 0.0001, "loss": 0.0323, "step": 216 }, { "epoch": 73.29, "learning_rate": 0.0001, "loss": 0.0367, "step": 220 }, { "epoch": 74.59, "learning_rate": 0.0001, "loss": 0.0277, "step": 224 }, { "epoch": 75.88, "learning_rate": 0.0001, "loss": 0.0281, "step": 228 }, { "epoch": 77.29, "learning_rate": 0.0001, "loss": 0.026, "step": 232 }, { "epoch": 78.59, "learning_rate": 0.0001, "loss": 0.0242, "step": 236 }, { "epoch": 79.88, "learning_rate": 0.0001, "loss": 0.0245, "step": 240 }, { "epoch": 81.29, "learning_rate": 0.0001, "loss": 0.0264, "step": 244 }, { "epoch": 82.59, "learning_rate": 0.0001, "loss": 0.0264, "step": 248 }, { "epoch": 83.88, "learning_rate": 0.0001, "loss": 0.0248, "step": 252 }, { "epoch": 85.29, "learning_rate": 0.0001, "loss": 0.0254, "step": 256 }, { "epoch": 85.29, "eval_exact_match": 0.59284332688588, "eval_exec": 0.6150870406189555, "eval_loss": 0.24511948227882385, "eval_runtime": 397.4948, "eval_samples_per_second": 2.601, "step": 256 }, { "epoch": 86.59, "learning_rate": 0.0001, "loss": 0.0192, "step": 260 }, { "epoch": 87.88, "learning_rate": 0.0001, "loss": 0.0181, "step": 264 }, { "epoch": 89.29, "learning_rate": 0.0001, "loss": 0.0195, "step": 268 }, { "epoch": 90.59, "learning_rate": 0.0001, "loss": 0.0184, "step": 272 }, { "epoch": 91.88, "learning_rate": 0.0001, "loss": 0.0162, "step": 276 }, { "epoch": 93.29, "learning_rate": 0.0001, "loss": 0.0167, "step": 280 }, { "epoch": 94.59, "learning_rate": 0.0001, "loss": 0.0163, "step": 284 }, { "epoch": 95.88, "learning_rate": 0.0001, "loss": 0.0164, "step": 288 }, { "epoch": 97.29, "learning_rate": 0.0001, "loss": 0.016, "step": 292 }, { "epoch": 98.59, "learning_rate": 0.0001, "loss": 0.0136, "step": 296 }, { "epoch": 99.88, "learning_rate": 0.0001, "loss": 0.0182, "step": 300 }, { "epoch": 101.29, "learning_rate": 0.0001, "loss": 0.0205, "step": 304 }, { "epoch": 102.59, "learning_rate": 0.0001, "loss": 0.0136, "step": 308 }, { "epoch": 103.88, "learning_rate": 0.0001, "loss": 0.0136, "step": 312 }, { "epoch": 105.29, "learning_rate": 0.0001, "loss": 0.0117, "step": 316 }, { "epoch": 106.59, "learning_rate": 0.0001, "loss": 0.0103, "step": 320 }, { "epoch": 106.59, "eval_exact_match": 0.5967117988394585, "eval_exec": 0.6179883945841392, "eval_loss": 0.29194486141204834, "eval_runtime": 428.7156, "eval_samples_per_second": 2.412, "step": 320 }, { "epoch": 107.88, "learning_rate": 0.0001, "loss": 0.0112, "step": 324 }, { "epoch": 109.29, "learning_rate": 0.0001, "loss": 0.016, "step": 328 }, { "epoch": 110.59, "learning_rate": 0.0001, "loss": 0.0092, "step": 332 }, { "epoch": 111.88, "learning_rate": 0.0001, "loss": 0.0095, "step": 336 }, { "epoch": 113.29, "learning_rate": 0.0001, "loss": 0.01, "step": 340 }, { "epoch": 114.59, "learning_rate": 0.0001, "loss": 0.0095, "step": 344 }, { "epoch": 115.88, "learning_rate": 0.0001, "loss": 0.0094, "step": 348 }, { "epoch": 117.29, "learning_rate": 0.0001, "loss": 0.0091, "step": 352 }, { "epoch": 118.59, "learning_rate": 0.0001, "loss": 0.0079, "step": 356 }, { "epoch": 119.88, "learning_rate": 0.0001, "loss": 0.0081, "step": 360 }, { "epoch": 121.29, "learning_rate": 0.0001, "loss": 0.0089, "step": 364 }, { "epoch": 122.59, "learning_rate": 0.0001, "loss": 0.0071, "step": 368 }, { "epoch": 123.88, "learning_rate": 0.0001, "loss": 0.0073, "step": 372 }, { "epoch": 125.29, "learning_rate": 0.0001, "loss": 0.0079, "step": 376 }, { "epoch": 126.59, "learning_rate": 0.0001, "loss": 0.0088, "step": 380 }, { "epoch": 127.88, "learning_rate": 0.0001, "loss": 0.0073, "step": 384 }, { "epoch": 127.88, "eval_exact_match": 0.5957446808510638, "eval_exec": 0.6170212765957447, "eval_loss": 0.30737635493278503, "eval_runtime": 453.1803, "eval_samples_per_second": 2.282, "step": 384 }, { "epoch": 129.29, "learning_rate": 0.0001, "loss": 0.007, "step": 388 }, { "epoch": 130.59, "learning_rate": 0.0001, "loss": 0.006, "step": 392 }, { "epoch": 131.88, "learning_rate": 0.0001, "loss": 0.0065, "step": 396 }, { "epoch": 133.29, "learning_rate": 0.0001, "loss": 0.006, "step": 400 }, { "epoch": 134.59, "learning_rate": 0.0001, "loss": 0.0054, "step": 404 }, { "epoch": 135.88, "learning_rate": 0.0001, "loss": 0.0059, "step": 408 }, { "epoch": 137.29, "learning_rate": 0.0001, "loss": 0.017, "step": 412 }, { "epoch": 138.59, "learning_rate": 0.0001, "loss": 0.0096, "step": 416 }, { "epoch": 139.88, "learning_rate": 0.0001, "loss": 0.006, "step": 420 }, { "epoch": 141.29, "learning_rate": 0.0001, "loss": 0.0061, "step": 424 }, { "epoch": 142.59, "learning_rate": 0.0001, "loss": 0.0048, "step": 428 }, { "epoch": 143.88, "learning_rate": 0.0001, "loss": 0.005, "step": 432 }, { "epoch": 145.29, "learning_rate": 0.0001, "loss": 0.0133, "step": 436 }, { "epoch": 146.59, "learning_rate": 0.0001, "loss": 0.005, "step": 440 }, { "epoch": 147.88, "learning_rate": 0.0001, "loss": 0.0043, "step": 444 }, { "epoch": 149.29, "learning_rate": 0.0001, "loss": 0.004, "step": 448 }, { "epoch": 149.29, "eval_exact_match": 0.6160541586073501, "eval_exec": 0.6286266924564797, "eval_loss": 0.3122102916240692, "eval_runtime": 446.9159, "eval_samples_per_second": 2.314, "step": 448 }, { "epoch": 150.59, "learning_rate": 0.0001, "loss": 0.004, "step": 452 }, { "epoch": 151.88, "learning_rate": 0.0001, "loss": 0.0038, "step": 456 }, { "epoch": 153.29, "learning_rate": 0.0001, "loss": 0.0043, "step": 460 }, { "epoch": 154.59, "learning_rate": 0.0001, "loss": 0.0046, "step": 464 }, { "epoch": 155.88, "learning_rate": 0.0001, "loss": 0.0043, "step": 468 }, { "epoch": 157.29, "learning_rate": 0.0001, "loss": 0.0043, "step": 472 }, { "epoch": 158.59, "learning_rate": 0.0001, "loss": 0.003, "step": 476 }, { "epoch": 159.88, "learning_rate": 0.0001, "loss": 0.0033, "step": 480 }, { "epoch": 161.29, "learning_rate": 0.0001, "loss": 0.0038, "step": 484 }, { "epoch": 162.59, "learning_rate": 0.0001, "loss": 0.0032, "step": 488 }, { "epoch": 163.88, "learning_rate": 0.0001, "loss": 0.0033, "step": 492 }, { "epoch": 165.29, "learning_rate": 0.0001, "loss": 0.0033, "step": 496 }, { "epoch": 166.59, "learning_rate": 0.0001, "loss": 0.003, "step": 500 }, { "epoch": 167.88, "learning_rate": 0.0001, "loss": 0.0029, "step": 504 }, { "epoch": 169.29, "learning_rate": 0.0001, "loss": 0.003, "step": 508 }, { "epoch": 170.59, "learning_rate": 0.0001, "loss": 0.0026, "step": 512 }, { "epoch": 170.59, "eval_exact_match": 0.6092843326885881, "eval_exec": 0.6286266924564797, "eval_loss": 0.329732745885849, "eval_runtime": 461.0101, "eval_samples_per_second": 2.243, "step": 512 }, { "epoch": 171.88, "learning_rate": 0.0001, "loss": 0.0029, "step": 516 }, { "epoch": 173.29, "learning_rate": 0.0001, "loss": 0.003, "step": 520 }, { "epoch": 174.59, "learning_rate": 0.0001, "loss": 0.0025, "step": 524 }, { "epoch": 175.88, "learning_rate": 0.0001, "loss": 0.0031, "step": 528 }, { "epoch": 177.29, "learning_rate": 0.0001, "loss": 0.0031, "step": 532 }, { "epoch": 178.59, "learning_rate": 0.0001, "loss": 0.0025, "step": 536 }, { "epoch": 179.88, "learning_rate": 0.0001, "loss": 0.0026, "step": 540 }, { "epoch": 181.29, "learning_rate": 0.0001, "loss": 0.0027, "step": 544 }, { "epoch": 182.59, "learning_rate": 0.0001, "loss": 0.0028, "step": 548 }, { "epoch": 183.88, "learning_rate": 0.0001, "loss": 0.0023, "step": 552 }, { "epoch": 185.29, "learning_rate": 0.0001, "loss": 0.0025, "step": 556 }, { "epoch": 186.59, "learning_rate": 0.0001, "loss": 0.0057, "step": 560 }, { "epoch": 187.88, "learning_rate": 0.0001, "loss": 0.013, "step": 564 }, { "epoch": 189.29, "learning_rate": 0.0001, "loss": 0.0033, "step": 568 }, { "epoch": 190.59, "learning_rate": 0.0001, "loss": 0.0042, "step": 572 }, { "epoch": 191.88, "learning_rate": 0.0001, "loss": 0.0326, "step": 576 }, { "epoch": 191.88, "eval_exact_match": 0.6199226305609284, "eval_exec": 0.6441005802707931, "eval_loss": 0.28043264150619507, "eval_runtime": 434.1831, "eval_samples_per_second": 2.381, "step": 576 }, { "epoch": 193.29, "learning_rate": 0.0001, "loss": 0.0029, "step": 580 }, { "epoch": 194.59, "learning_rate": 0.0001, "loss": 0.0021, "step": 584 }, { "epoch": 195.88, "learning_rate": 0.0001, "loss": 0.002, "step": 588 }, { "epoch": 197.29, "learning_rate": 0.0001, "loss": 0.0021, "step": 592 }, { "epoch": 198.59, "learning_rate": 0.0001, "loss": 0.002, "step": 596 }, { "epoch": 199.88, "learning_rate": 0.0001, "loss": 0.0017, "step": 600 }, { "epoch": 201.29, "learning_rate": 0.0001, "loss": 0.002, "step": 604 }, { "epoch": 202.59, "learning_rate": 0.0001, "loss": 0.0016, "step": 608 }, { "epoch": 203.88, "learning_rate": 0.0001, "loss": 0.0018, "step": 612 }, { "epoch": 205.29, "learning_rate": 0.0001, "loss": 0.0021, "step": 616 }, { "epoch": 206.59, "learning_rate": 0.0001, "loss": 0.002, "step": 620 }, { "epoch": 207.88, "learning_rate": 0.0001, "loss": 0.0018, "step": 624 }, { "epoch": 209.29, "learning_rate": 0.0001, "loss": 0.0026, "step": 628 }, { "epoch": 210.59, "learning_rate": 0.0001, "loss": 0.0015, "step": 632 }, { "epoch": 211.88, "learning_rate": 0.0001, "loss": 0.0018, "step": 636 }, { "epoch": 213.29, "learning_rate": 0.0001, "loss": 0.0054, "step": 640 }, { "epoch": 213.29, "eval_exact_match": 0.5938104448742747, "eval_exec": 0.6160541586073501, "eval_loss": 0.34003064036369324, "eval_runtime": 454.3384, "eval_samples_per_second": 2.276, "step": 640 }, { "epoch": 214.59, "learning_rate": 0.0001, "loss": 0.0044, "step": 644 }, { "epoch": 215.88, "learning_rate": 0.0001, "loss": 0.0018, "step": 648 }, { "epoch": 217.29, "learning_rate": 0.0001, "loss": 0.0018, "step": 652 }, { "epoch": 218.59, "learning_rate": 0.0001, "loss": 0.0015, "step": 656 }, { "epoch": 219.88, "learning_rate": 0.0001, "loss": 0.0017, "step": 660 }, { "epoch": 221.29, "learning_rate": 0.0001, "loss": 0.0017, "step": 664 }, { "epoch": 222.59, "learning_rate": 0.0001, "loss": 0.0015, "step": 668 }, { "epoch": 223.88, "learning_rate": 0.0001, "loss": 0.0015, "step": 672 }, { "epoch": 225.29, "learning_rate": 0.0001, "loss": 0.0015, "step": 676 }, { "epoch": 226.59, "learning_rate": 0.0001, "loss": 0.0014, "step": 680 }, { "epoch": 227.88, "learning_rate": 0.0001, "loss": 0.0013, "step": 684 }, { "epoch": 229.29, "learning_rate": 0.0001, "loss": 0.0015, "step": 688 }, { "epoch": 230.59, "learning_rate": 0.0001, "loss": 0.0013, "step": 692 }, { "epoch": 231.88, "learning_rate": 0.0001, "loss": 0.0012, "step": 696 }, { "epoch": 233.29, "learning_rate": 0.0001, "loss": 0.0014, "step": 700 }, { "epoch": 234.59, "learning_rate": 0.0001, "loss": 0.0038, "step": 704 }, { "epoch": 234.59, "eval_exact_match": 0.6305609284332688, "eval_exec": 0.648936170212766, "eval_loss": 0.3312581479549408, "eval_runtime": 454.3187, "eval_samples_per_second": 2.276, "step": 704 }, { "epoch": 235.88, "learning_rate": 0.0001, "loss": 0.0014, "step": 708 }, { "epoch": 237.29, "learning_rate": 0.0001, "loss": 0.0013, "step": 712 }, { "epoch": 238.59, "learning_rate": 0.0001, "loss": 0.0011, "step": 716 }, { "epoch": 239.88, "learning_rate": 0.0001, "loss": 0.001, "step": 720 }, { "epoch": 241.29, "learning_rate": 0.0001, "loss": 0.0014, "step": 724 }, { "epoch": 242.59, "learning_rate": 0.0001, "loss": 0.0013, "step": 728 }, { "epoch": 243.88, "learning_rate": 0.0001, "loss": 0.0012, "step": 732 }, { "epoch": 245.29, "learning_rate": 0.0001, "loss": 0.0012, "step": 736 }, { "epoch": 246.59, "learning_rate": 0.0001, "loss": 0.001, "step": 740 }, { "epoch": 247.88, "learning_rate": 0.0001, "loss": 0.0008, "step": 744 }, { "epoch": 249.29, "learning_rate": 0.0001, "loss": 0.001, "step": 748 }, { "epoch": 250.59, "learning_rate": 0.0001, "loss": 0.001, "step": 752 }, { "epoch": 251.88, "learning_rate": 0.0001, "loss": 0.0011, "step": 756 }, { "epoch": 253.29, "learning_rate": 0.0001, "loss": 0.0012, "step": 760 }, { "epoch": 254.59, "learning_rate": 0.0001, "loss": 0.001, "step": 764 }, { "epoch": 255.88, "learning_rate": 0.0001, "loss": 0.0012, "step": 768 }, { "epoch": 255.88, "eval_exact_match": 0.6141199226305609, "eval_exec": 0.6373307543520309, "eval_loss": 0.3713914155960083, "eval_runtime": 454.4742, "eval_samples_per_second": 2.275, "step": 768 }, { "epoch": 257.29, "learning_rate": 0.0001, "loss": 0.0014, "step": 772 }, { "epoch": 258.59, "learning_rate": 0.0001, "loss": 0.0012, "step": 776 }, { "epoch": 259.88, "learning_rate": 0.0001, "loss": 0.0011, "step": 780 }, { "epoch": 261.29, "learning_rate": 0.0001, "loss": 0.001, "step": 784 }, { "epoch": 262.59, "learning_rate": 0.0001, "loss": 0.001, "step": 788 }, { "epoch": 263.88, "learning_rate": 0.0001, "loss": 0.001, "step": 792 }, { "epoch": 265.29, "learning_rate": 0.0001, "loss": 0.0011, "step": 796 }, { "epoch": 266.59, "learning_rate": 0.0001, "loss": 0.0013, "step": 800 }, { "epoch": 267.88, "learning_rate": 0.0001, "loss": 0.001, "step": 804 }, { "epoch": 269.29, "learning_rate": 0.0001, "loss": 0.001, "step": 808 }, { "epoch": 270.59, "learning_rate": 0.0001, "loss": 0.0009, "step": 812 }, { "epoch": 271.88, "learning_rate": 0.0001, "loss": 0.0104, "step": 816 }, { "epoch": 273.29, "learning_rate": 0.0001, "loss": 0.0057, "step": 820 }, { "epoch": 274.59, "learning_rate": 0.0001, "loss": 0.0013, "step": 824 }, { "epoch": 275.88, "learning_rate": 0.0001, "loss": 0.0011, "step": 828 }, { "epoch": 277.29, "learning_rate": 0.0001, "loss": 0.0011, "step": 832 }, { "epoch": 277.29, "eval_exact_match": 0.6228239845261122, "eval_exec": 0.6450676982591876, "eval_loss": 0.3546225130558014, "eval_runtime": 443.6147, "eval_samples_per_second": 2.331, "step": 832 }, { "epoch": 278.59, "learning_rate": 0.0001, "loss": 0.0012, "step": 836 }, { "epoch": 279.88, "learning_rate": 0.0001, "loss": 0.0025, "step": 840 }, { "epoch": 281.29, "learning_rate": 0.0001, "loss": 0.0031, "step": 844 }, { "epoch": 282.59, "learning_rate": 0.0001, "loss": 0.001, "step": 848 }, { "epoch": 283.88, "learning_rate": 0.0001, "loss": 0.0008, "step": 852 }, { "epoch": 285.29, "learning_rate": 0.0001, "loss": 0.0008, "step": 856 }, { "epoch": 286.59, "learning_rate": 0.0001, "loss": 0.0009, "step": 860 }, { "epoch": 287.88, "learning_rate": 0.0001, "loss": 0.0029, "step": 864 }, { "epoch": 289.29, "learning_rate": 0.0001, "loss": 0.0039, "step": 868 }, { "epoch": 290.59, "learning_rate": 0.0001, "loss": 0.0011, "step": 872 }, { "epoch": 291.88, "learning_rate": 0.0001, "loss": 0.001, "step": 876 }, { "epoch": 293.29, "learning_rate": 0.0001, "loss": 0.0011, "step": 880 }, { "epoch": 294.59, "learning_rate": 0.0001, "loss": 0.0011, "step": 884 }, { "epoch": 295.88, "learning_rate": 0.0001, "loss": 0.0009, "step": 888 }, { "epoch": 297.29, "learning_rate": 0.0001, "loss": 0.0009, "step": 892 }, { "epoch": 298.59, "learning_rate": 0.0001, "loss": 0.001, "step": 896 }, { "epoch": 298.59, "eval_exact_match": 0.620889748549323, "eval_exec": 0.6450676982591876, "eval_loss": 0.3728490173816681, "eval_runtime": 458.092, "eval_samples_per_second": 2.257, "step": 896 }, { "epoch": 299.88, "learning_rate": 0.0001, "loss": 0.0007, "step": 900 }, { "epoch": 301.29, "learning_rate": 0.0001, "loss": 0.0007, "step": 904 }, { "epoch": 302.59, "learning_rate": 0.0001, "loss": 0.0008, "step": 908 }, { "epoch": 303.88, "learning_rate": 0.0001, "loss": 0.0008, "step": 912 }, { "epoch": 305.29, "learning_rate": 0.0001, "loss": 0.001, "step": 916 }, { "epoch": 306.59, "learning_rate": 0.0001, "loss": 0.0008, "step": 920 }, { "epoch": 307.88, "learning_rate": 0.0001, "loss": 0.0008, "step": 924 }, { "epoch": 309.29, "learning_rate": 0.0001, "loss": 0.0009, "step": 928 }, { "epoch": 310.59, "learning_rate": 0.0001, "loss": 0.0007, "step": 932 }, { "epoch": 311.88, "learning_rate": 0.0001, "loss": 0.0008, "step": 936 }, { "epoch": 313.29, "learning_rate": 0.0001, "loss": 0.0009, "step": 940 }, { "epoch": 314.59, "learning_rate": 0.0001, "loss": 0.0009, "step": 944 }, { "epoch": 315.88, "learning_rate": 0.0001, "loss": 0.0008, "step": 948 }, { "epoch": 317.29, "learning_rate": 0.0001, "loss": 0.0008, "step": 952 }, { "epoch": 318.59, "learning_rate": 0.0001, "loss": 0.0008, "step": 956 }, { "epoch": 319.88, "learning_rate": 0.0001, "loss": 0.0007, "step": 960 }, { "epoch": 319.88, "eval_exact_match": 0.6286266924564797, "eval_exec": 0.648936170212766, "eval_loss": 0.3788833022117615, "eval_runtime": 456.1366, "eval_samples_per_second": 2.267, "step": 960 }, { "epoch": 321.29, "learning_rate": 0.0001, "loss": 0.0008, "step": 964 }, { "epoch": 322.59, "learning_rate": 0.0001, "loss": 0.0008, "step": 968 }, { "epoch": 323.88, "learning_rate": 0.0001, "loss": 0.0006, "step": 972 }, { "epoch": 325.29, "learning_rate": 0.0001, "loss": 0.001, "step": 976 }, { "epoch": 326.59, "learning_rate": 0.0001, "loss": 0.0008, "step": 980 }, { "epoch": 327.88, "learning_rate": 0.0001, "loss": 0.0009, "step": 984 }, { "epoch": 329.29, "learning_rate": 0.0001, "loss": 0.0008, "step": 988 }, { "epoch": 330.59, "learning_rate": 0.0001, "loss": 0.0008, "step": 992 }, { "epoch": 331.88, "learning_rate": 0.0001, "loss": 0.0006, "step": 996 }, { "epoch": 333.29, "learning_rate": 0.0001, "loss": 0.0009, "step": 1000 }, { "epoch": 334.59, "learning_rate": 0.0001, "loss": 0.0069, "step": 1004 }, { "epoch": 335.88, "learning_rate": 0.0001, "loss": 0.0039, "step": 1008 }, { "epoch": 337.29, "learning_rate": 0.0001, "loss": 0.0009, "step": 1012 }, { "epoch": 338.59, "learning_rate": 0.0001, "loss": 0.0006, "step": 1016 }, { "epoch": 339.88, "learning_rate": 0.0001, "loss": 0.0008, "step": 1020 }, { "epoch": 341.29, "learning_rate": 0.0001, "loss": 0.0009, "step": 1024 }, { "epoch": 341.29, "eval_exact_match": 0.6141199226305609, "eval_exec": 0.6402321083172147, "eval_loss": 0.37917226552963257, "eval_runtime": 457.05, "eval_samples_per_second": 2.262, "step": 1024 }, { "epoch": 342.59, "learning_rate": 0.0001, "loss": 0.0008, "step": 1028 }, { "epoch": 343.88, "learning_rate": 0.0001, "loss": 0.0006, "step": 1032 }, { "epoch": 345.29, "learning_rate": 0.0001, "loss": 0.0008, "step": 1036 }, { "epoch": 346.59, "learning_rate": 0.0001, "loss": 0.0008, "step": 1040 }, { "epoch": 347.88, "learning_rate": 0.0001, "loss": 0.0008, "step": 1044 }, { "epoch": 349.29, "learning_rate": 0.0001, "loss": 0.0007, "step": 1048 }, { "epoch": 350.59, "learning_rate": 0.0001, "loss": 0.0006, "step": 1052 }, { "epoch": 351.88, "learning_rate": 0.0001, "loss": 0.0008, "step": 1056 }, { "epoch": 353.29, "learning_rate": 0.0001, "loss": 0.0007, "step": 1060 }, { "epoch": 354.59, "learning_rate": 0.0001, "loss": 0.0006, "step": 1064 }, { "epoch": 355.88, "learning_rate": 0.0001, "loss": 0.0005, "step": 1068 }, { "epoch": 357.29, "learning_rate": 0.0001, "loss": 0.0006, "step": 1072 }, { "epoch": 358.59, "learning_rate": 0.0001, "loss": 0.0006, "step": 1076 }, { "epoch": 359.88, "learning_rate": 0.0001, "loss": 0.0011, "step": 1080 }, { "epoch": 361.29, "learning_rate": 0.0001, "loss": 0.0007, "step": 1084 }, { "epoch": 362.59, "learning_rate": 0.0001, "loss": 0.0011, "step": 1088 }, { "epoch": 362.59, "eval_exact_match": 0.6199226305609284, "eval_exec": 0.648936170212766, "eval_loss": 0.38900309801101685, "eval_runtime": 443.2936, "eval_samples_per_second": 2.333, "step": 1088 }, { "epoch": 363.88, "learning_rate": 0.0001, "loss": 0.0015, "step": 1092 }, { "epoch": 365.29, "learning_rate": 0.0001, "loss": 0.0007, "step": 1096 }, { "epoch": 366.59, "learning_rate": 0.0001, "loss": 0.0008, "step": 1100 }, { "epoch": 367.88, "learning_rate": 0.0001, "loss": 0.0009, "step": 1104 }, { "epoch": 369.29, "learning_rate": 0.0001, "loss": 0.0009, "step": 1108 }, { "epoch": 370.59, "learning_rate": 0.0001, "loss": 0.0006, "step": 1112 }, { "epoch": 371.88, "learning_rate": 0.0001, "loss": 0.0007, "step": 1116 }, { "epoch": 373.29, "learning_rate": 0.0001, "loss": 0.0007, "step": 1120 }, { "epoch": 374.59, "learning_rate": 0.0001, "loss": 0.0006, "step": 1124 }, { "epoch": 375.88, "learning_rate": 0.0001, "loss": 0.0008, "step": 1128 }, { "epoch": 377.29, "learning_rate": 0.0001, "loss": 0.0007, "step": 1132 }, { "epoch": 378.59, "learning_rate": 0.0001, "loss": 0.0006, "step": 1136 }, { "epoch": 379.88, "learning_rate": 0.0001, "loss": 0.0005, "step": 1140 }, { "epoch": 381.29, "learning_rate": 0.0001, "loss": 0.0006, "step": 1144 }, { "epoch": 382.59, "learning_rate": 0.0001, "loss": 0.0016, "step": 1148 }, { "epoch": 383.88, "learning_rate": 0.0001, "loss": 0.004, "step": 1152 }, { "epoch": 383.88, "eval_exact_match": 0.6295938104448743, "eval_exec": 0.6528046421663443, "eval_loss": 0.35109272599220276, "eval_runtime": 428.9118, "eval_samples_per_second": 2.411, "step": 1152 }, { "epoch": 385.29, "learning_rate": 0.0001, "loss": 0.001, "step": 1156 }, { "epoch": 386.59, "learning_rate": 0.0001, "loss": 0.0005, "step": 1160 }, { "epoch": 387.88, "learning_rate": 0.0001, "loss": 0.0005, "step": 1164 }, { "epoch": 389.29, "learning_rate": 0.0001, "loss": 0.0005, "step": 1168 }, { "epoch": 390.59, "learning_rate": 0.0001, "loss": 0.0005, "step": 1172 }, { "epoch": 391.88, "learning_rate": 0.0001, "loss": 0.0006, "step": 1176 }, { "epoch": 393.29, "learning_rate": 0.0001, "loss": 0.0006, "step": 1180 }, { "epoch": 394.59, "learning_rate": 0.0001, "loss": 0.0004, "step": 1184 }, { "epoch": 395.88, "learning_rate": 0.0001, "loss": 0.0005, "step": 1188 }, { "epoch": 397.29, "learning_rate": 0.0001, "loss": 0.0006, "step": 1192 }, { "epoch": 398.59, "learning_rate": 0.0001, "loss": 0.0006, "step": 1196 }, { "epoch": 399.88, "learning_rate": 0.0001, "loss": 0.0005, "step": 1200 }, { "epoch": 401.29, "learning_rate": 0.0001, "loss": 0.0007, "step": 1204 }, { "epoch": 402.59, "learning_rate": 0.0001, "loss": 0.0006, "step": 1208 }, { "epoch": 403.88, "learning_rate": 0.0001, "loss": 0.0006, "step": 1212 }, { "epoch": 405.29, "learning_rate": 0.0001, "loss": 0.0005, "step": 1216 }, { "epoch": 405.29, "eval_exact_match": 0.6353965183752418, "eval_exec": 0.660541586073501, "eval_loss": 0.4038596451282501, "eval_runtime": 442.9132, "eval_samples_per_second": 2.335, "step": 1216 }, { "epoch": 406.59, "learning_rate": 0.0001, "loss": 0.0008, "step": 1220 }, { "epoch": 407.88, "learning_rate": 0.0001, "loss": 0.0006, "step": 1224 }, { "epoch": 409.29, "learning_rate": 0.0001, "loss": 0.0005, "step": 1228 }, { "epoch": 410.59, "learning_rate": 0.0001, "loss": 0.0007, "step": 1232 }, { "epoch": 411.88, "learning_rate": 0.0001, "loss": 0.0006, "step": 1236 }, { "epoch": 413.29, "learning_rate": 0.0001, "loss": 0.0059, "step": 1240 }, { "epoch": 414.59, "learning_rate": 0.0001, "loss": 0.0007, "step": 1244 }, { "epoch": 415.88, "learning_rate": 0.0001, "loss": 0.0005, "step": 1248 }, { "epoch": 417.29, "learning_rate": 0.0001, "loss": 0.0005, "step": 1252 }, { "epoch": 418.59, "learning_rate": 0.0001, "loss": 0.0006, "step": 1256 }, { "epoch": 419.88, "learning_rate": 0.0001, "loss": 0.0005, "step": 1260 }, { "epoch": 421.29, "learning_rate": 0.0001, "loss": 0.0005, "step": 1264 }, { "epoch": 422.59, "learning_rate": 0.0001, "loss": 0.0004, "step": 1268 }, { "epoch": 423.88, "learning_rate": 0.0001, "loss": 0.0004, "step": 1272 }, { "epoch": 425.29, "learning_rate": 0.0001, "loss": 0.0005, "step": 1276 }, { "epoch": 426.59, "learning_rate": 0.0001, "loss": 0.0004, "step": 1280 }, { "epoch": 426.59, "eval_exact_match": 0.6295938104448743, "eval_exec": 0.6499032882011605, "eval_loss": 0.4094041585922241, "eval_runtime": 442.2153, "eval_samples_per_second": 2.338, "step": 1280 }, { "epoch": 427.88, "learning_rate": 0.0001, "loss": 0.001, "step": 1284 }, { "epoch": 429.29, "learning_rate": 0.0001, "loss": 0.0005, "step": 1288 }, { "epoch": 430.59, "learning_rate": 0.0001, "loss": 0.0004, "step": 1292 }, { "epoch": 431.88, "learning_rate": 0.0001, "loss": 0.0005, "step": 1296 }, { "epoch": 433.29, "learning_rate": 0.0001, "loss": 0.0006, "step": 1300 }, { "epoch": 434.59, "learning_rate": 0.0001, "loss": 0.0004, "step": 1304 }, { "epoch": 435.88, "learning_rate": 0.0001, "loss": 0.0004, "step": 1308 }, { "epoch": 437.29, "learning_rate": 0.0001, "loss": 0.0006, "step": 1312 }, { "epoch": 438.59, "learning_rate": 0.0001, "loss": 0.0005, "step": 1316 }, { "epoch": 439.88, "learning_rate": 0.0001, "loss": 0.0005, "step": 1320 }, { "epoch": 441.29, "learning_rate": 0.0001, "loss": 0.0005, "step": 1324 }, { "epoch": 442.59, "learning_rate": 0.0001, "loss": 0.0006, "step": 1328 }, { "epoch": 443.88, "learning_rate": 0.0001, "loss": 0.0006, "step": 1332 }, { "epoch": 445.29, "learning_rate": 0.0001, "loss": 0.0005, "step": 1336 }, { "epoch": 446.59, "learning_rate": 0.0001, "loss": 0.0006, "step": 1340 }, { "epoch": 447.88, "learning_rate": 0.0001, "loss": 0.0008, "step": 1344 }, { "epoch": 447.88, "eval_exact_match": 0.6441005802707931, "eval_exec": 0.6624758220502901, "eval_loss": 0.3987429440021515, "eval_runtime": 448.6509, "eval_samples_per_second": 2.305, "step": 1344 } ], "max_steps": 9216, "num_train_epochs": 3072, "total_flos": 7.71745071522816e+18, "trial_name": null, "trial_params": null }