diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,461 +1,4397 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 50.0, - "global_step": 3100, + "epoch": 500.0, + "global_step": 31000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, - "eval_loss": 39.687320709228516, - "eval_runtime": 2.8335, - "eval_samples_per_second": 43.056, - "eval_steps_per_second": 5.647, + "eval_loss": 40.06241989135742, + "eval_runtime": 1.547, + "eval_samples_per_second": 79.509, + "eval_steps_per_second": 10.343, "step": 62 }, { "epoch": 2.0, - "eval_loss": 39.516841888427734, - "eval_runtime": 2.9783, - "eval_samples_per_second": 40.963, - "eval_steps_per_second": 5.372, + "eval_loss": 39.472923278808594, + "eval_runtime": 1.5508, + "eval_samples_per_second": 79.312, + "eval_steps_per_second": 10.317, "step": 124 }, { "epoch": 3.0, - "eval_loss": 38.932273864746094, - "eval_runtime": 3.1472, - "eval_samples_per_second": 38.765, - "eval_steps_per_second": 5.084, + "eval_loss": 39.0081672668457, + "eval_runtime": 1.546, + "eval_samples_per_second": 79.558, + "eval_steps_per_second": 10.349, "step": 186 }, { "epoch": 4.0, - "eval_loss": 37.797115325927734, - "eval_runtime": 3.1415, - "eval_samples_per_second": 38.835, - "eval_steps_per_second": 5.093, + "eval_loss": 38.20319366455078, + "eval_runtime": 1.5507, + "eval_samples_per_second": 79.317, + "eval_steps_per_second": 10.318, "step": 248 }, { "epoch": 5.0, - "eval_loss": 36.89667510986328, - "eval_runtime": 3.1742, - "eval_samples_per_second": 38.435, - "eval_steps_per_second": 5.041, + "eval_loss": 37.33580017089844, + "eval_runtime": 1.5485, + "eval_samples_per_second": 79.431, + "eval_steps_per_second": 10.332, "step": 310 }, { "epoch": 6.0, - "eval_loss": 35.38439178466797, - "eval_runtime": 3.1525, - "eval_samples_per_second": 38.699, - "eval_steps_per_second": 5.075, + "eval_loss": 36.15742111206055, + "eval_runtime": 1.5522, + "eval_samples_per_second": 79.244, + "eval_steps_per_second": 10.308, "step": 372 }, { "epoch": 7.0, - "eval_loss": 34.643463134765625, - "eval_runtime": 3.1776, - "eval_samples_per_second": 38.394, - "eval_steps_per_second": 5.035, + "eval_loss": 35.14997863769531, + "eval_runtime": 1.5454, + "eval_samples_per_second": 79.589, + "eval_steps_per_second": 10.353, "step": 434 }, { "epoch": 8.0, - "eval_loss": 33.6444206237793, - "eval_runtime": 3.1827, - "eval_samples_per_second": 38.332, - "eval_steps_per_second": 5.027, + "eval_loss": 33.69432067871094, + "eval_runtime": 1.5477, + "eval_samples_per_second": 79.472, + "eval_steps_per_second": 10.338, "step": 496 }, { "epoch": 8.06, - "learning_rate": 5.870967741935484e-08, - "loss": 39.4735, + "learning_rate": 6.887096774193549e-08, + "loss": 39.2118, "step": 500 }, { "epoch": 9.0, - "eval_loss": 32.891536712646484, - "eval_runtime": 3.2424, - "eval_samples_per_second": 37.627, - "eval_steps_per_second": 4.935, + "eval_loss": 32.3968391418457, + "eval_runtime": 1.5552, + "eval_samples_per_second": 79.09, + "eval_steps_per_second": 10.288, "step": 558 }, { "epoch": 10.0, - "eval_loss": 31.138931274414062, - "eval_runtime": 3.1172, - "eval_samples_per_second": 39.138, - "eval_steps_per_second": 5.133, + "eval_loss": 30.730754852294922, + "eval_runtime": 1.5637, + "eval_samples_per_second": 78.658, + "eval_steps_per_second": 10.232, "step": 620 }, { "epoch": 11.0, - "eval_loss": 30.020915985107422, - "eval_runtime": 3.1861, - "eval_samples_per_second": 38.291, - "eval_steps_per_second": 5.022, + "eval_loss": 29.227901458740234, + "eval_runtime": 1.553, + "eval_samples_per_second": 79.199, + "eval_steps_per_second": 10.302, "step": 682 }, { "epoch": 12.0, - "eval_loss": 28.798078536987305, - "eval_runtime": 3.158, - "eval_samples_per_second": 38.632, - "eval_steps_per_second": 5.066, + "eval_loss": 27.82735824584961, + "eval_runtime": 1.5637, + "eval_samples_per_second": 78.659, + "eval_steps_per_second": 10.232, "step": 744 }, { "epoch": 13.0, - "eval_loss": 27.6728458404541, - "eval_runtime": 3.1725, - "eval_samples_per_second": 38.456, - "eval_steps_per_second": 5.043, + "eval_loss": 26.49405288696289, + "eval_runtime": 1.5536, + "eval_samples_per_second": 79.17, + "eval_steps_per_second": 10.298, "step": 806 }, { "epoch": 14.0, - "eval_loss": 26.61111068725586, - "eval_runtime": 3.1711, - "eval_samples_per_second": 38.472, - "eval_steps_per_second": 5.046, + "eval_loss": 24.659284591674805, + "eval_runtime": 1.5457, + "eval_samples_per_second": 79.576, + "eval_steps_per_second": 10.351, "step": 868 }, { "epoch": 15.0, - "eval_loss": 25.55615234375, - "eval_runtime": 3.1645, - "eval_samples_per_second": 38.553, - "eval_steps_per_second": 5.056, + "eval_loss": 23.44048309326172, + "eval_runtime": 1.5478, + "eval_samples_per_second": 79.467, + "eval_steps_per_second": 10.337, "step": 930 }, { "epoch": 16.0, - "eval_loss": 24.428285598754883, - "eval_runtime": 3.1894, - "eval_samples_per_second": 38.251, - "eval_steps_per_second": 5.017, + "eval_loss": 21.999380111694336, + "eval_runtime": 1.5447, + "eval_samples_per_second": 79.626, + "eval_steps_per_second": 10.358, "step": 992 }, { "epoch": 16.13, - "learning_rate": 4.741935483870968e-08, - "loss": 29.1909, + "learning_rate": 6.774193548387097e-08, + "loss": 27.5423, "step": 1000 }, { "epoch": 17.0, - "eval_loss": 23.227224349975586, - "eval_runtime": 3.3011, - "eval_samples_per_second": 36.957, - "eval_steps_per_second": 4.847, + "eval_loss": 21.027385711669922, + "eval_runtime": 1.5545, + "eval_samples_per_second": 79.127, + "eval_steps_per_second": 10.293, "step": 1054 }, { "epoch": 18.0, - "eval_loss": 22.856584548950195, - "eval_runtime": 3.1351, - "eval_samples_per_second": 38.914, - "eval_steps_per_second": 5.103, + "eval_loss": 19.89065170288086, + "eval_runtime": 1.565, + "eval_samples_per_second": 78.592, + "eval_steps_per_second": 10.223, "step": 1116 }, { "epoch": 19.0, - "eval_loss": 22.174100875854492, - "eval_runtime": 3.2113, - "eval_samples_per_second": 37.99, - "eval_steps_per_second": 4.982, + "eval_loss": 19.33472442626953, + "eval_runtime": 1.5542, + "eval_samples_per_second": 79.141, + "eval_steps_per_second": 10.295, "step": 1178 }, { "epoch": 20.0, - "eval_loss": 21.41921615600586, - "eval_runtime": 3.1698, - "eval_samples_per_second": 38.488, - "eval_steps_per_second": 5.048, + "eval_loss": 18.324054718017578, + "eval_runtime": 1.5611, + "eval_samples_per_second": 78.793, + "eval_steps_per_second": 10.249, "step": 1240 }, { "epoch": 21.0, - "eval_loss": 20.49114227294922, - "eval_runtime": 3.202, - "eval_samples_per_second": 38.101, - "eval_steps_per_second": 4.997, + "eval_loss": 17.521472930908203, + "eval_runtime": 1.5648, + "eval_samples_per_second": 78.603, + "eval_steps_per_second": 10.225, "step": 1302 }, { "epoch": 22.0, - "eval_loss": 20.245670318603516, - "eval_runtime": 3.1786, - "eval_samples_per_second": 38.382, - "eval_steps_per_second": 5.034, + "eval_loss": 16.96886444091797, + "eval_runtime": 1.5541, + "eval_samples_per_second": 79.144, + "eval_steps_per_second": 10.295, "step": 1364 }, { "epoch": 23.0, - "eval_loss": 19.58565902709961, - "eval_runtime": 3.205, - "eval_samples_per_second": 38.065, - "eval_steps_per_second": 4.992, + "eval_loss": 16.298294067382812, + "eval_runtime": 1.5454, + "eval_samples_per_second": 79.59, + "eval_steps_per_second": 10.353, "step": 1426 }, { "epoch": 24.0, - "eval_loss": 19.287837982177734, - "eval_runtime": 3.1902, - "eval_samples_per_second": 38.242, - "eval_steps_per_second": 5.015, + "eval_loss": 15.870747566223145, + "eval_runtime": 1.5451, + "eval_samples_per_second": 79.609, + "eval_steps_per_second": 10.356, "step": 1488 }, { "epoch": 24.19, - "learning_rate": 3.612903225806452e-08, - "loss": 21.7917, + "learning_rate": 6.661290322580646e-08, + "loss": 18.4542, "step": 1500 }, { "epoch": 25.0, - "eval_loss": 18.806514739990234, - "eval_runtime": 3.3778, - "eval_samples_per_second": 36.118, - "eval_steps_per_second": 4.737, + "eval_loss": 15.465510368347168, + "eval_runtime": 1.5532, + "eval_samples_per_second": 79.191, + "eval_steps_per_second": 10.301, "step": 1550 }, { "epoch": 26.0, - "eval_loss": 18.336509704589844, - "eval_runtime": 3.1436, - "eval_samples_per_second": 38.81, - "eval_steps_per_second": 5.09, + "eval_loss": 15.184890747070312, + "eval_runtime": 1.5516, + "eval_samples_per_second": 79.271, + "eval_steps_per_second": 10.312, "step": 1612 }, { "epoch": 27.0, - "eval_loss": 18.16900062561035, - "eval_runtime": 3.2259, - "eval_samples_per_second": 37.819, - "eval_steps_per_second": 4.96, + "eval_loss": 14.780122756958008, + "eval_runtime": 1.5538, + "eval_samples_per_second": 79.159, + "eval_steps_per_second": 10.297, "step": 1674 }, { "epoch": 28.0, - "eval_loss": 17.55986213684082, - "eval_runtime": 3.1926, - "eval_samples_per_second": 38.213, - "eval_steps_per_second": 5.012, + "eval_loss": 14.557552337646484, + "eval_runtime": 1.5541, + "eval_samples_per_second": 79.147, + "eval_steps_per_second": 10.296, "step": 1736 }, { "epoch": 29.0, - "eval_loss": 17.429519653320312, - "eval_runtime": 3.19, - "eval_samples_per_second": 38.245, - "eval_steps_per_second": 5.016, + "eval_loss": 14.421510696411133, + "eval_runtime": 1.5516, + "eval_samples_per_second": 79.275, + "eval_steps_per_second": 10.312, "step": 1798 }, { "epoch": 30.0, - "eval_loss": 17.191831588745117, - "eval_runtime": 3.2186, - "eval_samples_per_second": 37.905, - "eval_steps_per_second": 4.971, + "eval_loss": 13.931785583496094, + "eval_runtime": 1.5492, + "eval_samples_per_second": 79.395, + "eval_steps_per_second": 10.328, "step": 1860 }, { "epoch": 31.0, - "eval_loss": 16.991268157958984, - "eval_runtime": 3.2004, - "eval_samples_per_second": 38.12, - "eval_steps_per_second": 4.999, + "eval_loss": 13.980083465576172, + "eval_runtime": 1.5454, + "eval_samples_per_second": 79.593, + "eval_steps_per_second": 10.354, "step": 1922 }, { "epoch": 32.0, - "eval_loss": 16.737815856933594, - "eval_runtime": 3.2002, - "eval_samples_per_second": 38.122, - "eval_steps_per_second": 5.0, + "eval_loss": 13.709578514099121, + "eval_runtime": 1.5464, + "eval_samples_per_second": 79.537, + "eval_steps_per_second": 10.346, "step": 1984 }, { "epoch": 32.26, - "learning_rate": 2.483870967741936e-08, - "loss": 18.0017, + "learning_rate": 6.548387096774194e-08, + "loss": 13.9556, "step": 2000 }, { "epoch": 33.0, - "eval_loss": 16.585847854614258, - "eval_runtime": 3.3866, - "eval_samples_per_second": 36.024, - "eval_steps_per_second": 4.725, + "eval_loss": 13.463862419128418, + "eval_runtime": 1.5524, + "eval_samples_per_second": 79.233, + "eval_steps_per_second": 10.307, "step": 2046 }, { "epoch": 34.0, - "eval_loss": 16.41578483581543, - "eval_runtime": 3.1469, - "eval_samples_per_second": 38.769, - "eval_steps_per_second": 5.084, + "eval_loss": 13.298738479614258, + "eval_runtime": 1.5554, + "eval_samples_per_second": 79.08, + "eval_steps_per_second": 10.287, "step": 2108 }, { "epoch": 35.0, - "eval_loss": 16.26023292541504, - "eval_runtime": 3.2254, - "eval_samples_per_second": 37.825, - "eval_steps_per_second": 4.961, + "eval_loss": 13.23029613494873, + "eval_runtime": 1.5584, + "eval_samples_per_second": 78.926, + "eval_steps_per_second": 10.267, "step": 2170 }, { "epoch": 36.0, - "eval_loss": 16.033435821533203, - "eval_runtime": 3.1846, - "eval_samples_per_second": 38.309, - "eval_steps_per_second": 5.024, + "eval_loss": 13.24497127532959, + "eval_runtime": 1.5624, + "eval_samples_per_second": 78.724, + "eval_steps_per_second": 10.24, "step": 2232 }, { "epoch": 37.0, - "eval_loss": 16.080913543701172, - "eval_runtime": 3.2091, - "eval_samples_per_second": 38.017, - "eval_steps_per_second": 4.986, + "eval_loss": 13.057371139526367, + "eval_runtime": 1.5518, + "eval_samples_per_second": 79.261, + "eval_steps_per_second": 10.31, "step": 2294 }, { "epoch": 38.0, - "eval_loss": 15.7175931930542, - "eval_runtime": 3.1978, - "eval_samples_per_second": 38.151, - "eval_steps_per_second": 5.003, + "eval_loss": 12.902210235595703, + "eval_runtime": 1.5456, + "eval_samples_per_second": 79.583, + "eval_steps_per_second": 10.352, "step": 2356 }, { "epoch": 39.0, - "eval_loss": 15.824816703796387, - "eval_runtime": 3.2153, - "eval_samples_per_second": 37.944, - "eval_steps_per_second": 4.976, + "eval_loss": 12.819963455200195, + "eval_runtime": 1.5458, + "eval_samples_per_second": 79.571, + "eval_steps_per_second": 10.351, "step": 2418 }, { "epoch": 40.0, - "eval_loss": 15.852823257446289, - "eval_runtime": 3.1937, - "eval_samples_per_second": 38.2, - "eval_steps_per_second": 5.01, + "eval_loss": 12.729530334472656, + "eval_runtime": 1.5456, + "eval_samples_per_second": 79.582, + "eval_steps_per_second": 10.352, "step": 2480 }, { "epoch": 40.32, - "learning_rate": 1.3548387096774193e-08, - "loss": 16.124, + "learning_rate": 6.435483870967743e-08, + "loss": 12.111, "step": 2500 }, { "epoch": 41.0, - "eval_loss": 15.77718734741211, - "eval_runtime": 3.377, - "eval_samples_per_second": 36.127, - "eval_steps_per_second": 4.738, + "eval_loss": 12.840241432189941, + "eval_runtime": 1.5515, + "eval_samples_per_second": 79.277, + "eval_steps_per_second": 10.312, "step": 2542 }, { "epoch": 42.0, - "eval_loss": 15.797329902648926, - "eval_runtime": 3.1524, - "eval_samples_per_second": 38.701, - "eval_steps_per_second": 5.075, + "eval_loss": 12.690611839294434, + "eval_runtime": 1.5538, + "eval_samples_per_second": 79.163, + "eval_steps_per_second": 10.298, "step": 2604 }, { "epoch": 43.0, - "eval_loss": 15.45718002319336, - "eval_runtime": 3.2372, - "eval_samples_per_second": 37.686, - "eval_steps_per_second": 4.942, + "eval_loss": 12.54220199584961, + "eval_runtime": 1.554, + "eval_samples_per_second": 79.15, + "eval_steps_per_second": 10.296, "step": 2666 }, { "epoch": 44.0, - "eval_loss": 15.622541427612305, - "eval_runtime": 3.2026, - "eval_samples_per_second": 38.094, - "eval_steps_per_second": 4.996, + "eval_loss": 12.503222465515137, + "eval_runtime": 1.5553, + "eval_samples_per_second": 79.086, + "eval_steps_per_second": 10.288, "step": 2728 }, { "epoch": 45.0, - "eval_loss": 15.430057525634766, - "eval_runtime": 3.1974, - "eval_samples_per_second": 38.156, - "eval_steps_per_second": 5.004, + "eval_loss": 12.296753883361816, + "eval_runtime": 1.5516, + "eval_samples_per_second": 79.275, + "eval_steps_per_second": 10.312, "step": 2790 }, { "epoch": 46.0, - "eval_loss": 15.735359191894531, - "eval_runtime": 3.2169, - "eval_samples_per_second": 37.925, - "eval_steps_per_second": 4.974, + "eval_loss": 12.332414627075195, + "eval_runtime": 1.5484, + "eval_samples_per_second": 79.436, + "eval_steps_per_second": 10.333, "step": 2852 }, { "epoch": 47.0, - "eval_loss": 15.529631614685059, - "eval_runtime": 3.2177, - "eval_samples_per_second": 37.916, - "eval_steps_per_second": 4.973, + "eval_loss": 12.254257202148438, + "eval_runtime": 1.551, + "eval_samples_per_second": 79.301, + "eval_steps_per_second": 10.316, "step": 2914 }, { "epoch": 48.0, - "eval_loss": 15.421342849731445, - "eval_runtime": 3.225, - "eval_samples_per_second": 37.829, - "eval_steps_per_second": 4.961, + "eval_loss": 12.24339485168457, + "eval_runtime": 1.5518, + "eval_samples_per_second": 79.26, + "eval_steps_per_second": 10.31, "step": 2976 }, { "epoch": 48.39, - "learning_rate": 2.2580645161290324e-09, - "loss": 15.3787, + "learning_rate": 6.322580645161291e-08, + "loss": 11.3304, "step": 3000 }, { "epoch": 49.0, - "eval_loss": 15.512802124023438, - "eval_runtime": 3.3809, - "eval_samples_per_second": 36.085, - "eval_steps_per_second": 4.732, + "eval_loss": 12.251891136169434, + "eval_runtime": 1.5529, + "eval_samples_per_second": 79.207, + "eval_steps_per_second": 10.303, "step": 3038 }, { "epoch": 50.0, - "eval_loss": 15.596056938171387, - "eval_runtime": 3.151, - "eval_samples_per_second": 38.718, - "eval_steps_per_second": 5.078, + "eval_loss": 12.112042427062988, + "eval_runtime": 1.56, + "eval_samples_per_second": 78.845, + "eval_steps_per_second": 10.256, "step": 3100 }, { - "epoch": 50.0, - "step": 3100, - "total_flos": 1620261687859200.0, - "train_loss": 23.064505024571574, - "train_runtime": 2425.9558, - "train_samples_per_second": 10.12, - "train_steps_per_second": 1.278 + "epoch": 51.0, + "eval_loss": 12.196945190429688, + "eval_runtime": 1.5565, + "eval_samples_per_second": 79.023, + "eval_steps_per_second": 10.279, + "step": 3162 + }, + { + "epoch": 52.0, + "eval_loss": 11.9977388381958, + "eval_runtime": 1.5836, + "eval_samples_per_second": 77.673, + "eval_steps_per_second": 10.104, + "step": 3224 + }, + { + "epoch": 53.0, + "eval_loss": 12.175936698913574, + "eval_runtime": 1.5513, + "eval_samples_per_second": 79.287, + "eval_steps_per_second": 10.314, + "step": 3286 + }, + { + "epoch": 54.0, + "eval_loss": 11.980809211730957, + "eval_runtime": 1.546, + "eval_samples_per_second": 79.559, + "eval_steps_per_second": 10.349, + "step": 3348 + }, + { + "epoch": 55.0, + "eval_loss": 12.149847030639648, + "eval_runtime": 1.5462, + "eval_samples_per_second": 79.548, + "eval_steps_per_second": 10.348, + "step": 3410 + }, + { + "epoch": 56.0, + "eval_loss": 11.985103607177734, + "eval_runtime": 1.5457, + "eval_samples_per_second": 79.577, + "eval_steps_per_second": 10.351, + "step": 3472 + }, + { + "epoch": 56.45, + "learning_rate": 6.20967741935484e-08, + "loss": 10.9471, + "step": 3500 + }, + { + "epoch": 57.0, + "eval_loss": 12.080038070678711, + "eval_runtime": 1.5551, + "eval_samples_per_second": 79.095, + "eval_steps_per_second": 10.289, + "step": 3534 + }, + { + "epoch": 58.0, + "eval_loss": 11.881911277770996, + "eval_runtime": 1.5543, + "eval_samples_per_second": 79.133, + "eval_steps_per_second": 10.294, + "step": 3596 + }, + { + "epoch": 59.0, + "eval_loss": 12.009904861450195, + "eval_runtime": 1.5505, + "eval_samples_per_second": 79.328, + "eval_steps_per_second": 10.319, + "step": 3658 + }, + { + "epoch": 60.0, + "eval_loss": 11.861567497253418, + "eval_runtime": 1.5654, + "eval_samples_per_second": 78.574, + "eval_steps_per_second": 10.221, + "step": 3720 + }, + { + "epoch": 61.0, + "eval_loss": 11.89500617980957, + "eval_runtime": 1.5515, + "eval_samples_per_second": 79.279, + "eval_steps_per_second": 10.313, + "step": 3782 + }, + { + "epoch": 62.0, + "eval_loss": 11.830110549926758, + "eval_runtime": 1.5521, + "eval_samples_per_second": 79.249, + "eval_steps_per_second": 10.309, + "step": 3844 + }, + { + "epoch": 63.0, + "eval_loss": 12.096732139587402, + "eval_runtime": 1.5464, + "eval_samples_per_second": 79.542, + "eval_steps_per_second": 10.347, + "step": 3906 + }, + { + "epoch": 64.0, + "eval_loss": 11.837874412536621, + "eval_runtime": 1.5533, + "eval_samples_per_second": 79.184, + "eval_steps_per_second": 10.3, + "step": 3968 + }, + { + "epoch": 64.52, + "learning_rate": 6.096774193548388e-08, + "loss": 10.6841, + "step": 4000 + }, + { + "epoch": 65.0, + "eval_loss": 11.909323692321777, + "eval_runtime": 1.5534, + "eval_samples_per_second": 79.181, + "eval_steps_per_second": 10.3, + "step": 4030 + }, + { + "epoch": 66.0, + "eval_loss": 11.91748332977295, + "eval_runtime": 1.5598, + "eval_samples_per_second": 78.854, + "eval_steps_per_second": 10.257, + "step": 4092 + }, + { + "epoch": 67.0, + "eval_loss": 11.73180103302002, + "eval_runtime": 1.5498, + "eval_samples_per_second": 79.365, + "eval_steps_per_second": 10.324, + "step": 4154 + }, + { + "epoch": 68.0, + "eval_loss": 11.830022811889648, + "eval_runtime": 1.5461, + "eval_samples_per_second": 79.557, + "eval_steps_per_second": 10.349, + "step": 4216 + }, + { + "epoch": 69.0, + "eval_loss": 11.599220275878906, + "eval_runtime": 1.5555, + "eval_samples_per_second": 79.074, + "eval_steps_per_second": 10.286, + "step": 4278 + }, + { + "epoch": 70.0, + "eval_loss": 11.604393005371094, + "eval_runtime": 1.5547, + "eval_samples_per_second": 79.114, + "eval_steps_per_second": 10.291, + "step": 4340 + }, + { + "epoch": 71.0, + "eval_loss": 11.693653106689453, + "eval_runtime": 1.5528, + "eval_samples_per_second": 79.212, + "eval_steps_per_second": 10.304, + "step": 4402 + }, + { + "epoch": 72.0, + "eval_loss": 11.596038818359375, + "eval_runtime": 1.5565, + "eval_samples_per_second": 79.023, + "eval_steps_per_second": 10.279, + "step": 4464 + }, + { + "epoch": 72.58, + "learning_rate": 5.983870967741936e-08, + "loss": 10.542, + "step": 4500 + }, + { + "epoch": 73.0, + "eval_loss": 11.545083045959473, + "eval_runtime": 1.5539, + "eval_samples_per_second": 79.157, + "eval_steps_per_second": 10.297, + "step": 4526 + }, + { + "epoch": 74.0, + "eval_loss": 11.403264999389648, + "eval_runtime": 1.5513, + "eval_samples_per_second": 79.291, + "eval_steps_per_second": 10.314, + "step": 4588 + }, + { + "epoch": 75.0, + "eval_loss": 11.480517387390137, + "eval_runtime": 1.5515, + "eval_samples_per_second": 79.28, + "eval_steps_per_second": 10.313, + "step": 4650 + }, + { + "epoch": 76.0, + "eval_loss": 11.539589881896973, + "eval_runtime": 1.552, + "eval_samples_per_second": 79.252, + "eval_steps_per_second": 10.309, + "step": 4712 + }, + { + "epoch": 77.0, + "eval_loss": 11.492401123046875, + "eval_runtime": 1.5522, + "eval_samples_per_second": 79.245, + "eval_steps_per_second": 10.308, + "step": 4774 + }, + { + "epoch": 78.0, + "eval_loss": 11.428875923156738, + "eval_runtime": 1.5583, + "eval_samples_per_second": 78.931, + "eval_steps_per_second": 10.268, + "step": 4836 + }, + { + "epoch": 79.0, + "eval_loss": 11.364572525024414, + "eval_runtime": 1.5553, + "eval_samples_per_second": 79.085, + "eval_steps_per_second": 10.287, + "step": 4898 + }, + { + "epoch": 80.0, + "eval_loss": 11.362411499023438, + "eval_runtime": 1.5451, + "eval_samples_per_second": 79.605, + "eval_steps_per_second": 10.355, + "step": 4960 + }, + { + "epoch": 80.65, + "learning_rate": 5.870967741935484e-08, + "loss": 10.4457, + "step": 5000 + }, + { + "epoch": 81.0, + "eval_loss": 11.462297439575195, + "eval_runtime": 1.5542, + "eval_samples_per_second": 79.139, + "eval_steps_per_second": 10.294, + "step": 5022 + }, + { + "epoch": 82.0, + "eval_loss": 11.522425651550293, + "eval_runtime": 1.5538, + "eval_samples_per_second": 79.159, + "eval_steps_per_second": 10.297, + "step": 5084 + }, + { + "epoch": 83.0, + "eval_loss": 11.419597625732422, + "eval_runtime": 1.5519, + "eval_samples_per_second": 79.26, + "eval_steps_per_second": 10.31, + "step": 5146 + }, + { + "epoch": 84.0, + "eval_loss": 11.435912132263184, + "eval_runtime": 1.572, + "eval_samples_per_second": 78.243, + "eval_steps_per_second": 10.178, + "step": 5208 + }, + { + "epoch": 85.0, + "eval_loss": 11.434619903564453, + "eval_runtime": 1.5634, + "eval_samples_per_second": 78.674, + "eval_steps_per_second": 10.234, + "step": 5270 + }, + { + "epoch": 86.0, + "eval_loss": 11.463889122009277, + "eval_runtime": 1.5521, + "eval_samples_per_second": 79.248, + "eval_steps_per_second": 10.309, + "step": 5332 + }, + { + "epoch": 87.0, + "eval_loss": 11.230652809143066, + "eval_runtime": 1.5468, + "eval_samples_per_second": 79.521, + "eval_steps_per_second": 10.344, + "step": 5394 + }, + { + "epoch": 88.0, + "eval_loss": 11.526171684265137, + "eval_runtime": 1.5455, + "eval_samples_per_second": 79.584, + "eval_steps_per_second": 10.352, + "step": 5456 + }, + { + "epoch": 88.71, + "learning_rate": 5.758064516129033e-08, + "loss": 10.3854, + "step": 5500 + }, + { + "epoch": 89.0, + "eval_loss": 11.665349006652832, + "eval_runtime": 1.5505, + "eval_samples_per_second": 79.329, + "eval_steps_per_second": 10.319, + "step": 5518 + }, + { + "epoch": 90.0, + "eval_loss": 11.3471040725708, + "eval_runtime": 1.5506, + "eval_samples_per_second": 79.325, + "eval_steps_per_second": 10.319, + "step": 5580 + }, + { + "epoch": 91.0, + "eval_loss": 11.304780960083008, + "eval_runtime": 1.5529, + "eval_samples_per_second": 79.209, + "eval_steps_per_second": 10.304, + "step": 5642 + }, + { + "epoch": 92.0, + "eval_loss": 11.36330509185791, + "eval_runtime": 1.5593, + "eval_samples_per_second": 78.88, + "eval_steps_per_second": 10.261, + "step": 5704 + }, + { + "epoch": 93.0, + "eval_loss": 11.343398094177246, + "eval_runtime": 1.5672, + "eval_samples_per_second": 78.483, + "eval_steps_per_second": 10.209, + "step": 5766 + }, + { + "epoch": 94.0, + "eval_loss": 11.257966041564941, + "eval_runtime": 1.5518, + "eval_samples_per_second": 79.263, + "eval_steps_per_second": 10.311, + "step": 5828 + }, + { + "epoch": 95.0, + "eval_loss": 11.270854949951172, + "eval_runtime": 1.5445, + "eval_samples_per_second": 79.635, + "eval_steps_per_second": 10.359, + "step": 5890 + }, + { + "epoch": 96.0, + "eval_loss": 11.198323249816895, + "eval_runtime": 1.5471, + "eval_samples_per_second": 79.503, + "eval_steps_per_second": 10.342, + "step": 5952 + }, + { + "epoch": 96.77, + "learning_rate": 5.6451612903225805e-08, + "loss": 10.324, + "step": 6000 + }, + { + "epoch": 97.0, + "eval_loss": 11.22177791595459, + "eval_runtime": 1.5536, + "eval_samples_per_second": 79.171, + "eval_steps_per_second": 10.299, + "step": 6014 + }, + { + "epoch": 98.0, + "eval_loss": 11.330657005310059, + "eval_runtime": 1.5691, + "eval_samples_per_second": 78.388, + "eval_steps_per_second": 10.197, + "step": 6076 + }, + { + "epoch": 99.0, + "eval_loss": 11.203608512878418, + "eval_runtime": 1.5575, + "eval_samples_per_second": 78.972, + "eval_steps_per_second": 10.273, + "step": 6138 + }, + { + "epoch": 100.0, + "eval_loss": 11.372391700744629, + "eval_runtime": 1.5527, + "eval_samples_per_second": 79.216, + "eval_steps_per_second": 10.305, + "step": 6200 + }, + { + "epoch": 101.0, + "eval_loss": 11.344269752502441, + "eval_runtime": 1.5528, + "eval_samples_per_second": 79.21, + "eval_steps_per_second": 10.304, + "step": 6262 + }, + { + "epoch": 102.0, + "eval_loss": 11.264634132385254, + "eval_runtime": 1.5533, + "eval_samples_per_second": 79.188, + "eval_steps_per_second": 10.301, + "step": 6324 + }, + { + "epoch": 103.0, + "eval_loss": 11.27165699005127, + "eval_runtime": 1.549, + "eval_samples_per_second": 79.408, + "eval_steps_per_second": 10.33, + "step": 6386 + }, + { + "epoch": 104.0, + "eval_loss": 11.310558319091797, + "eval_runtime": 1.5464, + "eval_samples_per_second": 79.542, + "eval_steps_per_second": 10.347, + "step": 6448 + }, + { + "epoch": 104.84, + "learning_rate": 5.532258064516129e-08, + "loss": 10.2981, + "step": 6500 + }, + { + "epoch": 105.0, + "eval_loss": 11.243507385253906, + "eval_runtime": 1.5536, + "eval_samples_per_second": 79.17, + "eval_steps_per_second": 10.298, + "step": 6510 + }, + { + "epoch": 106.0, + "eval_loss": 11.249150276184082, + "eval_runtime": 1.5518, + "eval_samples_per_second": 79.261, + "eval_steps_per_second": 10.31, + "step": 6572 + }, + { + "epoch": 107.0, + "eval_loss": 11.111257553100586, + "eval_runtime": 1.5527, + "eval_samples_per_second": 79.218, + "eval_steps_per_second": 10.305, + "step": 6634 + }, + { + "epoch": 108.0, + "eval_loss": 11.210649490356445, + "eval_runtime": 1.557, + "eval_samples_per_second": 78.999, + "eval_steps_per_second": 10.276, + "step": 6696 + }, + { + "epoch": 109.0, + "eval_loss": 11.214300155639648, + "eval_runtime": 1.5747, + "eval_samples_per_second": 78.112, + "eval_steps_per_second": 10.161, + "step": 6758 + }, + { + "epoch": 110.0, + "eval_loss": 11.015155792236328, + "eval_runtime": 1.5516, + "eval_samples_per_second": 79.274, + "eval_steps_per_second": 10.312, + "step": 6820 + }, + { + "epoch": 111.0, + "eval_loss": 11.08711051940918, + "eval_runtime": 1.5448, + "eval_samples_per_second": 79.621, + "eval_steps_per_second": 10.357, + "step": 6882 + }, + { + "epoch": 112.0, + "eval_loss": 11.098098754882812, + "eval_runtime": 1.5449, + "eval_samples_per_second": 79.617, + "eval_steps_per_second": 10.357, + "step": 6944 + }, + { + "epoch": 112.9, + "learning_rate": 5.4193548387096774e-08, + "loss": 10.2536, + "step": 7000 + }, + { + "epoch": 113.0, + "eval_loss": 11.243887901306152, + "eval_runtime": 1.5475, + "eval_samples_per_second": 79.483, + "eval_steps_per_second": 10.339, + "step": 7006 + }, + { + "epoch": 114.0, + "eval_loss": 11.156224250793457, + "eval_runtime": 1.5505, + "eval_samples_per_second": 79.327, + "eval_steps_per_second": 10.319, + "step": 7068 + }, + { + "epoch": 115.0, + "eval_loss": 11.118559837341309, + "eval_runtime": 1.5498, + "eval_samples_per_second": 79.363, + "eval_steps_per_second": 10.324, + "step": 7130 + }, + { + "epoch": 116.0, + "eval_loss": 11.2282133102417, + "eval_runtime": 1.562, + "eval_samples_per_second": 78.744, + "eval_steps_per_second": 10.243, + "step": 7192 + }, + { + "epoch": 117.0, + "eval_loss": 11.289180755615234, + "eval_runtime": 1.5484, + "eval_samples_per_second": 79.437, + "eval_steps_per_second": 10.333, + "step": 7254 + }, + { + "epoch": 118.0, + "eval_loss": 11.165923118591309, + "eval_runtime": 1.5518, + "eval_samples_per_second": 79.263, + "eval_steps_per_second": 10.311, + "step": 7316 + }, + { + "epoch": 119.0, + "eval_loss": 11.34392261505127, + "eval_runtime": 1.545, + "eval_samples_per_second": 79.613, + "eval_steps_per_second": 10.356, + "step": 7378 + }, + { + "epoch": 120.0, + "eval_loss": 11.199868202209473, + "eval_runtime": 1.5455, + "eval_samples_per_second": 79.586, + "eval_steps_per_second": 10.353, + "step": 7440 + }, + { + "epoch": 120.97, + "learning_rate": 5.306451612903226e-08, + "loss": 10.2336, + "step": 7500 + }, + { + "epoch": 121.0, + "eval_loss": 11.250391960144043, + "eval_runtime": 1.5504, + "eval_samples_per_second": 79.337, + "eval_steps_per_second": 10.32, + "step": 7502 + }, + { + "epoch": 122.0, + "eval_loss": 11.289613723754883, + "eval_runtime": 1.5534, + "eval_samples_per_second": 79.18, + "eval_steps_per_second": 10.3, + "step": 7564 + }, + { + "epoch": 123.0, + "eval_loss": 11.068822860717773, + "eval_runtime": 1.5538, + "eval_samples_per_second": 79.158, + "eval_steps_per_second": 10.297, + "step": 7626 + }, + { + "epoch": 124.0, + "eval_loss": 11.16740894317627, + "eval_runtime": 1.5519, + "eval_samples_per_second": 79.258, + "eval_steps_per_second": 10.31, + "step": 7688 + }, + { + "epoch": 125.0, + "eval_loss": 11.207221984863281, + "eval_runtime": 1.5529, + "eval_samples_per_second": 79.204, + "eval_steps_per_second": 10.303, + "step": 7750 + }, + { + "epoch": 126.0, + "eval_loss": 11.035326957702637, + "eval_runtime": 1.552, + "eval_samples_per_second": 79.253, + "eval_steps_per_second": 10.309, + "step": 7812 + }, + { + "epoch": 127.0, + "eval_loss": 11.048843383789062, + "eval_runtime": 1.548, + "eval_samples_per_second": 79.46, + "eval_steps_per_second": 10.336, + "step": 7874 + }, + { + "epoch": 128.0, + "eval_loss": 11.093210220336914, + "eval_runtime": 1.5454, + "eval_samples_per_second": 79.59, + "eval_steps_per_second": 10.353, + "step": 7936 + }, + { + "epoch": 129.0, + "eval_loss": 11.070518493652344, + "eval_runtime": 1.5446, + "eval_samples_per_second": 79.63, + "eval_steps_per_second": 10.358, + "step": 7998 + }, + { + "epoch": 129.03, + "learning_rate": 5.193548387096775e-08, + "loss": 10.2101, + "step": 8000 + }, + { + "epoch": 130.0, + "eval_loss": 11.219257354736328, + "eval_runtime": 1.5539, + "eval_samples_per_second": 79.157, + "eval_steps_per_second": 10.297, + "step": 8060 + }, + { + "epoch": 131.0, + "eval_loss": 11.227781295776367, + "eval_runtime": 1.5509, + "eval_samples_per_second": 79.307, + "eval_steps_per_second": 10.316, + "step": 8122 + }, + { + "epoch": 132.0, + "eval_loss": 11.136541366577148, + "eval_runtime": 1.5783, + "eval_samples_per_second": 77.932, + "eval_steps_per_second": 10.137, + "step": 8184 + }, + { + "epoch": 133.0, + "eval_loss": 11.035323143005371, + "eval_runtime": 1.5538, + "eval_samples_per_second": 79.159, + "eval_steps_per_second": 10.297, + "step": 8246 + }, + { + "epoch": 134.0, + "eval_loss": 11.014593124389648, + "eval_runtime": 1.5521, + "eval_samples_per_second": 79.246, + "eval_steps_per_second": 10.308, + "step": 8308 + }, + { + "epoch": 135.0, + "eval_loss": 11.237153053283691, + "eval_runtime": 1.5457, + "eval_samples_per_second": 79.575, + "eval_steps_per_second": 10.351, + "step": 8370 + }, + { + "epoch": 136.0, + "eval_loss": 11.104418754577637, + "eval_runtime": 1.5452, + "eval_samples_per_second": 79.6, + "eval_steps_per_second": 10.355, + "step": 8432 + }, + { + "epoch": 137.0, + "eval_loss": 11.134021759033203, + "eval_runtime": 1.5476, + "eval_samples_per_second": 79.478, + "eval_steps_per_second": 10.339, + "step": 8494 + }, + { + "epoch": 137.1, + "learning_rate": 5.0806451612903234e-08, + "loss": 10.1991, + "step": 8500 + }, + { + "epoch": 138.0, + "eval_loss": 11.075273513793945, + "eval_runtime": 1.5484, + "eval_samples_per_second": 79.436, + "eval_steps_per_second": 10.333, + "step": 8556 + }, + { + "epoch": 139.0, + "eval_loss": 11.021109580993652, + "eval_runtime": 1.5493, + "eval_samples_per_second": 79.391, + "eval_steps_per_second": 10.327, + "step": 8618 + }, + { + "epoch": 140.0, + "eval_loss": 10.930947303771973, + "eval_runtime": 1.5551, + "eval_samples_per_second": 79.096, + "eval_steps_per_second": 10.289, + "step": 8680 + }, + { + "epoch": 141.0, + "eval_loss": 10.977023124694824, + "eval_runtime": 1.5518, + "eval_samples_per_second": 79.263, + "eval_steps_per_second": 10.311, + "step": 8742 + }, + { + "epoch": 142.0, + "eval_loss": 11.079144477844238, + "eval_runtime": 1.5523, + "eval_samples_per_second": 79.238, + "eval_steps_per_second": 10.307, + "step": 8804 + }, + { + "epoch": 143.0, + "eval_loss": 11.116453170776367, + "eval_runtime": 1.5541, + "eval_samples_per_second": 79.147, + "eval_steps_per_second": 10.296, + "step": 8866 + }, + { + "epoch": 144.0, + "eval_loss": 10.936894416809082, + "eval_runtime": 1.5532, + "eval_samples_per_second": 79.19, + "eval_steps_per_second": 10.301, + "step": 8928 + }, + { + "epoch": 145.0, + "eval_loss": 11.121102333068848, + "eval_runtime": 1.5444, + "eval_samples_per_second": 79.641, + "eval_steps_per_second": 10.36, + "step": 8990 + }, + { + "epoch": 145.16, + "learning_rate": 4.967741935483872e-08, + "loss": 10.1925, + "step": 9000 + }, + { + "epoch": 146.0, + "eval_loss": 11.074503898620605, + "eval_runtime": 1.5533, + "eval_samples_per_second": 79.185, + "eval_steps_per_second": 10.301, + "step": 9052 + }, + { + "epoch": 147.0, + "eval_loss": 11.104596138000488, + "eval_runtime": 1.553, + "eval_samples_per_second": 79.201, + "eval_steps_per_second": 10.303, + "step": 9114 + }, + { + "epoch": 148.0, + "eval_loss": 11.025848388671875, + "eval_runtime": 1.5524, + "eval_samples_per_second": 79.234, + "eval_steps_per_second": 10.307, + "step": 9176 + }, + { + "epoch": 149.0, + "eval_loss": 11.02661418914795, + "eval_runtime": 1.5531, + "eval_samples_per_second": 79.198, + "eval_steps_per_second": 10.302, + "step": 9238 + }, + { + "epoch": 150.0, + "eval_loss": 11.253996849060059, + "eval_runtime": 1.5526, + "eval_samples_per_second": 79.224, + "eval_steps_per_second": 10.306, + "step": 9300 + }, + { + "epoch": 151.0, + "eval_loss": 10.970392227172852, + "eval_runtime": 1.5474, + "eval_samples_per_second": 79.487, + "eval_steps_per_second": 10.34, + "step": 9362 + }, + { + "epoch": 152.0, + "eval_loss": 11.034109115600586, + "eval_runtime": 1.5453, + "eval_samples_per_second": 79.599, + "eval_steps_per_second": 10.354, + "step": 9424 + }, + { + "epoch": 153.0, + "eval_loss": 11.111385345458984, + "eval_runtime": 1.5448, + "eval_samples_per_second": 79.622, + "eval_steps_per_second": 10.357, + "step": 9486 + }, + { + "epoch": 153.23, + "learning_rate": 4.85483870967742e-08, + "loss": 10.1538, + "step": 9500 + }, + { + "epoch": 154.0, + "eval_loss": 11.022419929504395, + "eval_runtime": 1.5528, + "eval_samples_per_second": 79.213, + "eval_steps_per_second": 10.304, + "step": 9548 + }, + { + "epoch": 155.0, + "eval_loss": 10.915043830871582, + "eval_runtime": 1.6252, + "eval_samples_per_second": 75.682, + "eval_steps_per_second": 9.845, + "step": 9610 + }, + { + "epoch": 156.0, + "eval_loss": 10.930474281311035, + "eval_runtime": 1.5606, + "eval_samples_per_second": 78.816, + "eval_steps_per_second": 10.252, + "step": 9672 + }, + { + "epoch": 157.0, + "eval_loss": 11.069900512695312, + "eval_runtime": 1.5514, + "eval_samples_per_second": 79.281, + "eval_steps_per_second": 10.313, + "step": 9734 + }, + { + "epoch": 158.0, + "eval_loss": 10.990084648132324, + "eval_runtime": 1.5524, + "eval_samples_per_second": 79.23, + "eval_steps_per_second": 10.306, + "step": 9796 + }, + { + "epoch": 159.0, + "eval_loss": 11.068836212158203, + "eval_runtime": 1.5493, + "eval_samples_per_second": 79.392, + "eval_steps_per_second": 10.327, + "step": 9858 + }, + { + "epoch": 160.0, + "eval_loss": 10.933384895324707, + "eval_runtime": 1.5461, + "eval_samples_per_second": 79.553, + "eval_steps_per_second": 10.348, + "step": 9920 + }, + { + "epoch": 161.0, + "eval_loss": 10.985239028930664, + "eval_runtime": 1.5514, + "eval_samples_per_second": 79.283, + "eval_steps_per_second": 10.313, + "step": 9982 + }, + { + "epoch": 161.29, + "learning_rate": 4.741935483870968e-08, + "loss": 10.1494, + "step": 10000 + }, + { + "epoch": 162.0, + "eval_loss": 10.903247833251953, + "eval_runtime": 1.5599, + "eval_samples_per_second": 78.85, + "eval_steps_per_second": 10.257, + "step": 10044 + }, + { + "epoch": 163.0, + "eval_loss": 10.96179485321045, + "eval_runtime": 1.5509, + "eval_samples_per_second": 79.309, + "eval_steps_per_second": 10.317, + "step": 10106 + }, + { + "epoch": 164.0, + "eval_loss": 10.947345733642578, + "eval_runtime": 1.5514, + "eval_samples_per_second": 79.286, + "eval_steps_per_second": 10.314, + "step": 10168 + }, + { + "epoch": 165.0, + "eval_loss": 11.153131484985352, + "eval_runtime": 1.5559, + "eval_samples_per_second": 79.053, + "eval_steps_per_second": 10.283, + "step": 10230 + }, + { + "epoch": 166.0, + "eval_loss": 10.846189498901367, + "eval_runtime": 1.5522, + "eval_samples_per_second": 79.244, + "eval_steps_per_second": 10.308, + "step": 10292 + }, + { + "epoch": 167.0, + "eval_loss": 10.957653045654297, + "eval_runtime": 1.5541, + "eval_samples_per_second": 79.147, + "eval_steps_per_second": 10.296, + "step": 10354 + }, + { + "epoch": 168.0, + "eval_loss": 10.935074806213379, + "eval_runtime": 1.5492, + "eval_samples_per_second": 79.394, + "eval_steps_per_second": 10.328, + "step": 10416 + }, + { + "epoch": 169.0, + "eval_loss": 11.212529182434082, + "eval_runtime": 1.545, + "eval_samples_per_second": 79.61, + "eval_steps_per_second": 10.356, + "step": 10478 + }, + { + "epoch": 169.35, + "learning_rate": 4.6290322580645165e-08, + "loss": 10.1424, + "step": 10500 + }, + { + "epoch": 170.0, + "eval_loss": 11.054618835449219, + "eval_runtime": 1.5612, + "eval_samples_per_second": 78.784, + "eval_steps_per_second": 10.248, + "step": 10540 + }, + { + "epoch": 171.0, + "eval_loss": 10.993365287780762, + "eval_runtime": 1.5547, + "eval_samples_per_second": 79.116, + "eval_steps_per_second": 10.292, + "step": 10602 + }, + { + "epoch": 172.0, + "eval_loss": 10.891836166381836, + "eval_runtime": 1.5523, + "eval_samples_per_second": 79.24, + "eval_steps_per_second": 10.308, + "step": 10664 + }, + { + "epoch": 173.0, + "eval_loss": 10.886784553527832, + "eval_runtime": 1.5513, + "eval_samples_per_second": 79.29, + "eval_steps_per_second": 10.314, + "step": 10726 + }, + { + "epoch": 174.0, + "eval_loss": 11.039985656738281, + "eval_runtime": 1.6601, + "eval_samples_per_second": 74.092, + "eval_steps_per_second": 9.638, + "step": 10788 + }, + { + "epoch": 175.0, + "eval_loss": 11.119784355163574, + "eval_runtime": 1.5449, + "eval_samples_per_second": 79.617, + "eval_steps_per_second": 10.357, + "step": 10850 + }, + { + "epoch": 176.0, + "eval_loss": 11.159457206726074, + "eval_runtime": 1.5467, + "eval_samples_per_second": 79.525, + "eval_steps_per_second": 10.345, + "step": 10912 + }, + { + "epoch": 177.0, + "eval_loss": 11.113701820373535, + "eval_runtime": 1.5449, + "eval_samples_per_second": 79.616, + "eval_steps_per_second": 10.357, + "step": 10974 + }, + { + "epoch": 177.42, + "learning_rate": 4.516129032258065e-08, + "loss": 10.1181, + "step": 11000 + }, + { + "epoch": 178.0, + "eval_loss": 11.199760437011719, + "eval_runtime": 1.5538, + "eval_samples_per_second": 79.159, + "eval_steps_per_second": 10.297, + "step": 11036 + }, + { + "epoch": 179.0, + "eval_loss": 11.049378395080566, + "eval_runtime": 1.5527, + "eval_samples_per_second": 79.215, + "eval_steps_per_second": 10.304, + "step": 11098 + }, + { + "epoch": 180.0, + "eval_loss": 10.948933601379395, + "eval_runtime": 1.5527, + "eval_samples_per_second": 79.216, + "eval_steps_per_second": 10.305, + "step": 11160 + }, + { + "epoch": 181.0, + "eval_loss": 11.02480411529541, + "eval_runtime": 1.5509, + "eval_samples_per_second": 79.306, + "eval_steps_per_second": 10.316, + "step": 11222 + }, + { + "epoch": 182.0, + "eval_loss": 10.991153717041016, + "eval_runtime": 1.5561, + "eval_samples_per_second": 79.045, + "eval_steps_per_second": 10.282, + "step": 11284 + }, + { + "epoch": 183.0, + "eval_loss": 10.922146797180176, + "eval_runtime": 1.5488, + "eval_samples_per_second": 79.416, + "eval_steps_per_second": 10.331, + "step": 11346 + }, + { + "epoch": 184.0, + "eval_loss": 11.055255889892578, + "eval_runtime": 1.547, + "eval_samples_per_second": 79.508, + "eval_steps_per_second": 10.343, + "step": 11408 + }, + { + "epoch": 185.0, + "eval_loss": 11.005462646484375, + "eval_runtime": 1.5452, + "eval_samples_per_second": 79.6, + "eval_steps_per_second": 10.354, + "step": 11470 + }, + { + "epoch": 185.48, + "learning_rate": 4.4032258064516134e-08, + "loss": 10.1165, + "step": 11500 + }, + { + "epoch": 186.0, + "eval_loss": 11.112872123718262, + "eval_runtime": 1.5624, + "eval_samples_per_second": 78.724, + "eval_steps_per_second": 10.24, + "step": 11532 + }, + { + "epoch": 187.0, + "eval_loss": 11.082246780395508, + "eval_runtime": 1.5529, + "eval_samples_per_second": 79.209, + "eval_steps_per_second": 10.304, + "step": 11594 + }, + { + "epoch": 188.0, + "eval_loss": 11.050854682922363, + "eval_runtime": 1.5513, + "eval_samples_per_second": 79.287, + "eval_steps_per_second": 10.314, + "step": 11656 + }, + { + "epoch": 189.0, + "eval_loss": 11.088144302368164, + "eval_runtime": 1.5522, + "eval_samples_per_second": 79.24, + "eval_steps_per_second": 10.308, + "step": 11718 + }, + { + "epoch": 190.0, + "eval_loss": 10.890948295593262, + "eval_runtime": 1.5518, + "eval_samples_per_second": 79.263, + "eval_steps_per_second": 10.311, + "step": 11780 + }, + { + "epoch": 191.0, + "eval_loss": 11.048895835876465, + "eval_runtime": 1.5439, + "eval_samples_per_second": 79.668, + "eval_steps_per_second": 10.363, + "step": 11842 + }, + { + "epoch": 192.0, + "eval_loss": 10.934896469116211, + "eval_runtime": 1.548, + "eval_samples_per_second": 79.46, + "eval_steps_per_second": 10.336, + "step": 11904 + }, + { + "epoch": 193.0, + "eval_loss": 10.882064819335938, + "eval_runtime": 1.5473, + "eval_samples_per_second": 79.492, + "eval_steps_per_second": 10.34, + "step": 11966 + }, + { + "epoch": 193.55, + "learning_rate": 4.290322580645162e-08, + "loss": 10.1021, + "step": 12000 + }, + { + "epoch": 194.0, + "eval_loss": 11.055244445800781, + "eval_runtime": 1.5566, + "eval_samples_per_second": 79.017, + "eval_steps_per_second": 10.279, + "step": 12028 + }, + { + "epoch": 195.0, + "eval_loss": 11.152359962463379, + "eval_runtime": 1.5527, + "eval_samples_per_second": 79.217, + "eval_steps_per_second": 10.305, + "step": 12090 + }, + { + "epoch": 196.0, + "eval_loss": 11.008437156677246, + "eval_runtime": 1.5538, + "eval_samples_per_second": 79.158, + "eval_steps_per_second": 10.297, + "step": 12152 + }, + { + "epoch": 197.0, + "eval_loss": 11.086997032165527, + "eval_runtime": 1.5634, + "eval_samples_per_second": 78.677, + "eval_steps_per_second": 10.234, + "step": 12214 + }, + { + "epoch": 198.0, + "eval_loss": 10.842860221862793, + "eval_runtime": 1.5635, + "eval_samples_per_second": 78.671, + "eval_steps_per_second": 10.234, + "step": 12276 + }, + { + "epoch": 199.0, + "eval_loss": 11.010416030883789, + "eval_runtime": 1.5454, + "eval_samples_per_second": 79.593, + "eval_steps_per_second": 10.354, + "step": 12338 + }, + { + "epoch": 200.0, + "eval_loss": 10.92686939239502, + "eval_runtime": 1.545, + "eval_samples_per_second": 79.613, + "eval_steps_per_second": 10.356, + "step": 12400 + }, + { + "epoch": 201.0, + "eval_loss": 10.987348556518555, + "eval_runtime": 1.548, + "eval_samples_per_second": 79.458, + "eval_steps_per_second": 10.336, + "step": 12462 + }, + { + "epoch": 201.61, + "learning_rate": 4.17741935483871e-08, + "loss": 10.1022, + "step": 12500 + }, + { + "epoch": 202.0, + "eval_loss": 10.98776912689209, + "eval_runtime": 1.57, + "eval_samples_per_second": 78.342, + "eval_steps_per_second": 10.191, + "step": 12524 + }, + { + "epoch": 203.0, + "eval_loss": 10.882826805114746, + "eval_runtime": 1.5543, + "eval_samples_per_second": 79.135, + "eval_steps_per_second": 10.294, + "step": 12586 + }, + { + "epoch": 204.0, + "eval_loss": 11.043395042419434, + "eval_runtime": 1.5636, + "eval_samples_per_second": 78.662, + "eval_steps_per_second": 10.233, + "step": 12648 + }, + { + "epoch": 205.0, + "eval_loss": 10.953902244567871, + "eval_runtime": 1.5575, + "eval_samples_per_second": 78.972, + "eval_steps_per_second": 10.273, + "step": 12710 + }, + { + "epoch": 206.0, + "eval_loss": 10.684782028198242, + "eval_runtime": 1.5525, + "eval_samples_per_second": 79.228, + "eval_steps_per_second": 10.306, + "step": 12772 + }, + { + "epoch": 207.0, + "eval_loss": 11.177257537841797, + "eval_runtime": 1.5483, + "eval_samples_per_second": 79.441, + "eval_steps_per_second": 10.334, + "step": 12834 + }, + { + "epoch": 208.0, + "eval_loss": 10.996722221374512, + "eval_runtime": 1.5543, + "eval_samples_per_second": 79.138, + "eval_steps_per_second": 10.294, + "step": 12896 + }, + { + "epoch": 209.0, + "eval_loss": 10.998943328857422, + "eval_runtime": 1.5532, + "eval_samples_per_second": 79.19, + "eval_steps_per_second": 10.301, + "step": 12958 + }, + { + "epoch": 209.68, + "learning_rate": 4.064516129032259e-08, + "loss": 10.0962, + "step": 13000 + }, + { + "epoch": 210.0, + "eval_loss": 11.20012378692627, + "eval_runtime": 1.5557, + "eval_samples_per_second": 79.063, + "eval_steps_per_second": 10.285, + "step": 13020 + }, + { + "epoch": 211.0, + "eval_loss": 10.965839385986328, + "eval_runtime": 1.554, + "eval_samples_per_second": 79.148, + "eval_steps_per_second": 10.296, + "step": 13082 + }, + { + "epoch": 212.0, + "eval_loss": 11.009934425354004, + "eval_runtime": 1.5719, + "eval_samples_per_second": 78.252, + "eval_steps_per_second": 10.179, + "step": 13144 + }, + { + "epoch": 213.0, + "eval_loss": 11.053404808044434, + "eval_runtime": 1.5544, + "eval_samples_per_second": 79.129, + "eval_steps_per_second": 10.293, + "step": 13206 + }, + { + "epoch": 214.0, + "eval_loss": 11.185773849487305, + "eval_runtime": 1.5545, + "eval_samples_per_second": 79.125, + "eval_steps_per_second": 10.293, + "step": 13268 + }, + { + "epoch": 215.0, + "eval_loss": 11.044096946716309, + "eval_runtime": 1.5449, + "eval_samples_per_second": 79.615, + "eval_steps_per_second": 10.356, + "step": 13330 + }, + { + "epoch": 216.0, + "eval_loss": 11.181927680969238, + "eval_runtime": 1.5461, + "eval_samples_per_second": 79.553, + "eval_steps_per_second": 10.348, + "step": 13392 + }, + { + "epoch": 217.0, + "eval_loss": 11.01004695892334, + "eval_runtime": 1.5451, + "eval_samples_per_second": 79.607, + "eval_steps_per_second": 10.355, + "step": 13454 + }, + { + "epoch": 217.74, + "learning_rate": 3.951612903225807e-08, + "loss": 10.0861, + "step": 13500 + }, + { + "epoch": 218.0, + "eval_loss": 11.007074356079102, + "eval_runtime": 1.5582, + "eval_samples_per_second": 78.937, + "eval_steps_per_second": 10.268, + "step": 13516 + }, + { + "epoch": 219.0, + "eval_loss": 11.145269393920898, + "eval_runtime": 1.5533, + "eval_samples_per_second": 79.188, + "eval_steps_per_second": 10.301, + "step": 13578 + }, + { + "epoch": 220.0, + "eval_loss": 10.985397338867188, + "eval_runtime": 1.5653, + "eval_samples_per_second": 78.577, + "eval_steps_per_second": 10.221, + "step": 13640 + }, + { + "epoch": 221.0, + "eval_loss": 11.120607376098633, + "eval_runtime": 1.5535, + "eval_samples_per_second": 79.174, + "eval_steps_per_second": 10.299, + "step": 13702 + }, + { + "epoch": 222.0, + "eval_loss": 11.069469451904297, + "eval_runtime": 1.5538, + "eval_samples_per_second": 79.159, + "eval_steps_per_second": 10.297, + "step": 13764 + }, + { + "epoch": 223.0, + "eval_loss": 11.070048332214355, + "eval_runtime": 1.5473, + "eval_samples_per_second": 79.495, + "eval_steps_per_second": 10.341, + "step": 13826 + }, + { + "epoch": 224.0, + "eval_loss": 10.968293190002441, + "eval_runtime": 1.547, + "eval_samples_per_second": 79.509, + "eval_steps_per_second": 10.343, + "step": 13888 + }, + { + "epoch": 225.0, + "eval_loss": 10.9319486618042, + "eval_runtime": 1.5452, + "eval_samples_per_second": 79.599, + "eval_steps_per_second": 10.354, + "step": 13950 + }, + { + "epoch": 225.81, + "learning_rate": 3.838709677419355e-08, + "loss": 10.0808, + "step": 14000 + }, + { + "epoch": 226.0, + "eval_loss": 10.993424415588379, + "eval_runtime": 1.57, + "eval_samples_per_second": 78.345, + "eval_steps_per_second": 10.191, + "step": 14012 + }, + { + "epoch": 227.0, + "eval_loss": 10.878746032714844, + "eval_runtime": 1.5608, + "eval_samples_per_second": 78.805, + "eval_steps_per_second": 10.251, + "step": 14074 + }, + { + "epoch": 228.0, + "eval_loss": 10.905098915100098, + "eval_runtime": 1.5539, + "eval_samples_per_second": 79.155, + "eval_steps_per_second": 10.297, + "step": 14136 + }, + { + "epoch": 229.0, + "eval_loss": 11.037513732910156, + "eval_runtime": 1.5551, + "eval_samples_per_second": 79.093, + "eval_steps_per_second": 10.289, + "step": 14198 + }, + { + "epoch": 230.0, + "eval_loss": 11.025934219360352, + "eval_runtime": 1.5538, + "eval_samples_per_second": 79.161, + "eval_steps_per_second": 10.297, + "step": 14260 + }, + { + "epoch": 231.0, + "eval_loss": 11.063486099243164, + "eval_runtime": 1.5562, + "eval_samples_per_second": 79.041, + "eval_steps_per_second": 10.282, + "step": 14322 + }, + { + "epoch": 232.0, + "eval_loss": 10.88222885131836, + "eval_runtime": 1.5487, + "eval_samples_per_second": 79.422, + "eval_steps_per_second": 10.331, + "step": 14384 + }, + { + "epoch": 233.0, + "eval_loss": 10.89100170135498, + "eval_runtime": 1.5457, + "eval_samples_per_second": 79.576, + "eval_steps_per_second": 10.351, + "step": 14446 + }, + { + "epoch": 233.87, + "learning_rate": 3.7258064516129034e-08, + "loss": 10.0768, + "step": 14500 + }, + { + "epoch": 234.0, + "eval_loss": 10.974419593811035, + "eval_runtime": 1.5538, + "eval_samples_per_second": 79.16, + "eval_steps_per_second": 10.297, + "step": 14508 + }, + { + "epoch": 235.0, + "eval_loss": 11.042181015014648, + "eval_runtime": 1.58, + "eval_samples_per_second": 77.85, + "eval_steps_per_second": 10.127, + "step": 14570 + }, + { + "epoch": 236.0, + "eval_loss": 11.03979778289795, + "eval_runtime": 1.5648, + "eval_samples_per_second": 78.605, + "eval_steps_per_second": 10.225, + "step": 14632 + }, + { + "epoch": 237.0, + "eval_loss": 10.850703239440918, + "eval_runtime": 1.5587, + "eval_samples_per_second": 78.914, + "eval_steps_per_second": 10.265, + "step": 14694 + }, + { + "epoch": 238.0, + "eval_loss": 11.022680282592773, + "eval_runtime": 1.5543, + "eval_samples_per_second": 79.135, + "eval_steps_per_second": 10.294, + "step": 14756 + }, + { + "epoch": 239.0, + "eval_loss": 11.027267456054688, + "eval_runtime": 1.5884, + "eval_samples_per_second": 77.438, + "eval_steps_per_second": 10.073, + "step": 14818 + }, + { + "epoch": 240.0, + "eval_loss": 11.07776165008545, + "eval_runtime": 1.5492, + "eval_samples_per_second": 79.395, + "eval_steps_per_second": 10.328, + "step": 14880 + }, + { + "epoch": 241.0, + "eval_loss": 11.077691078186035, + "eval_runtime": 1.5493, + "eval_samples_per_second": 79.393, + "eval_steps_per_second": 10.328, + "step": 14942 + }, + { + "epoch": 241.94, + "learning_rate": 3.612903225806452e-08, + "loss": 10.0792, + "step": 15000 + }, + { + "epoch": 242.0, + "eval_loss": 10.942319869995117, + "eval_runtime": 1.5569, + "eval_samples_per_second": 79.005, + "eval_steps_per_second": 10.277, + "step": 15004 + }, + { + "epoch": 243.0, + "eval_loss": 11.226703643798828, + "eval_runtime": 1.5593, + "eval_samples_per_second": 78.88, + "eval_steps_per_second": 10.261, + "step": 15066 + }, + { + "epoch": 244.0, + "eval_loss": 10.85287094116211, + "eval_runtime": 1.564, + "eval_samples_per_second": 78.644, + "eval_steps_per_second": 10.23, + "step": 15128 + }, + { + "epoch": 245.0, + "eval_loss": 11.056612968444824, + "eval_runtime": 1.5825, + "eval_samples_per_second": 77.725, + "eval_steps_per_second": 10.111, + "step": 15190 + }, + { + "epoch": 246.0, + "eval_loss": 10.887572288513184, + "eval_runtime": 1.5637, + "eval_samples_per_second": 78.661, + "eval_steps_per_second": 10.232, + "step": 15252 + }, + { + "epoch": 247.0, + "eval_loss": 11.144104957580566, + "eval_runtime": 1.5574, + "eval_samples_per_second": 78.978, + "eval_steps_per_second": 10.274, + "step": 15314 + }, + { + "epoch": 248.0, + "eval_loss": 10.909028053283691, + "eval_runtime": 1.5501, + "eval_samples_per_second": 79.35, + "eval_steps_per_second": 10.322, + "step": 15376 + }, + { + "epoch": 249.0, + "eval_loss": 10.97555923461914, + "eval_runtime": 1.5502, + "eval_samples_per_second": 79.347, + "eval_steps_per_second": 10.322, + "step": 15438 + }, + { + "epoch": 250.0, + "learning_rate": 3.5e-08, + "loss": 10.0832, + "step": 15500 + }, + { + "epoch": 250.0, + "eval_loss": 10.953834533691406, + "eval_runtime": 1.5699, + "eval_samples_per_second": 78.347, + "eval_steps_per_second": 10.192, + "step": 15500 + }, + { + "epoch": 251.0, + "eval_loss": 10.842375755310059, + "eval_runtime": 1.5575, + "eval_samples_per_second": 78.974, + "eval_steps_per_second": 10.273, + "step": 15562 + }, + { + "epoch": 252.0, + "eval_loss": 10.767086029052734, + "eval_runtime": 1.5579, + "eval_samples_per_second": 78.953, + "eval_steps_per_second": 10.27, + "step": 15624 + }, + { + "epoch": 253.0, + "eval_loss": 10.963540077209473, + "eval_runtime": 1.5571, + "eval_samples_per_second": 78.992, + "eval_steps_per_second": 10.275, + "step": 15686 + }, + { + "epoch": 254.0, + "eval_loss": 10.849014282226562, + "eval_runtime": 1.5574, + "eval_samples_per_second": 78.976, + "eval_steps_per_second": 10.273, + "step": 15748 + }, + { + "epoch": 255.0, + "eval_loss": 11.134928703308105, + "eval_runtime": 1.5686, + "eval_samples_per_second": 78.416, + "eval_steps_per_second": 10.2, + "step": 15810 + }, + { + "epoch": 256.0, + "eval_loss": 10.884407043457031, + "eval_runtime": 1.5495, + "eval_samples_per_second": 79.381, + "eval_steps_per_second": 10.326, + "step": 15872 + }, + { + "epoch": 257.0, + "eval_loss": 10.865640640258789, + "eval_runtime": 1.552, + "eval_samples_per_second": 79.251, + "eval_steps_per_second": 10.309, + "step": 15934 + }, + { + "epoch": 258.0, + "eval_loss": 10.823393821716309, + "eval_runtime": 1.5505, + "eval_samples_per_second": 79.33, + "eval_steps_per_second": 10.319, + "step": 15996 + }, + { + "epoch": 258.06, + "learning_rate": 3.387096774193549e-08, + "loss": 10.0747, + "step": 16000 + }, + { + "epoch": 259.0, + "eval_loss": 10.950251579284668, + "eval_runtime": 1.5601, + "eval_samples_per_second": 78.843, + "eval_steps_per_second": 10.256, + "step": 16058 + }, + { + "epoch": 260.0, + "eval_loss": 10.885615348815918, + "eval_runtime": 1.5592, + "eval_samples_per_second": 78.886, + "eval_steps_per_second": 10.262, + "step": 16120 + }, + { + "epoch": 261.0, + "eval_loss": 11.051911354064941, + "eval_runtime": 1.5755, + "eval_samples_per_second": 78.069, + "eval_steps_per_second": 10.155, + "step": 16182 + }, + { + "epoch": 262.0, + "eval_loss": 10.886160850524902, + "eval_runtime": 1.557, + "eval_samples_per_second": 79.0, + "eval_steps_per_second": 10.276, + "step": 16244 + }, + { + "epoch": 263.0, + "eval_loss": 10.897393226623535, + "eval_runtime": 1.5641, + "eval_samples_per_second": 78.64, + "eval_steps_per_second": 10.23, + "step": 16306 + }, + { + "epoch": 264.0, + "eval_loss": 10.84638500213623, + "eval_runtime": 1.5498, + "eval_samples_per_second": 79.364, + "eval_steps_per_second": 10.324, + "step": 16368 + }, + { + "epoch": 265.0, + "eval_loss": 10.965924263000488, + "eval_runtime": 1.5489, + "eval_samples_per_second": 79.411, + "eval_steps_per_second": 10.33, + "step": 16430 + }, + { + "epoch": 266.0, + "eval_loss": 10.742626190185547, + "eval_runtime": 1.55, + "eval_samples_per_second": 79.357, + "eval_steps_per_second": 10.323, + "step": 16492 + }, + { + "epoch": 266.13, + "learning_rate": 3.274193548387097e-08, + "loss": 10.0678, + "step": 16500 + }, + { + "epoch": 267.0, + "eval_loss": 10.985177040100098, + "eval_runtime": 1.5698, + "eval_samples_per_second": 78.356, + "eval_steps_per_second": 10.193, + "step": 16554 + }, + { + "epoch": 268.0, + "eval_loss": 11.049816131591797, + "eval_runtime": 1.557, + "eval_samples_per_second": 79.0, + "eval_steps_per_second": 10.276, + "step": 16616 + }, + { + "epoch": 269.0, + "eval_loss": 10.7893648147583, + "eval_runtime": 1.5584, + "eval_samples_per_second": 78.929, + "eval_steps_per_second": 10.267, + "step": 16678 + }, + { + "epoch": 270.0, + "eval_loss": 11.10730266571045, + "eval_runtime": 1.5594, + "eval_samples_per_second": 78.878, + "eval_steps_per_second": 10.261, + "step": 16740 + }, + { + "epoch": 271.0, + "eval_loss": 10.826376914978027, + "eval_runtime": 1.5624, + "eval_samples_per_second": 78.727, + "eval_steps_per_second": 10.241, + "step": 16802 + }, + { + "epoch": 272.0, + "eval_loss": 10.931463241577148, + "eval_runtime": 1.5486, + "eval_samples_per_second": 79.427, + "eval_steps_per_second": 10.332, + "step": 16864 + }, + { + "epoch": 273.0, + "eval_loss": 10.775986671447754, + "eval_runtime": 1.5509, + "eval_samples_per_second": 79.307, + "eval_steps_per_second": 10.316, + "step": 16926 + }, + { + "epoch": 274.0, + "eval_loss": 10.972262382507324, + "eval_runtime": 1.566, + "eval_samples_per_second": 78.543, + "eval_steps_per_second": 10.217, + "step": 16988 + }, + { + "epoch": 274.19, + "learning_rate": 3.1612903225806456e-08, + "loss": 10.0631, + "step": 17000 + }, + { + "epoch": 275.0, + "eval_loss": 10.748165130615234, + "eval_runtime": 1.5567, + "eval_samples_per_second": 79.014, + "eval_steps_per_second": 10.278, + "step": 17050 + }, + { + "epoch": 276.0, + "eval_loss": 11.049092292785645, + "eval_runtime": 1.5587, + "eval_samples_per_second": 78.912, + "eval_steps_per_second": 10.265, + "step": 17112 + }, + { + "epoch": 277.0, + "eval_loss": 10.751622200012207, + "eval_runtime": 1.5568, + "eval_samples_per_second": 79.007, + "eval_steps_per_second": 10.277, + "step": 17174 + }, + { + "epoch": 278.0, + "eval_loss": 10.950490951538086, + "eval_runtime": 1.5586, + "eval_samples_per_second": 78.916, + "eval_steps_per_second": 10.266, + "step": 17236 + }, + { + "epoch": 279.0, + "eval_loss": 10.889892578125, + "eval_runtime": 1.5568, + "eval_samples_per_second": 79.01, + "eval_steps_per_second": 10.278, + "step": 17298 + }, + { + "epoch": 280.0, + "eval_loss": 10.786404609680176, + "eval_runtime": 1.557, + "eval_samples_per_second": 78.998, + "eval_steps_per_second": 10.276, + "step": 17360 + }, + { + "epoch": 281.0, + "eval_loss": 10.865982055664062, + "eval_runtime": 1.5496, + "eval_samples_per_second": 79.377, + "eval_steps_per_second": 10.326, + "step": 17422 + }, + { + "epoch": 282.0, + "eval_loss": 10.97549819946289, + "eval_runtime": 1.5507, + "eval_samples_per_second": 79.318, + "eval_steps_per_second": 10.318, + "step": 17484 + }, + { + "epoch": 282.26, + "learning_rate": 3.048387096774194e-08, + "loss": 10.0464, + "step": 17500 + }, + { + "epoch": 283.0, + "eval_loss": 10.970197677612305, + "eval_runtime": 1.5595, + "eval_samples_per_second": 78.873, + "eval_steps_per_second": 10.26, + "step": 17546 + }, + { + "epoch": 284.0, + "eval_loss": 10.966419219970703, + "eval_runtime": 1.5592, + "eval_samples_per_second": 78.887, + "eval_steps_per_second": 10.262, + "step": 17608 + }, + { + "epoch": 285.0, + "eval_loss": 10.941322326660156, + "eval_runtime": 1.5575, + "eval_samples_per_second": 78.971, + "eval_steps_per_second": 10.273, + "step": 17670 + }, + { + "epoch": 286.0, + "eval_loss": 10.966172218322754, + "eval_runtime": 1.5626, + "eval_samples_per_second": 78.714, + "eval_steps_per_second": 10.239, + "step": 17732 + }, + { + "epoch": 287.0, + "eval_loss": 10.951107025146484, + "eval_runtime": 1.5766, + "eval_samples_per_second": 78.017, + "eval_steps_per_second": 10.149, + "step": 17794 + }, + { + "epoch": 288.0, + "eval_loss": 10.864799499511719, + "eval_runtime": 1.5527, + "eval_samples_per_second": 79.216, + "eval_steps_per_second": 10.305, + "step": 17856 + }, + { + "epoch": 289.0, + "eval_loss": 10.88668441772461, + "eval_runtime": 1.5518, + "eval_samples_per_second": 79.261, + "eval_steps_per_second": 10.31, + "step": 17918 + }, + { + "epoch": 290.0, + "eval_loss": 11.037867546081543, + "eval_runtime": 1.5494, + "eval_samples_per_second": 79.386, + "eval_steps_per_second": 10.327, + "step": 17980 + }, + { + "epoch": 290.32, + "learning_rate": 2.935483870967742e-08, + "loss": 10.059, + "step": 18000 + }, + { + "epoch": 291.0, + "eval_loss": 10.958696365356445, + "eval_runtime": 1.5631, + "eval_samples_per_second": 78.692, + "eval_steps_per_second": 10.236, + "step": 18042 + }, + { + "epoch": 292.0, + "eval_loss": 10.871495246887207, + "eval_runtime": 1.5627, + "eval_samples_per_second": 78.71, + "eval_steps_per_second": 10.239, + "step": 18104 + }, + { + "epoch": 293.0, + "eval_loss": 11.032896041870117, + "eval_runtime": 1.5647, + "eval_samples_per_second": 78.608, + "eval_steps_per_second": 10.225, + "step": 18166 + }, + { + "epoch": 294.0, + "eval_loss": 10.993353843688965, + "eval_runtime": 1.557, + "eval_samples_per_second": 78.998, + "eval_steps_per_second": 10.276, + "step": 18228 + }, + { + "epoch": 295.0, + "eval_loss": 11.031813621520996, + "eval_runtime": 1.5556, + "eval_samples_per_second": 79.072, + "eval_steps_per_second": 10.286, + "step": 18290 + }, + { + "epoch": 296.0, + "eval_loss": 10.949562072753906, + "eval_runtime": 1.5493, + "eval_samples_per_second": 79.389, + "eval_steps_per_second": 10.327, + "step": 18352 + }, + { + "epoch": 297.0, + "eval_loss": 10.985105514526367, + "eval_runtime": 1.5496, + "eval_samples_per_second": 79.373, + "eval_steps_per_second": 10.325, + "step": 18414 + }, + { + "epoch": 298.0, + "eval_loss": 10.904229164123535, + "eval_runtime": 1.5499, + "eval_samples_per_second": 79.359, + "eval_steps_per_second": 10.323, + "step": 18476 + }, + { + "epoch": 298.39, + "learning_rate": 2.8225806451612902e-08, + "loss": 10.0478, + "step": 18500 + }, + { + "epoch": 299.0, + "eval_loss": 10.848288536071777, + "eval_runtime": 1.5652, + "eval_samples_per_second": 78.585, + "eval_steps_per_second": 10.222, + "step": 18538 + }, + { + "epoch": 300.0, + "eval_loss": 11.128362655639648, + "eval_runtime": 1.5589, + "eval_samples_per_second": 78.903, + "eval_steps_per_second": 10.264, + "step": 18600 + }, + { + "epoch": 301.0, + "eval_loss": 10.965646743774414, + "eval_runtime": 1.5837, + "eval_samples_per_second": 77.668, + "eval_steps_per_second": 10.103, + "step": 18662 + }, + { + "epoch": 302.0, + "eval_loss": 10.962158203125, + "eval_runtime": 1.5581, + "eval_samples_per_second": 78.944, + "eval_steps_per_second": 10.269, + "step": 18724 + }, + { + "epoch": 303.0, + "eval_loss": 11.022586822509766, + "eval_runtime": 1.5559, + "eval_samples_per_second": 79.056, + "eval_steps_per_second": 10.284, + "step": 18786 + }, + { + "epoch": 304.0, + "eval_loss": 10.998231887817383, + "eval_runtime": 1.5516, + "eval_samples_per_second": 79.272, + "eval_steps_per_second": 10.312, + "step": 18848 + }, + { + "epoch": 305.0, + "eval_loss": 10.964576721191406, + "eval_runtime": 1.5886, + "eval_samples_per_second": 77.425, + "eval_steps_per_second": 10.072, + "step": 18910 + }, + { + "epoch": 306.0, + "eval_loss": 11.094855308532715, + "eval_runtime": 1.549, + "eval_samples_per_second": 79.406, + "eval_steps_per_second": 10.329, + "step": 18972 + }, + { + "epoch": 306.45, + "learning_rate": 2.7096774193548387e-08, + "loss": 10.0513, + "step": 19000 + }, + { + "epoch": 307.0, + "eval_loss": 11.073990821838379, + "eval_runtime": 1.5592, + "eval_samples_per_second": 78.885, + "eval_steps_per_second": 10.261, + "step": 19034 + }, + { + "epoch": 308.0, + "eval_loss": 10.888773918151855, + "eval_runtime": 1.5638, + "eval_samples_per_second": 78.652, + "eval_steps_per_second": 10.231, + "step": 19096 + }, + { + "epoch": 309.0, + "eval_loss": 11.247567176818848, + "eval_runtime": 1.5616, + "eval_samples_per_second": 78.763, + "eval_steps_per_second": 10.246, + "step": 19158 + }, + { + "epoch": 310.0, + "eval_loss": 10.999629020690918, + "eval_runtime": 1.5902, + "eval_samples_per_second": 77.348, + "eval_steps_per_second": 10.062, + "step": 19220 + }, + { + "epoch": 311.0, + "eval_loss": 10.804391860961914, + "eval_runtime": 1.5586, + "eval_samples_per_second": 78.919, + "eval_steps_per_second": 10.266, + "step": 19282 + }, + { + "epoch": 312.0, + "eval_loss": 10.949257850646973, + "eval_runtime": 1.5493, + "eval_samples_per_second": 79.391, + "eval_steps_per_second": 10.327, + "step": 19344 + }, + { + "epoch": 313.0, + "eval_loss": 10.94599723815918, + "eval_runtime": 1.5498, + "eval_samples_per_second": 79.367, + "eval_steps_per_second": 10.324, + "step": 19406 + }, + { + "epoch": 314.0, + "eval_loss": 10.966387748718262, + "eval_runtime": 1.5504, + "eval_samples_per_second": 79.336, + "eval_steps_per_second": 10.32, + "step": 19468 + }, + { + "epoch": 314.52, + "learning_rate": 2.5967741935483875e-08, + "loss": 10.0363, + "step": 19500 + }, + { + "epoch": 315.0, + "eval_loss": 10.91958999633789, + "eval_runtime": 1.5704, + "eval_samples_per_second": 78.324, + "eval_steps_per_second": 10.188, + "step": 19530 + }, + { + "epoch": 316.0, + "eval_loss": 10.921772956848145, + "eval_runtime": 1.5577, + "eval_samples_per_second": 78.964, + "eval_steps_per_second": 10.272, + "step": 19592 + }, + { + "epoch": 317.0, + "eval_loss": 10.800854682922363, + "eval_runtime": 1.5565, + "eval_samples_per_second": 79.024, + "eval_steps_per_second": 10.28, + "step": 19654 + }, + { + "epoch": 318.0, + "eval_loss": 11.121504783630371, + "eval_runtime": 1.5574, + "eval_samples_per_second": 78.979, + "eval_steps_per_second": 10.274, + "step": 19716 + }, + { + "epoch": 319.0, + "eval_loss": 10.96510124206543, + "eval_runtime": 1.5619, + "eval_samples_per_second": 78.751, + "eval_steps_per_second": 10.244, + "step": 19778 + }, + { + "epoch": 320.0, + "eval_loss": 10.995678901672363, + "eval_runtime": 1.5505, + "eval_samples_per_second": 79.33, + "eval_steps_per_second": 10.319, + "step": 19840 + }, + { + "epoch": 321.0, + "eval_loss": 11.144912719726562, + "eval_runtime": 1.5521, + "eval_samples_per_second": 79.247, + "eval_steps_per_second": 10.309, + "step": 19902 + }, + { + "epoch": 322.0, + "eval_loss": 11.01318645477295, + "eval_runtime": 1.555, + "eval_samples_per_second": 79.101, + "eval_steps_per_second": 10.289, + "step": 19964 + }, + { + "epoch": 322.58, + "learning_rate": 2.483870967741936e-08, + "loss": 10.0503, + "step": 20000 + }, + { + "epoch": 323.0, + "eval_loss": 11.119362831115723, + "eval_runtime": 1.5655, + "eval_samples_per_second": 78.571, + "eval_steps_per_second": 10.221, + "step": 20026 + }, + { + "epoch": 324.0, + "eval_loss": 10.964983940124512, + "eval_runtime": 1.5692, + "eval_samples_per_second": 78.382, + "eval_steps_per_second": 10.196, + "step": 20088 + }, + { + "epoch": 325.0, + "eval_loss": 10.841401100158691, + "eval_runtime": 1.5595, + "eval_samples_per_second": 78.871, + "eval_steps_per_second": 10.26, + "step": 20150 + }, + { + "epoch": 326.0, + "eval_loss": 10.86677360534668, + "eval_runtime": 1.5533, + "eval_samples_per_second": 79.184, + "eval_steps_per_second": 10.3, + "step": 20212 + }, + { + "epoch": 327.0, + "eval_loss": 11.009995460510254, + "eval_runtime": 1.5673, + "eval_samples_per_second": 78.478, + "eval_steps_per_second": 10.209, + "step": 20274 + }, + { + "epoch": 328.0, + "eval_loss": 10.875937461853027, + "eval_runtime": 1.5499, + "eval_samples_per_second": 79.359, + "eval_steps_per_second": 10.323, + "step": 20336 + }, + { + "epoch": 329.0, + "eval_loss": 10.965596199035645, + "eval_runtime": 1.562, + "eval_samples_per_second": 78.744, + "eval_steps_per_second": 10.243, + "step": 20398 + }, + { + "epoch": 330.0, + "eval_loss": 11.007844924926758, + "eval_runtime": 1.5553, + "eval_samples_per_second": 79.085, + "eval_steps_per_second": 10.287, + "step": 20460 + }, + { + "epoch": 330.65, + "learning_rate": 2.370967741935484e-08, + "loss": 10.0223, + "step": 20500 + }, + { + "epoch": 331.0, + "eval_loss": 11.0198335647583, + "eval_runtime": 1.5833, + "eval_samples_per_second": 77.688, + "eval_steps_per_second": 10.106, + "step": 20522 + }, + { + "epoch": 332.0, + "eval_loss": 11.088841438293457, + "eval_runtime": 1.5649, + "eval_samples_per_second": 78.601, + "eval_steps_per_second": 10.224, + "step": 20584 + }, + { + "epoch": 333.0, + "eval_loss": 11.105833053588867, + "eval_runtime": 1.5609, + "eval_samples_per_second": 78.798, + "eval_steps_per_second": 10.25, + "step": 20646 + }, + { + "epoch": 334.0, + "eval_loss": 10.942315101623535, + "eval_runtime": 1.5599, + "eval_samples_per_second": 78.853, + "eval_steps_per_second": 10.257, + "step": 20708 + }, + { + "epoch": 335.0, + "eval_loss": 11.103010177612305, + "eval_runtime": 1.5806, + "eval_samples_per_second": 77.817, + "eval_steps_per_second": 10.123, + "step": 20770 + }, + { + "epoch": 336.0, + "eval_loss": 11.081936836242676, + "eval_runtime": 1.5498, + "eval_samples_per_second": 79.366, + "eval_steps_per_second": 10.324, + "step": 20832 + }, + { + "epoch": 337.0, + "eval_loss": 10.90597152709961, + "eval_runtime": 1.55, + "eval_samples_per_second": 79.356, + "eval_steps_per_second": 10.323, + "step": 20894 + }, + { + "epoch": 338.0, + "eval_loss": 11.074986457824707, + "eval_runtime": 1.5577, + "eval_samples_per_second": 78.962, + "eval_steps_per_second": 10.271, + "step": 20956 + }, + { + "epoch": 338.71, + "learning_rate": 2.2580645161290325e-08, + "loss": 10.0321, + "step": 21000 + }, + { + "epoch": 339.0, + "eval_loss": 11.110125541687012, + "eval_runtime": 1.5561, + "eval_samples_per_second": 79.046, + "eval_steps_per_second": 10.282, + "step": 21018 + }, + { + "epoch": 340.0, + "eval_loss": 10.971090316772461, + "eval_runtime": 1.5625, + "eval_samples_per_second": 78.722, + "eval_steps_per_second": 10.24, + "step": 21080 + }, + { + "epoch": 341.0, + "eval_loss": 10.92870044708252, + "eval_runtime": 1.5635, + "eval_samples_per_second": 78.668, + "eval_steps_per_second": 10.233, + "step": 21142 + }, + { + "epoch": 342.0, + "eval_loss": 10.796195983886719, + "eval_runtime": 1.5574, + "eval_samples_per_second": 78.978, + "eval_steps_per_second": 10.273, + "step": 21204 + }, + { + "epoch": 343.0, + "eval_loss": 11.136199951171875, + "eval_runtime": 1.5574, + "eval_samples_per_second": 78.976, + "eval_steps_per_second": 10.273, + "step": 21266 + }, + { + "epoch": 344.0, + "eval_loss": 11.102249145507812, + "eval_runtime": 1.551, + "eval_samples_per_second": 79.302, + "eval_steps_per_second": 10.316, + "step": 21328 + }, + { + "epoch": 345.0, + "eval_loss": 10.999671936035156, + "eval_runtime": 1.5493, + "eval_samples_per_second": 79.392, + "eval_steps_per_second": 10.327, + "step": 21390 + }, + { + "epoch": 346.0, + "eval_loss": 10.876097679138184, + "eval_runtime": 1.5498, + "eval_samples_per_second": 79.366, + "eval_steps_per_second": 10.324, + "step": 21452 + }, + { + "epoch": 346.77, + "learning_rate": 2.145161290322581e-08, + "loss": 10.0226, + "step": 21500 + }, + { + "epoch": 347.0, + "eval_loss": 10.934317588806152, + "eval_runtime": 1.5612, + "eval_samples_per_second": 78.786, + "eval_steps_per_second": 10.249, + "step": 21514 + }, + { + "epoch": 348.0, + "eval_loss": 10.910962104797363, + "eval_runtime": 1.5564, + "eval_samples_per_second": 79.028, + "eval_steps_per_second": 10.28, + "step": 21576 + }, + { + "epoch": 349.0, + "eval_loss": 11.034477233886719, + "eval_runtime": 1.5584, + "eval_samples_per_second": 78.929, + "eval_steps_per_second": 10.267, + "step": 21638 + }, + { + "epoch": 350.0, + "eval_loss": 10.88716983795166, + "eval_runtime": 1.5754, + "eval_samples_per_second": 78.075, + "eval_steps_per_second": 10.156, + "step": 21700 + }, + { + "epoch": 351.0, + "eval_loss": 10.766716957092285, + "eval_runtime": 1.5893, + "eval_samples_per_second": 77.394, + "eval_steps_per_second": 10.067, + "step": 21762 + }, + { + "epoch": 352.0, + "eval_loss": 11.056756973266602, + "eval_runtime": 1.5503, + "eval_samples_per_second": 79.339, + "eval_steps_per_second": 10.321, + "step": 21824 + }, + { + "epoch": 353.0, + "eval_loss": 11.028831481933594, + "eval_runtime": 1.5494, + "eval_samples_per_second": 79.385, + "eval_steps_per_second": 10.326, + "step": 21886 + }, + { + "epoch": 354.0, + "eval_loss": 11.066434860229492, + "eval_runtime": 1.5562, + "eval_samples_per_second": 79.039, + "eval_steps_per_second": 10.281, + "step": 21948 + }, + { + "epoch": 354.84, + "learning_rate": 2.0322580645161293e-08, + "loss": 10.033, + "step": 22000 + }, + { + "epoch": 355.0, + "eval_loss": 10.975112915039062, + "eval_runtime": 1.5594, + "eval_samples_per_second": 78.876, + "eval_steps_per_second": 10.26, + "step": 22010 + }, + { + "epoch": 356.0, + "eval_loss": 10.856056213378906, + "eval_runtime": 1.5565, + "eval_samples_per_second": 79.022, + "eval_steps_per_second": 10.279, + "step": 22072 + }, + { + "epoch": 357.0, + "eval_loss": 10.960885047912598, + "eval_runtime": 1.5772, + "eval_samples_per_second": 77.988, + "eval_steps_per_second": 10.145, + "step": 22134 + }, + { + "epoch": 358.0, + "eval_loss": 11.000205993652344, + "eval_runtime": 1.5685, + "eval_samples_per_second": 78.418, + "eval_steps_per_second": 10.201, + "step": 22196 + }, + { + "epoch": 359.0, + "eval_loss": 11.056760787963867, + "eval_runtime": 1.5597, + "eval_samples_per_second": 78.862, + "eval_steps_per_second": 10.259, + "step": 22258 + }, + { + "epoch": 360.0, + "eval_loss": 10.958597183227539, + "eval_runtime": 1.5568, + "eval_samples_per_second": 79.009, + "eval_steps_per_second": 10.278, + "step": 22320 + }, + { + "epoch": 361.0, + "eval_loss": 10.925392150878906, + "eval_runtime": 1.5568, + "eval_samples_per_second": 79.007, + "eval_steps_per_second": 10.277, + "step": 22382 + }, + { + "epoch": 362.0, + "eval_loss": 11.065186500549316, + "eval_runtime": 1.5511, + "eval_samples_per_second": 79.297, + "eval_steps_per_second": 10.315, + "step": 22444 + }, + { + "epoch": 362.9, + "learning_rate": 1.9193548387096775e-08, + "loss": 10.0084, + "step": 22500 + }, + { + "epoch": 363.0, + "eval_loss": 10.965627670288086, + "eval_runtime": 1.5573, + "eval_samples_per_second": 78.985, + "eval_steps_per_second": 10.274, + "step": 22506 + }, + { + "epoch": 364.0, + "eval_loss": 11.061145782470703, + "eval_runtime": 1.5616, + "eval_samples_per_second": 78.765, + "eval_steps_per_second": 10.246, + "step": 22568 + }, + { + "epoch": 365.0, + "eval_loss": 10.833497047424316, + "eval_runtime": 1.5633, + "eval_samples_per_second": 78.678, + "eval_steps_per_second": 10.235, + "step": 22630 + }, + { + "epoch": 366.0, + "eval_loss": 10.980859756469727, + "eval_runtime": 1.5593, + "eval_samples_per_second": 78.883, + "eval_steps_per_second": 10.261, + "step": 22692 + }, + { + "epoch": 367.0, + "eval_loss": 11.020894050598145, + "eval_runtime": 1.5581, + "eval_samples_per_second": 78.94, + "eval_steps_per_second": 10.269, + "step": 22754 + }, + { + "epoch": 368.0, + "eval_loss": 10.91490364074707, + "eval_runtime": 1.574, + "eval_samples_per_second": 78.142, + "eval_steps_per_second": 10.165, + "step": 22816 + }, + { + "epoch": 369.0, + "eval_loss": 11.020668029785156, + "eval_runtime": 1.5554, + "eval_samples_per_second": 79.078, + "eval_steps_per_second": 10.287, + "step": 22878 + }, + { + "epoch": 370.0, + "eval_loss": 10.877435684204102, + "eval_runtime": 1.5496, + "eval_samples_per_second": 79.374, + "eval_steps_per_second": 10.325, + "step": 22940 + }, + { + "epoch": 370.97, + "learning_rate": 1.806451612903226e-08, + "loss": 10.024, + "step": 23000 + }, + { + "epoch": 371.0, + "eval_loss": 10.985451698303223, + "eval_runtime": 1.5536, + "eval_samples_per_second": 79.173, + "eval_steps_per_second": 10.299, + "step": 23002 + }, + { + "epoch": 372.0, + "eval_loss": 10.90639877319336, + "eval_runtime": 1.5566, + "eval_samples_per_second": 79.018, + "eval_steps_per_second": 10.279, + "step": 23064 + }, + { + "epoch": 373.0, + "eval_loss": 10.95183277130127, + "eval_runtime": 1.5635, + "eval_samples_per_second": 78.669, + "eval_steps_per_second": 10.233, + "step": 23126 + }, + { + "epoch": 374.0, + "eval_loss": 10.877464294433594, + "eval_runtime": 1.5564, + "eval_samples_per_second": 79.031, + "eval_steps_per_second": 10.28, + "step": 23188 + }, + { + "epoch": 375.0, + "eval_loss": 10.91928768157959, + "eval_runtime": 1.5575, + "eval_samples_per_second": 78.971, + "eval_steps_per_second": 10.273, + "step": 23250 + }, + { + "epoch": 376.0, + "eval_loss": 11.109807968139648, + "eval_runtime": 1.5563, + "eval_samples_per_second": 79.033, + "eval_steps_per_second": 10.281, + "step": 23312 + }, + { + "epoch": 377.0, + "eval_loss": 11.148626327514648, + "eval_runtime": 1.5493, + "eval_samples_per_second": 79.392, + "eval_steps_per_second": 10.327, + "step": 23374 + }, + { + "epoch": 378.0, + "eval_loss": 11.089497566223145, + "eval_runtime": 1.5491, + "eval_samples_per_second": 79.4, + "eval_steps_per_second": 10.329, + "step": 23436 + }, + { + "epoch": 379.0, + "eval_loss": 10.915903091430664, + "eval_runtime": 1.5528, + "eval_samples_per_second": 79.214, + "eval_steps_per_second": 10.304, + "step": 23498 + }, + { + "epoch": 379.03, + "learning_rate": 1.6935483870967743e-08, + "loss": 10.0052, + "step": 23500 + }, + { + "epoch": 380.0, + "eval_loss": 11.109511375427246, + "eval_runtime": 1.5576, + "eval_samples_per_second": 78.966, + "eval_steps_per_second": 10.272, + "step": 23560 + }, + { + "epoch": 381.0, + "eval_loss": 11.094901084899902, + "eval_runtime": 1.5561, + "eval_samples_per_second": 79.044, + "eval_steps_per_second": 10.282, + "step": 23622 + }, + { + "epoch": 382.0, + "eval_loss": 10.983734130859375, + "eval_runtime": 1.5557, + "eval_samples_per_second": 79.066, + "eval_steps_per_second": 10.285, + "step": 23684 + }, + { + "epoch": 383.0, + "eval_loss": 10.943252563476562, + "eval_runtime": 1.577, + "eval_samples_per_second": 77.998, + "eval_steps_per_second": 10.146, + "step": 23746 + }, + { + "epoch": 384.0, + "eval_loss": 10.910370826721191, + "eval_runtime": 1.5682, + "eval_samples_per_second": 78.434, + "eval_steps_per_second": 10.203, + "step": 23808 + }, + { + "epoch": 385.0, + "eval_loss": 10.95122241973877, + "eval_runtime": 1.5515, + "eval_samples_per_second": 79.28, + "eval_steps_per_second": 10.313, + "step": 23870 + }, + { + "epoch": 386.0, + "eval_loss": 11.107484817504883, + "eval_runtime": 1.5487, + "eval_samples_per_second": 79.422, + "eval_steps_per_second": 10.331, + "step": 23932 + }, + { + "epoch": 387.0, + "eval_loss": 10.865962982177734, + "eval_runtime": 1.5535, + "eval_samples_per_second": 79.176, + "eval_steps_per_second": 10.299, + "step": 23994 + }, + { + "epoch": 387.1, + "learning_rate": 1.5806451612903228e-08, + "loss": 10.0218, + "step": 24000 + }, + { + "epoch": 388.0, + "eval_loss": 11.052102088928223, + "eval_runtime": 1.5572, + "eval_samples_per_second": 78.988, + "eval_steps_per_second": 10.275, + "step": 24056 + }, + { + "epoch": 389.0, + "eval_loss": 10.898940086364746, + "eval_runtime": 1.5627, + "eval_samples_per_second": 78.712, + "eval_steps_per_second": 10.239, + "step": 24118 + }, + { + "epoch": 390.0, + "eval_loss": 10.913924217224121, + "eval_runtime": 1.5799, + "eval_samples_per_second": 77.852, + "eval_steps_per_second": 10.127, + "step": 24180 + }, + { + "epoch": 391.0, + "eval_loss": 11.10788345336914, + "eval_runtime": 1.5783, + "eval_samples_per_second": 77.934, + "eval_steps_per_second": 10.138, + "step": 24242 + }, + { + "epoch": 392.0, + "eval_loss": 10.906310081481934, + "eval_runtime": 1.5568, + "eval_samples_per_second": 79.006, + "eval_steps_per_second": 10.277, + "step": 24304 + }, + { + "epoch": 393.0, + "eval_loss": 11.085139274597168, + "eval_runtime": 1.5494, + "eval_samples_per_second": 79.384, + "eval_steps_per_second": 10.326, + "step": 24366 + }, + { + "epoch": 394.0, + "eval_loss": 10.980916023254395, + "eval_runtime": 1.5522, + "eval_samples_per_second": 79.244, + "eval_steps_per_second": 10.308, + "step": 24428 + }, + { + "epoch": 395.0, + "eval_loss": 10.932045936584473, + "eval_runtime": 1.5493, + "eval_samples_per_second": 79.389, + "eval_steps_per_second": 10.327, + "step": 24490 + }, + { + "epoch": 395.16, + "learning_rate": 1.467741935483871e-08, + "loss": 10.0039, + "step": 24500 + }, + { + "epoch": 396.0, + "eval_loss": 11.052530288696289, + "eval_runtime": 1.5595, + "eval_samples_per_second": 78.871, + "eval_steps_per_second": 10.26, + "step": 24552 + }, + { + "epoch": 397.0, + "eval_loss": 11.037784576416016, + "eval_runtime": 1.5581, + "eval_samples_per_second": 78.943, + "eval_steps_per_second": 10.269, + "step": 24614 + }, + { + "epoch": 398.0, + "eval_loss": 11.000636100769043, + "eval_runtime": 1.5574, + "eval_samples_per_second": 78.979, + "eval_steps_per_second": 10.274, + "step": 24676 + }, + { + "epoch": 399.0, + "eval_loss": 11.028385162353516, + "eval_runtime": 1.5601, + "eval_samples_per_second": 78.842, + "eval_steps_per_second": 10.256, + "step": 24738 + }, + { + "epoch": 400.0, + "eval_loss": 11.005521774291992, + "eval_runtime": 1.5704, + "eval_samples_per_second": 78.326, + "eval_steps_per_second": 10.189, + "step": 24800 + }, + { + "epoch": 401.0, + "eval_loss": 11.010418891906738, + "eval_runtime": 1.55, + "eval_samples_per_second": 79.356, + "eval_steps_per_second": 10.323, + "step": 24862 + }, + { + "epoch": 402.0, + "eval_loss": 11.11683464050293, + "eval_runtime": 1.5496, + "eval_samples_per_second": 79.377, + "eval_steps_per_second": 10.325, + "step": 24924 + }, + { + "epoch": 403.0, + "eval_loss": 10.831143379211426, + "eval_runtime": 1.5514, + "eval_samples_per_second": 79.284, + "eval_steps_per_second": 10.313, + "step": 24986 + }, + { + "epoch": 403.23, + "learning_rate": 1.3548387096774193e-08, + "loss": 10.0186, + "step": 25000 + }, + { + "epoch": 404.0, + "eval_loss": 10.890425682067871, + "eval_runtime": 1.5613, + "eval_samples_per_second": 78.78, + "eval_steps_per_second": 10.248, + "step": 25048 + }, + { + "epoch": 405.0, + "eval_loss": 10.861490249633789, + "eval_runtime": 1.5685, + "eval_samples_per_second": 78.418, + "eval_steps_per_second": 10.201, + "step": 25110 + }, + { + "epoch": 406.0, + "eval_loss": 10.951952934265137, + "eval_runtime": 1.5597, + "eval_samples_per_second": 78.863, + "eval_steps_per_second": 10.259, + "step": 25172 + }, + { + "epoch": 407.0, + "eval_loss": 10.911187171936035, + "eval_runtime": 1.5579, + "eval_samples_per_second": 78.953, + "eval_steps_per_second": 10.27, + "step": 25234 + }, + { + "epoch": 408.0, + "eval_loss": 11.027958869934082, + "eval_runtime": 1.5537, + "eval_samples_per_second": 79.165, + "eval_steps_per_second": 10.298, + "step": 25296 + }, + { + "epoch": 409.0, + "eval_loss": 10.828099250793457, + "eval_runtime": 1.5453, + "eval_samples_per_second": 79.597, + "eval_steps_per_second": 10.354, + "step": 25358 + }, + { + "epoch": 410.0, + "eval_loss": 11.187531471252441, + "eval_runtime": 1.5496, + "eval_samples_per_second": 79.375, + "eval_steps_per_second": 10.325, + "step": 25420 + }, + { + "epoch": 411.0, + "eval_loss": 11.00891399383545, + "eval_runtime": 1.5524, + "eval_samples_per_second": 79.231, + "eval_steps_per_second": 10.306, + "step": 25482 + }, + { + "epoch": 411.29, + "learning_rate": 1.241935483870968e-08, + "loss": 10.0146, + "step": 25500 + }, + { + "epoch": 412.0, + "eval_loss": 11.038575172424316, + "eval_runtime": 1.5665, + "eval_samples_per_second": 78.518, + "eval_steps_per_second": 10.214, + "step": 25544 + }, + { + "epoch": 413.0, + "eval_loss": 10.815169334411621, + "eval_runtime": 1.558, + "eval_samples_per_second": 78.948, + "eval_steps_per_second": 10.27, + "step": 25606 + }, + { + "epoch": 414.0, + "eval_loss": 10.946043014526367, + "eval_runtime": 1.5604, + "eval_samples_per_second": 78.824, + "eval_steps_per_second": 10.254, + "step": 25668 + }, + { + "epoch": 415.0, + "eval_loss": 10.9921875, + "eval_runtime": 1.5586, + "eval_samples_per_second": 78.917, + "eval_steps_per_second": 10.266, + "step": 25730 + }, + { + "epoch": 416.0, + "eval_loss": 11.085003852844238, + "eval_runtime": 1.5606, + "eval_samples_per_second": 78.817, + "eval_steps_per_second": 10.253, + "step": 25792 + }, + { + "epoch": 417.0, + "eval_loss": 11.002412796020508, + "eval_runtime": 1.5515, + "eval_samples_per_second": 79.276, + "eval_steps_per_second": 10.312, + "step": 25854 + }, + { + "epoch": 418.0, + "eval_loss": 11.093355178833008, + "eval_runtime": 1.5531, + "eval_samples_per_second": 79.196, + "eval_steps_per_second": 10.302, + "step": 25916 + }, + { + "epoch": 419.0, + "eval_loss": 11.095279693603516, + "eval_runtime": 1.5495, + "eval_samples_per_second": 79.379, + "eval_steps_per_second": 10.326, + "step": 25978 + }, + { + "epoch": 419.35, + "learning_rate": 1.1290322580645162e-08, + "loss": 10.0251, + "step": 26000 + }, + { + "epoch": 420.0, + "eval_loss": 10.89963150024414, + "eval_runtime": 1.5606, + "eval_samples_per_second": 78.815, + "eval_steps_per_second": 10.252, + "step": 26040 + }, + { + "epoch": 421.0, + "eval_loss": 10.983254432678223, + "eval_runtime": 1.5556, + "eval_samples_per_second": 79.068, + "eval_steps_per_second": 10.285, + "step": 26102 + }, + { + "epoch": 422.0, + "eval_loss": 10.754561424255371, + "eval_runtime": 1.558, + "eval_samples_per_second": 78.947, + "eval_steps_per_second": 10.27, + "step": 26164 + }, + { + "epoch": 423.0, + "eval_loss": 10.976980209350586, + "eval_runtime": 1.577, + "eval_samples_per_second": 77.995, + "eval_steps_per_second": 10.146, + "step": 26226 + }, + { + "epoch": 424.0, + "eval_loss": 11.063197135925293, + "eval_runtime": 1.5683, + "eval_samples_per_second": 78.427, + "eval_steps_per_second": 10.202, + "step": 26288 + }, + { + "epoch": 425.0, + "eval_loss": 10.924360275268555, + "eval_runtime": 1.5498, + "eval_samples_per_second": 79.364, + "eval_steps_per_second": 10.324, + "step": 26350 + }, + { + "epoch": 426.0, + "eval_loss": 10.937601089477539, + "eval_runtime": 1.5495, + "eval_samples_per_second": 79.379, + "eval_steps_per_second": 10.326, + "step": 26412 + }, + { + "epoch": 427.0, + "eval_loss": 11.044069290161133, + "eval_runtime": 1.5501, + "eval_samples_per_second": 79.351, + "eval_steps_per_second": 10.322, + "step": 26474 + }, + { + "epoch": 427.42, + "learning_rate": 1.0161290322580647e-08, + "loss": 9.9776, + "step": 26500 + }, + { + "epoch": 428.0, + "eval_loss": 11.067927360534668, + "eval_runtime": 1.5604, + "eval_samples_per_second": 78.827, + "eval_steps_per_second": 10.254, + "step": 26536 + }, + { + "epoch": 429.0, + "eval_loss": 11.066262245178223, + "eval_runtime": 1.5875, + "eval_samples_per_second": 77.482, + "eval_steps_per_second": 10.079, + "step": 26598 + }, + { + "epoch": 430.0, + "eval_loss": 10.988542556762695, + "eval_runtime": 1.5716, + "eval_samples_per_second": 78.264, + "eval_steps_per_second": 10.181, + "step": 26660 + }, + { + "epoch": 431.0, + "eval_loss": 10.956550598144531, + "eval_runtime": 1.558, + "eval_samples_per_second": 78.949, + "eval_steps_per_second": 10.27, + "step": 26722 + }, + { + "epoch": 432.0, + "eval_loss": 11.156039237976074, + "eval_runtime": 1.5566, + "eval_samples_per_second": 79.019, + "eval_steps_per_second": 10.279, + "step": 26784 + }, + { + "epoch": 433.0, + "eval_loss": 11.059048652648926, + "eval_runtime": 1.5499, + "eval_samples_per_second": 79.358, + "eval_steps_per_second": 10.323, + "step": 26846 + }, + { + "epoch": 434.0, + "eval_loss": 11.01535415649414, + "eval_runtime": 1.5524, + "eval_samples_per_second": 79.233, + "eval_steps_per_second": 10.307, + "step": 26908 + }, + { + "epoch": 435.0, + "eval_loss": 10.964756965637207, + "eval_runtime": 1.553, + "eval_samples_per_second": 79.204, + "eval_steps_per_second": 10.303, + "step": 26970 + }, + { + "epoch": 435.48, + "learning_rate": 9.03225806451613e-09, + "loss": 10.0079, + "step": 27000 + }, + { + "epoch": 436.0, + "eval_loss": 11.084016799926758, + "eval_runtime": 1.559, + "eval_samples_per_second": 78.895, + "eval_steps_per_second": 10.263, + "step": 27032 + }, + { + "epoch": 437.0, + "eval_loss": 10.810328483581543, + "eval_runtime": 1.5654, + "eval_samples_per_second": 78.574, + "eval_steps_per_second": 10.221, + "step": 27094 + }, + { + "epoch": 438.0, + "eval_loss": 10.89944839477539, + "eval_runtime": 1.557, + "eval_samples_per_second": 79.0, + "eval_steps_per_second": 10.276, + "step": 27156 + }, + { + "epoch": 439.0, + "eval_loss": 11.036003112792969, + "eval_runtime": 1.5567, + "eval_samples_per_second": 79.015, + "eval_steps_per_second": 10.278, + "step": 27218 + }, + { + "epoch": 440.0, + "eval_loss": 10.909425735473633, + "eval_runtime": 1.5567, + "eval_samples_per_second": 79.013, + "eval_steps_per_second": 10.278, + "step": 27280 + }, + { + "epoch": 441.0, + "eval_loss": 10.940203666687012, + "eval_runtime": 1.5862, + "eval_samples_per_second": 77.542, + "eval_steps_per_second": 10.087, + "step": 27342 + }, + { + "epoch": 442.0, + "eval_loss": 10.877068519592285, + "eval_runtime": 1.5493, + "eval_samples_per_second": 79.388, + "eval_steps_per_second": 10.327, + "step": 27404 + }, + { + "epoch": 443.0, + "eval_loss": 11.093377113342285, + "eval_runtime": 1.5504, + "eval_samples_per_second": 79.335, + "eval_steps_per_second": 10.32, + "step": 27466 + }, + { + "epoch": 443.55, + "learning_rate": 7.903225806451614e-09, + "loss": 10.0049, + "step": 27500 + }, + { + "epoch": 444.0, + "eval_loss": 11.089668273925781, + "eval_runtime": 1.5636, + "eval_samples_per_second": 78.663, + "eval_steps_per_second": 10.233, + "step": 27528 + }, + { + "epoch": 445.0, + "eval_loss": 11.049309730529785, + "eval_runtime": 1.5562, + "eval_samples_per_second": 79.038, + "eval_steps_per_second": 10.281, + "step": 27590 + }, + { + "epoch": 446.0, + "eval_loss": 11.039472579956055, + "eval_runtime": 1.5633, + "eval_samples_per_second": 78.678, + "eval_steps_per_second": 10.235, + "step": 27652 + }, + { + "epoch": 447.0, + "eval_loss": 10.898978233337402, + "eval_runtime": 1.5657, + "eval_samples_per_second": 78.558, + "eval_steps_per_second": 10.219, + "step": 27714 + }, + { + "epoch": 448.0, + "eval_loss": 11.16734504699707, + "eval_runtime": 1.5575, + "eval_samples_per_second": 78.975, + "eval_steps_per_second": 10.273, + "step": 27776 + }, + { + "epoch": 449.0, + "eval_loss": 11.158203125, + "eval_runtime": 1.5524, + "eval_samples_per_second": 79.235, + "eval_steps_per_second": 10.307, + "step": 27838 + }, + { + "epoch": 450.0, + "eval_loss": 10.85914421081543, + "eval_runtime": 1.5491, + "eval_samples_per_second": 79.399, + "eval_steps_per_second": 10.328, + "step": 27900 + }, + { + "epoch": 451.0, + "eval_loss": 11.079207420349121, + "eval_runtime": 1.5491, + "eval_samples_per_second": 79.398, + "eval_steps_per_second": 10.328, + "step": 27962 + }, + { + "epoch": 451.61, + "learning_rate": 6.774193548387097e-09, + "loss": 9.9924, + "step": 28000 + }, + { + "epoch": 452.0, + "eval_loss": 11.144965171813965, + "eval_runtime": 1.5578, + "eval_samples_per_second": 78.958, + "eval_steps_per_second": 10.271, + "step": 28024 + }, + { + "epoch": 453.0, + "eval_loss": 10.981874465942383, + "eval_runtime": 1.5581, + "eval_samples_per_second": 78.94, + "eval_steps_per_second": 10.269, + "step": 28086 + }, + { + "epoch": 454.0, + "eval_loss": 10.887094497680664, + "eval_runtime": 1.5515, + "eval_samples_per_second": 79.28, + "eval_steps_per_second": 10.313, + "step": 28148 + }, + { + "epoch": 455.0, + "eval_loss": 10.99148178100586, + "eval_runtime": 1.558, + "eval_samples_per_second": 78.947, + "eval_steps_per_second": 10.27, + "step": 28210 + }, + { + "epoch": 456.0, + "eval_loss": 11.03730583190918, + "eval_runtime": 1.5584, + "eval_samples_per_second": 78.926, + "eval_steps_per_second": 10.267, + "step": 28272 + }, + { + "epoch": 457.0, + "eval_loss": 10.938337326049805, + "eval_runtime": 1.5696, + "eval_samples_per_second": 78.366, + "eval_steps_per_second": 10.194, + "step": 28334 + }, + { + "epoch": 458.0, + "eval_loss": 11.129241943359375, + "eval_runtime": 1.5563, + "eval_samples_per_second": 79.032, + "eval_steps_per_second": 10.281, + "step": 28396 + }, + { + "epoch": 459.0, + "eval_loss": 11.168819427490234, + "eval_runtime": 1.5502, + "eval_samples_per_second": 79.343, + "eval_steps_per_second": 10.321, + "step": 28458 + }, + { + "epoch": 459.68, + "learning_rate": 5.645161290322581e-09, + "loss": 9.9918, + "step": 28500 + }, + { + "epoch": 460.0, + "eval_loss": 11.043048858642578, + "eval_runtime": 1.5622, + "eval_samples_per_second": 78.736, + "eval_steps_per_second": 10.242, + "step": 28520 + }, + { + "epoch": 461.0, + "eval_loss": 11.071016311645508, + "eval_runtime": 1.5582, + "eval_samples_per_second": 78.938, + "eval_steps_per_second": 10.268, + "step": 28582 + }, + { + "epoch": 462.0, + "eval_loss": 11.011009216308594, + "eval_runtime": 1.554, + "eval_samples_per_second": 79.151, + "eval_steps_per_second": 10.296, + "step": 28644 + }, + { + "epoch": 463.0, + "eval_loss": 11.087599754333496, + "eval_runtime": 1.5566, + "eval_samples_per_second": 79.02, + "eval_steps_per_second": 10.279, + "step": 28706 + }, + { + "epoch": 464.0, + "eval_loss": 11.0982027053833, + "eval_runtime": 1.5598, + "eval_samples_per_second": 78.856, + "eval_steps_per_second": 10.258, + "step": 28768 + }, + { + "epoch": 465.0, + "eval_loss": 10.962496757507324, + "eval_runtime": 1.5771, + "eval_samples_per_second": 77.992, + "eval_steps_per_second": 10.145, + "step": 28830 + }, + { + "epoch": 466.0, + "eval_loss": 10.964188575744629, + "eval_runtime": 1.5492, + "eval_samples_per_second": 79.395, + "eval_steps_per_second": 10.328, + "step": 28892 + }, + { + "epoch": 467.0, + "eval_loss": 10.798909187316895, + "eval_runtime": 1.5512, + "eval_samples_per_second": 79.296, + "eval_steps_per_second": 10.315, + "step": 28954 + }, + { + "epoch": 467.74, + "learning_rate": 4.516129032258065e-09, + "loss": 9.9799, + "step": 29000 + }, + { + "epoch": 468.0, + "eval_loss": 11.00378131866455, + "eval_runtime": 1.562, + "eval_samples_per_second": 78.745, + "eval_steps_per_second": 10.243, + "step": 29016 + }, + { + "epoch": 469.0, + "eval_loss": 11.124489784240723, + "eval_runtime": 1.5579, + "eval_samples_per_second": 78.951, + "eval_steps_per_second": 10.27, + "step": 29078 + }, + { + "epoch": 470.0, + "eval_loss": 10.913480758666992, + "eval_runtime": 1.5763, + "eval_samples_per_second": 78.031, + "eval_steps_per_second": 10.15, + "step": 29140 + }, + { + "epoch": 471.0, + "eval_loss": 10.943198204040527, + "eval_runtime": 1.5563, + "eval_samples_per_second": 79.031, + "eval_steps_per_second": 10.281, + "step": 29202 + }, + { + "epoch": 472.0, + "eval_loss": 10.829460144042969, + "eval_runtime": 1.5569, + "eval_samples_per_second": 79.005, + "eval_steps_per_second": 10.277, + "step": 29264 + }, + { + "epoch": 473.0, + "eval_loss": 10.927714347839355, + "eval_runtime": 1.5542, + "eval_samples_per_second": 79.141, + "eval_steps_per_second": 10.295, + "step": 29326 + }, + { + "epoch": 474.0, + "eval_loss": 11.094922065734863, + "eval_runtime": 1.5516, + "eval_samples_per_second": 79.274, + "eval_steps_per_second": 10.312, + "step": 29388 + }, + { + "epoch": 475.0, + "eval_loss": 11.037424087524414, + "eval_runtime": 1.5502, + "eval_samples_per_second": 79.343, + "eval_steps_per_second": 10.321, + "step": 29450 + }, + { + "epoch": 475.81, + "learning_rate": 3.3870967741935484e-09, + "loss": 9.9961, + "step": 29500 + }, + { + "epoch": 476.0, + "eval_loss": 10.958832740783691, + "eval_runtime": 1.5596, + "eval_samples_per_second": 78.865, + "eval_steps_per_second": 10.259, + "step": 29512 + }, + { + "epoch": 477.0, + "eval_loss": 11.034053802490234, + "eval_runtime": 1.5537, + "eval_samples_per_second": 79.168, + "eval_steps_per_second": 10.298, + "step": 29574 + }, + { + "epoch": 478.0, + "eval_loss": 10.954147338867188, + "eval_runtime": 1.563, + "eval_samples_per_second": 78.695, + "eval_steps_per_second": 10.237, + "step": 29636 + }, + { + "epoch": 479.0, + "eval_loss": 10.934306144714355, + "eval_runtime": 1.5578, + "eval_samples_per_second": 78.957, + "eval_steps_per_second": 10.271, + "step": 29698 + }, + { + "epoch": 480.0, + "eval_loss": 10.959942817687988, + "eval_runtime": 1.5525, + "eval_samples_per_second": 79.225, + "eval_steps_per_second": 10.306, + "step": 29760 + }, + { + "epoch": 481.0, + "eval_loss": 10.95332145690918, + "eval_runtime": 1.5572, + "eval_samples_per_second": 78.988, + "eval_steps_per_second": 10.275, + "step": 29822 + }, + { + "epoch": 482.0, + "eval_loss": 11.204573631286621, + "eval_runtime": 1.5776, + "eval_samples_per_second": 77.964, + "eval_steps_per_second": 10.142, + "step": 29884 + }, + { + "epoch": 483.0, + "eval_loss": 10.935157775878906, + "eval_runtime": 1.5461, + "eval_samples_per_second": 79.557, + "eval_steps_per_second": 10.349, + "step": 29946 + }, + { + "epoch": 483.87, + "learning_rate": 2.2580645161290324e-09, + "loss": 10.006, + "step": 30000 + }, + { + "epoch": 484.0, + "eval_loss": 11.172720909118652, + "eval_runtime": 1.5619, + "eval_samples_per_second": 78.751, + "eval_steps_per_second": 10.244, + "step": 30008 + }, + { + "epoch": 485.0, + "eval_loss": 10.99561595916748, + "eval_runtime": 1.5552, + "eval_samples_per_second": 79.09, + "eval_steps_per_second": 10.288, + "step": 30070 + }, + { + "epoch": 486.0, + "eval_loss": 11.166374206542969, + "eval_runtime": 1.5713, + "eval_samples_per_second": 78.281, + "eval_steps_per_second": 10.183, + "step": 30132 + }, + { + "epoch": 487.0, + "eval_loss": 11.094931602478027, + "eval_runtime": 1.553, + "eval_samples_per_second": 79.2, + "eval_steps_per_second": 10.302, + "step": 30194 + }, + { + "epoch": 488.0, + "eval_loss": 10.948943138122559, + "eval_runtime": 1.5549, + "eval_samples_per_second": 79.102, + "eval_steps_per_second": 10.29, + "step": 30256 + }, + { + "epoch": 489.0, + "eval_loss": 11.004921913146973, + "eval_runtime": 1.5546, + "eval_samples_per_second": 79.121, + "eval_steps_per_second": 10.292, + "step": 30318 + }, + { + "epoch": 490.0, + "eval_loss": 11.18515396118164, + "eval_runtime": 1.5529, + "eval_samples_per_second": 79.207, + "eval_steps_per_second": 10.303, + "step": 30380 + }, + { + "epoch": 491.0, + "eval_loss": 11.10179615020752, + "eval_runtime": 1.5524, + "eval_samples_per_second": 79.232, + "eval_steps_per_second": 10.307, + "step": 30442 + }, + { + "epoch": 491.94, + "learning_rate": 1.1290322580645162e-09, + "loss": 10.0083, + "step": 30500 + }, + { + "epoch": 492.0, + "eval_loss": 10.913029670715332, + "eval_runtime": 1.5534, + "eval_samples_per_second": 79.18, + "eval_steps_per_second": 10.3, + "step": 30504 + }, + { + "epoch": 493.0, + "eval_loss": 10.935208320617676, + "eval_runtime": 1.5558, + "eval_samples_per_second": 79.058, + "eval_steps_per_second": 10.284, + "step": 30566 + }, + { + "epoch": 494.0, + "eval_loss": 10.9716157913208, + "eval_runtime": 1.5545, + "eval_samples_per_second": 79.127, + "eval_steps_per_second": 10.293, + "step": 30628 + }, + { + "epoch": 495.0, + "eval_loss": 11.111549377441406, + "eval_runtime": 1.5613, + "eval_samples_per_second": 78.778, + "eval_steps_per_second": 10.248, + "step": 30690 + }, + { + "epoch": 496.0, + "eval_loss": 10.980956077575684, + "eval_runtime": 1.5565, + "eval_samples_per_second": 79.022, + "eval_steps_per_second": 10.279, + "step": 30752 + }, + { + "epoch": 497.0, + "eval_loss": 10.970294952392578, + "eval_runtime": 1.5649, + "eval_samples_per_second": 78.597, + "eval_steps_per_second": 10.224, + "step": 30814 + }, + { + "epoch": 498.0, + "eval_loss": 10.939518928527832, + "eval_runtime": 1.547, + "eval_samples_per_second": 79.51, + "eval_steps_per_second": 10.343, + "step": 30876 + }, + { + "epoch": 499.0, + "eval_loss": 11.018895149230957, + "eval_runtime": 1.5496, + "eval_samples_per_second": 79.375, + "eval_steps_per_second": 10.325, + "step": 30938 + }, + { + "epoch": 500.0, + "learning_rate": 0.0, + "loss": 10.0106, + "step": 31000 + }, + { + "epoch": 500.0, + "eval_loss": 10.97592544555664, + "eval_runtime": 1.5827, + "eval_samples_per_second": 77.713, + "eval_steps_per_second": 10.109, + "step": 31000 + }, + { + "epoch": 500.0, + "step": 31000, + "total_flos": 1.6136618439168e+16, + "train_loss": 11.119525689894154, + "train_runtime": 17411.7991, + "train_samples_per_second": 14.042, + "train_steps_per_second": 1.78 } ], - "max_steps": 3100, - "num_train_epochs": 50, - "total_flos": 1620261687859200.0, + "max_steps": 31000, + "num_train_epochs": 500, + "total_flos": 1.6136618439168e+16, "trial_name": null, "trial_params": null }