diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5558 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 13.280975609756098, + "global_step": 850, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "learning_rate": 6.25e-06, + "loss": 2.9098, + "step": 1 + }, + { + "epoch": 0.03, + "learning_rate": 1.25e-05, + "loss": 2.8818, + "step": 2 + }, + { + "epoch": 0.05, + "learning_rate": 1.875e-05, + "loss": 2.9635, + "step": 3 + }, + { + "epoch": 0.06, + "learning_rate": 2.5e-05, + "loss": 2.8696, + "step": 4 + }, + { + "epoch": 0.08, + "learning_rate": 3.125e-05, + "loss": 2.8205, + "step": 5 + }, + { + "epoch": 0.09, + "learning_rate": 3.75e-05, + "loss": 2.7737, + "step": 6 + }, + { + "epoch": 0.11, + "learning_rate": 4.375e-05, + "loss": 2.751, + "step": 7 + }, + { + "epoch": 0.12, + "learning_rate": 5e-05, + "loss": 2.7525, + "step": 8 + }, + { + "epoch": 0.14, + "learning_rate": 5.6250000000000005e-05, + "loss": 2.7303, + "step": 9 + }, + { + "epoch": 0.16, + "learning_rate": 6.25e-05, + "loss": 2.7412, + "step": 10 + }, + { + "epoch": 0.17, + "learning_rate": 6.875e-05, + "loss": 2.4844, + "step": 11 + }, + { + "epoch": 0.19, + "learning_rate": 7.5e-05, + "loss": 2.5187, + "step": 12 + }, + { + "epoch": 0.2, + "learning_rate": 8.125000000000001e-05, + "loss": 2.4352, + "step": 13 + }, + { + "epoch": 0.22, + "learning_rate": 8.75e-05, + "loss": 2.3852, + "step": 14 + }, + { + "epoch": 0.23, + "learning_rate": 9.375e-05, + "loss": 2.388, + "step": 15 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001, + "loss": 2.4145, + "step": 16 + }, + { + "epoch": 0.27, + "learning_rate": 0.00010625, + "loss": 2.3989, + "step": 17 + }, + { + "epoch": 0.28, + "learning_rate": 0.00011250000000000001, + "loss": 2.2944, + "step": 18 + }, + { + "epoch": 0.3, + "learning_rate": 0.00011875, + "loss": 2.2215, + "step": 19 + }, + { + "epoch": 0.31, + "learning_rate": 0.000125, + "loss": 2.1633, + "step": 20 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013125000000000002, + "loss": 2.2124, + "step": 21 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001375, + "loss": 2.1828, + "step": 22 + }, + { + "epoch": 0.36, + "learning_rate": 0.00014375, + "loss": 2.0791, + "step": 23 + }, + { + "epoch": 0.37, + "learning_rate": 0.00015, + "loss": 2.1757, + "step": 24 + }, + { + "epoch": 0.39, + "learning_rate": 0.00015625, + "loss": 2.1198, + "step": 25 + }, + { + "epoch": 0.39, + "eval_gen_len": 805.336, + "eval_loss": 1.8719514608383179, + "eval_rouge1": 29.4332, + "eval_rouge2": 7.3761, + "eval_rougeL": 17.0816, + "eval_rougeLsum": 25.065, + "eval_runtime": 886.4717, + "eval_samples_per_second": 0.282, + "eval_steps_per_second": 0.282, + "step": 25 + }, + { + "epoch": 0.41, + "learning_rate": 0.00016250000000000002, + "loss": 2.1116, + "step": 26 + }, + { + "epoch": 0.42, + "learning_rate": 0.00016875, + "loss": 2.0973, + "step": 27 + }, + { + "epoch": 0.44, + "learning_rate": 0.000175, + "loss": 2.0698, + "step": 28 + }, + { + "epoch": 0.45, + "learning_rate": 0.00018125, + "loss": 2.1397, + "step": 29 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001875, + "loss": 2.0961, + "step": 30 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019375000000000002, + "loss": 2.0803, + "step": 31 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002, + "loss": 2.1034, + "step": 32 + }, + { + "epoch": 0.52, + "learning_rate": 0.00020625, + "loss": 2.0722, + "step": 33 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002125, + "loss": 2.0459, + "step": 34 + }, + { + "epoch": 0.55, + "learning_rate": 0.00021875, + "loss": 2.0633, + "step": 35 + }, + { + "epoch": 0.56, + "learning_rate": 0.00022500000000000002, + "loss": 1.9343, + "step": 36 + }, + { + "epoch": 0.58, + "learning_rate": 0.00023125, + "loss": 1.9526, + "step": 37 + }, + { + "epoch": 0.59, + "learning_rate": 0.0002375, + "loss": 1.9989, + "step": 38 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024375, + "loss": 1.9767, + "step": 39 + }, + { + "epoch": 0.62, + "learning_rate": 0.00025, + "loss": 1.9694, + "step": 40 + }, + { + "epoch": 0.64, + "learning_rate": 0.00025624999999999997, + "loss": 1.9294, + "step": 41 + }, + { + "epoch": 0.66, + "learning_rate": 0.00026250000000000004, + "loss": 1.9794, + "step": 42 + }, + { + "epoch": 0.67, + "learning_rate": 0.00026875, + "loss": 1.9808, + "step": 43 + }, + { + "epoch": 0.69, + "learning_rate": 0.000275, + "loss": 1.9719, + "step": 44 + }, + { + "epoch": 0.7, + "learning_rate": 0.00028125000000000003, + "loss": 1.9447, + "step": 45 + }, + { + "epoch": 0.72, + "learning_rate": 0.0002875, + "loss": 1.9479, + "step": 46 + }, + { + "epoch": 0.73, + "learning_rate": 0.00029375, + "loss": 1.95, + "step": 47 + }, + { + "epoch": 0.75, + "learning_rate": 0.0003, + "loss": 1.9718, + "step": 48 + }, + { + "epoch": 0.76, + "learning_rate": 0.00030625000000000004, + "loss": 1.8785, + "step": 49 + }, + { + "epoch": 0.78, + "learning_rate": 0.0003125, + "loss": 1.8609, + "step": 50 + }, + { + "epoch": 0.78, + "eval_gen_len": 833.404, + "eval_loss": 1.7601044178009033, + "eval_rouge1": 35.3533, + "eval_rouge2": 10.6624, + "eval_rougeL": 18.643, + "eval_rougeLsum": 31.6979, + "eval_runtime": 912.2832, + "eval_samples_per_second": 0.274, + "eval_steps_per_second": 0.274, + "step": 50 + }, + { + "epoch": 0.8, + "learning_rate": 0.00031874999999999997, + "loss": 1.9477, + "step": 51 + }, + { + "epoch": 0.81, + "learning_rate": 0.00032500000000000004, + "loss": 1.8888, + "step": 52 + }, + { + "epoch": 0.83, + "learning_rate": 0.00033125, + "loss": 1.8034, + "step": 53 + }, + { + "epoch": 0.84, + "learning_rate": 0.0003375, + "loss": 1.9102, + "step": 54 + }, + { + "epoch": 0.86, + "learning_rate": 0.00034375, + "loss": 1.9455, + "step": 55 + }, + { + "epoch": 0.87, + "learning_rate": 0.00035, + "loss": 1.8725, + "step": 56 + }, + { + "epoch": 0.89, + "learning_rate": 0.00035625, + "loss": 1.8375, + "step": 57 + }, + { + "epoch": 0.91, + "learning_rate": 0.0003625, + "loss": 1.8795, + "step": 58 + }, + { + "epoch": 0.92, + "learning_rate": 0.00036875000000000005, + "loss": 1.7643, + "step": 59 + }, + { + "epoch": 0.94, + "learning_rate": 0.000375, + "loss": 1.7923, + "step": 60 + }, + { + "epoch": 0.95, + "learning_rate": 0.00038124999999999997, + "loss": 1.869, + "step": 61 + }, + { + "epoch": 0.97, + "learning_rate": 0.00038750000000000004, + "loss": 1.8725, + "step": 62 + }, + { + "epoch": 0.98, + "learning_rate": 0.00039375, + "loss": 1.9809, + "step": 63 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004, + "loss": 1.8203, + "step": 64 + }, + { + "epoch": 1.02, + "learning_rate": 0.00040625000000000004, + "loss": 1.9199, + "step": 65 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004125, + "loss": 1.8922, + "step": 66 + }, + { + "epoch": 1.05, + "learning_rate": 0.00041875, + "loss": 1.8536, + "step": 67 + }, + { + "epoch": 1.06, + "learning_rate": 0.000425, + "loss": 1.8278, + "step": 68 + }, + { + "epoch": 1.08, + "learning_rate": 0.00043125000000000005, + "loss": 1.79, + "step": 69 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004375, + "loss": 1.8595, + "step": 70 + }, + { + "epoch": 1.11, + "learning_rate": 0.00044374999999999997, + "loss": 1.7372, + "step": 71 + }, + { + "epoch": 1.12, + "learning_rate": 0.00045000000000000004, + "loss": 1.7542, + "step": 72 + }, + { + "epoch": 1.14, + "learning_rate": 0.00045625, + "loss": 1.7291, + "step": 73 + }, + { + "epoch": 1.16, + "learning_rate": 0.0004625, + "loss": 1.8462, + "step": 74 + }, + { + "epoch": 1.17, + "learning_rate": 0.00046875, + "loss": 1.7805, + "step": 75 + }, + { + "epoch": 1.17, + "eval_gen_len": 866.356, + "eval_loss": 1.683250069618225, + "eval_rouge1": 36.5786, + "eval_rouge2": 11.1185, + "eval_rougeL": 20.0358, + "eval_rougeLsum": 33.2116, + "eval_runtime": 941.453, + "eval_samples_per_second": 0.266, + "eval_steps_per_second": 0.266, + "step": 75 + }, + { + "epoch": 1.19, + "learning_rate": 0.000475, + "loss": 1.7477, + "step": 76 + }, + { + "epoch": 1.2, + "learning_rate": 0.00048125, + "loss": 1.784, + "step": 77 + }, + { + "epoch": 1.22, + "learning_rate": 0.0004875, + "loss": 1.8665, + "step": 78 + }, + { + "epoch": 1.23, + "learning_rate": 0.00049375, + "loss": 1.7594, + "step": 79 + }, + { + "epoch": 1.25, + "learning_rate": 0.0005, + "loss": 1.7236, + "step": 80 + }, + { + "epoch": 1.27, + "learning_rate": 0.0004999994660231515, + "loss": 1.8202, + "step": 81 + }, + { + "epoch": 1.28, + "learning_rate": 0.0004999978640948868, + "loss": 1.708, + "step": 82 + }, + { + "epoch": 1.3, + "learning_rate": 0.0004999951942220491, + "loss": 1.7961, + "step": 83 + }, + { + "epoch": 1.31, + "learning_rate": 0.0004999914564160436, + "loss": 1.771, + "step": 84 + }, + { + "epoch": 1.33, + "learning_rate": 0.0004999866506928376, + "loss": 1.7643, + "step": 85 + }, + { + "epoch": 1.34, + "learning_rate": 0.0004999807770729603, + "loss": 1.8169, + "step": 86 + }, + { + "epoch": 1.36, + "learning_rate": 0.0004999738355815026, + "loss": 1.7454, + "step": 87 + }, + { + "epoch": 1.37, + "learning_rate": 0.0004999658262481172, + "loss": 1.8016, + "step": 88 + }, + { + "epoch": 1.39, + "learning_rate": 0.0004999567491070187, + "loss": 1.752, + "step": 89 + }, + { + "epoch": 1.41, + "learning_rate": 0.0004999466041969828, + "loss": 1.7792, + "step": 90 + }, + { + "epoch": 1.42, + "learning_rate": 0.0004999353915613467, + "loss": 1.7139, + "step": 91 + }, + { + "epoch": 1.44, + "learning_rate": 0.0004999231112480087, + "loss": 1.7583, + "step": 92 + }, + { + "epoch": 1.45, + "learning_rate": 0.000499909763309428, + "loss": 1.7325, + "step": 93 + }, + { + "epoch": 1.47, + "learning_rate": 0.0004998953478026247, + "loss": 1.7276, + "step": 94 + }, + { + "epoch": 1.48, + "learning_rate": 0.000499879864789179, + "loss": 1.7598, + "step": 95 + }, + { + "epoch": 1.5, + "learning_rate": 0.0004998633143352315, + "loss": 1.7816, + "step": 96 + }, + { + "epoch": 1.52, + "learning_rate": 0.0004998456965114827, + "loss": 1.7535, + "step": 97 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004998270113931926, + "loss": 1.753, + "step": 98 + }, + { + "epoch": 1.55, + "learning_rate": 0.0004998072590601808, + "loss": 1.7538, + "step": 99 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004997864395968252, + "loss": 1.7352, + "step": 100 + }, + { + "epoch": 1.56, + "eval_gen_len": 822.348, + "eval_loss": 1.6523675918579102, + "eval_rouge1": 40.5489, + "eval_rouge2": 13.0695, + "eval_rougeL": 20.1256, + "eval_rougeLsum": 37.1369, + "eval_runtime": 897.0701, + "eval_samples_per_second": 0.279, + "eval_steps_per_second": 0.279, + "step": 100 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004997645530920631, + "loss": 1.7694, + "step": 101 + }, + { + "epoch": 1.59, + "learning_rate": 0.0004997415996393894, + "loss": 1.7363, + "step": 102 + }, + { + "epoch": 1.61, + "learning_rate": 0.000499717579336857, + "loss": 1.7284, + "step": 103 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004996924922870762, + "loss": 1.8031, + "step": 104 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004996663385972142, + "loss": 1.7258, + "step": 105 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004996391183789949, + "loss": 1.7247, + "step": 106 + }, + { + "epoch": 1.67, + "learning_rate": 0.0004996108317486977, + "loss": 1.6966, + "step": 107 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004995814788271582, + "loss": 1.7258, + "step": 108 + }, + { + "epoch": 1.7, + "learning_rate": 0.0004995510597397663, + "loss": 1.8111, + "step": 109 + }, + { + "epoch": 1.72, + "learning_rate": 0.0004995195746164671, + "loss": 1.7486, + "step": 110 + }, + { + "epoch": 1.73, + "learning_rate": 0.0004994870235917588, + "loss": 1.7584, + "step": 111 + }, + { + "epoch": 1.75, + "learning_rate": 0.0004994534068046936, + "loss": 1.6991, + "step": 112 + }, + { + "epoch": 1.76, + "learning_rate": 0.0004994187243988762, + "loss": 1.6487, + "step": 113 + }, + { + "epoch": 1.78, + "learning_rate": 0.0004993829765224633, + "loss": 1.6318, + "step": 114 + }, + { + "epoch": 1.8, + "learning_rate": 0.0004993461633281633, + "loss": 1.6386, + "step": 115 + }, + { + "epoch": 1.81, + "learning_rate": 0.0004993082849732353, + "loss": 1.743, + "step": 116 + }, + { + "epoch": 1.83, + "learning_rate": 0.0004992693416194886, + "loss": 1.6354, + "step": 117 + }, + { + "epoch": 1.84, + "learning_rate": 0.000499229333433282, + "loss": 1.7054, + "step": 118 + }, + { + "epoch": 1.86, + "learning_rate": 0.0004991882605855231, + "loss": 1.7264, + "step": 119 + }, + { + "epoch": 1.87, + "learning_rate": 0.0004991461232516675, + "loss": 1.6596, + "step": 120 + }, + { + "epoch": 1.89, + "learning_rate": 0.000499102921611718, + "loss": 1.6623, + "step": 121 + }, + { + "epoch": 1.91, + "learning_rate": 0.0004990586558502241, + "loss": 1.698, + "step": 122 + }, + { + "epoch": 1.92, + "learning_rate": 0.0004990133261562809, + "loss": 1.6981, + "step": 123 + }, + { + "epoch": 1.94, + "learning_rate": 0.0004989669327235284, + "loss": 1.744, + "step": 124 + }, + { + "epoch": 1.95, + "learning_rate": 0.000498919475750151, + "loss": 1.7371, + "step": 125 + }, + { + "epoch": 1.95, + "eval_gen_len": 765.6, + "eval_loss": 1.6293846368789673, + "eval_rouge1": 43.8594, + "eval_rouge2": 15.2962, + "eval_rougeL": 20.7807, + "eval_rougeLsum": 40.3461, + "eval_runtime": 841.3174, + "eval_samples_per_second": 0.297, + "eval_steps_per_second": 0.297, + "step": 125 + }, + { + "epoch": 1.97, + "learning_rate": 0.0004988709554388757, + "loss": 1.7668, + "step": 126 + }, + { + "epoch": 1.98, + "learning_rate": 0.0004988213719969726, + "loss": 1.7128, + "step": 127 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004987707256362529, + "loss": 1.6625, + "step": 128 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004987190165730684, + "loss": 1.741, + "step": 129 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004986662450283106, + "loss": 1.6843, + "step": 130 + }, + { + "epoch": 2.05, + "learning_rate": 0.0004986124112274099, + "loss": 1.5982, + "step": 131 + }, + { + "epoch": 2.06, + "learning_rate": 0.0004985575154003344, + "loss": 1.5923, + "step": 132 + }, + { + "epoch": 2.08, + "learning_rate": 0.0004985015577815888, + "loss": 1.7067, + "step": 133 + }, + { + "epoch": 2.09, + "learning_rate": 0.0004984445386102135, + "loss": 1.6505, + "step": 134 + }, + { + "epoch": 2.11, + "learning_rate": 0.0004983864581297841, + "loss": 1.6218, + "step": 135 + }, + { + "epoch": 2.12, + "learning_rate": 0.0004983273165884096, + "loss": 1.6585, + "step": 136 + }, + { + "epoch": 2.14, + "learning_rate": 0.0004982671142387316, + "loss": 1.6493, + "step": 137 + }, + { + "epoch": 2.16, + "learning_rate": 0.0004982058513379235, + "loss": 1.6432, + "step": 138 + }, + { + "epoch": 2.17, + "learning_rate": 0.000498143528147689, + "loss": 1.6534, + "step": 139 + }, + { + "epoch": 2.19, + "learning_rate": 0.0004980801449342612, + "loss": 1.6082, + "step": 140 + }, + { + "epoch": 2.2, + "learning_rate": 0.0004980157019684016, + "loss": 1.6816, + "step": 141 + }, + { + "epoch": 2.22, + "learning_rate": 0.0004979501995253985, + "loss": 1.6422, + "step": 142 + }, + { + "epoch": 2.23, + "learning_rate": 0.0004978836378850663, + "loss": 1.6053, + "step": 143 + }, + { + "epoch": 2.25, + "learning_rate": 0.0004978160173317438, + "loss": 1.6641, + "step": 144 + }, + { + "epoch": 2.27, + "learning_rate": 0.0004977473381542936, + "loss": 1.6122, + "step": 145 + }, + { + "epoch": 2.28, + "learning_rate": 0.0004976776006461005, + "loss": 1.7197, + "step": 146 + }, + { + "epoch": 2.3, + "learning_rate": 0.0004976068051050702, + "loss": 1.5697, + "step": 147 + }, + { + "epoch": 2.31, + "learning_rate": 0.000497534951833628, + "loss": 1.7142, + "step": 148 + }, + { + "epoch": 2.33, + "learning_rate": 0.0004974620411387178, + "loss": 1.6504, + "step": 149 + }, + { + "epoch": 2.34, + "learning_rate": 0.0004973880733318007, + "loss": 1.6428, + "step": 150 + }, + { + "epoch": 2.34, + "eval_gen_len": 844.184, + "eval_loss": 1.605539321899414, + "eval_rouge1": 44.5054, + "eval_rouge2": 15.731, + "eval_rougeL": 21.2582, + "eval_rougeLsum": 40.9775, + "eval_runtime": 922.4102, + "eval_samples_per_second": 0.271, + "eval_steps_per_second": 0.271, + "step": 150 + }, + { + "epoch": 2.36, + "learning_rate": 0.0004973130487288534, + "loss": 1.6719, + "step": 151 + }, + { + "epoch": 2.37, + "learning_rate": 0.0004972369676503671, + "loss": 1.6606, + "step": 152 + }, + { + "epoch": 2.39, + "learning_rate": 0.0004971598304213461, + "loss": 1.6061, + "step": 153 + }, + { + "epoch": 2.41, + "learning_rate": 0.0004970816373713064, + "loss": 1.666, + "step": 154 + }, + { + "epoch": 2.42, + "learning_rate": 0.0004970023888342742, + "loss": 1.6483, + "step": 155 + }, + { + "epoch": 2.44, + "learning_rate": 0.0004969220851487844, + "loss": 1.6359, + "step": 156 + }, + { + "epoch": 2.45, + "learning_rate": 0.0004968407266578797, + "loss": 1.6336, + "step": 157 + }, + { + "epoch": 2.47, + "learning_rate": 0.0004967583137091085, + "loss": 1.7383, + "step": 158 + }, + { + "epoch": 2.48, + "learning_rate": 0.0004966748466545235, + "loss": 1.6486, + "step": 159 + }, + { + "epoch": 2.5, + "learning_rate": 0.0004965903258506806, + "loss": 1.642, + "step": 160 + }, + { + "epoch": 2.52, + "learning_rate": 0.000496504751658637, + "loss": 1.6086, + "step": 161 + }, + { + "epoch": 2.53, + "learning_rate": 0.0004964181244439498, + "loss": 1.6132, + "step": 162 + }, + { + "epoch": 2.55, + "learning_rate": 0.0004963304445766743, + "loss": 1.7406, + "step": 163 + }, + { + "epoch": 2.56, + "learning_rate": 0.000496241712431363, + "loss": 1.5989, + "step": 164 + }, + { + "epoch": 2.58, + "learning_rate": 0.0004961519283870628, + "loss": 1.6851, + "step": 165 + }, + { + "epoch": 2.59, + "learning_rate": 0.0004960610928273146, + "loss": 1.7098, + "step": 166 + }, + { + "epoch": 2.61, + "learning_rate": 0.0004959692061401512, + "loss": 1.6937, + "step": 167 + }, + { + "epoch": 2.62, + "learning_rate": 0.0004958762687180956, + "loss": 1.587, + "step": 168 + }, + { + "epoch": 2.64, + "learning_rate": 0.000495782280958159, + "loss": 1.6361, + "step": 169 + }, + { + "epoch": 2.66, + "learning_rate": 0.0004956872432618399, + "loss": 1.5235, + "step": 170 + }, + { + "epoch": 2.67, + "learning_rate": 0.0004955911560351215, + "loss": 1.6453, + "step": 171 + }, + { + "epoch": 2.69, + "learning_rate": 0.000495494019688471, + "loss": 1.695, + "step": 172 + }, + { + "epoch": 2.7, + "learning_rate": 0.0004953958346368365, + "loss": 1.6334, + "step": 173 + }, + { + "epoch": 2.72, + "learning_rate": 0.0004952966012996466, + "loss": 1.6352, + "step": 174 + }, + { + "epoch": 2.73, + "learning_rate": 0.0004951963201008077, + "loss": 1.6567, + "step": 175 + }, + { + "epoch": 2.73, + "eval_gen_len": 857.236, + "eval_loss": 1.6031476259231567, + "eval_rouge1": 47.3641, + "eval_rouge2": 16.9664, + "eval_rougeL": 21.4998, + "eval_rougeLsum": 43.994, + "eval_runtime": 935.2302, + "eval_samples_per_second": 0.267, + "eval_steps_per_second": 0.267, + "step": 175 + }, + { + "epoch": 2.75, + "learning_rate": 0.0004950949914687023, + "loss": 1.6236, + "step": 176 + }, + { + "epoch": 2.76, + "learning_rate": 0.0004949926158361879, + "loss": 1.6126, + "step": 177 + }, + { + "epoch": 2.78, + "learning_rate": 0.000494889193640594, + "loss": 1.5928, + "step": 178 + }, + { + "epoch": 2.8, + "learning_rate": 0.0004947847253237212, + "loss": 1.5603, + "step": 179 + }, + { + "epoch": 2.81, + "learning_rate": 0.0004946792113318387, + "loss": 1.595, + "step": 180 + }, + { + "epoch": 2.83, + "learning_rate": 0.0004945726521156827, + "loss": 1.6604, + "step": 181 + }, + { + "epoch": 2.84, + "learning_rate": 0.0004944650481304545, + "loss": 1.7129, + "step": 182 + }, + { + "epoch": 2.86, + "learning_rate": 0.0004943563998358185, + "loss": 1.6234, + "step": 183 + }, + { + "epoch": 2.87, + "learning_rate": 0.0004942467076958999, + "loss": 1.6682, + "step": 184 + }, + { + "epoch": 2.89, + "learning_rate": 0.0004941359721792832, + "loss": 1.5966, + "step": 185 + }, + { + "epoch": 2.91, + "learning_rate": 0.0004940241937590102, + "loss": 1.6909, + "step": 186 + }, + { + "epoch": 2.92, + "learning_rate": 0.0004939113729125775, + "loss": 1.6324, + "step": 187 + }, + { + "epoch": 2.94, + "learning_rate": 0.000493797510121935, + "loss": 1.5672, + "step": 188 + }, + { + "epoch": 2.95, + "learning_rate": 0.0004936826058734832, + "loss": 1.6071, + "step": 189 + }, + { + "epoch": 2.97, + "learning_rate": 0.0004935666606580718, + "loss": 1.6646, + "step": 190 + }, + { + "epoch": 2.98, + "learning_rate": 0.0004934496749709976, + "loss": 1.6769, + "step": 191 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004933316493120015, + "loss": 1.6161, + "step": 192 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004932125841852673, + "loss": 1.6829, + "step": 193 + }, + { + "epoch": 3.03, + "learning_rate": 0.0004930924800994192, + "loss": 1.6455, + "step": 194 + }, + { + "epoch": 3.05, + "learning_rate": 0.0004929713375675195, + "loss": 1.5707, + "step": 195 + }, + { + "epoch": 3.06, + "learning_rate": 0.0004928491571070669, + "loss": 1.6266, + "step": 196 + }, + { + "epoch": 3.08, + "learning_rate": 0.0004927259392399936, + "loss": 1.6399, + "step": 197 + }, + { + "epoch": 3.09, + "learning_rate": 0.0004926016844926634, + "loss": 1.6322, + "step": 198 + }, + { + "epoch": 3.11, + "learning_rate": 0.0004924763933958695, + "loss": 1.5177, + "step": 199 + }, + { + "epoch": 3.12, + "learning_rate": 0.0004923500664848326, + "loss": 1.5773, + "step": 200 + }, + { + "epoch": 3.12, + "eval_gen_len": 841.86, + "eval_loss": 1.5855237245559692, + "eval_rouge1": 47.2284, + "eval_rouge2": 17.3099, + "eval_rougeL": 21.6793, + "eval_rougeLsum": 43.9018, + "eval_runtime": 920.2336, + "eval_samples_per_second": 0.272, + "eval_steps_per_second": 0.272, + "step": 200 + }, + { + "epoch": 3.14, + "learning_rate": 0.0004922227042991976, + "loss": 1.5497, + "step": 201 + }, + { + "epoch": 3.16, + "learning_rate": 0.0004920943073830322, + "loss": 1.6289, + "step": 202 + }, + { + "epoch": 3.17, + "learning_rate": 0.0004919648762848243, + "loss": 1.5412, + "step": 203 + }, + { + "epoch": 3.19, + "learning_rate": 0.0004918344115574796, + "loss": 1.608, + "step": 204 + }, + { + "epoch": 3.2, + "learning_rate": 0.0004917029137583191, + "loss": 1.6501, + "step": 205 + }, + { + "epoch": 3.22, + "learning_rate": 0.0004915703834490773, + "loss": 1.5859, + "step": 206 + }, + { + "epoch": 3.23, + "learning_rate": 0.0004914368211958989, + "loss": 1.5605, + "step": 207 + }, + { + "epoch": 3.25, + "learning_rate": 0.0004913022275693372, + "loss": 1.5337, + "step": 208 + }, + { + "epoch": 3.27, + "learning_rate": 0.0004911666031443512, + "loss": 1.5706, + "step": 209 + }, + { + "epoch": 3.28, + "learning_rate": 0.0004910299485003034, + "loss": 1.5739, + "step": 210 + }, + { + "epoch": 3.3, + "learning_rate": 0.0004908922642209571, + "loss": 1.6501, + "step": 211 + }, + { + "epoch": 3.31, + "learning_rate": 0.000490753550894474, + "loss": 1.5612, + "step": 212 + }, + { + "epoch": 3.33, + "learning_rate": 0.0004906138091134118, + "loss": 1.6209, + "step": 213 + }, + { + "epoch": 3.34, + "learning_rate": 0.0004904730394747215, + "loss": 1.5751, + "step": 214 + }, + { + "epoch": 3.36, + "learning_rate": 0.0004903312425797449, + "loss": 1.6414, + "step": 215 + }, + { + "epoch": 3.37, + "learning_rate": 0.0004901884190342121, + "loss": 1.6426, + "step": 216 + }, + { + "epoch": 3.39, + "learning_rate": 0.0004900445694482387, + "loss": 1.5086, + "step": 217 + }, + { + "epoch": 3.41, + "learning_rate": 0.0004898996944363237, + "loss": 1.5461, + "step": 218 + }, + { + "epoch": 3.42, + "learning_rate": 0.0004897537946173461, + "loss": 1.6256, + "step": 219 + }, + { + "epoch": 3.44, + "learning_rate": 0.0004896068706145633, + "loss": 1.5871, + "step": 220 + }, + { + "epoch": 3.45, + "learning_rate": 0.0004894589230556069, + "loss": 1.6357, + "step": 221 + }, + { + "epoch": 3.47, + "learning_rate": 0.0004893099525724818, + "loss": 1.5664, + "step": 222 + }, + { + "epoch": 3.48, + "learning_rate": 0.0004891599598015621, + "loss": 1.5824, + "step": 223 + }, + { + "epoch": 3.5, + "learning_rate": 0.0004890089453835894, + "loss": 1.5546, + "step": 224 + }, + { + "epoch": 3.52, + "learning_rate": 0.0004888569099636692, + "loss": 1.5614, + "step": 225 + }, + { + "epoch": 3.52, + "eval_gen_len": 832.8, + "eval_loss": 1.5882554054260254, + "eval_rouge1": 46.4612, + "eval_rouge2": 17.1368, + "eval_rougeL": 21.5931, + "eval_rougeLsum": 43.1184, + "eval_runtime": 912.3034, + "eval_samples_per_second": 0.274, + "eval_steps_per_second": 0.274, + "step": 225 + }, + { + "epoch": 3.53, + "learning_rate": 0.0004887038541912687, + "loss": 1.5713, + "step": 226 + }, + { + "epoch": 3.55, + "learning_rate": 0.0004885497787202137, + "loss": 1.5822, + "step": 227 + }, + { + "epoch": 3.56, + "learning_rate": 0.0004883946842086861, + "loss": 1.571, + "step": 228 + }, + { + "epoch": 3.58, + "learning_rate": 0.00048823857131922093, + "loss": 1.6026, + "step": 229 + }, + { + "epoch": 3.59, + "learning_rate": 0.00048808144071870363, + "loss": 1.5299, + "step": 230 + }, + { + "epoch": 3.61, + "learning_rate": 0.00048792329307836705, + "loss": 1.5651, + "step": 231 + }, + { + "epoch": 3.62, + "learning_rate": 0.0004877641290737884, + "loss": 1.5702, + "step": 232 + }, + { + "epoch": 3.64, + "learning_rate": 0.00048760394938488703, + "loss": 1.5424, + "step": 233 + }, + { + "epoch": 3.66, + "learning_rate": 0.0004874427546959208, + "loss": 1.5763, + "step": 234 + }, + { + "epoch": 3.67, + "learning_rate": 0.0004872805456954837, + "loss": 1.493, + "step": 235 + }, + { + "epoch": 3.69, + "learning_rate": 0.00048711732307650236, + "loss": 1.5906, + "step": 236 + }, + { + "epoch": 3.7, + "learning_rate": 0.00048695308753623367, + "loss": 1.5255, + "step": 237 + }, + { + "epoch": 3.72, + "learning_rate": 0.0004867878397762615, + "loss": 1.4685, + "step": 238 + }, + { + "epoch": 3.73, + "learning_rate": 0.0004866215805024935, + "loss": 1.5349, + "step": 239 + }, + { + "epoch": 3.75, + "learning_rate": 0.00048645431042515866, + "loss": 1.6122, + "step": 240 + }, + { + "epoch": 3.76, + "learning_rate": 0.0004862860302588037, + "loss": 1.5036, + "step": 241 + }, + { + "epoch": 3.78, + "learning_rate": 0.0004861167407222904, + "loss": 1.5529, + "step": 242 + }, + { + "epoch": 3.8, + "learning_rate": 0.0004859464425387922, + "loss": 1.6724, + "step": 243 + }, + { + "epoch": 3.81, + "learning_rate": 0.0004857751364357913, + "loss": 1.6384, + "step": 244 + }, + { + "epoch": 3.83, + "learning_rate": 0.00048560282314507594, + "loss": 1.5088, + "step": 245 + }, + { + "epoch": 3.84, + "learning_rate": 0.0004854295034027364, + "loss": 1.5407, + "step": 246 + }, + { + "epoch": 3.86, + "learning_rate": 0.00048525517794916254, + "loss": 1.6202, + "step": 247 + }, + { + "epoch": 3.87, + "learning_rate": 0.00048507984752904035, + "loss": 1.5655, + "step": 248 + }, + { + "epoch": 3.89, + "learning_rate": 0.000484903512891349, + "loss": 1.5927, + "step": 249 + }, + { + "epoch": 3.91, + "learning_rate": 0.0004847261747893574, + "loss": 1.5328, + "step": 250 + }, + { + "epoch": 3.91, + "eval_gen_len": 790.056, + "eval_loss": 1.5730280876159668, + "eval_rouge1": 46.5685, + "eval_rouge2": 17.5423, + "eval_rougeL": 22.2082, + "eval_rougeLsum": 43.1811, + "eval_runtime": 869.0966, + "eval_samples_per_second": 0.288, + "eval_steps_per_second": 0.288, + "step": 250 + }, + { + "epoch": 3.92, + "learning_rate": 0.0004845478339806211, + "loss": 1.613, + "step": 251 + }, + { + "epoch": 3.94, + "learning_rate": 0.00048436849122697885, + "loss": 1.5404, + "step": 252 + }, + { + "epoch": 3.95, + "learning_rate": 0.00048418814729454985, + "loss": 1.5256, + "step": 253 + }, + { + "epoch": 3.97, + "learning_rate": 0.0004840068029537299, + "loss": 1.5344, + "step": 254 + }, + { + "epoch": 3.98, + "learning_rate": 0.00048382445897918847, + "loss": 1.5377, + "step": 255 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004836411161498652, + "loss": 1.5981, + "step": 256 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004834567752489668, + "loss": 1.645, + "step": 257 + }, + { + "epoch": 4.03, + "learning_rate": 0.0004832714370639633, + "loss": 1.4417, + "step": 258 + }, + { + "epoch": 4.05, + "learning_rate": 0.0004830851023865852, + "loss": 1.5535, + "step": 259 + }, + { + "epoch": 4.06, + "learning_rate": 0.00048289777201281976, + "loss": 1.5704, + "step": 260 + }, + { + "epoch": 4.08, + "learning_rate": 0.0004827094467429076, + "loss": 1.5286, + "step": 261 + }, + { + "epoch": 4.09, + "learning_rate": 0.0004825201273813393, + "loss": 1.6414, + "step": 262 + }, + { + "epoch": 4.11, + "learning_rate": 0.0004823298147368523, + "loss": 1.4781, + "step": 263 + }, + { + "epoch": 4.12, + "learning_rate": 0.0004821385096224268, + "loss": 1.4318, + "step": 264 + }, + { + "epoch": 4.14, + "learning_rate": 0.0004819462128552829, + "loss": 1.5628, + "step": 265 + }, + { + "epoch": 4.16, + "learning_rate": 0.00048175292525687676, + "loss": 1.474, + "step": 266 + }, + { + "epoch": 4.17, + "learning_rate": 0.00048155864765289717, + "loss": 1.4858, + "step": 267 + }, + { + "epoch": 4.19, + "learning_rate": 0.00048136338087326213, + "loss": 1.5357, + "step": 268 + }, + { + "epoch": 4.2, + "learning_rate": 0.0004811671257521151, + "loss": 1.5283, + "step": 269 + }, + { + "epoch": 4.22, + "learning_rate": 0.0004809698831278217, + "loss": 1.4921, + "step": 270 + }, + { + "epoch": 4.23, + "learning_rate": 0.0004807716538429658, + "loss": 1.5086, + "step": 271 + }, + { + "epoch": 4.25, + "learning_rate": 0.0004805724387443462, + "loss": 1.5572, + "step": 272 + }, + { + "epoch": 4.27, + "learning_rate": 0.0004803722386829729, + "loss": 1.4695, + "step": 273 + }, + { + "epoch": 4.28, + "learning_rate": 0.0004801710545140635, + "loss": 1.534, + "step": 274 + }, + { + "epoch": 4.3, + "learning_rate": 0.00047996888709703954, + "loss": 1.5194, + "step": 275 + }, + { + "epoch": 4.3, + "eval_gen_len": 825.868, + "eval_loss": 1.568995475769043, + "eval_rouge1": 47.6205, + "eval_rouge2": 18.377, + "eval_rougeL": 22.7639, + "eval_rougeLsum": 44.3701, + "eval_runtime": 903.1087, + "eval_samples_per_second": 0.277, + "eval_steps_per_second": 0.277, + "step": 275 + }, + { + "epoch": 4.31, + "learning_rate": 0.00047976573729552276, + "loss": 1.5358, + "step": 276 + }, + { + "epoch": 4.33, + "learning_rate": 0.0004795616059773315, + "loss": 1.5047, + "step": 277 + }, + { + "epoch": 4.34, + "learning_rate": 0.0004793564940144769, + "loss": 1.5063, + "step": 278 + }, + { + "epoch": 4.36, + "learning_rate": 0.00047915040228315933, + "loss": 1.6249, + "step": 279 + }, + { + "epoch": 4.37, + "learning_rate": 0.00047894333166376434, + "loss": 1.5403, + "step": 280 + }, + { + "epoch": 4.39, + "learning_rate": 0.0004787352830408595, + "loss": 1.5495, + "step": 281 + }, + { + "epoch": 4.41, + "learning_rate": 0.0004785262573031899, + "loss": 1.5596, + "step": 282 + }, + { + "epoch": 4.42, + "learning_rate": 0.0004783162553436747, + "loss": 1.5155, + "step": 283 + }, + { + "epoch": 4.44, + "learning_rate": 0.0004781052780594034, + "loss": 1.4722, + "step": 284 + }, + { + "epoch": 4.45, + "learning_rate": 0.00047789332635163204, + "loss": 1.5476, + "step": 285 + }, + { + "epoch": 4.47, + "learning_rate": 0.0004776804011257788, + "loss": 1.5014, + "step": 286 + }, + { + "epoch": 4.48, + "learning_rate": 0.00047746650329142103, + "loss": 1.4926, + "step": 287 + }, + { + "epoch": 4.5, + "learning_rate": 0.00047725163376229063, + "loss": 1.5515, + "step": 288 + }, + { + "epoch": 4.52, + "learning_rate": 0.00047703579345627036, + "loss": 1.4934, + "step": 289 + }, + { + "epoch": 4.53, + "learning_rate": 0.00047681898329539, + "loss": 1.5362, + "step": 290 + }, + { + "epoch": 4.55, + "learning_rate": 0.0004766012042058225, + "loss": 1.5657, + "step": 291 + }, + { + "epoch": 4.56, + "learning_rate": 0.00047638245711787976, + "loss": 1.5209, + "step": 292 + }, + { + "epoch": 4.58, + "learning_rate": 0.00047616274296600873, + "loss": 1.5339, + "step": 293 + }, + { + "epoch": 4.59, + "learning_rate": 0.0004759420626887877, + "loss": 1.5366, + "step": 294 + }, + { + "epoch": 4.61, + "learning_rate": 0.00047572041722892194, + "loss": 1.4674, + "step": 295 + }, + { + "epoch": 4.62, + "learning_rate": 0.0004754978075332398, + "loss": 1.4913, + "step": 296 + }, + { + "epoch": 4.64, + "learning_rate": 0.00047527423455268855, + "loss": 1.5963, + "step": 297 + }, + { + "epoch": 4.66, + "learning_rate": 0.00047504969924233066, + "loss": 1.5394, + "step": 298 + }, + { + "epoch": 4.67, + "learning_rate": 0.00047482420256133933, + "loss": 1.4853, + "step": 299 + }, + { + "epoch": 4.69, + "learning_rate": 0.00047459774547299475, + "loss": 1.571, + "step": 300 + }, + { + "epoch": 4.69, + "eval_gen_len": 794.032, + "eval_loss": 1.5675740242004395, + "eval_rouge1": 49.2203, + "eval_rouge2": 19.1109, + "eval_rougeL": 22.8005, + "eval_rougeLsum": 46.0679, + "eval_runtime": 869.0821, + "eval_samples_per_second": 0.288, + "eval_steps_per_second": 0.288, + "step": 300 + }, + { + "epoch": 4.7, + "learning_rate": 0.00047437032894467944, + "loss": 1.5875, + "step": 301 + }, + { + "epoch": 4.72, + "learning_rate": 0.000474141953947875, + "loss": 1.4682, + "step": 302 + }, + { + "epoch": 4.73, + "learning_rate": 0.00047391262145815674, + "loss": 1.5559, + "step": 303 + }, + { + "epoch": 4.75, + "learning_rate": 0.0004736823324551909, + "loss": 1.5251, + "step": 304 + }, + { + "epoch": 4.76, + "learning_rate": 0.00047345108792272927, + "loss": 1.5021, + "step": 305 + }, + { + "epoch": 4.78, + "learning_rate": 0.0004732188888486057, + "loss": 1.5081, + "step": 306 + }, + { + "epoch": 4.8, + "learning_rate": 0.00047298573622473166, + "loss": 1.5025, + "step": 307 + }, + { + "epoch": 4.81, + "learning_rate": 0.00047275163104709196, + "loss": 1.5625, + "step": 308 + }, + { + "epoch": 4.83, + "learning_rate": 0.00047251657431574055, + "loss": 1.6046, + "step": 309 + }, + { + "epoch": 4.84, + "learning_rate": 0.0004722805670347963, + "loss": 1.5373, + "step": 310 + }, + { + "epoch": 4.86, + "learning_rate": 0.0004720436102124386, + "loss": 1.5182, + "step": 311 + }, + { + "epoch": 4.87, + "learning_rate": 0.000471805704860903, + "loss": 1.5097, + "step": 312 + }, + { + "epoch": 4.89, + "learning_rate": 0.00047156685199647714, + "loss": 1.5336, + "step": 313 + }, + { + "epoch": 4.91, + "learning_rate": 0.0004713270526394963, + "loss": 1.5097, + "step": 314 + }, + { + "epoch": 4.92, + "learning_rate": 0.00047108630781433886, + "loss": 1.5173, + "step": 315 + }, + { + "epoch": 4.94, + "learning_rate": 0.0004708446185494222, + "loss": 1.5142, + "step": 316 + }, + { + "epoch": 4.95, + "learning_rate": 0.00047060198587719793, + "loss": 1.466, + "step": 317 + }, + { + "epoch": 4.97, + "learning_rate": 0.00047035841083414804, + "loss": 1.484, + "step": 318 + }, + { + "epoch": 4.98, + "learning_rate": 0.0004701138944607799, + "loss": 1.5159, + "step": 319 + }, + { + "epoch": 5.0, + "learning_rate": 0.00046986843780162223, + "loss": 1.4444, + "step": 320 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004696220419052205, + "loss": 1.5786, + "step": 321 + }, + { + "epoch": 5.03, + "learning_rate": 0.00046937470782413215, + "loss": 1.5069, + "step": 322 + }, + { + "epoch": 5.05, + "learning_rate": 0.0004691264366149227, + "loss": 1.4675, + "step": 323 + }, + { + "epoch": 5.06, + "learning_rate": 0.00046887722933816074, + "loss": 1.4876, + "step": 324 + }, + { + "epoch": 5.08, + "learning_rate": 0.0004686270870584136, + "loss": 1.4275, + "step": 325 + }, + { + "epoch": 5.08, + "eval_gen_len": 833.068, + "eval_loss": 1.5656464099884033, + "eval_rouge1": 50.6982, + "eval_rouge2": 20.0278, + "eval_rougeL": 23.5585, + "eval_rougeLsum": 47.5036, + "eval_runtime": 908.634, + "eval_samples_per_second": 0.275, + "eval_steps_per_second": 0.275, + "step": 325 + }, + { + "epoch": 5.09, + "learning_rate": 0.00046837601084424277, + "loss": 1.5108, + "step": 326 + }, + { + "epoch": 5.11, + "learning_rate": 0.0004681240017681993, + "loss": 1.4831, + "step": 327 + }, + { + "epoch": 5.12, + "learning_rate": 0.0004678710609068193, + "loss": 1.421, + "step": 328 + }, + { + "epoch": 5.14, + "learning_rate": 0.0004676171893406194, + "loss": 1.4381, + "step": 329 + }, + { + "epoch": 5.16, + "learning_rate": 0.0004673623881540917, + "loss": 1.5484, + "step": 330 + }, + { + "epoch": 5.17, + "learning_rate": 0.0004671066584356999, + "loss": 1.4882, + "step": 331 + }, + { + "epoch": 5.19, + "learning_rate": 0.0004668500012778738, + "loss": 1.4782, + "step": 332 + }, + { + "epoch": 5.2, + "learning_rate": 0.0004665924177770053, + "loss": 1.515, + "step": 333 + }, + { + "epoch": 5.22, + "learning_rate": 0.00046633390903344355, + "loss": 1.5186, + "step": 334 + }, + { + "epoch": 5.23, + "learning_rate": 0.00046607447615148984, + "loss": 1.5037, + "step": 335 + }, + { + "epoch": 5.25, + "learning_rate": 0.0004658141202393935, + "loss": 1.5083, + "step": 336 + }, + { + "epoch": 5.27, + "learning_rate": 0.0004655528424093467, + "loss": 1.3176, + "step": 337 + }, + { + "epoch": 5.28, + "learning_rate": 0.00046529064377747993, + "loss": 1.491, + "step": 338 + }, + { + "epoch": 5.3, + "learning_rate": 0.00046502752546385723, + "loss": 1.4108, + "step": 339 + }, + { + "epoch": 5.31, + "learning_rate": 0.0004647634885924713, + "loss": 1.5044, + "step": 340 + }, + { + "epoch": 5.33, + "learning_rate": 0.0004644985342912388, + "loss": 1.4611, + "step": 341 + }, + { + "epoch": 5.34, + "learning_rate": 0.0004642326636919952, + "loss": 1.4847, + "step": 342 + }, + { + "epoch": 5.36, + "learning_rate": 0.00046396587793049083, + "loss": 1.5091, + "step": 343 + }, + { + "epoch": 5.37, + "learning_rate": 0.00046369817814638476, + "loss": 1.498, + "step": 344 + }, + { + "epoch": 5.39, + "learning_rate": 0.000463429565483241, + "loss": 1.4504, + "step": 345 + }, + { + "epoch": 5.41, + "learning_rate": 0.00046316004108852305, + "loss": 1.5032, + "step": 346 + }, + { + "epoch": 5.42, + "learning_rate": 0.00046288960611358924, + "loss": 1.4502, + "step": 347 + }, + { + "epoch": 5.44, + "learning_rate": 0.0004626182617136877, + "loss": 1.5371, + "step": 348 + }, + { + "epoch": 5.45, + "learning_rate": 0.00046234600904795144, + "loss": 1.4603, + "step": 349 + }, + { + "epoch": 5.47, + "learning_rate": 0.0004620728492793934, + "loss": 1.4912, + "step": 350 + }, + { + "epoch": 5.47, + "eval_gen_len": 793.068, + "eval_loss": 1.5625073909759521, + "eval_rouge1": 50.3371, + "eval_rouge2": 19.8639, + "eval_rougeL": 23.3666, + "eval_rougeLsum": 47.1898, + "eval_runtime": 869.2692, + "eval_samples_per_second": 0.288, + "eval_steps_per_second": 0.288, + "step": 350 + }, + { + "epoch": 5.48, + "learning_rate": 0.00046179878357490155, + "loss": 1.4555, + "step": 351 + }, + { + "epoch": 5.5, + "learning_rate": 0.00046152381310523384, + "loss": 1.5145, + "step": 352 + }, + { + "epoch": 5.52, + "learning_rate": 0.00046124793904501316, + "loss": 1.4795, + "step": 353 + }, + { + "epoch": 5.53, + "learning_rate": 0.0004609711625727224, + "loss": 1.4764, + "step": 354 + }, + { + "epoch": 5.55, + "learning_rate": 0.00046069348487069934, + "loss": 1.5356, + "step": 355 + }, + { + "epoch": 5.56, + "learning_rate": 0.00046041490712513177, + "loss": 1.4876, + "step": 356 + }, + { + "epoch": 5.58, + "learning_rate": 0.0004601354305260522, + "loss": 1.5048, + "step": 357 + }, + { + "epoch": 5.59, + "learning_rate": 0.00045985505626733287, + "loss": 1.4918, + "step": 358 + }, + { + "epoch": 5.61, + "learning_rate": 0.0004595737855466807, + "loss": 1.5214, + "step": 359 + }, + { + "epoch": 5.62, + "learning_rate": 0.00045929161956563216, + "loss": 1.5063, + "step": 360 + }, + { + "epoch": 5.64, + "learning_rate": 0.000459008559529548, + "loss": 1.4175, + "step": 361 + }, + { + "epoch": 5.66, + "learning_rate": 0.00045872460664760827, + "loss": 1.4646, + "step": 362 + }, + { + "epoch": 5.67, + "learning_rate": 0.00045843976213280716, + "loss": 1.469, + "step": 363 + }, + { + "epoch": 5.69, + "learning_rate": 0.0004581540272019476, + "loss": 1.4715, + "step": 364 + }, + { + "epoch": 5.7, + "learning_rate": 0.00045786740307563633, + "loss": 1.5284, + "step": 365 + }, + { + "epoch": 5.72, + "learning_rate": 0.0004575798909782785, + "loss": 1.4902, + "step": 366 + }, + { + "epoch": 5.73, + "learning_rate": 0.0004572914921380726, + "loss": 1.4994, + "step": 367 + }, + { + "epoch": 5.75, + "learning_rate": 0.000457002207787005, + "loss": 1.4458, + "step": 368 + }, + { + "epoch": 5.76, + "learning_rate": 0.00045671203916084494, + "loss": 1.4403, + "step": 369 + }, + { + "epoch": 5.78, + "learning_rate": 0.00045642098749913896, + "loss": 1.4553, + "step": 370 + }, + { + "epoch": 5.8, + "learning_rate": 0.00045612905404520586, + "loss": 1.506, + "step": 371 + }, + { + "epoch": 5.81, + "learning_rate": 0.00045583624004613145, + "loss": 1.4695, + "step": 372 + }, + { + "epoch": 5.83, + "learning_rate": 0.00045554254675276266, + "loss": 1.5557, + "step": 373 + }, + { + "epoch": 5.84, + "learning_rate": 0.00045524797541970306, + "loss": 1.4717, + "step": 374 + }, + { + "epoch": 5.86, + "learning_rate": 0.0004549525273053067, + "loss": 1.4764, + "step": 375 + }, + { + "epoch": 5.86, + "eval_gen_len": 819.86, + "eval_loss": 1.5531580448150635, + "eval_rouge1": 50.9702, + "eval_rouge2": 20.7532, + "eval_rougeL": 23.8765, + "eval_rougeLsum": 47.9915, + "eval_runtime": 894.7339, + "eval_samples_per_second": 0.279, + "eval_steps_per_second": 0.279, + "step": 375 + }, + { + "epoch": 5.87, + "learning_rate": 0.0004546562036716732, + "loss": 1.4498, + "step": 376 + }, + { + "epoch": 5.89, + "learning_rate": 0.0004543590057846422, + "loss": 1.5125, + "step": 377 + }, + { + "epoch": 5.91, + "learning_rate": 0.00045406093491378815, + "loss": 1.4536, + "step": 378 + }, + { + "epoch": 5.92, + "learning_rate": 0.00045376199233241454, + "loss": 1.5585, + "step": 379 + }, + { + "epoch": 5.94, + "learning_rate": 0.00045346217931754873, + "loss": 1.4651, + "step": 380 + }, + { + "epoch": 5.95, + "learning_rate": 0.00045316149714993645, + "loss": 1.5225, + "step": 381 + }, + { + "epoch": 5.97, + "learning_rate": 0.00045285994711403613, + "loss": 1.4312, + "step": 382 + }, + { + "epoch": 5.98, + "learning_rate": 0.00045255753049801365, + "loss": 1.5308, + "step": 383 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004522542485937369, + "loss": 1.4896, + "step": 384 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004519501026967699, + "loss": 1.5514, + "step": 385 + }, + { + "epoch": 6.03, + "learning_rate": 0.0004516450941063677, + "loss": 1.4681, + "step": 386 + }, + { + "epoch": 6.05, + "learning_rate": 0.00045133922412547034, + "loss": 1.3779, + "step": 387 + }, + { + "epoch": 6.06, + "learning_rate": 0.0004510324940606979, + "loss": 1.4405, + "step": 388 + }, + { + "epoch": 6.08, + "learning_rate": 0.00045072490522234435, + "loss": 1.4182, + "step": 389 + }, + { + "epoch": 6.09, + "learning_rate": 0.0004504164589243721, + "loss": 1.4541, + "step": 390 + }, + { + "epoch": 6.11, + "learning_rate": 0.00045010715648440686, + "loss": 1.4429, + "step": 391 + }, + { + "epoch": 6.12, + "learning_rate": 0.0004497969992237312, + "loss": 1.3827, + "step": 392 + }, + { + "epoch": 6.14, + "learning_rate": 0.0004494859884672795, + "loss": 1.4369, + "step": 393 + }, + { + "epoch": 6.16, + "learning_rate": 0.00044917412554363223, + "loss": 1.4117, + "step": 394 + }, + { + "epoch": 6.17, + "learning_rate": 0.00044886141178500984, + "loss": 1.368, + "step": 395 + }, + { + "epoch": 6.19, + "learning_rate": 0.00044854784852726775, + "loss": 1.5125, + "step": 396 + }, + { + "epoch": 6.2, + "learning_rate": 0.0004482334371098901, + "loss": 1.4048, + "step": 397 + }, + { + "epoch": 6.22, + "learning_rate": 0.0004479181788759842, + "loss": 1.4849, + "step": 398 + }, + { + "epoch": 6.23, + "learning_rate": 0.0004476020751722748, + "loss": 1.4988, + "step": 399 + }, + { + "epoch": 6.25, + "learning_rate": 0.00044728512734909845, + "loss": 1.3972, + "step": 400 + }, + { + "epoch": 6.25, + "eval_gen_len": 770.78, + "eval_loss": 1.5564013719558716, + "eval_rouge1": 49.279, + "eval_rouge2": 19.4781, + "eval_rougeL": 23.1018, + "eval_rougeLsum": 46.1942, + "eval_runtime": 846.8994, + "eval_samples_per_second": 0.295, + "eval_steps_per_second": 0.295, + "step": 400 + }, + { + "epoch": 6.27, + "learning_rate": 0.00044696733676039745, + "loss": 1.4684, + "step": 401 + }, + { + "epoch": 6.28, + "learning_rate": 0.00044664870476371447, + "loss": 1.4135, + "step": 402 + }, + { + "epoch": 6.3, + "learning_rate": 0.0004463292327201862, + "loss": 1.4849, + "step": 403 + }, + { + "epoch": 6.31, + "learning_rate": 0.00044600892199453824, + "loss": 1.4578, + "step": 404 + }, + { + "epoch": 6.33, + "learning_rate": 0.00044568777395507863, + "loss": 1.4226, + "step": 405 + }, + { + "epoch": 6.34, + "learning_rate": 0.0004453657899736923, + "loss": 1.4242, + "step": 406 + }, + { + "epoch": 6.36, + "learning_rate": 0.00044504297142583505, + "loss": 1.4406, + "step": 407 + }, + { + "epoch": 6.37, + "learning_rate": 0.00044471931969052817, + "loss": 1.5071, + "step": 408 + }, + { + "epoch": 6.39, + "learning_rate": 0.0004443948361503517, + "loss": 1.4148, + "step": 409 + }, + { + "epoch": 6.41, + "learning_rate": 0.00044406952219143935, + "loss": 1.3703, + "step": 410 + }, + { + "epoch": 6.42, + "learning_rate": 0.0004437433792034721, + "loss": 1.4795, + "step": 411 + }, + { + "epoch": 6.44, + "learning_rate": 0.00044341640857967234, + "loss": 1.4414, + "step": 412 + }, + { + "epoch": 6.45, + "learning_rate": 0.00044308861171679816, + "loss": 1.4081, + "step": 413 + }, + { + "epoch": 6.47, + "learning_rate": 0.0004427599900151368, + "loss": 1.5113, + "step": 414 + }, + { + "epoch": 6.48, + "learning_rate": 0.0004424305448784995, + "loss": 1.4275, + "step": 415 + }, + { + "epoch": 6.5, + "learning_rate": 0.0004421002777142148, + "loss": 1.5111, + "step": 416 + }, + { + "epoch": 6.52, + "learning_rate": 0.00044176918993312284, + "loss": 1.467, + "step": 417 + }, + { + "epoch": 6.53, + "learning_rate": 0.0004414372829495693, + "loss": 1.3737, + "step": 418 + }, + { + "epoch": 6.55, + "learning_rate": 0.0004411045581813994, + "loss": 1.4639, + "step": 419 + }, + { + "epoch": 6.56, + "learning_rate": 0.00044077101704995163, + "loss": 1.3785, + "step": 420 + }, + { + "epoch": 6.58, + "learning_rate": 0.00044043666098005205, + "loss": 1.5022, + "step": 421 + }, + { + "epoch": 6.59, + "learning_rate": 0.0004401014914000078, + "loss": 1.5083, + "step": 422 + }, + { + "epoch": 6.61, + "learning_rate": 0.00043976550974160114, + "loss": 1.4377, + "step": 423 + }, + { + "epoch": 6.62, + "learning_rate": 0.00043942871744008375, + "loss": 1.4162, + "step": 424 + }, + { + "epoch": 6.64, + "learning_rate": 0.00043909111593416967, + "loss": 1.4479, + "step": 425 + }, + { + "epoch": 6.64, + "eval_gen_len": 806.244, + "eval_loss": 1.5529119968414307, + "eval_rouge1": 50.3317, + "eval_rouge2": 20.2888, + "eval_rougeL": 23.4454, + "eval_rougeLsum": 47.3491, + "eval_runtime": 882.7996, + "eval_samples_per_second": 0.283, + "eval_steps_per_second": 0.283, + "step": 425 + }, + { + "epoch": 6.66, + "learning_rate": 0.00043875270666603015, + "loss": 1.4568, + "step": 426 + }, + { + "epoch": 6.67, + "learning_rate": 0.00043841349108128704, + "loss": 1.4055, + "step": 427 + }, + { + "epoch": 6.69, + "learning_rate": 0.00043807347062900623, + "loss": 1.4544, + "step": 428 + }, + { + "epoch": 6.7, + "learning_rate": 0.00043773264676169225, + "loss": 1.4898, + "step": 429 + }, + { + "epoch": 6.72, + "learning_rate": 0.00043739102093528153, + "loss": 1.4298, + "step": 430 + }, + { + "epoch": 6.73, + "learning_rate": 0.00043704859460913634, + "loss": 1.4306, + "step": 431 + }, + { + "epoch": 6.75, + "learning_rate": 0.0004367053692460385, + "loss": 1.4758, + "step": 432 + }, + { + "epoch": 6.76, + "learning_rate": 0.00043636134631218316, + "loss": 1.4058, + "step": 433 + }, + { + "epoch": 6.78, + "learning_rate": 0.00043601652727717267, + "loss": 1.4258, + "step": 434 + }, + { + "epoch": 6.8, + "learning_rate": 0.00043567091361401, + "loss": 1.471, + "step": 435 + }, + { + "epoch": 6.81, + "learning_rate": 0.00043532450679909277, + "loss": 1.4146, + "step": 436 + }, + { + "epoch": 6.83, + "learning_rate": 0.0004349773083122066, + "loss": 1.5167, + "step": 437 + }, + { + "epoch": 6.84, + "learning_rate": 0.0004346293196365193, + "loss": 1.4361, + "step": 438 + }, + { + "epoch": 6.86, + "learning_rate": 0.0004342805422585739, + "loss": 1.5092, + "step": 439 + }, + { + "epoch": 6.87, + "learning_rate": 0.00043393097766828293, + "loss": 1.5393, + "step": 440 + }, + { + "epoch": 6.89, + "learning_rate": 0.0004335806273589214, + "loss": 1.4637, + "step": 441 + }, + { + "epoch": 6.91, + "learning_rate": 0.000433229492827121, + "loss": 1.4411, + "step": 442 + }, + { + "epoch": 6.92, + "learning_rate": 0.00043287757557286343, + "loss": 1.4045, + "step": 443 + }, + { + "epoch": 6.94, + "learning_rate": 0.0004325248770994741, + "loss": 1.44, + "step": 444 + }, + { + "epoch": 6.95, + "learning_rate": 0.0004321713989136153, + "loss": 1.371, + "step": 445 + }, + { + "epoch": 6.97, + "learning_rate": 0.0004318171425252808, + "loss": 1.4431, + "step": 446 + }, + { + "epoch": 6.98, + "learning_rate": 0.0004314621094477879, + "loss": 1.4203, + "step": 447 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004311063011977723, + "loss": 1.4698, + "step": 448 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004307497192951809, + "loss": 1.4772, + "step": 449 + }, + { + "epoch": 7.03, + "learning_rate": 0.0004303923652632655, + "loss": 1.4567, + "step": 450 + }, + { + "epoch": 7.03, + "eval_gen_len": 787.48, + "eval_loss": 1.5590322017669678, + "eval_rouge1": 52.2209, + "eval_rouge2": 21.2868, + "eval_rougeL": 23.9284, + "eval_rougeLsum": 49.1691, + "eval_runtime": 864.0005, + "eval_samples_per_second": 0.289, + "eval_steps_per_second": 0.289, + "step": 450 + }, + { + "epoch": 7.05, + "learning_rate": 0.00043003424062857656, + "loss": 1.4111, + "step": 451 + }, + { + "epoch": 7.06, + "learning_rate": 0.000429675346920956, + "loss": 1.3935, + "step": 452 + }, + { + "epoch": 7.08, + "learning_rate": 0.0004293156856735313, + "loss": 1.4402, + "step": 453 + }, + { + "epoch": 7.09, + "learning_rate": 0.00042895525842270865, + "loss": 1.3481, + "step": 454 + }, + { + "epoch": 7.11, + "learning_rate": 0.00042859406670816663, + "loss": 1.4009, + "step": 455 + }, + { + "epoch": 7.12, + "learning_rate": 0.0004282321120728493, + "loss": 1.3885, + "step": 456 + }, + { + "epoch": 7.14, + "learning_rate": 0.00042786939606295976, + "loss": 1.3837, + "step": 457 + }, + { + "epoch": 7.16, + "learning_rate": 0.0004275059202279537, + "loss": 1.3358, + "step": 458 + }, + { + "epoch": 7.17, + "learning_rate": 0.0004271416861205325, + "loss": 1.4314, + "step": 459 + }, + { + "epoch": 7.19, + "learning_rate": 0.00042677669529663686, + "loss": 1.3707, + "step": 460 + }, + { + "epoch": 7.2, + "learning_rate": 0.00042641094931544, + "loss": 1.421, + "step": 461 + }, + { + "epoch": 7.22, + "learning_rate": 0.00042604444973934097, + "loss": 1.3735, + "step": 462 + }, + { + "epoch": 7.23, + "learning_rate": 0.0004256771981339581, + "loss": 1.4209, + "step": 463 + }, + { + "epoch": 7.25, + "learning_rate": 0.00042530919606812215, + "loss": 1.381, + "step": 464 + }, + { + "epoch": 7.27, + "learning_rate": 0.0004249404451138699, + "loss": 1.4744, + "step": 465 + }, + { + "epoch": 7.28, + "learning_rate": 0.0004245709468464371, + "loss": 1.4357, + "step": 466 + }, + { + "epoch": 7.3, + "learning_rate": 0.0004242007028442519, + "loss": 1.384, + "step": 467 + }, + { + "epoch": 7.31, + "learning_rate": 0.00042382971468892807, + "loss": 1.437, + "step": 468 + }, + { + "epoch": 7.33, + "learning_rate": 0.0004234579839652583, + "loss": 1.4199, + "step": 469 + }, + { + "epoch": 7.34, + "learning_rate": 0.0004230855122612074, + "loss": 1.4603, + "step": 470 + }, + { + "epoch": 7.36, + "learning_rate": 0.0004227123011679056, + "loss": 1.3556, + "step": 471 + }, + { + "epoch": 7.37, + "learning_rate": 0.00042233835227964146, + "loss": 1.3665, + "step": 472 + }, + { + "epoch": 7.39, + "learning_rate": 0.0004219636671938554, + "loss": 1.4353, + "step": 473 + }, + { + "epoch": 7.41, + "learning_rate": 0.00042158824751113277, + "loss": 1.3791, + "step": 474 + }, + { + "epoch": 7.42, + "learning_rate": 0.0004212120948351968, + "loss": 1.3933, + "step": 475 + }, + { + "epoch": 7.42, + "eval_gen_len": 842.664, + "eval_loss": 1.556054949760437, + "eval_rouge1": 51.9578, + "eval_rouge2": 20.5806, + "eval_rougeL": 23.7177, + "eval_rougeLsum": 48.9121, + "eval_runtime": 918.7994, + "eval_samples_per_second": 0.272, + "eval_steps_per_second": 0.272, + "step": 475 + }, + { + "epoch": 7.44, + "learning_rate": 0.0004208352107729021, + "loss": 1.4203, + "step": 476 + }, + { + "epoch": 7.45, + "learning_rate": 0.0004204575969342277, + "loss": 1.3513, + "step": 477 + }, + { + "epoch": 7.47, + "learning_rate": 0.0004200792549322698, + "loss": 1.3963, + "step": 478 + }, + { + "epoch": 7.48, + "learning_rate": 0.00041970018638323546, + "loss": 1.4117, + "step": 479 + }, + { + "epoch": 7.5, + "learning_rate": 0.0004193203929064353, + "loss": 1.4318, + "step": 480 + }, + { + "epoch": 7.52, + "learning_rate": 0.00041893987612427665, + "loss": 1.4077, + "step": 481 + }, + { + "epoch": 7.53, + "learning_rate": 0.0004185586376622569, + "loss": 1.4389, + "step": 482 + }, + { + "epoch": 7.55, + "learning_rate": 0.0004181766791489559, + "loss": 1.4365, + "step": 483 + }, + { + "epoch": 7.56, + "learning_rate": 0.0004177940022160299, + "loss": 1.3756, + "step": 484 + }, + { + "epoch": 7.58, + "learning_rate": 0.00041741060849820376, + "loss": 1.4333, + "step": 485 + }, + { + "epoch": 7.59, + "learning_rate": 0.0004170264996332644, + "loss": 1.3782, + "step": 486 + }, + { + "epoch": 7.61, + "learning_rate": 0.00041664167726205393, + "loss": 1.3374, + "step": 487 + }, + { + "epoch": 7.62, + "learning_rate": 0.00041625614302846206, + "loss": 1.373, + "step": 488 + }, + { + "epoch": 7.64, + "learning_rate": 0.0004158698985794197, + "loss": 1.4767, + "step": 489 + }, + { + "epoch": 7.66, + "learning_rate": 0.00041548294556489163, + "loss": 1.421, + "step": 490 + }, + { + "epoch": 7.67, + "learning_rate": 0.00041509528563786946, + "loss": 1.4624, + "step": 491 + }, + { + "epoch": 7.69, + "learning_rate": 0.00041470692045436446, + "loss": 1.4403, + "step": 492 + }, + { + "epoch": 7.7, + "learning_rate": 0.00041431785167340095, + "loss": 1.4075, + "step": 493 + }, + { + "epoch": 7.72, + "learning_rate": 0.0004139280809570086, + "loss": 1.467, + "step": 494 + }, + { + "epoch": 7.73, + "learning_rate": 0.0004135376099702158, + "loss": 1.4193, + "step": 495 + }, + { + "epoch": 7.75, + "learning_rate": 0.00041314644038104216, + "loss": 1.4129, + "step": 496 + }, + { + "epoch": 7.76, + "learning_rate": 0.0004127545738604918, + "loss": 1.392, + "step": 497 + }, + { + "epoch": 7.78, + "learning_rate": 0.0004123620120825459, + "loss": 1.3938, + "step": 498 + }, + { + "epoch": 7.8, + "learning_rate": 0.0004119687567241557, + "loss": 1.4033, + "step": 499 + }, + { + "epoch": 7.81, + "learning_rate": 0.0004115748094652352, + "loss": 1.4245, + "step": 500 + }, + { + "epoch": 7.81, + "eval_gen_len": 813.772, + "eval_loss": 1.5419564247131348, + "eval_rouge1": 52.3725, + "eval_rouge2": 21.7787, + "eval_rougeL": 24.5209, + "eval_rougeLsum": 49.4003, + "eval_runtime": 890.9, + "eval_samples_per_second": 0.281, + "eval_steps_per_second": 0.281, + "step": 500 + }, + { + "epoch": 7.83, + "learning_rate": 0.0004111801719886542, + "loss": 1.3977, + "step": 501 + }, + { + "epoch": 7.84, + "learning_rate": 0.0004107848459802309, + "loss": 1.3682, + "step": 502 + }, + { + "epoch": 7.86, + "learning_rate": 0.0004103888331287247, + "loss": 1.4175, + "step": 503 + }, + { + "epoch": 7.87, + "learning_rate": 0.0004099921351258292, + "loss": 1.4152, + "step": 504 + }, + { + "epoch": 7.89, + "learning_rate": 0.00040959475366616474, + "loss": 1.4109, + "step": 505 + }, + { + "epoch": 7.91, + "learning_rate": 0.0004091966904472715, + "loss": 1.4189, + "step": 506 + }, + { + "epoch": 7.92, + "learning_rate": 0.0004087979471696016, + "loss": 1.3991, + "step": 507 + }, + { + "epoch": 7.94, + "learning_rate": 0.0004083985255365126, + "loss": 1.4259, + "step": 508 + }, + { + "epoch": 7.95, + "learning_rate": 0.0004079984272542597, + "loss": 1.4068, + "step": 509 + }, + { + "epoch": 7.97, + "learning_rate": 0.00040759765403198877, + "loss": 1.3935, + "step": 510 + }, + { + "epoch": 7.98, + "learning_rate": 0.00040719620758172863, + "loss": 1.4305, + "step": 511 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040679408961838426, + "loss": 1.4299, + "step": 512 + }, + { + "epoch": 8.02, + "learning_rate": 0.00040639130185972903, + "loss": 1.4568, + "step": 513 + }, + { + "epoch": 8.03, + "learning_rate": 0.00040598784602639777, + "loss": 1.3649, + "step": 514 + }, + { + "epoch": 8.05, + "learning_rate": 0.000405583723841879, + "loss": 1.417, + "step": 515 + }, + { + "epoch": 8.06, + "learning_rate": 0.0004051789370325078, + "loss": 1.3732, + "step": 516 + }, + { + "epoch": 8.08, + "learning_rate": 0.00040477348732745853, + "loss": 1.346, + "step": 517 + }, + { + "epoch": 8.09, + "learning_rate": 0.00040436737645873716, + "loss": 1.3935, + "step": 518 + }, + { + "epoch": 8.11, + "learning_rate": 0.00040396060616117423, + "loss": 1.3829, + "step": 519 + }, + { + "epoch": 8.12, + "learning_rate": 0.000403553178172417, + "loss": 1.3617, + "step": 520 + }, + { + "epoch": 8.14, + "learning_rate": 0.0004031450942329224, + "loss": 1.3344, + "step": 521 + }, + { + "epoch": 8.16, + "learning_rate": 0.0004027363560859494, + "loss": 1.3731, + "step": 522 + }, + { + "epoch": 8.17, + "learning_rate": 0.00040232696547755174, + "loss": 1.367, + "step": 523 + }, + { + "epoch": 8.19, + "learning_rate": 0.0004019169241565703, + "loss": 1.4125, + "step": 524 + }, + { + "epoch": 8.2, + "learning_rate": 0.0004015062338746256, + "loss": 1.3033, + "step": 525 + }, + { + "epoch": 8.2, + "eval_gen_len": 824.66, + "eval_loss": 1.549867868423462, + "eval_rouge1": 52.7839, + "eval_rouge2": 21.589, + "eval_rougeL": 24.5617, + "eval_rougeLsum": 49.8609, + "eval_runtime": 900.6753, + "eval_samples_per_second": 0.278, + "eval_steps_per_second": 0.278, + "step": 525 + }, + { + "epoch": 8.22, + "learning_rate": 0.0004010948963861104, + "loss": 1.3803, + "step": 526 + }, + { + "epoch": 8.23, + "learning_rate": 0.0004006829134481824, + "loss": 1.3491, + "step": 527 + }, + { + "epoch": 8.25, + "learning_rate": 0.00040027028682075626, + "loss": 1.4008, + "step": 528 + }, + { + "epoch": 8.27, + "learning_rate": 0.00039985701826649665, + "loss": 1.3536, + "step": 529 + }, + { + "epoch": 8.28, + "learning_rate": 0.0003994431095508102, + "loss": 1.2883, + "step": 530 + }, + { + "epoch": 8.3, + "learning_rate": 0.0003990285624418384, + "loss": 1.356, + "step": 531 + }, + { + "epoch": 8.31, + "learning_rate": 0.00039861337871044953, + "loss": 1.3975, + "step": 532 + }, + { + "epoch": 8.33, + "learning_rate": 0.0003981975601302317, + "loss": 1.3431, + "step": 533 + }, + { + "epoch": 8.34, + "learning_rate": 0.00039778110847748485, + "loss": 1.3478, + "step": 534 + }, + { + "epoch": 8.36, + "learning_rate": 0.00039736402553121336, + "loss": 1.3866, + "step": 535 + }, + { + "epoch": 8.37, + "learning_rate": 0.0003969463130731183, + "loss": 1.4443, + "step": 536 + }, + { + "epoch": 8.39, + "learning_rate": 0.0003965279728875899, + "loss": 1.3758, + "step": 537 + }, + { + "epoch": 8.41, + "learning_rate": 0.0003961090067617, + "loss": 1.3314, + "step": 538 + }, + { + "epoch": 8.42, + "learning_rate": 0.00039568941648519416, + "loss": 1.3783, + "step": 539 + }, + { + "epoch": 8.44, + "learning_rate": 0.0003952692038504846, + "loss": 1.4518, + "step": 540 + }, + { + "epoch": 8.45, + "learning_rate": 0.00039484837065264155, + "loss": 1.3759, + "step": 541 + }, + { + "epoch": 8.47, + "learning_rate": 0.00039442691868938674, + "loss": 1.3526, + "step": 542 + }, + { + "epoch": 8.48, + "learning_rate": 0.00039400484976108473, + "loss": 1.3463, + "step": 543 + }, + { + "epoch": 8.5, + "learning_rate": 0.0003935821656707359, + "loss": 1.3783, + "step": 544 + }, + { + "epoch": 8.52, + "learning_rate": 0.0003931588682239684, + "loss": 1.4501, + "step": 545 + }, + { + "epoch": 8.53, + "learning_rate": 0.00039273495922903046, + "loss": 1.3807, + "step": 546 + }, + { + "epoch": 8.55, + "learning_rate": 0.00039231044049678287, + "loss": 1.3869, + "step": 547 + }, + { + "epoch": 8.56, + "learning_rate": 0.0003918853138406909, + "loss": 1.3862, + "step": 548 + }, + { + "epoch": 8.58, + "learning_rate": 0.0003914595810768171, + "loss": 1.3999, + "step": 549 + }, + { + "epoch": 8.59, + "learning_rate": 0.0003910332440238128, + "loss": 1.3673, + "step": 550 + }, + { + "epoch": 8.59, + "eval_gen_len": 807.348, + "eval_loss": 1.5529675483703613, + "eval_rouge1": 53.2339, + "eval_rouge2": 22.152, + "eval_rougeL": 24.7587, + "eval_rougeLsum": 50.2502, + "eval_runtime": 882.3041, + "eval_samples_per_second": 0.283, + "eval_steps_per_second": 0.283, + "step": 550 + }, + { + "epoch": 8.61, + "learning_rate": 0.00039060630450291094, + "loss": 1.3473, + "step": 551 + }, + { + "epoch": 8.62, + "learning_rate": 0.00039017876433791824, + "loss": 1.3673, + "step": 552 + }, + { + "epoch": 8.64, + "learning_rate": 0.00038975062535520687, + "loss": 1.3757, + "step": 553 + }, + { + "epoch": 8.66, + "learning_rate": 0.00038932188938370745, + "loss": 1.3811, + "step": 554 + }, + { + "epoch": 8.67, + "learning_rate": 0.00038889255825490053, + "loss": 1.4296, + "step": 555 + }, + { + "epoch": 8.69, + "learning_rate": 0.00038846263380280934, + "loss": 1.3868, + "step": 556 + }, + { + "epoch": 8.7, + "learning_rate": 0.00038803211786399126, + "loss": 1.3756, + "step": 557 + }, + { + "epoch": 8.72, + "learning_rate": 0.0003876010122775309, + "loss": 1.3312, + "step": 558 + }, + { + "epoch": 8.73, + "learning_rate": 0.00038716931888503126, + "loss": 1.3168, + "step": 559 + }, + { + "epoch": 8.75, + "learning_rate": 0.00038673703953060677, + "loss": 1.3538, + "step": 560 + }, + { + "epoch": 8.76, + "learning_rate": 0.00038630417606087456, + "loss": 1.4301, + "step": 561 + }, + { + "epoch": 8.78, + "learning_rate": 0.00038587073032494734, + "loss": 1.4171, + "step": 562 + }, + { + "epoch": 8.8, + "learning_rate": 0.000385436704174425, + "loss": 1.3681, + "step": 563 + }, + { + "epoch": 8.81, + "learning_rate": 0.0003850020994633868, + "loss": 1.4067, + "step": 564 + }, + { + "epoch": 8.83, + "learning_rate": 0.0003845669180483836, + "loss": 1.3763, + "step": 565 + }, + { + "epoch": 8.84, + "learning_rate": 0.0003841311617884299, + "loss": 1.3688, + "step": 566 + }, + { + "epoch": 8.86, + "learning_rate": 0.0003836948325449956, + "loss": 1.3775, + "step": 567 + }, + { + "epoch": 8.87, + "learning_rate": 0.00038325793218199844, + "loss": 1.4386, + "step": 568 + }, + { + "epoch": 8.89, + "learning_rate": 0.00038282046256579597, + "loss": 1.401, + "step": 569 + }, + { + "epoch": 8.91, + "learning_rate": 0.00038238242556517724, + "loss": 1.3611, + "step": 570 + }, + { + "epoch": 8.92, + "learning_rate": 0.00038194382305135525, + "loss": 1.3689, + "step": 571 + }, + { + "epoch": 8.94, + "learning_rate": 0.00038150465689795857, + "loss": 1.3925, + "step": 572 + }, + { + "epoch": 8.95, + "learning_rate": 0.00038106492898102386, + "loss": 1.4082, + "step": 573 + }, + { + "epoch": 8.97, + "learning_rate": 0.0003806246411789872, + "loss": 1.4461, + "step": 574 + }, + { + "epoch": 8.98, + "learning_rate": 0.00038018379537267666, + "loss": 1.3634, + "step": 575 + }, + { + "epoch": 8.98, + "eval_gen_len": 767.952, + "eval_loss": 1.5457568168640137, + "eval_rouge1": 53.0293, + "eval_rouge2": 22.3194, + "eval_rougeL": 25.174, + "eval_rougeLsum": 50.078, + "eval_runtime": 842.6599, + "eval_samples_per_second": 0.297, + "eval_steps_per_second": 0.297, + "step": 575 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003797423934453038, + "loss": 1.3482, + "step": 576 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003793004372824559, + "loss": 1.406, + "step": 577 + }, + { + "epoch": 9.03, + "learning_rate": 0.0003788579287720878, + "loss": 1.3285, + "step": 578 + }, + { + "epoch": 9.05, + "learning_rate": 0.000378414869804514, + "loss": 1.3259, + "step": 579 + }, + { + "epoch": 9.06, + "learning_rate": 0.00037797126227240025, + "loss": 1.2688, + "step": 580 + }, + { + "epoch": 9.08, + "learning_rate": 0.00037752710807075585, + "loss": 1.3329, + "step": 581 + }, + { + "epoch": 9.09, + "learning_rate": 0.00037708240909692516, + "loss": 1.337, + "step": 582 + }, + { + "epoch": 9.11, + "learning_rate": 0.00037663716725058, + "loss": 1.3956, + "step": 583 + }, + { + "epoch": 9.12, + "learning_rate": 0.000376191384433711, + "loss": 1.4836, + "step": 584 + }, + { + "epoch": 9.14, + "learning_rate": 0.00037574506255061966, + "loss": 1.3266, + "step": 585 + }, + { + "epoch": 9.16, + "learning_rate": 0.00037529820350791065, + "loss": 1.2573, + "step": 586 + }, + { + "epoch": 9.17, + "learning_rate": 0.0003748508092144828, + "loss": 1.3676, + "step": 587 + }, + { + "epoch": 9.19, + "learning_rate": 0.00037440288158152184, + "loss": 1.3651, + "step": 588 + }, + { + "epoch": 9.2, + "learning_rate": 0.00037395442252249157, + "loss": 1.3189, + "step": 589 + }, + { + "epoch": 9.22, + "learning_rate": 0.00037350543395312603, + "loss": 1.338, + "step": 590 + }, + { + "epoch": 9.23, + "learning_rate": 0.0003730559177914212, + "loss": 1.3612, + "step": 591 + }, + { + "epoch": 9.25, + "learning_rate": 0.00037260587595762705, + "loss": 1.3696, + "step": 592 + }, + { + "epoch": 9.27, + "learning_rate": 0.0003721553103742388, + "loss": 1.3936, + "step": 593 + }, + { + "epoch": 9.28, + "learning_rate": 0.0003717042229659891, + "loss": 1.3243, + "step": 594 + }, + { + "epoch": 9.3, + "learning_rate": 0.0003712526156598399, + "loss": 1.3273, + "step": 595 + }, + { + "epoch": 9.31, + "learning_rate": 0.00037080049038497407, + "loss": 1.345, + "step": 596 + }, + { + "epoch": 9.33, + "learning_rate": 0.0003703478490727868, + "loss": 1.339, + "step": 597 + }, + { + "epoch": 9.34, + "learning_rate": 0.00036989469365687815, + "loss": 1.4087, + "step": 598 + }, + { + "epoch": 9.36, + "learning_rate": 0.00036944102607304404, + "loss": 1.3427, + "step": 599 + }, + { + "epoch": 9.37, + "learning_rate": 0.0003689868482592684, + "loss": 1.3095, + "step": 600 + }, + { + "epoch": 9.37, + "eval_gen_len": 856.252, + "eval_loss": 1.5412006378173828, + "eval_rouge1": 53.7658, + "eval_rouge2": 22.5229, + "eval_rougeL": 25.0448, + "eval_rougeLsum": 50.708, + "eval_runtime": 931.7127, + "eval_samples_per_second": 0.268, + "eval_steps_per_second": 0.268, + "step": 600 + }, + { + "epoch": 9.39, + "learning_rate": 0.0003685321621557147, + "loss": 1.3381, + "step": 601 + }, + { + "epoch": 9.41, + "learning_rate": 0.00036807696970471784, + "loss": 1.3336, + "step": 602 + }, + { + "epoch": 9.42, + "learning_rate": 0.00036762127285077544, + "loss": 1.403, + "step": 603 + }, + { + "epoch": 9.44, + "learning_rate": 0.0003671650735405404, + "loss": 1.3572, + "step": 604 + }, + { + "epoch": 9.45, + "learning_rate": 0.00036670837372281136, + "loss": 1.3604, + "step": 605 + }, + { + "epoch": 9.47, + "learning_rate": 0.0003662511753485256, + "loss": 1.3897, + "step": 606 + }, + { + "epoch": 9.48, + "learning_rate": 0.0003657934803707497, + "loss": 1.3179, + "step": 607 + }, + { + "epoch": 9.5, + "learning_rate": 0.000365335290744672, + "loss": 1.3702, + "step": 608 + }, + { + "epoch": 9.52, + "learning_rate": 0.00036487660842759355, + "loss": 1.3762, + "step": 609 + }, + { + "epoch": 9.53, + "learning_rate": 0.00036441743537892043, + "loss": 1.3296, + "step": 610 + }, + { + "epoch": 9.55, + "learning_rate": 0.0003639577735601548, + "loss": 1.3171, + "step": 611 + }, + { + "epoch": 9.56, + "learning_rate": 0.00036349762493488667, + "loss": 1.3241, + "step": 612 + }, + { + "epoch": 9.58, + "learning_rate": 0.000363036991468786, + "loss": 1.3366, + "step": 613 + }, + { + "epoch": 9.59, + "learning_rate": 0.00036257587512959343, + "loss": 1.3323, + "step": 614 + }, + { + "epoch": 9.61, + "learning_rate": 0.0003621142778871127, + "loss": 1.3814, + "step": 615 + }, + { + "epoch": 9.62, + "learning_rate": 0.00036165220171320166, + "loss": 1.3978, + "step": 616 + }, + { + "epoch": 9.64, + "learning_rate": 0.00036118964858176413, + "loss": 1.339, + "step": 617 + }, + { + "epoch": 9.66, + "learning_rate": 0.00036072662046874145, + "loss": 1.3834, + "step": 618 + }, + { + "epoch": 9.67, + "learning_rate": 0.00036026311935210393, + "loss": 1.3568, + "step": 619 + }, + { + "epoch": 9.69, + "learning_rate": 0.0003597991472118426, + "loss": 1.4211, + "step": 620 + }, + { + "epoch": 9.7, + "learning_rate": 0.00035933470602996035, + "loss": 1.3621, + "step": 621 + }, + { + "epoch": 9.72, + "learning_rate": 0.000358869797790464, + "loss": 1.3238, + "step": 622 + }, + { + "epoch": 9.73, + "learning_rate": 0.0003584044244793555, + "loss": 1.3429, + "step": 623 + }, + { + "epoch": 9.75, + "learning_rate": 0.0003579385880846232, + "loss": 1.3871, + "step": 624 + }, + { + "epoch": 9.76, + "learning_rate": 0.0003574722905962342, + "loss": 1.3492, + "step": 625 + }, + { + "epoch": 9.76, + "eval_gen_len": 826.064, + "eval_loss": 1.5389072895050049, + "eval_rouge1": 51.8662, + "eval_rouge2": 21.6229, + "eval_rougeL": 24.6819, + "eval_rougeLsum": 48.8648, + "eval_runtime": 900.6261, + "eval_samples_per_second": 0.278, + "eval_steps_per_second": 0.278, + "step": 625 + }, + { + "epoch": 9.78, + "learning_rate": 0.0003570055340061248, + "loss": 1.3172, + "step": 626 + }, + { + "epoch": 9.8, + "learning_rate": 0.0003565383203081928, + "loss": 1.3828, + "step": 627 + }, + { + "epoch": 9.81, + "learning_rate": 0.0003560706514982884, + "loss": 1.4252, + "step": 628 + }, + { + "epoch": 9.83, + "learning_rate": 0.0003556025295742065, + "loss": 1.3182, + "step": 629 + }, + { + "epoch": 9.84, + "learning_rate": 0.0003551339565356769, + "loss": 1.3041, + "step": 630 + }, + { + "epoch": 9.86, + "learning_rate": 0.00035466493438435703, + "loss": 1.359, + "step": 631 + }, + { + "epoch": 9.87, + "learning_rate": 0.0003541954651238226, + "loss": 1.365, + "step": 632 + }, + { + "epoch": 9.89, + "learning_rate": 0.00035372555075955937, + "loss": 1.2863, + "step": 633 + }, + { + "epoch": 9.91, + "learning_rate": 0.0003532551932989544, + "loss": 1.3145, + "step": 634 + }, + { + "epoch": 9.92, + "learning_rate": 0.0003527843947512878, + "loss": 1.4008, + "step": 635 + }, + { + "epoch": 9.94, + "learning_rate": 0.00035231315712772347, + "loss": 1.2704, + "step": 636 + }, + { + "epoch": 9.95, + "learning_rate": 0.0003518414824413017, + "loss": 1.3213, + "step": 637 + }, + { + "epoch": 9.97, + "learning_rate": 0.00035136937270692893, + "loss": 1.3692, + "step": 638 + }, + { + "epoch": 9.98, + "learning_rate": 0.0003508968299413708, + "loss": 1.3404, + "step": 639 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003504238561632424, + "loss": 1.3378, + "step": 640 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003499504533930001, + "loss": 1.3609, + "step": 641 + }, + { + "epoch": 10.03, + "learning_rate": 0.0003494766236529329, + "loss": 1.3558, + "step": 642 + }, + { + "epoch": 10.05, + "learning_rate": 0.00034900236896715355, + "loss": 1.3094, + "step": 643 + }, + { + "epoch": 10.06, + "learning_rate": 0.0003485276913615905, + "loss": 1.3134, + "step": 644 + }, + { + "epoch": 10.08, + "learning_rate": 0.00034805259286397824, + "loss": 1.3176, + "step": 645 + }, + { + "epoch": 10.09, + "learning_rate": 0.00034757707550384976, + "loss": 1.3581, + "step": 646 + }, + { + "epoch": 10.11, + "learning_rate": 0.0003471011413125271, + "loss": 1.3142, + "step": 647 + }, + { + "epoch": 10.12, + "learning_rate": 0.00034662479232311306, + "loss": 1.2926, + "step": 648 + }, + { + "epoch": 10.14, + "learning_rate": 0.0003461480305704822, + "loss": 1.2874, + "step": 649 + }, + { + "epoch": 10.16, + "learning_rate": 0.0003456708580912725, + "loss": 1.3007, + "step": 650 + }, + { + "epoch": 10.16, + "eval_gen_len": 843.544, + "eval_loss": 1.5403586626052856, + "eval_rouge1": 53.6692, + "eval_rouge2": 22.154, + "eval_rougeL": 24.6218, + "eval_rougeLsum": 50.6864, + "eval_runtime": 918.5032, + "eval_samples_per_second": 0.272, + "eval_steps_per_second": 0.272, + "step": 650 + }, + { + "epoch": 10.17, + "learning_rate": 0.0003451932769238763, + "loss": 1.311, + "step": 651 + }, + { + "epoch": 10.19, + "learning_rate": 0.0003447152891084319, + "loss": 1.3283, + "step": 652 + }, + { + "epoch": 10.2, + "learning_rate": 0.0003442368966868149, + "loss": 1.3141, + "step": 653 + }, + { + "epoch": 10.22, + "learning_rate": 0.0003437581017026289, + "loss": 1.3405, + "step": 654 + }, + { + "epoch": 10.23, + "learning_rate": 0.00034327890620119755, + "loss": 1.3586, + "step": 655 + }, + { + "epoch": 10.25, + "learning_rate": 0.00034279931222955517, + "loss": 1.2894, + "step": 656 + }, + { + "epoch": 10.27, + "learning_rate": 0.0003423193218364385, + "loss": 1.2816, + "step": 657 + }, + { + "epoch": 10.28, + "learning_rate": 0.0003418389370722775, + "loss": 1.2884, + "step": 658 + }, + { + "epoch": 10.3, + "learning_rate": 0.00034135815998918693, + "loss": 1.331, + "step": 659 + }, + { + "epoch": 10.31, + "learning_rate": 0.00034087699264095743, + "loss": 1.3353, + "step": 660 + }, + { + "epoch": 10.33, + "learning_rate": 0.00034039543708304696, + "loss": 1.3334, + "step": 661 + }, + { + "epoch": 10.34, + "learning_rate": 0.00033991349537257137, + "loss": 1.318, + "step": 662 + }, + { + "epoch": 10.36, + "learning_rate": 0.00033943116956829664, + "loss": 1.2815, + "step": 663 + }, + { + "epoch": 10.37, + "learning_rate": 0.00033894846173062915, + "loss": 1.3458, + "step": 664 + }, + { + "epoch": 10.39, + "learning_rate": 0.0003384653739216074, + "loss": 1.3625, + "step": 665 + }, + { + "epoch": 10.41, + "learning_rate": 0.00033798190820489313, + "loss": 1.3976, + "step": 666 + }, + { + "epoch": 10.42, + "learning_rate": 0.0003374980666457621, + "loss": 1.3768, + "step": 667 + }, + { + "epoch": 10.44, + "learning_rate": 0.00033701385131109616, + "loss": 1.2797, + "step": 668 + }, + { + "epoch": 10.45, + "learning_rate": 0.0003365292642693733, + "loss": 1.3, + "step": 669 + }, + { + "epoch": 10.47, + "learning_rate": 0.00033604430759065967, + "loss": 1.331, + "step": 670 + }, + { + "epoch": 10.48, + "learning_rate": 0.0003355589833466004, + "loss": 1.3494, + "step": 671 + }, + { + "epoch": 10.5, + "learning_rate": 0.0003350732936104108, + "loss": 1.2914, + "step": 672 + }, + { + "epoch": 10.52, + "learning_rate": 0.0003345872404568674, + "loss": 1.3664, + "step": 673 + }, + { + "epoch": 10.53, + "learning_rate": 0.0003341008259622993, + "loss": 1.3086, + "step": 674 + }, + { + "epoch": 10.55, + "learning_rate": 0.00033361405220457913, + "loss": 1.2729, + "step": 675 + }, + { + "epoch": 10.55, + "eval_gen_len": 808.764, + "eval_loss": 1.5428452491760254, + "eval_rouge1": 54.6479, + "eval_rouge2": 23.3029, + "eval_rougeL": 25.5647, + "eval_rougeLsum": 51.6394, + "eval_runtime": 883.7152, + "eval_samples_per_second": 0.283, + "eval_steps_per_second": 0.283, + "step": 675 + }, + { + "epoch": 10.56, + "learning_rate": 0.00033312692126311424, + "loss": 1.2774, + "step": 676 + }, + { + "epoch": 10.58, + "learning_rate": 0.00033263943521883766, + "loss": 1.3757, + "step": 677 + }, + { + "epoch": 10.59, + "learning_rate": 0.0003321515961541997, + "loss": 1.3389, + "step": 678 + }, + { + "epoch": 10.61, + "learning_rate": 0.00033166340615315836, + "loss": 1.3292, + "step": 679 + }, + { + "epoch": 10.62, + "learning_rate": 0.00033117486730117093, + "loss": 1.3003, + "step": 680 + }, + { + "epoch": 10.64, + "learning_rate": 0.00033068598168518485, + "loss": 1.2605, + "step": 681 + }, + { + "epoch": 10.66, + "learning_rate": 0.000330196751393629, + "loss": 1.2984, + "step": 682 + }, + { + "epoch": 10.67, + "learning_rate": 0.0003297071785164045, + "loss": 1.3517, + "step": 683 + }, + { + "epoch": 10.69, + "learning_rate": 0.0003292172651448761, + "loss": 1.3402, + "step": 684 + }, + { + "epoch": 10.7, + "learning_rate": 0.00032872701337186293, + "loss": 1.2718, + "step": 685 + }, + { + "epoch": 10.72, + "learning_rate": 0.0003282364252916298, + "loss": 1.3146, + "step": 686 + }, + { + "epoch": 10.73, + "learning_rate": 0.0003277455029998781, + "loss": 1.2862, + "step": 687 + }, + { + "epoch": 10.75, + "learning_rate": 0.00032725424859373687, + "loss": 1.3264, + "step": 688 + }, + { + "epoch": 10.76, + "learning_rate": 0.0003267626641717541, + "loss": 1.3557, + "step": 689 + }, + { + "epoch": 10.78, + "learning_rate": 0.0003262707518338872, + "loss": 1.3663, + "step": 690 + }, + { + "epoch": 10.8, + "learning_rate": 0.0003257785136814948, + "loss": 1.3282, + "step": 691 + }, + { + "epoch": 10.81, + "learning_rate": 0.0003252859518173269, + "loss": 1.3846, + "step": 692 + }, + { + "epoch": 10.83, + "learning_rate": 0.00032479306834551667, + "loss": 1.2832, + "step": 693 + }, + { + "epoch": 10.84, + "learning_rate": 0.00032429986537157096, + "loss": 1.2998, + "step": 694 + }, + { + "epoch": 10.86, + "learning_rate": 0.0003238063450023617, + "loss": 1.3911, + "step": 695 + }, + { + "epoch": 10.87, + "learning_rate": 0.0003233125093461162, + "loss": 1.3407, + "step": 696 + }, + { + "epoch": 10.89, + "learning_rate": 0.0003228183605124092, + "loss": 1.358, + "step": 697 + }, + { + "epoch": 10.91, + "learning_rate": 0.0003223239006121528, + "loss": 1.3951, + "step": 698 + }, + { + "epoch": 10.92, + "learning_rate": 0.0003218291317575882, + "loss": 1.3062, + "step": 699 + }, + { + "epoch": 10.94, + "learning_rate": 0.00032133405606227635, + "loss": 1.3758, + "step": 700 + }, + { + "epoch": 10.94, + "eval_gen_len": 800.152, + "eval_loss": 1.5403379201889038, + "eval_rouge1": 54.9418, + "eval_rouge2": 23.3323, + "eval_rougeL": 25.6087, + "eval_rougeLsum": 51.9256, + "eval_runtime": 874.0945, + "eval_samples_per_second": 0.286, + "eval_steps_per_second": 0.286, + "step": 700 + }, + { + "epoch": 10.95, + "learning_rate": 0.00032083867564108886, + "loss": 1.3445, + "step": 701 + }, + { + "epoch": 10.97, + "learning_rate": 0.0003203429926101991, + "loss": 1.3103, + "step": 702 + }, + { + "epoch": 10.98, + "learning_rate": 0.00031984700908707333, + "loss": 1.2862, + "step": 703 + }, + { + "epoch": 11.0, + "learning_rate": 0.00031935072719046115, + "loss": 1.3072, + "step": 704 + }, + { + "epoch": 11.02, + "learning_rate": 0.000318854149040387, + "loss": 1.3737, + "step": 705 + }, + { + "epoch": 11.03, + "learning_rate": 0.0003183572767581406, + "loss": 1.2964, + "step": 706 + }, + { + "epoch": 11.05, + "learning_rate": 0.00031786011246626855, + "loss": 1.3451, + "step": 707 + }, + { + "epoch": 11.06, + "learning_rate": 0.00031736265828856446, + "loss": 1.2713, + "step": 708 + }, + { + "epoch": 11.08, + "learning_rate": 0.00031686491635006054, + "loss": 1.3316, + "step": 709 + }, + { + "epoch": 11.09, + "learning_rate": 0.0003163668887770181, + "loss": 1.2551, + "step": 710 + }, + { + "epoch": 11.11, + "learning_rate": 0.0003158685776969187, + "loss": 1.2511, + "step": 711 + }, + { + "epoch": 11.12, + "learning_rate": 0.00031536998523845497, + "loss": 1.2933, + "step": 712 + }, + { + "epoch": 11.14, + "learning_rate": 0.0003148711135315215, + "loss": 1.2744, + "step": 713 + }, + { + "epoch": 11.16, + "learning_rate": 0.00031437196470720597, + "loss": 1.3624, + "step": 714 + }, + { + "epoch": 11.17, + "learning_rate": 0.00031387254089777967, + "loss": 1.232, + "step": 715 + }, + { + "epoch": 11.19, + "learning_rate": 0.0003133728442366885, + "loss": 1.2982, + "step": 716 + }, + { + "epoch": 11.2, + "learning_rate": 0.00031287287685854417, + "loss": 1.3225, + "step": 717 + }, + { + "epoch": 11.22, + "learning_rate": 0.00031237264089911465, + "loss": 1.3021, + "step": 718 + }, + { + "epoch": 11.23, + "learning_rate": 0.0003118721384953153, + "loss": 1.3114, + "step": 719 + }, + { + "epoch": 11.25, + "learning_rate": 0.0003113713717851998, + "loss": 1.2986, + "step": 720 + }, + { + "epoch": 11.27, + "learning_rate": 0.0003108703429079506, + "loss": 1.2522, + "step": 721 + }, + { + "epoch": 11.28, + "learning_rate": 0.0003103690540038705, + "loss": 1.3075, + "step": 722 + }, + { + "epoch": 11.3, + "learning_rate": 0.0003098675072143727, + "loss": 1.3449, + "step": 723 + }, + { + "epoch": 11.31, + "learning_rate": 0.00030936570468197215, + "loss": 1.2293, + "step": 724 + }, + { + "epoch": 11.33, + "learning_rate": 0.00030886364855027646, + "loss": 1.3357, + "step": 725 + }, + { + "epoch": 11.33, + "eval_gen_len": 814.496, + "eval_loss": 1.5454789400100708, + "eval_rouge1": 55.2511, + "eval_rouge2": 23.5606, + "eval_rougeL": 25.8237, + "eval_rougeLsum": 52.3183, + "eval_runtime": 889.3375, + "eval_samples_per_second": 0.281, + "eval_steps_per_second": 0.281, + "step": 725 + }, + { + "epoch": 11.34, + "learning_rate": 0.0003083613409639764, + "loss": 1.3662, + "step": 726 + }, + { + "epoch": 11.36, + "learning_rate": 0.0003078587840688368, + "loss": 1.3794, + "step": 727 + }, + { + "epoch": 11.37, + "learning_rate": 0.0003073559800116879, + "loss": 1.2878, + "step": 728 + }, + { + "epoch": 11.39, + "learning_rate": 0.00030685293094041534, + "loss": 1.2361, + "step": 729 + }, + { + "epoch": 11.41, + "learning_rate": 0.00030634963900395156, + "loss": 1.3076, + "step": 730 + }, + { + "epoch": 11.42, + "learning_rate": 0.00030584610635226654, + "loss": 1.3512, + "step": 731 + }, + { + "epoch": 11.44, + "learning_rate": 0.0003053423351363586, + "loss": 1.2882, + "step": 732 + }, + { + "epoch": 11.45, + "learning_rate": 0.00030483832750824495, + "loss": 1.3675, + "step": 733 + }, + { + "epoch": 11.47, + "learning_rate": 0.0003043340856209529, + "loss": 1.3085, + "step": 734 + }, + { + "epoch": 11.48, + "learning_rate": 0.0003038296116285103, + "loss": 1.2838, + "step": 735 + }, + { + "epoch": 11.5, + "learning_rate": 0.0003033249076859367, + "loss": 1.3298, + "step": 736 + }, + { + "epoch": 11.52, + "learning_rate": 0.0003028199759492339, + "loss": 1.3187, + "step": 737 + }, + { + "epoch": 11.53, + "learning_rate": 0.0003023148185753766, + "loss": 1.3395, + "step": 738 + }, + { + "epoch": 11.55, + "learning_rate": 0.0003018094377223037, + "loss": 1.3484, + "step": 739 + }, + { + "epoch": 11.56, + "learning_rate": 0.00030130383554890854, + "loss": 1.2969, + "step": 740 + }, + { + "epoch": 11.58, + "learning_rate": 0.00030079801421503005, + "loss": 1.2698, + "step": 741 + }, + { + "epoch": 11.59, + "learning_rate": 0.0003002919758814431, + "loss": 1.3265, + "step": 742 + }, + { + "epoch": 11.61, + "learning_rate": 0.00029978572270984987, + "loss": 1.3148, + "step": 743 + }, + { + "epoch": 11.62, + "learning_rate": 0.00029927925686287006, + "loss": 1.2497, + "step": 744 + }, + { + "epoch": 11.64, + "learning_rate": 0.0002987725805040321, + "loss": 1.3015, + "step": 745 + }, + { + "epoch": 11.66, + "learning_rate": 0.0002982656957977634, + "loss": 1.3065, + "step": 746 + }, + { + "epoch": 11.67, + "learning_rate": 0.0002977586049093816, + "loss": 1.2443, + "step": 747 + }, + { + "epoch": 11.69, + "learning_rate": 0.0002972513100050851, + "loss": 1.303, + "step": 748 + }, + { + "epoch": 11.7, + "learning_rate": 0.00029674381325194373, + "loss": 1.3074, + "step": 749 + }, + { + "epoch": 11.72, + "learning_rate": 0.00029623611681788963, + "loss": 1.2817, + "step": 750 + }, + { + "epoch": 11.72, + "eval_gen_len": 811.144, + "eval_loss": 1.5412460565567017, + "eval_rouge1": 55.2847, + "eval_rouge2": 23.6632, + "eval_rougeL": 25.9341, + "eval_rougeLsum": 52.3146, + "eval_runtime": 886.0701, + "eval_samples_per_second": 0.282, + "eval_steps_per_second": 0.282, + "step": 750 + }, + { + "epoch": 11.73, + "learning_rate": 0.00029572822287170794, + "loss": 1.2607, + "step": 751 + }, + { + "epoch": 11.75, + "learning_rate": 0.0002952201335830275, + "loss": 1.2668, + "step": 752 + }, + { + "epoch": 11.76, + "learning_rate": 0.0002947118511223117, + "loss": 1.3079, + "step": 753 + }, + { + "epoch": 11.78, + "learning_rate": 0.00029420337766084903, + "loss": 1.2802, + "step": 754 + }, + { + "epoch": 11.8, + "learning_rate": 0.0002936947153707439, + "loss": 1.3151, + "step": 755 + }, + { + "epoch": 11.81, + "learning_rate": 0.00029318586642490763, + "loss": 1.2845, + "step": 756 + }, + { + "epoch": 11.83, + "learning_rate": 0.0002926768329970484, + "loss": 1.2877, + "step": 757 + }, + { + "epoch": 11.84, + "learning_rate": 0.00029216761726166287, + "loss": 1.2546, + "step": 758 + }, + { + "epoch": 11.86, + "learning_rate": 0.0002916582213940263, + "loss": 1.253, + "step": 759 + }, + { + "epoch": 11.87, + "learning_rate": 0.0002911486475701835, + "loss": 1.3328, + "step": 760 + }, + { + "epoch": 11.89, + "learning_rate": 0.00029063889796693954, + "loss": 1.309, + "step": 761 + }, + { + "epoch": 11.91, + "learning_rate": 0.0002901289747618502, + "loss": 1.3281, + "step": 762 + }, + { + "epoch": 11.92, + "learning_rate": 0.000289618880133213, + "loss": 1.287, + "step": 763 + }, + { + "epoch": 11.94, + "learning_rate": 0.00028910861626005774, + "loss": 1.3711, + "step": 764 + }, + { + "epoch": 11.95, + "learning_rate": 0.00028859818532213724, + "loss": 1.2575, + "step": 765 + }, + { + "epoch": 11.97, + "learning_rate": 0.00028808758949991776, + "loss": 1.2802, + "step": 766 + }, + { + "epoch": 11.98, + "learning_rate": 0.00028757683097457035, + "loss": 1.3022, + "step": 767 + }, + { + "epoch": 12.0, + "learning_rate": 0.0002870659119279605, + "loss": 1.2998, + "step": 768 + }, + { + "epoch": 12.02, + "learning_rate": 0.00028655483454264, + "loss": 1.3663, + "step": 769 + }, + { + "epoch": 12.03, + "learning_rate": 0.00028604360100183666, + "loss": 1.3068, + "step": 770 + }, + { + "epoch": 12.05, + "learning_rate": 0.00028553221348944565, + "loss": 1.2841, + "step": 771 + }, + { + "epoch": 12.06, + "learning_rate": 0.0002850206741900195, + "loss": 1.315, + "step": 772 + }, + { + "epoch": 12.08, + "learning_rate": 0.0002845089852887595, + "loss": 1.267, + "step": 773 + }, + { + "epoch": 12.09, + "learning_rate": 0.0002839971489715057, + "loss": 1.2197, + "step": 774 + }, + { + "epoch": 12.11, + "learning_rate": 0.0002834851674247282, + "loss": 1.2771, + "step": 775 + }, + { + "epoch": 12.11, + "eval_gen_len": 852.704, + "eval_loss": 1.5449961423873901, + "eval_rouge1": 55.1956, + "eval_rouge2": 23.5545, + "eval_rougeL": 25.677, + "eval_rougeLsum": 52.1841, + "eval_runtime": 928.7433, + "eval_samples_per_second": 0.269, + "eval_steps_per_second": 0.269, + "step": 775 + }, + { + "epoch": 12.12, + "learning_rate": 0.00028297304283551725, + "loss": 1.2442, + "step": 776 + }, + { + "epoch": 12.14, + "learning_rate": 0.00028246077739157435, + "loss": 1.316, + "step": 777 + }, + { + "epoch": 12.16, + "learning_rate": 0.0002819483732812025, + "loss": 1.2588, + "step": 778 + }, + { + "epoch": 12.17, + "learning_rate": 0.0002814358326932972, + "loss": 1.2439, + "step": 779 + }, + { + "epoch": 12.19, + "learning_rate": 0.000280923157817337, + "loss": 1.223, + "step": 780 + }, + { + "epoch": 12.2, + "learning_rate": 0.0002804103508433738, + "loss": 1.3348, + "step": 781 + }, + { + "epoch": 12.22, + "learning_rate": 0.0002798974139620242, + "loss": 1.2156, + "step": 782 + }, + { + "epoch": 12.23, + "learning_rate": 0.00027938434936445943, + "loss": 1.3101, + "step": 783 + }, + { + "epoch": 12.25, + "learning_rate": 0.0002788711592423966, + "loss": 1.2715, + "step": 784 + }, + { + "epoch": 12.27, + "learning_rate": 0.00027835784578808867, + "loss": 1.373, + "step": 785 + }, + { + "epoch": 12.28, + "learning_rate": 0.00027784441119431576, + "loss": 1.2366, + "step": 786 + }, + { + "epoch": 12.3, + "learning_rate": 0.00027733085765437526, + "loss": 1.2454, + "step": 787 + }, + { + "epoch": 12.31, + "learning_rate": 0.00027681718736207296, + "loss": 1.3599, + "step": 788 + }, + { + "epoch": 12.33, + "learning_rate": 0.00027630340251171297, + "loss": 1.2374, + "step": 789 + }, + { + "epoch": 12.34, + "learning_rate": 0.00027578950529808926, + "loss": 1.3201, + "step": 790 + }, + { + "epoch": 12.36, + "learning_rate": 0.00027527549791647537, + "loss": 1.2734, + "step": 791 + }, + { + "epoch": 12.37, + "learning_rate": 0.00027476138256261575, + "loss": 1.2887, + "step": 792 + }, + { + "epoch": 12.39, + "learning_rate": 0.00027424716143271575, + "loss": 1.2598, + "step": 793 + }, + { + "epoch": 12.41, + "learning_rate": 0.0002737328367234331, + "loss": 1.3082, + "step": 794 + }, + { + "epoch": 12.42, + "learning_rate": 0.0002732184106318675, + "loss": 1.2133, + "step": 795 + }, + { + "epoch": 12.44, + "learning_rate": 0.00027270388535555206, + "loss": 1.2794, + "step": 796 + }, + { + "epoch": 12.45, + "learning_rate": 0.00027218926309244333, + "loss": 1.2522, + "step": 797 + }, + { + "epoch": 12.47, + "learning_rate": 0.0002716745460409125, + "loss": 1.3457, + "step": 798 + }, + { + "epoch": 12.48, + "learning_rate": 0.0002711597363997352, + "loss": 1.2477, + "step": 799 + }, + { + "epoch": 12.5, + "learning_rate": 0.00027064483636808314, + "loss": 1.2892, + "step": 800 + }, + { + "epoch": 12.5, + "eval_gen_len": 805.844, + "eval_loss": 1.5368956327438354, + "eval_rouge1": 54.9563, + "eval_rouge2": 23.5105, + "eval_rougeL": 25.8876, + "eval_rougeLsum": 51.9568, + "eval_runtime": 884.5538, + "eval_samples_per_second": 0.283, + "eval_steps_per_second": 0.283, + "step": 800 + }, + { + "epoch": 12.52, + "learning_rate": 0.00027012984814551365, + "loss": 1.3415, + "step": 801 + }, + { + "epoch": 12.53, + "learning_rate": 0.00026961477393196127, + "loss": 1.2526, + "step": 802 + }, + { + "epoch": 12.55, + "learning_rate": 0.00026909961592772746, + "loss": 1.2488, + "step": 803 + }, + { + "epoch": 12.56, + "learning_rate": 0.0002685843763334719, + "loss": 1.2557, + "step": 804 + }, + { + "epoch": 12.58, + "learning_rate": 0.00026806905735020267, + "loss": 1.2802, + "step": 805 + }, + { + "epoch": 12.59, + "learning_rate": 0.00026755366117926706, + "loss": 1.2627, + "step": 806 + }, + { + "epoch": 12.61, + "learning_rate": 0.000267038190022342, + "loss": 1.1703, + "step": 807 + }, + { + "epoch": 12.62, + "learning_rate": 0.00026652264608142484, + "loss": 1.2512, + "step": 808 + }, + { + "epoch": 12.64, + "learning_rate": 0.0002660070315588238, + "loss": 1.3306, + "step": 809 + }, + { + "epoch": 12.66, + "learning_rate": 0.0002654913486571487, + "loss": 1.2447, + "step": 810 + }, + { + "epoch": 12.67, + "learning_rate": 0.00026497559957930117, + "loss": 1.2808, + "step": 811 + }, + { + "epoch": 12.69, + "learning_rate": 0.00026445978652846604, + "loss": 1.248, + "step": 812 + }, + { + "epoch": 12.7, + "learning_rate": 0.0002639439117081009, + "loss": 1.2949, + "step": 813 + }, + { + "epoch": 12.72, + "learning_rate": 0.0002634279773219275, + "loss": 1.2704, + "step": 814 + }, + { + "epoch": 12.73, + "learning_rate": 0.00026291198557392207, + "loss": 1.3479, + "step": 815 + }, + { + "epoch": 12.75, + "learning_rate": 0.0002623959386683056, + "loss": 1.2575, + "step": 816 + }, + { + "epoch": 12.76, + "learning_rate": 0.0002618798388095351, + "loss": 1.2874, + "step": 817 + }, + { + "epoch": 12.78, + "learning_rate": 0.0002613636882022934, + "loss": 1.2523, + "step": 818 + }, + { + "epoch": 12.8, + "learning_rate": 0.00026084748905148044, + "loss": 1.3145, + "step": 819 + }, + { + "epoch": 12.81, + "learning_rate": 0.00026033124356220327, + "loss": 1.2299, + "step": 820 + }, + { + "epoch": 12.83, + "learning_rate": 0.00025981495393976716, + "loss": 1.3173, + "step": 821 + }, + { + "epoch": 12.84, + "learning_rate": 0.0002592986223896656, + "loss": 1.3262, + "step": 822 + }, + { + "epoch": 12.86, + "learning_rate": 0.0002587822511175715, + "loss": 1.2522, + "step": 823 + }, + { + "epoch": 12.87, + "learning_rate": 0.00025826584232932704, + "loss": 1.2762, + "step": 824 + }, + { + "epoch": 12.89, + "learning_rate": 0.0002577493982309352, + "loss": 1.2757, + "step": 825 + }, + { + "epoch": 12.89, + "eval_gen_len": 813.476, + "eval_loss": 1.5466704368591309, + "eval_rouge1": 56.4728, + "eval_rouge2": 24.6875, + "eval_rougeL": 26.4415, + "eval_rougeLsum": 53.4939, + "eval_runtime": 890.3223, + "eval_samples_per_second": 0.281, + "eval_steps_per_second": 0.281, + "step": 825 + }, + { + "epoch": 12.91, + "learning_rate": 0.0002572329210285493, + "loss": 1.2617, + "step": 826 + }, + { + "epoch": 12.92, + "learning_rate": 0.00025671641292846447, + "loss": 1.3063, + "step": 827 + }, + { + "epoch": 12.94, + "learning_rate": 0.00025619987613710756, + "loss": 1.275, + "step": 828 + }, + { + "epoch": 12.95, + "learning_rate": 0.00025568331286102803, + "loss": 1.3496, + "step": 829 + }, + { + "epoch": 12.97, + "learning_rate": 0.00025516672530688864, + "loss": 1.2791, + "step": 830 + }, + { + "epoch": 12.98, + "learning_rate": 0.00025465011568145555, + "loss": 1.268, + "step": 831 + }, + { + "epoch": 13.0, + "learning_rate": 0.0002541334861915897, + "loss": 1.3177, + "step": 832 + }, + { + "epoch": 13.02, + "learning_rate": 0.00025361683904423624, + "loss": 1.3568, + "step": 833 + }, + { + "epoch": 13.03, + "learning_rate": 0.00025310017644641637, + "loss": 1.2423, + "step": 834 + }, + { + "epoch": 13.05, + "learning_rate": 0.00025258350060521686, + "loss": 1.2066, + "step": 835 + }, + { + "epoch": 13.06, + "learning_rate": 0.00025206681372778124, + "loss": 1.2076, + "step": 836 + }, + { + "epoch": 13.08, + "learning_rate": 0.00025155011802130016, + "loss": 1.258, + "step": 837 + }, + { + "epoch": 13.09, + "learning_rate": 0.0002510334156930021, + "loss": 1.2864, + "step": 838 + }, + { + "epoch": 13.11, + "learning_rate": 0.0002505167089501435, + "loss": 1.2762, + "step": 839 + }, + { + "epoch": 13.12, + "learning_rate": 0.00025, + "loss": 1.2388, + "step": 840 + }, + { + "epoch": 13.14, + "learning_rate": 0.0002494832910498565, + "loss": 1.2254, + "step": 841 + }, + { + "epoch": 13.16, + "learning_rate": 0.000248966584306998, + "loss": 1.2315, + "step": 842 + }, + { + "epoch": 13.17, + "learning_rate": 0.0002484498819786998, + "loss": 1.2186, + "step": 843 + }, + { + "epoch": 13.19, + "learning_rate": 0.00024793318627221877, + "loss": 1.27, + "step": 844 + }, + { + "epoch": 13.2, + "learning_rate": 0.0002474164993947832, + "loss": 1.2413, + "step": 845 + }, + { + "epoch": 13.22, + "learning_rate": 0.00024689982355358375, + "loss": 1.2759, + "step": 846 + }, + { + "epoch": 13.23, + "learning_rate": 0.0002463831609557638, + "loss": 1.2606, + "step": 847 + }, + { + "epoch": 13.25, + "learning_rate": 0.0002458665138084104, + "loss": 1.2803, + "step": 848 + }, + { + "epoch": 13.27, + "learning_rate": 0.00024534988431854446, + "loss": 1.343, + "step": 849 + }, + { + "epoch": 13.28, + "learning_rate": 0.0002448332746931115, + "loss": 1.2382, + "step": 850 + }, + { + "epoch": 13.28, + "eval_gen_len": 787.34, + "eval_loss": 1.5448322296142578, + "eval_rouge1": 57.2303, + "eval_rouge2": 24.9705, + "eval_rougeL": 26.8081, + "eval_rougeLsum": 54.2747, + "eval_runtime": 870.1853, + "eval_samples_per_second": 0.287, + "eval_steps_per_second": 0.287, + "step": 850 + } + ], + "max_steps": 1600, + "num_train_epochs": 25, + "total_flos": 1.193214367236096e+18, + "trial_name": null, + "trial_params": null +}