{ "best_metric": 29.1144, "best_model_checkpoint": "./ko-en_mbartLarge_exp20p_linear_alpha/checkpoint-60000", "epoch": 8.81772827474185, "eval_steps": 4000, "global_step": 76000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06, "learning_rate": 5.4920234365935726e-05, "loss": 2.3766, "step": 500 }, { "epoch": 0.12, "learning_rate": 5.484046873187145e-05, "loss": 1.7994, "step": 1000 }, { "epoch": 0.17, "learning_rate": 5.476070309780717e-05, "loss": 1.6333, "step": 1500 }, { "epoch": 0.23, "learning_rate": 5.4680937463742896e-05, "loss": 1.554, "step": 2000 }, { "epoch": 0.29, "learning_rate": 5.460117182967862e-05, "loss": 1.5092, "step": 2500 }, { "epoch": 0.35, "learning_rate": 5.4521406195614344e-05, "loss": 1.4709, "step": 3000 }, { "epoch": 0.41, "learning_rate": 5.444164056155007e-05, "loss": 1.4354, "step": 3500 }, { "epoch": 0.46, "learning_rate": 5.4361874927485784e-05, "loss": 1.404, "step": 4000 }, { "epoch": 0.46, "eval_bleu": 22.5375, "eval_gen_len": 18.6852, "eval_loss": 1.373841404914856, "eval_runtime": 1144.9051, "eval_samples_per_second": 15.055, "eval_steps_per_second": 0.942, "step": 4000 }, { "epoch": 0.52, "learning_rate": 5.4282109293421515e-05, "loss": 1.3753, "step": 4500 }, { "epoch": 0.58, "learning_rate": 5.420234365935724e-05, "loss": 1.3399, "step": 5000 }, { "epoch": 0.64, "learning_rate": 5.412257802529296e-05, "loss": 1.3272, "step": 5500 }, { "epoch": 0.7, "learning_rate": 5.404281239122868e-05, "loss": 1.3226, "step": 6000 }, { "epoch": 0.75, "learning_rate": 5.39630467571644e-05, "loss": 1.2978, "step": 6500 }, { "epoch": 0.81, "learning_rate": 5.388328112310013e-05, "loss": 1.269, "step": 7000 }, { "epoch": 0.87, "learning_rate": 5.3803515489035856e-05, "loss": 1.2787, "step": 7500 }, { "epoch": 0.93, "learning_rate": 5.372374985497158e-05, "loss": 1.2629, "step": 8000 }, { "epoch": 0.93, "eval_bleu": 25.3741, "eval_gen_len": 18.7797, "eval_loss": 1.245756983757019, "eval_runtime": 1156.1784, "eval_samples_per_second": 14.909, "eval_steps_per_second": 0.932, "step": 8000 }, { "epoch": 0.99, "learning_rate": 5.36439842209073e-05, "loss": 1.2572, "step": 8500 }, { "epoch": 1.04, "learning_rate": 5.356421858684302e-05, "loss": 1.2035, "step": 9000 }, { "epoch": 1.1, "learning_rate": 5.348445295277875e-05, "loss": 1.1945, "step": 9500 }, { "epoch": 1.16, "learning_rate": 5.3404687318714474e-05, "loss": 1.179, "step": 10000 }, { "epoch": 1.22, "learning_rate": 5.332492168465019e-05, "loss": 1.2032, "step": 10500 }, { "epoch": 1.28, "learning_rate": 5.3245156050585915e-05, "loss": 1.192, "step": 11000 }, { "epoch": 1.33, "learning_rate": 5.316539041652164e-05, "loss": 1.1795, "step": 11500 }, { "epoch": 1.39, "learning_rate": 5.308562478245736e-05, "loss": 1.1951, "step": 12000 }, { "epoch": 1.39, "eval_bleu": 26.1281, "eval_gen_len": 18.6597, "eval_loss": 1.2066867351531982, "eval_runtime": 1158.2083, "eval_samples_per_second": 14.882, "eval_steps_per_second": 0.931, "step": 12000 }, { "epoch": 1.45, "learning_rate": 5.300585914839309e-05, "loss": 1.1704, "step": 12500 }, { "epoch": 1.51, "learning_rate": 5.292609351432881e-05, "loss": 1.1547, "step": 13000 }, { "epoch": 1.57, "learning_rate": 5.284632788026453e-05, "loss": 1.1419, "step": 13500 }, { "epoch": 1.62, "learning_rate": 5.276656224620026e-05, "loss": 1.1396, "step": 14000 }, { "epoch": 1.68, "learning_rate": 5.268679661213598e-05, "loss": 1.1549, "step": 14500 }, { "epoch": 1.74, "learning_rate": 5.2607030978071704e-05, "loss": 1.1255, "step": 15000 }, { "epoch": 1.8, "learning_rate": 5.252726534400743e-05, "loss": 1.1395, "step": 15500 }, { "epoch": 1.86, "learning_rate": 5.244749970994315e-05, "loss": 1.1317, "step": 16000 }, { "epoch": 1.86, "eval_bleu": 26.5384, "eval_gen_len": 19.2055, "eval_loss": 1.1767752170562744, "eval_runtime": 1178.3467, "eval_samples_per_second": 14.628, "eval_steps_per_second": 0.915, "step": 16000 }, { "epoch": 1.91, "learning_rate": 5.2367734075878875e-05, "loss": 1.1557, "step": 16500 }, { "epoch": 1.97, "learning_rate": 5.22879684418146e-05, "loss": 1.1478, "step": 17000 }, { "epoch": 2.03, "learning_rate": 5.2208202807750315e-05, "loss": 1.0771, "step": 17500 }, { "epoch": 2.09, "learning_rate": 5.2128437173686046e-05, "loss": 1.0135, "step": 18000 }, { "epoch": 2.15, "learning_rate": 5.204867153962177e-05, "loss": 1.0028, "step": 18500 }, { "epoch": 2.2, "learning_rate": 5.196890590555749e-05, "loss": 0.9894, "step": 19000 }, { "epoch": 2.26, "learning_rate": 5.1889140271493216e-05, "loss": 1.0178, "step": 19500 }, { "epoch": 2.32, "learning_rate": 5.180937463742893e-05, "loss": 0.9906, "step": 20000 }, { "epoch": 2.32, "eval_bleu": 28.2459, "eval_gen_len": 18.7269, "eval_loss": 1.1362603902816772, "eval_runtime": 1157.7765, "eval_samples_per_second": 14.888, "eval_steps_per_second": 0.931, "step": 20000 }, { "epoch": 2.38, "learning_rate": 5.1729609003364664e-05, "loss": 1.0083, "step": 20500 }, { "epoch": 2.44, "learning_rate": 5.164984336930039e-05, "loss": 0.9965, "step": 21000 }, { "epoch": 2.49, "learning_rate": 5.157007773523611e-05, "loss": 0.9992, "step": 21500 }, { "epoch": 2.55, "learning_rate": 5.149031210117183e-05, "loss": 1.0042, "step": 22000 }, { "epoch": 2.61, "learning_rate": 5.141054646710755e-05, "loss": 0.9998, "step": 22500 }, { "epoch": 2.67, "learning_rate": 5.1330780833043275e-05, "loss": 1.0076, "step": 23000 }, { "epoch": 2.73, "learning_rate": 5.1251015198979005e-05, "loss": 1.0047, "step": 23500 }, { "epoch": 2.78, "learning_rate": 5.117124956491473e-05, "loss": 0.9894, "step": 24000 }, { "epoch": 2.78, "eval_bleu": 28.5124, "eval_gen_len": 18.6882, "eval_loss": 1.1238752603530884, "eval_runtime": 1152.8912, "eval_samples_per_second": 14.951, "eval_steps_per_second": 0.935, "step": 24000 }, { "epoch": 2.84, "learning_rate": 5.1091483930850446e-05, "loss": 1.0115, "step": 24500 }, { "epoch": 2.9, "learning_rate": 5.101171829678617e-05, "loss": 0.9958, "step": 25000 }, { "epoch": 2.96, "learning_rate": 5.093195266272189e-05, "loss": 1.0044, "step": 25500 }, { "epoch": 3.02, "learning_rate": 5.0852187028657624e-05, "loss": 0.9668, "step": 26000 }, { "epoch": 3.07, "learning_rate": 5.077242139459334e-05, "loss": 0.8751, "step": 26500 }, { "epoch": 3.13, "learning_rate": 5.0692655760529064e-05, "loss": 0.8916, "step": 27000 }, { "epoch": 3.19, "learning_rate": 5.061289012646479e-05, "loss": 0.8861, "step": 27500 }, { "epoch": 3.25, "learning_rate": 5.053312449240051e-05, "loss": 0.8965, "step": 28000 }, { "epoch": 3.25, "eval_bleu": 28.5335, "eval_gen_len": 18.4917, "eval_loss": 1.1277672052383423, "eval_runtime": 1142.9887, "eval_samples_per_second": 15.081, "eval_steps_per_second": 0.943, "step": 28000 }, { "epoch": 3.31, "learning_rate": 5.045335885833624e-05, "loss": 0.8898, "step": 28500 }, { "epoch": 3.36, "learning_rate": 5.037359322427196e-05, "loss": 0.8982, "step": 29000 }, { "epoch": 3.42, "learning_rate": 5.029382759020768e-05, "loss": 0.896, "step": 29500 }, { "epoch": 3.48, "learning_rate": 5.0214061956143406e-05, "loss": 0.8889, "step": 30000 }, { "epoch": 3.54, "learning_rate": 5.013429632207913e-05, "loss": 0.9056, "step": 30500 }, { "epoch": 3.6, "learning_rate": 5.005453068801485e-05, "loss": 0.8867, "step": 31000 }, { "epoch": 3.65, "learning_rate": 4.997476505395058e-05, "loss": 0.911, "step": 31500 }, { "epoch": 3.71, "learning_rate": 4.98949994198863e-05, "loss": 0.9138, "step": 32000 }, { "epoch": 3.71, "eval_bleu": 28.8189, "eval_gen_len": 18.7873, "eval_loss": 1.1216284036636353, "eval_runtime": 1156.9444, "eval_samples_per_second": 14.899, "eval_steps_per_second": 0.932, "step": 32000 }, { "epoch": 3.77, "learning_rate": 4.9815233785822024e-05, "loss": 0.9023, "step": 32500 }, { "epoch": 3.83, "learning_rate": 4.973546815175775e-05, "loss": 0.9127, "step": 33000 }, { "epoch": 3.89, "learning_rate": 4.9655702517693464e-05, "loss": 0.9168, "step": 33500 }, { "epoch": 3.94, "learning_rate": 4.9575936883629195e-05, "loss": 0.8978, "step": 34000 }, { "epoch": 4.0, "learning_rate": 4.949617124956492e-05, "loss": 0.9229, "step": 34500 }, { "epoch": 4.06, "learning_rate": 4.941640561550064e-05, "loss": 0.828, "step": 35000 }, { "epoch": 4.12, "learning_rate": 4.9336639981436366e-05, "loss": 0.8326, "step": 35500 }, { "epoch": 4.18, "learning_rate": 4.925687434737208e-05, "loss": 0.8272, "step": 36000 }, { "epoch": 4.18, "eval_bleu": 28.332, "eval_gen_len": 18.6516, "eval_loss": 1.1468099355697632, "eval_runtime": 1152.7736, "eval_samples_per_second": 14.953, "eval_steps_per_second": 0.935, "step": 36000 }, { "epoch": 4.23, "learning_rate": 4.9177108713307806e-05, "loss": 0.8259, "step": 36500 }, { "epoch": 4.29, "learning_rate": 4.9097343079243536e-05, "loss": 0.8243, "step": 37000 }, { "epoch": 4.35, "learning_rate": 4.901757744517926e-05, "loss": 0.8664, "step": 37500 }, { "epoch": 4.41, "learning_rate": 4.893781181111498e-05, "loss": 0.8893, "step": 38000 }, { "epoch": 4.47, "learning_rate": 4.88580461770507e-05, "loss": 0.8958, "step": 38500 }, { "epoch": 4.52, "learning_rate": 4.8778280542986424e-05, "loss": 0.8924, "step": 39000 }, { "epoch": 4.58, "learning_rate": 4.8698514908922155e-05, "loss": 0.885, "step": 39500 }, { "epoch": 4.64, "learning_rate": 4.861874927485788e-05, "loss": 0.8753, "step": 40000 }, { "epoch": 4.64, "eval_bleu": 28.2695, "eval_gen_len": 18.4919, "eval_loss": 1.1344704627990723, "eval_runtime": 1148.1227, "eval_samples_per_second": 15.013, "eval_steps_per_second": 0.939, "step": 40000 }, { "epoch": 4.7, "learning_rate": 4.8538983640793595e-05, "loss": 0.8749, "step": 40500 }, { "epoch": 4.76, "learning_rate": 4.845921800672932e-05, "loss": 0.8621, "step": 41000 }, { "epoch": 4.81, "learning_rate": 4.837945237266504e-05, "loss": 0.8616, "step": 41500 }, { "epoch": 4.87, "learning_rate": 4.8299686738600766e-05, "loss": 0.8551, "step": 42000 }, { "epoch": 4.93, "learning_rate": 4.821992110453649e-05, "loss": 0.8504, "step": 42500 }, { "epoch": 4.99, "learning_rate": 4.814015547047221e-05, "loss": 0.8459, "step": 43000 }, { "epoch": 5.05, "learning_rate": 4.806038983640794e-05, "loss": 0.7255, "step": 43500 }, { "epoch": 5.11, "learning_rate": 4.798062420234366e-05, "loss": 0.6855, "step": 44000 }, { "epoch": 5.11, "eval_bleu": 28.7913, "eval_gen_len": 18.7596, "eval_loss": 1.154221773147583, "eval_runtime": 1168.8964, "eval_samples_per_second": 14.746, "eval_steps_per_second": 0.922, "step": 44000 }, { "epoch": 5.16, "learning_rate": 4.7900858568279384e-05, "loss": 0.7002, "step": 44500 }, { "epoch": 5.22, "learning_rate": 4.782109293421511e-05, "loss": 0.6982, "step": 45000 }, { "epoch": 5.28, "learning_rate": 4.774132730015083e-05, "loss": 0.6976, "step": 45500 }, { "epoch": 5.34, "learning_rate": 4.7661561666086555e-05, "loss": 0.7028, "step": 46000 }, { "epoch": 5.4, "learning_rate": 4.758179603202228e-05, "loss": 0.7138, "step": 46500 }, { "epoch": 5.45, "learning_rate": 4.7502030397958e-05, "loss": 0.7121, "step": 47000 }, { "epoch": 5.51, "learning_rate": 4.7422264763893726e-05, "loss": 0.7043, "step": 47500 }, { "epoch": 5.57, "learning_rate": 4.734249912982945e-05, "loss": 0.7088, "step": 48000 }, { "epoch": 5.57, "eval_bleu": 29.0865, "eval_gen_len": 18.6626, "eval_loss": 1.153067708015442, "eval_runtime": 1149.2413, "eval_samples_per_second": 14.999, "eval_steps_per_second": 0.938, "step": 48000 }, { "epoch": 5.63, "learning_rate": 4.726273349576517e-05, "loss": 0.7116, "step": 48500 }, { "epoch": 5.69, "learning_rate": 4.71829678617009e-05, "loss": 0.7292, "step": 49000 }, { "epoch": 5.74, "learning_rate": 4.710320222763662e-05, "loss": 0.7289, "step": 49500 }, { "epoch": 5.8, "learning_rate": 4.702343659357234e-05, "loss": 0.7514, "step": 50000 }, { "epoch": 5.86, "learning_rate": 4.694367095950807e-05, "loss": 0.7545, "step": 50500 }, { "epoch": 5.92, "learning_rate": 4.686390532544379e-05, "loss": 0.7362, "step": 51000 }, { "epoch": 5.98, "learning_rate": 4.6784139691379515e-05, "loss": 0.7413, "step": 51500 }, { "epoch": 6.03, "learning_rate": 4.670437405731523e-05, "loss": 0.6738, "step": 52000 }, { "epoch": 6.03, "eval_bleu": 28.0235, "eval_gen_len": 18.4243, "eval_loss": 1.190636396408081, "eval_runtime": 1139.2873, "eval_samples_per_second": 15.13, "eval_steps_per_second": 0.946, "step": 52000 }, { "epoch": 6.09, "learning_rate": 4.6624608423250955e-05, "loss": 0.6467, "step": 52500 }, { "epoch": 6.15, "learning_rate": 4.6544842789186686e-05, "loss": 0.6508, "step": 53000 }, { "epoch": 6.21, "learning_rate": 4.646507715512241e-05, "loss": 0.6742, "step": 53500 }, { "epoch": 6.27, "learning_rate": 4.638531152105813e-05, "loss": 0.6609, "step": 54000 }, { "epoch": 6.32, "learning_rate": 4.630554588699385e-05, "loss": 0.6464, "step": 54500 }, { "epoch": 6.38, "learning_rate": 4.622578025292957e-05, "loss": 0.6362, "step": 55000 }, { "epoch": 6.44, "learning_rate": 4.61460146188653e-05, "loss": 0.6552, "step": 55500 }, { "epoch": 6.5, "learning_rate": 4.606624898480103e-05, "loss": 0.6763, "step": 56000 }, { "epoch": 6.5, "eval_bleu": 28.1501, "eval_gen_len": 18.6932, "eval_loss": 1.1940782070159912, "eval_runtime": 1154.4139, "eval_samples_per_second": 14.931, "eval_steps_per_second": 0.934, "step": 56000 }, { "epoch": 6.56, "learning_rate": 4.5986483350736744e-05, "loss": 0.6775, "step": 56500 }, { "epoch": 6.61, "learning_rate": 4.590671771667247e-05, "loss": 0.6765, "step": 57000 }, { "epoch": 6.67, "learning_rate": 4.582695208260819e-05, "loss": 0.681, "step": 57500 }, { "epoch": 6.73, "learning_rate": 4.5747186448543915e-05, "loss": 0.6747, "step": 58000 }, { "epoch": 6.79, "learning_rate": 4.5667420814479645e-05, "loss": 0.6612, "step": 58500 }, { "epoch": 6.85, "learning_rate": 4.558765518041536e-05, "loss": 0.6662, "step": 59000 }, { "epoch": 6.9, "learning_rate": 4.5507889546351086e-05, "loss": 0.6653, "step": 59500 }, { "epoch": 6.96, "learning_rate": 4.542812391228681e-05, "loss": 0.6594, "step": 60000 }, { "epoch": 6.96, "eval_bleu": 29.1144, "eval_gen_len": 18.5459, "eval_loss": 1.1682050228118896, "eval_runtime": 1139.7821, "eval_samples_per_second": 15.123, "eval_steps_per_second": 0.946, "step": 60000 }, { "epoch": 7.02, "learning_rate": 4.534835827822253e-05, "loss": 0.6333, "step": 60500 }, { "epoch": 7.08, "learning_rate": 4.526859264415826e-05, "loss": 0.5491, "step": 61000 }, { "epoch": 7.14, "learning_rate": 4.518882701009398e-05, "loss": 0.5476, "step": 61500 }, { "epoch": 7.19, "learning_rate": 4.5109061376029704e-05, "loss": 0.5469, "step": 62000 }, { "epoch": 7.25, "learning_rate": 4.502929574196543e-05, "loss": 0.5708, "step": 62500 }, { "epoch": 7.31, "learning_rate": 4.494953010790115e-05, "loss": 0.5785, "step": 63000 }, { "epoch": 7.37, "learning_rate": 4.486976447383687e-05, "loss": 0.5803, "step": 63500 }, { "epoch": 7.43, "learning_rate": 4.47899988397726e-05, "loss": 0.5971, "step": 64000 }, { "epoch": 7.43, "eval_bleu": 27.9464, "eval_gen_len": 18.4482, "eval_loss": 1.2448896169662476, "eval_runtime": 1136.4568, "eval_samples_per_second": 15.167, "eval_steps_per_second": 0.949, "step": 64000 }, { "epoch": 7.48, "learning_rate": 4.471023320570832e-05, "loss": 0.5985, "step": 64500 }, { "epoch": 7.54, "learning_rate": 4.4630467571644046e-05, "loss": 0.6032, "step": 65000 }, { "epoch": 7.6, "learning_rate": 4.455070193757977e-05, "loss": 0.6079, "step": 65500 }, { "epoch": 7.66, "learning_rate": 4.4470936303515486e-05, "loss": 0.6104, "step": 66000 }, { "epoch": 7.72, "learning_rate": 4.439117066945122e-05, "loss": 0.6063, "step": 66500 }, { "epoch": 7.77, "learning_rate": 4.431140503538694e-05, "loss": 0.607, "step": 67000 }, { "epoch": 7.83, "learning_rate": 4.4231639401322664e-05, "loss": 0.5891, "step": 67500 }, { "epoch": 7.89, "learning_rate": 4.415187376725838e-05, "loss": 0.5935, "step": 68000 }, { "epoch": 7.89, "eval_bleu": 28.6034, "eval_gen_len": 18.5967, "eval_loss": 1.2156028747558594, "eval_runtime": 1146.9398, "eval_samples_per_second": 15.029, "eval_steps_per_second": 0.94, "step": 68000 }, { "epoch": 7.95, "learning_rate": 4.4072108133194104e-05, "loss": 0.5981, "step": 68500 }, { "epoch": 8.01, "learning_rate": 4.399234249912983e-05, "loss": 0.5881, "step": 69000 }, { "epoch": 8.06, "learning_rate": 4.391257686506556e-05, "loss": 0.5013, "step": 69500 }, { "epoch": 8.12, "learning_rate": 4.383281123100128e-05, "loss": 0.5109, "step": 70000 }, { "epoch": 8.18, "learning_rate": 4.3753045596937e-05, "loss": 0.5187, "step": 70500 }, { "epoch": 8.24, "learning_rate": 4.367327996287272e-05, "loss": 0.5198, "step": 71000 }, { "epoch": 8.3, "learning_rate": 4.3593514328808446e-05, "loss": 0.5175, "step": 71500 }, { "epoch": 8.35, "learning_rate": 4.3513748694744177e-05, "loss": 0.5383, "step": 72000 }, { "epoch": 8.35, "eval_bleu": 27.891, "eval_gen_len": 18.6539, "eval_loss": 1.2927179336547852, "eval_runtime": 1152.5208, "eval_samples_per_second": 14.956, "eval_steps_per_second": 0.935, "step": 72000 }, { "epoch": 8.41, "learning_rate": 4.343398306067989e-05, "loss": 0.5285, "step": 72500 }, { "epoch": 8.47, "learning_rate": 4.335421742661562e-05, "loss": 0.5346, "step": 73000 }, { "epoch": 8.53, "learning_rate": 4.327445179255134e-05, "loss": 0.5601, "step": 73500 }, { "epoch": 8.59, "learning_rate": 4.3194686158487064e-05, "loss": 0.5869, "step": 74000 }, { "epoch": 8.64, "learning_rate": 4.311492052442279e-05, "loss": 0.5873, "step": 74500 }, { "epoch": 8.7, "learning_rate": 4.303515489035851e-05, "loss": 0.5924, "step": 75000 }, { "epoch": 8.76, "learning_rate": 4.2955389256294235e-05, "loss": 0.601, "step": 75500 }, { "epoch": 8.82, "learning_rate": 4.287562362222996e-05, "loss": 0.6022, "step": 76000 }, { "epoch": 8.82, "eval_bleu": 27.7624, "eval_gen_len": 18.5558, "eval_loss": 1.283076286315918, "eval_runtime": 1144.2558, "eval_samples_per_second": 15.064, "eval_steps_per_second": 0.942, "step": 76000 }, { "epoch": 8.82, "step": 76000, "total_flos": 2.6353338197706015e+18, "train_loss": 0.8847039678473222, "train_runtime": 77092.0129, "train_samples_per_second": 71.551, "train_steps_per_second": 4.472 } ], "logging_steps": 500, "max_steps": 344760, "num_train_epochs": 40, "save_steps": 4000, "total_flos": 2.6353338197706015e+18, "trial_name": null, "trial_params": null }