{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 2000, "global_step": 60928, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "learning_rate": 4.967174369747899e-05, "loss": 3.2651, "step": 500 }, { "epoch": 0.07, "learning_rate": 4.934348739495799e-05, "loss": 1.7248, "step": 1000 }, { "epoch": 0.1, "learning_rate": 4.901523109243697e-05, "loss": 1.3764, "step": 1500 }, { "epoch": 0.13, "learning_rate": 4.868697478991597e-05, "loss": 1.1757, "step": 2000 }, { "epoch": 0.13, "eval_e": 0.581678956324447, "eval_f1": 0.5311118764265024, "eval_loss": 1.6264814138412476, "eval_runtime": 40.5361, "eval_samples_per_second": 86.984, "eval_steps_per_second": 0.691, "step": 2000 }, { "epoch": 0.16, "learning_rate": 4.835871848739496e-05, "loss": 1.0579, "step": 2500 }, { "epoch": 0.2, "learning_rate": 4.8030462184873956e-05, "loss": 0.9707, "step": 3000 }, { "epoch": 0.23, "learning_rate": 4.7702205882352946e-05, "loss": 0.9118, "step": 3500 }, { "epoch": 0.26, "learning_rate": 4.7373949579831936e-05, "loss": 0.8875, "step": 4000 }, { "epoch": 0.26, "eval_e": 0.6534316505955757, "eval_f1": 0.596454998267645, "eval_loss": 1.359203577041626, "eval_runtime": 41.4011, "eval_samples_per_second": 85.167, "eval_steps_per_second": 0.676, "step": 4000 }, { "epoch": 0.3, "learning_rate": 4.7045693277310926e-05, "loss": 0.8308, "step": 4500 }, { "epoch": 0.33, "learning_rate": 4.6717436974789916e-05, "loss": 0.8191, "step": 5000 }, { "epoch": 0.36, "learning_rate": 4.638918067226891e-05, "loss": 0.8039, "step": 5500 }, { "epoch": 0.39, "learning_rate": 4.6060924369747897e-05, "loss": 0.7682, "step": 6000 }, { "epoch": 0.39, "eval_e": 0.6781055019852524, "eval_f1": 0.6213789300225637, "eval_loss": 1.1949362754821777, "eval_runtime": 40.0845, "eval_samples_per_second": 87.964, "eval_steps_per_second": 0.699, "step": 6000 }, { "epoch": 0.43, "learning_rate": 4.5732668067226893e-05, "loss": 0.7669, "step": 6500 }, { "epoch": 0.46, "learning_rate": 4.5404411764705883e-05, "loss": 0.7246, "step": 7000 }, { "epoch": 0.49, "learning_rate": 4.507615546218488e-05, "loss": 0.723, "step": 7500 }, { "epoch": 0.53, "learning_rate": 4.474789915966387e-05, "loss": 0.7023, "step": 8000 }, { "epoch": 0.53, "eval_e": 0.7073170731707317, "eval_f1": 0.6500612956253415, "eval_loss": 1.1214724779129028, "eval_runtime": 36.2407, "eval_samples_per_second": 97.294, "eval_steps_per_second": 0.773, "step": 8000 }, { "epoch": 0.56, "learning_rate": 4.4419642857142854e-05, "loss": 0.6801, "step": 8500 }, { "epoch": 0.59, "learning_rate": 4.409138655462185e-05, "loss": 0.6614, "step": 9000 }, { "epoch": 0.62, "learning_rate": 4.376313025210084e-05, "loss": 0.6895, "step": 9500 }, { "epoch": 0.66, "learning_rate": 4.343487394957984e-05, "loss": 0.6633, "step": 10000 }, { "epoch": 0.66, "eval_e": 0.707884288145207, "eval_f1": 0.6365888205833479, "eval_loss": 1.1668164730072021, "eval_runtime": 36.2099, "eval_samples_per_second": 97.377, "eval_steps_per_second": 0.773, "step": 10000 }, { "epoch": 0.69, "learning_rate": 4.310661764705883e-05, "loss": 0.6346, "step": 10500 }, { "epoch": 0.72, "learning_rate": 4.277836134453782e-05, "loss": 0.6376, "step": 11000 }, { "epoch": 0.75, "learning_rate": 4.245010504201681e-05, "loss": 0.654, "step": 11500 }, { "epoch": 0.79, "learning_rate": 4.21218487394958e-05, "loss": 0.6169, "step": 12000 }, { "epoch": 0.79, "eval_e": 0.7271695972773681, "eval_f1": 0.6654857811733846, "eval_loss": 1.0702860355377197, "eval_runtime": 36.2662, "eval_samples_per_second": 97.226, "eval_steps_per_second": 0.772, "step": 12000 }, { "epoch": 0.82, "learning_rate": 4.1793592436974794e-05, "loss": 0.6161, "step": 12500 }, { "epoch": 0.85, "learning_rate": 4.1465336134453784e-05, "loss": 0.6157, "step": 13000 }, { "epoch": 0.89, "learning_rate": 4.1137079831932774e-05, "loss": 0.618, "step": 13500 }, { "epoch": 0.92, "learning_rate": 4.0808823529411765e-05, "loss": 0.604, "step": 14000 }, { "epoch": 0.92, "eval_e": 0.7399319342030629, "eval_f1": 0.6704723459995169, "eval_loss": 0.9919618964195251, "eval_runtime": 36.2385, "eval_samples_per_second": 97.3, "eval_steps_per_second": 0.773, "step": 14000 }, { "epoch": 0.95, "learning_rate": 4.048056722689076e-05, "loss": 0.6043, "step": 14500 }, { "epoch": 0.98, "learning_rate": 4.015231092436975e-05, "loss": 0.579, "step": 15000 }, { "epoch": 1.02, "learning_rate": 3.982405462184874e-05, "loss": 0.5503, "step": 15500 }, { "epoch": 1.05, "learning_rate": 3.949579831932773e-05, "loss": 0.5093, "step": 16000 }, { "epoch": 1.05, "eval_e": 0.7359614293817357, "eval_f1": 0.6659783311586052, "eval_loss": 0.9624159336090088, "eval_runtime": 36.2264, "eval_samples_per_second": 97.332, "eval_steps_per_second": 0.773, "step": 16000 }, { "epoch": 1.08, "learning_rate": 3.916754201680672e-05, "loss": 0.5139, "step": 16500 }, { "epoch": 1.12, "learning_rate": 3.883928571428572e-05, "loss": 0.5018, "step": 17000 }, { "epoch": 1.15, "learning_rate": 3.851102941176471e-05, "loss": 0.5009, "step": 17500 }, { "epoch": 1.18, "learning_rate": 3.81827731092437e-05, "loss": 0.5068, "step": 18000 }, { "epoch": 1.18, "eval_e": 0.7464549064095292, "eval_f1": 0.6771850837420853, "eval_loss": 1.0379488468170166, "eval_runtime": 36.2397, "eval_samples_per_second": 97.297, "eval_steps_per_second": 0.773, "step": 18000 }, { "epoch": 1.21, "learning_rate": 3.785451680672269e-05, "loss": 0.4928, "step": 18500 }, { "epoch": 1.25, "learning_rate": 3.7526260504201685e-05, "loss": 0.5026, "step": 19000 }, { "epoch": 1.28, "learning_rate": 3.7198004201680675e-05, "loss": 0.4938, "step": 19500 }, { "epoch": 1.31, "learning_rate": 3.6869747899159665e-05, "loss": 0.4812, "step": 20000 }, { "epoch": 1.31, "eval_e": 0.742484401588202, "eval_f1": 0.6734074837554972, "eval_loss": 1.0004695653915405, "eval_runtime": 36.2259, "eval_samples_per_second": 97.334, "eval_steps_per_second": 0.773, "step": 20000 }, { "epoch": 1.35, "learning_rate": 3.6541491596638656e-05, "loss": 0.4877, "step": 20500 }, { "epoch": 1.38, "learning_rate": 3.6213235294117646e-05, "loss": 0.4823, "step": 21000 }, { "epoch": 1.41, "learning_rate": 3.588497899159664e-05, "loss": 0.4708, "step": 21500 }, { "epoch": 1.44, "learning_rate": 3.555672268907563e-05, "loss": 0.4831, "step": 22000 }, { "epoch": 1.44, "eval_e": 0.7433352240499149, "eval_f1": 0.6745692090035141, "eval_loss": 1.0472208261489868, "eval_runtime": 36.2204, "eval_samples_per_second": 97.348, "eval_steps_per_second": 0.773, "step": 22000 }, { "epoch": 1.48, "learning_rate": 3.522846638655463e-05, "loss": 0.4716, "step": 22500 }, { "epoch": 1.51, "learning_rate": 3.490021008403361e-05, "loss": 0.4716, "step": 23000 }, { "epoch": 1.54, "learning_rate": 3.45719537815126e-05, "loss": 0.474, "step": 23500 }, { "epoch": 1.58, "learning_rate": 3.42436974789916e-05, "loss": 0.4748, "step": 24000 }, { "epoch": 1.58, "eval_e": 0.7552467385138968, "eval_f1": 0.6830863599993048, "eval_loss": 0.9445247650146484, "eval_runtime": 41.2911, "eval_samples_per_second": 85.394, "eval_steps_per_second": 0.678, "step": 24000 }, { "epoch": 1.61, "learning_rate": 3.391544117647059e-05, "loss": 0.4668, "step": 24500 }, { "epoch": 1.64, "learning_rate": 3.358718487394958e-05, "loss": 0.467, "step": 25000 }, { "epoch": 1.67, "learning_rate": 3.325892857142857e-05, "loss": 0.4517, "step": 25500 }, { "epoch": 1.71, "learning_rate": 3.2930672268907566e-05, "loss": 0.4581, "step": 26000 }, { "epoch": 1.71, "eval_e": 0.7552467385138968, "eval_f1": 0.6822537839548614, "eval_loss": 0.9966788291931152, "eval_runtime": 36.282, "eval_samples_per_second": 97.183, "eval_steps_per_second": 0.772, "step": 26000 }, { "epoch": 1.74, "learning_rate": 3.2602415966386556e-05, "loss": 0.458, "step": 26500 }, { "epoch": 1.77, "learning_rate": 3.2274159663865547e-05, "loss": 0.441, "step": 27000 }, { "epoch": 1.81, "learning_rate": 3.1945903361344537e-05, "loss": 0.4703, "step": 27500 }, { "epoch": 1.84, "learning_rate": 3.161764705882353e-05, "loss": 0.4438, "step": 28000 }, { "epoch": 1.84, "eval_e": 0.7487237663074305, "eval_f1": 0.6789318734426844, "eval_loss": 0.9515223503112793, "eval_runtime": 39.9941, "eval_samples_per_second": 88.163, "eval_steps_per_second": 0.7, "step": 28000 }, { "epoch": 1.87, "learning_rate": 3.1289390756302523e-05, "loss": 0.4556, "step": 28500 }, { "epoch": 1.9, "learning_rate": 3.0961134453781514e-05, "loss": 0.4453, "step": 29000 }, { "epoch": 1.94, "learning_rate": 3.063287815126051e-05, "loss": 0.4415, "step": 29500 }, { "epoch": 1.97, "learning_rate": 3.0304621848739494e-05, "loss": 0.4417, "step": 30000 }, { "epoch": 1.97, "eval_e": 0.7577992058990357, "eval_f1": 0.686996111301651, "eval_loss": 0.9782966375350952, "eval_runtime": 36.255, "eval_samples_per_second": 97.256, "eval_steps_per_second": 0.772, "step": 30000 }, { "epoch": 2.0, "learning_rate": 2.9976365546218487e-05, "loss": 0.4254, "step": 30500 }, { "epoch": 2.04, "learning_rate": 2.964810924369748e-05, "loss": 0.3574, "step": 31000 }, { "epoch": 2.07, "learning_rate": 2.9319852941176474e-05, "loss": 0.3596, "step": 31500 }, { "epoch": 2.1, "learning_rate": 2.8991596638655467e-05, "loss": 0.3692, "step": 32000 }, { "epoch": 2.1, "eval_e": 0.7524106636415201, "eval_f1": 0.6861686413926235, "eval_loss": 1.0621048212051392, "eval_runtime": 36.2047, "eval_samples_per_second": 97.391, "eval_steps_per_second": 0.773, "step": 32000 }, { "epoch": 2.13, "learning_rate": 2.8663340336134454e-05, "loss": 0.3673, "step": 32500 }, { "epoch": 2.17, "learning_rate": 2.8335084033613447e-05, "loss": 0.3683, "step": 33000 }, { "epoch": 2.2, "learning_rate": 2.8006827731092438e-05, "loss": 0.3538, "step": 33500 }, { "epoch": 2.23, "learning_rate": 2.767857142857143e-05, "loss": 0.3555, "step": 34000 }, { "epoch": 2.23, "eval_e": 0.7620533182076007, "eval_f1": 0.6877485506353153, "eval_loss": 1.040844202041626, "eval_runtime": 36.2597, "eval_samples_per_second": 97.243, "eval_steps_per_second": 0.772, "step": 34000 }, { "epoch": 2.26, "learning_rate": 2.7350315126050424e-05, "loss": 0.3718, "step": 34500 }, { "epoch": 2.3, "learning_rate": 2.702205882352941e-05, "loss": 0.3557, "step": 35000 }, { "epoch": 2.33, "learning_rate": 2.6693802521008405e-05, "loss": 0.3459, "step": 35500 }, { "epoch": 2.36, "learning_rate": 2.6365546218487398e-05, "loss": 0.3657, "step": 36000 }, { "epoch": 2.36, "eval_e": 0.7597844583096994, "eval_f1": 0.6853469432427786, "eval_loss": 1.1275439262390137, "eval_runtime": 40.6163, "eval_samples_per_second": 86.813, "eval_steps_per_second": 0.689, "step": 36000 }, { "epoch": 2.4, "learning_rate": 2.6037289915966388e-05, "loss": 0.3593, "step": 36500 }, { "epoch": 2.43, "learning_rate": 2.5709033613445378e-05, "loss": 0.3468, "step": 37000 }, { "epoch": 2.46, "learning_rate": 2.5380777310924368e-05, "loss": 0.3475, "step": 37500 }, { "epoch": 2.49, "learning_rate": 2.505252100840336e-05, "loss": 0.3504, "step": 38000 }, { "epoch": 2.49, "eval_e": 0.7603516732841747, "eval_f1": 0.6846985297135735, "eval_loss": 1.0078964233398438, "eval_runtime": 36.2227, "eval_samples_per_second": 97.342, "eval_steps_per_second": 0.773, "step": 38000 }, { "epoch": 2.53, "learning_rate": 2.4724264705882355e-05, "loss": 0.3574, "step": 38500 }, { "epoch": 2.56, "learning_rate": 2.4396008403361345e-05, "loss": 0.3543, "step": 39000 }, { "epoch": 2.59, "learning_rate": 2.406775210084034e-05, "loss": 0.3339, "step": 39500 }, { "epoch": 2.63, "learning_rate": 2.373949579831933e-05, "loss": 0.3573, "step": 40000 }, { "epoch": 2.63, "eval_e": 0.7589336358479863, "eval_f1": 0.683545228612745, "eval_loss": 1.0078063011169434, "eval_runtime": 36.2573, "eval_samples_per_second": 97.25, "eval_steps_per_second": 0.772, "step": 40000 }, { "epoch": 2.66, "learning_rate": 2.3411239495798322e-05, "loss": 0.3583, "step": 40500 }, { "epoch": 2.69, "learning_rate": 2.3082983193277312e-05, "loss": 0.346, "step": 41000 }, { "epoch": 2.72, "learning_rate": 2.2754726890756302e-05, "loss": 0.3468, "step": 41500 }, { "epoch": 2.76, "learning_rate": 2.2426470588235296e-05, "loss": 0.3409, "step": 42000 }, { "epoch": 2.76, "eval_e": 0.7552467385138968, "eval_f1": 0.6810380542275204, "eval_loss": 1.08004629611969, "eval_runtime": 36.2112, "eval_samples_per_second": 97.373, "eval_steps_per_second": 0.773, "step": 42000 }, { "epoch": 2.79, "learning_rate": 2.2098214285714286e-05, "loss": 0.3501, "step": 42500 }, { "epoch": 2.82, "learning_rate": 2.176995798319328e-05, "loss": 0.3361, "step": 43000 }, { "epoch": 2.86, "learning_rate": 2.144170168067227e-05, "loss": 0.3452, "step": 43500 }, { "epoch": 2.89, "learning_rate": 2.1113445378151263e-05, "loss": 0.3602, "step": 44000 }, { "epoch": 2.89, "eval_e": 0.7609188882586501, "eval_f1": 0.6832601552139036, "eval_loss": 1.031318187713623, "eval_runtime": 36.2238, "eval_samples_per_second": 97.339, "eval_steps_per_second": 0.773, "step": 44000 }, { "epoch": 2.92, "learning_rate": 2.0785189075630253e-05, "loss": 0.3473, "step": 44500 }, { "epoch": 2.95, "learning_rate": 2.0456932773109243e-05, "loss": 0.3309, "step": 45000 }, { "epoch": 2.99, "learning_rate": 2.0128676470588236e-05, "loss": 0.3342, "step": 45500 }, { "epoch": 3.02, "learning_rate": 1.9800420168067226e-05, "loss": 0.3, "step": 46000 }, { "epoch": 3.02, "eval_e": 0.7577992058990357, "eval_f1": 0.6814344985190464, "eval_loss": 1.0859274864196777, "eval_runtime": 36.2358, "eval_samples_per_second": 97.307, "eval_steps_per_second": 0.773, "step": 46000 }, { "epoch": 3.05, "learning_rate": 1.947216386554622e-05, "loss": 0.2727, "step": 46500 }, { "epoch": 3.09, "learning_rate": 1.9143907563025213e-05, "loss": 0.271, "step": 47000 }, { "epoch": 3.12, "learning_rate": 1.8815651260504203e-05, "loss": 0.2777, "step": 47500 }, { "epoch": 3.15, "learning_rate": 1.8487394957983196e-05, "loss": 0.2616, "step": 48000 }, { "epoch": 3.15, "eval_e": 0.7623369256948384, "eval_f1": 0.6829232660093741, "eval_loss": 1.1106504201889038, "eval_runtime": 36.1973, "eval_samples_per_second": 97.41, "eval_steps_per_second": 0.774, "step": 48000 }, { "epoch": 3.18, "learning_rate": 1.8159138655462187e-05, "loss": 0.2702, "step": 48500 }, { "epoch": 3.22, "learning_rate": 1.7830882352941177e-05, "loss": 0.2665, "step": 49000 }, { "epoch": 3.25, "learning_rate": 1.7502626050420167e-05, "loss": 0.2661, "step": 49500 }, { "epoch": 3.28, "learning_rate": 1.717436974789916e-05, "loss": 0.2728, "step": 50000 }, { "epoch": 3.28, "eval_e": 0.7555303460011344, "eval_f1": 0.680738586407374, "eval_loss": 1.158908724784851, "eval_runtime": 36.2449, "eval_samples_per_second": 97.283, "eval_steps_per_second": 0.773, "step": 50000 }, { "epoch": 3.32, "learning_rate": 1.6846113445378154e-05, "loss": 0.2753, "step": 50500 }, { "epoch": 3.35, "learning_rate": 1.6517857142857144e-05, "loss": 0.2717, "step": 51000 }, { "epoch": 3.38, "learning_rate": 1.6189600840336137e-05, "loss": 0.2723, "step": 51500 }, { "epoch": 3.41, "learning_rate": 1.5861344537815127e-05, "loss": 0.2738, "step": 52000 }, { "epoch": 3.41, "eval_e": 0.7648893930799773, "eval_f1": 0.6879751943104644, "eval_loss": 1.1301259994506836, "eval_runtime": 41.2984, "eval_samples_per_second": 85.379, "eval_steps_per_second": 0.678, "step": 52000 }, { "epoch": 3.45, "learning_rate": 1.5533088235294117e-05, "loss": 0.2692, "step": 52500 }, { "epoch": 3.48, "learning_rate": 1.5204831932773109e-05, "loss": 0.2714, "step": 53000 }, { "epoch": 3.51, "learning_rate": 1.48765756302521e-05, "loss": 0.2646, "step": 53500 }, { "epoch": 3.55, "learning_rate": 1.4548319327731094e-05, "loss": 0.2664, "step": 54000 }, { "epoch": 3.55, "eval_e": 0.7688598979013046, "eval_f1": 0.6891421496451101, "eval_loss": 1.0957111120224, "eval_runtime": 36.2366, "eval_samples_per_second": 97.305, "eval_steps_per_second": 0.773, "step": 54000 }, { "epoch": 3.58, "learning_rate": 1.4220063025210084e-05, "loss": 0.2683, "step": 54500 }, { "epoch": 3.61, "learning_rate": 1.3891806722689078e-05, "loss": 0.2698, "step": 55000 }, { "epoch": 3.64, "learning_rate": 1.3563550420168068e-05, "loss": 0.2709, "step": 55500 }, { "epoch": 3.68, "learning_rate": 1.323529411764706e-05, "loss": 0.2737, "step": 56000 }, { "epoch": 3.68, "eval_e": 0.7631877481565513, "eval_f1": 0.6905602249087808, "eval_loss": 1.0759004354476929, "eval_runtime": 36.2232, "eval_samples_per_second": 97.341, "eval_steps_per_second": 0.773, "step": 56000 }, { "epoch": 3.71, "learning_rate": 1.2907037815126053e-05, "loss": 0.2703, "step": 56500 }, { "epoch": 3.74, "learning_rate": 1.2578781512605043e-05, "loss": 0.2609, "step": 57000 }, { "epoch": 3.77, "learning_rate": 1.2250525210084033e-05, "loss": 0.2616, "step": 57500 }, { "epoch": 3.81, "learning_rate": 1.1922268907563026e-05, "loss": 0.2784, "step": 58000 }, { "epoch": 3.81, "eval_e": 0.7614861032331254, "eval_f1": 0.6854242844445689, "eval_loss": 1.0705878734588623, "eval_runtime": 36.2079, "eval_samples_per_second": 97.382, "eval_steps_per_second": 0.773, "step": 58000 }, { "epoch": 3.84, "learning_rate": 1.1594012605042018e-05, "loss": 0.2701, "step": 58500 }, { "epoch": 3.87, "learning_rate": 1.126575630252101e-05, "loss": 0.2647, "step": 59000 }, { "epoch": 3.91, "learning_rate": 1.09375e-05, "loss": 0.2608, "step": 59500 }, { "epoch": 3.94, "learning_rate": 1.0609243697478992e-05, "loss": 0.2622, "step": 60000 }, { "epoch": 3.94, "eval_e": 0.7612024957458877, "eval_f1": 0.6860082711191074, "eval_loss": 1.1619102954864502, "eval_runtime": 36.2375, "eval_samples_per_second": 97.303, "eval_steps_per_second": 0.773, "step": 60000 }, { "epoch": 3.97, "learning_rate": 1.0280987394957983e-05, "loss": 0.2547, "step": 60500 } ], "logging_steps": 500, "max_steps": 76160, "num_train_epochs": 5, "save_steps": 500, "total_flos": 1.9495781319278917e+18, "trial_name": null, "trial_params": null }