{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 700, "global_step": 22388, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025013400035733428, "grad_norm": 235.5828399658203, "learning_rate": 4.824012864034305e-07, "loss": 17.3783, "step": 140 }, { "epoch": 0.050026800071466856, "grad_norm": 26.746055603027344, "learning_rate": 9.82669287118099e-07, "loss": 10.3883, "step": 280 }, { "epoch": 0.07504020010720029, "grad_norm": 16.398082733154297, "learning_rate": 1.4829372878327677e-06, "loss": 6.239, "step": 420 }, { "epoch": 0.10005360014293371, "grad_norm": 14.383550643920898, "learning_rate": 1.983205288547436e-06, "loss": 5.6986, "step": 560 }, { "epoch": 0.12506700017866715, "grad_norm": 20.341243743896484, "learning_rate": 2.483473289262105e-06, "loss": 5.4609, "step": 700 }, { "epoch": 0.12506700017866715, "eval_nli-pairs_loss": 5.221348285675049, "eval_nli-pairs_runtime": 11.8701, "eval_nli-pairs_samples_per_second": 126.368, "eval_nli-pairs_steps_per_second": 5.307, "step": 700 }, { "epoch": 0.12506700017866715, "eval_scitail-pairs-pos_loss": 3.934469699859619, "eval_scitail-pairs-pos_runtime": 14.8075, "eval_scitail-pairs-pos_samples_per_second": 88.063, "eval_scitail-pairs-pos_steps_per_second": 3.714, "step": 700 }, { "epoch": 0.12506700017866715, "eval_qnli-contrastive_loss": 5.33072566986084, "eval_qnli-contrastive_runtime": 4.6845, "eval_qnli-contrastive_samples_per_second": 320.208, "eval_qnli-contrastive_steps_per_second": 13.449, "step": 700 }, { "epoch": 0.15008040021440058, "grad_norm": 16.652421951293945, "learning_rate": 2.983741289976774e-06, "loss": 5.2036, "step": 840 }, { "epoch": 0.175093800250134, "grad_norm": 19.31585121154785, "learning_rate": 3.484009290691442e-06, "loss": 4.9986, "step": 980 }, { "epoch": 0.20010720028586743, "grad_norm": 32.62712860107422, "learning_rate": 3.984277291406111e-06, "loss": 4.4918, "step": 1120 }, { "epoch": 0.22512060032160086, "grad_norm": 19.606719970703125, "learning_rate": 4.48454529212078e-06, "loss": 4.2202, "step": 1260 }, { "epoch": 0.2501340003573343, "grad_norm": 27.97759246826172, "learning_rate": 4.984813292835448e-06, "loss": 4.0922, "step": 1400 }, { "epoch": 0.2501340003573343, "eval_nli-pairs_loss": 4.116352081298828, "eval_nli-pairs_runtime": 11.8302, "eval_nli-pairs_samples_per_second": 126.795, "eval_nli-pairs_steps_per_second": 5.325, "step": 1400 }, { "epoch": 0.2501340003573343, "eval_scitail-pairs-pos_loss": 2.4735982418060303, "eval_scitail-pairs-pos_runtime": 14.767, "eval_scitail-pairs-pos_samples_per_second": 88.305, "eval_scitail-pairs-pos_steps_per_second": 3.725, "step": 1400 }, { "epoch": 0.2501340003573343, "eval_qnli-contrastive_loss": 4.378289222717285, "eval_qnli-contrastive_runtime": 4.6744, "eval_qnli-contrastive_samples_per_second": 320.895, "eval_qnli-contrastive_steps_per_second": 13.478, "step": 1400 }, { "epoch": 0.27514740039306773, "grad_norm": 21.981849670410156, "learning_rate": 5.4850812935501166e-06, "loss": 3.3935, "step": 1540 }, { "epoch": 0.30016080042880117, "grad_norm": 26.165401458740234, "learning_rate": 5.9853492942647854e-06, "loss": 3.5528, "step": 1680 }, { "epoch": 0.32517420046453455, "grad_norm": 18.916423797607422, "learning_rate": 6.4856172949794535e-06, "loss": 3.6013, "step": 1820 }, { "epoch": 0.350187600500268, "grad_norm": 14.634994506835938, "learning_rate": 6.985885295694122e-06, "loss": 3.275, "step": 1960 }, { "epoch": 0.3752010005360014, "grad_norm": 23.64468002319336, "learning_rate": 7.4825799535465435e-06, "loss": 3.2832, "step": 2100 }, { "epoch": 0.3752010005360014, "eval_nli-pairs_loss": 3.1333088874816895, "eval_nli-pairs_runtime": 11.8718, "eval_nli-pairs_samples_per_second": 126.35, "eval_nli-pairs_steps_per_second": 5.307, "step": 2100 }, { "epoch": 0.3752010005360014, "eval_scitail-pairs-pos_loss": 1.8785525560379028, "eval_scitail-pairs-pos_runtime": 14.7587, "eval_scitail-pairs-pos_samples_per_second": 88.355, "eval_scitail-pairs-pos_steps_per_second": 3.727, "step": 2100 }, { "epoch": 0.3752010005360014, "eval_qnli-contrastive_loss": 3.401968479156494, "eval_qnli-contrastive_runtime": 4.7313, "eval_qnli-contrastive_samples_per_second": 317.034, "eval_qnli-contrastive_steps_per_second": 13.315, "step": 2100 }, { "epoch": 0.40021440057173485, "grad_norm": 19.62537384033203, "learning_rate": 7.982847954261212e-06, "loss": 3.0276, "step": 2240 }, { "epoch": 0.4252278006074683, "grad_norm": 13.65882682800293, "learning_rate": 8.48311595497588e-06, "loss": 2.8936, "step": 2380 }, { "epoch": 0.4502412006432017, "grad_norm": 18.643104553222656, "learning_rate": 8.983383955690548e-06, "loss": 2.7427, "step": 2520 }, { "epoch": 0.47525460067893516, "grad_norm": 22.165441513061523, "learning_rate": 9.483651956405217e-06, "loss": 3.0206, "step": 2660 }, { "epoch": 0.5002680007146686, "grad_norm": 7.004354953765869, "learning_rate": 9.983919957119886e-06, "loss": 2.575, "step": 2800 }, { "epoch": 0.5002680007146686, "eval_nli-pairs_loss": 2.6767749786376953, "eval_nli-pairs_runtime": 11.9879, "eval_nli-pairs_samples_per_second": 125.127, "eval_nli-pairs_steps_per_second": 5.255, "step": 2800 }, { "epoch": 0.5002680007146686, "eval_scitail-pairs-pos_loss": 1.5111600160598755, "eval_scitail-pairs-pos_runtime": 14.7412, "eval_scitail-pairs-pos_samples_per_second": 88.46, "eval_scitail-pairs-pos_steps_per_second": 3.731, "step": 2800 }, { "epoch": 0.5002680007146686, "eval_qnli-contrastive_loss": 2.8863816261291504, "eval_qnli-contrastive_runtime": 4.6516, "eval_qnli-contrastive_samples_per_second": 322.468, "eval_qnli-contrastive_steps_per_second": 13.544, "step": 2800 }, { "epoch": 0.525281400750402, "grad_norm": 35.325260162353516, "learning_rate": 1.0484187957834555e-05, "loss": 2.5411, "step": 2940 }, { "epoch": 0.5502948007861355, "grad_norm": 18.314611434936523, "learning_rate": 1.0984455958549222e-05, "loss": 2.5173, "step": 3080 }, { "epoch": 0.5753082008218688, "grad_norm": 19.870922088623047, "learning_rate": 1.1484723959263893e-05, "loss": 2.5333, "step": 3220 }, { "epoch": 0.6003216008576023, "grad_norm": 15.03135871887207, "learning_rate": 1.198499195997856e-05, "loss": 2.6302, "step": 3360 }, { "epoch": 0.6253350008933357, "grad_norm": 22.288837432861328, "learning_rate": 1.248525996069323e-05, "loss": 2.5134, "step": 3500 }, { "epoch": 0.6253350008933357, "eval_nli-pairs_loss": 2.3164055347442627, "eval_nli-pairs_runtime": 11.9409, "eval_nli-pairs_samples_per_second": 125.619, "eval_nli-pairs_steps_per_second": 5.276, "step": 3500 }, { "epoch": 0.6253350008933357, "eval_scitail-pairs-pos_loss": 1.329464316368103, "eval_scitail-pairs-pos_runtime": 14.9802, "eval_scitail-pairs-pos_samples_per_second": 87.048, "eval_scitail-pairs-pos_steps_per_second": 3.672, "step": 3500 }, { "epoch": 0.6253350008933357, "eval_qnli-contrastive_loss": 2.432037830352783, "eval_qnli-contrastive_runtime": 4.7247, "eval_qnli-contrastive_samples_per_second": 317.478, "eval_qnli-contrastive_steps_per_second": 13.334, "step": 3500 }, { "epoch": 0.6503484009290691, "grad_norm": 22.80077362060547, "learning_rate": 1.2985527961407898e-05, "loss": 2.4442, "step": 3640 }, { "epoch": 0.6753618009648026, "grad_norm": 8.082416534423828, "learning_rate": 1.3485795962122568e-05, "loss": 2.2516, "step": 3780 }, { "epoch": 0.700375201000536, "grad_norm": 19.319093704223633, "learning_rate": 1.3986063962837235e-05, "loss": 2.3152, "step": 3920 }, { "epoch": 0.7253886010362695, "grad_norm": 22.325191497802734, "learning_rate": 1.4486331963551904e-05, "loss": 2.0466, "step": 4060 }, { "epoch": 0.7504020010720028, "grad_norm": 23.437545776367188, "learning_rate": 1.4983026621404324e-05, "loss": 2.2551, "step": 4200 }, { "epoch": 0.7504020010720028, "eval_nli-pairs_loss": 2.0880069732666016, "eval_nli-pairs_runtime": 12.1083, "eval_nli-pairs_samples_per_second": 123.882, "eval_nli-pairs_steps_per_second": 5.203, "step": 4200 }, { "epoch": 0.7504020010720028, "eval_scitail-pairs-pos_loss": 1.1470309495925903, "eval_scitail-pairs-pos_runtime": 14.8408, "eval_scitail-pairs-pos_samples_per_second": 87.866, "eval_scitail-pairs-pos_steps_per_second": 3.706, "step": 4200 }, { "epoch": 0.7504020010720028, "eval_qnli-contrastive_loss": 2.411231279373169, "eval_qnli-contrastive_runtime": 4.7069, "eval_qnli-contrastive_samples_per_second": 318.678, "eval_qnli-contrastive_steps_per_second": 13.384, "step": 4200 }, { "epoch": 0.7754154011077363, "grad_norm": 15.4782133102417, "learning_rate": 1.5483294622118993e-05, "loss": 2.1891, "step": 4340 }, { "epoch": 0.8004288011434697, "grad_norm": 2.4403023719787598, "learning_rate": 1.598356262283366e-05, "loss": 2.0993, "step": 4480 }, { "epoch": 0.8254422011792032, "grad_norm": 16.028566360473633, "learning_rate": 1.648383062354833e-05, "loss": 1.7904, "step": 4620 }, { "epoch": 0.8504556012149366, "grad_norm": 11.724300384521484, "learning_rate": 1.6984098624262998e-05, "loss": 2.0401, "step": 4760 }, { "epoch": 0.87546900125067, "grad_norm": 5.384641647338867, "learning_rate": 1.7484366624977668e-05, "loss": 1.8895, "step": 4900 }, { "epoch": 0.87546900125067, "eval_nli-pairs_loss": 1.8439208269119263, "eval_nli-pairs_runtime": 11.972, "eval_nli-pairs_samples_per_second": 125.292, "eval_nli-pairs_steps_per_second": 5.262, "step": 4900 }, { "epoch": 0.87546900125067, "eval_scitail-pairs-pos_loss": 0.9787265658378601, "eval_scitail-pairs-pos_runtime": 14.8626, "eval_scitail-pairs-pos_samples_per_second": 87.737, "eval_scitail-pairs-pos_steps_per_second": 3.701, "step": 4900 }, { "epoch": 0.87546900125067, "eval_qnli-contrastive_loss": 1.91374933719635, "eval_qnli-contrastive_runtime": 4.7148, "eval_qnli-contrastive_samples_per_second": 318.144, "eval_qnli-contrastive_steps_per_second": 13.362, "step": 4900 }, { "epoch": 0.9004824012864034, "grad_norm": 14.152888298034668, "learning_rate": 1.7984634625692335e-05, "loss": 2.0058, "step": 5040 }, { "epoch": 0.9254958013221368, "grad_norm": 10.892020225524902, "learning_rate": 1.8484902626407003e-05, "loss": 2.0126, "step": 5180 }, { "epoch": 0.9505092013578703, "grad_norm": 1.89139986038208, "learning_rate": 1.8985170627121673e-05, "loss": 2.067, "step": 5320 }, { "epoch": 0.9755226013936037, "grad_norm": 13.1935396194458, "learning_rate": 1.948543862783634e-05, "loss": 2.1337, "step": 5460 }, { "epoch": 1.0005360014293372, "grad_norm": 14.684636116027832, "learning_rate": 1.998570662855101e-05, "loss": 1.8408, "step": 5600 }, { "epoch": 1.0005360014293372, "eval_nli-pairs_loss": 1.7102497816085815, "eval_nli-pairs_runtime": 13.5641, "eval_nli-pairs_samples_per_second": 110.586, "eval_nli-pairs_steps_per_second": 4.645, "step": 5600 }, { "epoch": 1.0005360014293372, "eval_scitail-pairs-pos_loss": 0.920977771282196, "eval_scitail-pairs-pos_runtime": 15.9776, "eval_scitail-pairs-pos_samples_per_second": 81.614, "eval_scitail-pairs-pos_steps_per_second": 3.442, "step": 5600 }, { "epoch": 1.0005360014293372, "eval_qnli-contrastive_loss": 1.8655247688293457, "eval_qnli-contrastive_runtime": 4.9388, "eval_qnli-contrastive_samples_per_second": 303.717, "eval_qnli-contrastive_steps_per_second": 12.756, "step": 5600 }, { "epoch": 1.0255494014650706, "grad_norm": 28.780393600463867, "learning_rate": 1.995450879640972e-05, "loss": 1.7539, "step": 5740 }, { "epoch": 1.050562801500804, "grad_norm": 1.414655089378357, "learning_rate": 1.9813086898560906e-05, "loss": 1.9194, "step": 5880 }, { "epoch": 1.0755762015365375, "grad_norm": 1.5645997524261475, "learning_rate": 1.957705835379269e-05, "loss": 1.7071, "step": 6020 }, { "epoch": 1.100589601572271, "grad_norm": 4.314450263977051, "learning_rate": 1.9248698681462904e-05, "loss": 1.7053, "step": 6160 }, { "epoch": 1.1256030016080043, "grad_norm": 19.960533142089844, "learning_rate": 1.8831173552873946e-05, "loss": 1.8023, "step": 6300 }, { "epoch": 1.1256030016080043, "eval_nli-pairs_loss": 1.5202544927597046, "eval_nli-pairs_runtime": 12.0076, "eval_nli-pairs_samples_per_second": 124.92, "eval_nli-pairs_steps_per_second": 5.247, "step": 6300 }, { "epoch": 1.1256030016080043, "eval_scitail-pairs-pos_loss": 0.7953532934188843, "eval_scitail-pairs-pos_runtime": 14.8864, "eval_scitail-pairs-pos_samples_per_second": 87.597, "eval_scitail-pairs-pos_steps_per_second": 3.695, "step": 6300 }, { "epoch": 1.1256030016080043, "eval_qnli-contrastive_loss": 1.5952976942062378, "eval_qnli-contrastive_runtime": 4.7602, "eval_qnli-contrastive_samples_per_second": 315.111, "eval_qnli-contrastive_steps_per_second": 13.235, "step": 6300 }, { "epoch": 1.1506164016437377, "grad_norm": 29.618913650512695, "learning_rate": 1.8328508271462822e-05, "loss": 1.7448, "step": 6440 }, { "epoch": 1.175629801679471, "grad_norm": 23.058677673339844, "learning_rate": 1.7745548965393756e-05, "loss": 1.7874, "step": 6580 }, { "epoch": 1.2006432017152047, "grad_norm": 13.168740272521973, "learning_rate": 1.7087915866690346e-05, "loss": 1.7441, "step": 6720 }, { "epoch": 1.225656601750938, "grad_norm": 27.45108985900879, "learning_rate": 1.6361949127336846e-05, "loss": 1.4535, "step": 6860 }, { "epoch": 1.2506700017866714, "grad_norm": 17.073558807373047, "learning_rate": 1.557464769472821e-05, "loss": 1.7103, "step": 7000 }, { "epoch": 1.2506700017866714, "eval_nli-pairs_loss": 1.4381694793701172, "eval_nli-pairs_runtime": 11.9075, "eval_nli-pairs_samples_per_second": 125.971, "eval_nli-pairs_steps_per_second": 5.291, "step": 7000 }, { "epoch": 1.2506700017866714, "eval_scitail-pairs-pos_loss": 0.7307367920875549, "eval_scitail-pairs-pos_runtime": 14.9432, "eval_scitail-pairs-pos_samples_per_second": 87.264, "eval_scitail-pairs-pos_steps_per_second": 3.681, "step": 7000 }, { "epoch": 1.2506700017866714, "eval_qnli-contrastive_loss": 1.756566047668457, "eval_qnli-contrastive_runtime": 4.6972, "eval_qnli-contrastive_samples_per_second": 319.341, "eval_qnli-contrastive_steps_per_second": 13.412, "step": 7000 }, { "epoch": 1.2756834018224048, "grad_norm": 27.894006729125977, "learning_rate": 1.4733601835762515e-05, "loss": 1.4028, "step": 7140 }, { "epoch": 1.3006968018581384, "grad_norm": 12.896081924438477, "learning_rate": 1.3846919960101735e-05, "loss": 1.5999, "step": 7280 }, { "epoch": 1.3257102018938718, "grad_norm": 13.122183799743652, "learning_rate": 1.2923150448088129e-05, "loss": 1.7167, "step": 7420 }, { "epoch": 1.3507236019296052, "grad_norm": 19.48947525024414, "learning_rate": 1.1971199236962578e-05, "loss": 1.4903, "step": 7560 }, { "epoch": 1.3757370019653385, "grad_norm": 4.030813694000244, "learning_rate": 1.1000243959925168e-05, "loss": 1.5666, "step": 7700 }, { "epoch": 1.3757370019653385, "eval_nli-pairs_loss": 1.353498935699463, "eval_nli-pairs_runtime": 12.144, "eval_nli-pairs_samples_per_second": 123.518, "eval_nli-pairs_steps_per_second": 5.188, "step": 7700 }, { "epoch": 1.3757370019653385, "eval_scitail-pairs-pos_loss": 0.6812191605567932, "eval_scitail-pairs-pos_runtime": 15.227, "eval_scitail-pairs-pos_samples_per_second": 85.637, "eval_scitail-pairs-pos_steps_per_second": 3.612, "step": 7700 }, { "epoch": 1.3757370019653385, "eval_qnli-contrastive_loss": 1.5941846370697021, "eval_qnli-contrastive_runtime": 4.7731, "eval_qnli-contrastive_samples_per_second": 314.263, "eval_qnli-contrastive_steps_per_second": 13.199, "step": 7700 }, { "epoch": 1.400750402001072, "grad_norm": 1.3353348970413208, "learning_rate": 1.0019645465811612e-05, "loss": 1.4289, "step": 7840 }, { "epoch": 1.4257638020368053, "grad_norm": 59.30118942260742, "learning_rate": 9.038857572412504e-06, "loss": 1.4531, "step": 7980 }, { "epoch": 1.450777202072539, "grad_norm": 14.574576377868652, "learning_rate": 8.067335923491407e-06, "loss": 1.2831, "step": 8120 }, { "epoch": 1.4757906021082723, "grad_norm": 14.356673240661621, "learning_rate": 7.114446828198969e-06, "loss": 1.5913, "step": 8260 }, { "epoch": 1.5008040021440057, "grad_norm": 11.328471183776855, "learning_rate": 6.189376961749974e-06, "loss": 1.2771, "step": 8400 }, { "epoch": 1.5008040021440057, "eval_nli-pairs_loss": 1.2932122945785522, "eval_nli-pairs_runtime": 11.9045, "eval_nli-pairs_samples_per_second": 126.003, "eval_nli-pairs_steps_per_second": 5.292, "step": 8400 }, { "epoch": 1.5008040021440057, "eval_scitail-pairs-pos_loss": 0.6547886729240417, "eval_scitail-pairs-pos_runtime": 14.9249, "eval_scitail-pairs-pos_samples_per_second": 87.371, "eval_scitail-pairs-pos_steps_per_second": 3.685, "step": 8400 }, { "epoch": 1.5008040021440057, "eval_qnli-contrastive_loss": 1.6052364110946655, "eval_qnli-contrastive_runtime": 4.7149, "eval_qnli-contrastive_samples_per_second": 318.137, "eval_qnli-contrastive_steps_per_second": 13.362, "step": 8400 }, { "epoch": 1.5258174021797393, "grad_norm": 12.197046279907227, "learning_rate": 5.301044797927004e-06, "loss": 1.2718, "step": 8540 }, { "epoch": 1.5508308022154726, "grad_norm": 16.694225311279297, "learning_rate": 4.4638562023418675e-06, "loss": 1.374, "step": 8680 }, { "epoch": 1.575844202251206, "grad_norm": 9.635122299194336, "learning_rate": 3.673846274432512e-06, "loss": 1.4011, "step": 8820 }, { "epoch": 1.6008576022869394, "grad_norm": 13.676748275756836, "learning_rate": 2.9448259411389292e-06, "loss": 1.571, "step": 8960 }, { "epoch": 1.6258710023226728, "grad_norm": 3.530120849609375, "learning_rate": 2.283823589143308e-06, "loss": 1.54, "step": 9100 }, { "epoch": 1.6258710023226728, "eval_nli-pairs_loss": 1.2734112739562988, "eval_nli-pairs_runtime": 11.8925, "eval_nli-pairs_samples_per_second": 126.13, "eval_nli-pairs_steps_per_second": 5.297, "step": 9100 }, { "epoch": 1.6258710023226728, "eval_scitail-pairs-pos_loss": 0.638645350933075, "eval_scitail-pairs-pos_runtime": 14.9358, "eval_scitail-pairs-pos_samples_per_second": 87.307, "eval_scitail-pairs-pos_steps_per_second": 3.682, "step": 9100 }, { "epoch": 1.6258710023226728, "eval_qnli-contrastive_loss": 1.5299873352050781, "eval_qnli-contrastive_runtime": 4.6925, "eval_qnli-contrastive_samples_per_second": 319.657, "eval_qnli-contrastive_steps_per_second": 13.426, "step": 9100 }, { "epoch": 1.6508844023584062, "grad_norm": 0.7482818961143494, "learning_rate": 1.6972118529463478e-06, "loss": 1.3765, "step": 9240 }, { "epoch": 1.6758978023941398, "grad_norm": 10.352273941040039, "learning_rate": 1.1906461771577705e-06, "loss": 1.3131, "step": 9380 }, { "epoch": 1.7009112024298731, "grad_norm": 8.817509651184082, "learning_rate": 7.690102931177922e-07, "loss": 1.3459, "step": 9520 }, { "epoch": 1.7259246024656065, "grad_norm": 11.525378227233887, "learning_rate": 4.36369135502146e-07, "loss": 1.1947, "step": 9660 }, { "epoch": 1.7509380025013401, "grad_norm": 53.216773986816406, "learning_rate": 1.9592965283525944e-07, "loss": 1.4422, "step": 9800 }, { "epoch": 1.7509380025013401, "eval_nli-pairs_loss": 1.2626315355300903, "eval_nli-pairs_runtime": 12.0334, "eval_nli-pairs_samples_per_second": 124.653, "eval_nli-pairs_steps_per_second": 5.235, "step": 9800 }, { "epoch": 1.7509380025013401, "eval_scitail-pairs-pos_loss": 0.6336750388145447, "eval_scitail-pairs-pos_runtime": 14.9549, "eval_scitail-pairs-pos_samples_per_second": 87.195, "eval_scitail-pairs-pos_steps_per_second": 3.678, "step": 9800 }, { "epoch": 1.7509380025013401, "eval_qnli-contrastive_loss": 1.5141370296478271, "eval_qnli-contrastive_runtime": 4.7047, "eval_qnli-contrastive_samples_per_second": 318.832, "eval_qnli-contrastive_steps_per_second": 13.391, "step": 9800 }, { "epoch": 1.7759514025370735, "grad_norm": 10.688042640686035, "learning_rate": 5.000988973217102e-08, "loss": 1.3539, "step": 9940 }, { "epoch": 1.8009648025728069, "grad_norm": 5.067196846008301, "learning_rate": 1.6638943132196717e-11, "loss": 1.3033, "step": 10080 }, { "epoch": 1.8259782026085403, "grad_norm": 14.930255889892578, "learning_rate": 1.995356812134552e-05, "loss": 1.1106, "step": 10220 }, { "epoch": 1.8509916026442736, "grad_norm": 9.03366470336914, "learning_rate": 1.981119187419803e-05, "loss": 1.3733, "step": 10360 }, { "epoch": 1.876005002680007, "grad_norm": 11.640127182006836, "learning_rate": 1.957422724980536e-05, "loss": 1.2665, "step": 10500 }, { "epoch": 1.876005002680007, "eval_nli-pairs_loss": 1.2357267141342163, "eval_nli-pairs_runtime": 11.9459, "eval_nli-pairs_samples_per_second": 125.566, "eval_nli-pairs_steps_per_second": 5.274, "step": 10500 }, { "epoch": 1.876005002680007, "eval_scitail-pairs-pos_loss": 0.6095670461654663, "eval_scitail-pairs-pos_runtime": 14.9199, "eval_scitail-pairs-pos_samples_per_second": 87.4, "eval_scitail-pairs-pos_steps_per_second": 3.686, "step": 10500 }, { "epoch": 1.876005002680007, "eval_qnli-contrastive_loss": 1.4496928453445435, "eval_qnli-contrastive_runtime": 4.7387, "eval_qnli-contrastive_samples_per_second": 316.542, "eval_qnli-contrastive_steps_per_second": 13.295, "step": 10500 }, { "epoch": 1.9010184027157406, "grad_norm": 9.566073417663574, "learning_rate": 1.9247631052082938e-05, "loss": 1.351, "step": 10640 }, { "epoch": 1.926031802751474, "grad_norm": 4.9808878898620605, "learning_rate": 1.8829856531045453e-05, "loss": 1.3464, "step": 10780 }, { "epoch": 1.9510452027872074, "grad_norm": 12.794534683227539, "learning_rate": 1.833083761701665e-05, "loss": 1.1087, "step": 10920 }, { "epoch": 1.976058602822941, "grad_norm": 0.7860898375511169, "learning_rate": 1.774821097109875e-05, "loss": 1.1635, "step": 11060 }, { "epoch": 2.0010720028586744, "grad_norm": 40.53791427612305, "learning_rate": 1.7090884868508633e-05, "loss": 1.1354, "step": 11200 }, { "epoch": 2.0010720028586744, "eval_nli-pairs_loss": 1.1946154832839966, "eval_nli-pairs_runtime": 13.3184, "eval_nli-pairs_samples_per_second": 112.626, "eval_nli-pairs_steps_per_second": 4.73, "step": 11200 }, { "epoch": 2.0010720028586744, "eval_scitail-pairs-pos_loss": 0.6180398464202881, "eval_scitail-pairs-pos_runtime": 15.804, "eval_scitail-pairs-pos_samples_per_second": 82.511, "eval_scitail-pairs-pos_steps_per_second": 3.48, "step": 11200 }, { "epoch": 2.0010720028586744, "eval_qnli-contrastive_loss": 1.4097257852554321, "eval_qnli-contrastive_runtime": 4.9229, "eval_qnli-contrastive_samples_per_second": 304.698, "eval_qnli-contrastive_steps_per_second": 12.797, "step": 11200 }, { "epoch": 2.0260854028944077, "grad_norm": 6.641210556030273, "learning_rate": 1.636519650152244e-05, "loss": 1.1174, "step": 11340 }, { "epoch": 2.051098802930141, "grad_norm": 14.658684730529785, "learning_rate": 1.5578142133784694e-05, "loss": 1.3446, "step": 11480 }, { "epoch": 2.0761122029658745, "grad_norm": 8.882003784179688, "learning_rate": 1.4737309650274368e-05, "loss": 1.1409, "step": 11620 }, { "epoch": 2.101125603001608, "grad_norm": 8.679718971252441, "learning_rate": 1.3850805403529464e-05, "loss": 1.1902, "step": 11760 }, { "epoch": 2.1261390030373413, "grad_norm": 7.319563865661621, "learning_rate": 1.2927176061395823e-05, "loss": 1.2951, "step": 11900 }, { "epoch": 2.1261390030373413, "eval_nli-pairs_loss": 1.140305519104004, "eval_nli-pairs_runtime": 11.9166, "eval_nli-pairs_samples_per_second": 125.875, "eval_nli-pairs_steps_per_second": 5.287, "step": 11900 }, { "epoch": 2.1261390030373413, "eval_scitail-pairs-pos_loss": 0.5850992798805237, "eval_scitail-pairs-pos_runtime": 15.0095, "eval_scitail-pairs-pos_samples_per_second": 86.878, "eval_scitail-pairs-pos_steps_per_second": 3.664, "step": 11900 }, { "epoch": 2.1261390030373413, "eval_qnli-contrastive_loss": 1.2628742456436157, "eval_qnli-contrastive_runtime": 4.7346, "eval_qnli-contrastive_samples_per_second": 316.816, "eval_qnli-contrastive_steps_per_second": 13.306, "step": 11900 }, { "epoch": 2.151152403073075, "grad_norm": 0.9307207465171814, "learning_rate": 1.1975326209755442e-05, "loss": 1.2077, "step": 12040 }, { "epoch": 2.1761658031088085, "grad_norm": 6.99313497543335, "learning_rate": 1.1004432504615264e-05, "loss": 1.2858, "step": 12180 }, { "epoch": 2.201179203144542, "grad_norm": 0.7553257942199707, "learning_rate": 1.002385520120464e-05, "loss": 1.267, "step": 12320 }, { "epoch": 2.2261926031802752, "grad_norm": 11.953374862670898, "learning_rate": 9.04304791301748e-06, "loss": 1.0026, "step": 12460 }, { "epoch": 2.2512060032160086, "grad_norm": 0.7482788562774658, "learning_rate": 8.071466470799878e-06, "loss": 1.2523, "step": 12600 }, { "epoch": 2.2512060032160086, "eval_nli-pairs_loss": 1.1073120832443237, "eval_nli-pairs_runtime": 11.9071, "eval_nli-pairs_samples_per_second": 125.975, "eval_nli-pairs_steps_per_second": 5.291, "step": 12600 }, { "epoch": 2.2512060032160086, "eval_scitail-pairs-pos_loss": 0.553435206413269, "eval_scitail-pairs-pos_runtime": 14.8588, "eval_scitail-pairs-pos_samples_per_second": 87.759, "eval_scitail-pairs-pos_steps_per_second": 3.702, "step": 12600 }, { "epoch": 2.2512060032160086, "eval_qnli-contrastive_loss": 1.3819462060928345, "eval_qnli-contrastive_runtime": 4.7186, "eval_qnli-contrastive_samples_per_second": 317.888, "eval_qnli-contrastive_steps_per_second": 13.351, "step": 12600 }, { "epoch": 2.276219403251742, "grad_norm": 9.73590087890625, "learning_rate": 7.118477760161566e-06, "loss": 1.0009, "step": 12740 }, { "epoch": 2.3012328032874754, "grad_norm": 8.564470291137695, "learning_rate": 6.193269416695461e-06, "loss": 1.1822, "step": 12880 }, { "epoch": 2.3262462033232087, "grad_norm": 17.126731872558594, "learning_rate": 5.304761249222376e-06, "loss": 1.3998, "step": 13020 }, { "epoch": 2.351259603358942, "grad_norm": 0.7993360757827759, "learning_rate": 4.461519245117542e-06, "loss": 1.0833, "step": 13160 }, { "epoch": 2.376273003394676, "grad_norm": 14.322105407714844, "learning_rate": 3.6716729867819545e-06, "loss": 1.1998, "step": 13300 }, { "epoch": 2.376273003394676, "eval_nli-pairs_loss": 1.0725221633911133, "eval_nli-pairs_runtime": 12.0178, "eval_nli-pairs_samples_per_second": 124.815, "eval_nli-pairs_steps_per_second": 5.242, "step": 13300 }, { "epoch": 2.376273003394676, "eval_scitail-pairs-pos_loss": 0.5349867343902588, "eval_scitail-pairs-pos_runtime": 14.9892, "eval_scitail-pairs-pos_samples_per_second": 86.996, "eval_scitail-pairs-pos_steps_per_second": 3.669, "step": 13300 }, { "epoch": 2.376273003394676, "eval_qnli-contrastive_loss": 1.2879091501235962, "eval_qnli-contrastive_runtime": 4.7466, "eval_qnli-contrastive_samples_per_second": 316.015, "eval_qnli-contrastive_steps_per_second": 13.273, "step": 13300 }, { "epoch": 2.4012864034304093, "grad_norm": 9.142762184143066, "learning_rate": 2.942837275435191e-06, "loss": 1.0898, "step": 13440 }, { "epoch": 2.4262998034661427, "grad_norm": 10.551756858825684, "learning_rate": 2.282038717844137e-06, "loss": 1.1658, "step": 13580 }, { "epoch": 2.451313203501876, "grad_norm": 5.1452765464782715, "learning_rate": 1.6956479837551532e-06, "loss": 1.0127, "step": 13720 }, { "epoch": 2.4763266035376095, "grad_norm": 10.926447868347168, "learning_rate": 1.1893183871264458e-06, "loss": 1.2826, "step": 13860 }, { "epoch": 2.501340003573343, "grad_norm": 8.374099731445312, "learning_rate": 7.679313832908975e-07, "loss": 1.0243, "step": 14000 }, { "epoch": 2.501340003573343, "eval_nli-pairs_loss": 1.0618858337402344, "eval_nli-pairs_runtime": 11.9457, "eval_nli-pairs_samples_per_second": 125.568, "eval_nli-pairs_steps_per_second": 5.274, "step": 14000 }, { "epoch": 2.501340003573343, "eval_scitail-pairs-pos_loss": 0.5268865823745728, "eval_scitail-pairs-pos_runtime": 14.9206, "eval_scitail-pairs-pos_samples_per_second": 87.396, "eval_scitail-pairs-pos_steps_per_second": 3.686, "step": 14000 }, { "epoch": 2.501340003573343, "eval_qnli-contrastive_loss": 1.311318278312683, "eval_qnli-contrastive_runtime": 4.7255, "eval_qnli-contrastive_samples_per_second": 317.427, "eval_qnli-contrastive_steps_per_second": 13.332, "step": 14000 }, { "epoch": 2.5263534036090762, "grad_norm": 14.999399185180664, "learning_rate": 4.3554950750376746e-07, "loss": 1.0344, "step": 14140 }, { "epoch": 2.5513668036448096, "grad_norm": 7.31856107711792, "learning_rate": 1.953772085883532e-07, "loss": 1.1904, "step": 14280 }, { "epoch": 2.576380203680543, "grad_norm": 13.470648765563965, "learning_rate": 4.9729955277194594e-08, "loss": 1.2133, "step": 14420 }, { "epoch": 2.601393603716277, "grad_norm": 11.626204490661621, "learning_rate": 1.1913090399717774e-11, "loss": 1.3826, "step": 14560 }, { "epoch": 2.6264070037520098, "grad_norm": 3.0835983753204346, "learning_rate": 1.99532975930347e-05, "loss": 1.3857, "step": 14700 }, { "epoch": 2.6264070037520098, "eval_nli-pairs_loss": 1.0945470333099365, "eval_nli-pairs_runtime": 11.9625, "eval_nli-pairs_samples_per_second": 125.392, "eval_nli-pairs_steps_per_second": 5.266, "step": 14700 }, { "epoch": 2.6264070037520098, "eval_scitail-pairs-pos_loss": 0.543086051940918, "eval_scitail-pairs-pos_runtime": 14.9212, "eval_scitail-pairs-pos_samples_per_second": 87.392, "eval_scitail-pairs-pos_steps_per_second": 3.686, "step": 14700 }, { "epoch": 2.6264070037520098, "eval_qnli-contrastive_loss": 1.3532981872558594, "eval_qnli-contrastive_runtime": 4.7202, "eval_qnli-contrastive_samples_per_second": 317.78, "eval_qnli-contrastive_steps_per_second": 13.347, "step": 14700 }, { "epoch": 2.6514204037877436, "grad_norm": 10.123917579650879, "learning_rate": 1.98106486998506e-05, "loss": 1.2257, "step": 14840 }, { "epoch": 2.676433803823477, "grad_norm": 0.7845281362533569, "learning_rate": 1.957341666609194e-05, "loss": 1.2372, "step": 14980 }, { "epoch": 2.7014472038592103, "grad_norm": 8.572671890258789, "learning_rate": 1.9243888613791533e-05, "loss": 1.1853, "step": 15120 }, { "epoch": 2.7264606038949437, "grad_norm": 1.009204626083374, "learning_rate": 1.8828538813739946e-05, "loss": 1.0465, "step": 15260 }, { "epoch": 2.751474003930677, "grad_norm": 7.785921573638916, "learning_rate": 1.832540018150201e-05, "loss": 1.3469, "step": 15400 }, { "epoch": 2.751474003930677, "eval_nli-pairs_loss": 1.0669987201690674, "eval_nli-pairs_runtime": 11.8504, "eval_nli-pairs_samples_per_second": 126.578, "eval_nli-pairs_steps_per_second": 5.316, "step": 15400 }, { "epoch": 2.751474003930677, "eval_scitail-pairs-pos_loss": 0.5464032888412476, "eval_scitail-pairs-pos_runtime": 14.8493, "eval_scitail-pairs-pos_samples_per_second": 87.815, "eval_scitail-pairs-pos_steps_per_second": 3.704, "step": 15400 }, { "epoch": 2.751474003930677, "eval_qnli-contrastive_loss": 1.5209625959396362, "eval_qnli-contrastive_runtime": 4.7597, "eval_qnli-contrastive_samples_per_second": 315.149, "eval_qnli-contrastive_steps_per_second": 13.236, "step": 15400 }, { "epoch": 2.7764874039664105, "grad_norm": 0.9812193512916565, "learning_rate": 1.774199748928214e-05, "loss": 1.1843, "step": 15540 }, { "epoch": 2.801500804002144, "grad_norm": 5.74447774887085, "learning_rate": 1.7083955243729978e-05, "loss": 1.113, "step": 15680 }, { "epoch": 2.8265142040378777, "grad_norm": 12.007781982421875, "learning_rate": 1.6357617541359384e-05, "loss": 0.8078, "step": 15820 }, { "epoch": 2.8515276040736106, "grad_norm": 3.299994468688965, "learning_rate": 1.556998690596969e-05, "loss": 1.0437, "step": 15960 }, { "epoch": 2.8765410041093444, "grad_norm": 0.8269862532615662, "learning_rate": 1.4728656778258573e-05, "loss": 0.9093, "step": 16100 }, { "epoch": 2.8765410041093444, "eval_nli-pairs_loss": 0.9908406138420105, "eval_nli-pairs_runtime": 12.0455, "eval_nli-pairs_samples_per_second": 124.527, "eval_nli-pairs_steps_per_second": 5.23, "step": 16100 }, { "epoch": 2.8765410041093444, "eval_scitail-pairs-pos_loss": 0.49377763271331787, "eval_scitail-pairs-pos_runtime": 15.2422, "eval_scitail-pairs-pos_samples_per_second": 85.552, "eval_scitail-pairs-pos_steps_per_second": 3.608, "step": 16100 }, { "epoch": 2.8765410041093444, "eval_qnli-contrastive_loss": 1.1596204042434692, "eval_qnli-contrastive_runtime": 4.7624, "eval_qnli-contrastive_samples_per_second": 314.967, "eval_qnli-contrastive_steps_per_second": 13.229, "step": 16100 }, { "epoch": 2.901554404145078, "grad_norm": 4.0750861167907715, "learning_rate": 1.3841738308484636e-05, "loss": 1.044, "step": 16240 }, { "epoch": 2.926567804180811, "grad_norm": 7.754133224487305, "learning_rate": 1.291778215796206e-05, "loss": 1.0227, "step": 16380 }, { "epoch": 2.9515812042165446, "grad_norm": 37.815181732177734, "learning_rate": 1.1965696063288423e-05, "loss": 0.8159, "step": 16520 }, { "epoch": 2.976594604252278, "grad_norm": 16.427824020385742, "learning_rate": 1.0994658958057889e-05, "loss": 0.8426, "step": 16660 }, { "epoch": 3.0016080042880113, "grad_norm": 0.6402806043624878, "learning_rate": 1.0014032480000764e-05, "loss": 0.7955, "step": 16800 }, { "epoch": 3.0016080042880113, "eval_nli-pairs_loss": 0.9680945873260498, "eval_nli-pairs_runtime": 13.331, "eval_nli-pairs_samples_per_second": 112.519, "eval_nli-pairs_steps_per_second": 4.726, "step": 16800 }, { "epoch": 3.0016080042880113, "eval_scitail-pairs-pos_loss": 0.49118393659591675, "eval_scitail-pairs-pos_runtime": 15.2998, "eval_scitail-pairs-pos_samples_per_second": 85.23, "eval_scitail-pairs-pos_steps_per_second": 3.595, "step": 16800 }, { "epoch": 3.0016080042880113, "eval_qnli-contrastive_loss": 1.1894794702529907, "eval_qnli-contrastive_runtime": 4.8432, "eval_qnli-contrastive_samples_per_second": 309.715, "eval_qnli-contrastive_steps_per_second": 13.008, "step": 16800 }, { "epoch": 3.0266214043237447, "grad_norm": 3.155766010284424, "learning_rate": 9.03327071669702e-06, "loss": 0.856, "step": 16940 }, { "epoch": 3.051634804359478, "grad_norm": 11.008296966552734, "learning_rate": 8.061829059993542e-06, "loss": 1.0754, "step": 17080 }, { "epoch": 3.076648204395212, "grad_norm": 4.382720947265625, "learning_rate": 7.109073047846788e-06, "loss": 0.9151, "step": 17220 }, { "epoch": 3.1016616044309453, "grad_norm": 2.755722761154175, "learning_rate": 6.184188072434878e-06, "loss": 1.0051, "step": 17360 }, { "epoch": 3.1266750044666787, "grad_norm": 2.4547111988067627, "learning_rate": 5.296090825030854e-06, "loss": 1.0075, "step": 17500 }, { "epoch": 3.1266750044666787, "eval_nli-pairs_loss": 0.9583492875099182, "eval_nli-pairs_runtime": 12.1773, "eval_nli-pairs_samples_per_second": 123.18, "eval_nli-pairs_steps_per_second": 5.174, "step": 17500 }, { "epoch": 3.1266750044666787, "eval_scitail-pairs-pos_loss": 0.485266774892807, "eval_scitail-pairs-pos_runtime": 14.9222, "eval_scitail-pairs-pos_samples_per_second": 87.387, "eval_scitail-pairs-pos_steps_per_second": 3.686, "step": 17500 }, { "epoch": 3.1266750044666787, "eval_qnli-contrastive_loss": 1.0658234357833862, "eval_qnli-contrastive_runtime": 4.7681, "eval_qnli-contrastive_samples_per_second": 314.592, "eval_qnli-contrastive_steps_per_second": 13.213, "step": 17500 }, { "epoch": 3.151688404502412, "grad_norm": 19.061325073242188, "learning_rate": 4.453343331385006e-06, "loss": 0.9909, "step": 17640 }, { "epoch": 3.1767018045381454, "grad_norm": 17.016021728515625, "learning_rate": 3.6640704063896858e-06, "loss": 1.029, "step": 17780 }, { "epoch": 3.201715204573879, "grad_norm": 4.147863864898682, "learning_rate": 2.9358813238350816e-06, "loss": 1.0292, "step": 17920 }, { "epoch": 3.226728604609612, "grad_norm": 27.60422706604004, "learning_rate": 2.275796456427173e-06, "loss": 0.8334, "step": 18060 }, { "epoch": 3.2517420046453456, "grad_norm": 0.7800289392471313, "learning_rate": 1.6901795933215137e-06, "loss": 1.0119, "step": 18200 }, { "epoch": 3.2517420046453456, "eval_nli-pairs_loss": 0.9484548568725586, "eval_nli-pairs_runtime": 12.0697, "eval_nli-pairs_samples_per_second": 124.279, "eval_nli-pairs_steps_per_second": 5.22, "step": 18200 }, { "epoch": 3.2517420046453456, "eval_scitail-pairs-pos_loss": 0.4673975706100464, "eval_scitail-pairs-pos_runtime": 15.0509, "eval_scitail-pairs-pos_samples_per_second": 86.639, "eval_scitail-pairs-pos_steps_per_second": 3.654, "step": 18200 }, { "epoch": 3.2517420046453456, "eval_qnli-contrastive_loss": 1.1171668767929077, "eval_qnli-contrastive_runtime": 4.7871, "eval_qnli-contrastive_samples_per_second": 313.345, "eval_qnli-contrastive_steps_per_second": 13.16, "step": 18200 }, { "epoch": 3.2767554046810794, "grad_norm": 16.64696502685547, "learning_rate": 1.1846765876905709e-06, "loss": 0.8582, "step": 18340 }, { "epoch": 3.3017688047168123, "grad_norm": 16.13783073425293, "learning_rate": 7.668532006209551e-07, "loss": 1.0397, "step": 18480 }, { "epoch": 3.326782204752546, "grad_norm": 3.76619553565979, "learning_rate": 4.347306328421508e-07, "loss": 1.1988, "step": 18620 }, { "epoch": 3.3517956047882795, "grad_norm": 10.401665687561035, "learning_rate": 1.948255365952012e-07, "loss": 0.9432, "step": 18760 }, { "epoch": 3.376809004824013, "grad_norm": 2.400106191635132, "learning_rate": 4.945080454776929e-08, "loss": 1.0573, "step": 18900 }, { "epoch": 3.376809004824013, "eval_nli-pairs_loss": 0.9437180757522583, "eval_nli-pairs_runtime": 12.0974, "eval_nli-pairs_samples_per_second": 123.993, "eval_nli-pairs_steps_per_second": 5.208, "step": 18900 }, { "epoch": 3.376809004824013, "eval_scitail-pairs-pos_loss": 0.46788787841796875, "eval_scitail-pairs-pos_runtime": 15.1516, "eval_scitail-pairs-pos_samples_per_second": 86.063, "eval_scitail-pairs-pos_steps_per_second": 3.63, "step": 18900 }, { "epoch": 3.376809004824013, "eval_qnli-contrastive_loss": 1.081482172012329, "eval_qnli-contrastive_runtime": 4.8096, "eval_qnli-contrastive_samples_per_second": 311.875, "eval_qnli-contrastive_steps_per_second": 13.099, "step": 18900 }, { "epoch": 3.4018224048597463, "grad_norm": 25.33026695251465, "learning_rate": 7.974879220329356e-12, "loss": 0.9829, "step": 19040 }, { "epoch": 3.4268358048954797, "grad_norm": 4.218173027038574, "learning_rate": 1.995302628075987e-05, "loss": 1.0573, "step": 19180 }, { "epoch": 3.451849204931213, "grad_norm": 13.573431015014648, "learning_rate": 1.98101047527748e-05, "loss": 0.9449, "step": 19320 }, { "epoch": 3.4768626049669464, "grad_norm": 6.658699989318848, "learning_rate": 1.9572605328335534e-05, "loss": 1.2005, "step": 19460 }, { "epoch": 3.5018760050026803, "grad_norm": 6.075576305389404, "learning_rate": 1.924281770735239e-05, "loss": 0.9171, "step": 19600 }, { "epoch": 3.5018760050026803, "eval_nli-pairs_loss": 0.9502684473991394, "eval_nli-pairs_runtime": 12.0413, "eval_nli-pairs_samples_per_second": 124.572, "eval_nli-pairs_steps_per_second": 5.232, "step": 19600 }, { "epoch": 3.5018760050026803, "eval_scitail-pairs-pos_loss": 0.4798508584499359, "eval_scitail-pairs-pos_runtime": 14.9533, "eval_scitail-pairs-pos_samples_per_second": 87.205, "eval_scitail-pairs-pos_steps_per_second": 3.678, "step": 19600 }, { "epoch": 3.5018760050026803, "eval_qnli-contrastive_loss": 1.2315282821655273, "eval_qnli-contrastive_runtime": 4.7188, "eval_qnli-contrastive_samples_per_second": 317.874, "eval_qnli-contrastive_steps_per_second": 13.351, "step": 19600 }, { "epoch": 3.526889405038413, "grad_norm": 8.40775203704834, "learning_rate": 1.8823921327788075e-05, "loss": 0.9425, "step": 19740 }, { "epoch": 3.551902805074147, "grad_norm": 11.214140892028809, "learning_rate": 1.831995471312526e-05, "loss": 1.1213, "step": 19880 }, { "epoch": 3.5769162051098804, "grad_norm": 10.211651802062988, "learning_rate": 1.7735776537506483e-05, "loss": 1.1128, "step": 20020 }, { "epoch": 3.6019296051456138, "grad_norm": 44.01512908935547, "learning_rate": 1.707701878391224e-05, "loss": 1.331, "step": 20160 }, { "epoch": 3.626943005181347, "grad_norm": 13.295893669128418, "learning_rate": 1.6350032446972868e-05, "loss": 1.0495, "step": 20300 }, { "epoch": 3.626943005181347, "eval_nli-pairs_loss": 0.9468088150024414, "eval_nli-pairs_runtime": 11.9325, "eval_nli-pairs_samples_per_second": 125.707, "eval_nli-pairs_steps_per_second": 5.28, "step": 20300 }, { "epoch": 3.626943005181347, "eval_scitail-pairs-pos_loss": 0.4434490203857422, "eval_scitail-pairs-pos_runtime": 15.5134, "eval_scitail-pairs-pos_samples_per_second": 84.056, "eval_scitail-pairs-pos_steps_per_second": 3.545, "step": 20300 }, { "epoch": 3.626943005181347, "eval_qnli-contrastive_loss": 1.141271710395813, "eval_qnli-contrastive_runtime": 4.7207, "eval_qnli-contrastive_samples_per_second": 317.752, "eval_qnli-contrastive_steps_per_second": 13.346, "step": 20300 }, { "epoch": 3.6519564052170805, "grad_norm": 71.68439483642578, "learning_rate": 1.5561826303886085e-05, "loss": 0.9698, "step": 20440 }, { "epoch": 3.676969805252814, "grad_norm": 5.957241058349609, "learning_rate": 1.4719999343741618e-05, "loss": 0.9148, "step": 20580 }, { "epoch": 3.7019832052885473, "grad_norm": 1.4626597166061401, "learning_rate": 1.3839147028686583e-05, "loss": 0.9042, "step": 20720 }, { "epoch": 3.726996605324281, "grad_norm": 2.4634809494018555, "learning_rate": 1.2915097668067934e-05, "loss": 0.8232, "step": 20860 }, { "epoch": 3.752010005360014, "grad_norm": 1.5838899612426758, "learning_rate": 1.196294424410312e-05, "loss": 1.0163, "step": 21000 }, { "epoch": 3.752010005360014, "eval_nli-pairs_loss": 0.9020450115203857, "eval_nli-pairs_runtime": 12.2572, "eval_nli-pairs_samples_per_second": 122.377, "eval_nli-pairs_steps_per_second": 5.14, "step": 21000 }, { "epoch": 3.752010005360014, "eval_scitail-pairs-pos_loss": 0.4573577046394348, "eval_scitail-pairs-pos_runtime": 15.1478, "eval_scitail-pairs-pos_samples_per_second": 86.085, "eval_scitail-pairs-pos_steps_per_second": 3.631, "step": 21000 }, { "epoch": 3.752010005360014, "eval_qnli-contrastive_loss": 1.2882591485977173, "eval_qnli-contrastive_runtime": 4.762, "eval_qnli-contrastive_samples_per_second": 314.992, "eval_qnli-contrastive_steps_per_second": 13.23, "step": 21000 }, { "epoch": 3.777023405395748, "grad_norm": 5.878975868225098, "learning_rate": 1.099186633949893e-05, "loss": 0.9735, "step": 21140 }, { "epoch": 3.8020368054314813, "grad_norm": 10.22749137878418, "learning_rate": 1.0011225985326909e-05, "loss": 0.8371, "step": 21280 }, { "epoch": 3.8270502054672146, "grad_norm": 8.895988464355469, "learning_rate": 9.030477402944833e-06, "loss": 0.6344, "step": 21420 }, { "epoch": 3.852063605502948, "grad_norm": 1.564530372619629, "learning_rate": 8.059075857124063e-06, "loss": 0.87, "step": 21560 }, { "epoch": 3.8770770055386814, "grad_norm": 3.3526771068573, "learning_rate": 7.106386499117424e-06, "loss": 0.7404, "step": 21700 }, { "epoch": 3.8770770055386814, "eval_nli-pairs_loss": 0.8661152720451355, "eval_nli-pairs_runtime": 11.9159, "eval_nli-pairs_samples_per_second": 125.883, "eval_nli-pairs_steps_per_second": 5.287, "step": 21700 }, { "epoch": 3.8770770055386814, "eval_scitail-pairs-pos_loss": 0.4352877140045166, "eval_scitail-pairs-pos_runtime": 14.9412, "eval_scitail-pairs-pos_samples_per_second": 87.275, "eval_scitail-pairs-pos_steps_per_second": 3.681, "step": 21700 }, { "epoch": 3.8770770055386814, "eval_qnli-contrastive_loss": 1.0643585920333862, "eval_qnli-contrastive_runtime": 4.7458, "eval_qnli-contrastive_samples_per_second": 316.066, "eval_qnli-contrastive_steps_per_second": 13.275, "step": 21700 }, { "epoch": 3.9020904055744148, "grad_norm": 6.517562389373779, "learning_rate": 6.181594078499504e-06, "loss": 0.8486, "step": 21840 }, { "epoch": 3.927103805610148, "grad_norm": 4.482045650482178, "learning_rate": 5.293614394235034e-06, "loss": 0.8895, "step": 21980 }, { "epoch": 3.952117205645882, "grad_norm": 5.165999889373779, "learning_rate": 4.451008338663955e-06, "loss": 0.7476, "step": 22120 }, { "epoch": 3.977130605681615, "grad_norm": 7.821371078491211, "learning_rate": 3.6618993630932396e-06, "loss": 0.6761, "step": 22260 } ], "logging_steps": 140, "max_steps": 27985, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 6997, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 28, "trial_name": null, "trial_params": null }