{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 392, "global_step": 15669, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015125406854298296, "grad_norm": 199.77919006347656, "learning_rate": 5.589586523736601e-07, "loss": 12.973, "step": 79 }, { "epoch": 0.030250813708596593, "grad_norm": 70.86483764648438, "learning_rate": 1.1638591117917304e-06, "loss": 8.4661, "step": 158 }, { "epoch": 0.045376220562894885, "grad_norm": 17.361167907714844, "learning_rate": 1.768759571209801e-06, "loss": 6.136, "step": 237 }, { "epoch": 0.060501627417193185, "grad_norm": 13.593123435974121, "learning_rate": 2.3736600306278715e-06, "loss": 6.2319, "step": 316 }, { "epoch": 0.07505265173272066, "eval_nli-pairs_loss": 5.313699245452881, "eval_nli-pairs_runtime": 12.1282, "eval_nli-pairs_samples_per_second": 123.679, "eval_nli-pairs_steps_per_second": 5.195, "step": 392 }, { "epoch": 0.07505265173272066, "eval_scitail-pairs-pos_loss": 3.977630615234375, "eval_scitail-pairs-pos_runtime": 15.2195, "eval_scitail-pairs-pos_samples_per_second": 85.68, "eval_scitail-pairs-pos_steps_per_second": 3.614, "step": 392 }, { "epoch": 0.07505265173272066, "eval_qnli-contrastive_loss": 5.520341396331787, "eval_qnli-contrastive_runtime": 4.7394, "eval_qnli-contrastive_samples_per_second": 316.494, "eval_qnli-contrastive_steps_per_second": 13.293, "step": 392 }, { "epoch": 0.07562703427149148, "grad_norm": 15.363186836242676, "learning_rate": 2.978560490045942e-06, "loss": 5.6068, "step": 395 }, { "epoch": 0.09075244112578977, "grad_norm": 18.922758102416992, "learning_rate": 3.5834609494640125e-06, "loss": 5.2502, "step": 474 }, { "epoch": 0.10587784798008808, "grad_norm": 20.363380432128906, "learning_rate": 4.188361408882083e-06, "loss": 4.8699, "step": 553 }, { "epoch": 0.12100325483438637, "grad_norm": 14.830269813537598, "learning_rate": 4.793261868300153e-06, "loss": 4.9584, "step": 632 }, { "epoch": 0.13612866168868468, "grad_norm": 26.075838088989258, "learning_rate": 5.398162327718224e-06, "loss": 4.5632, "step": 711 }, { "epoch": 0.15010530346544132, "eval_nli-pairs_loss": 4.265738487243652, "eval_nli-pairs_runtime": 12.0989, "eval_nli-pairs_samples_per_second": 123.978, "eval_nli-pairs_steps_per_second": 5.207, "step": 784 }, { "epoch": 0.15010530346544132, "eval_scitail-pairs-pos_loss": 2.458251476287842, "eval_scitail-pairs-pos_runtime": 15.2215, "eval_scitail-pairs-pos_samples_per_second": 85.668, "eval_scitail-pairs-pos_steps_per_second": 3.613, "step": 784 }, { "epoch": 0.15010530346544132, "eval_qnli-contrastive_loss": 4.81198263168335, "eval_qnli-contrastive_runtime": 4.724, "eval_qnli-contrastive_samples_per_second": 317.525, "eval_qnli-contrastive_steps_per_second": 13.336, "step": 784 }, { "epoch": 0.15125406854298296, "grad_norm": 17.387819290161133, "learning_rate": 6.003062787136294e-06, "loss": 4.223, "step": 790 }, { "epoch": 0.16637947539728126, "grad_norm": 24.702957153320312, "learning_rate": 6.607963246554365e-06, "loss": 3.8496, "step": 869 }, { "epoch": 0.18150488225157954, "grad_norm": 20.878055572509766, "learning_rate": 7.212863705972435e-06, "loss": 3.4414, "step": 948 }, { "epoch": 0.19663028910587785, "grad_norm": 28.57908821105957, "learning_rate": 7.817764165390506e-06, "loss": 3.3513, "step": 1027 }, { "epoch": 0.21175569596017615, "grad_norm": 37.09183120727539, "learning_rate": 8.422664624808575e-06, "loss": 3.5611, "step": 1106 }, { "epoch": 0.22515795519816198, "eval_nli-pairs_loss": 3.178299903869629, "eval_nli-pairs_runtime": 12.0715, "eval_nli-pairs_samples_per_second": 124.26, "eval_nli-pairs_steps_per_second": 5.219, "step": 1176 }, { "epoch": 0.22515795519816198, "eval_scitail-pairs-pos_loss": 1.983331561088562, "eval_scitail-pairs-pos_runtime": 15.1626, "eval_scitail-pairs-pos_samples_per_second": 86.001, "eval_scitail-pairs-pos_steps_per_second": 3.627, "step": 1176 }, { "epoch": 0.22515795519816198, "eval_qnli-contrastive_loss": 3.4507648944854736, "eval_qnli-contrastive_runtime": 4.7752, "eval_qnli-contrastive_samples_per_second": 314.125, "eval_qnli-contrastive_steps_per_second": 13.193, "step": 1176 }, { "epoch": 0.22688110281447443, "grad_norm": 22.88146209716797, "learning_rate": 9.027565084226646e-06, "loss": 3.4039, "step": 1185 }, { "epoch": 0.24200650966877274, "grad_norm": 20.4180908203125, "learning_rate": 9.632465543644716e-06, "loss": 3.4269, "step": 1264 }, { "epoch": 0.25713191652307105, "grad_norm": 23.59966278076172, "learning_rate": 1.0237366003062788e-05, "loss": 3.1573, "step": 1343 }, { "epoch": 0.27225732337736935, "grad_norm": 10.84000301361084, "learning_rate": 1.0842266462480856e-05, "loss": 3.253, "step": 1422 }, { "epoch": 0.2873827302316676, "grad_norm": 16.418413162231445, "learning_rate": 1.1447166921898928e-05, "loss": 2.7614, "step": 1501 }, { "epoch": 0.30021060693088264, "eval_nli-pairs_loss": 2.722890615463257, "eval_nli-pairs_runtime": 12.0687, "eval_nli-pairs_samples_per_second": 124.288, "eval_nli-pairs_steps_per_second": 5.22, "step": 1568 }, { "epoch": 0.30021060693088264, "eval_scitail-pairs-pos_loss": 1.6435188055038452, "eval_scitail-pairs-pos_runtime": 15.2101, "eval_scitail-pairs-pos_samples_per_second": 85.732, "eval_scitail-pairs-pos_steps_per_second": 3.616, "step": 1568 }, { "epoch": 0.30021060693088264, "eval_qnli-contrastive_loss": 2.944777011871338, "eval_qnli-contrastive_runtime": 4.7212, "eval_qnli-contrastive_samples_per_second": 317.713, "eval_qnli-contrastive_steps_per_second": 13.344, "step": 1568 }, { "epoch": 0.3025081370859659, "grad_norm": 20.777223587036133, "learning_rate": 1.2052067381317e-05, "loss": 2.9549, "step": 1580 }, { "epoch": 0.3176335439402642, "grad_norm": 15.09938907623291, "learning_rate": 1.265696784073507e-05, "loss": 2.8357, "step": 1659 }, { "epoch": 0.3327589507945625, "grad_norm": 5.233273983001709, "learning_rate": 1.326186830015314e-05, "loss": 2.8964, "step": 1738 }, { "epoch": 0.34788435764886083, "grad_norm": 16.8189640045166, "learning_rate": 1.386676875957121e-05, "loss": 2.8274, "step": 1817 }, { "epoch": 0.3630097645031591, "grad_norm": 8.114161491394043, "learning_rate": 1.4471669218989282e-05, "loss": 2.6809, "step": 1896 }, { "epoch": 0.37526325866360327, "eval_nli-pairs_loss": 2.428619384765625, "eval_nli-pairs_runtime": 12.0706, "eval_nli-pairs_samples_per_second": 124.269, "eval_nli-pairs_steps_per_second": 5.219, "step": 1960 }, { "epoch": 0.37526325866360327, "eval_scitail-pairs-pos_loss": 1.3531062602996826, "eval_scitail-pairs-pos_runtime": 15.2633, "eval_scitail-pairs-pos_samples_per_second": 85.434, "eval_scitail-pairs-pos_steps_per_second": 3.603, "step": 1960 }, { "epoch": 0.37526325866360327, "eval_qnli-contrastive_loss": 2.404916286468506, "eval_qnli-contrastive_runtime": 4.7194, "eval_qnli-contrastive_samples_per_second": 317.838, "eval_qnli-contrastive_steps_per_second": 13.349, "step": 1960 }, { "epoch": 0.3781351713574574, "grad_norm": 22.405332565307617, "learning_rate": 1.5076569678407352e-05, "loss": 2.3456, "step": 1975 }, { "epoch": 0.3932605782117557, "grad_norm": 33.843994140625, "learning_rate": 1.5681470137825424e-05, "loss": 2.5316, "step": 2054 }, { "epoch": 0.408385985066054, "grad_norm": 3.7852566242218018, "learning_rate": 1.6286370597243492e-05, "loss": 2.653, "step": 2133 }, { "epoch": 0.4235113919203523, "grad_norm": 28.830053329467773, "learning_rate": 1.689127105666156e-05, "loss": 2.699, "step": 2212 }, { "epoch": 0.43863679877465056, "grad_norm": 26.699514389038086, "learning_rate": 1.7496171516079635e-05, "loss": 2.424, "step": 2291 }, { "epoch": 0.45031591039632396, "eval_nli-pairs_loss": 2.207122564315796, "eval_nli-pairs_runtime": 12.0919, "eval_nli-pairs_samples_per_second": 124.05, "eval_nli-pairs_steps_per_second": 5.21, "step": 2352 }, { "epoch": 0.45031591039632396, "eval_scitail-pairs-pos_loss": 1.2252534627914429, "eval_scitail-pairs-pos_runtime": 15.1733, "eval_scitail-pairs-pos_samples_per_second": 85.941, "eval_scitail-pairs-pos_steps_per_second": 3.625, "step": 2352 }, { "epoch": 0.45031591039632396, "eval_qnli-contrastive_loss": 2.292630672454834, "eval_qnli-contrastive_runtime": 4.7338, "eval_qnli-contrastive_samples_per_second": 316.868, "eval_qnli-contrastive_steps_per_second": 13.308, "step": 2352 }, { "epoch": 0.45376220562894887, "grad_norm": 3.1586949825286865, "learning_rate": 1.8101071975497704e-05, "loss": 2.4716, "step": 2370 }, { "epoch": 0.4688876124832472, "grad_norm": 15.398905754089355, "learning_rate": 1.8705972434915772e-05, "loss": 2.0097, "step": 2449 }, { "epoch": 0.4840130193375455, "grad_norm": 2.9506657123565674, "learning_rate": 1.9310872894333844e-05, "loss": 2.3993, "step": 2528 }, { "epoch": 0.4991384261918438, "grad_norm": 18.736677169799805, "learning_rate": 1.9915773353751916e-05, "loss": 2.3295, "step": 2607 }, { "epoch": 0.5142638330461421, "grad_norm": 16.75814437866211, "learning_rate": 2.0520673813169984e-05, "loss": 2.348, "step": 2686 }, { "epoch": 0.5253685621290446, "eval_nli-pairs_loss": 2.0092170238494873, "eval_nli-pairs_runtime": 12.0787, "eval_nli-pairs_samples_per_second": 124.185, "eval_nli-pairs_steps_per_second": 5.216, "step": 2744 }, { "epoch": 0.5253685621290446, "eval_scitail-pairs-pos_loss": 1.0735079050064087, "eval_scitail-pairs-pos_runtime": 14.9317, "eval_scitail-pairs-pos_samples_per_second": 87.331, "eval_scitail-pairs-pos_steps_per_second": 3.683, "step": 2744 }, { "epoch": 0.5253685621290446, "eval_qnli-contrastive_loss": 1.9999727010726929, "eval_qnli-contrastive_runtime": 4.659, "eval_qnli-contrastive_samples_per_second": 321.961, "eval_qnli-contrastive_steps_per_second": 13.522, "step": 2744 }, { "epoch": 0.5293892399004404, "grad_norm": 3.6279871463775635, "learning_rate": 2.1125574272588056e-05, "loss": 2.0747, "step": 2765 }, { "epoch": 0.5445146467547387, "grad_norm": 102.07367706298828, "learning_rate": 2.1730474732006124e-05, "loss": 2.3592, "step": 2844 }, { "epoch": 0.5596400536090369, "grad_norm": 12.037158966064453, "learning_rate": 2.23353751914242e-05, "loss": 2.2563, "step": 2923 }, { "epoch": 0.5747654604633352, "grad_norm": 11.711392402648926, "learning_rate": 2.2940275650842267e-05, "loss": 2.3484, "step": 3002 }, { "epoch": 0.5898908673176335, "grad_norm": 20.607454299926758, "learning_rate": 2.3545176110260336e-05, "loss": 1.868, "step": 3081 }, { "epoch": 0.6004212138617653, "eval_nli-pairs_loss": 1.846701979637146, "eval_nli-pairs_runtime": 11.9121, "eval_nli-pairs_samples_per_second": 125.922, "eval_nli-pairs_steps_per_second": 5.289, "step": 3136 }, { "epoch": 0.6004212138617653, "eval_scitail-pairs-pos_loss": 0.9629871249198914, "eval_scitail-pairs-pos_runtime": 15.006, "eval_scitail-pairs-pos_samples_per_second": 86.899, "eval_scitail-pairs-pos_steps_per_second": 3.665, "step": 3136 }, { "epoch": 0.6004212138617653, "eval_qnli-contrastive_loss": 1.9593416452407837, "eval_qnli-contrastive_runtime": 4.653, "eval_qnli-contrastive_samples_per_second": 322.374, "eval_qnli-contrastive_steps_per_second": 13.54, "step": 3136 }, { "epoch": 0.6050162741719318, "grad_norm": 15.901214599609375, "learning_rate": 2.4150076569678408e-05, "loss": 1.9958, "step": 3160 }, { "epoch": 0.6201416810262301, "grad_norm": 13.168147087097168, "learning_rate": 2.475497702909648e-05, "loss": 2.0089, "step": 3239 }, { "epoch": 0.6352670878805284, "grad_norm": 21.926223754882812, "learning_rate": 2.5359877488514548e-05, "loss": 1.8303, "step": 3318 }, { "epoch": 0.6503924947348267, "grad_norm": 21.501989364624023, "learning_rate": 2.596477794793262e-05, "loss": 1.6892, "step": 3397 }, { "epoch": 0.665517901589125, "grad_norm": 3.5192618370056152, "learning_rate": 2.6569678407350688e-05, "loss": 1.8379, "step": 3476 }, { "epoch": 0.675473865594486, "eval_nli-pairs_loss": 1.7486572265625, "eval_nli-pairs_runtime": 12.0369, "eval_nli-pairs_samples_per_second": 124.617, "eval_nli-pairs_steps_per_second": 5.234, "step": 3528 }, { "epoch": 0.675473865594486, "eval_scitail-pairs-pos_loss": 0.9056742191314697, "eval_scitail-pairs-pos_runtime": 14.8901, "eval_scitail-pairs-pos_samples_per_second": 87.575, "eval_scitail-pairs-pos_steps_per_second": 3.694, "step": 3528 }, { "epoch": 0.675473865594486, "eval_qnli-contrastive_loss": 1.7076925039291382, "eval_qnli-contrastive_runtime": 4.6837, "eval_qnli-contrastive_samples_per_second": 320.259, "eval_qnli-contrastive_steps_per_second": 13.451, "step": 3528 }, { "epoch": 0.6806433084434234, "grad_norm": 13.107728004455566, "learning_rate": 2.717457886676876e-05, "loss": 1.4958, "step": 3555 }, { "epoch": 0.6957687152977217, "grad_norm": 10.731244087219238, "learning_rate": 2.777947932618683e-05, "loss": 1.9504, "step": 3634 }, { "epoch": 0.7108941221520199, "grad_norm": 1.3723793029785156, "learning_rate": 2.83843797856049e-05, "loss": 1.6017, "step": 3713 }, { "epoch": 0.7260195290063182, "grad_norm": 16.096094131469727, "learning_rate": 2.8989280245022975e-05, "loss": 1.7229, "step": 3792 }, { "epoch": 0.7411449358606165, "grad_norm": 14.629384994506836, "learning_rate": 2.9594180704441043e-05, "loss": 1.5996, "step": 3871 }, { "epoch": 0.7505265173272065, "eval_nli-pairs_loss": 1.6035664081573486, "eval_nli-pairs_runtime": 12.0239, "eval_nli-pairs_samples_per_second": 124.752, "eval_nli-pairs_steps_per_second": 5.24, "step": 3920 }, { "epoch": 0.7505265173272065, "eval_scitail-pairs-pos_loss": 0.7905139923095703, "eval_scitail-pairs-pos_runtime": 15.2398, "eval_scitail-pairs-pos_samples_per_second": 85.566, "eval_scitail-pairs-pos_steps_per_second": 3.609, "step": 3920 }, { "epoch": 0.7505265173272065, "eval_qnli-contrastive_loss": 1.7369401454925537, "eval_qnli-contrastive_runtime": 4.726, "eval_qnli-contrastive_samples_per_second": 317.396, "eval_qnli-contrastive_steps_per_second": 13.331, "step": 3920 }, { "epoch": 0.7562703427149148, "grad_norm": 12.058998107910156, "learning_rate": 2.999673874450528e-05, "loss": 1.6257, "step": 3950 }, { "epoch": 0.7713957495692131, "grad_norm": 4.181306838989258, "learning_rate": 2.9946841125275615e-05, "loss": 1.6094, "step": 4029 }, { "epoch": 0.7865211564235114, "grad_norm": 14.733617782592773, "learning_rate": 2.983695736786804e-05, "loss": 1.6061, "step": 4108 }, { "epoch": 0.8016465632778097, "grad_norm": 75.19181823730469, "learning_rate": 2.96675284686242e-05, "loss": 1.8917, "step": 4187 }, { "epoch": 0.816771970132108, "grad_norm": 17.123188018798828, "learning_rate": 2.943923439632653e-05, "loss": 1.766, "step": 4266 }, { "epoch": 0.8255791690599272, "eval_nli-pairs_loss": 1.5217715501785278, "eval_nli-pairs_runtime": 12.1712, "eval_nli-pairs_samples_per_second": 123.241, "eval_nli-pairs_steps_per_second": 5.176, "step": 4312 }, { "epoch": 0.8255791690599272, "eval_scitail-pairs-pos_loss": 0.7310367226600647, "eval_scitail-pairs-pos_runtime": 15.0699, "eval_scitail-pairs-pos_samples_per_second": 86.53, "eval_scitail-pairs-pos_steps_per_second": 3.65, "step": 4312 }, { "epoch": 0.8255791690599272, "eval_qnli-contrastive_loss": 1.8110274076461792, "eval_qnli-contrastive_runtime": 4.7354, "eval_qnli-contrastive_samples_per_second": 316.764, "eval_qnli-contrastive_steps_per_second": 13.304, "step": 4312 }, { "epoch": 0.8318973769864063, "grad_norm": 26.6308536529541, "learning_rate": 2.9152991363280456e-05, "loss": 1.6544, "step": 4345 }, { "epoch": 0.8470227838407046, "grad_norm": 11.87916088104248, "learning_rate": 2.8809948148280698e-05, "loss": 1.5872, "step": 4424 }, { "epoch": 0.8621481906950029, "grad_norm": 5.825096607208252, "learning_rate": 2.841148148621882e-05, "loss": 1.6237, "step": 4503 }, { "epoch": 0.8772735975493011, "grad_norm": 7.624891757965088, "learning_rate": 2.7959190542834895e-05, "loss": 1.5713, "step": 4582 }, { "epoch": 0.8923990044035994, "grad_norm": 11.067708969116211, "learning_rate": 2.7454890496787676e-05, "loss": 1.5109, "step": 4661 }, { "epoch": 0.9006318207926479, "eval_nli-pairs_loss": 1.4145296812057495, "eval_nli-pairs_runtime": 12.1688, "eval_nli-pairs_samples_per_second": 123.266, "eval_nli-pairs_steps_per_second": 5.177, "step": 4704 }, { "epoch": 0.9006318207926479, "eval_scitail-pairs-pos_loss": 0.7044198513031006, "eval_scitail-pairs-pos_runtime": 15.0745, "eval_scitail-pairs-pos_samples_per_second": 86.504, "eval_scitail-pairs-pos_steps_per_second": 3.649, "step": 4704 }, { "epoch": 0.9006318207926479, "eval_qnli-contrastive_loss": 1.5929718017578125, "eval_qnli-contrastive_runtime": 4.7378, "eval_qnli-contrastive_samples_per_second": 316.603, "eval_qnli-contrastive_steps_per_second": 13.297, "step": 4704 }, { "epoch": 0.9075244112578977, "grad_norm": 18.31964874267578, "learning_rate": 2.6900605254800455e-05, "loss": 1.8614, "step": 4740 }, { "epoch": 0.922649818112196, "grad_norm": 11.028084754943848, "learning_rate": 2.6298559329118796e-05, "loss": 1.2809, "step": 4819 }, { "epoch": 0.9377752249664943, "grad_norm": 11.14758586883545, "learning_rate": 2.565116890987845e-05, "loss": 1.4557, "step": 4898 }, { "epoch": 0.9529006318207927, "grad_norm": 12.307340621948242, "learning_rate": 2.4970023905369427e-05, "loss": 2.285, "step": 4977 }, { "epoch": 0.968026038675091, "grad_norm": 19.368682861328125, "learning_rate": 2.4249872456580537e-05, "loss": 1.5918, "step": 5056 }, { "epoch": 0.9756844725253686, "eval_nli-pairs_loss": 1.3622660636901855, "eval_nli-pairs_runtime": 12.1119, "eval_nli-pairs_samples_per_second": 123.845, "eval_nli-pairs_steps_per_second": 5.201, "step": 5096 }, { "epoch": 0.9756844725253686, "eval_scitail-pairs-pos_loss": 0.6618204116821289, "eval_scitail-pairs-pos_runtime": 15.1844, "eval_scitail-pairs-pos_samples_per_second": 85.877, "eval_scitail-pairs-pos_steps_per_second": 3.622, "step": 5096 }, { "epoch": 0.9756844725253686, "eval_qnli-contrastive_loss": 1.5225657224655151, "eval_qnli-contrastive_runtime": 4.73, "eval_qnli-contrastive_samples_per_second": 317.125, "eval_qnli-contrastive_steps_per_second": 13.319, "step": 5096 }, { "epoch": 0.9831514455293893, "grad_norm": 23.91764259338379, "learning_rate": 2.349353206401398e-05, "loss": 1.5956, "step": 5135 }, { "epoch": 0.9982768523836876, "grad_norm": 28.184560775756836, "learning_rate": 2.269363669859137e-05, "loss": 1.309, "step": 5214 }, { "epoch": 1.0134022592379859, "grad_norm": 1.2889472246170044, "learning_rate": 2.186286447094588e-05, "loss": 1.6033, "step": 5293 }, { "epoch": 1.0285276660922842, "grad_norm": 9.043930053710938, "learning_rate": 2.1004549518185432e-05, "loss": 1.2943, "step": 5372 }, { "epoch": 1.0436530729465825, "grad_norm": 15.558199882507324, "learning_rate": 2.012213651460107e-05, "loss": 1.4881, "step": 5451 }, { "epoch": 1.0507371242580892, "eval_nli-pairs_loss": 1.3221956491470337, "eval_nli-pairs_runtime": 12.1205, "eval_nli-pairs_samples_per_second": 123.757, "eval_nli-pairs_steps_per_second": 5.198, "step": 5488 }, { "epoch": 1.0507371242580892, "eval_scitail-pairs-pos_loss": 0.6279736161231995, "eval_scitail-pairs-pos_runtime": 15.0898, "eval_scitail-pairs-pos_samples_per_second": 86.416, "eval_scitail-pairs-pos_steps_per_second": 3.645, "step": 5488 }, { "epoch": 1.0507371242580892, "eval_qnli-contrastive_loss": 1.5666921138763428, "eval_qnli-contrastive_runtime": 4.7489, "eval_qnli-contrastive_samples_per_second": 315.863, "eval_qnli-contrastive_steps_per_second": 13.266, "step": 5488 }, { "epoch": 1.0587784798008808, "grad_norm": 22.18709373474121, "learning_rate": 1.921916684716005e-05, "loss": 1.6734, "step": 5530 }, { "epoch": 1.073903886655179, "grad_norm": 2.1289186477661133, "learning_rate": 1.8299264402862166e-05, "loss": 1.6602, "step": 5609 }, { "epoch": 1.0890292935094774, "grad_norm": 8.099466323852539, "learning_rate": 1.7366121024998667e-05, "loss": 1.4626, "step": 5688 }, { "epoch": 1.1041547003637757, "grad_norm": 11.092597007751465, "learning_rate": 1.642348169668238e-05, "loss": 1.4048, "step": 5767 }, { "epoch": 1.1192801072180738, "grad_norm": 1.632265329360962, "learning_rate": 1.5475129511111833e-05, "loss": 1.5961, "step": 5846 }, { "epoch": 1.12578977599081, "eval_nli-pairs_loss": 1.257077932357788, "eval_nli-pairs_runtime": 12.0966, "eval_nli-pairs_samples_per_second": 124.002, "eval_nli-pairs_steps_per_second": 5.208, "step": 5880 }, { "epoch": 1.12578977599081, "eval_scitail-pairs-pos_loss": 0.6171609163284302, "eval_scitail-pairs-pos_runtime": 15.2057, "eval_scitail-pairs-pos_samples_per_second": 85.757, "eval_scitail-pairs-pos_steps_per_second": 3.617, "step": 5880 }, { "epoch": 1.12578977599081, "eval_qnli-contrastive_loss": 1.4182076454162598, "eval_qnli-contrastive_runtime": 4.7646, "eval_qnli-contrastive_samples_per_second": 314.825, "eval_qnli-contrastive_steps_per_second": 13.223, "step": 5880 }, { "epoch": 1.1344055140723721, "grad_norm": 17.874731063842773, "learning_rate": 1.452487048888817e-05, "loss": 1.4949, "step": 5925 }, { "epoch": 1.1495309209266704, "grad_norm": 5.625218391418457, "learning_rate": 1.357651830331762e-05, "loss": 1.7542, "step": 6004 }, { "epoch": 1.1646563277809687, "grad_norm": 12.764110565185547, "learning_rate": 1.2633878975001336e-05, "loss": 1.3177, "step": 6083 }, { "epoch": 1.179781734635267, "grad_norm": 14.75761890411377, "learning_rate": 1.1700735597137837e-05, "loss": 1.1522, "step": 6162 }, { "epoch": 1.1949071414895653, "grad_norm": 7.778223037719727, "learning_rate": 1.078083315283995e-05, "loss": 1.0727, "step": 6241 }, { "epoch": 1.2008424277235306, "eval_nli-pairs_loss": 1.2002286911010742, "eval_nli-pairs_runtime": 12.1083, "eval_nli-pairs_samples_per_second": 123.882, "eval_nli-pairs_steps_per_second": 5.203, "step": 6272 }, { "epoch": 1.2008424277235306, "eval_scitail-pairs-pos_loss": 0.587746798992157, "eval_scitail-pairs-pos_runtime": 15.2398, "eval_scitail-pairs-pos_samples_per_second": 85.565, "eval_scitail-pairs-pos_steps_per_second": 3.609, "step": 6272 }, { "epoch": 1.2008424277235306, "eval_qnli-contrastive_loss": 1.5079773664474487, "eval_qnli-contrastive_runtime": 4.7468, "eval_qnli-contrastive_samples_per_second": 316.005, "eval_qnli-contrastive_steps_per_second": 13.272, "step": 6272 }, { "epoch": 1.2100325483438636, "grad_norm": 5.742403507232666, "learning_rate": 9.877863485398942e-06, "loss": 1.598, "step": 6320 }, { "epoch": 1.225157955198162, "grad_norm": 13.002484321594238, "learning_rate": 8.995450481814567e-06, "loss": 1.3773, "step": 6399 }, { "epoch": 1.2402833620524603, "grad_norm": 12.662968635559082, "learning_rate": 8.137135529054122e-06, "loss": 1.6495, "step": 6478 }, { "epoch": 1.2554087689067586, "grad_norm": 7.513673305511475, "learning_rate": 7.306363301408635e-06, "loss": 1.3042, "step": 6557 }, { "epoch": 1.2705341757610569, "grad_norm": 92.78031158447266, "learning_rate": 6.506467935986024e-06, "loss": 1.5158, "step": 6636 }, { "epoch": 1.2758950794562511, "eval_nli-pairs_loss": 1.1646167039871216, "eval_nli-pairs_runtime": 12.3376, "eval_nli-pairs_samples_per_second": 121.579, "eval_nli-pairs_steps_per_second": 5.106, "step": 6664 }, { "epoch": 1.2758950794562511, "eval_scitail-pairs-pos_loss": 0.5752041339874268, "eval_scitail-pairs-pos_runtime": 15.5528, "eval_scitail-pairs-pos_samples_per_second": 83.843, "eval_scitail-pairs-pos_steps_per_second": 3.536, "step": 6664 }, { "epoch": 1.2758950794562511, "eval_qnli-contrastive_loss": 1.331896424293518, "eval_qnli-contrastive_runtime": 4.7695, "eval_qnli-contrastive_samples_per_second": 314.501, "eval_qnli-contrastive_steps_per_second": 13.209, "step": 6664 }, { "epoch": 1.2856595826153552, "grad_norm": 11.36242961883545, "learning_rate": 5.740659651822936e-06, "loss": 1.2205, "step": 6715 }, { "epoch": 1.3007849894696535, "grad_norm": 10.5322904586792, "learning_rate": 5.012011866316839e-06, "loss": 1.3909, "step": 6794 }, { "epoch": 1.3159103963239518, "grad_norm": 2.6958863735198975, "learning_rate": 4.323448860683947e-06, "loss": 1.4255, "step": 6873 }, { "epoch": 1.33103580317825, "grad_norm": 19.98720359802246, "learning_rate": 3.677734043945192e-06, "loss": 1.5415, "step": 6952 }, { "epoch": 1.3461612100325484, "grad_norm": 3.684659719467163, "learning_rate": 3.077458862540392e-06, "loss": 1.3355, "step": 7031 }, { "epoch": 1.350947731188972, "eval_nli-pairs_loss": 1.1400986909866333, "eval_nli-pairs_runtime": 12.0157, "eval_nli-pairs_samples_per_second": 124.836, "eval_nli-pairs_steps_per_second": 5.243, "step": 7056 }, { "epoch": 1.350947731188972, "eval_scitail-pairs-pos_loss": 0.5660089254379272, "eval_scitail-pairs-pos_runtime": 15.1309, "eval_scitail-pairs-pos_samples_per_second": 86.181, "eval_scitail-pairs-pos_steps_per_second": 3.635, "step": 7056 }, { "epoch": 1.350947731188972, "eval_qnli-contrastive_loss": 1.2624869346618652, "eval_qnli-contrastive_runtime": 4.6898, "eval_qnli-contrastive_samples_per_second": 319.843, "eval_qnli-contrastive_steps_per_second": 13.433, "step": 7056 }, { "epoch": 1.3612866168868467, "grad_norm": 11.162321090698242, "learning_rate": 2.5250324000795594e-06, "loss": 1.5326, "step": 7110 }, { "epoch": 1.376412023741145, "grad_norm": 9.399407386779785, "learning_rate": 2.0226717089707925e-06, "loss": 1.0109, "step": 7189 }, { "epoch": 1.3915374305954433, "grad_norm": 0.5825966596603394, "learning_rate": 1.5723929127267211e-06, "loss": 1.2729, "step": 7268 }, { "epoch": 1.4066628374497414, "grad_norm": 7.376439094543457, "learning_rate": 1.1760031146585697e-06, "loss": 1.605, "step": 7347 }, { "epoch": 1.42178824430404, "grad_norm": 0.5974981188774109, "learning_rate": 8.350931454308347e-07, "loss": 1.4983, "step": 7426 }, { "epoch": 1.4260003829216925, "eval_nli-pairs_loss": 1.1365835666656494, "eval_nli-pairs_runtime": 11.9569, "eval_nli-pairs_samples_per_second": 125.451, "eval_nli-pairs_steps_per_second": 5.269, "step": 7448 }, { "epoch": 1.4260003829216925, "eval_scitail-pairs-pos_loss": 0.5671288371086121, "eval_scitail-pairs-pos_runtime": 14.9551, "eval_scitail-pairs-pos_samples_per_second": 87.194, "eval_scitail-pairs-pos_steps_per_second": 3.678, "step": 7448 }, { "epoch": 1.4260003829216925, "eval_qnli-contrastive_loss": 1.2691177129745483, "eval_qnli-contrastive_runtime": 4.6835, "eval_qnli-contrastive_samples_per_second": 320.27, "eval_qnli-contrastive_steps_per_second": 13.451, "step": 7448 }, { "epoch": 1.436913651158338, "grad_norm": 8.548786163330078, "learning_rate": 5.5103117858258e-07, "loss": 1.2901, "step": 7505 }, { "epoch": 1.4520390580126366, "grad_norm": 9.624091148376465, "learning_rate": 3.2495723963837597e-07, "loss": 1.4993, "step": 7584 }, { "epoch": 1.4671644648669346, "grad_norm": 18.643239974975586, "learning_rate": 1.5777863084531385e-07, "loss": 1.0473, "step": 7663 }, { "epoch": 1.482289871721233, "grad_norm": 10.979313850402832, "learning_rate": 5.0166289898085916e-08, "loss": 1.2113, "step": 7742 }, { "epoch": 1.4974152785755313, "grad_norm": 10.067323684692383, "learning_rate": 2.55209726558292e-09, "loss": 1.3604, "step": 7821 }, { "epoch": 1.5010530346544133, "eval_nli-pairs_loss": 1.1346535682678223, "eval_nli-pairs_runtime": 12.2237, "eval_nli-pairs_samples_per_second": 122.712, "eval_nli-pairs_steps_per_second": 5.154, "step": 7840 }, { "epoch": 1.5010530346544133, "eval_scitail-pairs-pos_loss": 0.5651898980140686, "eval_scitail-pairs-pos_runtime": 15.2453, "eval_scitail-pairs-pos_samples_per_second": 85.535, "eval_scitail-pairs-pos_steps_per_second": 3.608, "step": 7840 }, { "epoch": 1.5010530346544133, "eval_qnli-contrastive_loss": 1.2610852718353271, "eval_qnli-contrastive_runtime": 4.7666, "eval_qnli-contrastive_samples_per_second": 314.687, "eval_qnli-contrastive_steps_per_second": 13.217, "step": 7840 }, { "epoch": 1.5125406854298296, "grad_norm": 12.913325309753418, "learning_rate": 2.9984872857074416e-05, "loss": 1.4627, "step": 7900 }, { "epoch": 1.5276660922841279, "grad_norm": 13.103713035583496, "learning_rate": 2.9912159040536404e-05, "loss": 1.1015, "step": 7979 }, { "epoch": 1.5427914991384262, "grad_norm": 10.095404624938965, "learning_rate": 2.9779598275386362e-05, "loss": 1.4538, "step": 8058 }, { "epoch": 1.5579169059927245, "grad_norm": 0.5388267040252686, "learning_rate": 2.9587722567571802e-05, "loss": 1.4412, "step": 8137 }, { "epoch": 1.5730423128470228, "grad_norm": 20.366121292114258, "learning_rate": 2.933730197162302e-05, "loss": 1.4793, "step": 8216 }, { "epoch": 1.5761056863871339, "eval_nli-pairs_loss": 1.1918026208877563, "eval_nli-pairs_runtime": 12.158, "eval_nli-pairs_samples_per_second": 123.375, "eval_nli-pairs_steps_per_second": 5.182, "step": 8232 }, { "epoch": 1.5761056863871339, "eval_scitail-pairs-pos_loss": 0.5848828554153442, "eval_scitail-pairs-pos_runtime": 15.3425, "eval_scitail-pairs-pos_samples_per_second": 84.993, "eval_scitail-pairs-pos_steps_per_second": 3.585, "step": 8232 }, { "epoch": 1.5761056863871339, "eval_qnli-contrastive_loss": 1.4694615602493286, "eval_qnli-contrastive_runtime": 4.7988, "eval_qnli-contrastive_samples_per_second": 312.58, "eval_qnli-contrastive_steps_per_second": 13.128, "step": 8232 }, { "epoch": 1.588167719701321, "grad_norm": 19.943920135498047, "learning_rate": 2.9029341500194198e-05, "loss": 1.1267, "step": 8295 }, { "epoch": 1.6032931265556194, "grad_norm": 14.96302318572998, "learning_rate": 2.8665077090647462e-05, "loss": 1.1734, "step": 8374 }, { "epoch": 1.6184185334099177, "grad_norm": 6.065411567687988, "learning_rate": 2.8245970644867055e-05, "loss": 1.2193, "step": 8453 }, { "epoch": 1.633543940264216, "grad_norm": 15.93069076538086, "learning_rate": 2.7773704162210366e-05, "loss": 1.1381, "step": 8532 }, { "epoch": 1.6486693471185143, "grad_norm": 5.890163898468018, "learning_rate": 2.725017298914211e-05, "loss": 0.9632, "step": 8611 }, { "epoch": 1.6511583381198545, "eval_nli-pairs_loss": 1.1099625825881958, "eval_nli-pairs_runtime": 12.0731, "eval_nli-pairs_samples_per_second": 124.243, "eval_nli-pairs_steps_per_second": 5.218, "step": 8624 }, { "epoch": 1.6511583381198545, "eval_scitail-pairs-pos_loss": 0.5500022172927856, "eval_scitail-pairs-pos_runtime": 15.0341, "eval_scitail-pairs-pos_samples_per_second": 86.736, "eval_scitail-pairs-pos_steps_per_second": 3.658, "step": 8624 }, { "epoch": 1.6511583381198545, "eval_qnli-contrastive_loss": 1.208964467048645, "eval_qnli-contrastive_runtime": 4.6959, "eval_qnli-contrastive_samples_per_second": 319.43, "eval_qnli-contrastive_steps_per_second": 13.416, "step": 8624 }, { "epoch": 1.6637947539728124, "grad_norm": 11.52648639678955, "learning_rate": 2.6677478212642807e-05, "loss": 1.0842, "step": 8690 }, { "epoch": 1.678920160827111, "grad_norm": 8.958113670349121, "learning_rate": 2.6057918227919096e-05, "loss": 0.7981, "step": 8769 }, { "epoch": 1.694045567681409, "grad_norm": 12.083248138427734, "learning_rate": 2.5393979514257247e-05, "loss": 1.2196, "step": 8848 }, { "epoch": 1.7091709745357075, "grad_norm": 2.1500277519226074, "learning_rate": 2.4688326656039045e-05, "loss": 0.8321, "step": 8927 }, { "epoch": 1.7242963813900056, "grad_norm": 3.8833096027374268, "learning_rate": 2.3943791648968727e-05, "loss": 0.938, "step": 9006 }, { "epoch": 1.726210989852575, "eval_nli-pairs_loss": 1.1021158695220947, "eval_nli-pairs_runtime": 12.05, "eval_nli-pairs_samples_per_second": 124.482, "eval_nli-pairs_steps_per_second": 5.228, "step": 9016 }, { "epoch": 1.726210989852575, "eval_scitail-pairs-pos_loss": 0.519660472869873, "eval_scitail-pairs-pos_runtime": 15.121, "eval_scitail-pairs-pos_samples_per_second": 86.238, "eval_scitail-pairs-pos_steps_per_second": 3.637, "step": 9016 }, { "epoch": 1.726210989852575, "eval_qnli-contrastive_loss": 1.3204244375228882, "eval_qnli-contrastive_runtime": 4.6913, "eval_qnli-contrastive_samples_per_second": 319.739, "eval_qnli-contrastive_steps_per_second": 13.429, "step": 9016 }, { "epoch": 1.7394217882443042, "grad_norm": 9.389202117919922, "learning_rate": 2.316336253442829e-05, "loss": 1.0008, "step": 9085 }, { "epoch": 1.7545471950986022, "grad_norm": 0.5910531282424927, "learning_rate": 2.235017140757486e-05, "loss": 0.8644, "step": 9164 }, { "epoch": 1.7696726019529008, "grad_norm": 5.645143032073975, "learning_rate": 2.1507481847307262e-05, "loss": 1.0459, "step": 9243 }, { "epoch": 1.7847980088071989, "grad_norm": 2.0821499824523926, "learning_rate": 2.0638675818549023e-05, "loss": 0.9344, "step": 9322 }, { "epoch": 1.7999234156614974, "grad_norm": 10.352788925170898, "learning_rate": 1.9747240099412936e-05, "loss": 1.0636, "step": 9401 }, { "epoch": 1.8012636415852958, "eval_nli-pairs_loss": 1.0661962032318115, "eval_nli-pairs_runtime": 11.9602, "eval_nli-pairs_samples_per_second": 125.416, "eval_nli-pairs_steps_per_second": 5.267, "step": 9408 }, { "epoch": 1.8012636415852958, "eval_scitail-pairs-pos_loss": 0.5188334584236145, "eval_scitail-pairs-pos_runtime": 15.0572, "eval_scitail-pairs-pos_samples_per_second": 86.603, "eval_scitail-pairs-pos_steps_per_second": 3.653, "step": 9408 }, { "epoch": 1.8012636415852958, "eval_qnli-contrastive_loss": 0.9691615700721741, "eval_qnli-contrastive_runtime": 4.7039, "eval_qnli-contrastive_samples_per_second": 318.886, "eval_qnli-contrastive_steps_per_second": 13.393, "step": 9408 }, { "epoch": 1.8150488225157955, "grad_norm": 7.344937801361084, "learning_rate": 1.8836752287718936e-05, "loss": 1.2482, "step": 9480 }, { "epoch": 1.8301742293700938, "grad_norm": 1.0527677536010742, "learning_rate": 1.7910866443025426e-05, "loss": 1.0134, "step": 9559 }, { "epoch": 1.845299636224392, "grad_norm": 13.278373718261719, "learning_rate": 1.6973298421796733e-05, "loss": 0.981, "step": 9638 }, { "epoch": 1.8604250430786904, "grad_norm": 2.146714448928833, "learning_rate": 1.6027810964561188e-05, "loss": 1.0289, "step": 9717 }, { "epoch": 1.8755504499329887, "grad_norm": 13.393159866333008, "learning_rate": 1.5078198594909435e-05, "loss": 0.9656, "step": 9796 }, { "epoch": 1.8763162933180164, "eval_nli-pairs_loss": 1.0254323482513428, "eval_nli-pairs_runtime": 12.1952, "eval_nli-pairs_samples_per_second": 122.999, "eval_nli-pairs_steps_per_second": 5.166, "step": 9800 }, { "epoch": 1.8763162933180164, "eval_scitail-pairs-pos_loss": 0.496192991733551, "eval_scitail-pairs-pos_runtime": 15.1968, "eval_scitail-pairs-pos_samples_per_second": 85.808, "eval_scitail-pairs-pos_steps_per_second": 3.619, "step": 9800 }, { "epoch": 1.8763162933180164, "eval_qnli-contrastive_loss": 1.0920603275299072, "eval_qnli-contrastive_runtime": 4.731, "eval_qnli-contrastive_samples_per_second": 317.056, "eval_qnli-contrastive_steps_per_second": 13.316, "step": 9800 }, { "epoch": 1.890675856787287, "grad_norm": 15.524497032165527, "learning_rate": 1.412827239093775e-05, "loss": 0.9088, "step": 9875 }, { "epoch": 1.9058012636415853, "grad_norm": 67.18510437011719, "learning_rate": 1.3181844690253298e-05, "loss": 1.2097, "step": 9954 }, { "epoch": 1.9209266704958836, "grad_norm": 5.732685565948486, "learning_rate": 1.2242713789924544e-05, "loss": 0.7741, "step": 10033 }, { "epoch": 1.936052077350182, "grad_norm": 6.51609992980957, "learning_rate": 1.13146487027805e-05, "loss": 0.8206, "step": 10112 }, { "epoch": 1.9511774842044802, "grad_norm": 6.481364727020264, "learning_rate": 1.040137403123638e-05, "loss": 0.8686, "step": 10191 }, { "epoch": 1.9513689450507372, "eval_nli-pairs_loss": 1.0038272142410278, "eval_nli-pairs_runtime": 12.0711, "eval_nli-pairs_samples_per_second": 124.264, "eval_nli-pairs_steps_per_second": 5.219, "step": 10192 }, { "epoch": 1.9513689450507372, "eval_scitail-pairs-pos_loss": 0.4778198003768921, "eval_scitail-pairs-pos_runtime": 15.3152, "eval_scitail-pairs-pos_samples_per_second": 85.144, "eval_scitail-pairs-pos_steps_per_second": 3.591, "step": 10192 }, { "epoch": 1.9513689450507372, "eval_qnli-contrastive_loss": 0.9486138820648193, "eval_qnli-contrastive_runtime": 4.7421, "eval_qnli-contrastive_samples_per_second": 316.315, "eval_qnli-contrastive_steps_per_second": 13.285, "step": 10192 }, { "epoch": 1.9663028910587785, "grad_norm": 9.677536964416504, "learning_rate": 9.50655501935166e-06, "loss": 0.7649, "step": 10270 }, { "epoch": 1.9814282979130766, "grad_norm": 8.125744819641113, "learning_rate": 8.633782843110642e-06, "loss": 0.9249, "step": 10349 }, { "epoch": 1.9965537047673751, "grad_norm": 3.1055586338043213, "learning_rate": 7.797110684759332e-06, "loss": 0.6997, "step": 10428 }, { "epoch": 2.0116791116216732, "grad_norm": 7.393470764160156, "learning_rate": 6.978450495850865e-06, "loss": 1.06, "step": 10507 }, { "epoch": 2.026421596783458, "eval_nli-pairs_loss": 1.000571608543396, "eval_nli-pairs_runtime": 12.5554, "eval_nli-pairs_samples_per_second": 119.47, "eval_nli-pairs_steps_per_second": 5.018, "step": 10584 }, { "epoch": 2.026421596783458, "eval_scitail-pairs-pos_loss": 0.48184335231781006, "eval_scitail-pairs-pos_runtime": 15.4715, "eval_scitail-pairs-pos_samples_per_second": 84.284, "eval_scitail-pairs-pos_steps_per_second": 3.555, "step": 10584 }, { "epoch": 2.026421596783458, "eval_qnli-contrastive_loss": 0.9664335250854492, "eval_qnli-contrastive_runtime": 4.7851, "eval_qnli-contrastive_samples_per_second": 313.474, "eval_qnli-contrastive_steps_per_second": 13.166, "step": 10584 }, { "epoch": 2.0268045184759718, "grad_norm": 12.336913108825684, "learning_rate": 6.191983181204208e-06, "loss": 0.9447, "step": 10586 }, { "epoch": 2.04192992533027, "grad_norm": 4.7379984855651855, "learning_rate": 5.440865069077124e-06, "loss": 1.0151, "step": 10665 }, { "epoch": 2.0570553321845684, "grad_norm": 27.00238800048828, "learning_rate": 4.728110620818674e-06, "loss": 1.113, "step": 10744 }, { "epoch": 2.0721807390388665, "grad_norm": 17.84748649597168, "learning_rate": 4.0565803329351935e-06, "loss": 1.1183, "step": 10823 }, { "epoch": 2.087306145893165, "grad_norm": 15.165081977844238, "learning_rate": 3.4289692570634956e-06, "loss": 1.1639, "step": 10902 }, { "epoch": 2.1014742485161784, "eval_nli-pairs_loss": 0.9944142699241638, "eval_nli-pairs_runtime": 12.004, "eval_nli-pairs_samples_per_second": 124.958, "eval_nli-pairs_steps_per_second": 5.248, "step": 10976 }, { "epoch": 2.1014742485161784, "eval_scitail-pairs-pos_loss": 0.47857147455215454, "eval_scitail-pairs-pos_runtime": 15.1823, "eval_scitail-pairs-pos_samples_per_second": 85.89, "eval_scitail-pairs-pos_steps_per_second": 3.623, "step": 10976 }, { "epoch": 2.1014742485161784, "eval_qnli-contrastive_loss": 0.9332481026649475, "eval_qnli-contrastive_runtime": 4.7377, "eval_qnli-contrastive_samples_per_second": 316.609, "eval_qnli-contrastive_steps_per_second": 13.298, "step": 10976 }, { "epoch": 2.102431552747463, "grad_norm": 14.085611343383789, "learning_rate": 2.847796183923562e-06, "loss": 1.0222, "step": 10981 }, { "epoch": 2.1175569596017616, "grad_norm": 9.214906692504883, "learning_rate": 2.3153935346589784e-06, "loss": 1.244, "step": 11060 }, { "epoch": 2.1326823664560597, "grad_norm": 28.293725967407227, "learning_rate": 1.8338980001342158e-06, "loss": 1.0128, "step": 11139 }, { "epoch": 2.147807773310358, "grad_norm": 7.782803058624268, "learning_rate": 1.4052419657559468e-06, "loss": 1.3783, "step": 11218 }, { "epoch": 2.1629331801646563, "grad_norm": 8.853714942932129, "learning_rate": 1.0311457562331311e-06, "loss": 1.0301, "step": 11297 }, { "epoch": 2.176526900248899, "eval_nli-pairs_loss": 0.9802760481834412, "eval_nli-pairs_runtime": 11.9822, "eval_nli-pairs_samples_per_second": 125.185, "eval_nli-pairs_steps_per_second": 5.258, "step": 11368 }, { "epoch": 2.176526900248899, "eval_scitail-pairs-pos_loss": 0.47513890266418457, "eval_scitail-pairs-pos_runtime": 15.0277, "eval_scitail-pairs-pos_samples_per_second": 86.773, "eval_scitail-pairs-pos_steps_per_second": 3.66, "step": 11368 }, { "epoch": 2.176526900248899, "eval_qnli-contrastive_loss": 0.9649375677108765, "eval_qnli-contrastive_runtime": 4.7258, "eval_qnli-contrastive_samples_per_second": 317.404, "eval_qnli-contrastive_steps_per_second": 13.331, "step": 11368 }, { "epoch": 2.178058587018955, "grad_norm": 9.836175918579102, "learning_rate": 7.131107314001456e-07, "loss": 0.8699, "step": 11376 }, { "epoch": 2.193183993873253, "grad_norm": 6.889993190765381, "learning_rate": 4.5241326081128687e-07, "loss": 0.7565, "step": 11455 }, { "epoch": 2.2083094007275514, "grad_norm": 7.592372894287109, "learning_rate": 2.500996012884593e-07, "loss": 1.3038, "step": 11534 }, { "epoch": 2.2234348075818495, "grad_norm": 2.2131893634796143, "learning_rate": 1.069816979800553e-07, "loss": 0.9584, "step": 11613 }, { "epoch": 2.2385602144361476, "grad_norm": 129.076904296875, "learning_rate": 2.3633925782526324e-08, "loss": 1.4689, "step": 11692 }, { "epoch": 2.25157955198162, "eval_nli-pairs_loss": 0.9801518321037292, "eval_nli-pairs_runtime": 12.0172, "eval_nli-pairs_samples_per_second": 124.821, "eval_nli-pairs_steps_per_second": 5.242, "step": 11760 }, { "epoch": 2.25157955198162, "eval_scitail-pairs-pos_loss": 0.4722036123275757, "eval_scitail-pairs-pos_runtime": 15.1727, "eval_scitail-pairs-pos_samples_per_second": 85.944, "eval_scitail-pairs-pos_steps_per_second": 3.625, "step": 11760 }, { "epoch": 2.25157955198162, "eval_qnli-contrastive_loss": 0.9584055542945862, "eval_qnli-contrastive_runtime": 4.7605, "eval_qnli-contrastive_samples_per_second": 315.092, "eval_qnli-contrastive_steps_per_second": 13.234, "step": 11760 }, { "epoch": 2.253685621290446, "grad_norm": 9.094249725341797, "learning_rate": 2.999960921579765e-05, "loss": 0.9979, "step": 11771 }, { "epoch": 2.2688110281447442, "grad_norm": 14.057835578918457, "learning_rate": 2.9962654445090394e-05, "loss": 1.3444, "step": 11850 }, { "epoch": 2.2839364349990428, "grad_norm": 8.33903694152832, "learning_rate": 2.9865650072629244e-05, "loss": 1.2052, "step": 11929 }, { "epoch": 2.299061841853341, "grad_norm": 5.676733493804932, "learning_rate": 2.970898540593688e-05, "loss": 1.2007, "step": 12008 }, { "epoch": 2.3141872487076394, "grad_norm": 3.648158550262451, "learning_rate": 2.9493289187117727e-05, "loss": 1.1402, "step": 12087 }, { "epoch": 2.3266322037143405, "eval_nli-pairs_loss": 1.0052505731582642, "eval_nli-pairs_runtime": 12.1373, "eval_nli-pairs_samples_per_second": 123.586, "eval_nli-pairs_steps_per_second": 5.191, "step": 12152 }, { "epoch": 2.3266322037143405, "eval_scitail-pairs-pos_loss": 0.47668519616127014, "eval_scitail-pairs-pos_runtime": 15.0626, "eval_scitail-pairs-pos_samples_per_second": 86.572, "eval_scitail-pairs-pos_steps_per_second": 3.651, "step": 12152 }, { "epoch": 2.3266322037143405, "eval_qnli-contrastive_loss": 1.2372807264328003, "eval_qnli-contrastive_runtime": 4.7164, "eval_qnli-contrastive_samples_per_second": 318.038, "eval_qnli-contrastive_steps_per_second": 13.358, "step": 12152 }, { "epoch": 2.3293126555619375, "grad_norm": 4.789942741394043, "learning_rate": 2.9219427069528128e-05, "loss": 1.5263, "step": 12166 }, { "epoch": 2.344438062416236, "grad_norm": 14.52586555480957, "learning_rate": 2.8888498143650785e-05, "loss": 1.263, "step": 12245 }, { "epoch": 2.359563469270534, "grad_norm": 2.835966110229492, "learning_rate": 2.8501830526116386e-05, "loss": 1.1912, "step": 12324 }, { "epoch": 2.3746888761248326, "grad_norm": 14.9393949508667, "learning_rate": 2.8060976029574842e-05, "loss": 1.0982, "step": 12403 }, { "epoch": 2.3898142829791307, "grad_norm": 8.84047794342041, "learning_rate": 2.7567703934807572e-05, "loss": 1.1574, "step": 12482 }, { "epoch": 2.401684855447061, "eval_nli-pairs_loss": 0.9759184122085571, "eval_nli-pairs_runtime": 12.2553, "eval_nli-pairs_samples_per_second": 122.396, "eval_nli-pairs_steps_per_second": 5.141, "step": 12544 }, { "epoch": 2.401684855447061, "eval_scitail-pairs-pos_loss": 0.4914855659008026, "eval_scitail-pairs-pos_runtime": 15.0918, "eval_scitail-pairs-pos_samples_per_second": 86.404, "eval_scitail-pairs-pos_steps_per_second": 3.644, "step": 12544 }, { "epoch": 2.401684855447061, "eval_qnli-contrastive_loss": 1.1089410781860352, "eval_qnli-contrastive_runtime": 4.7223, "eval_qnli-contrastive_samples_per_second": 317.644, "eval_qnli-contrastive_steps_per_second": 13.341, "step": 12544 }, { "epoch": 2.404939689833429, "grad_norm": 11.71249008178711, "learning_rate": 2.7023993890075236e-05, "loss": 1.4077, "step": 12561 }, { "epoch": 2.4200650966877273, "grad_norm": 2.904869794845581, "learning_rate": 2.6432027966197927e-05, "loss": 1.3183, "step": 12640 }, { "epoch": 2.435190503542026, "grad_norm": 9.094073295593262, "learning_rate": 2.579418189925317e-05, "loss": 1.0883, "step": 12719 }, { "epoch": 2.450315910396324, "grad_norm": 9.701898574829102, "learning_rate": 2.5113015556037383e-05, "loss": 1.3182, "step": 12798 }, { "epoch": 2.4654413172506224, "grad_norm": 6.8915581703186035, "learning_rate": 2.4391262660555785e-05, "loss": 1.0089, "step": 12877 }, { "epoch": 2.4767375071797817, "eval_nli-pairs_loss": 0.9481552243232727, "eval_nli-pairs_runtime": 12.17, "eval_nli-pairs_samples_per_second": 123.254, "eval_nli-pairs_steps_per_second": 5.177, "step": 12936 }, { "epoch": 2.4767375071797817, "eval_scitail-pairs-pos_loss": 0.4552152752876282, "eval_scitail-pairs-pos_runtime": 15.2525, "eval_scitail-pairs-pos_samples_per_second": 85.494, "eval_scitail-pairs-pos_steps_per_second": 3.606, "step": 12936 }, { "epoch": 2.4767375071797817, "eval_qnli-contrastive_loss": 1.1650612354278564, "eval_qnli-contrastive_runtime": 4.7586, "eval_qnli-contrastive_samples_per_second": 315.216, "eval_qnli-contrastive_steps_per_second": 13.239, "step": 12936 }, { "epoch": 2.4805667241049205, "grad_norm": 9.97049617767334, "learning_rate": 2.3631819822771357e-05, "loss": 1.0616, "step": 12956 }, { "epoch": 2.4956921309592186, "grad_norm": 10.72946548461914, "learning_rate": 2.2837734913643845e-05, "loss": 1.1083, "step": 13035 }, { "epoch": 2.510817537813517, "grad_norm": 6.889919281005859, "learning_rate": 2.2012194833113163e-05, "loss": 1.2687, "step": 13114 }, { "epoch": 2.5259429446678157, "grad_norm": 2.167541742324829, "learning_rate": 2.1158512720117925e-05, "loss": 0.698, "step": 13193 }, { "epoch": 2.5410683515221137, "grad_norm": 6.788521766662598, "learning_rate": 2.0280114655979378e-05, "loss": 1.0596, "step": 13272 }, { "epoch": 2.5517901589125023, "eval_nli-pairs_loss": 0.9386218786239624, "eval_nli-pairs_runtime": 12.1882, "eval_nli-pairs_samples_per_second": 123.07, "eval_nli-pairs_steps_per_second": 5.169, "step": 13328 }, { "epoch": 2.5517901589125023, "eval_scitail-pairs-pos_loss": 0.45524224638938904, "eval_scitail-pairs-pos_runtime": 15.3268, "eval_scitail-pairs-pos_samples_per_second": 85.08, "eval_scitail-pairs-pos_steps_per_second": 3.588, "step": 13328 }, { "epoch": 2.5517901589125023, "eval_qnli-contrastive_loss": 1.053303837776184, "eval_qnli-contrastive_runtime": 4.7606, "eval_qnli-contrastive_samples_per_second": 315.086, "eval_qnli-contrastive_steps_per_second": 13.234, "step": 13328 }, { "epoch": 2.556193758376412, "grad_norm": 5.612150192260742, "learning_rate": 1.9380525914513508e-05, "loss": 1.1182, "step": 13351 }, { "epoch": 2.5713191652307104, "grad_norm": 5.856744289398193, "learning_rate": 1.8463356814054177e-05, "loss": 0.9092, "step": 13430 }, { "epoch": 2.586444572085009, "grad_norm": 3.5007331371307373, "learning_rate": 1.7532288228167412e-05, "loss": 0.8628, "step": 13509 }, { "epoch": 2.601569978939307, "grad_norm": 3.8348581790924072, "learning_rate": 1.6591056813206084e-05, "loss": 0.762, "step": 13588 }, { "epoch": 2.616695385793605, "grad_norm": 3.7152531147003174, "learning_rate": 1.564344001199179e-05, "loss": 0.9521, "step": 13667 }, { "epoch": 2.626842810645223, "eval_nli-pairs_loss": 0.8889521956443787, "eval_nli-pairs_runtime": 12.1548, "eval_nli-pairs_samples_per_second": 123.408, "eval_nli-pairs_steps_per_second": 5.183, "step": 13720 }, { "epoch": 2.626842810645223, "eval_scitail-pairs-pos_loss": 0.45236507058143616, "eval_scitail-pairs-pos_runtime": 15.2247, "eval_scitail-pairs-pos_samples_per_second": 85.65, "eval_scitail-pairs-pos_steps_per_second": 3.613, "step": 13720 }, { "epoch": 2.626842810645223, "eval_qnli-contrastive_loss": 0.794640302658081, "eval_qnli-contrastive_runtime": 4.8223, "eval_qnli-contrastive_samples_per_second": 311.053, "eval_qnli-contrastive_steps_per_second": 13.064, "step": 13720 }, { "epoch": 2.6318207926479036, "grad_norm": 8.717215538024902, "learning_rate": 1.4693240893808674e-05, "loss": 0.8631, "step": 13746 }, { "epoch": 2.6469461995022017, "grad_norm": 0.3876877725124359, "learning_rate": 1.3744272891550144e-05, "loss": 0.6899, "step": 13825 }, { "epoch": 2.6620716063565, "grad_norm": 0.41043633222579956, "learning_rate": 1.2800344497273615e-05, "loss": 0.6552, "step": 13904 }, { "epoch": 2.6771970132107983, "grad_norm": 0.8379763960838318, "learning_rate": 1.1865243977584432e-05, "loss": 0.572, "step": 13983 }, { "epoch": 2.692322420065097, "grad_norm": 4.94291877746582, "learning_rate": 1.0942724170190126e-05, "loss": 0.9809, "step": 14062 }, { "epoch": 2.701895462377944, "eval_nli-pairs_loss": 0.8912826180458069, "eval_nli-pairs_runtime": 12.096, "eval_nli-pairs_samples_per_second": 124.008, "eval_nli-pairs_steps_per_second": 5.208, "step": 14112 }, { "epoch": 2.701895462377944, "eval_scitail-pairs-pos_loss": 0.4352218210697174, "eval_scitail-pairs-pos_runtime": 15.0606, "eval_scitail-pairs-pos_samples_per_second": 86.584, "eval_scitail-pairs-pos_steps_per_second": 3.652, "step": 14112 }, { "epoch": 2.701895462377944, "eval_qnli-contrastive_loss": 0.727630078792572, "eval_qnli-contrastive_runtime": 4.7927, "eval_qnli-contrastive_samples_per_second": 312.979, "eval_qnli-contrastive_steps_per_second": 13.145, "step": 14112 }, { "epoch": 2.707447826919395, "grad_norm": 2.8381199836730957, "learning_rate": 1.0036487422641892e-05, "loss": 0.5392, "step": 14141 }, { "epoch": 2.7225732337736934, "grad_norm": 9.423616409301758, "learning_rate": 9.150170733707937e-06, "loss": 0.6777, "step": 14220 }, { "epoch": 2.7376986406279915, "grad_norm": 0.6272808909416199, "learning_rate": 8.287331157010844e-06, "loss": 0.6523, "step": 14299 }, { "epoch": 2.75282404748229, "grad_norm": 0.7308062314987183, "learning_rate": 7.4514315255090594e-06, "loss": 0.6416, "step": 14378 }, { "epoch": 2.767949454336588, "grad_norm": 4.945492267608643, "learning_rate": 6.645826554113819e-06, "loss": 0.7713, "step": 14457 }, { "epoch": 2.7769481141106644, "eval_nli-pairs_loss": 0.872556209564209, "eval_nli-pairs_runtime": 12.1015, "eval_nli-pairs_samples_per_second": 123.952, "eval_nli-pairs_steps_per_second": 5.206, "step": 14504 }, { "epoch": 2.7769481141106644, "eval_scitail-pairs-pos_loss": 0.42709970474243164, "eval_scitail-pairs-pos_runtime": 15.0845, "eval_scitail-pairs-pos_samples_per_second": 86.446, "eval_scitail-pairs-pos_steps_per_second": 3.646, "step": 14504 }, { "epoch": 2.7769481141106644, "eval_qnli-contrastive_loss": 0.7923160791397095, "eval_qnli-contrastive_runtime": 4.7233, "eval_qnli-contrastive_samples_per_second": 317.576, "eval_qnli-contrastive_steps_per_second": 13.338, "step": 14504 }, { "epoch": 2.7830748611908867, "grad_norm": 9.502604484558105, "learning_rate": 5.873749376215993e-06, "loss": 0.6531, "step": 14536 }, { "epoch": 2.7982002680451847, "grad_norm": 6.348124980926514, "learning_rate": 5.138298568156192e-06, "loss": 0.7056, "step": 14615 }, { "epoch": 2.813325674899483, "grad_norm": 4.395310401916504, "learning_rate": 4.442425713712258e-06, "loss": 1.054, "step": 14694 }, { "epoch": 2.8284510817537813, "grad_norm": 5.8618011474609375, "learning_rate": 3.7889235585119115e-06, "loss": 0.8535, "step": 14773 }, { "epoch": 2.84357648860808, "grad_norm": 7.8259406089782715, "learning_rate": 3.1804148019103528e-06, "loss": 0.7321, "step": 14852 }, { "epoch": 2.852000765843385, "eval_nli-pairs_loss": 0.8661790490150452, "eval_nli-pairs_runtime": 12.1048, "eval_nli-pairs_samples_per_second": 123.917, "eval_nli-pairs_steps_per_second": 5.205, "step": 14896 }, { "epoch": 2.852000765843385, "eval_scitail-pairs-pos_loss": 0.4211391508579254, "eval_scitail-pairs-pos_runtime": 15.1135, "eval_scitail-pairs-pos_samples_per_second": 86.28, "eval_scitail-pairs-pos_steps_per_second": 3.639, "step": 14896 }, { "epoch": 2.852000765843385, "eval_qnli-contrastive_loss": 0.7693744897842407, "eval_qnli-contrastive_runtime": 4.7208, "eval_qnli-contrastive_samples_per_second": 317.743, "eval_qnli-contrastive_steps_per_second": 13.345, "step": 14896 }, { "epoch": 2.858701895462378, "grad_norm": 8.156476974487305, "learning_rate": 2.6193415713143028e-06, "loss": 0.8236, "step": 14931 }, { "epoch": 2.873827302316676, "grad_norm": 0.3863189220428467, "learning_rate": 2.107955621195247e-06, "loss": 0.776, "step": 15010 }, { "epoch": 2.8889527091709746, "grad_norm": 0.4337412118911743, "learning_rate": 1.6483092961261291e-06, "loss": 0.7049, "step": 15089 }, { "epoch": 2.904078116025273, "grad_norm": 0.5512604117393494, "learning_rate": 1.2422472941095199e-06, "loss": 0.9409, "step": 15168 }, { "epoch": 2.919203522879571, "grad_norm": 4.254249572753906, "learning_rate": 8.913992632535123e-07, "loss": 0.7416, "step": 15247 }, { "epoch": 2.9270534175761056, "eval_nli-pairs_loss": 0.8609779477119446, "eval_nli-pairs_runtime": 12.2133, "eval_nli-pairs_samples_per_second": 122.817, "eval_nli-pairs_steps_per_second": 5.158, "step": 15288 }, { "epoch": 2.9270534175761056, "eval_scitail-pairs-pos_loss": 0.42045190930366516, "eval_scitail-pairs-pos_runtime": 15.4078, "eval_scitail-pairs-pos_samples_per_second": 84.632, "eval_scitail-pairs-pos_steps_per_second": 3.57, "step": 15288 }, { "epoch": 2.9270534175761056, "eval_qnli-contrastive_loss": 0.7351691722869873, "eval_qnli-contrastive_runtime": 4.7717, "eval_qnli-contrastive_samples_per_second": 314.351, "eval_qnli-contrastive_steps_per_second": 13.203, "step": 15288 }, { "epoch": 2.9343289297338693, "grad_norm": 6.785557270050049, "learning_rate": 5.971732615070724e-07, "loss": 0.6059, "step": 15326 }, { "epoch": 2.949454336588168, "grad_norm": 14.958471298217773, "learning_rate": 3.6075010570289336e-07, "loss": 0.6598, "step": 15405 }, { "epoch": 2.964579743442466, "grad_norm": 0.34104809165000916, "learning_rate": 1.8307863258672674e-07, "loss": 0.5777, "step": 15484 }, { "epoch": 2.9797051502967644, "grad_norm": 0.6522515416145325, "learning_rate": 6.487189085208289e-08, "loss": 0.8212, "step": 15563 }, { "epoch": 2.9948305571510625, "grad_norm": 0.3607589304447174, "learning_rate": 6.6042794628590194e-09, "loss": 0.5638, "step": 15642 } ], "logging_steps": 79, "max_steps": 15669, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 3918, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 30, "trial_name": null, "trial_params": null }