bobox's picture
Training in progress, epoch 3, checkpoint
731c01e verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 392,
"global_step": 15669,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.015125406854298296,
"grad_norm": 199.77919006347656,
"learning_rate": 5.589586523736601e-07,
"loss": 12.973,
"step": 79
},
{
"epoch": 0.030250813708596593,
"grad_norm": 70.86483764648438,
"learning_rate": 1.1638591117917304e-06,
"loss": 8.4661,
"step": 158
},
{
"epoch": 0.045376220562894885,
"grad_norm": 17.361167907714844,
"learning_rate": 1.768759571209801e-06,
"loss": 6.136,
"step": 237
},
{
"epoch": 0.060501627417193185,
"grad_norm": 13.593123435974121,
"learning_rate": 2.3736600306278715e-06,
"loss": 6.2319,
"step": 316
},
{
"epoch": 0.07505265173272066,
"eval_nli-pairs_loss": 5.313699245452881,
"eval_nli-pairs_runtime": 12.1282,
"eval_nli-pairs_samples_per_second": 123.679,
"eval_nli-pairs_steps_per_second": 5.195,
"step": 392
},
{
"epoch": 0.07505265173272066,
"eval_scitail-pairs-pos_loss": 3.977630615234375,
"eval_scitail-pairs-pos_runtime": 15.2195,
"eval_scitail-pairs-pos_samples_per_second": 85.68,
"eval_scitail-pairs-pos_steps_per_second": 3.614,
"step": 392
},
{
"epoch": 0.07505265173272066,
"eval_qnli-contrastive_loss": 5.520341396331787,
"eval_qnli-contrastive_runtime": 4.7394,
"eval_qnli-contrastive_samples_per_second": 316.494,
"eval_qnli-contrastive_steps_per_second": 13.293,
"step": 392
},
{
"epoch": 0.07562703427149148,
"grad_norm": 15.363186836242676,
"learning_rate": 2.978560490045942e-06,
"loss": 5.6068,
"step": 395
},
{
"epoch": 0.09075244112578977,
"grad_norm": 18.922758102416992,
"learning_rate": 3.5834609494640125e-06,
"loss": 5.2502,
"step": 474
},
{
"epoch": 0.10587784798008808,
"grad_norm": 20.363380432128906,
"learning_rate": 4.188361408882083e-06,
"loss": 4.8699,
"step": 553
},
{
"epoch": 0.12100325483438637,
"grad_norm": 14.830269813537598,
"learning_rate": 4.793261868300153e-06,
"loss": 4.9584,
"step": 632
},
{
"epoch": 0.13612866168868468,
"grad_norm": 26.075838088989258,
"learning_rate": 5.398162327718224e-06,
"loss": 4.5632,
"step": 711
},
{
"epoch": 0.15010530346544132,
"eval_nli-pairs_loss": 4.265738487243652,
"eval_nli-pairs_runtime": 12.0989,
"eval_nli-pairs_samples_per_second": 123.978,
"eval_nli-pairs_steps_per_second": 5.207,
"step": 784
},
{
"epoch": 0.15010530346544132,
"eval_scitail-pairs-pos_loss": 2.458251476287842,
"eval_scitail-pairs-pos_runtime": 15.2215,
"eval_scitail-pairs-pos_samples_per_second": 85.668,
"eval_scitail-pairs-pos_steps_per_second": 3.613,
"step": 784
},
{
"epoch": 0.15010530346544132,
"eval_qnli-contrastive_loss": 4.81198263168335,
"eval_qnli-contrastive_runtime": 4.724,
"eval_qnli-contrastive_samples_per_second": 317.525,
"eval_qnli-contrastive_steps_per_second": 13.336,
"step": 784
},
{
"epoch": 0.15125406854298296,
"grad_norm": 17.387819290161133,
"learning_rate": 6.003062787136294e-06,
"loss": 4.223,
"step": 790
},
{
"epoch": 0.16637947539728126,
"grad_norm": 24.702957153320312,
"learning_rate": 6.607963246554365e-06,
"loss": 3.8496,
"step": 869
},
{
"epoch": 0.18150488225157954,
"grad_norm": 20.878055572509766,
"learning_rate": 7.212863705972435e-06,
"loss": 3.4414,
"step": 948
},
{
"epoch": 0.19663028910587785,
"grad_norm": 28.57908821105957,
"learning_rate": 7.817764165390506e-06,
"loss": 3.3513,
"step": 1027
},
{
"epoch": 0.21175569596017615,
"grad_norm": 37.09183120727539,
"learning_rate": 8.422664624808575e-06,
"loss": 3.5611,
"step": 1106
},
{
"epoch": 0.22515795519816198,
"eval_nli-pairs_loss": 3.178299903869629,
"eval_nli-pairs_runtime": 12.0715,
"eval_nli-pairs_samples_per_second": 124.26,
"eval_nli-pairs_steps_per_second": 5.219,
"step": 1176
},
{
"epoch": 0.22515795519816198,
"eval_scitail-pairs-pos_loss": 1.983331561088562,
"eval_scitail-pairs-pos_runtime": 15.1626,
"eval_scitail-pairs-pos_samples_per_second": 86.001,
"eval_scitail-pairs-pos_steps_per_second": 3.627,
"step": 1176
},
{
"epoch": 0.22515795519816198,
"eval_qnli-contrastive_loss": 3.4507648944854736,
"eval_qnli-contrastive_runtime": 4.7752,
"eval_qnli-contrastive_samples_per_second": 314.125,
"eval_qnli-contrastive_steps_per_second": 13.193,
"step": 1176
},
{
"epoch": 0.22688110281447443,
"grad_norm": 22.88146209716797,
"learning_rate": 9.027565084226646e-06,
"loss": 3.4039,
"step": 1185
},
{
"epoch": 0.24200650966877274,
"grad_norm": 20.4180908203125,
"learning_rate": 9.632465543644716e-06,
"loss": 3.4269,
"step": 1264
},
{
"epoch": 0.25713191652307105,
"grad_norm": 23.59966278076172,
"learning_rate": 1.0237366003062788e-05,
"loss": 3.1573,
"step": 1343
},
{
"epoch": 0.27225732337736935,
"grad_norm": 10.84000301361084,
"learning_rate": 1.0842266462480856e-05,
"loss": 3.253,
"step": 1422
},
{
"epoch": 0.2873827302316676,
"grad_norm": 16.418413162231445,
"learning_rate": 1.1447166921898928e-05,
"loss": 2.7614,
"step": 1501
},
{
"epoch": 0.30021060693088264,
"eval_nli-pairs_loss": 2.722890615463257,
"eval_nli-pairs_runtime": 12.0687,
"eval_nli-pairs_samples_per_second": 124.288,
"eval_nli-pairs_steps_per_second": 5.22,
"step": 1568
},
{
"epoch": 0.30021060693088264,
"eval_scitail-pairs-pos_loss": 1.6435188055038452,
"eval_scitail-pairs-pos_runtime": 15.2101,
"eval_scitail-pairs-pos_samples_per_second": 85.732,
"eval_scitail-pairs-pos_steps_per_second": 3.616,
"step": 1568
},
{
"epoch": 0.30021060693088264,
"eval_qnli-contrastive_loss": 2.944777011871338,
"eval_qnli-contrastive_runtime": 4.7212,
"eval_qnli-contrastive_samples_per_second": 317.713,
"eval_qnli-contrastive_steps_per_second": 13.344,
"step": 1568
},
{
"epoch": 0.3025081370859659,
"grad_norm": 20.777223587036133,
"learning_rate": 1.2052067381317e-05,
"loss": 2.9549,
"step": 1580
},
{
"epoch": 0.3176335439402642,
"grad_norm": 15.09938907623291,
"learning_rate": 1.265696784073507e-05,
"loss": 2.8357,
"step": 1659
},
{
"epoch": 0.3327589507945625,
"grad_norm": 5.233273983001709,
"learning_rate": 1.326186830015314e-05,
"loss": 2.8964,
"step": 1738
},
{
"epoch": 0.34788435764886083,
"grad_norm": 16.8189640045166,
"learning_rate": 1.386676875957121e-05,
"loss": 2.8274,
"step": 1817
},
{
"epoch": 0.3630097645031591,
"grad_norm": 8.114161491394043,
"learning_rate": 1.4471669218989282e-05,
"loss": 2.6809,
"step": 1896
},
{
"epoch": 0.37526325866360327,
"eval_nli-pairs_loss": 2.428619384765625,
"eval_nli-pairs_runtime": 12.0706,
"eval_nli-pairs_samples_per_second": 124.269,
"eval_nli-pairs_steps_per_second": 5.219,
"step": 1960
},
{
"epoch": 0.37526325866360327,
"eval_scitail-pairs-pos_loss": 1.3531062602996826,
"eval_scitail-pairs-pos_runtime": 15.2633,
"eval_scitail-pairs-pos_samples_per_second": 85.434,
"eval_scitail-pairs-pos_steps_per_second": 3.603,
"step": 1960
},
{
"epoch": 0.37526325866360327,
"eval_qnli-contrastive_loss": 2.404916286468506,
"eval_qnli-contrastive_runtime": 4.7194,
"eval_qnli-contrastive_samples_per_second": 317.838,
"eval_qnli-contrastive_steps_per_second": 13.349,
"step": 1960
},
{
"epoch": 0.3781351713574574,
"grad_norm": 22.405332565307617,
"learning_rate": 1.5076569678407352e-05,
"loss": 2.3456,
"step": 1975
},
{
"epoch": 0.3932605782117557,
"grad_norm": 33.843994140625,
"learning_rate": 1.5681470137825424e-05,
"loss": 2.5316,
"step": 2054
},
{
"epoch": 0.408385985066054,
"grad_norm": 3.7852566242218018,
"learning_rate": 1.6286370597243492e-05,
"loss": 2.653,
"step": 2133
},
{
"epoch": 0.4235113919203523,
"grad_norm": 28.830053329467773,
"learning_rate": 1.689127105666156e-05,
"loss": 2.699,
"step": 2212
},
{
"epoch": 0.43863679877465056,
"grad_norm": 26.699514389038086,
"learning_rate": 1.7496171516079635e-05,
"loss": 2.424,
"step": 2291
},
{
"epoch": 0.45031591039632396,
"eval_nli-pairs_loss": 2.207122564315796,
"eval_nli-pairs_runtime": 12.0919,
"eval_nli-pairs_samples_per_second": 124.05,
"eval_nli-pairs_steps_per_second": 5.21,
"step": 2352
},
{
"epoch": 0.45031591039632396,
"eval_scitail-pairs-pos_loss": 1.2252534627914429,
"eval_scitail-pairs-pos_runtime": 15.1733,
"eval_scitail-pairs-pos_samples_per_second": 85.941,
"eval_scitail-pairs-pos_steps_per_second": 3.625,
"step": 2352
},
{
"epoch": 0.45031591039632396,
"eval_qnli-contrastive_loss": 2.292630672454834,
"eval_qnli-contrastive_runtime": 4.7338,
"eval_qnli-contrastive_samples_per_second": 316.868,
"eval_qnli-contrastive_steps_per_second": 13.308,
"step": 2352
},
{
"epoch": 0.45376220562894887,
"grad_norm": 3.1586949825286865,
"learning_rate": 1.8101071975497704e-05,
"loss": 2.4716,
"step": 2370
},
{
"epoch": 0.4688876124832472,
"grad_norm": 15.398905754089355,
"learning_rate": 1.8705972434915772e-05,
"loss": 2.0097,
"step": 2449
},
{
"epoch": 0.4840130193375455,
"grad_norm": 2.9506657123565674,
"learning_rate": 1.9310872894333844e-05,
"loss": 2.3993,
"step": 2528
},
{
"epoch": 0.4991384261918438,
"grad_norm": 18.736677169799805,
"learning_rate": 1.9915773353751916e-05,
"loss": 2.3295,
"step": 2607
},
{
"epoch": 0.5142638330461421,
"grad_norm": 16.75814437866211,
"learning_rate": 2.0520673813169984e-05,
"loss": 2.348,
"step": 2686
},
{
"epoch": 0.5253685621290446,
"eval_nli-pairs_loss": 2.0092170238494873,
"eval_nli-pairs_runtime": 12.0787,
"eval_nli-pairs_samples_per_second": 124.185,
"eval_nli-pairs_steps_per_second": 5.216,
"step": 2744
},
{
"epoch": 0.5253685621290446,
"eval_scitail-pairs-pos_loss": 1.0735079050064087,
"eval_scitail-pairs-pos_runtime": 14.9317,
"eval_scitail-pairs-pos_samples_per_second": 87.331,
"eval_scitail-pairs-pos_steps_per_second": 3.683,
"step": 2744
},
{
"epoch": 0.5253685621290446,
"eval_qnli-contrastive_loss": 1.9999727010726929,
"eval_qnli-contrastive_runtime": 4.659,
"eval_qnli-contrastive_samples_per_second": 321.961,
"eval_qnli-contrastive_steps_per_second": 13.522,
"step": 2744
},
{
"epoch": 0.5293892399004404,
"grad_norm": 3.6279871463775635,
"learning_rate": 2.1125574272588056e-05,
"loss": 2.0747,
"step": 2765
},
{
"epoch": 0.5445146467547387,
"grad_norm": 102.07367706298828,
"learning_rate": 2.1730474732006124e-05,
"loss": 2.3592,
"step": 2844
},
{
"epoch": 0.5596400536090369,
"grad_norm": 12.037158966064453,
"learning_rate": 2.23353751914242e-05,
"loss": 2.2563,
"step": 2923
},
{
"epoch": 0.5747654604633352,
"grad_norm": 11.711392402648926,
"learning_rate": 2.2940275650842267e-05,
"loss": 2.3484,
"step": 3002
},
{
"epoch": 0.5898908673176335,
"grad_norm": 20.607454299926758,
"learning_rate": 2.3545176110260336e-05,
"loss": 1.868,
"step": 3081
},
{
"epoch": 0.6004212138617653,
"eval_nli-pairs_loss": 1.846701979637146,
"eval_nli-pairs_runtime": 11.9121,
"eval_nli-pairs_samples_per_second": 125.922,
"eval_nli-pairs_steps_per_second": 5.289,
"step": 3136
},
{
"epoch": 0.6004212138617653,
"eval_scitail-pairs-pos_loss": 0.9629871249198914,
"eval_scitail-pairs-pos_runtime": 15.006,
"eval_scitail-pairs-pos_samples_per_second": 86.899,
"eval_scitail-pairs-pos_steps_per_second": 3.665,
"step": 3136
},
{
"epoch": 0.6004212138617653,
"eval_qnli-contrastive_loss": 1.9593416452407837,
"eval_qnli-contrastive_runtime": 4.653,
"eval_qnli-contrastive_samples_per_second": 322.374,
"eval_qnli-contrastive_steps_per_second": 13.54,
"step": 3136
},
{
"epoch": 0.6050162741719318,
"grad_norm": 15.901214599609375,
"learning_rate": 2.4150076569678408e-05,
"loss": 1.9958,
"step": 3160
},
{
"epoch": 0.6201416810262301,
"grad_norm": 13.168147087097168,
"learning_rate": 2.475497702909648e-05,
"loss": 2.0089,
"step": 3239
},
{
"epoch": 0.6352670878805284,
"grad_norm": 21.926223754882812,
"learning_rate": 2.5359877488514548e-05,
"loss": 1.8303,
"step": 3318
},
{
"epoch": 0.6503924947348267,
"grad_norm": 21.501989364624023,
"learning_rate": 2.596477794793262e-05,
"loss": 1.6892,
"step": 3397
},
{
"epoch": 0.665517901589125,
"grad_norm": 3.5192618370056152,
"learning_rate": 2.6569678407350688e-05,
"loss": 1.8379,
"step": 3476
},
{
"epoch": 0.675473865594486,
"eval_nli-pairs_loss": 1.7486572265625,
"eval_nli-pairs_runtime": 12.0369,
"eval_nli-pairs_samples_per_second": 124.617,
"eval_nli-pairs_steps_per_second": 5.234,
"step": 3528
},
{
"epoch": 0.675473865594486,
"eval_scitail-pairs-pos_loss": 0.9056742191314697,
"eval_scitail-pairs-pos_runtime": 14.8901,
"eval_scitail-pairs-pos_samples_per_second": 87.575,
"eval_scitail-pairs-pos_steps_per_second": 3.694,
"step": 3528
},
{
"epoch": 0.675473865594486,
"eval_qnli-contrastive_loss": 1.7076925039291382,
"eval_qnli-contrastive_runtime": 4.6837,
"eval_qnli-contrastive_samples_per_second": 320.259,
"eval_qnli-contrastive_steps_per_second": 13.451,
"step": 3528
},
{
"epoch": 0.6806433084434234,
"grad_norm": 13.107728004455566,
"learning_rate": 2.717457886676876e-05,
"loss": 1.4958,
"step": 3555
},
{
"epoch": 0.6957687152977217,
"grad_norm": 10.731244087219238,
"learning_rate": 2.777947932618683e-05,
"loss": 1.9504,
"step": 3634
},
{
"epoch": 0.7108941221520199,
"grad_norm": 1.3723793029785156,
"learning_rate": 2.83843797856049e-05,
"loss": 1.6017,
"step": 3713
},
{
"epoch": 0.7260195290063182,
"grad_norm": 16.096094131469727,
"learning_rate": 2.8989280245022975e-05,
"loss": 1.7229,
"step": 3792
},
{
"epoch": 0.7411449358606165,
"grad_norm": 14.629384994506836,
"learning_rate": 2.9594180704441043e-05,
"loss": 1.5996,
"step": 3871
},
{
"epoch": 0.7505265173272065,
"eval_nli-pairs_loss": 1.6035664081573486,
"eval_nli-pairs_runtime": 12.0239,
"eval_nli-pairs_samples_per_second": 124.752,
"eval_nli-pairs_steps_per_second": 5.24,
"step": 3920
},
{
"epoch": 0.7505265173272065,
"eval_scitail-pairs-pos_loss": 0.7905139923095703,
"eval_scitail-pairs-pos_runtime": 15.2398,
"eval_scitail-pairs-pos_samples_per_second": 85.566,
"eval_scitail-pairs-pos_steps_per_second": 3.609,
"step": 3920
},
{
"epoch": 0.7505265173272065,
"eval_qnli-contrastive_loss": 1.7369401454925537,
"eval_qnli-contrastive_runtime": 4.726,
"eval_qnli-contrastive_samples_per_second": 317.396,
"eval_qnli-contrastive_steps_per_second": 13.331,
"step": 3920
},
{
"epoch": 0.7562703427149148,
"grad_norm": 12.058998107910156,
"learning_rate": 2.999673874450528e-05,
"loss": 1.6257,
"step": 3950
},
{
"epoch": 0.7713957495692131,
"grad_norm": 4.181306838989258,
"learning_rate": 2.9946841125275615e-05,
"loss": 1.6094,
"step": 4029
},
{
"epoch": 0.7865211564235114,
"grad_norm": 14.733617782592773,
"learning_rate": 2.983695736786804e-05,
"loss": 1.6061,
"step": 4108
},
{
"epoch": 0.8016465632778097,
"grad_norm": 75.19181823730469,
"learning_rate": 2.96675284686242e-05,
"loss": 1.8917,
"step": 4187
},
{
"epoch": 0.816771970132108,
"grad_norm": 17.123188018798828,
"learning_rate": 2.943923439632653e-05,
"loss": 1.766,
"step": 4266
},
{
"epoch": 0.8255791690599272,
"eval_nli-pairs_loss": 1.5217715501785278,
"eval_nli-pairs_runtime": 12.1712,
"eval_nli-pairs_samples_per_second": 123.241,
"eval_nli-pairs_steps_per_second": 5.176,
"step": 4312
},
{
"epoch": 0.8255791690599272,
"eval_scitail-pairs-pos_loss": 0.7310367226600647,
"eval_scitail-pairs-pos_runtime": 15.0699,
"eval_scitail-pairs-pos_samples_per_second": 86.53,
"eval_scitail-pairs-pos_steps_per_second": 3.65,
"step": 4312
},
{
"epoch": 0.8255791690599272,
"eval_qnli-contrastive_loss": 1.8110274076461792,
"eval_qnli-contrastive_runtime": 4.7354,
"eval_qnli-contrastive_samples_per_second": 316.764,
"eval_qnli-contrastive_steps_per_second": 13.304,
"step": 4312
},
{
"epoch": 0.8318973769864063,
"grad_norm": 26.6308536529541,
"learning_rate": 2.9152991363280456e-05,
"loss": 1.6544,
"step": 4345
},
{
"epoch": 0.8470227838407046,
"grad_norm": 11.87916088104248,
"learning_rate": 2.8809948148280698e-05,
"loss": 1.5872,
"step": 4424
},
{
"epoch": 0.8621481906950029,
"grad_norm": 5.825096607208252,
"learning_rate": 2.841148148621882e-05,
"loss": 1.6237,
"step": 4503
},
{
"epoch": 0.8772735975493011,
"grad_norm": 7.624891757965088,
"learning_rate": 2.7959190542834895e-05,
"loss": 1.5713,
"step": 4582
},
{
"epoch": 0.8923990044035994,
"grad_norm": 11.067708969116211,
"learning_rate": 2.7454890496787676e-05,
"loss": 1.5109,
"step": 4661
},
{
"epoch": 0.9006318207926479,
"eval_nli-pairs_loss": 1.4145296812057495,
"eval_nli-pairs_runtime": 12.1688,
"eval_nli-pairs_samples_per_second": 123.266,
"eval_nli-pairs_steps_per_second": 5.177,
"step": 4704
},
{
"epoch": 0.9006318207926479,
"eval_scitail-pairs-pos_loss": 0.7044198513031006,
"eval_scitail-pairs-pos_runtime": 15.0745,
"eval_scitail-pairs-pos_samples_per_second": 86.504,
"eval_scitail-pairs-pos_steps_per_second": 3.649,
"step": 4704
},
{
"epoch": 0.9006318207926479,
"eval_qnli-contrastive_loss": 1.5929718017578125,
"eval_qnli-contrastive_runtime": 4.7378,
"eval_qnli-contrastive_samples_per_second": 316.603,
"eval_qnli-contrastive_steps_per_second": 13.297,
"step": 4704
},
{
"epoch": 0.9075244112578977,
"grad_norm": 18.31964874267578,
"learning_rate": 2.6900605254800455e-05,
"loss": 1.8614,
"step": 4740
},
{
"epoch": 0.922649818112196,
"grad_norm": 11.028084754943848,
"learning_rate": 2.6298559329118796e-05,
"loss": 1.2809,
"step": 4819
},
{
"epoch": 0.9377752249664943,
"grad_norm": 11.14758586883545,
"learning_rate": 2.565116890987845e-05,
"loss": 1.4557,
"step": 4898
},
{
"epoch": 0.9529006318207927,
"grad_norm": 12.307340621948242,
"learning_rate": 2.4970023905369427e-05,
"loss": 2.285,
"step": 4977
},
{
"epoch": 0.968026038675091,
"grad_norm": 19.368682861328125,
"learning_rate": 2.4249872456580537e-05,
"loss": 1.5918,
"step": 5056
},
{
"epoch": 0.9756844725253686,
"eval_nli-pairs_loss": 1.3622660636901855,
"eval_nli-pairs_runtime": 12.1119,
"eval_nli-pairs_samples_per_second": 123.845,
"eval_nli-pairs_steps_per_second": 5.201,
"step": 5096
},
{
"epoch": 0.9756844725253686,
"eval_scitail-pairs-pos_loss": 0.6618204116821289,
"eval_scitail-pairs-pos_runtime": 15.1844,
"eval_scitail-pairs-pos_samples_per_second": 85.877,
"eval_scitail-pairs-pos_steps_per_second": 3.622,
"step": 5096
},
{
"epoch": 0.9756844725253686,
"eval_qnli-contrastive_loss": 1.5225657224655151,
"eval_qnli-contrastive_runtime": 4.73,
"eval_qnli-contrastive_samples_per_second": 317.125,
"eval_qnli-contrastive_steps_per_second": 13.319,
"step": 5096
},
{
"epoch": 0.9831514455293893,
"grad_norm": 23.91764259338379,
"learning_rate": 2.349353206401398e-05,
"loss": 1.5956,
"step": 5135
},
{
"epoch": 0.9982768523836876,
"grad_norm": 28.184560775756836,
"learning_rate": 2.269363669859137e-05,
"loss": 1.309,
"step": 5214
},
{
"epoch": 1.0134022592379859,
"grad_norm": 1.2889472246170044,
"learning_rate": 2.186286447094588e-05,
"loss": 1.6033,
"step": 5293
},
{
"epoch": 1.0285276660922842,
"grad_norm": 9.043930053710938,
"learning_rate": 2.1004549518185432e-05,
"loss": 1.2943,
"step": 5372
},
{
"epoch": 1.0436530729465825,
"grad_norm": 15.558199882507324,
"learning_rate": 2.012213651460107e-05,
"loss": 1.4881,
"step": 5451
},
{
"epoch": 1.0507371242580892,
"eval_nli-pairs_loss": 1.3221956491470337,
"eval_nli-pairs_runtime": 12.1205,
"eval_nli-pairs_samples_per_second": 123.757,
"eval_nli-pairs_steps_per_second": 5.198,
"step": 5488
},
{
"epoch": 1.0507371242580892,
"eval_scitail-pairs-pos_loss": 0.6279736161231995,
"eval_scitail-pairs-pos_runtime": 15.0898,
"eval_scitail-pairs-pos_samples_per_second": 86.416,
"eval_scitail-pairs-pos_steps_per_second": 3.645,
"step": 5488
},
{
"epoch": 1.0507371242580892,
"eval_qnli-contrastive_loss": 1.5666921138763428,
"eval_qnli-contrastive_runtime": 4.7489,
"eval_qnli-contrastive_samples_per_second": 315.863,
"eval_qnli-contrastive_steps_per_second": 13.266,
"step": 5488
},
{
"epoch": 1.0587784798008808,
"grad_norm": 22.18709373474121,
"learning_rate": 1.921916684716005e-05,
"loss": 1.6734,
"step": 5530
},
{
"epoch": 1.073903886655179,
"grad_norm": 2.1289186477661133,
"learning_rate": 1.8299264402862166e-05,
"loss": 1.6602,
"step": 5609
},
{
"epoch": 1.0890292935094774,
"grad_norm": 8.099466323852539,
"learning_rate": 1.7366121024998667e-05,
"loss": 1.4626,
"step": 5688
},
{
"epoch": 1.1041547003637757,
"grad_norm": 11.092597007751465,
"learning_rate": 1.642348169668238e-05,
"loss": 1.4048,
"step": 5767
},
{
"epoch": 1.1192801072180738,
"grad_norm": 1.632265329360962,
"learning_rate": 1.5475129511111833e-05,
"loss": 1.5961,
"step": 5846
},
{
"epoch": 1.12578977599081,
"eval_nli-pairs_loss": 1.257077932357788,
"eval_nli-pairs_runtime": 12.0966,
"eval_nli-pairs_samples_per_second": 124.002,
"eval_nli-pairs_steps_per_second": 5.208,
"step": 5880
},
{
"epoch": 1.12578977599081,
"eval_scitail-pairs-pos_loss": 0.6171609163284302,
"eval_scitail-pairs-pos_runtime": 15.2057,
"eval_scitail-pairs-pos_samples_per_second": 85.757,
"eval_scitail-pairs-pos_steps_per_second": 3.617,
"step": 5880
},
{
"epoch": 1.12578977599081,
"eval_qnli-contrastive_loss": 1.4182076454162598,
"eval_qnli-contrastive_runtime": 4.7646,
"eval_qnli-contrastive_samples_per_second": 314.825,
"eval_qnli-contrastive_steps_per_second": 13.223,
"step": 5880
},
{
"epoch": 1.1344055140723721,
"grad_norm": 17.874731063842773,
"learning_rate": 1.452487048888817e-05,
"loss": 1.4949,
"step": 5925
},
{
"epoch": 1.1495309209266704,
"grad_norm": 5.625218391418457,
"learning_rate": 1.357651830331762e-05,
"loss": 1.7542,
"step": 6004
},
{
"epoch": 1.1646563277809687,
"grad_norm": 12.764110565185547,
"learning_rate": 1.2633878975001336e-05,
"loss": 1.3177,
"step": 6083
},
{
"epoch": 1.179781734635267,
"grad_norm": 14.75761890411377,
"learning_rate": 1.1700735597137837e-05,
"loss": 1.1522,
"step": 6162
},
{
"epoch": 1.1949071414895653,
"grad_norm": 7.778223037719727,
"learning_rate": 1.078083315283995e-05,
"loss": 1.0727,
"step": 6241
},
{
"epoch": 1.2008424277235306,
"eval_nli-pairs_loss": 1.2002286911010742,
"eval_nli-pairs_runtime": 12.1083,
"eval_nli-pairs_samples_per_second": 123.882,
"eval_nli-pairs_steps_per_second": 5.203,
"step": 6272
},
{
"epoch": 1.2008424277235306,
"eval_scitail-pairs-pos_loss": 0.587746798992157,
"eval_scitail-pairs-pos_runtime": 15.2398,
"eval_scitail-pairs-pos_samples_per_second": 85.565,
"eval_scitail-pairs-pos_steps_per_second": 3.609,
"step": 6272
},
{
"epoch": 1.2008424277235306,
"eval_qnli-contrastive_loss": 1.5079773664474487,
"eval_qnli-contrastive_runtime": 4.7468,
"eval_qnli-contrastive_samples_per_second": 316.005,
"eval_qnli-contrastive_steps_per_second": 13.272,
"step": 6272
},
{
"epoch": 1.2100325483438636,
"grad_norm": 5.742403507232666,
"learning_rate": 9.877863485398942e-06,
"loss": 1.598,
"step": 6320
},
{
"epoch": 1.225157955198162,
"grad_norm": 13.002484321594238,
"learning_rate": 8.995450481814567e-06,
"loss": 1.3773,
"step": 6399
},
{
"epoch": 1.2402833620524603,
"grad_norm": 12.662968635559082,
"learning_rate": 8.137135529054122e-06,
"loss": 1.6495,
"step": 6478
},
{
"epoch": 1.2554087689067586,
"grad_norm": 7.513673305511475,
"learning_rate": 7.306363301408635e-06,
"loss": 1.3042,
"step": 6557
},
{
"epoch": 1.2705341757610569,
"grad_norm": 92.78031158447266,
"learning_rate": 6.506467935986024e-06,
"loss": 1.5158,
"step": 6636
},
{
"epoch": 1.2758950794562511,
"eval_nli-pairs_loss": 1.1646167039871216,
"eval_nli-pairs_runtime": 12.3376,
"eval_nli-pairs_samples_per_second": 121.579,
"eval_nli-pairs_steps_per_second": 5.106,
"step": 6664
},
{
"epoch": 1.2758950794562511,
"eval_scitail-pairs-pos_loss": 0.5752041339874268,
"eval_scitail-pairs-pos_runtime": 15.5528,
"eval_scitail-pairs-pos_samples_per_second": 83.843,
"eval_scitail-pairs-pos_steps_per_second": 3.536,
"step": 6664
},
{
"epoch": 1.2758950794562511,
"eval_qnli-contrastive_loss": 1.331896424293518,
"eval_qnli-contrastive_runtime": 4.7695,
"eval_qnli-contrastive_samples_per_second": 314.501,
"eval_qnli-contrastive_steps_per_second": 13.209,
"step": 6664
},
{
"epoch": 1.2856595826153552,
"grad_norm": 11.36242961883545,
"learning_rate": 5.740659651822936e-06,
"loss": 1.2205,
"step": 6715
},
{
"epoch": 1.3007849894696535,
"grad_norm": 10.5322904586792,
"learning_rate": 5.012011866316839e-06,
"loss": 1.3909,
"step": 6794
},
{
"epoch": 1.3159103963239518,
"grad_norm": 2.6958863735198975,
"learning_rate": 4.323448860683947e-06,
"loss": 1.4255,
"step": 6873
},
{
"epoch": 1.33103580317825,
"grad_norm": 19.98720359802246,
"learning_rate": 3.677734043945192e-06,
"loss": 1.5415,
"step": 6952
},
{
"epoch": 1.3461612100325484,
"grad_norm": 3.684659719467163,
"learning_rate": 3.077458862540392e-06,
"loss": 1.3355,
"step": 7031
},
{
"epoch": 1.350947731188972,
"eval_nli-pairs_loss": 1.1400986909866333,
"eval_nli-pairs_runtime": 12.0157,
"eval_nli-pairs_samples_per_second": 124.836,
"eval_nli-pairs_steps_per_second": 5.243,
"step": 7056
},
{
"epoch": 1.350947731188972,
"eval_scitail-pairs-pos_loss": 0.5660089254379272,
"eval_scitail-pairs-pos_runtime": 15.1309,
"eval_scitail-pairs-pos_samples_per_second": 86.181,
"eval_scitail-pairs-pos_steps_per_second": 3.635,
"step": 7056
},
{
"epoch": 1.350947731188972,
"eval_qnli-contrastive_loss": 1.2624869346618652,
"eval_qnli-contrastive_runtime": 4.6898,
"eval_qnli-contrastive_samples_per_second": 319.843,
"eval_qnli-contrastive_steps_per_second": 13.433,
"step": 7056
},
{
"epoch": 1.3612866168868467,
"grad_norm": 11.162321090698242,
"learning_rate": 2.5250324000795594e-06,
"loss": 1.5326,
"step": 7110
},
{
"epoch": 1.376412023741145,
"grad_norm": 9.399407386779785,
"learning_rate": 2.0226717089707925e-06,
"loss": 1.0109,
"step": 7189
},
{
"epoch": 1.3915374305954433,
"grad_norm": 0.5825966596603394,
"learning_rate": 1.5723929127267211e-06,
"loss": 1.2729,
"step": 7268
},
{
"epoch": 1.4066628374497414,
"grad_norm": 7.376439094543457,
"learning_rate": 1.1760031146585697e-06,
"loss": 1.605,
"step": 7347
},
{
"epoch": 1.42178824430404,
"grad_norm": 0.5974981188774109,
"learning_rate": 8.350931454308347e-07,
"loss": 1.4983,
"step": 7426
},
{
"epoch": 1.4260003829216925,
"eval_nli-pairs_loss": 1.1365835666656494,
"eval_nli-pairs_runtime": 11.9569,
"eval_nli-pairs_samples_per_second": 125.451,
"eval_nli-pairs_steps_per_second": 5.269,
"step": 7448
},
{
"epoch": 1.4260003829216925,
"eval_scitail-pairs-pos_loss": 0.5671288371086121,
"eval_scitail-pairs-pos_runtime": 14.9551,
"eval_scitail-pairs-pos_samples_per_second": 87.194,
"eval_scitail-pairs-pos_steps_per_second": 3.678,
"step": 7448
},
{
"epoch": 1.4260003829216925,
"eval_qnli-contrastive_loss": 1.2691177129745483,
"eval_qnli-contrastive_runtime": 4.6835,
"eval_qnli-contrastive_samples_per_second": 320.27,
"eval_qnli-contrastive_steps_per_second": 13.451,
"step": 7448
},
{
"epoch": 1.436913651158338,
"grad_norm": 8.548786163330078,
"learning_rate": 5.5103117858258e-07,
"loss": 1.2901,
"step": 7505
},
{
"epoch": 1.4520390580126366,
"grad_norm": 9.624091148376465,
"learning_rate": 3.2495723963837597e-07,
"loss": 1.4993,
"step": 7584
},
{
"epoch": 1.4671644648669346,
"grad_norm": 18.643239974975586,
"learning_rate": 1.5777863084531385e-07,
"loss": 1.0473,
"step": 7663
},
{
"epoch": 1.482289871721233,
"grad_norm": 10.979313850402832,
"learning_rate": 5.0166289898085916e-08,
"loss": 1.2113,
"step": 7742
},
{
"epoch": 1.4974152785755313,
"grad_norm": 10.067323684692383,
"learning_rate": 2.55209726558292e-09,
"loss": 1.3604,
"step": 7821
},
{
"epoch": 1.5010530346544133,
"eval_nli-pairs_loss": 1.1346535682678223,
"eval_nli-pairs_runtime": 12.2237,
"eval_nli-pairs_samples_per_second": 122.712,
"eval_nli-pairs_steps_per_second": 5.154,
"step": 7840
},
{
"epoch": 1.5010530346544133,
"eval_scitail-pairs-pos_loss": 0.5651898980140686,
"eval_scitail-pairs-pos_runtime": 15.2453,
"eval_scitail-pairs-pos_samples_per_second": 85.535,
"eval_scitail-pairs-pos_steps_per_second": 3.608,
"step": 7840
},
{
"epoch": 1.5010530346544133,
"eval_qnli-contrastive_loss": 1.2610852718353271,
"eval_qnli-contrastive_runtime": 4.7666,
"eval_qnli-contrastive_samples_per_second": 314.687,
"eval_qnli-contrastive_steps_per_second": 13.217,
"step": 7840
},
{
"epoch": 1.5125406854298296,
"grad_norm": 12.913325309753418,
"learning_rate": 2.9984872857074416e-05,
"loss": 1.4627,
"step": 7900
},
{
"epoch": 1.5276660922841279,
"grad_norm": 13.103713035583496,
"learning_rate": 2.9912159040536404e-05,
"loss": 1.1015,
"step": 7979
},
{
"epoch": 1.5427914991384262,
"grad_norm": 10.095404624938965,
"learning_rate": 2.9779598275386362e-05,
"loss": 1.4538,
"step": 8058
},
{
"epoch": 1.5579169059927245,
"grad_norm": 0.5388267040252686,
"learning_rate": 2.9587722567571802e-05,
"loss": 1.4412,
"step": 8137
},
{
"epoch": 1.5730423128470228,
"grad_norm": 20.366121292114258,
"learning_rate": 2.933730197162302e-05,
"loss": 1.4793,
"step": 8216
},
{
"epoch": 1.5761056863871339,
"eval_nli-pairs_loss": 1.1918026208877563,
"eval_nli-pairs_runtime": 12.158,
"eval_nli-pairs_samples_per_second": 123.375,
"eval_nli-pairs_steps_per_second": 5.182,
"step": 8232
},
{
"epoch": 1.5761056863871339,
"eval_scitail-pairs-pos_loss": 0.5848828554153442,
"eval_scitail-pairs-pos_runtime": 15.3425,
"eval_scitail-pairs-pos_samples_per_second": 84.993,
"eval_scitail-pairs-pos_steps_per_second": 3.585,
"step": 8232
},
{
"epoch": 1.5761056863871339,
"eval_qnli-contrastive_loss": 1.4694615602493286,
"eval_qnli-contrastive_runtime": 4.7988,
"eval_qnli-contrastive_samples_per_second": 312.58,
"eval_qnli-contrastive_steps_per_second": 13.128,
"step": 8232
},
{
"epoch": 1.588167719701321,
"grad_norm": 19.943920135498047,
"learning_rate": 2.9029341500194198e-05,
"loss": 1.1267,
"step": 8295
},
{
"epoch": 1.6032931265556194,
"grad_norm": 14.96302318572998,
"learning_rate": 2.8665077090647462e-05,
"loss": 1.1734,
"step": 8374
},
{
"epoch": 1.6184185334099177,
"grad_norm": 6.065411567687988,
"learning_rate": 2.8245970644867055e-05,
"loss": 1.2193,
"step": 8453
},
{
"epoch": 1.633543940264216,
"grad_norm": 15.93069076538086,
"learning_rate": 2.7773704162210366e-05,
"loss": 1.1381,
"step": 8532
},
{
"epoch": 1.6486693471185143,
"grad_norm": 5.890163898468018,
"learning_rate": 2.725017298914211e-05,
"loss": 0.9632,
"step": 8611
},
{
"epoch": 1.6511583381198545,
"eval_nli-pairs_loss": 1.1099625825881958,
"eval_nli-pairs_runtime": 12.0731,
"eval_nli-pairs_samples_per_second": 124.243,
"eval_nli-pairs_steps_per_second": 5.218,
"step": 8624
},
{
"epoch": 1.6511583381198545,
"eval_scitail-pairs-pos_loss": 0.5500022172927856,
"eval_scitail-pairs-pos_runtime": 15.0341,
"eval_scitail-pairs-pos_samples_per_second": 86.736,
"eval_scitail-pairs-pos_steps_per_second": 3.658,
"step": 8624
},
{
"epoch": 1.6511583381198545,
"eval_qnli-contrastive_loss": 1.208964467048645,
"eval_qnli-contrastive_runtime": 4.6959,
"eval_qnli-contrastive_samples_per_second": 319.43,
"eval_qnli-contrastive_steps_per_second": 13.416,
"step": 8624
},
{
"epoch": 1.6637947539728124,
"grad_norm": 11.52648639678955,
"learning_rate": 2.6677478212642807e-05,
"loss": 1.0842,
"step": 8690
},
{
"epoch": 1.678920160827111,
"grad_norm": 8.958113670349121,
"learning_rate": 2.6057918227919096e-05,
"loss": 0.7981,
"step": 8769
},
{
"epoch": 1.694045567681409,
"grad_norm": 12.083248138427734,
"learning_rate": 2.5393979514257247e-05,
"loss": 1.2196,
"step": 8848
},
{
"epoch": 1.7091709745357075,
"grad_norm": 2.1500277519226074,
"learning_rate": 2.4688326656039045e-05,
"loss": 0.8321,
"step": 8927
},
{
"epoch": 1.7242963813900056,
"grad_norm": 3.8833096027374268,
"learning_rate": 2.3943791648968727e-05,
"loss": 0.938,
"step": 9006
},
{
"epoch": 1.726210989852575,
"eval_nli-pairs_loss": 1.1021158695220947,
"eval_nli-pairs_runtime": 12.05,
"eval_nli-pairs_samples_per_second": 124.482,
"eval_nli-pairs_steps_per_second": 5.228,
"step": 9016
},
{
"epoch": 1.726210989852575,
"eval_scitail-pairs-pos_loss": 0.519660472869873,
"eval_scitail-pairs-pos_runtime": 15.121,
"eval_scitail-pairs-pos_samples_per_second": 86.238,
"eval_scitail-pairs-pos_steps_per_second": 3.637,
"step": 9016
},
{
"epoch": 1.726210989852575,
"eval_qnli-contrastive_loss": 1.3204244375228882,
"eval_qnli-contrastive_runtime": 4.6913,
"eval_qnli-contrastive_samples_per_second": 319.739,
"eval_qnli-contrastive_steps_per_second": 13.429,
"step": 9016
},
{
"epoch": 1.7394217882443042,
"grad_norm": 9.389202117919922,
"learning_rate": 2.316336253442829e-05,
"loss": 1.0008,
"step": 9085
},
{
"epoch": 1.7545471950986022,
"grad_norm": 0.5910531282424927,
"learning_rate": 2.235017140757486e-05,
"loss": 0.8644,
"step": 9164
},
{
"epoch": 1.7696726019529008,
"grad_norm": 5.645143032073975,
"learning_rate": 2.1507481847307262e-05,
"loss": 1.0459,
"step": 9243
},
{
"epoch": 1.7847980088071989,
"grad_norm": 2.0821499824523926,
"learning_rate": 2.0638675818549023e-05,
"loss": 0.9344,
"step": 9322
},
{
"epoch": 1.7999234156614974,
"grad_norm": 10.352788925170898,
"learning_rate": 1.9747240099412936e-05,
"loss": 1.0636,
"step": 9401
},
{
"epoch": 1.8012636415852958,
"eval_nli-pairs_loss": 1.0661962032318115,
"eval_nli-pairs_runtime": 11.9602,
"eval_nli-pairs_samples_per_second": 125.416,
"eval_nli-pairs_steps_per_second": 5.267,
"step": 9408
},
{
"epoch": 1.8012636415852958,
"eval_scitail-pairs-pos_loss": 0.5188334584236145,
"eval_scitail-pairs-pos_runtime": 15.0572,
"eval_scitail-pairs-pos_samples_per_second": 86.603,
"eval_scitail-pairs-pos_steps_per_second": 3.653,
"step": 9408
},
{
"epoch": 1.8012636415852958,
"eval_qnli-contrastive_loss": 0.9691615700721741,
"eval_qnli-contrastive_runtime": 4.7039,
"eval_qnli-contrastive_samples_per_second": 318.886,
"eval_qnli-contrastive_steps_per_second": 13.393,
"step": 9408
},
{
"epoch": 1.8150488225157955,
"grad_norm": 7.344937801361084,
"learning_rate": 1.8836752287718936e-05,
"loss": 1.2482,
"step": 9480
},
{
"epoch": 1.8301742293700938,
"grad_norm": 1.0527677536010742,
"learning_rate": 1.7910866443025426e-05,
"loss": 1.0134,
"step": 9559
},
{
"epoch": 1.845299636224392,
"grad_norm": 13.278373718261719,
"learning_rate": 1.6973298421796733e-05,
"loss": 0.981,
"step": 9638
},
{
"epoch": 1.8604250430786904,
"grad_norm": 2.146714448928833,
"learning_rate": 1.6027810964561188e-05,
"loss": 1.0289,
"step": 9717
},
{
"epoch": 1.8755504499329887,
"grad_norm": 13.393159866333008,
"learning_rate": 1.5078198594909435e-05,
"loss": 0.9656,
"step": 9796
},
{
"epoch": 1.8763162933180164,
"eval_nli-pairs_loss": 1.0254323482513428,
"eval_nli-pairs_runtime": 12.1952,
"eval_nli-pairs_samples_per_second": 122.999,
"eval_nli-pairs_steps_per_second": 5.166,
"step": 9800
},
{
"epoch": 1.8763162933180164,
"eval_scitail-pairs-pos_loss": 0.496192991733551,
"eval_scitail-pairs-pos_runtime": 15.1968,
"eval_scitail-pairs-pos_samples_per_second": 85.808,
"eval_scitail-pairs-pos_steps_per_second": 3.619,
"step": 9800
},
{
"epoch": 1.8763162933180164,
"eval_qnli-contrastive_loss": 1.0920603275299072,
"eval_qnli-contrastive_runtime": 4.731,
"eval_qnli-contrastive_samples_per_second": 317.056,
"eval_qnli-contrastive_steps_per_second": 13.316,
"step": 9800
},
{
"epoch": 1.890675856787287,
"grad_norm": 15.524497032165527,
"learning_rate": 1.412827239093775e-05,
"loss": 0.9088,
"step": 9875
},
{
"epoch": 1.9058012636415853,
"grad_norm": 67.18510437011719,
"learning_rate": 1.3181844690253298e-05,
"loss": 1.2097,
"step": 9954
},
{
"epoch": 1.9209266704958836,
"grad_norm": 5.732685565948486,
"learning_rate": 1.2242713789924544e-05,
"loss": 0.7741,
"step": 10033
},
{
"epoch": 1.936052077350182,
"grad_norm": 6.51609992980957,
"learning_rate": 1.13146487027805e-05,
"loss": 0.8206,
"step": 10112
},
{
"epoch": 1.9511774842044802,
"grad_norm": 6.481364727020264,
"learning_rate": 1.040137403123638e-05,
"loss": 0.8686,
"step": 10191
},
{
"epoch": 1.9513689450507372,
"eval_nli-pairs_loss": 1.0038272142410278,
"eval_nli-pairs_runtime": 12.0711,
"eval_nli-pairs_samples_per_second": 124.264,
"eval_nli-pairs_steps_per_second": 5.219,
"step": 10192
},
{
"epoch": 1.9513689450507372,
"eval_scitail-pairs-pos_loss": 0.4778198003768921,
"eval_scitail-pairs-pos_runtime": 15.3152,
"eval_scitail-pairs-pos_samples_per_second": 85.144,
"eval_scitail-pairs-pos_steps_per_second": 3.591,
"step": 10192
},
{
"epoch": 1.9513689450507372,
"eval_qnli-contrastive_loss": 0.9486138820648193,
"eval_qnli-contrastive_runtime": 4.7421,
"eval_qnli-contrastive_samples_per_second": 316.315,
"eval_qnli-contrastive_steps_per_second": 13.285,
"step": 10192
},
{
"epoch": 1.9663028910587785,
"grad_norm": 9.677536964416504,
"learning_rate": 9.50655501935166e-06,
"loss": 0.7649,
"step": 10270
},
{
"epoch": 1.9814282979130766,
"grad_norm": 8.125744819641113,
"learning_rate": 8.633782843110642e-06,
"loss": 0.9249,
"step": 10349
},
{
"epoch": 1.9965537047673751,
"grad_norm": 3.1055586338043213,
"learning_rate": 7.797110684759332e-06,
"loss": 0.6997,
"step": 10428
},
{
"epoch": 2.0116791116216732,
"grad_norm": 7.393470764160156,
"learning_rate": 6.978450495850865e-06,
"loss": 1.06,
"step": 10507
},
{
"epoch": 2.026421596783458,
"eval_nli-pairs_loss": 1.000571608543396,
"eval_nli-pairs_runtime": 12.5554,
"eval_nli-pairs_samples_per_second": 119.47,
"eval_nli-pairs_steps_per_second": 5.018,
"step": 10584
},
{
"epoch": 2.026421596783458,
"eval_scitail-pairs-pos_loss": 0.48184335231781006,
"eval_scitail-pairs-pos_runtime": 15.4715,
"eval_scitail-pairs-pos_samples_per_second": 84.284,
"eval_scitail-pairs-pos_steps_per_second": 3.555,
"step": 10584
},
{
"epoch": 2.026421596783458,
"eval_qnli-contrastive_loss": 0.9664335250854492,
"eval_qnli-contrastive_runtime": 4.7851,
"eval_qnli-contrastive_samples_per_second": 313.474,
"eval_qnli-contrastive_steps_per_second": 13.166,
"step": 10584
},
{
"epoch": 2.0268045184759718,
"grad_norm": 12.336913108825684,
"learning_rate": 6.191983181204208e-06,
"loss": 0.9447,
"step": 10586
},
{
"epoch": 2.04192992533027,
"grad_norm": 4.7379984855651855,
"learning_rate": 5.440865069077124e-06,
"loss": 1.0151,
"step": 10665
},
{
"epoch": 2.0570553321845684,
"grad_norm": 27.00238800048828,
"learning_rate": 4.728110620818674e-06,
"loss": 1.113,
"step": 10744
},
{
"epoch": 2.0721807390388665,
"grad_norm": 17.84748649597168,
"learning_rate": 4.0565803329351935e-06,
"loss": 1.1183,
"step": 10823
},
{
"epoch": 2.087306145893165,
"grad_norm": 15.165081977844238,
"learning_rate": 3.4289692570634956e-06,
"loss": 1.1639,
"step": 10902
},
{
"epoch": 2.1014742485161784,
"eval_nli-pairs_loss": 0.9944142699241638,
"eval_nli-pairs_runtime": 12.004,
"eval_nli-pairs_samples_per_second": 124.958,
"eval_nli-pairs_steps_per_second": 5.248,
"step": 10976
},
{
"epoch": 2.1014742485161784,
"eval_scitail-pairs-pos_loss": 0.47857147455215454,
"eval_scitail-pairs-pos_runtime": 15.1823,
"eval_scitail-pairs-pos_samples_per_second": 85.89,
"eval_scitail-pairs-pos_steps_per_second": 3.623,
"step": 10976
},
{
"epoch": 2.1014742485161784,
"eval_qnli-contrastive_loss": 0.9332481026649475,
"eval_qnli-contrastive_runtime": 4.7377,
"eval_qnli-contrastive_samples_per_second": 316.609,
"eval_qnli-contrastive_steps_per_second": 13.298,
"step": 10976
},
{
"epoch": 2.102431552747463,
"grad_norm": 14.085611343383789,
"learning_rate": 2.847796183923562e-06,
"loss": 1.0222,
"step": 10981
},
{
"epoch": 2.1175569596017616,
"grad_norm": 9.214906692504883,
"learning_rate": 2.3153935346589784e-06,
"loss": 1.244,
"step": 11060
},
{
"epoch": 2.1326823664560597,
"grad_norm": 28.293725967407227,
"learning_rate": 1.8338980001342158e-06,
"loss": 1.0128,
"step": 11139
},
{
"epoch": 2.147807773310358,
"grad_norm": 7.782803058624268,
"learning_rate": 1.4052419657559468e-06,
"loss": 1.3783,
"step": 11218
},
{
"epoch": 2.1629331801646563,
"grad_norm": 8.853714942932129,
"learning_rate": 1.0311457562331311e-06,
"loss": 1.0301,
"step": 11297
},
{
"epoch": 2.176526900248899,
"eval_nli-pairs_loss": 0.9802760481834412,
"eval_nli-pairs_runtime": 11.9822,
"eval_nli-pairs_samples_per_second": 125.185,
"eval_nli-pairs_steps_per_second": 5.258,
"step": 11368
},
{
"epoch": 2.176526900248899,
"eval_scitail-pairs-pos_loss": 0.47513890266418457,
"eval_scitail-pairs-pos_runtime": 15.0277,
"eval_scitail-pairs-pos_samples_per_second": 86.773,
"eval_scitail-pairs-pos_steps_per_second": 3.66,
"step": 11368
},
{
"epoch": 2.176526900248899,
"eval_qnli-contrastive_loss": 0.9649375677108765,
"eval_qnli-contrastive_runtime": 4.7258,
"eval_qnli-contrastive_samples_per_second": 317.404,
"eval_qnli-contrastive_steps_per_second": 13.331,
"step": 11368
},
{
"epoch": 2.178058587018955,
"grad_norm": 9.836175918579102,
"learning_rate": 7.131107314001456e-07,
"loss": 0.8699,
"step": 11376
},
{
"epoch": 2.193183993873253,
"grad_norm": 6.889993190765381,
"learning_rate": 4.5241326081128687e-07,
"loss": 0.7565,
"step": 11455
},
{
"epoch": 2.2083094007275514,
"grad_norm": 7.592372894287109,
"learning_rate": 2.500996012884593e-07,
"loss": 1.3038,
"step": 11534
},
{
"epoch": 2.2234348075818495,
"grad_norm": 2.2131893634796143,
"learning_rate": 1.069816979800553e-07,
"loss": 0.9584,
"step": 11613
},
{
"epoch": 2.2385602144361476,
"grad_norm": 129.076904296875,
"learning_rate": 2.3633925782526324e-08,
"loss": 1.4689,
"step": 11692
},
{
"epoch": 2.25157955198162,
"eval_nli-pairs_loss": 0.9801518321037292,
"eval_nli-pairs_runtime": 12.0172,
"eval_nli-pairs_samples_per_second": 124.821,
"eval_nli-pairs_steps_per_second": 5.242,
"step": 11760
},
{
"epoch": 2.25157955198162,
"eval_scitail-pairs-pos_loss": 0.4722036123275757,
"eval_scitail-pairs-pos_runtime": 15.1727,
"eval_scitail-pairs-pos_samples_per_second": 85.944,
"eval_scitail-pairs-pos_steps_per_second": 3.625,
"step": 11760
},
{
"epoch": 2.25157955198162,
"eval_qnli-contrastive_loss": 0.9584055542945862,
"eval_qnli-contrastive_runtime": 4.7605,
"eval_qnli-contrastive_samples_per_second": 315.092,
"eval_qnli-contrastive_steps_per_second": 13.234,
"step": 11760
},
{
"epoch": 2.253685621290446,
"grad_norm": 9.094249725341797,
"learning_rate": 2.999960921579765e-05,
"loss": 0.9979,
"step": 11771
},
{
"epoch": 2.2688110281447442,
"grad_norm": 14.057835578918457,
"learning_rate": 2.9962654445090394e-05,
"loss": 1.3444,
"step": 11850
},
{
"epoch": 2.2839364349990428,
"grad_norm": 8.33903694152832,
"learning_rate": 2.9865650072629244e-05,
"loss": 1.2052,
"step": 11929
},
{
"epoch": 2.299061841853341,
"grad_norm": 5.676733493804932,
"learning_rate": 2.970898540593688e-05,
"loss": 1.2007,
"step": 12008
},
{
"epoch": 2.3141872487076394,
"grad_norm": 3.648158550262451,
"learning_rate": 2.9493289187117727e-05,
"loss": 1.1402,
"step": 12087
},
{
"epoch": 2.3266322037143405,
"eval_nli-pairs_loss": 1.0052505731582642,
"eval_nli-pairs_runtime": 12.1373,
"eval_nli-pairs_samples_per_second": 123.586,
"eval_nli-pairs_steps_per_second": 5.191,
"step": 12152
},
{
"epoch": 2.3266322037143405,
"eval_scitail-pairs-pos_loss": 0.47668519616127014,
"eval_scitail-pairs-pos_runtime": 15.0626,
"eval_scitail-pairs-pos_samples_per_second": 86.572,
"eval_scitail-pairs-pos_steps_per_second": 3.651,
"step": 12152
},
{
"epoch": 2.3266322037143405,
"eval_qnli-contrastive_loss": 1.2372807264328003,
"eval_qnli-contrastive_runtime": 4.7164,
"eval_qnli-contrastive_samples_per_second": 318.038,
"eval_qnli-contrastive_steps_per_second": 13.358,
"step": 12152
},
{
"epoch": 2.3293126555619375,
"grad_norm": 4.789942741394043,
"learning_rate": 2.9219427069528128e-05,
"loss": 1.5263,
"step": 12166
},
{
"epoch": 2.344438062416236,
"grad_norm": 14.52586555480957,
"learning_rate": 2.8888498143650785e-05,
"loss": 1.263,
"step": 12245
},
{
"epoch": 2.359563469270534,
"grad_norm": 2.835966110229492,
"learning_rate": 2.8501830526116386e-05,
"loss": 1.1912,
"step": 12324
},
{
"epoch": 2.3746888761248326,
"grad_norm": 14.9393949508667,
"learning_rate": 2.8060976029574842e-05,
"loss": 1.0982,
"step": 12403
},
{
"epoch": 2.3898142829791307,
"grad_norm": 8.84047794342041,
"learning_rate": 2.7567703934807572e-05,
"loss": 1.1574,
"step": 12482
},
{
"epoch": 2.401684855447061,
"eval_nli-pairs_loss": 0.9759184122085571,
"eval_nli-pairs_runtime": 12.2553,
"eval_nli-pairs_samples_per_second": 122.396,
"eval_nli-pairs_steps_per_second": 5.141,
"step": 12544
},
{
"epoch": 2.401684855447061,
"eval_scitail-pairs-pos_loss": 0.4914855659008026,
"eval_scitail-pairs-pos_runtime": 15.0918,
"eval_scitail-pairs-pos_samples_per_second": 86.404,
"eval_scitail-pairs-pos_steps_per_second": 3.644,
"step": 12544
},
{
"epoch": 2.401684855447061,
"eval_qnli-contrastive_loss": 1.1089410781860352,
"eval_qnli-contrastive_runtime": 4.7223,
"eval_qnli-contrastive_samples_per_second": 317.644,
"eval_qnli-contrastive_steps_per_second": 13.341,
"step": 12544
},
{
"epoch": 2.404939689833429,
"grad_norm": 11.71249008178711,
"learning_rate": 2.7023993890075236e-05,
"loss": 1.4077,
"step": 12561
},
{
"epoch": 2.4200650966877273,
"grad_norm": 2.904869794845581,
"learning_rate": 2.6432027966197927e-05,
"loss": 1.3183,
"step": 12640
},
{
"epoch": 2.435190503542026,
"grad_norm": 9.094073295593262,
"learning_rate": 2.579418189925317e-05,
"loss": 1.0883,
"step": 12719
},
{
"epoch": 2.450315910396324,
"grad_norm": 9.701898574829102,
"learning_rate": 2.5113015556037383e-05,
"loss": 1.3182,
"step": 12798
},
{
"epoch": 2.4654413172506224,
"grad_norm": 6.8915581703186035,
"learning_rate": 2.4391262660555785e-05,
"loss": 1.0089,
"step": 12877
},
{
"epoch": 2.4767375071797817,
"eval_nli-pairs_loss": 0.9481552243232727,
"eval_nli-pairs_runtime": 12.17,
"eval_nli-pairs_samples_per_second": 123.254,
"eval_nli-pairs_steps_per_second": 5.177,
"step": 12936
},
{
"epoch": 2.4767375071797817,
"eval_scitail-pairs-pos_loss": 0.4552152752876282,
"eval_scitail-pairs-pos_runtime": 15.2525,
"eval_scitail-pairs-pos_samples_per_second": 85.494,
"eval_scitail-pairs-pos_steps_per_second": 3.606,
"step": 12936
},
{
"epoch": 2.4767375071797817,
"eval_qnli-contrastive_loss": 1.1650612354278564,
"eval_qnli-contrastive_runtime": 4.7586,
"eval_qnli-contrastive_samples_per_second": 315.216,
"eval_qnli-contrastive_steps_per_second": 13.239,
"step": 12936
},
{
"epoch": 2.4805667241049205,
"grad_norm": 9.97049617767334,
"learning_rate": 2.3631819822771357e-05,
"loss": 1.0616,
"step": 12956
},
{
"epoch": 2.4956921309592186,
"grad_norm": 10.72946548461914,
"learning_rate": 2.2837734913643845e-05,
"loss": 1.1083,
"step": 13035
},
{
"epoch": 2.510817537813517,
"grad_norm": 6.889919281005859,
"learning_rate": 2.2012194833113163e-05,
"loss": 1.2687,
"step": 13114
},
{
"epoch": 2.5259429446678157,
"grad_norm": 2.167541742324829,
"learning_rate": 2.1158512720117925e-05,
"loss": 0.698,
"step": 13193
},
{
"epoch": 2.5410683515221137,
"grad_norm": 6.788521766662598,
"learning_rate": 2.0280114655979378e-05,
"loss": 1.0596,
"step": 13272
},
{
"epoch": 2.5517901589125023,
"eval_nli-pairs_loss": 0.9386218786239624,
"eval_nli-pairs_runtime": 12.1882,
"eval_nli-pairs_samples_per_second": 123.07,
"eval_nli-pairs_steps_per_second": 5.169,
"step": 13328
},
{
"epoch": 2.5517901589125023,
"eval_scitail-pairs-pos_loss": 0.45524224638938904,
"eval_scitail-pairs-pos_runtime": 15.3268,
"eval_scitail-pairs-pos_samples_per_second": 85.08,
"eval_scitail-pairs-pos_steps_per_second": 3.588,
"step": 13328
},
{
"epoch": 2.5517901589125023,
"eval_qnli-contrastive_loss": 1.053303837776184,
"eval_qnli-contrastive_runtime": 4.7606,
"eval_qnli-contrastive_samples_per_second": 315.086,
"eval_qnli-contrastive_steps_per_second": 13.234,
"step": 13328
},
{
"epoch": 2.556193758376412,
"grad_norm": 5.612150192260742,
"learning_rate": 1.9380525914513508e-05,
"loss": 1.1182,
"step": 13351
},
{
"epoch": 2.5713191652307104,
"grad_norm": 5.856744289398193,
"learning_rate": 1.8463356814054177e-05,
"loss": 0.9092,
"step": 13430
},
{
"epoch": 2.586444572085009,
"grad_norm": 3.5007331371307373,
"learning_rate": 1.7532288228167412e-05,
"loss": 0.8628,
"step": 13509
},
{
"epoch": 2.601569978939307,
"grad_norm": 3.8348581790924072,
"learning_rate": 1.6591056813206084e-05,
"loss": 0.762,
"step": 13588
},
{
"epoch": 2.616695385793605,
"grad_norm": 3.7152531147003174,
"learning_rate": 1.564344001199179e-05,
"loss": 0.9521,
"step": 13667
},
{
"epoch": 2.626842810645223,
"eval_nli-pairs_loss": 0.8889521956443787,
"eval_nli-pairs_runtime": 12.1548,
"eval_nli-pairs_samples_per_second": 123.408,
"eval_nli-pairs_steps_per_second": 5.183,
"step": 13720
},
{
"epoch": 2.626842810645223,
"eval_scitail-pairs-pos_loss": 0.45236507058143616,
"eval_scitail-pairs-pos_runtime": 15.2247,
"eval_scitail-pairs-pos_samples_per_second": 85.65,
"eval_scitail-pairs-pos_steps_per_second": 3.613,
"step": 13720
},
{
"epoch": 2.626842810645223,
"eval_qnli-contrastive_loss": 0.794640302658081,
"eval_qnli-contrastive_runtime": 4.8223,
"eval_qnli-contrastive_samples_per_second": 311.053,
"eval_qnli-contrastive_steps_per_second": 13.064,
"step": 13720
},
{
"epoch": 2.6318207926479036,
"grad_norm": 8.717215538024902,
"learning_rate": 1.4693240893808674e-05,
"loss": 0.8631,
"step": 13746
},
{
"epoch": 2.6469461995022017,
"grad_norm": 0.3876877725124359,
"learning_rate": 1.3744272891550144e-05,
"loss": 0.6899,
"step": 13825
},
{
"epoch": 2.6620716063565,
"grad_norm": 0.41043633222579956,
"learning_rate": 1.2800344497273615e-05,
"loss": 0.6552,
"step": 13904
},
{
"epoch": 2.6771970132107983,
"grad_norm": 0.8379763960838318,
"learning_rate": 1.1865243977584432e-05,
"loss": 0.572,
"step": 13983
},
{
"epoch": 2.692322420065097,
"grad_norm": 4.94291877746582,
"learning_rate": 1.0942724170190126e-05,
"loss": 0.9809,
"step": 14062
},
{
"epoch": 2.701895462377944,
"eval_nli-pairs_loss": 0.8912826180458069,
"eval_nli-pairs_runtime": 12.096,
"eval_nli-pairs_samples_per_second": 124.008,
"eval_nli-pairs_steps_per_second": 5.208,
"step": 14112
},
{
"epoch": 2.701895462377944,
"eval_scitail-pairs-pos_loss": 0.4352218210697174,
"eval_scitail-pairs-pos_runtime": 15.0606,
"eval_scitail-pairs-pos_samples_per_second": 86.584,
"eval_scitail-pairs-pos_steps_per_second": 3.652,
"step": 14112
},
{
"epoch": 2.701895462377944,
"eval_qnli-contrastive_loss": 0.727630078792572,
"eval_qnli-contrastive_runtime": 4.7927,
"eval_qnli-contrastive_samples_per_second": 312.979,
"eval_qnli-contrastive_steps_per_second": 13.145,
"step": 14112
},
{
"epoch": 2.707447826919395,
"grad_norm": 2.8381199836730957,
"learning_rate": 1.0036487422641892e-05,
"loss": 0.5392,
"step": 14141
},
{
"epoch": 2.7225732337736934,
"grad_norm": 9.423616409301758,
"learning_rate": 9.150170733707937e-06,
"loss": 0.6777,
"step": 14220
},
{
"epoch": 2.7376986406279915,
"grad_norm": 0.6272808909416199,
"learning_rate": 8.287331157010844e-06,
"loss": 0.6523,
"step": 14299
},
{
"epoch": 2.75282404748229,
"grad_norm": 0.7308062314987183,
"learning_rate": 7.4514315255090594e-06,
"loss": 0.6416,
"step": 14378
},
{
"epoch": 2.767949454336588,
"grad_norm": 4.945492267608643,
"learning_rate": 6.645826554113819e-06,
"loss": 0.7713,
"step": 14457
},
{
"epoch": 2.7769481141106644,
"eval_nli-pairs_loss": 0.872556209564209,
"eval_nli-pairs_runtime": 12.1015,
"eval_nli-pairs_samples_per_second": 123.952,
"eval_nli-pairs_steps_per_second": 5.206,
"step": 14504
},
{
"epoch": 2.7769481141106644,
"eval_scitail-pairs-pos_loss": 0.42709970474243164,
"eval_scitail-pairs-pos_runtime": 15.0845,
"eval_scitail-pairs-pos_samples_per_second": 86.446,
"eval_scitail-pairs-pos_steps_per_second": 3.646,
"step": 14504
},
{
"epoch": 2.7769481141106644,
"eval_qnli-contrastive_loss": 0.7923160791397095,
"eval_qnli-contrastive_runtime": 4.7233,
"eval_qnli-contrastive_samples_per_second": 317.576,
"eval_qnli-contrastive_steps_per_second": 13.338,
"step": 14504
},
{
"epoch": 2.7830748611908867,
"grad_norm": 9.502604484558105,
"learning_rate": 5.873749376215993e-06,
"loss": 0.6531,
"step": 14536
},
{
"epoch": 2.7982002680451847,
"grad_norm": 6.348124980926514,
"learning_rate": 5.138298568156192e-06,
"loss": 0.7056,
"step": 14615
},
{
"epoch": 2.813325674899483,
"grad_norm": 4.395310401916504,
"learning_rate": 4.442425713712258e-06,
"loss": 1.054,
"step": 14694
},
{
"epoch": 2.8284510817537813,
"grad_norm": 5.8618011474609375,
"learning_rate": 3.7889235585119115e-06,
"loss": 0.8535,
"step": 14773
},
{
"epoch": 2.84357648860808,
"grad_norm": 7.8259406089782715,
"learning_rate": 3.1804148019103528e-06,
"loss": 0.7321,
"step": 14852
},
{
"epoch": 2.852000765843385,
"eval_nli-pairs_loss": 0.8661790490150452,
"eval_nli-pairs_runtime": 12.1048,
"eval_nli-pairs_samples_per_second": 123.917,
"eval_nli-pairs_steps_per_second": 5.205,
"step": 14896
},
{
"epoch": 2.852000765843385,
"eval_scitail-pairs-pos_loss": 0.4211391508579254,
"eval_scitail-pairs-pos_runtime": 15.1135,
"eval_scitail-pairs-pos_samples_per_second": 86.28,
"eval_scitail-pairs-pos_steps_per_second": 3.639,
"step": 14896
},
{
"epoch": 2.852000765843385,
"eval_qnli-contrastive_loss": 0.7693744897842407,
"eval_qnli-contrastive_runtime": 4.7208,
"eval_qnli-contrastive_samples_per_second": 317.743,
"eval_qnli-contrastive_steps_per_second": 13.345,
"step": 14896
},
{
"epoch": 2.858701895462378,
"grad_norm": 8.156476974487305,
"learning_rate": 2.6193415713143028e-06,
"loss": 0.8236,
"step": 14931
},
{
"epoch": 2.873827302316676,
"grad_norm": 0.3863189220428467,
"learning_rate": 2.107955621195247e-06,
"loss": 0.776,
"step": 15010
},
{
"epoch": 2.8889527091709746,
"grad_norm": 0.4337412118911743,
"learning_rate": 1.6483092961261291e-06,
"loss": 0.7049,
"step": 15089
},
{
"epoch": 2.904078116025273,
"grad_norm": 0.5512604117393494,
"learning_rate": 1.2422472941095199e-06,
"loss": 0.9409,
"step": 15168
},
{
"epoch": 2.919203522879571,
"grad_norm": 4.254249572753906,
"learning_rate": 8.913992632535123e-07,
"loss": 0.7416,
"step": 15247
},
{
"epoch": 2.9270534175761056,
"eval_nli-pairs_loss": 0.8609779477119446,
"eval_nli-pairs_runtime": 12.2133,
"eval_nli-pairs_samples_per_second": 122.817,
"eval_nli-pairs_steps_per_second": 5.158,
"step": 15288
},
{
"epoch": 2.9270534175761056,
"eval_scitail-pairs-pos_loss": 0.42045190930366516,
"eval_scitail-pairs-pos_runtime": 15.4078,
"eval_scitail-pairs-pos_samples_per_second": 84.632,
"eval_scitail-pairs-pos_steps_per_second": 3.57,
"step": 15288
},
{
"epoch": 2.9270534175761056,
"eval_qnli-contrastive_loss": 0.7351691722869873,
"eval_qnli-contrastive_runtime": 4.7717,
"eval_qnli-contrastive_samples_per_second": 314.351,
"eval_qnli-contrastive_steps_per_second": 13.203,
"step": 15288
},
{
"epoch": 2.9343289297338693,
"grad_norm": 6.785557270050049,
"learning_rate": 5.971732615070724e-07,
"loss": 0.6059,
"step": 15326
},
{
"epoch": 2.949454336588168,
"grad_norm": 14.958471298217773,
"learning_rate": 3.6075010570289336e-07,
"loss": 0.6598,
"step": 15405
},
{
"epoch": 2.964579743442466,
"grad_norm": 0.34104809165000916,
"learning_rate": 1.8307863258672674e-07,
"loss": 0.5777,
"step": 15484
},
{
"epoch": 2.9797051502967644,
"grad_norm": 0.6522515416145325,
"learning_rate": 6.487189085208289e-08,
"loss": 0.8212,
"step": 15563
},
{
"epoch": 2.9948305571510625,
"grad_norm": 0.3607589304447174,
"learning_rate": 6.6042794628590194e-09,
"loss": 0.5638,
"step": 15642
}
],
"logging_steps": 79,
"max_steps": 15669,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 3918,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 30,
"trial_name": null,
"trial_params": null
}