bobox's picture
Training in progress, epoch 4, checkpoint
2a8c918 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 700,
"global_step": 22388,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.025013400035733428,
"grad_norm": 235.5828399658203,
"learning_rate": 4.824012864034305e-07,
"loss": 17.3783,
"step": 140
},
{
"epoch": 0.050026800071466856,
"grad_norm": 26.746055603027344,
"learning_rate": 9.82669287118099e-07,
"loss": 10.3883,
"step": 280
},
{
"epoch": 0.07504020010720029,
"grad_norm": 16.398082733154297,
"learning_rate": 1.4829372878327677e-06,
"loss": 6.239,
"step": 420
},
{
"epoch": 0.10005360014293371,
"grad_norm": 14.383550643920898,
"learning_rate": 1.983205288547436e-06,
"loss": 5.6986,
"step": 560
},
{
"epoch": 0.12506700017866715,
"grad_norm": 20.341243743896484,
"learning_rate": 2.483473289262105e-06,
"loss": 5.4609,
"step": 700
},
{
"epoch": 0.12506700017866715,
"eval_nli-pairs_loss": 5.221348285675049,
"eval_nli-pairs_runtime": 11.8701,
"eval_nli-pairs_samples_per_second": 126.368,
"eval_nli-pairs_steps_per_second": 5.307,
"step": 700
},
{
"epoch": 0.12506700017866715,
"eval_scitail-pairs-pos_loss": 3.934469699859619,
"eval_scitail-pairs-pos_runtime": 14.8075,
"eval_scitail-pairs-pos_samples_per_second": 88.063,
"eval_scitail-pairs-pos_steps_per_second": 3.714,
"step": 700
},
{
"epoch": 0.12506700017866715,
"eval_qnli-contrastive_loss": 5.33072566986084,
"eval_qnli-contrastive_runtime": 4.6845,
"eval_qnli-contrastive_samples_per_second": 320.208,
"eval_qnli-contrastive_steps_per_second": 13.449,
"step": 700
},
{
"epoch": 0.15008040021440058,
"grad_norm": 16.652421951293945,
"learning_rate": 2.983741289976774e-06,
"loss": 5.2036,
"step": 840
},
{
"epoch": 0.175093800250134,
"grad_norm": 19.31585121154785,
"learning_rate": 3.484009290691442e-06,
"loss": 4.9986,
"step": 980
},
{
"epoch": 0.20010720028586743,
"grad_norm": 32.62712860107422,
"learning_rate": 3.984277291406111e-06,
"loss": 4.4918,
"step": 1120
},
{
"epoch": 0.22512060032160086,
"grad_norm": 19.606719970703125,
"learning_rate": 4.48454529212078e-06,
"loss": 4.2202,
"step": 1260
},
{
"epoch": 0.2501340003573343,
"grad_norm": 27.97759246826172,
"learning_rate": 4.984813292835448e-06,
"loss": 4.0922,
"step": 1400
},
{
"epoch": 0.2501340003573343,
"eval_nli-pairs_loss": 4.116352081298828,
"eval_nli-pairs_runtime": 11.8302,
"eval_nli-pairs_samples_per_second": 126.795,
"eval_nli-pairs_steps_per_second": 5.325,
"step": 1400
},
{
"epoch": 0.2501340003573343,
"eval_scitail-pairs-pos_loss": 2.4735982418060303,
"eval_scitail-pairs-pos_runtime": 14.767,
"eval_scitail-pairs-pos_samples_per_second": 88.305,
"eval_scitail-pairs-pos_steps_per_second": 3.725,
"step": 1400
},
{
"epoch": 0.2501340003573343,
"eval_qnli-contrastive_loss": 4.378289222717285,
"eval_qnli-contrastive_runtime": 4.6744,
"eval_qnli-contrastive_samples_per_second": 320.895,
"eval_qnli-contrastive_steps_per_second": 13.478,
"step": 1400
},
{
"epoch": 0.27514740039306773,
"grad_norm": 21.981849670410156,
"learning_rate": 5.4850812935501166e-06,
"loss": 3.3935,
"step": 1540
},
{
"epoch": 0.30016080042880117,
"grad_norm": 26.165401458740234,
"learning_rate": 5.9853492942647854e-06,
"loss": 3.5528,
"step": 1680
},
{
"epoch": 0.32517420046453455,
"grad_norm": 18.916423797607422,
"learning_rate": 6.4856172949794535e-06,
"loss": 3.6013,
"step": 1820
},
{
"epoch": 0.350187600500268,
"grad_norm": 14.634994506835938,
"learning_rate": 6.985885295694122e-06,
"loss": 3.275,
"step": 1960
},
{
"epoch": 0.3752010005360014,
"grad_norm": 23.64468002319336,
"learning_rate": 7.4825799535465435e-06,
"loss": 3.2832,
"step": 2100
},
{
"epoch": 0.3752010005360014,
"eval_nli-pairs_loss": 3.1333088874816895,
"eval_nli-pairs_runtime": 11.8718,
"eval_nli-pairs_samples_per_second": 126.35,
"eval_nli-pairs_steps_per_second": 5.307,
"step": 2100
},
{
"epoch": 0.3752010005360014,
"eval_scitail-pairs-pos_loss": 1.8785525560379028,
"eval_scitail-pairs-pos_runtime": 14.7587,
"eval_scitail-pairs-pos_samples_per_second": 88.355,
"eval_scitail-pairs-pos_steps_per_second": 3.727,
"step": 2100
},
{
"epoch": 0.3752010005360014,
"eval_qnli-contrastive_loss": 3.401968479156494,
"eval_qnli-contrastive_runtime": 4.7313,
"eval_qnli-contrastive_samples_per_second": 317.034,
"eval_qnli-contrastive_steps_per_second": 13.315,
"step": 2100
},
{
"epoch": 0.40021440057173485,
"grad_norm": 19.62537384033203,
"learning_rate": 7.982847954261212e-06,
"loss": 3.0276,
"step": 2240
},
{
"epoch": 0.4252278006074683,
"grad_norm": 13.65882682800293,
"learning_rate": 8.48311595497588e-06,
"loss": 2.8936,
"step": 2380
},
{
"epoch": 0.4502412006432017,
"grad_norm": 18.643104553222656,
"learning_rate": 8.983383955690548e-06,
"loss": 2.7427,
"step": 2520
},
{
"epoch": 0.47525460067893516,
"grad_norm": 22.165441513061523,
"learning_rate": 9.483651956405217e-06,
"loss": 3.0206,
"step": 2660
},
{
"epoch": 0.5002680007146686,
"grad_norm": 7.004354953765869,
"learning_rate": 9.983919957119886e-06,
"loss": 2.575,
"step": 2800
},
{
"epoch": 0.5002680007146686,
"eval_nli-pairs_loss": 2.6767749786376953,
"eval_nli-pairs_runtime": 11.9879,
"eval_nli-pairs_samples_per_second": 125.127,
"eval_nli-pairs_steps_per_second": 5.255,
"step": 2800
},
{
"epoch": 0.5002680007146686,
"eval_scitail-pairs-pos_loss": 1.5111600160598755,
"eval_scitail-pairs-pos_runtime": 14.7412,
"eval_scitail-pairs-pos_samples_per_second": 88.46,
"eval_scitail-pairs-pos_steps_per_second": 3.731,
"step": 2800
},
{
"epoch": 0.5002680007146686,
"eval_qnli-contrastive_loss": 2.8863816261291504,
"eval_qnli-contrastive_runtime": 4.6516,
"eval_qnli-contrastive_samples_per_second": 322.468,
"eval_qnli-contrastive_steps_per_second": 13.544,
"step": 2800
},
{
"epoch": 0.525281400750402,
"grad_norm": 35.325260162353516,
"learning_rate": 1.0484187957834555e-05,
"loss": 2.5411,
"step": 2940
},
{
"epoch": 0.5502948007861355,
"grad_norm": 18.314611434936523,
"learning_rate": 1.0984455958549222e-05,
"loss": 2.5173,
"step": 3080
},
{
"epoch": 0.5753082008218688,
"grad_norm": 19.870922088623047,
"learning_rate": 1.1484723959263893e-05,
"loss": 2.5333,
"step": 3220
},
{
"epoch": 0.6003216008576023,
"grad_norm": 15.03135871887207,
"learning_rate": 1.198499195997856e-05,
"loss": 2.6302,
"step": 3360
},
{
"epoch": 0.6253350008933357,
"grad_norm": 22.288837432861328,
"learning_rate": 1.248525996069323e-05,
"loss": 2.5134,
"step": 3500
},
{
"epoch": 0.6253350008933357,
"eval_nli-pairs_loss": 2.3164055347442627,
"eval_nli-pairs_runtime": 11.9409,
"eval_nli-pairs_samples_per_second": 125.619,
"eval_nli-pairs_steps_per_second": 5.276,
"step": 3500
},
{
"epoch": 0.6253350008933357,
"eval_scitail-pairs-pos_loss": 1.329464316368103,
"eval_scitail-pairs-pos_runtime": 14.9802,
"eval_scitail-pairs-pos_samples_per_second": 87.048,
"eval_scitail-pairs-pos_steps_per_second": 3.672,
"step": 3500
},
{
"epoch": 0.6253350008933357,
"eval_qnli-contrastive_loss": 2.432037830352783,
"eval_qnli-contrastive_runtime": 4.7247,
"eval_qnli-contrastive_samples_per_second": 317.478,
"eval_qnli-contrastive_steps_per_second": 13.334,
"step": 3500
},
{
"epoch": 0.6503484009290691,
"grad_norm": 22.80077362060547,
"learning_rate": 1.2985527961407898e-05,
"loss": 2.4442,
"step": 3640
},
{
"epoch": 0.6753618009648026,
"grad_norm": 8.082416534423828,
"learning_rate": 1.3485795962122568e-05,
"loss": 2.2516,
"step": 3780
},
{
"epoch": 0.700375201000536,
"grad_norm": 19.319093704223633,
"learning_rate": 1.3986063962837235e-05,
"loss": 2.3152,
"step": 3920
},
{
"epoch": 0.7253886010362695,
"grad_norm": 22.325191497802734,
"learning_rate": 1.4486331963551904e-05,
"loss": 2.0466,
"step": 4060
},
{
"epoch": 0.7504020010720028,
"grad_norm": 23.437545776367188,
"learning_rate": 1.4983026621404324e-05,
"loss": 2.2551,
"step": 4200
},
{
"epoch": 0.7504020010720028,
"eval_nli-pairs_loss": 2.0880069732666016,
"eval_nli-pairs_runtime": 12.1083,
"eval_nli-pairs_samples_per_second": 123.882,
"eval_nli-pairs_steps_per_second": 5.203,
"step": 4200
},
{
"epoch": 0.7504020010720028,
"eval_scitail-pairs-pos_loss": 1.1470309495925903,
"eval_scitail-pairs-pos_runtime": 14.8408,
"eval_scitail-pairs-pos_samples_per_second": 87.866,
"eval_scitail-pairs-pos_steps_per_second": 3.706,
"step": 4200
},
{
"epoch": 0.7504020010720028,
"eval_qnli-contrastive_loss": 2.411231279373169,
"eval_qnli-contrastive_runtime": 4.7069,
"eval_qnli-contrastive_samples_per_second": 318.678,
"eval_qnli-contrastive_steps_per_second": 13.384,
"step": 4200
},
{
"epoch": 0.7754154011077363,
"grad_norm": 15.4782133102417,
"learning_rate": 1.5483294622118993e-05,
"loss": 2.1891,
"step": 4340
},
{
"epoch": 0.8004288011434697,
"grad_norm": 2.4403023719787598,
"learning_rate": 1.598356262283366e-05,
"loss": 2.0993,
"step": 4480
},
{
"epoch": 0.8254422011792032,
"grad_norm": 16.028566360473633,
"learning_rate": 1.648383062354833e-05,
"loss": 1.7904,
"step": 4620
},
{
"epoch": 0.8504556012149366,
"grad_norm": 11.724300384521484,
"learning_rate": 1.6984098624262998e-05,
"loss": 2.0401,
"step": 4760
},
{
"epoch": 0.87546900125067,
"grad_norm": 5.384641647338867,
"learning_rate": 1.7484366624977668e-05,
"loss": 1.8895,
"step": 4900
},
{
"epoch": 0.87546900125067,
"eval_nli-pairs_loss": 1.8439208269119263,
"eval_nli-pairs_runtime": 11.972,
"eval_nli-pairs_samples_per_second": 125.292,
"eval_nli-pairs_steps_per_second": 5.262,
"step": 4900
},
{
"epoch": 0.87546900125067,
"eval_scitail-pairs-pos_loss": 0.9787265658378601,
"eval_scitail-pairs-pos_runtime": 14.8626,
"eval_scitail-pairs-pos_samples_per_second": 87.737,
"eval_scitail-pairs-pos_steps_per_second": 3.701,
"step": 4900
},
{
"epoch": 0.87546900125067,
"eval_qnli-contrastive_loss": 1.91374933719635,
"eval_qnli-contrastive_runtime": 4.7148,
"eval_qnli-contrastive_samples_per_second": 318.144,
"eval_qnli-contrastive_steps_per_second": 13.362,
"step": 4900
},
{
"epoch": 0.9004824012864034,
"grad_norm": 14.152888298034668,
"learning_rate": 1.7984634625692335e-05,
"loss": 2.0058,
"step": 5040
},
{
"epoch": 0.9254958013221368,
"grad_norm": 10.892020225524902,
"learning_rate": 1.8484902626407003e-05,
"loss": 2.0126,
"step": 5180
},
{
"epoch": 0.9505092013578703,
"grad_norm": 1.89139986038208,
"learning_rate": 1.8985170627121673e-05,
"loss": 2.067,
"step": 5320
},
{
"epoch": 0.9755226013936037,
"grad_norm": 13.1935396194458,
"learning_rate": 1.948543862783634e-05,
"loss": 2.1337,
"step": 5460
},
{
"epoch": 1.0005360014293372,
"grad_norm": 14.684636116027832,
"learning_rate": 1.998570662855101e-05,
"loss": 1.8408,
"step": 5600
},
{
"epoch": 1.0005360014293372,
"eval_nli-pairs_loss": 1.7102497816085815,
"eval_nli-pairs_runtime": 13.5641,
"eval_nli-pairs_samples_per_second": 110.586,
"eval_nli-pairs_steps_per_second": 4.645,
"step": 5600
},
{
"epoch": 1.0005360014293372,
"eval_scitail-pairs-pos_loss": 0.920977771282196,
"eval_scitail-pairs-pos_runtime": 15.9776,
"eval_scitail-pairs-pos_samples_per_second": 81.614,
"eval_scitail-pairs-pos_steps_per_second": 3.442,
"step": 5600
},
{
"epoch": 1.0005360014293372,
"eval_qnli-contrastive_loss": 1.8655247688293457,
"eval_qnli-contrastive_runtime": 4.9388,
"eval_qnli-contrastive_samples_per_second": 303.717,
"eval_qnli-contrastive_steps_per_second": 12.756,
"step": 5600
},
{
"epoch": 1.0255494014650706,
"grad_norm": 28.780393600463867,
"learning_rate": 1.995450879640972e-05,
"loss": 1.7539,
"step": 5740
},
{
"epoch": 1.050562801500804,
"grad_norm": 1.414655089378357,
"learning_rate": 1.9813086898560906e-05,
"loss": 1.9194,
"step": 5880
},
{
"epoch": 1.0755762015365375,
"grad_norm": 1.5645997524261475,
"learning_rate": 1.957705835379269e-05,
"loss": 1.7071,
"step": 6020
},
{
"epoch": 1.100589601572271,
"grad_norm": 4.314450263977051,
"learning_rate": 1.9248698681462904e-05,
"loss": 1.7053,
"step": 6160
},
{
"epoch": 1.1256030016080043,
"grad_norm": 19.960533142089844,
"learning_rate": 1.8831173552873946e-05,
"loss": 1.8023,
"step": 6300
},
{
"epoch": 1.1256030016080043,
"eval_nli-pairs_loss": 1.5202544927597046,
"eval_nli-pairs_runtime": 12.0076,
"eval_nli-pairs_samples_per_second": 124.92,
"eval_nli-pairs_steps_per_second": 5.247,
"step": 6300
},
{
"epoch": 1.1256030016080043,
"eval_scitail-pairs-pos_loss": 0.7953532934188843,
"eval_scitail-pairs-pos_runtime": 14.8864,
"eval_scitail-pairs-pos_samples_per_second": 87.597,
"eval_scitail-pairs-pos_steps_per_second": 3.695,
"step": 6300
},
{
"epoch": 1.1256030016080043,
"eval_qnli-contrastive_loss": 1.5952976942062378,
"eval_qnli-contrastive_runtime": 4.7602,
"eval_qnli-contrastive_samples_per_second": 315.111,
"eval_qnli-contrastive_steps_per_second": 13.235,
"step": 6300
},
{
"epoch": 1.1506164016437377,
"grad_norm": 29.618913650512695,
"learning_rate": 1.8328508271462822e-05,
"loss": 1.7448,
"step": 6440
},
{
"epoch": 1.175629801679471,
"grad_norm": 23.058677673339844,
"learning_rate": 1.7745548965393756e-05,
"loss": 1.7874,
"step": 6580
},
{
"epoch": 1.2006432017152047,
"grad_norm": 13.168740272521973,
"learning_rate": 1.7087915866690346e-05,
"loss": 1.7441,
"step": 6720
},
{
"epoch": 1.225656601750938,
"grad_norm": 27.45108985900879,
"learning_rate": 1.6361949127336846e-05,
"loss": 1.4535,
"step": 6860
},
{
"epoch": 1.2506700017866714,
"grad_norm": 17.073558807373047,
"learning_rate": 1.557464769472821e-05,
"loss": 1.7103,
"step": 7000
},
{
"epoch": 1.2506700017866714,
"eval_nli-pairs_loss": 1.4381694793701172,
"eval_nli-pairs_runtime": 11.9075,
"eval_nli-pairs_samples_per_second": 125.971,
"eval_nli-pairs_steps_per_second": 5.291,
"step": 7000
},
{
"epoch": 1.2506700017866714,
"eval_scitail-pairs-pos_loss": 0.7307367920875549,
"eval_scitail-pairs-pos_runtime": 14.9432,
"eval_scitail-pairs-pos_samples_per_second": 87.264,
"eval_scitail-pairs-pos_steps_per_second": 3.681,
"step": 7000
},
{
"epoch": 1.2506700017866714,
"eval_qnli-contrastive_loss": 1.756566047668457,
"eval_qnli-contrastive_runtime": 4.6972,
"eval_qnli-contrastive_samples_per_second": 319.341,
"eval_qnli-contrastive_steps_per_second": 13.412,
"step": 7000
},
{
"epoch": 1.2756834018224048,
"grad_norm": 27.894006729125977,
"learning_rate": 1.4733601835762515e-05,
"loss": 1.4028,
"step": 7140
},
{
"epoch": 1.3006968018581384,
"grad_norm": 12.896081924438477,
"learning_rate": 1.3846919960101735e-05,
"loss": 1.5999,
"step": 7280
},
{
"epoch": 1.3257102018938718,
"grad_norm": 13.122183799743652,
"learning_rate": 1.2923150448088129e-05,
"loss": 1.7167,
"step": 7420
},
{
"epoch": 1.3507236019296052,
"grad_norm": 19.48947525024414,
"learning_rate": 1.1971199236962578e-05,
"loss": 1.4903,
"step": 7560
},
{
"epoch": 1.3757370019653385,
"grad_norm": 4.030813694000244,
"learning_rate": 1.1000243959925168e-05,
"loss": 1.5666,
"step": 7700
},
{
"epoch": 1.3757370019653385,
"eval_nli-pairs_loss": 1.353498935699463,
"eval_nli-pairs_runtime": 12.144,
"eval_nli-pairs_samples_per_second": 123.518,
"eval_nli-pairs_steps_per_second": 5.188,
"step": 7700
},
{
"epoch": 1.3757370019653385,
"eval_scitail-pairs-pos_loss": 0.6812191605567932,
"eval_scitail-pairs-pos_runtime": 15.227,
"eval_scitail-pairs-pos_samples_per_second": 85.637,
"eval_scitail-pairs-pos_steps_per_second": 3.612,
"step": 7700
},
{
"epoch": 1.3757370019653385,
"eval_qnli-contrastive_loss": 1.5941846370697021,
"eval_qnli-contrastive_runtime": 4.7731,
"eval_qnli-contrastive_samples_per_second": 314.263,
"eval_qnli-contrastive_steps_per_second": 13.199,
"step": 7700
},
{
"epoch": 1.400750402001072,
"grad_norm": 1.3353348970413208,
"learning_rate": 1.0019645465811612e-05,
"loss": 1.4289,
"step": 7840
},
{
"epoch": 1.4257638020368053,
"grad_norm": 59.30118942260742,
"learning_rate": 9.038857572412504e-06,
"loss": 1.4531,
"step": 7980
},
{
"epoch": 1.450777202072539,
"grad_norm": 14.574576377868652,
"learning_rate": 8.067335923491407e-06,
"loss": 1.2831,
"step": 8120
},
{
"epoch": 1.4757906021082723,
"grad_norm": 14.356673240661621,
"learning_rate": 7.114446828198969e-06,
"loss": 1.5913,
"step": 8260
},
{
"epoch": 1.5008040021440057,
"grad_norm": 11.328471183776855,
"learning_rate": 6.189376961749974e-06,
"loss": 1.2771,
"step": 8400
},
{
"epoch": 1.5008040021440057,
"eval_nli-pairs_loss": 1.2932122945785522,
"eval_nli-pairs_runtime": 11.9045,
"eval_nli-pairs_samples_per_second": 126.003,
"eval_nli-pairs_steps_per_second": 5.292,
"step": 8400
},
{
"epoch": 1.5008040021440057,
"eval_scitail-pairs-pos_loss": 0.6547886729240417,
"eval_scitail-pairs-pos_runtime": 14.9249,
"eval_scitail-pairs-pos_samples_per_second": 87.371,
"eval_scitail-pairs-pos_steps_per_second": 3.685,
"step": 8400
},
{
"epoch": 1.5008040021440057,
"eval_qnli-contrastive_loss": 1.6052364110946655,
"eval_qnli-contrastive_runtime": 4.7149,
"eval_qnli-contrastive_samples_per_second": 318.137,
"eval_qnli-contrastive_steps_per_second": 13.362,
"step": 8400
},
{
"epoch": 1.5258174021797393,
"grad_norm": 12.197046279907227,
"learning_rate": 5.301044797927004e-06,
"loss": 1.2718,
"step": 8540
},
{
"epoch": 1.5508308022154726,
"grad_norm": 16.694225311279297,
"learning_rate": 4.4638562023418675e-06,
"loss": 1.374,
"step": 8680
},
{
"epoch": 1.575844202251206,
"grad_norm": 9.635122299194336,
"learning_rate": 3.673846274432512e-06,
"loss": 1.4011,
"step": 8820
},
{
"epoch": 1.6008576022869394,
"grad_norm": 13.676748275756836,
"learning_rate": 2.9448259411389292e-06,
"loss": 1.571,
"step": 8960
},
{
"epoch": 1.6258710023226728,
"grad_norm": 3.530120849609375,
"learning_rate": 2.283823589143308e-06,
"loss": 1.54,
"step": 9100
},
{
"epoch": 1.6258710023226728,
"eval_nli-pairs_loss": 1.2734112739562988,
"eval_nli-pairs_runtime": 11.8925,
"eval_nli-pairs_samples_per_second": 126.13,
"eval_nli-pairs_steps_per_second": 5.297,
"step": 9100
},
{
"epoch": 1.6258710023226728,
"eval_scitail-pairs-pos_loss": 0.638645350933075,
"eval_scitail-pairs-pos_runtime": 14.9358,
"eval_scitail-pairs-pos_samples_per_second": 87.307,
"eval_scitail-pairs-pos_steps_per_second": 3.682,
"step": 9100
},
{
"epoch": 1.6258710023226728,
"eval_qnli-contrastive_loss": 1.5299873352050781,
"eval_qnli-contrastive_runtime": 4.6925,
"eval_qnli-contrastive_samples_per_second": 319.657,
"eval_qnli-contrastive_steps_per_second": 13.426,
"step": 9100
},
{
"epoch": 1.6508844023584062,
"grad_norm": 0.7482818961143494,
"learning_rate": 1.6972118529463478e-06,
"loss": 1.3765,
"step": 9240
},
{
"epoch": 1.6758978023941398,
"grad_norm": 10.352273941040039,
"learning_rate": 1.1906461771577705e-06,
"loss": 1.3131,
"step": 9380
},
{
"epoch": 1.7009112024298731,
"grad_norm": 8.817509651184082,
"learning_rate": 7.690102931177922e-07,
"loss": 1.3459,
"step": 9520
},
{
"epoch": 1.7259246024656065,
"grad_norm": 11.525378227233887,
"learning_rate": 4.36369135502146e-07,
"loss": 1.1947,
"step": 9660
},
{
"epoch": 1.7509380025013401,
"grad_norm": 53.216773986816406,
"learning_rate": 1.9592965283525944e-07,
"loss": 1.4422,
"step": 9800
},
{
"epoch": 1.7509380025013401,
"eval_nli-pairs_loss": 1.2626315355300903,
"eval_nli-pairs_runtime": 12.0334,
"eval_nli-pairs_samples_per_second": 124.653,
"eval_nli-pairs_steps_per_second": 5.235,
"step": 9800
},
{
"epoch": 1.7509380025013401,
"eval_scitail-pairs-pos_loss": 0.6336750388145447,
"eval_scitail-pairs-pos_runtime": 14.9549,
"eval_scitail-pairs-pos_samples_per_second": 87.195,
"eval_scitail-pairs-pos_steps_per_second": 3.678,
"step": 9800
},
{
"epoch": 1.7509380025013401,
"eval_qnli-contrastive_loss": 1.5141370296478271,
"eval_qnli-contrastive_runtime": 4.7047,
"eval_qnli-contrastive_samples_per_second": 318.832,
"eval_qnli-contrastive_steps_per_second": 13.391,
"step": 9800
},
{
"epoch": 1.7759514025370735,
"grad_norm": 10.688042640686035,
"learning_rate": 5.000988973217102e-08,
"loss": 1.3539,
"step": 9940
},
{
"epoch": 1.8009648025728069,
"grad_norm": 5.067196846008301,
"learning_rate": 1.6638943132196717e-11,
"loss": 1.3033,
"step": 10080
},
{
"epoch": 1.8259782026085403,
"grad_norm": 14.930255889892578,
"learning_rate": 1.995356812134552e-05,
"loss": 1.1106,
"step": 10220
},
{
"epoch": 1.8509916026442736,
"grad_norm": 9.03366470336914,
"learning_rate": 1.981119187419803e-05,
"loss": 1.3733,
"step": 10360
},
{
"epoch": 1.876005002680007,
"grad_norm": 11.640127182006836,
"learning_rate": 1.957422724980536e-05,
"loss": 1.2665,
"step": 10500
},
{
"epoch": 1.876005002680007,
"eval_nli-pairs_loss": 1.2357267141342163,
"eval_nli-pairs_runtime": 11.9459,
"eval_nli-pairs_samples_per_second": 125.566,
"eval_nli-pairs_steps_per_second": 5.274,
"step": 10500
},
{
"epoch": 1.876005002680007,
"eval_scitail-pairs-pos_loss": 0.6095670461654663,
"eval_scitail-pairs-pos_runtime": 14.9199,
"eval_scitail-pairs-pos_samples_per_second": 87.4,
"eval_scitail-pairs-pos_steps_per_second": 3.686,
"step": 10500
},
{
"epoch": 1.876005002680007,
"eval_qnli-contrastive_loss": 1.4496928453445435,
"eval_qnli-contrastive_runtime": 4.7387,
"eval_qnli-contrastive_samples_per_second": 316.542,
"eval_qnli-contrastive_steps_per_second": 13.295,
"step": 10500
},
{
"epoch": 1.9010184027157406,
"grad_norm": 9.566073417663574,
"learning_rate": 1.9247631052082938e-05,
"loss": 1.351,
"step": 10640
},
{
"epoch": 1.926031802751474,
"grad_norm": 4.9808878898620605,
"learning_rate": 1.8829856531045453e-05,
"loss": 1.3464,
"step": 10780
},
{
"epoch": 1.9510452027872074,
"grad_norm": 12.794534683227539,
"learning_rate": 1.833083761701665e-05,
"loss": 1.1087,
"step": 10920
},
{
"epoch": 1.976058602822941,
"grad_norm": 0.7860898375511169,
"learning_rate": 1.774821097109875e-05,
"loss": 1.1635,
"step": 11060
},
{
"epoch": 2.0010720028586744,
"grad_norm": 40.53791427612305,
"learning_rate": 1.7090884868508633e-05,
"loss": 1.1354,
"step": 11200
},
{
"epoch": 2.0010720028586744,
"eval_nli-pairs_loss": 1.1946154832839966,
"eval_nli-pairs_runtime": 13.3184,
"eval_nli-pairs_samples_per_second": 112.626,
"eval_nli-pairs_steps_per_second": 4.73,
"step": 11200
},
{
"epoch": 2.0010720028586744,
"eval_scitail-pairs-pos_loss": 0.6180398464202881,
"eval_scitail-pairs-pos_runtime": 15.804,
"eval_scitail-pairs-pos_samples_per_second": 82.511,
"eval_scitail-pairs-pos_steps_per_second": 3.48,
"step": 11200
},
{
"epoch": 2.0010720028586744,
"eval_qnli-contrastive_loss": 1.4097257852554321,
"eval_qnli-contrastive_runtime": 4.9229,
"eval_qnli-contrastive_samples_per_second": 304.698,
"eval_qnli-contrastive_steps_per_second": 12.797,
"step": 11200
},
{
"epoch": 2.0260854028944077,
"grad_norm": 6.641210556030273,
"learning_rate": 1.636519650152244e-05,
"loss": 1.1174,
"step": 11340
},
{
"epoch": 2.051098802930141,
"grad_norm": 14.658684730529785,
"learning_rate": 1.5578142133784694e-05,
"loss": 1.3446,
"step": 11480
},
{
"epoch": 2.0761122029658745,
"grad_norm": 8.882003784179688,
"learning_rate": 1.4737309650274368e-05,
"loss": 1.1409,
"step": 11620
},
{
"epoch": 2.101125603001608,
"grad_norm": 8.679718971252441,
"learning_rate": 1.3850805403529464e-05,
"loss": 1.1902,
"step": 11760
},
{
"epoch": 2.1261390030373413,
"grad_norm": 7.319563865661621,
"learning_rate": 1.2927176061395823e-05,
"loss": 1.2951,
"step": 11900
},
{
"epoch": 2.1261390030373413,
"eval_nli-pairs_loss": 1.140305519104004,
"eval_nli-pairs_runtime": 11.9166,
"eval_nli-pairs_samples_per_second": 125.875,
"eval_nli-pairs_steps_per_second": 5.287,
"step": 11900
},
{
"epoch": 2.1261390030373413,
"eval_scitail-pairs-pos_loss": 0.5850992798805237,
"eval_scitail-pairs-pos_runtime": 15.0095,
"eval_scitail-pairs-pos_samples_per_second": 86.878,
"eval_scitail-pairs-pos_steps_per_second": 3.664,
"step": 11900
},
{
"epoch": 2.1261390030373413,
"eval_qnli-contrastive_loss": 1.2628742456436157,
"eval_qnli-contrastive_runtime": 4.7346,
"eval_qnli-contrastive_samples_per_second": 316.816,
"eval_qnli-contrastive_steps_per_second": 13.306,
"step": 11900
},
{
"epoch": 2.151152403073075,
"grad_norm": 0.9307207465171814,
"learning_rate": 1.1975326209755442e-05,
"loss": 1.2077,
"step": 12040
},
{
"epoch": 2.1761658031088085,
"grad_norm": 6.99313497543335,
"learning_rate": 1.1004432504615264e-05,
"loss": 1.2858,
"step": 12180
},
{
"epoch": 2.201179203144542,
"grad_norm": 0.7553257942199707,
"learning_rate": 1.002385520120464e-05,
"loss": 1.267,
"step": 12320
},
{
"epoch": 2.2261926031802752,
"grad_norm": 11.953374862670898,
"learning_rate": 9.04304791301748e-06,
"loss": 1.0026,
"step": 12460
},
{
"epoch": 2.2512060032160086,
"grad_norm": 0.7482788562774658,
"learning_rate": 8.071466470799878e-06,
"loss": 1.2523,
"step": 12600
},
{
"epoch": 2.2512060032160086,
"eval_nli-pairs_loss": 1.1073120832443237,
"eval_nli-pairs_runtime": 11.9071,
"eval_nli-pairs_samples_per_second": 125.975,
"eval_nli-pairs_steps_per_second": 5.291,
"step": 12600
},
{
"epoch": 2.2512060032160086,
"eval_scitail-pairs-pos_loss": 0.553435206413269,
"eval_scitail-pairs-pos_runtime": 14.8588,
"eval_scitail-pairs-pos_samples_per_second": 87.759,
"eval_scitail-pairs-pos_steps_per_second": 3.702,
"step": 12600
},
{
"epoch": 2.2512060032160086,
"eval_qnli-contrastive_loss": 1.3819462060928345,
"eval_qnli-contrastive_runtime": 4.7186,
"eval_qnli-contrastive_samples_per_second": 317.888,
"eval_qnli-contrastive_steps_per_second": 13.351,
"step": 12600
},
{
"epoch": 2.276219403251742,
"grad_norm": 9.73590087890625,
"learning_rate": 7.118477760161566e-06,
"loss": 1.0009,
"step": 12740
},
{
"epoch": 2.3012328032874754,
"grad_norm": 8.564470291137695,
"learning_rate": 6.193269416695461e-06,
"loss": 1.1822,
"step": 12880
},
{
"epoch": 2.3262462033232087,
"grad_norm": 17.126731872558594,
"learning_rate": 5.304761249222376e-06,
"loss": 1.3998,
"step": 13020
},
{
"epoch": 2.351259603358942,
"grad_norm": 0.7993360757827759,
"learning_rate": 4.461519245117542e-06,
"loss": 1.0833,
"step": 13160
},
{
"epoch": 2.376273003394676,
"grad_norm": 14.322105407714844,
"learning_rate": 3.6716729867819545e-06,
"loss": 1.1998,
"step": 13300
},
{
"epoch": 2.376273003394676,
"eval_nli-pairs_loss": 1.0725221633911133,
"eval_nli-pairs_runtime": 12.0178,
"eval_nli-pairs_samples_per_second": 124.815,
"eval_nli-pairs_steps_per_second": 5.242,
"step": 13300
},
{
"epoch": 2.376273003394676,
"eval_scitail-pairs-pos_loss": 0.5349867343902588,
"eval_scitail-pairs-pos_runtime": 14.9892,
"eval_scitail-pairs-pos_samples_per_second": 86.996,
"eval_scitail-pairs-pos_steps_per_second": 3.669,
"step": 13300
},
{
"epoch": 2.376273003394676,
"eval_qnli-contrastive_loss": 1.2879091501235962,
"eval_qnli-contrastive_runtime": 4.7466,
"eval_qnli-contrastive_samples_per_second": 316.015,
"eval_qnli-contrastive_steps_per_second": 13.273,
"step": 13300
},
{
"epoch": 2.4012864034304093,
"grad_norm": 9.142762184143066,
"learning_rate": 2.942837275435191e-06,
"loss": 1.0898,
"step": 13440
},
{
"epoch": 2.4262998034661427,
"grad_norm": 10.551756858825684,
"learning_rate": 2.282038717844137e-06,
"loss": 1.1658,
"step": 13580
},
{
"epoch": 2.451313203501876,
"grad_norm": 5.1452765464782715,
"learning_rate": 1.6956479837551532e-06,
"loss": 1.0127,
"step": 13720
},
{
"epoch": 2.4763266035376095,
"grad_norm": 10.926447868347168,
"learning_rate": 1.1893183871264458e-06,
"loss": 1.2826,
"step": 13860
},
{
"epoch": 2.501340003573343,
"grad_norm": 8.374099731445312,
"learning_rate": 7.679313832908975e-07,
"loss": 1.0243,
"step": 14000
},
{
"epoch": 2.501340003573343,
"eval_nli-pairs_loss": 1.0618858337402344,
"eval_nli-pairs_runtime": 11.9457,
"eval_nli-pairs_samples_per_second": 125.568,
"eval_nli-pairs_steps_per_second": 5.274,
"step": 14000
},
{
"epoch": 2.501340003573343,
"eval_scitail-pairs-pos_loss": 0.5268865823745728,
"eval_scitail-pairs-pos_runtime": 14.9206,
"eval_scitail-pairs-pos_samples_per_second": 87.396,
"eval_scitail-pairs-pos_steps_per_second": 3.686,
"step": 14000
},
{
"epoch": 2.501340003573343,
"eval_qnli-contrastive_loss": 1.311318278312683,
"eval_qnli-contrastive_runtime": 4.7255,
"eval_qnli-contrastive_samples_per_second": 317.427,
"eval_qnli-contrastive_steps_per_second": 13.332,
"step": 14000
},
{
"epoch": 2.5263534036090762,
"grad_norm": 14.999399185180664,
"learning_rate": 4.3554950750376746e-07,
"loss": 1.0344,
"step": 14140
},
{
"epoch": 2.5513668036448096,
"grad_norm": 7.31856107711792,
"learning_rate": 1.953772085883532e-07,
"loss": 1.1904,
"step": 14280
},
{
"epoch": 2.576380203680543,
"grad_norm": 13.470648765563965,
"learning_rate": 4.9729955277194594e-08,
"loss": 1.2133,
"step": 14420
},
{
"epoch": 2.601393603716277,
"grad_norm": 11.626204490661621,
"learning_rate": 1.1913090399717774e-11,
"loss": 1.3826,
"step": 14560
},
{
"epoch": 2.6264070037520098,
"grad_norm": 3.0835983753204346,
"learning_rate": 1.99532975930347e-05,
"loss": 1.3857,
"step": 14700
},
{
"epoch": 2.6264070037520098,
"eval_nli-pairs_loss": 1.0945470333099365,
"eval_nli-pairs_runtime": 11.9625,
"eval_nli-pairs_samples_per_second": 125.392,
"eval_nli-pairs_steps_per_second": 5.266,
"step": 14700
},
{
"epoch": 2.6264070037520098,
"eval_scitail-pairs-pos_loss": 0.543086051940918,
"eval_scitail-pairs-pos_runtime": 14.9212,
"eval_scitail-pairs-pos_samples_per_second": 87.392,
"eval_scitail-pairs-pos_steps_per_second": 3.686,
"step": 14700
},
{
"epoch": 2.6264070037520098,
"eval_qnli-contrastive_loss": 1.3532981872558594,
"eval_qnli-contrastive_runtime": 4.7202,
"eval_qnli-contrastive_samples_per_second": 317.78,
"eval_qnli-contrastive_steps_per_second": 13.347,
"step": 14700
},
{
"epoch": 2.6514204037877436,
"grad_norm": 10.123917579650879,
"learning_rate": 1.98106486998506e-05,
"loss": 1.2257,
"step": 14840
},
{
"epoch": 2.676433803823477,
"grad_norm": 0.7845281362533569,
"learning_rate": 1.957341666609194e-05,
"loss": 1.2372,
"step": 14980
},
{
"epoch": 2.7014472038592103,
"grad_norm": 8.572671890258789,
"learning_rate": 1.9243888613791533e-05,
"loss": 1.1853,
"step": 15120
},
{
"epoch": 2.7264606038949437,
"grad_norm": 1.009204626083374,
"learning_rate": 1.8828538813739946e-05,
"loss": 1.0465,
"step": 15260
},
{
"epoch": 2.751474003930677,
"grad_norm": 7.785921573638916,
"learning_rate": 1.832540018150201e-05,
"loss": 1.3469,
"step": 15400
},
{
"epoch": 2.751474003930677,
"eval_nli-pairs_loss": 1.0669987201690674,
"eval_nli-pairs_runtime": 11.8504,
"eval_nli-pairs_samples_per_second": 126.578,
"eval_nli-pairs_steps_per_second": 5.316,
"step": 15400
},
{
"epoch": 2.751474003930677,
"eval_scitail-pairs-pos_loss": 0.5464032888412476,
"eval_scitail-pairs-pos_runtime": 14.8493,
"eval_scitail-pairs-pos_samples_per_second": 87.815,
"eval_scitail-pairs-pos_steps_per_second": 3.704,
"step": 15400
},
{
"epoch": 2.751474003930677,
"eval_qnli-contrastive_loss": 1.5209625959396362,
"eval_qnli-contrastive_runtime": 4.7597,
"eval_qnli-contrastive_samples_per_second": 315.149,
"eval_qnli-contrastive_steps_per_second": 13.236,
"step": 15400
},
{
"epoch": 2.7764874039664105,
"grad_norm": 0.9812193512916565,
"learning_rate": 1.774199748928214e-05,
"loss": 1.1843,
"step": 15540
},
{
"epoch": 2.801500804002144,
"grad_norm": 5.74447774887085,
"learning_rate": 1.7083955243729978e-05,
"loss": 1.113,
"step": 15680
},
{
"epoch": 2.8265142040378777,
"grad_norm": 12.007781982421875,
"learning_rate": 1.6357617541359384e-05,
"loss": 0.8078,
"step": 15820
},
{
"epoch": 2.8515276040736106,
"grad_norm": 3.299994468688965,
"learning_rate": 1.556998690596969e-05,
"loss": 1.0437,
"step": 15960
},
{
"epoch": 2.8765410041093444,
"grad_norm": 0.8269862532615662,
"learning_rate": 1.4728656778258573e-05,
"loss": 0.9093,
"step": 16100
},
{
"epoch": 2.8765410041093444,
"eval_nli-pairs_loss": 0.9908406138420105,
"eval_nli-pairs_runtime": 12.0455,
"eval_nli-pairs_samples_per_second": 124.527,
"eval_nli-pairs_steps_per_second": 5.23,
"step": 16100
},
{
"epoch": 2.8765410041093444,
"eval_scitail-pairs-pos_loss": 0.49377763271331787,
"eval_scitail-pairs-pos_runtime": 15.2422,
"eval_scitail-pairs-pos_samples_per_second": 85.552,
"eval_scitail-pairs-pos_steps_per_second": 3.608,
"step": 16100
},
{
"epoch": 2.8765410041093444,
"eval_qnli-contrastive_loss": 1.1596204042434692,
"eval_qnli-contrastive_runtime": 4.7624,
"eval_qnli-contrastive_samples_per_second": 314.967,
"eval_qnli-contrastive_steps_per_second": 13.229,
"step": 16100
},
{
"epoch": 2.901554404145078,
"grad_norm": 4.0750861167907715,
"learning_rate": 1.3841738308484636e-05,
"loss": 1.044,
"step": 16240
},
{
"epoch": 2.926567804180811,
"grad_norm": 7.754133224487305,
"learning_rate": 1.291778215796206e-05,
"loss": 1.0227,
"step": 16380
},
{
"epoch": 2.9515812042165446,
"grad_norm": 37.815181732177734,
"learning_rate": 1.1965696063288423e-05,
"loss": 0.8159,
"step": 16520
},
{
"epoch": 2.976594604252278,
"grad_norm": 16.427824020385742,
"learning_rate": 1.0994658958057889e-05,
"loss": 0.8426,
"step": 16660
},
{
"epoch": 3.0016080042880113,
"grad_norm": 0.6402806043624878,
"learning_rate": 1.0014032480000764e-05,
"loss": 0.7955,
"step": 16800
},
{
"epoch": 3.0016080042880113,
"eval_nli-pairs_loss": 0.9680945873260498,
"eval_nli-pairs_runtime": 13.331,
"eval_nli-pairs_samples_per_second": 112.519,
"eval_nli-pairs_steps_per_second": 4.726,
"step": 16800
},
{
"epoch": 3.0016080042880113,
"eval_scitail-pairs-pos_loss": 0.49118393659591675,
"eval_scitail-pairs-pos_runtime": 15.2998,
"eval_scitail-pairs-pos_samples_per_second": 85.23,
"eval_scitail-pairs-pos_steps_per_second": 3.595,
"step": 16800
},
{
"epoch": 3.0016080042880113,
"eval_qnli-contrastive_loss": 1.1894794702529907,
"eval_qnli-contrastive_runtime": 4.8432,
"eval_qnli-contrastive_samples_per_second": 309.715,
"eval_qnli-contrastive_steps_per_second": 13.008,
"step": 16800
},
{
"epoch": 3.0266214043237447,
"grad_norm": 3.155766010284424,
"learning_rate": 9.03327071669702e-06,
"loss": 0.856,
"step": 16940
},
{
"epoch": 3.051634804359478,
"grad_norm": 11.008296966552734,
"learning_rate": 8.061829059993542e-06,
"loss": 1.0754,
"step": 17080
},
{
"epoch": 3.076648204395212,
"grad_norm": 4.382720947265625,
"learning_rate": 7.109073047846788e-06,
"loss": 0.9151,
"step": 17220
},
{
"epoch": 3.1016616044309453,
"grad_norm": 2.755722761154175,
"learning_rate": 6.184188072434878e-06,
"loss": 1.0051,
"step": 17360
},
{
"epoch": 3.1266750044666787,
"grad_norm": 2.4547111988067627,
"learning_rate": 5.296090825030854e-06,
"loss": 1.0075,
"step": 17500
},
{
"epoch": 3.1266750044666787,
"eval_nli-pairs_loss": 0.9583492875099182,
"eval_nli-pairs_runtime": 12.1773,
"eval_nli-pairs_samples_per_second": 123.18,
"eval_nli-pairs_steps_per_second": 5.174,
"step": 17500
},
{
"epoch": 3.1266750044666787,
"eval_scitail-pairs-pos_loss": 0.485266774892807,
"eval_scitail-pairs-pos_runtime": 14.9222,
"eval_scitail-pairs-pos_samples_per_second": 87.387,
"eval_scitail-pairs-pos_steps_per_second": 3.686,
"step": 17500
},
{
"epoch": 3.1266750044666787,
"eval_qnli-contrastive_loss": 1.0658234357833862,
"eval_qnli-contrastive_runtime": 4.7681,
"eval_qnli-contrastive_samples_per_second": 314.592,
"eval_qnli-contrastive_steps_per_second": 13.213,
"step": 17500
},
{
"epoch": 3.151688404502412,
"grad_norm": 19.061325073242188,
"learning_rate": 4.453343331385006e-06,
"loss": 0.9909,
"step": 17640
},
{
"epoch": 3.1767018045381454,
"grad_norm": 17.016021728515625,
"learning_rate": 3.6640704063896858e-06,
"loss": 1.029,
"step": 17780
},
{
"epoch": 3.201715204573879,
"grad_norm": 4.147863864898682,
"learning_rate": 2.9358813238350816e-06,
"loss": 1.0292,
"step": 17920
},
{
"epoch": 3.226728604609612,
"grad_norm": 27.60422706604004,
"learning_rate": 2.275796456427173e-06,
"loss": 0.8334,
"step": 18060
},
{
"epoch": 3.2517420046453456,
"grad_norm": 0.7800289392471313,
"learning_rate": 1.6901795933215137e-06,
"loss": 1.0119,
"step": 18200
},
{
"epoch": 3.2517420046453456,
"eval_nli-pairs_loss": 0.9484548568725586,
"eval_nli-pairs_runtime": 12.0697,
"eval_nli-pairs_samples_per_second": 124.279,
"eval_nli-pairs_steps_per_second": 5.22,
"step": 18200
},
{
"epoch": 3.2517420046453456,
"eval_scitail-pairs-pos_loss": 0.4673975706100464,
"eval_scitail-pairs-pos_runtime": 15.0509,
"eval_scitail-pairs-pos_samples_per_second": 86.639,
"eval_scitail-pairs-pos_steps_per_second": 3.654,
"step": 18200
},
{
"epoch": 3.2517420046453456,
"eval_qnli-contrastive_loss": 1.1171668767929077,
"eval_qnli-contrastive_runtime": 4.7871,
"eval_qnli-contrastive_samples_per_second": 313.345,
"eval_qnli-contrastive_steps_per_second": 13.16,
"step": 18200
},
{
"epoch": 3.2767554046810794,
"grad_norm": 16.64696502685547,
"learning_rate": 1.1846765876905709e-06,
"loss": 0.8582,
"step": 18340
},
{
"epoch": 3.3017688047168123,
"grad_norm": 16.13783073425293,
"learning_rate": 7.668532006209551e-07,
"loss": 1.0397,
"step": 18480
},
{
"epoch": 3.326782204752546,
"grad_norm": 3.76619553565979,
"learning_rate": 4.347306328421508e-07,
"loss": 1.1988,
"step": 18620
},
{
"epoch": 3.3517956047882795,
"grad_norm": 10.401665687561035,
"learning_rate": 1.948255365952012e-07,
"loss": 0.9432,
"step": 18760
},
{
"epoch": 3.376809004824013,
"grad_norm": 2.400106191635132,
"learning_rate": 4.945080454776929e-08,
"loss": 1.0573,
"step": 18900
},
{
"epoch": 3.376809004824013,
"eval_nli-pairs_loss": 0.9437180757522583,
"eval_nli-pairs_runtime": 12.0974,
"eval_nli-pairs_samples_per_second": 123.993,
"eval_nli-pairs_steps_per_second": 5.208,
"step": 18900
},
{
"epoch": 3.376809004824013,
"eval_scitail-pairs-pos_loss": 0.46788787841796875,
"eval_scitail-pairs-pos_runtime": 15.1516,
"eval_scitail-pairs-pos_samples_per_second": 86.063,
"eval_scitail-pairs-pos_steps_per_second": 3.63,
"step": 18900
},
{
"epoch": 3.376809004824013,
"eval_qnli-contrastive_loss": 1.081482172012329,
"eval_qnli-contrastive_runtime": 4.8096,
"eval_qnli-contrastive_samples_per_second": 311.875,
"eval_qnli-contrastive_steps_per_second": 13.099,
"step": 18900
},
{
"epoch": 3.4018224048597463,
"grad_norm": 25.33026695251465,
"learning_rate": 7.974879220329356e-12,
"loss": 0.9829,
"step": 19040
},
{
"epoch": 3.4268358048954797,
"grad_norm": 4.218173027038574,
"learning_rate": 1.995302628075987e-05,
"loss": 1.0573,
"step": 19180
},
{
"epoch": 3.451849204931213,
"grad_norm": 13.573431015014648,
"learning_rate": 1.98101047527748e-05,
"loss": 0.9449,
"step": 19320
},
{
"epoch": 3.4768626049669464,
"grad_norm": 6.658699989318848,
"learning_rate": 1.9572605328335534e-05,
"loss": 1.2005,
"step": 19460
},
{
"epoch": 3.5018760050026803,
"grad_norm": 6.075576305389404,
"learning_rate": 1.924281770735239e-05,
"loss": 0.9171,
"step": 19600
},
{
"epoch": 3.5018760050026803,
"eval_nli-pairs_loss": 0.9502684473991394,
"eval_nli-pairs_runtime": 12.0413,
"eval_nli-pairs_samples_per_second": 124.572,
"eval_nli-pairs_steps_per_second": 5.232,
"step": 19600
},
{
"epoch": 3.5018760050026803,
"eval_scitail-pairs-pos_loss": 0.4798508584499359,
"eval_scitail-pairs-pos_runtime": 14.9533,
"eval_scitail-pairs-pos_samples_per_second": 87.205,
"eval_scitail-pairs-pos_steps_per_second": 3.678,
"step": 19600
},
{
"epoch": 3.5018760050026803,
"eval_qnli-contrastive_loss": 1.2315282821655273,
"eval_qnli-contrastive_runtime": 4.7188,
"eval_qnli-contrastive_samples_per_second": 317.874,
"eval_qnli-contrastive_steps_per_second": 13.351,
"step": 19600
},
{
"epoch": 3.526889405038413,
"grad_norm": 8.40775203704834,
"learning_rate": 1.8823921327788075e-05,
"loss": 0.9425,
"step": 19740
},
{
"epoch": 3.551902805074147,
"grad_norm": 11.214140892028809,
"learning_rate": 1.831995471312526e-05,
"loss": 1.1213,
"step": 19880
},
{
"epoch": 3.5769162051098804,
"grad_norm": 10.211651802062988,
"learning_rate": 1.7735776537506483e-05,
"loss": 1.1128,
"step": 20020
},
{
"epoch": 3.6019296051456138,
"grad_norm": 44.01512908935547,
"learning_rate": 1.707701878391224e-05,
"loss": 1.331,
"step": 20160
},
{
"epoch": 3.626943005181347,
"grad_norm": 13.295893669128418,
"learning_rate": 1.6350032446972868e-05,
"loss": 1.0495,
"step": 20300
},
{
"epoch": 3.626943005181347,
"eval_nli-pairs_loss": 0.9468088150024414,
"eval_nli-pairs_runtime": 11.9325,
"eval_nli-pairs_samples_per_second": 125.707,
"eval_nli-pairs_steps_per_second": 5.28,
"step": 20300
},
{
"epoch": 3.626943005181347,
"eval_scitail-pairs-pos_loss": 0.4434490203857422,
"eval_scitail-pairs-pos_runtime": 15.5134,
"eval_scitail-pairs-pos_samples_per_second": 84.056,
"eval_scitail-pairs-pos_steps_per_second": 3.545,
"step": 20300
},
{
"epoch": 3.626943005181347,
"eval_qnli-contrastive_loss": 1.141271710395813,
"eval_qnli-contrastive_runtime": 4.7207,
"eval_qnli-contrastive_samples_per_second": 317.752,
"eval_qnli-contrastive_steps_per_second": 13.346,
"step": 20300
},
{
"epoch": 3.6519564052170805,
"grad_norm": 71.68439483642578,
"learning_rate": 1.5561826303886085e-05,
"loss": 0.9698,
"step": 20440
},
{
"epoch": 3.676969805252814,
"grad_norm": 5.957241058349609,
"learning_rate": 1.4719999343741618e-05,
"loss": 0.9148,
"step": 20580
},
{
"epoch": 3.7019832052885473,
"grad_norm": 1.4626597166061401,
"learning_rate": 1.3839147028686583e-05,
"loss": 0.9042,
"step": 20720
},
{
"epoch": 3.726996605324281,
"grad_norm": 2.4634809494018555,
"learning_rate": 1.2915097668067934e-05,
"loss": 0.8232,
"step": 20860
},
{
"epoch": 3.752010005360014,
"grad_norm": 1.5838899612426758,
"learning_rate": 1.196294424410312e-05,
"loss": 1.0163,
"step": 21000
},
{
"epoch": 3.752010005360014,
"eval_nli-pairs_loss": 0.9020450115203857,
"eval_nli-pairs_runtime": 12.2572,
"eval_nli-pairs_samples_per_second": 122.377,
"eval_nli-pairs_steps_per_second": 5.14,
"step": 21000
},
{
"epoch": 3.752010005360014,
"eval_scitail-pairs-pos_loss": 0.4573577046394348,
"eval_scitail-pairs-pos_runtime": 15.1478,
"eval_scitail-pairs-pos_samples_per_second": 86.085,
"eval_scitail-pairs-pos_steps_per_second": 3.631,
"step": 21000
},
{
"epoch": 3.752010005360014,
"eval_qnli-contrastive_loss": 1.2882591485977173,
"eval_qnli-contrastive_runtime": 4.762,
"eval_qnli-contrastive_samples_per_second": 314.992,
"eval_qnli-contrastive_steps_per_second": 13.23,
"step": 21000
},
{
"epoch": 3.777023405395748,
"grad_norm": 5.878975868225098,
"learning_rate": 1.099186633949893e-05,
"loss": 0.9735,
"step": 21140
},
{
"epoch": 3.8020368054314813,
"grad_norm": 10.22749137878418,
"learning_rate": 1.0011225985326909e-05,
"loss": 0.8371,
"step": 21280
},
{
"epoch": 3.8270502054672146,
"grad_norm": 8.895988464355469,
"learning_rate": 9.030477402944833e-06,
"loss": 0.6344,
"step": 21420
},
{
"epoch": 3.852063605502948,
"grad_norm": 1.564530372619629,
"learning_rate": 8.059075857124063e-06,
"loss": 0.87,
"step": 21560
},
{
"epoch": 3.8770770055386814,
"grad_norm": 3.3526771068573,
"learning_rate": 7.106386499117424e-06,
"loss": 0.7404,
"step": 21700
},
{
"epoch": 3.8770770055386814,
"eval_nli-pairs_loss": 0.8661152720451355,
"eval_nli-pairs_runtime": 11.9159,
"eval_nli-pairs_samples_per_second": 125.883,
"eval_nli-pairs_steps_per_second": 5.287,
"step": 21700
},
{
"epoch": 3.8770770055386814,
"eval_scitail-pairs-pos_loss": 0.4352877140045166,
"eval_scitail-pairs-pos_runtime": 14.9412,
"eval_scitail-pairs-pos_samples_per_second": 87.275,
"eval_scitail-pairs-pos_steps_per_second": 3.681,
"step": 21700
},
{
"epoch": 3.8770770055386814,
"eval_qnli-contrastive_loss": 1.0643585920333862,
"eval_qnli-contrastive_runtime": 4.7458,
"eval_qnli-contrastive_samples_per_second": 316.066,
"eval_qnli-contrastive_steps_per_second": 13.275,
"step": 21700
},
{
"epoch": 3.9020904055744148,
"grad_norm": 6.517562389373779,
"learning_rate": 6.181594078499504e-06,
"loss": 0.8486,
"step": 21840
},
{
"epoch": 3.927103805610148,
"grad_norm": 4.482045650482178,
"learning_rate": 5.293614394235034e-06,
"loss": 0.8895,
"step": 21980
},
{
"epoch": 3.952117205645882,
"grad_norm": 5.165999889373779,
"learning_rate": 4.451008338663955e-06,
"loss": 0.7476,
"step": 22120
},
{
"epoch": 3.977130605681615,
"grad_norm": 7.821371078491211,
"learning_rate": 3.6618993630932396e-06,
"loss": 0.6761,
"step": 22260
}
],
"logging_steps": 140,
"max_steps": 27985,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 6997,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 28,
"trial_name": null,
"trial_params": null
}