manu's picture
Upload folder using huggingface_hub
20f1115 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.992,
"eval_steps": 100,
"global_step": 6200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016,
"grad_norm": 203.1111602783203,
"learning_rate": 7.520000000000001e-06,
"loss": 3.4547,
"step": 100
},
{
"epoch": 0.016,
"eval_all-nli-dev_cosine_accuracy": 0.737,
"eval_all-nli-dev_dot_accuracy": 0.319,
"eval_all-nli-dev_euclidean_accuracy": 0.737,
"eval_all-nli-dev_manhattan_accuracy": 0.801,
"eval_all-nli-dev_max_accuracy": 0.801,
"eval_loss": 2.2853124141693115,
"eval_runtime": 4.7392,
"eval_samples_per_second": 211.008,
"eval_steps_per_second": 13.294,
"step": 100
},
{
"epoch": 0.032,
"grad_norm": 38.57483673095703,
"learning_rate": 1.552e-05,
"loss": 1.6761,
"step": 200
},
{
"epoch": 0.032,
"eval_all-nli-dev_cosine_accuracy": 0.826,
"eval_all-nli-dev_dot_accuracy": 0.267,
"eval_all-nli-dev_euclidean_accuracy": 0.83,
"eval_all-nli-dev_manhattan_accuracy": 0.856,
"eval_all-nli-dev_max_accuracy": 0.856,
"eval_loss": 1.3493391275405884,
"eval_runtime": 4.734,
"eval_samples_per_second": 211.236,
"eval_steps_per_second": 13.308,
"step": 200
},
{
"epoch": 0.048,
"grad_norm": 36.92991638183594,
"learning_rate": 2.3520000000000002e-05,
"loss": 1.5528,
"step": 300
},
{
"epoch": 0.048,
"eval_all-nli-dev_cosine_accuracy": 0.805,
"eval_all-nli-dev_dot_accuracy": 0.29,
"eval_all-nli-dev_euclidean_accuracy": 0.818,
"eval_all-nli-dev_manhattan_accuracy": 0.83,
"eval_all-nli-dev_max_accuracy": 0.83,
"eval_loss": 1.4180811643600464,
"eval_runtime": 4.629,
"eval_samples_per_second": 216.028,
"eval_steps_per_second": 13.61,
"step": 300
},
{
"epoch": 0.064,
"grad_norm": 35.267967224121094,
"learning_rate": 3.1519999999999996e-05,
"loss": 1.0069,
"step": 400
},
{
"epoch": 0.064,
"eval_all-nli-dev_cosine_accuracy": 0.819,
"eval_all-nli-dev_dot_accuracy": 0.244,
"eval_all-nli-dev_euclidean_accuracy": 0.824,
"eval_all-nli-dev_manhattan_accuracy": 0.835,
"eval_all-nli-dev_max_accuracy": 0.835,
"eval_loss": 1.3276922702789307,
"eval_runtime": 4.5852,
"eval_samples_per_second": 218.095,
"eval_steps_per_second": 13.74,
"step": 400
},
{
"epoch": 0.08,
"grad_norm": 25.50597381591797,
"learning_rate": 3.944e-05,
"loss": 1.0611,
"step": 500
},
{
"epoch": 0.08,
"eval_all-nli-dev_cosine_accuracy": 0.814,
"eval_all-nli-dev_dot_accuracy": 0.216,
"eval_all-nli-dev_euclidean_accuracy": 0.804,
"eval_all-nli-dev_manhattan_accuracy": 0.847,
"eval_all-nli-dev_max_accuracy": 0.847,
"eval_loss": 1.4609754085540771,
"eval_runtime": 4.6833,
"eval_samples_per_second": 213.524,
"eval_steps_per_second": 13.452,
"step": 500
},
{
"epoch": 0.096,
"grad_norm": 55.69437026977539,
"learning_rate": 4.744e-05,
"loss": 1.1424,
"step": 600
},
{
"epoch": 0.096,
"eval_all-nli-dev_cosine_accuracy": 0.776,
"eval_all-nli-dev_dot_accuracy": 0.319,
"eval_all-nli-dev_euclidean_accuracy": 0.786,
"eval_all-nli-dev_manhattan_accuracy": 0.805,
"eval_all-nli-dev_max_accuracy": 0.805,
"eval_loss": 1.7394046783447266,
"eval_runtime": 4.6079,
"eval_samples_per_second": 217.017,
"eval_steps_per_second": 13.672,
"step": 600
},
{
"epoch": 0.112,
"grad_norm": 24.330310821533203,
"learning_rate": 4.9404444444444447e-05,
"loss": 1.3545,
"step": 700
},
{
"epoch": 0.112,
"eval_all-nli-dev_cosine_accuracy": 0.825,
"eval_all-nli-dev_dot_accuracy": 0.211,
"eval_all-nli-dev_euclidean_accuracy": 0.825,
"eval_all-nli-dev_manhattan_accuracy": 0.83,
"eval_all-nli-dev_max_accuracy": 0.83,
"eval_loss": 1.417900562286377,
"eval_runtime": 4.6918,
"eval_samples_per_second": 213.136,
"eval_steps_per_second": 13.428,
"step": 700
},
{
"epoch": 0.128,
"grad_norm": 18.1214599609375,
"learning_rate": 4.852444444444444e-05,
"loss": 1.3587,
"step": 800
},
{
"epoch": 0.128,
"eval_all-nli-dev_cosine_accuracy": 0.834,
"eval_all-nli-dev_dot_accuracy": 0.175,
"eval_all-nli-dev_euclidean_accuracy": 0.832,
"eval_all-nli-dev_manhattan_accuracy": 0.84,
"eval_all-nli-dev_max_accuracy": 0.84,
"eval_loss": 1.6350008249282837,
"eval_runtime": 4.57,
"eval_samples_per_second": 218.818,
"eval_steps_per_second": 13.786,
"step": 800
},
{
"epoch": 0.144,
"grad_norm": 28.469261169433594,
"learning_rate": 4.763555555555555e-05,
"loss": 1.237,
"step": 900
},
{
"epoch": 0.144,
"eval_all-nli-dev_cosine_accuracy": 0.778,
"eval_all-nli-dev_dot_accuracy": 0.235,
"eval_all-nli-dev_euclidean_accuracy": 0.778,
"eval_all-nli-dev_manhattan_accuracy": 0.801,
"eval_all-nli-dev_max_accuracy": 0.801,
"eval_loss": 1.6793839931488037,
"eval_runtime": 4.5941,
"eval_samples_per_second": 217.673,
"eval_steps_per_second": 13.713,
"step": 900
},
{
"epoch": 0.16,
"grad_norm": 33.614253997802734,
"learning_rate": 4.6746666666666664e-05,
"loss": 1.2029,
"step": 1000
},
{
"epoch": 0.16,
"eval_all-nli-dev_cosine_accuracy": 0.799,
"eval_all-nli-dev_dot_accuracy": 0.24,
"eval_all-nli-dev_euclidean_accuracy": 0.808,
"eval_all-nli-dev_manhattan_accuracy": 0.811,
"eval_all-nli-dev_max_accuracy": 0.811,
"eval_loss": 1.673274040222168,
"eval_runtime": 4.6683,
"eval_samples_per_second": 214.212,
"eval_steps_per_second": 13.495,
"step": 1000
},
{
"epoch": 0.176,
"grad_norm": 26.691509246826172,
"learning_rate": 4.5857777777777775e-05,
"loss": 1.2748,
"step": 1100
},
{
"epoch": 0.176,
"eval_all-nli-dev_cosine_accuracy": 0.8,
"eval_all-nli-dev_dot_accuracy": 0.213,
"eval_all-nli-dev_euclidean_accuracy": 0.802,
"eval_all-nli-dev_manhattan_accuracy": 0.818,
"eval_all-nli-dev_max_accuracy": 0.818,
"eval_loss": 1.6359915733337402,
"eval_runtime": 4.6977,
"eval_samples_per_second": 212.869,
"eval_steps_per_second": 13.411,
"step": 1100
},
{
"epoch": 0.192,
"grad_norm": 23.69953727722168,
"learning_rate": 4.4968888888888894e-05,
"loss": 1.1433,
"step": 1200
},
{
"epoch": 0.192,
"eval_all-nli-dev_cosine_accuracy": 0.786,
"eval_all-nli-dev_dot_accuracy": 0.215,
"eval_all-nli-dev_euclidean_accuracy": 0.785,
"eval_all-nli-dev_manhattan_accuracy": 0.806,
"eval_all-nli-dev_max_accuracy": 0.806,
"eval_loss": 1.7951678037643433,
"eval_runtime": 4.6338,
"eval_samples_per_second": 215.807,
"eval_steps_per_second": 13.596,
"step": 1200
},
{
"epoch": 0.208,
"grad_norm": 17.910472869873047,
"learning_rate": 4.4080000000000005e-05,
"loss": 1.0113,
"step": 1300
},
{
"epoch": 0.208,
"eval_all-nli-dev_cosine_accuracy": 0.817,
"eval_all-nli-dev_dot_accuracy": 0.178,
"eval_all-nli-dev_euclidean_accuracy": 0.815,
"eval_all-nli-dev_manhattan_accuracy": 0.817,
"eval_all-nli-dev_max_accuracy": 0.817,
"eval_loss": 1.4315475225448608,
"eval_runtime": 4.6753,
"eval_samples_per_second": 213.892,
"eval_steps_per_second": 13.475,
"step": 1300
},
{
"epoch": 0.224,
"grad_norm": 43.645694732666016,
"learning_rate": 4.3191111111111116e-05,
"loss": 0.8216,
"step": 1400
},
{
"epoch": 0.224,
"eval_all-nli-dev_cosine_accuracy": 0.771,
"eval_all-nli-dev_dot_accuracy": 0.243,
"eval_all-nli-dev_euclidean_accuracy": 0.776,
"eval_all-nli-dev_manhattan_accuracy": 0.774,
"eval_all-nli-dev_max_accuracy": 0.776,
"eval_loss": 1.6300010681152344,
"eval_runtime": 4.6418,
"eval_samples_per_second": 215.435,
"eval_steps_per_second": 13.572,
"step": 1400
},
{
"epoch": 0.24,
"grad_norm": 31.21487045288086,
"learning_rate": 4.231111111111111e-05,
"loss": 1.3451,
"step": 1500
},
{
"epoch": 0.24,
"eval_all-nli-dev_cosine_accuracy": 0.845,
"eval_all-nli-dev_dot_accuracy": 0.186,
"eval_all-nli-dev_euclidean_accuracy": 0.85,
"eval_all-nli-dev_manhattan_accuracy": 0.856,
"eval_all-nli-dev_max_accuracy": 0.856,
"eval_loss": 1.1566354036331177,
"eval_runtime": 4.6716,
"eval_samples_per_second": 214.061,
"eval_steps_per_second": 13.486,
"step": 1500
},
{
"epoch": 0.256,
"grad_norm": 25.364757537841797,
"learning_rate": 4.142222222222222e-05,
"loss": 0.8745,
"step": 1600
},
{
"epoch": 0.256,
"eval_all-nli-dev_cosine_accuracy": 0.825,
"eval_all-nli-dev_dot_accuracy": 0.175,
"eval_all-nli-dev_euclidean_accuracy": 0.833,
"eval_all-nli-dev_manhattan_accuracy": 0.838,
"eval_all-nli-dev_max_accuracy": 0.838,
"eval_loss": 1.2074507474899292,
"eval_runtime": 4.7951,
"eval_samples_per_second": 208.546,
"eval_steps_per_second": 13.138,
"step": 1600
},
{
"epoch": 0.272,
"grad_norm": 16.138595581054688,
"learning_rate": 4.0533333333333334e-05,
"loss": 0.9945,
"step": 1700
},
{
"epoch": 0.272,
"eval_all-nli-dev_cosine_accuracy": 0.822,
"eval_all-nli-dev_dot_accuracy": 0.191,
"eval_all-nli-dev_euclidean_accuracy": 0.824,
"eval_all-nli-dev_manhattan_accuracy": 0.831,
"eval_all-nli-dev_max_accuracy": 0.831,
"eval_loss": 1.3295574188232422,
"eval_runtime": 4.7422,
"eval_samples_per_second": 210.873,
"eval_steps_per_second": 13.285,
"step": 1700
},
{
"epoch": 0.288,
"grad_norm": 21.44277572631836,
"learning_rate": 3.9644444444444445e-05,
"loss": 0.9827,
"step": 1800
},
{
"epoch": 0.288,
"eval_all-nli-dev_cosine_accuracy": 0.844,
"eval_all-nli-dev_dot_accuracy": 0.163,
"eval_all-nli-dev_euclidean_accuracy": 0.839,
"eval_all-nli-dev_manhattan_accuracy": 0.844,
"eval_all-nli-dev_max_accuracy": 0.844,
"eval_loss": 1.3051831722259521,
"eval_runtime": 4.6529,
"eval_samples_per_second": 214.919,
"eval_steps_per_second": 13.54,
"step": 1800
},
{
"epoch": 0.304,
"grad_norm": 25.058795928955078,
"learning_rate": 3.8755555555555556e-05,
"loss": 0.974,
"step": 1900
},
{
"epoch": 0.304,
"eval_all-nli-dev_cosine_accuracy": 0.837,
"eval_all-nli-dev_dot_accuracy": 0.161,
"eval_all-nli-dev_euclidean_accuracy": 0.838,
"eval_all-nli-dev_manhattan_accuracy": 0.85,
"eval_all-nli-dev_max_accuracy": 0.85,
"eval_loss": 1.164267897605896,
"eval_runtime": 4.6376,
"eval_samples_per_second": 215.626,
"eval_steps_per_second": 13.584,
"step": 1900
},
{
"epoch": 0.32,
"grad_norm": 17.524301528930664,
"learning_rate": 3.786666666666667e-05,
"loss": 0.7555,
"step": 2000
},
{
"epoch": 0.32,
"eval_all-nli-dev_cosine_accuracy": 0.855,
"eval_all-nli-dev_dot_accuracy": 0.147,
"eval_all-nli-dev_euclidean_accuracy": 0.856,
"eval_all-nli-dev_manhattan_accuracy": 0.869,
"eval_all-nli-dev_max_accuracy": 0.869,
"eval_loss": 1.2737869024276733,
"eval_runtime": 4.6503,
"eval_samples_per_second": 215.038,
"eval_steps_per_second": 13.547,
"step": 2000
},
{
"epoch": 0.336,
"grad_norm": 10.828136444091797,
"learning_rate": 3.697777777777778e-05,
"loss": 0.7176,
"step": 2100
},
{
"epoch": 0.336,
"eval_all-nli-dev_cosine_accuracy": 0.832,
"eval_all-nli-dev_dot_accuracy": 0.183,
"eval_all-nli-dev_euclidean_accuracy": 0.832,
"eval_all-nli-dev_manhattan_accuracy": 0.829,
"eval_all-nli-dev_max_accuracy": 0.832,
"eval_loss": 1.374898910522461,
"eval_runtime": 4.6209,
"eval_samples_per_second": 216.408,
"eval_steps_per_second": 13.634,
"step": 2100
},
{
"epoch": 0.352,
"grad_norm": 11.03420639038086,
"learning_rate": 3.608888888888889e-05,
"loss": 0.834,
"step": 2200
},
{
"epoch": 0.352,
"eval_all-nli-dev_cosine_accuracy": 0.875,
"eval_all-nli-dev_dot_accuracy": 0.147,
"eval_all-nli-dev_euclidean_accuracy": 0.879,
"eval_all-nli-dev_manhattan_accuracy": 0.874,
"eval_all-nli-dev_max_accuracy": 0.879,
"eval_loss": 1.071208119392395,
"eval_runtime": 4.6829,
"eval_samples_per_second": 213.542,
"eval_steps_per_second": 13.453,
"step": 2200
},
{
"epoch": 0.368,
"grad_norm": 15.635822296142578,
"learning_rate": 3.52e-05,
"loss": 1.0819,
"step": 2300
},
{
"epoch": 0.368,
"eval_all-nli-dev_cosine_accuracy": 0.849,
"eval_all-nli-dev_dot_accuracy": 0.162,
"eval_all-nli-dev_euclidean_accuracy": 0.849,
"eval_all-nli-dev_manhattan_accuracy": 0.848,
"eval_all-nli-dev_max_accuracy": 0.849,
"eval_loss": 1.27626633644104,
"eval_runtime": 4.5515,
"eval_samples_per_second": 219.707,
"eval_steps_per_second": 13.842,
"step": 2300
},
{
"epoch": 0.384,
"grad_norm": 15.611441612243652,
"learning_rate": 3.431111111111111e-05,
"loss": 0.9515,
"step": 2400
},
{
"epoch": 0.384,
"eval_all-nli-dev_cosine_accuracy": 0.845,
"eval_all-nli-dev_dot_accuracy": 0.153,
"eval_all-nli-dev_euclidean_accuracy": 0.847,
"eval_all-nli-dev_manhattan_accuracy": 0.848,
"eval_all-nli-dev_max_accuracy": 0.848,
"eval_loss": 1.1383966207504272,
"eval_runtime": 4.5335,
"eval_samples_per_second": 220.582,
"eval_steps_per_second": 13.897,
"step": 2400
},
{
"epoch": 0.4,
"grad_norm": 23.901636123657227,
"learning_rate": 3.3422222222222224e-05,
"loss": 0.7828,
"step": 2500
},
{
"epoch": 0.4,
"eval_all-nli-dev_cosine_accuracy": 0.859,
"eval_all-nli-dev_dot_accuracy": 0.142,
"eval_all-nli-dev_euclidean_accuracy": 0.861,
"eval_all-nli-dev_manhattan_accuracy": 0.861,
"eval_all-nli-dev_max_accuracy": 0.861,
"eval_loss": 1.0878574848175049,
"eval_runtime": 4.6563,
"eval_samples_per_second": 214.765,
"eval_steps_per_second": 13.53,
"step": 2500
},
{
"epoch": 0.416,
"grad_norm": 20.29927635192871,
"learning_rate": 3.253333333333333e-05,
"loss": 0.7268,
"step": 2600
},
{
"epoch": 0.416,
"eval_all-nli-dev_cosine_accuracy": 0.868,
"eval_all-nli-dev_dot_accuracy": 0.128,
"eval_all-nli-dev_euclidean_accuracy": 0.864,
"eval_all-nli-dev_manhattan_accuracy": 0.867,
"eval_all-nli-dev_max_accuracy": 0.868,
"eval_loss": 0.9835022687911987,
"eval_runtime": 4.6005,
"eval_samples_per_second": 217.367,
"eval_steps_per_second": 13.694,
"step": 2600
},
{
"epoch": 0.432,
"grad_norm": 39.60092544555664,
"learning_rate": 3.164444444444444e-05,
"loss": 0.9228,
"step": 2700
},
{
"epoch": 0.432,
"eval_all-nli-dev_cosine_accuracy": 0.851,
"eval_all-nli-dev_dot_accuracy": 0.15,
"eval_all-nli-dev_euclidean_accuracy": 0.848,
"eval_all-nli-dev_manhattan_accuracy": 0.848,
"eval_all-nli-dev_max_accuracy": 0.851,
"eval_loss": 1.1840057373046875,
"eval_runtime": 4.6302,
"eval_samples_per_second": 215.972,
"eval_steps_per_second": 13.606,
"step": 2700
},
{
"epoch": 0.448,
"grad_norm": 26.71760368347168,
"learning_rate": 3.075555555555556e-05,
"loss": 1.0017,
"step": 2800
},
{
"epoch": 0.448,
"eval_all-nli-dev_cosine_accuracy": 0.85,
"eval_all-nli-dev_dot_accuracy": 0.138,
"eval_all-nli-dev_euclidean_accuracy": 0.846,
"eval_all-nli-dev_manhattan_accuracy": 0.853,
"eval_all-nli-dev_max_accuracy": 0.853,
"eval_loss": 1.1967850923538208,
"eval_runtime": 4.6908,
"eval_samples_per_second": 213.184,
"eval_steps_per_second": 13.431,
"step": 2800
},
{
"epoch": 0.464,
"grad_norm": 25.688671112060547,
"learning_rate": 2.986666666666667e-05,
"loss": 0.9138,
"step": 2900
},
{
"epoch": 0.464,
"eval_all-nli-dev_cosine_accuracy": 0.861,
"eval_all-nli-dev_dot_accuracy": 0.14,
"eval_all-nli-dev_euclidean_accuracy": 0.86,
"eval_all-nli-dev_manhattan_accuracy": 0.869,
"eval_all-nli-dev_max_accuracy": 0.869,
"eval_loss": 0.9930791854858398,
"eval_runtime": 4.7105,
"eval_samples_per_second": 212.29,
"eval_steps_per_second": 13.374,
"step": 2900
},
{
"epoch": 0.48,
"grad_norm": 13.824788093566895,
"learning_rate": 2.897777777777778e-05,
"loss": 0.8498,
"step": 3000
},
{
"epoch": 0.48,
"eval_all-nli-dev_cosine_accuracy": 0.872,
"eval_all-nli-dev_dot_accuracy": 0.129,
"eval_all-nli-dev_euclidean_accuracy": 0.871,
"eval_all-nli-dev_manhattan_accuracy": 0.876,
"eval_all-nli-dev_max_accuracy": 0.876,
"eval_loss": 0.9925669431686401,
"eval_runtime": 4.67,
"eval_samples_per_second": 214.134,
"eval_steps_per_second": 13.49,
"step": 3000
},
{
"epoch": 0.496,
"grad_norm": 24.699886322021484,
"learning_rate": 2.8088888888888893e-05,
"loss": 0.9682,
"step": 3100
},
{
"epoch": 0.496,
"eval_all-nli-dev_cosine_accuracy": 0.863,
"eval_all-nli-dev_dot_accuracy": 0.132,
"eval_all-nli-dev_euclidean_accuracy": 0.86,
"eval_all-nli-dev_manhattan_accuracy": 0.866,
"eval_all-nli-dev_max_accuracy": 0.866,
"eval_loss": 1.0003857612609863,
"eval_runtime": 4.6701,
"eval_samples_per_second": 214.128,
"eval_steps_per_second": 13.49,
"step": 3100
},
{
"epoch": 0.512,
"grad_norm": 33.25304412841797,
"learning_rate": 2.7200000000000004e-05,
"loss": 0.7227,
"step": 3200
},
{
"epoch": 0.512,
"eval_all-nli-dev_cosine_accuracy": 0.882,
"eval_all-nli-dev_dot_accuracy": 0.118,
"eval_all-nli-dev_euclidean_accuracy": 0.883,
"eval_all-nli-dev_manhattan_accuracy": 0.88,
"eval_all-nli-dev_max_accuracy": 0.883,
"eval_loss": 0.8489543199539185,
"eval_runtime": 4.5888,
"eval_samples_per_second": 217.92,
"eval_steps_per_second": 13.729,
"step": 3200
},
{
"epoch": 0.528,
"grad_norm": 12.450774192810059,
"learning_rate": 2.6311111111111115e-05,
"loss": 0.7134,
"step": 3300
},
{
"epoch": 0.528,
"eval_all-nli-dev_cosine_accuracy": 0.882,
"eval_all-nli-dev_dot_accuracy": 0.121,
"eval_all-nli-dev_euclidean_accuracy": 0.877,
"eval_all-nli-dev_manhattan_accuracy": 0.884,
"eval_all-nli-dev_max_accuracy": 0.884,
"eval_loss": 0.8214895725250244,
"eval_runtime": 4.6881,
"eval_samples_per_second": 213.307,
"eval_steps_per_second": 13.438,
"step": 3300
},
{
"epoch": 0.544,
"grad_norm": 13.840750694274902,
"learning_rate": 2.5422222222222227e-05,
"loss": 0.6645,
"step": 3400
},
{
"epoch": 0.544,
"eval_all-nli-dev_cosine_accuracy": 0.873,
"eval_all-nli-dev_dot_accuracy": 0.136,
"eval_all-nli-dev_euclidean_accuracy": 0.874,
"eval_all-nli-dev_manhattan_accuracy": 0.877,
"eval_all-nli-dev_max_accuracy": 0.877,
"eval_loss": 0.8888874053955078,
"eval_runtime": 4.6792,
"eval_samples_per_second": 213.711,
"eval_steps_per_second": 13.464,
"step": 3400
},
{
"epoch": 0.56,
"grad_norm": 9.944676399230957,
"learning_rate": 2.4533333333333334e-05,
"loss": 0.7073,
"step": 3500
},
{
"epoch": 0.56,
"eval_all-nli-dev_cosine_accuracy": 0.884,
"eval_all-nli-dev_dot_accuracy": 0.108,
"eval_all-nli-dev_euclidean_accuracy": 0.884,
"eval_all-nli-dev_manhattan_accuracy": 0.888,
"eval_all-nli-dev_max_accuracy": 0.888,
"eval_loss": 0.8373873829841614,
"eval_runtime": 4.6888,
"eval_samples_per_second": 213.273,
"eval_steps_per_second": 13.436,
"step": 3500
},
{
"epoch": 0.576,
"grad_norm": 23.035810470581055,
"learning_rate": 2.3644444444444446e-05,
"loss": 0.6679,
"step": 3600
},
{
"epoch": 0.576,
"eval_all-nli-dev_cosine_accuracy": 0.905,
"eval_all-nli-dev_dot_accuracy": 0.094,
"eval_all-nli-dev_euclidean_accuracy": 0.903,
"eval_all-nli-dev_manhattan_accuracy": 0.911,
"eval_all-nli-dev_max_accuracy": 0.911,
"eval_loss": 0.7780482172966003,
"eval_runtime": 4.6847,
"eval_samples_per_second": 213.461,
"eval_steps_per_second": 13.448,
"step": 3600
},
{
"epoch": 0.592,
"grad_norm": 12.241792678833008,
"learning_rate": 2.2755555555555557e-05,
"loss": 0.6609,
"step": 3700
},
{
"epoch": 0.592,
"eval_all-nli-dev_cosine_accuracy": 0.893,
"eval_all-nli-dev_dot_accuracy": 0.102,
"eval_all-nli-dev_euclidean_accuracy": 0.893,
"eval_all-nli-dev_manhattan_accuracy": 0.896,
"eval_all-nli-dev_max_accuracy": 0.896,
"eval_loss": 0.812877357006073,
"eval_runtime": 4.685,
"eval_samples_per_second": 213.449,
"eval_steps_per_second": 13.447,
"step": 3700
},
{
"epoch": 0.608,
"grad_norm": 23.00703239440918,
"learning_rate": 2.186666666666667e-05,
"loss": 0.687,
"step": 3800
},
{
"epoch": 0.608,
"eval_all-nli-dev_cosine_accuracy": 0.913,
"eval_all-nli-dev_dot_accuracy": 0.085,
"eval_all-nli-dev_euclidean_accuracy": 0.905,
"eval_all-nli-dev_manhattan_accuracy": 0.905,
"eval_all-nli-dev_max_accuracy": 0.913,
"eval_loss": 0.7215772271156311,
"eval_runtime": 4.6566,
"eval_samples_per_second": 214.751,
"eval_steps_per_second": 13.529,
"step": 3800
},
{
"epoch": 0.624,
"grad_norm": 13.749407768249512,
"learning_rate": 2.097777777777778e-05,
"loss": 0.5725,
"step": 3900
},
{
"epoch": 0.624,
"eval_all-nli-dev_cosine_accuracy": 0.912,
"eval_all-nli-dev_dot_accuracy": 0.09,
"eval_all-nli-dev_euclidean_accuracy": 0.908,
"eval_all-nli-dev_manhattan_accuracy": 0.92,
"eval_all-nli-dev_max_accuracy": 0.92,
"eval_loss": 0.7618492841720581,
"eval_runtime": 4.6929,
"eval_samples_per_second": 213.087,
"eval_steps_per_second": 13.424,
"step": 3900
},
{
"epoch": 0.64,
"grad_norm": 14.7637300491333,
"learning_rate": 2.008888888888889e-05,
"loss": 0.87,
"step": 4000
},
{
"epoch": 0.64,
"eval_all-nli-dev_cosine_accuracy": 0.909,
"eval_all-nli-dev_dot_accuracy": 0.086,
"eval_all-nli-dev_euclidean_accuracy": 0.909,
"eval_all-nli-dev_manhattan_accuracy": 0.909,
"eval_all-nli-dev_max_accuracy": 0.909,
"eval_loss": 0.706980288028717,
"eval_runtime": 4.6894,
"eval_samples_per_second": 213.247,
"eval_steps_per_second": 13.435,
"step": 4000
},
{
"epoch": 0.656,
"grad_norm": 16.75938606262207,
"learning_rate": 1.9200000000000003e-05,
"loss": 1.0892,
"step": 4100
},
{
"epoch": 0.656,
"eval_all-nli-dev_cosine_accuracy": 0.901,
"eval_all-nli-dev_dot_accuracy": 0.094,
"eval_all-nli-dev_euclidean_accuracy": 0.897,
"eval_all-nli-dev_manhattan_accuracy": 0.899,
"eval_all-nli-dev_max_accuracy": 0.901,
"eval_loss": 0.7424288392066956,
"eval_runtime": 4.6286,
"eval_samples_per_second": 216.046,
"eval_steps_per_second": 13.611,
"step": 4100
},
{
"epoch": 0.672,
"grad_norm": 17.46446418762207,
"learning_rate": 1.8311111111111114e-05,
"loss": 1.048,
"step": 4200
},
{
"epoch": 0.672,
"eval_all-nli-dev_cosine_accuracy": 0.908,
"eval_all-nli-dev_dot_accuracy": 0.093,
"eval_all-nli-dev_euclidean_accuracy": 0.908,
"eval_all-nli-dev_manhattan_accuracy": 0.909,
"eval_all-nli-dev_max_accuracy": 0.909,
"eval_loss": 0.6750496029853821,
"eval_runtime": 4.641,
"eval_samples_per_second": 215.47,
"eval_steps_per_second": 13.575,
"step": 4200
},
{
"epoch": 0.688,
"grad_norm": 18.15054702758789,
"learning_rate": 1.7422222222222222e-05,
"loss": 0.8571,
"step": 4300
},
{
"epoch": 0.688,
"eval_all-nli-dev_cosine_accuracy": 0.903,
"eval_all-nli-dev_dot_accuracy": 0.095,
"eval_all-nli-dev_euclidean_accuracy": 0.901,
"eval_all-nli-dev_manhattan_accuracy": 0.902,
"eval_all-nli-dev_max_accuracy": 0.903,
"eval_loss": 0.6474354863166809,
"eval_runtime": 4.7822,
"eval_samples_per_second": 209.111,
"eval_steps_per_second": 13.174,
"step": 4300
},
{
"epoch": 0.704,
"grad_norm": 15.00839614868164,
"learning_rate": 1.6533333333333333e-05,
"loss": 0.7945,
"step": 4400
},
{
"epoch": 0.704,
"eval_all-nli-dev_cosine_accuracy": 0.908,
"eval_all-nli-dev_dot_accuracy": 0.089,
"eval_all-nli-dev_euclidean_accuracy": 0.91,
"eval_all-nli-dev_manhattan_accuracy": 0.911,
"eval_all-nli-dev_max_accuracy": 0.911,
"eval_loss": 0.6094924211502075,
"eval_runtime": 4.6578,
"eval_samples_per_second": 214.695,
"eval_steps_per_second": 13.526,
"step": 4400
},
{
"epoch": 0.72,
"grad_norm": 24.833444595336914,
"learning_rate": 1.5644444444444444e-05,
"loss": 0.6717,
"step": 4500
},
{
"epoch": 0.72,
"eval_all-nli-dev_cosine_accuracy": 0.93,
"eval_all-nli-dev_dot_accuracy": 0.066,
"eval_all-nli-dev_euclidean_accuracy": 0.923,
"eval_all-nli-dev_manhattan_accuracy": 0.921,
"eval_all-nli-dev_max_accuracy": 0.93,
"eval_loss": 0.5663518309593201,
"eval_runtime": 4.7483,
"eval_samples_per_second": 210.6,
"eval_steps_per_second": 13.268,
"step": 4500
},
{
"epoch": 0.736,
"grad_norm": 1504.730224609375,
"learning_rate": 1.4755555555555556e-05,
"loss": 0.8161,
"step": 4600
},
{
"epoch": 0.736,
"eval_all-nli-dev_cosine_accuracy": 0.919,
"eval_all-nli-dev_dot_accuracy": 0.07,
"eval_all-nli-dev_euclidean_accuracy": 0.916,
"eval_all-nli-dev_manhattan_accuracy": 0.918,
"eval_all-nli-dev_max_accuracy": 0.919,
"eval_loss": 0.5479408502578735,
"eval_runtime": 4.6248,
"eval_samples_per_second": 216.224,
"eval_steps_per_second": 13.622,
"step": 4600
},
{
"epoch": 0.752,
"grad_norm": 30.87934684753418,
"learning_rate": 1.3866666666666667e-05,
"loss": 0.7917,
"step": 4700
},
{
"epoch": 0.752,
"eval_all-nli-dev_cosine_accuracy": 0.911,
"eval_all-nli-dev_dot_accuracy": 0.083,
"eval_all-nli-dev_euclidean_accuracy": 0.909,
"eval_all-nli-dev_manhattan_accuracy": 0.907,
"eval_all-nli-dev_max_accuracy": 0.911,
"eval_loss": 0.6419683694839478,
"eval_runtime": 4.5916,
"eval_samples_per_second": 217.788,
"eval_steps_per_second": 13.721,
"step": 4700
},
{
"epoch": 0.768,
"grad_norm": 23.224735260009766,
"learning_rate": 1.2977777777777777e-05,
"loss": 0.7711,
"step": 4800
},
{
"epoch": 0.768,
"eval_all-nli-dev_cosine_accuracy": 0.916,
"eval_all-nli-dev_dot_accuracy": 0.078,
"eval_all-nli-dev_euclidean_accuracy": 0.914,
"eval_all-nli-dev_manhattan_accuracy": 0.913,
"eval_all-nli-dev_max_accuracy": 0.916,
"eval_loss": 0.5856308341026306,
"eval_runtime": 4.5755,
"eval_samples_per_second": 218.553,
"eval_steps_per_second": 13.769,
"step": 4800
},
{
"epoch": 0.784,
"grad_norm": 13.215950012207031,
"learning_rate": 1.208888888888889e-05,
"loss": 0.6441,
"step": 4900
},
{
"epoch": 0.784,
"eval_all-nli-dev_cosine_accuracy": 0.916,
"eval_all-nli-dev_dot_accuracy": 0.079,
"eval_all-nli-dev_euclidean_accuracy": 0.913,
"eval_all-nli-dev_manhattan_accuracy": 0.913,
"eval_all-nli-dev_max_accuracy": 0.916,
"eval_loss": 0.5775041580200195,
"eval_runtime": 4.5681,
"eval_samples_per_second": 218.91,
"eval_steps_per_second": 13.791,
"step": 4900
},
{
"epoch": 0.8,
"grad_norm": 18.222217559814453,
"learning_rate": 1.1200000000000001e-05,
"loss": 0.7766,
"step": 5000
},
{
"epoch": 0.8,
"eval_all-nli-dev_cosine_accuracy": 0.922,
"eval_all-nli-dev_dot_accuracy": 0.077,
"eval_all-nli-dev_euclidean_accuracy": 0.92,
"eval_all-nli-dev_manhattan_accuracy": 0.917,
"eval_all-nli-dev_max_accuracy": 0.922,
"eval_loss": 0.5785014629364014,
"eval_runtime": 4.6158,
"eval_samples_per_second": 216.645,
"eval_steps_per_second": 13.649,
"step": 5000
},
{
"epoch": 0.816,
"grad_norm": 10.330174446105957,
"learning_rate": 1.031111111111111e-05,
"loss": 0.6009,
"step": 5100
},
{
"epoch": 0.816,
"eval_all-nli-dev_cosine_accuracy": 0.921,
"eval_all-nli-dev_dot_accuracy": 0.081,
"eval_all-nli-dev_euclidean_accuracy": 0.917,
"eval_all-nli-dev_manhattan_accuracy": 0.919,
"eval_all-nli-dev_max_accuracy": 0.921,
"eval_loss": 0.5679826140403748,
"eval_runtime": 4.5803,
"eval_samples_per_second": 218.325,
"eval_steps_per_second": 13.754,
"step": 5100
},
{
"epoch": 0.832,
"grad_norm": 14.917418479919434,
"learning_rate": 9.422222222222222e-06,
"loss": 0.6711,
"step": 5200
},
{
"epoch": 0.832,
"eval_all-nli-dev_cosine_accuracy": 0.921,
"eval_all-nli-dev_dot_accuracy": 0.074,
"eval_all-nli-dev_euclidean_accuracy": 0.917,
"eval_all-nli-dev_manhattan_accuracy": 0.92,
"eval_all-nli-dev_max_accuracy": 0.921,
"eval_loss": 0.5487431883811951,
"eval_runtime": 4.6796,
"eval_samples_per_second": 213.694,
"eval_steps_per_second": 13.463,
"step": 5200
},
{
"epoch": 0.848,
"grad_norm": 32.70161056518555,
"learning_rate": 8.533333333333334e-06,
"loss": 0.618,
"step": 5300
},
{
"epoch": 0.848,
"eval_all-nli-dev_cosine_accuracy": 0.926,
"eval_all-nli-dev_dot_accuracy": 0.074,
"eval_all-nli-dev_euclidean_accuracy": 0.921,
"eval_all-nli-dev_manhattan_accuracy": 0.922,
"eval_all-nli-dev_max_accuracy": 0.926,
"eval_loss": 0.5450394749641418,
"eval_runtime": 4.662,
"eval_samples_per_second": 214.5,
"eval_steps_per_second": 13.513,
"step": 5300
},
{
"epoch": 0.864,
"grad_norm": 13.764747619628906,
"learning_rate": 7.644444444444445e-06,
"loss": 0.6702,
"step": 5400
},
{
"epoch": 0.864,
"eval_all-nli-dev_cosine_accuracy": 0.926,
"eval_all-nli-dev_dot_accuracy": 0.073,
"eval_all-nli-dev_euclidean_accuracy": 0.921,
"eval_all-nli-dev_manhattan_accuracy": 0.919,
"eval_all-nli-dev_max_accuracy": 0.926,
"eval_loss": 0.5497583150863647,
"eval_runtime": 4.5535,
"eval_samples_per_second": 219.613,
"eval_steps_per_second": 13.836,
"step": 5400
},
{
"epoch": 0.88,
"grad_norm": 29.501554489135742,
"learning_rate": 6.755555555555555e-06,
"loss": 0.7039,
"step": 5500
},
{
"epoch": 0.88,
"eval_all-nli-dev_cosine_accuracy": 0.927,
"eval_all-nli-dev_dot_accuracy": 0.07,
"eval_all-nli-dev_euclidean_accuracy": 0.926,
"eval_all-nli-dev_manhattan_accuracy": 0.922,
"eval_all-nli-dev_max_accuracy": 0.927,
"eval_loss": 0.5191856622695923,
"eval_runtime": 4.5392,
"eval_samples_per_second": 220.305,
"eval_steps_per_second": 13.879,
"step": 5500
},
{
"epoch": 0.896,
"grad_norm": 19.63087272644043,
"learning_rate": 5.866666666666667e-06,
"loss": 0.6114,
"step": 5600
},
{
"epoch": 0.896,
"eval_all-nli-dev_cosine_accuracy": 0.932,
"eval_all-nli-dev_dot_accuracy": 0.067,
"eval_all-nli-dev_euclidean_accuracy": 0.931,
"eval_all-nli-dev_manhattan_accuracy": 0.93,
"eval_all-nli-dev_max_accuracy": 0.932,
"eval_loss": 0.5045494437217712,
"eval_runtime": 4.6367,
"eval_samples_per_second": 215.672,
"eval_steps_per_second": 13.587,
"step": 5600
},
{
"epoch": 0.912,
"grad_norm": 57.868019104003906,
"learning_rate": 4.977777777777778e-06,
"loss": 0.7761,
"step": 5700
},
{
"epoch": 0.912,
"eval_all-nli-dev_cosine_accuracy": 0.934,
"eval_all-nli-dev_dot_accuracy": 0.061,
"eval_all-nli-dev_euclidean_accuracy": 0.931,
"eval_all-nli-dev_manhattan_accuracy": 0.928,
"eval_all-nli-dev_max_accuracy": 0.934,
"eval_loss": 0.5033252835273743,
"eval_runtime": 4.6312,
"eval_samples_per_second": 215.928,
"eval_steps_per_second": 13.603,
"step": 5700
},
{
"epoch": 0.928,
"grad_norm": 11.63007640838623,
"learning_rate": 4.088888888888889e-06,
"loss": 0.6248,
"step": 5800
},
{
"epoch": 0.928,
"eval_all-nli-dev_cosine_accuracy": 0.932,
"eval_all-nli-dev_dot_accuracy": 0.068,
"eval_all-nli-dev_euclidean_accuracy": 0.926,
"eval_all-nli-dev_manhattan_accuracy": 0.926,
"eval_all-nli-dev_max_accuracy": 0.932,
"eval_loss": 0.5013440251350403,
"eval_runtime": 4.6162,
"eval_samples_per_second": 216.629,
"eval_steps_per_second": 13.648,
"step": 5800
},
{
"epoch": 0.944,
"grad_norm": 16.12616539001465,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.8359,
"step": 5900
},
{
"epoch": 0.944,
"eval_all-nli-dev_cosine_accuracy": 0.93,
"eval_all-nli-dev_dot_accuracy": 0.07,
"eval_all-nli-dev_euclidean_accuracy": 0.926,
"eval_all-nli-dev_manhattan_accuracy": 0.923,
"eval_all-nli-dev_max_accuracy": 0.93,
"eval_loss": 0.49764034152030945,
"eval_runtime": 4.6817,
"eval_samples_per_second": 213.6,
"eval_steps_per_second": 13.457,
"step": 5900
},
{
"epoch": 0.96,
"grad_norm": 16.06106185913086,
"learning_rate": 2.311111111111111e-06,
"loss": 0.8764,
"step": 6000
},
{
"epoch": 0.96,
"eval_all-nli-dev_cosine_accuracy": 0.936,
"eval_all-nli-dev_dot_accuracy": 0.062,
"eval_all-nli-dev_euclidean_accuracy": 0.928,
"eval_all-nli-dev_manhattan_accuracy": 0.928,
"eval_all-nli-dev_max_accuracy": 0.936,
"eval_loss": 0.49757900834083557,
"eval_runtime": 4.6133,
"eval_samples_per_second": 216.765,
"eval_steps_per_second": 13.656,
"step": 6000
},
{
"epoch": 0.976,
"grad_norm": 0.0003809410845860839,
"learning_rate": 1.4222222222222223e-06,
"loss": 0.763,
"step": 6100
},
{
"epoch": 0.976,
"eval_all-nli-dev_cosine_accuracy": 0.935,
"eval_all-nli-dev_dot_accuracy": 0.061,
"eval_all-nli-dev_euclidean_accuracy": 0.93,
"eval_all-nli-dev_manhattan_accuracy": 0.929,
"eval_all-nli-dev_max_accuracy": 0.935,
"eval_loss": 0.48454272747039795,
"eval_runtime": 4.737,
"eval_samples_per_second": 211.106,
"eval_steps_per_second": 13.3,
"step": 6100
},
{
"epoch": 0.992,
"grad_norm": 8.930841431720182e-06,
"learning_rate": 5.333333333333333e-07,
"loss": 0.0001,
"step": 6200
},
{
"epoch": 0.992,
"eval_all-nli-dev_cosine_accuracy": 0.935,
"eval_all-nli-dev_dot_accuracy": 0.062,
"eval_all-nli-dev_euclidean_accuracy": 0.928,
"eval_all-nli-dev_manhattan_accuracy": 0.929,
"eval_all-nli-dev_max_accuracy": 0.935,
"eval_loss": 0.48440054059028625,
"eval_runtime": 4.5868,
"eval_samples_per_second": 218.019,
"eval_steps_per_second": 13.735,
"step": 6200
}
],
"logging_steps": 100,
"max_steps": 6250,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}