diff --git "a/checkpoint-657/trainer_state.json" "b/checkpoint-657/trainer_state.json" --- "a/checkpoint-657/trainer_state.json" +++ "b/checkpoint-657/trainer_state.json" @@ -9,1025 +9,5211 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.015089163237311385, - "grad_norm": 3.8793580532073975, - "learning_rate": 1.8281535648994517e-05, - "loss": 0.7558, - "step": 11 + "epoch": 0.0013717421124828531, + "grad_norm": 13.75906753540039, + "learning_rate": 1.8281535648994516e-06, + "loss": 1.2468, + "step": 1 }, { - "epoch": 0.03017832647462277, - "grad_norm": 3.666482448577881, - "learning_rate": 3.839122486288849e-05, - "loss": 0.7776, - "step": 22 + "epoch": 0.0027434842249657062, + "grad_norm": 14.05058765411377, + "learning_rate": 3.6563071297989032e-06, + "loss": 1.4692, + "step": 2 }, { - "epoch": 0.04526748971193416, - "grad_norm": 3.493319511413574, - "learning_rate": 5.850091407678245e-05, - "loss": 0.379, - "step": 33 + "epoch": 0.00411522633744856, + "grad_norm": 13.991771697998047, + "learning_rate": 5.484460694698355e-06, + "loss": 1.2457, + "step": 3 }, { - "epoch": 0.06035665294924554, - "grad_norm": 2.435302972793579, - "learning_rate": 7.861060329067642e-05, - "loss": 0.4947, - "step": 44 + "epoch": 0.0054869684499314125, + "grad_norm": 13.429465293884277, + "learning_rate": 7.3126142595978065e-06, + "loss": 1.1859, + "step": 4 }, { - "epoch": 0.07544581618655692, - "grad_norm": 2.2073612213134766, - "learning_rate": 9.872029250457039e-05, - "loss": 0.3406, - "step": 55 + "epoch": 0.006858710562414266, + "grad_norm": 12.403002738952637, + "learning_rate": 9.140767824497258e-06, + "loss": 1.2404, + "step": 5 }, { - "epoch": 0.09053497942386832, - "grad_norm": 0.5107505917549133, - "learning_rate": 0.00011882998171846434, - "loss": 0.2391, - "step": 66 + "epoch": 0.00823045267489712, + "grad_norm": 1.5524662733078003, + "learning_rate": 1.096892138939671e-05, + "loss": 0.042, + "step": 6 }, { - "epoch": 0.1056241426611797, - "grad_norm": 2.4110350608825684, - "learning_rate": 0.00013893967093235832, - "loss": 0.2154, - "step": 77 + "epoch": 0.009602194787379973, + "grad_norm": 10.1494140625, + "learning_rate": 1.2797074954296162e-05, + "loss": 0.8856, + "step": 7 }, { - "epoch": 0.12071330589849108, - "grad_norm": 3.4400408267974854, - "learning_rate": 0.00015904936014625229, - "loss": 0.2662, - "step": 88 + "epoch": 0.010973936899862825, + "grad_norm": Infinity, + "learning_rate": 1.2797074954296162e-05, + "loss": 1.5417, + "step": 8 }, { - "epoch": 0.13580246913580246, - "grad_norm": 3.31318998336792, - "learning_rate": 0.00017915904936014626, - "loss": 0.283, - "step": 99 + "epoch": 0.012345679012345678, + "grad_norm": 0.4553964138031006, + "learning_rate": 1.4625228519195613e-05, + "loss": 0.023, + "step": 9 }, { - "epoch": 0.15089163237311384, - "grad_norm": 2.624321699142456, - "learning_rate": 0.00019926873857404023, - "loss": 0.2391, - "step": 110 + "epoch": 0.013717421124828532, + "grad_norm": 10.332369804382324, + "learning_rate": 1.6453382084095062e-05, + "loss": 0.8655, + "step": 10 }, { - "epoch": 0.15089163237311384, - "eval_Qnli-dev_cosine_accuracy": 0.7353515625, - "eval_Qnli-dev_cosine_accuracy_threshold": 0.641769528388977, - "eval_Qnli-dev_cosine_ap": 0.7934694922676566, - "eval_Qnli-dev_cosine_f1": 0.7255734919286321, - "eval_Qnli-dev_cosine_f1_threshold": 0.533623456954956, - "eval_Qnli-dev_cosine_precision": 0.6170520231213873, - "eval_Qnli-dev_cosine_recall": 0.8804123711340206, - "eval_allNLI-dev_cosine_accuracy": 0.7451171875, - "eval_allNLI-dev_cosine_accuracy_threshold": 0.7058684229850769, - "eval_allNLI-dev_cosine_ap": 0.6358738534384165, - "eval_allNLI-dev_cosine_f1": 0.6633039092055485, - "eval_allNLI-dev_cosine_f1_threshold": 0.6644865274429321, - "eval_allNLI-dev_cosine_precision": 0.579295154185022, - "eval_allNLI-dev_cosine_recall": 0.775811209439528, - "eval_sequential_score": 0.7934694922676566, - "eval_sts-test_pearson_cosine": 0.8508165029528609, - "eval_sts-test_spearman_cosine": 0.8665992028008191, - "eval_vitaminc-pairs_loss": 1.550615668296814, - "eval_vitaminc-pairs_runtime": 24.2459, - "eval_vitaminc-pairs_samples_per_second": 10.558, - "eval_vitaminc-pairs_steps_per_second": 0.041, - "step": 110 + "epoch": 0.015089163237311385, + "grad_norm": 7.544310092926025, + "learning_rate": 1.8281535648994517e-05, + "loss": 0.5894, + "step": 11 }, { - "epoch": 0.15089163237311384, - "eval_negation-triplets_loss": 0.8121126294136047, - "eval_negation-triplets_runtime": 4.2821, - "eval_negation-triplets_samples_per_second": 59.784, - "eval_negation-triplets_steps_per_second": 0.234, - "step": 110 + "epoch": 0.01646090534979424, + "grad_norm": 8.16427230834961, + "learning_rate": 2.0109689213893968e-05, + "loss": 0.7053, + "step": 12 }, { - "epoch": 0.15089163237311384, - "eval_scitail-pairs-pos_loss": 0.05080736428499222, - "eval_scitail-pairs-pos_runtime": 3.0909, - "eval_scitail-pairs-pos_samples_per_second": 82.824, - "eval_scitail-pairs-pos_steps_per_second": 0.324, - "step": 110 + "epoch": 0.01783264746227709, + "grad_norm": 7.403252601623535, + "learning_rate": 2.193784277879342e-05, + "loss": 0.5857, + "step": 13 }, { - "epoch": 0.15089163237311384, - "eval_scitail-pairs-qa_loss": 0.011191274970769882, - "eval_scitail-pairs-qa_runtime": 2.3574, - "eval_scitail-pairs-qa_samples_per_second": 108.596, - "eval_scitail-pairs-qa_steps_per_second": 0.424, - "step": 110 + "epoch": 0.019204389574759947, + "grad_norm": 8.974674224853516, + "learning_rate": 2.376599634369287e-05, + "loss": 0.8375, + "step": 14 }, { - "epoch": 0.15089163237311384, - "eval_xsum-pairs_loss": 0.2039160132408142, - "eval_xsum-pairs_runtime": 12.8416, - "eval_xsum-pairs_samples_per_second": 19.935, - "eval_xsum-pairs_steps_per_second": 0.078, - "step": 110 + "epoch": 0.0205761316872428, + "grad_norm": 13.417745590209961, + "learning_rate": 2.5594149908592324e-05, + "loss": 0.9043, + "step": 15 }, { - "epoch": 0.15089163237311384, - "eval_sciq_pairs_loss": 0.023365184664726257, - "eval_sciq_pairs_runtime": 20.6659, - "eval_sciq_pairs_samples_per_second": 12.388, - "eval_sciq_pairs_steps_per_second": 0.048, - "step": 110 + "epoch": 0.02194787379972565, + "grad_norm": 12.881294250488281, + "learning_rate": 2.742230347349177e-05, + "loss": 0.8756, + "step": 16 }, { - "epoch": 0.15089163237311384, - "eval_qasc_pairs_loss": 0.558290421962738, - "eval_qasc_pairs_runtime": 3.009, - "eval_qasc_pairs_samples_per_second": 85.077, - "eval_qasc_pairs_steps_per_second": 0.332, - "step": 110 + "epoch": 0.023319615912208505, + "grad_norm": 7.439205169677734, + "learning_rate": 2.9250457038391226e-05, + "loss": 0.5076, + "step": 17 }, { - "epoch": 0.15089163237311384, - "eval_openbookqa_pairs_loss": 1.253723382949829, - "eval_openbookqa_pairs_runtime": 2.2457, - "eval_openbookqa_pairs_samples_per_second": 113.993, - "eval_openbookqa_pairs_steps_per_second": 0.445, - "step": 110 + "epoch": 0.024691358024691357, + "grad_norm": 8.46964168548584, + "learning_rate": 3.107861060329068e-05, + "loss": 0.4757, + "step": 18 }, { - "epoch": 0.15089163237311384, - "eval_nq_pairs_loss": 0.10213108360767365, - "eval_nq_pairs_runtime": 18.094, - "eval_nq_pairs_samples_per_second": 14.148, - "eval_nq_pairs_steps_per_second": 0.055, - "step": 110 + "epoch": 0.02606310013717421, + "grad_norm": 17.02773666381836, + "learning_rate": 3.2906764168190124e-05, + "loss": 0.9993, + "step": 19 }, { - "epoch": 0.15089163237311384, - "eval_trivia_pairs_loss": 0.4372706711292267, - "eval_trivia_pairs_runtime": 16.9326, - "eval_trivia_pairs_samples_per_second": 15.119, - "eval_trivia_pairs_steps_per_second": 0.059, - "step": 110 + "epoch": 0.027434842249657063, + "grad_norm": 6.2668776512146, + "learning_rate": 3.473491773308958e-05, + "loss": 0.2622, + "step": 20 }, { - "epoch": 0.15089163237311384, - "eval_gooaq_pairs_loss": 0.2727060914039612, - "eval_gooaq_pairs_runtime": 3.6277, - "eval_gooaq_pairs_samples_per_second": 70.568, - "eval_gooaq_pairs_steps_per_second": 0.276, - "step": 110 + "epoch": 0.02880658436213992, + "grad_norm": 8.273824691772461, + "learning_rate": 3.656307129798903e-05, + "loss": 0.3497, + "step": 21 }, { - "epoch": 0.15089163237311384, - "eval_paws-pos_loss": 0.040396444499492645, - "eval_paws-pos_runtime": 2.9381, - "eval_paws-pos_samples_per_second": 87.132, - "eval_paws-pos_steps_per_second": 0.34, - "step": 110 + "epoch": 0.03017832647462277, + "grad_norm": 5.5460944175720215, + "learning_rate": 3.839122486288849e-05, + "loss": 0.2514, + "step": 22 }, { - "epoch": 0.15089163237311384, - "eval_global_dataset_loss": 0.1820984184741974, - "eval_global_dataset_runtime": 125.361, - "eval_global_dataset_samples_per_second": 9.764, - "eval_global_dataset_steps_per_second": 0.04, - "step": 110 + "epoch": 0.03155006858710562, + "grad_norm": 4.283128261566162, + "learning_rate": 4.0219378427787935e-05, + "loss": 0.1673, + "step": 23 }, { - "epoch": 0.16598079561042525, - "grad_norm": 3.345057725906372, - "learning_rate": 0.00021937842778793417, - "loss": 0.2456, - "step": 121 + "epoch": 0.03292181069958848, + "grad_norm": 4.708792209625244, + "learning_rate": 4.204753199268738e-05, + "loss": 0.203, + "step": 24 }, { - "epoch": 0.18106995884773663, - "grad_norm": 2.2555935382843018, - "learning_rate": 0.00023948811700182814, - "loss": 0.4199, - "step": 132 + "epoch": 0.03429355281207133, + "grad_norm": 14.491021156311035, + "learning_rate": 4.387568555758684e-05, + "loss": 0.698, + "step": 25 }, { - "epoch": 0.19615912208504802, - "grad_norm": 2.3139288425445557, - "learning_rate": 0.0002595978062157221, - "loss": 0.2809, - "step": 143 + "epoch": 0.03566529492455418, + "grad_norm": 7.903520584106445, + "learning_rate": 4.570383912248629e-05, + "loss": 0.3401, + "step": 26 }, { - "epoch": 0.2112482853223594, - "grad_norm": 3.585463047027588, - "learning_rate": 0.0002797074954296161, - "loss": 0.5773, - "step": 154 + "epoch": 0.037037037037037035, + "grad_norm": 7.333080291748047, + "learning_rate": 4.753199268738574e-05, + "loss": 0.2185, + "step": 27 }, { - "epoch": 0.22633744855967078, - "grad_norm": 2.4281251430511475, - "learning_rate": 0.00029981718464351003, - "loss": 0.3734, - "step": 165 + "epoch": 0.038408779149519894, + "grad_norm": 8.625358581542969, + "learning_rate": 4.936014625228519e-05, + "loss": 0.4424, + "step": 28 }, { - "epoch": 0.24142661179698216, - "grad_norm": 0.2383209466934204, - "learning_rate": 0.000319926873857404, - "loss": 0.2348, - "step": 176 + "epoch": 0.039780521262002745, + "grad_norm": 1.5588488578796387, + "learning_rate": 5.118829981718465e-05, + "loss": 0.0381, + "step": 29 }, { - "epoch": 0.25651577503429357, - "grad_norm": 2.4634456634521484, - "learning_rate": 0.00034003656307129797, - "loss": 0.4421, - "step": 187 + "epoch": 0.0411522633744856, + "grad_norm": 12.401138305664062, + "learning_rate": 5.3016453382084095e-05, + "loss": 0.8215, + "step": 30 }, { - "epoch": 0.2716049382716049, - "grad_norm": 3.1270384788513184, - "learning_rate": 0.00036014625228519197, - "loss": 0.5076, - "step": 198 + "epoch": 0.04252400548696845, + "grad_norm": 5.405845642089844, + "learning_rate": 5.484460694698354e-05, + "loss": 0.1542, + "step": 31 }, { - "epoch": 0.28669410150891633, - "grad_norm": 0.7871516346931458, - "learning_rate": 0.0003802559414990859, - "loss": 0.211, - "step": 209 + "epoch": 0.0438957475994513, + "grad_norm": 8.558808326721191, + "learning_rate": 5.6672760511883e-05, + "loss": 0.6893, + "step": 32 }, { - "epoch": 0.3017832647462277, - "grad_norm": 2.159247636795044, - "learning_rate": 0.00040036563071297986, - "loss": 0.3514, - "step": 220 + "epoch": 0.04526748971193416, + "grad_norm": 7.206741809844971, + "learning_rate": 5.850091407678245e-05, + "loss": 0.3773, + "step": 33 }, { - "epoch": 0.3017832647462277, - "eval_Qnli-dev_cosine_accuracy": 0.736328125, - "eval_Qnli-dev_cosine_accuracy_threshold": 0.6637322902679443, - "eval_Qnli-dev_cosine_ap": 0.7960534826633536, - "eval_Qnli-dev_cosine_f1": 0.7298050139275766, - "eval_Qnli-dev_cosine_f1_threshold": 0.5937396287918091, - "eval_Qnli-dev_cosine_precision": 0.6638513513513513, - "eval_Qnli-dev_cosine_recall": 0.8103092783505155, - "eval_allNLI-dev_cosine_accuracy": 0.755859375, - "eval_allNLI-dev_cosine_accuracy_threshold": 0.728554904460907, - "eval_allNLI-dev_cosine_ap": 0.6433680273177467, - "eval_allNLI-dev_cosine_f1": 0.661818181818182, - "eval_allNLI-dev_cosine_f1_threshold": 0.6872978210449219, - "eval_allNLI-dev_cosine_precision": 0.5617283950617284, - "eval_allNLI-dev_cosine_recall": 0.8053097345132744, - "eval_sequential_score": 0.7960534826633536, - "eval_sts-test_pearson_cosine": 0.8381104761555598, - "eval_sts-test_spearman_cosine": 0.8624525294470655, - "eval_vitaminc-pairs_loss": 1.5675894021987915, - "eval_vitaminc-pairs_runtime": 24.2649, - "eval_vitaminc-pairs_samples_per_second": 10.55, - "eval_vitaminc-pairs_steps_per_second": 0.041, - "step": 220 + "epoch": 0.04663923182441701, + "grad_norm": 8.300729751586914, + "learning_rate": 6.0329067641681906e-05, + "loss": 0.538, + "step": 34 }, { - "epoch": 0.3017832647462277, - "eval_negation-triplets_loss": 0.923573911190033, - "eval_negation-triplets_runtime": 4.311, - "eval_negation-triplets_samples_per_second": 59.384, - "eval_negation-triplets_steps_per_second": 0.232, - "step": 220 + "epoch": 0.04801097393689986, + "grad_norm": 0.2500181496143341, + "learning_rate": 6.215722120658135e-05, + "loss": 0.0073, + "step": 35 }, { - "epoch": 0.3017832647462277, - "eval_scitail-pairs-pos_loss": 0.0530293844640255, - "eval_scitail-pairs-pos_runtime": 3.1182, - "eval_scitail-pairs-pos_samples_per_second": 82.098, - "eval_scitail-pairs-pos_steps_per_second": 0.321, - "step": 220 + "epoch": 0.04938271604938271, + "grad_norm": 17.457223892211914, + "learning_rate": 6.398537477148081e-05, + "loss": 2.378, + "step": 36 }, { - "epoch": 0.3017832647462277, - "eval_scitail-pairs-qa_loss": 0.008582310751080513, - "eval_scitail-pairs-qa_runtime": 2.402, - "eval_scitail-pairs-qa_samples_per_second": 106.58, - "eval_scitail-pairs-qa_steps_per_second": 0.416, - "step": 220 + "epoch": 0.05075445816186557, + "grad_norm": 10.884990692138672, + "learning_rate": 6.581352833638025e-05, + "loss": 0.5949, + "step": 37 }, { - "epoch": 0.3017832647462277, - "eval_xsum-pairs_loss": 0.19049452245235443, - "eval_xsum-pairs_runtime": 12.8763, - "eval_xsum-pairs_samples_per_second": 19.882, - "eval_xsum-pairs_steps_per_second": 0.078, - "step": 220 + "epoch": 0.05212620027434842, + "grad_norm": 10.013723373413086, + "learning_rate": 6.764168190127972e-05, + "loss": 0.7071, + "step": 38 }, { - "epoch": 0.3017832647462277, - "eval_sciq_pairs_loss": 0.02437273971736431, - "eval_sciq_pairs_runtime": 20.6537, - "eval_sciq_pairs_samples_per_second": 12.395, - "eval_sciq_pairs_steps_per_second": 0.048, - "step": 220 + "epoch": 0.053497942386831275, + "grad_norm": 4.653324604034424, + "learning_rate": 6.946983546617916e-05, + "loss": 0.1607, + "step": 39 }, { - "epoch": 0.3017832647462277, - "eval_qasc_pairs_loss": 0.6957117915153503, - "eval_qasc_pairs_runtime": 3.0168, - "eval_qasc_pairs_samples_per_second": 84.858, - "eval_qasc_pairs_steps_per_second": 0.331, - "step": 220 + "epoch": 0.05486968449931413, + "grad_norm": 9.527400970458984, + "learning_rate": 7.129798903107861e-05, + "loss": 0.7735, + "step": 40 }, { - "epoch": 0.3017832647462277, - "eval_openbookqa_pairs_loss": 1.2585959434509277, - "eval_openbookqa_pairs_runtime": 2.2495, - "eval_openbookqa_pairs_samples_per_second": 113.801, - "eval_openbookqa_pairs_steps_per_second": 0.445, - "step": 220 + "epoch": 0.056241426611796985, + "grad_norm": 12.477531433105469, + "learning_rate": 7.312614259597807e-05, + "loss": 0.7594, + "step": 41 }, { - "epoch": 0.3017832647462277, - "eval_nq_pairs_loss": 0.12953564524650574, - "eval_nq_pairs_runtime": 18.127, - "eval_nq_pairs_samples_per_second": 14.123, - "eval_nq_pairs_steps_per_second": 0.055, - "step": 220 + "epoch": 0.05761316872427984, + "grad_norm": 5.369799613952637, + "learning_rate": 7.495429616087751e-05, + "loss": 0.3569, + "step": 42 }, { - "epoch": 0.3017832647462277, - "eval_trivia_pairs_loss": 0.46085307002067566, - "eval_trivia_pairs_runtime": 16.9635, - "eval_trivia_pairs_samples_per_second": 15.091, - "eval_trivia_pairs_steps_per_second": 0.059, - "step": 220 + "epoch": 0.05898491083676269, + "grad_norm": 5.1385908126831055, + "learning_rate": 7.678244972577697e-05, + "loss": 0.2454, + "step": 43 }, { - "epoch": 0.3017832647462277, - "eval_gooaq_pairs_loss": 0.2918424606323242, - "eval_gooaq_pairs_runtime": 3.6275, - "eval_gooaq_pairs_samples_per_second": 70.573, - "eval_gooaq_pairs_steps_per_second": 0.276, - "step": 220 + "epoch": 0.06035665294924554, + "grad_norm": 6.1807708740234375, + "learning_rate": 7.861060329067642e-05, + "loss": 0.2723, + "step": 44 }, { - "epoch": 0.3017832647462277, - "eval_paws-pos_loss": 0.041661862283945084, - "eval_paws-pos_runtime": 2.9518, - "eval_paws-pos_samples_per_second": 86.727, - "eval_paws-pos_steps_per_second": 0.339, - "step": 220 + "epoch": 0.06172839506172839, + "grad_norm": 7.941879749298096, + "learning_rate": 8.043875685557587e-05, + "loss": 0.5338, + "step": 45 }, { - "epoch": 0.3017832647462277, - "eval_global_dataset_loss": 0.3610426187515259, - "eval_global_dataset_runtime": 125.4174, - "eval_global_dataset_samples_per_second": 9.759, - "eval_global_dataset_steps_per_second": 0.04, - "step": 220 + "epoch": 0.06310013717421124, + "grad_norm": 5.015410423278809, + "learning_rate": 8.226691042047532e-05, + "loss": 0.1891, + "step": 46 }, { - "epoch": 0.3168724279835391, - "grad_norm": 2.7020387649536133, - "learning_rate": 0.00042047531992687385, - "loss": 0.2981, - "step": 231 + "epoch": 0.0644718792866941, + "grad_norm": 7.299699306488037, + "learning_rate": 8.409506398537477e-05, + "loss": 0.3647, + "step": 47 }, { - "epoch": 0.3319615912208505, - "grad_norm": 2.0000767707824707, - "learning_rate": 0.0004405850091407678, - "loss": 0.3045, - "step": 242 + "epoch": 0.06584362139917696, + "grad_norm": 8.421393394470215, + "learning_rate": 8.592321755027423e-05, + "loss": 0.383, + "step": 48 }, { - "epoch": 0.34705075445816186, - "grad_norm": 2.539660692214966, - "learning_rate": 0.00046069469835466185, - "loss": 0.3126, - "step": 253 + "epoch": 0.06721536351165981, + "grad_norm": 5.5915937423706055, + "learning_rate": 8.775137111517367e-05, + "loss": 0.2353, + "step": 49 }, { - "epoch": 0.36213991769547327, - "grad_norm": 2.418445348739624, - "learning_rate": 0.0004808043875685558, - "loss": 0.7813, - "step": 264 + "epoch": 0.06858710562414266, + "grad_norm": 8.187829971313477, + "learning_rate": 8.957952468007313e-05, + "loss": 0.5541, + "step": 50 }, { - "epoch": 0.3772290809327846, - "grad_norm": 0.13016735017299652, - "learning_rate": 0.0005009140767824497, - "loss": 0.547, - "step": 275 + "epoch": 0.06995884773662552, + "grad_norm": 6.386786460876465, + "learning_rate": 9.140767824497258e-05, + "loss": 0.4908, + "step": 51 }, { - "epoch": 0.39231824417009603, - "grad_norm": 2.2323102951049805, - "learning_rate": 0.0005210237659963437, - "loss": 0.4698, - "step": 286 + "epoch": 0.07133058984910837, + "grad_norm": 8.64050006866455, + "learning_rate": 9.323583180987204e-05, + "loss": 0.586, + "step": 52 }, { - "epoch": 0.4074074074074074, - "grad_norm": 3.097975492477417, - "learning_rate": 0.0005411334552102377, - "loss": 0.5427, - "step": 297 + "epoch": 0.07270233196159122, + "grad_norm": 5.879551410675049, + "learning_rate": 9.506398537477148e-05, + "loss": 0.2241, + "step": 53 }, { - "epoch": 0.4224965706447188, - "grad_norm": 0.7084994316101074, - "learning_rate": 0.0005612431444241316, - "loss": 0.3151, - "step": 308 + "epoch": 0.07407407407407407, + "grad_norm": 7.824138164520264, + "learning_rate": 9.689213893967093e-05, + "loss": 0.6046, + "step": 54 }, { - "epoch": 0.4375857338820302, - "grad_norm": 1.7643369436264038, - "learning_rate": 0.0005813528336380256, - "loss": 0.4687, - "step": 319 + "epoch": 0.07544581618655692, + "grad_norm": 6.351109504699707, + "learning_rate": 9.872029250457039e-05, + "loss": 0.231, + "step": 55 }, { - "epoch": 0.45267489711934156, - "grad_norm": 1.7608978748321533, - "learning_rate": 0.0006014625228519196, - "loss": 0.3769, - "step": 330 + "epoch": 0.07681755829903979, + "grad_norm": 9.437410354614258, + "learning_rate": 0.00010054844606946984, + "loss": 0.7105, + "step": 56 }, { - "epoch": 0.45267489711934156, - "eval_Qnli-dev_cosine_accuracy": 0.7255859375, - "eval_Qnli-dev_cosine_accuracy_threshold": 0.6892818212509155, - "eval_Qnli-dev_cosine_ap": 0.7884120709809157, - "eval_Qnli-dev_cosine_f1": 0.7228464419475655, - "eval_Qnli-dev_cosine_f1_threshold": 0.6395477056503296, - "eval_Qnli-dev_cosine_precision": 0.6620926243567753, - "eval_Qnli-dev_cosine_recall": 0.7958762886597938, - "eval_allNLI-dev_cosine_accuracy": 0.7421875, - "eval_allNLI-dev_cosine_accuracy_threshold": 0.7702663540840149, - "eval_allNLI-dev_cosine_ap": 0.6347046378974335, - "eval_allNLI-dev_cosine_f1": 0.6455542021924483, - "eval_allNLI-dev_cosine_f1_threshold": 0.7014378309249878, - "eval_allNLI-dev_cosine_precision": 0.549792531120332, - "eval_allNLI-dev_cosine_recall": 0.7817109144542773, - "eval_sequential_score": 0.7884120709809157, - "eval_sts-test_pearson_cosine": 0.8389808770066287, - "eval_sts-test_spearman_cosine": 0.861226668384837, - "eval_vitaminc-pairs_loss": 1.5675371885299683, - "eval_vitaminc-pairs_runtime": 24.2072, - "eval_vitaminc-pairs_samples_per_second": 10.575, - "eval_vitaminc-pairs_steps_per_second": 0.041, - "step": 330 + "epoch": 0.07818930041152264, + "grad_norm": 8.40911865234375, + "learning_rate": 0.0001023765996343693, + "loss": 0.5591, + "step": 57 }, { - "epoch": 0.45267489711934156, - "eval_negation-triplets_loss": 0.7654371857643127, - "eval_negation-triplets_runtime": 4.2588, - "eval_negation-triplets_samples_per_second": 60.111, - "eval_negation-triplets_steps_per_second": 0.235, - "step": 330 + "epoch": 0.07956104252400549, + "grad_norm": 7.631382942199707, + "learning_rate": 0.00010420475319926874, + "loss": 0.5194, + "step": 58 }, { - "epoch": 0.45267489711934156, - "eval_scitail-pairs-pos_loss": 0.04239173233509064, - "eval_scitail-pairs-pos_runtime": 3.0875, - "eval_scitail-pairs-pos_samples_per_second": 82.915, - "eval_scitail-pairs-pos_steps_per_second": 0.324, - "step": 330 + "epoch": 0.08093278463648834, + "grad_norm": 5.773220062255859, + "learning_rate": 0.00010603290676416819, + "loss": 0.3297, + "step": 59 }, { - "epoch": 0.45267489711934156, - "eval_scitail-pairs-qa_loss": 0.010665436275303364, - "eval_scitail-pairs-qa_runtime": 2.3433, - "eval_scitail-pairs-qa_samples_per_second": 109.246, - "eval_scitail-pairs-qa_steps_per_second": 0.427, - "step": 330 + "epoch": 0.0823045267489712, + "grad_norm": 1.3606321811676025, + "learning_rate": 0.00010786106032906765, + "loss": 0.0299, + "step": 60 }, { - "epoch": 0.45267489711934156, - "eval_xsum-pairs_loss": 0.20898626744747162, - "eval_xsum-pairs_runtime": 12.8471, - "eval_xsum-pairs_samples_per_second": 19.927, - "eval_xsum-pairs_steps_per_second": 0.078, - "step": 330 + "epoch": 0.08367626886145405, + "grad_norm": 7.216275215148926, + "learning_rate": 0.00010968921389396709, + "loss": 0.3514, + "step": 61 }, { - "epoch": 0.45267489711934156, - "eval_sciq_pairs_loss": 0.03412973880767822, - "eval_sciq_pairs_runtime": 20.7279, - "eval_sciq_pairs_samples_per_second": 12.351, - "eval_sciq_pairs_steps_per_second": 0.048, - "step": 330 + "epoch": 0.0850480109739369, + "grad_norm": 4.70477294921875, + "learning_rate": 0.00011151736745886655, + "loss": 0.1932, + "step": 62 }, { - "epoch": 0.45267489711934156, - "eval_qasc_pairs_loss": 0.7678776383399963, - "eval_qasc_pairs_runtime": 3.0154, - "eval_qasc_pairs_samples_per_second": 84.896, - "eval_qasc_pairs_steps_per_second": 0.332, - "step": 330 + "epoch": 0.08641975308641975, + "grad_norm": 6.754104137420654, + "learning_rate": 0.000113345521023766, + "loss": 0.4035, + "step": 63 }, { - "epoch": 0.45267489711934156, - "eval_openbookqa_pairs_loss": 1.3723315000534058, - "eval_openbookqa_pairs_runtime": 2.2474, - "eval_openbookqa_pairs_samples_per_second": 113.91, - "eval_openbookqa_pairs_steps_per_second": 0.445, - "step": 330 + "epoch": 0.0877914951989026, + "grad_norm": 0.19067375361919403, + "learning_rate": 0.00011517367458866546, + "loss": 0.0094, + "step": 64 }, { - "epoch": 0.45267489711934156, - "eval_nq_pairs_loss": 0.15752817690372467, - "eval_nq_pairs_runtime": 18.1484, - "eval_nq_pairs_samples_per_second": 14.106, - "eval_nq_pairs_steps_per_second": 0.055, - "step": 330 + "epoch": 0.08916323731138547, + "grad_norm": 1.1715893745422363, + "learning_rate": 0.0001170018281535649, + "loss": 0.0148, + "step": 65 }, { - "epoch": 0.45267489711934156, - "eval_trivia_pairs_loss": 0.6312745213508606, - "eval_trivia_pairs_runtime": 16.9738, - "eval_trivia_pairs_samples_per_second": 15.082, - "eval_trivia_pairs_steps_per_second": 0.059, - "step": 330 + "epoch": 0.09053497942386832, + "grad_norm": 1.6287739276885986, + "learning_rate": 0.00011882998171846434, + "loss": 0.0231, + "step": 66 }, { - "epoch": 0.45267489711934156, - "eval_gooaq_pairs_loss": 0.47655048966407776, - "eval_gooaq_pairs_runtime": 3.6272, - "eval_gooaq_pairs_samples_per_second": 70.579, - "eval_gooaq_pairs_steps_per_second": 0.276, - "step": 330 + "epoch": 0.09190672153635117, + "grad_norm": 7.027708053588867, + "learning_rate": 0.00012065813528336381, + "loss": 0.3204, + "step": 67 }, { - "epoch": 0.45267489711934156, - "eval_paws-pos_loss": 0.04226630926132202, - "eval_paws-pos_runtime": 2.9441, - "eval_paws-pos_samples_per_second": 86.954, - "eval_paws-pos_steps_per_second": 0.34, - "step": 330 + "epoch": 0.09327846364883402, + "grad_norm": 7.248253345489502, + "learning_rate": 0.00012248628884826325, + "loss": 0.3011, + "step": 68 }, { - "epoch": 0.45267489711934156, - "eval_global_dataset_loss": 0.23270446062088013, - "eval_global_dataset_runtime": 125.3499, - "eval_global_dataset_samples_per_second": 9.765, - "eval_global_dataset_steps_per_second": 0.04, - "step": 330 + "epoch": 0.09465020576131687, + "grad_norm": 9.592718124389648, + "learning_rate": 0.0001243144424131627, + "loss": 0.3871, + "step": 69 }, { - "epoch": 0.46776406035665297, - "grad_norm": 0.37502893805503845, - "learning_rate": 0.0006215722120658135, - "loss": 0.3832, - "step": 341 + "epoch": 0.09602194787379972, + "grad_norm": 5.128874778747559, + "learning_rate": 0.00012614259597806216, + "loss": 0.1823, + "step": 70 }, { - "epoch": 0.4828532235939643, - "grad_norm": 2.488353967666626, - "learning_rate": 0.0006416819012797075, - "loss": 0.4392, - "step": 352 + "epoch": 0.09739368998628258, + "grad_norm": 6.496853351593018, + "learning_rate": 0.00012797074954296162, + "loss": 0.3572, + "step": 71 }, { - "epoch": 0.49794238683127573, - "grad_norm": 0.0037633629981428385, - "learning_rate": 0.0006617915904936015, - "loss": 0.3929, - "step": 363 + "epoch": 0.09876543209876543, + "grad_norm": 6.564659118652344, + "learning_rate": 0.00012979890310786104, + "loss": 0.5289, + "step": 72 }, { - "epoch": 0.5130315500685871, - "grad_norm": 0.467970073223114, - "learning_rate": 0.0006819012797074955, - "loss": 0.2611, - "step": 374 + "epoch": 0.10013717421124829, + "grad_norm": 6.480371952056885, + "learning_rate": 0.0001316270566727605, + "loss": 0.3223, + "step": 73 }, { - "epoch": 0.5281207133058985, - "grad_norm": 2.020796537399292, - "learning_rate": 0.0007020109689213894, - "loss": 0.3528, - "step": 385 + "epoch": 0.10150891632373114, + "grad_norm": 7.222306728363037, + "learning_rate": 0.00013345521023765998, + "loss": 0.3247, + "step": 74 }, { - "epoch": 0.5432098765432098, - "grad_norm": 2.0242817401885986, - "learning_rate": 0.0007221206581352834, - "loss": 0.738, - "step": 396 + "epoch": 0.102880658436214, + "grad_norm": 5.406076431274414, + "learning_rate": 0.00013528336380255943, + "loss": 0.2133, + "step": 75 }, { - "epoch": 0.5582990397805213, - "grad_norm": 2.9996001720428467, - "learning_rate": 0.0007422303473491774, - "loss": 0.4016, - "step": 407 + "epoch": 0.10425240054869685, + "grad_norm": 11.029163360595703, + "learning_rate": 0.00013711151736745886, + "loss": 0.8249, + "step": 76 }, { - "epoch": 0.5733882030178327, - "grad_norm": 1.8861972093582153, - "learning_rate": 0.0007623400365630713, - "loss": 0.3589, - "step": 418 + "epoch": 0.1056241426611797, + "grad_norm": 7.284115314483643, + "learning_rate": 0.00013893967093235832, + "loss": 0.4341, + "step": 77 }, { - "epoch": 0.588477366255144, - "grad_norm": 0.24432632327079773, - "learning_rate": 0.0007824497257769653, - "loss": 0.3057, - "step": 429 + "epoch": 0.10699588477366255, + "grad_norm": 6.240738868713379, + "learning_rate": 0.00014076782449725777, + "loss": 0.2932, + "step": 78 }, { - "epoch": 0.6035665294924554, - "grad_norm": 1.1939449310302734, - "learning_rate": 0.0008025594149908593, - "loss": 0.6095, - "step": 440 + "epoch": 0.1083676268861454, + "grad_norm": 0.1745665967464447, + "learning_rate": 0.00014259597806215722, + "loss": 0.0099, + "step": 79 }, { - "epoch": 0.6035665294924554, - "eval_Qnli-dev_cosine_accuracy": 0.7294921875, - "eval_Qnli-dev_cosine_accuracy_threshold": 0.7398079633712769, - "eval_Qnli-dev_cosine_ap": 0.7745603475556573, - "eval_Qnli-dev_cosine_f1": 0.7162790697674419, - "eval_Qnli-dev_cosine_f1_threshold": 0.6965705752372742, - "eval_Qnli-dev_cosine_precision": 0.652542372881356, - "eval_Qnli-dev_cosine_recall": 0.7938144329896907, - "eval_allNLI-dev_cosine_accuracy": 0.7431640625, - "eval_allNLI-dev_cosine_accuracy_threshold": 0.7988871335983276, - "eval_allNLI-dev_cosine_ap": 0.6431097859832409, - "eval_allNLI-dev_cosine_f1": 0.6502793296089385, - "eval_allNLI-dev_cosine_f1_threshold": 0.7334020733833313, - "eval_allNLI-dev_cosine_precision": 0.5233812949640287, - "eval_allNLI-dev_cosine_recall": 0.8584070796460177, - "eval_sequential_score": 0.7745603475556573, - "eval_sts-test_pearson_cosine": 0.8337056827457451, - "eval_sts-test_spearman_cosine": 0.8627129106149749, - "eval_vitaminc-pairs_loss": 1.9911772012710571, - "eval_vitaminc-pairs_runtime": 24.6526, - "eval_vitaminc-pairs_samples_per_second": 10.384, - "eval_vitaminc-pairs_steps_per_second": 0.041, - "step": 440 + "epoch": 0.10973936899862825, + "grad_norm": 5.460353851318359, + "learning_rate": 0.00014442413162705668, + "loss": 0.3348, + "step": 80 }, { - "epoch": 0.6035665294924554, - "eval_negation-triplets_loss": 0.7211357951164246, - "eval_negation-triplets_runtime": 4.2956, - "eval_negation-triplets_samples_per_second": 59.595, - "eval_negation-triplets_steps_per_second": 0.233, - "step": 440 + "epoch": 0.1111111111111111, + "grad_norm": 8.869246482849121, + "learning_rate": 0.00014625228519195613, + "loss": 0.6405, + "step": 81 }, { - "epoch": 0.6035665294924554, - "eval_scitail-pairs-pos_loss": 0.08597714453935623, - "eval_scitail-pairs-pos_runtime": 3.1006, - "eval_scitail-pairs-pos_samples_per_second": 82.565, - "eval_scitail-pairs-pos_steps_per_second": 0.323, - "step": 440 + "epoch": 0.11248285322359397, + "grad_norm": 4.475996971130371, + "learning_rate": 0.0001480804387568556, + "loss": 0.1536, + "step": 82 }, { - "epoch": 0.6035665294924554, - "eval_scitail-pairs-qa_loss": 0.01645529642701149, - "eval_scitail-pairs-qa_runtime": 2.3457, - "eval_scitail-pairs-qa_samples_per_second": 109.136, - "eval_scitail-pairs-qa_steps_per_second": 0.426, - "step": 440 + "epoch": 0.11385459533607682, + "grad_norm": 2.700299024581909, + "learning_rate": 0.00014990859232175501, + "loss": 0.1299, + "step": 83 }, { - "epoch": 0.6035665294924554, - "eval_xsum-pairs_loss": 0.41817042231559753, - "eval_xsum-pairs_runtime": 12.8711, - "eval_xsum-pairs_samples_per_second": 19.89, - "eval_xsum-pairs_steps_per_second": 0.078, - "step": 440 + "epoch": 0.11522633744855967, + "grad_norm": 7.5515618324279785, + "learning_rate": 0.00015173674588665447, + "loss": 0.5863, + "step": 84 }, { - "epoch": 0.6035665294924554, - "eval_sciq_pairs_loss": 0.03533514216542244, - "eval_sciq_pairs_runtime": 20.7828, - "eval_sciq_pairs_samples_per_second": 12.318, - "eval_sciq_pairs_steps_per_second": 0.048, - "step": 440 + "epoch": 0.11659807956104253, + "grad_norm": 9.869407653808594, + "learning_rate": 0.00015356489945155395, + "loss": 0.7205, + "step": 85 }, { - "epoch": 0.6035665294924554, - "eval_qasc_pairs_loss": 0.6740420460700989, - "eval_qasc_pairs_runtime": 3.0138, - "eval_qasc_pairs_samples_per_second": 84.942, - "eval_qasc_pairs_steps_per_second": 0.332, - "step": 440 + "epoch": 0.11796982167352538, + "grad_norm": 8.208423614501953, + "learning_rate": 0.00015539305301645338, + "loss": 0.4052, + "step": 86 }, { - "epoch": 0.6035665294924554, - "eval_openbookqa_pairs_loss": 1.6906702518463135, - "eval_openbookqa_pairs_runtime": 2.257, - "eval_openbookqa_pairs_samples_per_second": 113.425, - "eval_openbookqa_pairs_steps_per_second": 0.443, - "step": 440 + "epoch": 0.11934156378600823, + "grad_norm": 6.408420562744141, + "learning_rate": 0.00015722120658135283, + "loss": 0.3953, + "step": 87 }, { - "epoch": 0.6035665294924554, - "eval_nq_pairs_loss": 0.1867213398218155, - "eval_nq_pairs_runtime": 18.1321, - "eval_nq_pairs_samples_per_second": 14.119, - "eval_nq_pairs_steps_per_second": 0.055, - "step": 440 + "epoch": 0.12071330589849108, + "grad_norm": 7.050099849700928, + "learning_rate": 0.00015904936014625229, + "loss": 0.5598, + "step": 88 }, { - "epoch": 0.6035665294924554, - "eval_trivia_pairs_loss": 0.6214608550071716, - "eval_trivia_pairs_runtime": 17.0099, - "eval_trivia_pairs_samples_per_second": 15.05, - "eval_trivia_pairs_steps_per_second": 0.059, - "step": 440 + "epoch": 0.12208504801097393, + "grad_norm": 5.326991558074951, + "learning_rate": 0.00016087751371115174, + "loss": 0.2856, + "step": 89 }, { - "epoch": 0.6035665294924554, - "eval_gooaq_pairs_loss": 0.4756861925125122, - "eval_gooaq_pairs_runtime": 3.6268, - "eval_gooaq_pairs_samples_per_second": 70.587, - "eval_gooaq_pairs_steps_per_second": 0.276, - "step": 440 + "epoch": 0.12345679012345678, + "grad_norm": 4.510193347930908, + "learning_rate": 0.0001627056672760512, + "loss": 0.2277, + "step": 90 }, { - "epoch": 0.6035665294924554, - "eval_paws-pos_loss": 0.04365835338830948, - "eval_paws-pos_runtime": 2.9421, - "eval_paws-pos_samples_per_second": 87.012, - "eval_paws-pos_steps_per_second": 0.34, - "step": 440 + "epoch": 0.12482853223593965, + "grad_norm": 5.769596576690674, + "learning_rate": 0.00016453382084095065, + "loss": 0.3296, + "step": 91 }, { - "epoch": 0.6035665294924554, - "eval_global_dataset_loss": 0.29543137550354004, - "eval_global_dataset_runtime": 125.3874, - "eval_global_dataset_samples_per_second": 9.762, - "eval_global_dataset_steps_per_second": 0.04, - "step": 440 + "epoch": 0.1262002743484225, + "grad_norm": 6.066390037536621, + "learning_rate": 0.0001663619744058501, + "loss": 0.3079, + "step": 92 }, { - "epoch": 0.6186556927297668, - "grad_norm": 1.2445456981658936, + "epoch": 0.12757201646090535, + "grad_norm": 6.80173921585083, + "learning_rate": 0.00016819012797074953, + "loss": 0.4867, + "step": 93 + }, + { + "epoch": 0.1289437585733882, + "grad_norm": 6.219693183898926, + "learning_rate": 0.00017001828153564899, + "loss": 0.4319, + "step": 94 + }, + { + "epoch": 0.13031550068587106, + "grad_norm": 5.316290855407715, + "learning_rate": 0.00017184643510054847, + "loss": 0.2952, + "step": 95 + }, + { + "epoch": 0.13168724279835392, + "grad_norm": 6.86447811126709, + "learning_rate": 0.00017367458866544792, + "loss": 0.5531, + "step": 96 + }, + { + "epoch": 0.13305898491083676, + "grad_norm": 1.2648167610168457, + "learning_rate": 0.00017550274223034735, + "loss": 0.0296, + "step": 97 + }, + { + "epoch": 0.13443072702331962, + "grad_norm": 8.14661979675293, + "learning_rate": 0.0001773308957952468, + "loss": 0.8536, + "step": 98 + }, + { + "epoch": 0.13580246913580246, + "grad_norm": 8.927884101867676, + "learning_rate": 0.00017915904936014626, + "loss": 0.4879, + "step": 99 + }, + { + "epoch": 0.13717421124828533, + "grad_norm": 9.555243492126465, + "learning_rate": 0.00018098720292504568, + "loss": 0.67, + "step": 100 + }, + { + "epoch": 0.13854595336076816, + "grad_norm": 7.783656120300293, + "learning_rate": 0.00018281535648994517, + "loss": 0.4813, + "step": 101 + }, + { + "epoch": 0.13991769547325103, + "grad_norm": 0.5169872641563416, + "learning_rate": 0.00018464351005484462, + "loss": 0.0488, + "step": 102 + }, + { + "epoch": 0.1412894375857339, + "grad_norm": 6.967692852020264, + "learning_rate": 0.00018647166361974407, + "loss": 0.5388, + "step": 103 + }, + { + "epoch": 0.14266117969821673, + "grad_norm": 6.324373245239258, + "learning_rate": 0.0001882998171846435, + "loss": 0.376, + "step": 104 + }, + { + "epoch": 0.1440329218106996, + "grad_norm": 0.7642683982849121, + "learning_rate": 0.00019012797074954296, + "loss": 0.017, + "step": 105 + }, + { + "epoch": 0.14540466392318244, + "grad_norm": 8.600672721862793, + "learning_rate": 0.00019195612431444244, + "loss": 0.7542, + "step": 106 + }, + { + "epoch": 0.1467764060356653, + "grad_norm": 7.111880302429199, + "learning_rate": 0.00019378427787934186, + "loss": 0.4063, + "step": 107 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 6.075577735900879, + "learning_rate": 0.00019561243144424132, + "loss": 0.3658, + "step": 108 + }, + { + "epoch": 0.149519890260631, + "grad_norm": 6.12313175201416, + "learning_rate": 0.00019744058500914077, + "loss": 0.4389, + "step": 109 + }, + { + "epoch": 0.15089163237311384, + "grad_norm": 5.813235759735107, + "learning_rate": 0.00019926873857404023, + "loss": 0.3803, + "step": 110 + }, + { + "epoch": 0.15089163237311384, + "eval_Qnli-dev_cosine_accuracy": 0.705078125, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6866907477378845, + "eval_Qnli-dev_cosine_ap": 0.7567018413685389, + "eval_Qnli-dev_cosine_f1": 0.6931818181818182, + "eval_Qnli-dev_cosine_f1_threshold": 0.6343963146209717, + "eval_Qnli-dev_cosine_precision": 0.6267123287671232, + "eval_Qnli-dev_cosine_recall": 0.7754237288135594, + "eval_allNLI-dev_cosine_accuracy": 0.76953125, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.7752166986465454, + "eval_allNLI-dev_cosine_ap": 0.6627175481841632, + "eval_allNLI-dev_cosine_f1": 0.6624737945492662, + "eval_allNLI-dev_cosine_f1_threshold": 0.6564935445785522, + "eval_allNLI-dev_cosine_precision": 0.5197368421052632, + "eval_allNLI-dev_cosine_recall": 0.9132947976878613, + "eval_sequential_score": 0.7567018413685389, + "eval_sts-test_pearson_cosine": 0.9026620207137961, + "eval_sts-test_spearman_cosine": 0.913678627606199, + "eval_vitaminc-pairs_loss": 2.009296178817749, + "eval_vitaminc-pairs_runtime": 14.3224, + "eval_vitaminc-pairs_samples_per_second": 8.937, + "eval_vitaminc-pairs_steps_per_second": 0.07, + "step": 110 + }, + { + "epoch": 0.15089163237311384, + "eval_negation-triplets_loss": 1.59572434425354, + "eval_negation-triplets_runtime": 1.1528, + "eval_negation-triplets_samples_per_second": 111.029, + "eval_negation-triplets_steps_per_second": 0.867, + "step": 110 + }, + { + "epoch": 0.15089163237311384, + "eval_scitail-pairs-pos_loss": 0.061776161193847656, + "eval_scitail-pairs-pos_runtime": 1.5728, + "eval_scitail-pairs-pos_samples_per_second": 81.383, + "eval_scitail-pairs-pos_steps_per_second": 0.636, + "step": 110 + }, + { + "epoch": 0.15089163237311384, + "eval_scitail-pairs-qa_loss": 0.009187542833387852, + "eval_scitail-pairs-qa_runtime": 1.2102, + "eval_scitail-pairs-qa_samples_per_second": 105.771, + "eval_scitail-pairs-qa_steps_per_second": 0.826, + "step": 110 + }, + { + "epoch": 0.15089163237311384, + "eval_xsum-pairs_loss": 0.37210211157798767, + "eval_xsum-pairs_runtime": 6.2854, + "eval_xsum-pairs_samples_per_second": 20.365, + "eval_xsum-pairs_steps_per_second": 0.159, + "step": 110 + }, + { + "epoch": 0.15089163237311384, + "eval_sciq_pairs_loss": 0.04122849553823471, + "eval_sciq_pairs_runtime": 8.8116, + "eval_sciq_pairs_samples_per_second": 14.526, + "eval_sciq_pairs_steps_per_second": 0.113, + "step": 110 + }, + { + "epoch": 0.15089163237311384, + "eval_qasc_pairs_loss": 0.4748501479625702, + "eval_qasc_pairs_runtime": 1.3827, + "eval_qasc_pairs_samples_per_second": 92.573, + "eval_qasc_pairs_steps_per_second": 0.723, + "step": 110 + }, + { + "epoch": 0.15089163237311384, + "eval_openbookqa_pairs_loss": 1.1540580987930298, + "eval_openbookqa_pairs_runtime": 1.1788, + "eval_openbookqa_pairs_samples_per_second": 108.581, + "eval_openbookqa_pairs_steps_per_second": 0.848, + "step": 110 + }, + { + "epoch": 0.15089163237311384, + "eval_nq_pairs_loss": 0.2363465428352356, + "eval_nq_pairs_runtime": 7.8515, + "eval_nq_pairs_samples_per_second": 16.303, + "eval_nq_pairs_steps_per_second": 0.127, + "step": 110 + }, + { + "epoch": 0.15089163237311384, + "eval_trivia_pairs_loss": 0.6520176529884338, + "eval_trivia_pairs_runtime": 8.9067, + "eval_trivia_pairs_samples_per_second": 14.371, + "eval_trivia_pairs_steps_per_second": 0.112, + "step": 110 + }, + { + "epoch": 0.15089163237311384, + "eval_gooaq_pairs_loss": 0.22620199620723724, + "eval_gooaq_pairs_runtime": 2.067, + "eval_gooaq_pairs_samples_per_second": 61.924, + "eval_gooaq_pairs_steps_per_second": 0.484, + "step": 110 + }, + { + "epoch": 0.15089163237311384, + "eval_paws-pos_loss": 0.02822125516831875, + "eval_paws-pos_runtime": 1.5117, + "eval_paws-pos_samples_per_second": 84.672, + "eval_paws-pos_steps_per_second": 0.662, + "step": 110 + }, + { + "epoch": 0.15089163237311384, + "eval_global_dataset_loss": 0.30668479204177856, + "eval_global_dataset_runtime": 33.2591, + "eval_global_dataset_samples_per_second": 11.546, + "eval_global_dataset_steps_per_second": 0.06, + "step": 110 + }, + { + "epoch": 0.1522633744855967, + "grad_norm": 4.30504846572876, + "learning_rate": 0.00020109689213893968, + "loss": 0.2478, + "step": 111 + }, + { + "epoch": 0.15363511659807957, + "grad_norm": 6.559568881988525, + "learning_rate": 0.00020292504570383914, + "loss": 0.8402, + "step": 112 + }, + { + "epoch": 0.1550068587105624, + "grad_norm": 5.812280654907227, + "learning_rate": 0.0002047531992687386, + "loss": 0.6608, + "step": 113 + }, + { + "epoch": 0.15637860082304528, + "grad_norm": 2.0805885791778564, + "learning_rate": 0.00020658135283363802, + "loss": 0.0934, + "step": 114 + }, + { + "epoch": 0.15775034293552812, + "grad_norm": 5.199294090270996, + "learning_rate": 0.00020840950639853747, + "loss": 0.3907, + "step": 115 + }, + { + "epoch": 0.15912208504801098, + "grad_norm": 6.3685078620910645, + "learning_rate": 0.00021023765996343693, + "loss": 0.449, + "step": 116 + }, + { + "epoch": 0.16049382716049382, + "grad_norm": 6.4199652671813965, + "learning_rate": 0.00021206581352833638, + "loss": 0.4041, + "step": 117 + }, + { + "epoch": 0.16186556927297668, + "grad_norm": 6.015898704528809, + "learning_rate": 0.00021389396709323584, + "loss": 0.6749, + "step": 118 + }, + { + "epoch": 0.16323731138545952, + "grad_norm": 7.721911430358887, + "learning_rate": 0.0002157221206581353, + "loss": 0.4847, + "step": 119 + }, + { + "epoch": 0.1646090534979424, + "grad_norm": 1.8774610757827759, + "learning_rate": 0.00021755027422303474, + "loss": 0.0526, + "step": 120 + }, + { + "epoch": 0.16598079561042525, + "grad_norm": 8.094359397888184, + "learning_rate": 0.00021937842778793417, + "loss": 0.6795, + "step": 121 + }, + { + "epoch": 0.1673525377229081, + "grad_norm": 0.33090323209762573, + "learning_rate": 0.00022120658135283365, + "loss": 0.0064, + "step": 122 + }, + { + "epoch": 0.16872427983539096, + "grad_norm": 7.3609418869018555, + "learning_rate": 0.0002230347349177331, + "loss": 0.5918, + "step": 123 + }, + { + "epoch": 0.1700960219478738, + "grad_norm": 6.189216613769531, + "learning_rate": 0.00022486288848263253, + "loss": 0.3544, + "step": 124 + }, + { + "epoch": 0.17146776406035666, + "grad_norm": 5.588890075683594, + "learning_rate": 0.000226691042047532, + "loss": 0.3849, + "step": 125 + }, + { + "epoch": 0.1728395061728395, + "grad_norm": 3.4582345485687256, + "learning_rate": 0.00022851919561243144, + "loss": 0.2051, + "step": 126 + }, + { + "epoch": 0.17421124828532236, + "grad_norm": 4.075862407684326, + "learning_rate": 0.00023034734917733092, + "loss": 0.2129, + "step": 127 + }, + { + "epoch": 0.1755829903978052, + "grad_norm": 15.110091209411621, + "learning_rate": 0.00023217550274223035, + "loss": 2.7937, + "step": 128 + }, + { + "epoch": 0.17695473251028807, + "grad_norm": 0.35791516304016113, + "learning_rate": 0.0002340036563071298, + "loss": 0.0166, + "step": 129 + }, + { + "epoch": 0.17832647462277093, + "grad_norm": 7.5200090408325195, + "learning_rate": 0.00023583180987202926, + "loss": 0.7856, + "step": 130 + }, + { + "epoch": 0.17969821673525377, + "grad_norm": 6.566864490509033, + "learning_rate": 0.0002376599634369287, + "loss": 0.8368, + "step": 131 + }, + { + "epoch": 0.18106995884773663, + "grad_norm": 4.958701133728027, + "learning_rate": 0.00023948811700182814, + "loss": 0.3813, + "step": 132 + }, + { + "epoch": 0.18244170096021947, + "grad_norm": 5.745133876800537, + "learning_rate": 0.00024131627056672762, + "loss": 0.5695, + "step": 133 + }, + { + "epoch": 0.18381344307270234, + "grad_norm": 4.952736854553223, + "learning_rate": 0.00024314442413162708, + "loss": 0.351, + "step": 134 + }, + { + "epoch": 0.18518518518518517, + "grad_norm": 5.733601093292236, + "learning_rate": 0.0002449725776965265, + "loss": 0.3821, + "step": 135 + }, + { + "epoch": 0.18655692729766804, + "grad_norm": 5.019097328186035, + "learning_rate": 0.00024680073126142596, + "loss": 0.3249, + "step": 136 + }, + { + "epoch": 0.18792866941015088, + "grad_norm": 5.300777912139893, + "learning_rate": 0.0002486288848263254, + "loss": 0.3404, + "step": 137 + }, + { + "epoch": 0.18930041152263374, + "grad_norm": 4.518141269683838, + "learning_rate": 0.00025045703839122487, + "loss": 0.4535, + "step": 138 + }, + { + "epoch": 0.1906721536351166, + "grad_norm": 1.0158088207244873, + "learning_rate": 0.0002522851919561243, + "loss": 0.0577, + "step": 139 + }, + { + "epoch": 0.19204389574759945, + "grad_norm": 5.966796398162842, + "learning_rate": 0.0002541133455210238, + "loss": 0.7431, + "step": 140 + }, + { + "epoch": 0.1934156378600823, + "grad_norm": 6.123642921447754, + "learning_rate": 0.00025594149908592323, + "loss": 0.6778, + "step": 141 + }, + { + "epoch": 0.19478737997256515, + "grad_norm": 5.842874050140381, + "learning_rate": 0.0002577696526508227, + "loss": 0.5436, + "step": 142 + }, + { + "epoch": 0.19615912208504802, + "grad_norm": 4.759068012237549, + "learning_rate": 0.0002595978062157221, + "loss": 0.3582, + "step": 143 + }, + { + "epoch": 0.19753086419753085, + "grad_norm": 4.080338478088379, + "learning_rate": 0.00026142595978062154, + "loss": 0.316, + "step": 144 + }, + { + "epoch": 0.19890260631001372, + "grad_norm": 4.1391448974609375, + "learning_rate": 0.000263254113345521, + "loss": 0.4446, + "step": 145 + }, + { + "epoch": 0.20027434842249658, + "grad_norm": 5.856256008148193, + "learning_rate": 0.0002650822669104205, + "loss": 0.7792, + "step": 146 + }, + { + "epoch": 0.20164609053497942, + "grad_norm": 7.747331142425537, + "learning_rate": 0.00026691042047531996, + "loss": 1.1147, + "step": 147 + }, + { + "epoch": 0.2030178326474623, + "grad_norm": 6.825289249420166, + "learning_rate": 0.0002687385740402194, + "loss": 0.8267, + "step": 148 + }, + { + "epoch": 0.20438957475994513, + "grad_norm": 7.336719512939453, + "learning_rate": 0.00027056672760511887, + "loss": 0.8149, + "step": 149 + }, + { + "epoch": 0.205761316872428, + "grad_norm": 6.731626510620117, + "learning_rate": 0.00027239488117001827, + "loss": 0.942, + "step": 150 + }, + { + "epoch": 0.20713305898491083, + "grad_norm": 10.727692604064941, + "learning_rate": 0.0002742230347349177, + "loss": 2.4865, + "step": 151 + }, + { + "epoch": 0.2085048010973937, + "grad_norm": 8.583380699157715, + "learning_rate": 0.0002760511882998172, + "loss": 1.0715, + "step": 152 + }, + { + "epoch": 0.20987654320987653, + "grad_norm": 6.236877918243408, + "learning_rate": 0.00027787934186471663, + "loss": 0.6219, + "step": 153 + }, + { + "epoch": 0.2112482853223594, + "grad_norm": 6.254538536071777, + "learning_rate": 0.0002797074954296161, + "loss": 0.8705, + "step": 154 + }, + { + "epoch": 0.21262002743484226, + "grad_norm": 3.0917959213256836, + "learning_rate": 0.00028153564899451554, + "loss": 0.2407, + "step": 155 + }, + { + "epoch": 0.2139917695473251, + "grad_norm": 4.438024997711182, + "learning_rate": 0.000283363802559415, + "loss": 0.4925, + "step": 156 + }, + { + "epoch": 0.21536351165980797, + "grad_norm": 0.43344631791114807, + "learning_rate": 0.00028519195612431445, + "loss": 0.0316, + "step": 157 + }, + { + "epoch": 0.2167352537722908, + "grad_norm": 5.73934268951416, + "learning_rate": 0.0002870201096892139, + "loss": 0.3935, + "step": 158 + }, + { + "epoch": 0.21810699588477367, + "grad_norm": 4.532804012298584, + "learning_rate": 0.00028884826325411336, + "loss": 0.2083, + "step": 159 + }, + { + "epoch": 0.2194787379972565, + "grad_norm": 4.846848487854004, + "learning_rate": 0.0002906764168190128, + "loss": 0.2798, + "step": 160 + }, + { + "epoch": 0.22085048010973937, + "grad_norm": 7.060863018035889, + "learning_rate": 0.00029250457038391227, + "loss": 0.8777, + "step": 161 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.012754157185554504, + "learning_rate": 0.0002943327239488117, + "loss": 0.0002, + "step": 162 + }, + { + "epoch": 0.22359396433470508, + "grad_norm": 4.094379901885986, + "learning_rate": 0.0002961608775137112, + "loss": 0.2736, + "step": 163 + }, + { + "epoch": 0.22496570644718794, + "grad_norm": 10.741785049438477, + "learning_rate": 0.0002979890310786106, + "loss": 2.4185, + "step": 164 + }, + { + "epoch": 0.22633744855967078, + "grad_norm": 4.820891380310059, + "learning_rate": 0.00029981718464351003, + "loss": 0.7767, + "step": 165 + }, + { + "epoch": 0.22770919067215364, + "grad_norm": 6.423076152801514, + "learning_rate": 0.0003016453382084095, + "loss": 0.7971, + "step": 166 + }, + { + "epoch": 0.22908093278463648, + "grad_norm": 4.492727756500244, + "learning_rate": 0.00030347349177330894, + "loss": 0.4535, + "step": 167 + }, + { + "epoch": 0.23045267489711935, + "grad_norm": 5.301379680633545, + "learning_rate": 0.00030530164533820845, + "loss": 0.6654, + "step": 168 + }, + { + "epoch": 0.23182441700960219, + "grad_norm": 5.155853748321533, + "learning_rate": 0.0003071297989031079, + "loss": 0.3985, + "step": 169 + }, + { + "epoch": 0.23319615912208505, + "grad_norm": 0.4378865361213684, + "learning_rate": 0.00030895795246800735, + "loss": 0.0338, + "step": 170 + }, + { + "epoch": 0.2345679012345679, + "grad_norm": 4.022473335266113, + "learning_rate": 0.00031078610603290675, + "loss": 0.1834, + "step": 171 + }, + { + "epoch": 0.23593964334705075, + "grad_norm": 7.863429069519043, + "learning_rate": 0.0003126142595978062, + "loss": 0.603, + "step": 172 + }, + { + "epoch": 0.23731138545953362, + "grad_norm": 8.951998710632324, + "learning_rate": 0.00031444241316270566, + "loss": 0.7871, + "step": 173 + }, + { + "epoch": 0.23868312757201646, + "grad_norm": 6.265102386474609, + "learning_rate": 0.0003162705667276051, + "loss": 0.4304, + "step": 174 + }, + { + "epoch": 0.24005486968449932, + "grad_norm": 6.6486005783081055, + "learning_rate": 0.00031809872029250457, + "loss": 0.649, + "step": 175 + }, + { + "epoch": 0.24142661179698216, + "grad_norm": 0.47100114822387695, + "learning_rate": 0.000319926873857404, + "loss": 0.048, + "step": 176 + }, + { + "epoch": 0.24279835390946503, + "grad_norm": 4.884115695953369, + "learning_rate": 0.0003217550274223035, + "loss": 0.4079, + "step": 177 + }, + { + "epoch": 0.24417009602194786, + "grad_norm": 4.508667469024658, + "learning_rate": 0.0003235831809872029, + "loss": 0.4627, + "step": 178 + }, + { + "epoch": 0.24554183813443073, + "grad_norm": 3.22367262840271, + "learning_rate": 0.0003254113345521024, + "loss": 0.3703, + "step": 179 + }, + { + "epoch": 0.24691358024691357, + "grad_norm": 7.695303916931152, + "learning_rate": 0.00032723948811700184, + "loss": 0.8343, + "step": 180 + }, + { + "epoch": 0.24828532235939643, + "grad_norm": 7.249318599700928, + "learning_rate": 0.0003290676416819013, + "loss": 0.692, + "step": 181 + }, + { + "epoch": 0.2496570644718793, + "grad_norm": 11.686202049255371, + "learning_rate": 0.00033089579524680075, + "loss": 2.7071, + "step": 182 + }, + { + "epoch": 0.25102880658436216, + "grad_norm": 6.061092376708984, + "learning_rate": 0.0003327239488117002, + "loss": 0.8451, + "step": 183 + }, + { + "epoch": 0.252400548696845, + "grad_norm": 5.932607650756836, + "learning_rate": 0.00033455210237659966, + "loss": 0.635, + "step": 184 + }, + { + "epoch": 0.25377229080932784, + "grad_norm": 3.491114616394043, + "learning_rate": 0.00033638025594149906, + "loss": 0.312, + "step": 185 + }, + { + "epoch": 0.2551440329218107, + "grad_norm": 6.4914164543151855, + "learning_rate": 0.0003382084095063985, + "loss": 0.6996, + "step": 186 + }, + { + "epoch": 0.25651577503429357, + "grad_norm": 6.15857458114624, + "learning_rate": 0.00034003656307129797, + "loss": 0.4432, + "step": 187 + }, + { + "epoch": 0.2578875171467764, + "grad_norm": 4.767185211181641, + "learning_rate": 0.0003418647166361974, + "loss": 0.375, + "step": 188 + }, + { + "epoch": 0.25925925925925924, + "grad_norm": 7.944342613220215, + "learning_rate": 0.00034369287020109693, + "loss": 0.9366, + "step": 189 + }, + { + "epoch": 0.2606310013717421, + "grad_norm": 6.573953628540039, + "learning_rate": 0.0003455210237659964, + "loss": 0.755, + "step": 190 + }, + { + "epoch": 0.262002743484225, + "grad_norm": 4.173367023468018, + "learning_rate": 0.00034734917733089584, + "loss": 0.6068, + "step": 191 + }, + { + "epoch": 0.26337448559670784, + "grad_norm": 5.26171875, + "learning_rate": 0.00034917733089579524, + "loss": 0.5336, + "step": 192 + }, + { + "epoch": 0.26474622770919065, + "grad_norm": 6.669304370880127, + "learning_rate": 0.0003510054844606947, + "loss": 0.8783, + "step": 193 + }, + { + "epoch": 0.2661179698216735, + "grad_norm": 4.4192938804626465, + "learning_rate": 0.00035283363802559415, + "loss": 0.3576, + "step": 194 + }, + { + "epoch": 0.2674897119341564, + "grad_norm": 10.117819786071777, + "learning_rate": 0.0003546617915904936, + "loss": 2.1854, + "step": 195 + }, + { + "epoch": 0.26886145404663925, + "grad_norm": 5.256247520446777, + "learning_rate": 0.00035648994515539306, + "loss": 0.7835, + "step": 196 + }, + { + "epoch": 0.27023319615912206, + "grad_norm": 5.784887313842773, + "learning_rate": 0.0003583180987202925, + "loss": 0.5668, + "step": 197 + }, + { + "epoch": 0.2716049382716049, + "grad_norm": 4.977567672729492, + "learning_rate": 0.00036014625228519197, + "loss": 0.7033, + "step": 198 + }, + { + "epoch": 0.2729766803840878, + "grad_norm": 0.011424711905419827, + "learning_rate": 0.00036197440585009137, + "loss": 0.0002, + "step": 199 + }, + { + "epoch": 0.27434842249657065, + "grad_norm": 5.805008411407471, + "learning_rate": 0.0003638025594149909, + "loss": 0.5791, + "step": 200 + }, + { + "epoch": 0.2757201646090535, + "grad_norm": 3.8826043605804443, + "learning_rate": 0.00036563071297989033, + "loss": 0.2697, + "step": 201 + }, + { + "epoch": 0.27709190672153633, + "grad_norm": 6.563521385192871, + "learning_rate": 0.0003674588665447898, + "loss": 0.6261, + "step": 202 + }, + { + "epoch": 0.2784636488340192, + "grad_norm": 4.584529399871826, + "learning_rate": 0.00036928702010968924, + "loss": 0.3253, + "step": 203 + }, + { + "epoch": 0.27983539094650206, + "grad_norm": 6.636009216308594, + "learning_rate": 0.0003711151736745887, + "loss": 0.8323, + "step": 204 + }, + { + "epoch": 0.2812071330589849, + "grad_norm": 5.0911359786987305, + "learning_rate": 0.00037294332723948815, + "loss": 0.4472, + "step": 205 + }, + { + "epoch": 0.2825788751714678, + "grad_norm": 3.9219255447387695, + "learning_rate": 0.00037477148080438755, + "loss": 0.3342, + "step": 206 + }, + { + "epoch": 0.2839506172839506, + "grad_norm": 5.114777565002441, + "learning_rate": 0.000376599634369287, + "loss": 0.6313, + "step": 207 + }, + { + "epoch": 0.28532235939643347, + "grad_norm": 0.3298715353012085, + "learning_rate": 0.00037842778793418646, + "loss": 0.059, + "step": 208 + }, + { + "epoch": 0.28669410150891633, + "grad_norm": 1.5965046882629395, + "learning_rate": 0.0003802559414990859, + "loss": 0.1195, + "step": 209 + }, + { + "epoch": 0.2880658436213992, + "grad_norm": 0.39121323823928833, + "learning_rate": 0.00038208409506398537, + "loss": 0.0296, + "step": 210 + }, + { + "epoch": 0.289437585733882, + "grad_norm": 4.317224025726318, + "learning_rate": 0.0003839122486288849, + "loss": 0.5316, + "step": 211 + }, + { + "epoch": 0.2908093278463649, + "grad_norm": 4.000308036804199, + "learning_rate": 0.00038574040219378433, + "loss": 0.5201, + "step": 212 + }, + { + "epoch": 0.29218106995884774, + "grad_norm": 6.2192301750183105, + "learning_rate": 0.00038756855575868373, + "loss": 0.6602, + "step": 213 + }, + { + "epoch": 0.2935528120713306, + "grad_norm": 6.702320098876953, + "learning_rate": 0.0003893967093235832, + "loss": 0.9578, + "step": 214 + }, + { + "epoch": 0.29492455418381347, + "grad_norm": 3.9136242866516113, + "learning_rate": 0.00039122486288848264, + "loss": 0.2089, + "step": 215 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 6.901303768157959, + "learning_rate": 0.0003930530164533821, + "loss": 1.2112, + "step": 216 + }, + { + "epoch": 0.29766803840877915, + "grad_norm": 4.04884672164917, + "learning_rate": 0.00039488117001828155, + "loss": 0.3294, + "step": 217 + }, + { + "epoch": 0.299039780521262, + "grad_norm": 5.46201753616333, + "learning_rate": 0.000396709323583181, + "loss": 0.867, + "step": 218 + }, + { + "epoch": 0.3004115226337449, + "grad_norm": 5.559458255767822, + "learning_rate": 0.00039853747714808046, + "loss": 1.1745, + "step": 219 + }, + { + "epoch": 0.3017832647462277, + "grad_norm": 4.930731296539307, + "learning_rate": 0.00040036563071297986, + "loss": 0.7287, + "step": 220 + }, + { + "epoch": 0.3017832647462277, + "eval_Qnli-dev_cosine_accuracy": 0.736328125, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.8059661388397217, + "eval_Qnli-dev_cosine_ap": 0.7693397822540732, + "eval_Qnli-dev_cosine_f1": 0.720136518771331, + "eval_Qnli-dev_cosine_f1_threshold": 0.7346209287643433, + "eval_Qnli-dev_cosine_precision": 0.6028571428571429, + "eval_Qnli-dev_cosine_recall": 0.8940677966101694, + "eval_allNLI-dev_cosine_accuracy": 0.751953125, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.8584632873535156, + "eval_allNLI-dev_cosine_ap": 0.6542482370347211, + "eval_allNLI-dev_cosine_f1": 0.6681127982646421, + "eval_allNLI-dev_cosine_f1_threshold": 0.7551147937774658, + "eval_allNLI-dev_cosine_precision": 0.5347222222222222, + "eval_allNLI-dev_cosine_recall": 0.8901734104046243, + "eval_sequential_score": 0.7693397822540732, + "eval_sts-test_pearson_cosine": 0.8635423204589071, + "eval_sts-test_spearman_cosine": 0.8907274956890058, + "eval_vitaminc-pairs_loss": 2.5131773948669434, + "eval_vitaminc-pairs_runtime": 14.3452, + "eval_vitaminc-pairs_samples_per_second": 8.923, + "eval_vitaminc-pairs_steps_per_second": 0.07, + "step": 220 + }, + { + "epoch": 0.3017832647462277, + "eval_negation-triplets_loss": 1.5992437601089478, + "eval_negation-triplets_runtime": 1.1695, + "eval_negation-triplets_samples_per_second": 109.451, + "eval_negation-triplets_steps_per_second": 0.855, + "step": 220 + }, + { + "epoch": 0.3017832647462277, + "eval_scitail-pairs-pos_loss": 0.12100159376859665, + "eval_scitail-pairs-pos_runtime": 1.5843, + "eval_scitail-pairs-pos_samples_per_second": 80.791, + "eval_scitail-pairs-pos_steps_per_second": 0.631, + "step": 220 + }, + { + "epoch": 0.3017832647462277, + "eval_scitail-pairs-qa_loss": 0.029047677293419838, + "eval_scitail-pairs-qa_runtime": 1.2089, + "eval_scitail-pairs-qa_samples_per_second": 105.881, + "eval_scitail-pairs-qa_steps_per_second": 0.827, + "step": 220 + }, + { + "epoch": 0.3017832647462277, + "eval_xsum-pairs_loss": 0.6063941717147827, + "eval_xsum-pairs_runtime": 6.3291, + "eval_xsum-pairs_samples_per_second": 20.224, + "eval_xsum-pairs_steps_per_second": 0.158, + "step": 220 + }, + { + "epoch": 0.3017832647462277, + "eval_sciq_pairs_loss": 0.05205194652080536, + "eval_sciq_pairs_runtime": 8.8533, + "eval_sciq_pairs_samples_per_second": 14.458, + "eval_sciq_pairs_steps_per_second": 0.113, + "step": 220 + }, + { + "epoch": 0.3017832647462277, + "eval_qasc_pairs_loss": 0.7244825959205627, + "eval_qasc_pairs_runtime": 1.3845, + "eval_qasc_pairs_samples_per_second": 92.449, + "eval_qasc_pairs_steps_per_second": 0.722, + "step": 220 + }, + { + "epoch": 0.3017832647462277, + "eval_openbookqa_pairs_loss": 1.4260488748550415, + "eval_openbookqa_pairs_runtime": 1.1784, + "eval_openbookqa_pairs_samples_per_second": 108.623, + "eval_openbookqa_pairs_steps_per_second": 0.849, + "step": 220 + }, + { + "epoch": 0.3017832647462277, + "eval_nq_pairs_loss": 0.7104523777961731, + "eval_nq_pairs_runtime": 7.8911, + "eval_nq_pairs_samples_per_second": 16.221, + "eval_nq_pairs_steps_per_second": 0.127, + "step": 220 + }, + { + "epoch": 0.3017832647462277, + "eval_trivia_pairs_loss": 0.8537120223045349, + "eval_trivia_pairs_runtime": 8.9305, + "eval_trivia_pairs_samples_per_second": 14.333, + "eval_trivia_pairs_steps_per_second": 0.112, + "step": 220 + }, + { + "epoch": 0.3017832647462277, + "eval_gooaq_pairs_loss": 0.5029886960983276, + "eval_gooaq_pairs_runtime": 2.0675, + "eval_gooaq_pairs_samples_per_second": 61.91, + "eval_gooaq_pairs_steps_per_second": 0.484, + "step": 220 + }, + { + "epoch": 0.3017832647462277, + "eval_paws-pos_loss": 0.03150199353694916, + "eval_paws-pos_runtime": 1.5078, + "eval_paws-pos_samples_per_second": 84.892, + "eval_paws-pos_steps_per_second": 0.663, + "step": 220 + }, + { + "epoch": 0.3017832647462277, + "eval_global_dataset_loss": 0.18923546373844147, + "eval_global_dataset_runtime": 33.3421, + "eval_global_dataset_samples_per_second": 11.517, + "eval_global_dataset_steps_per_second": 0.06, + "step": 220 + }, + { + "epoch": 0.30315500685871055, + "grad_norm": 3.582907199859619, + "learning_rate": 0.00040219378427787936, + "loss": 0.5484, + "step": 221 + }, + { + "epoch": 0.3045267489711934, + "grad_norm": 4.960206031799316, + "learning_rate": 0.0004040219378427788, + "loss": 0.9396, + "step": 222 + }, + { + "epoch": 0.3058984910836763, + "grad_norm": 4.219746112823486, + "learning_rate": 0.0004058500914076783, + "loss": 0.4335, + "step": 223 + }, + { + "epoch": 0.30727023319615915, + "grad_norm": 6.449894428253174, + "learning_rate": 0.00040767824497257773, + "loss": 0.9026, + "step": 224 + }, + { + "epoch": 0.30864197530864196, + "grad_norm": 6.239223003387451, + "learning_rate": 0.0004095063985374772, + "loss": 0.7214, + "step": 225 + }, + { + "epoch": 0.3100137174211248, + "grad_norm": 4.418921947479248, + "learning_rate": 0.00041133455210237664, + "loss": 0.4794, + "step": 226 + }, + { + "epoch": 0.3113854595336077, + "grad_norm": 0.012527555227279663, + "learning_rate": 0.00041316270566727604, + "loss": 0.0003, + "step": 227 + }, + { + "epoch": 0.31275720164609055, + "grad_norm": 2.672603130340576, + "learning_rate": 0.0004149908592321755, + "loss": 0.3003, + "step": 228 + }, + { + "epoch": 0.31412894375857336, + "grad_norm": 4.433743476867676, + "learning_rate": 0.00041681901279707495, + "loss": 0.4667, + "step": 229 + }, + { + "epoch": 0.31550068587105623, + "grad_norm": 4.458980083465576, + "learning_rate": 0.0004186471663619744, + "loss": 0.5006, + "step": 230 + }, + { + "epoch": 0.3168724279835391, + "grad_norm": 5.0898237228393555, + "learning_rate": 0.00042047531992687385, + "loss": 0.5555, + "step": 231 + }, + { + "epoch": 0.31824417009602196, + "grad_norm": 4.338139533996582, + "learning_rate": 0.00042230347349177336, + "loss": 0.4437, + "step": 232 + }, + { + "epoch": 0.3196159122085048, + "grad_norm": 5.023694038391113, + "learning_rate": 0.00042413162705667276, + "loss": 0.8813, + "step": 233 + }, + { + "epoch": 0.32098765432098764, + "grad_norm": 6.410233497619629, + "learning_rate": 0.0004259597806215722, + "loss": 1.1836, + "step": 234 + }, + { + "epoch": 0.3223593964334705, + "grad_norm": 3.8459813594818115, + "learning_rate": 0.00042778793418647167, + "loss": 0.3176, + "step": 235 + }, + { + "epoch": 0.32373113854595337, + "grad_norm": 5.539570331573486, + "learning_rate": 0.0004296160877513711, + "loss": 0.6248, + "step": 236 + }, + { + "epoch": 0.32510288065843623, + "grad_norm": 3.5191774368286133, + "learning_rate": 0.0004314442413162706, + "loss": 0.3623, + "step": 237 + }, + { + "epoch": 0.32647462277091904, + "grad_norm": 3.2997043132781982, + "learning_rate": 0.00043327239488117003, + "loss": 0.3205, + "step": 238 + }, + { + "epoch": 0.3278463648834019, + "grad_norm": 3.9236536026000977, + "learning_rate": 0.0004351005484460695, + "loss": 0.439, + "step": 239 + }, + { + "epoch": 0.3292181069958848, + "grad_norm": 3.9750499725341797, + "learning_rate": 0.0004369287020109689, + "loss": 0.653, + "step": 240 + }, + { + "epoch": 0.33058984910836764, + "grad_norm": 4.344120502471924, + "learning_rate": 0.00043875685557586834, + "loss": 0.5743, + "step": 241 + }, + { + "epoch": 0.3319615912208505, + "grad_norm": 3.2905893325805664, + "learning_rate": 0.0004405850091407678, + "loss": 0.5844, + "step": 242 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 5.996461391448975, + "learning_rate": 0.0004424131627056673, + "loss": 0.9451, + "step": 243 + }, + { + "epoch": 0.3347050754458162, + "grad_norm": 4.085718631744385, + "learning_rate": 0.00044424131627056676, + "loss": 0.7071, + "step": 244 + }, + { + "epoch": 0.33607681755829905, + "grad_norm": 0.26297450065612793, + "learning_rate": 0.0004460694698354662, + "loss": 0.0226, + "step": 245 + }, + { + "epoch": 0.3374485596707819, + "grad_norm": 5.191401481628418, + "learning_rate": 0.00044789762340036567, + "loss": 1.0585, + "step": 246 + }, + { + "epoch": 0.3388203017832647, + "grad_norm": 5.426116466522217, + "learning_rate": 0.00044972577696526507, + "loss": 1.0764, + "step": 247 + }, + { + "epoch": 0.3401920438957476, + "grad_norm": 0.17406082153320312, + "learning_rate": 0.0004515539305301645, + "loss": 0.0289, + "step": 248 + }, + { + "epoch": 0.34156378600823045, + "grad_norm": 4.900349140167236, + "learning_rate": 0.000453382084095064, + "loss": 0.5588, + "step": 249 + }, + { + "epoch": 0.3429355281207133, + "grad_norm": 5.373581409454346, + "learning_rate": 0.00045521023765996343, + "loss": 0.7509, + "step": 250 + }, + { + "epoch": 0.3443072702331962, + "grad_norm": 5.4629106521606445, + "learning_rate": 0.0004570383912248629, + "loss": 0.8388, + "step": 251 + }, + { + "epoch": 0.345679012345679, + "grad_norm": 4.140360355377197, + "learning_rate": 0.00045886654478976234, + "loss": 0.5444, + "step": 252 + }, + { + "epoch": 0.34705075445816186, + "grad_norm": 5.176646709442139, + "learning_rate": 0.00046069469835466185, + "loss": 1.2432, + "step": 253 + }, + { + "epoch": 0.3484224965706447, + "grad_norm": 5.172772407531738, + "learning_rate": 0.00046252285191956125, + "loss": 0.8329, + "step": 254 + }, + { + "epoch": 0.3497942386831276, + "grad_norm": 4.968120574951172, + "learning_rate": 0.0004643510054844607, + "loss": 0.7158, + "step": 255 + }, + { + "epoch": 0.3511659807956104, + "grad_norm": 5.91867208480835, + "learning_rate": 0.00046617915904936016, + "loss": 0.935, + "step": 256 + }, + { + "epoch": 0.35253772290809327, + "grad_norm": 6.223313808441162, + "learning_rate": 0.0004680073126142596, + "loss": 0.8777, + "step": 257 + }, + { + "epoch": 0.35390946502057613, + "grad_norm": 11.474114418029785, + "learning_rate": 0.00046983546617915907, + "loss": 3.1178, + "step": 258 + }, + { + "epoch": 0.355281207133059, + "grad_norm": 6.462806701660156, + "learning_rate": 0.0004716636197440585, + "loss": 0.9418, + "step": 259 + }, + { + "epoch": 0.35665294924554186, + "grad_norm": 5.5286173820495605, + "learning_rate": 0.000473491773308958, + "loss": 0.6252, + "step": 260 + }, + { + "epoch": 0.35802469135802467, + "grad_norm": 8.872392654418945, + "learning_rate": 0.0004753199268738574, + "loss": 2.9074, + "step": 261 + }, + { + "epoch": 0.35939643347050754, + "grad_norm": 2.4954137802124023, + "learning_rate": 0.00047714808043875683, + "loss": 0.303, + "step": 262 + }, + { + "epoch": 0.3607681755829904, + "grad_norm": 3.22896671295166, + "learning_rate": 0.0004789762340036563, + "loss": 0.7146, + "step": 263 + }, + { + "epoch": 0.36213991769547327, + "grad_norm": 4.280979156494141, + "learning_rate": 0.0004808043875685558, + "loss": 0.6848, + "step": 264 + }, + { + "epoch": 0.3635116598079561, + "grad_norm": 3.9235236644744873, + "learning_rate": 0.00048263254113345525, + "loss": 0.47, + "step": 265 + }, + { + "epoch": 0.36488340192043894, + "grad_norm": 3.460500717163086, + "learning_rate": 0.0004844606946983547, + "loss": 0.5762, + "step": 266 + }, + { + "epoch": 0.3662551440329218, + "grad_norm": 11.026795387268066, + "learning_rate": 0.00048628884826325416, + "loss": 3.3872, + "step": 267 + }, + { + "epoch": 0.3676268861454047, + "grad_norm": 3.2708449363708496, + "learning_rate": 0.00048811700182815356, + "loss": 0.4195, + "step": 268 + }, + { + "epoch": 0.36899862825788754, + "grad_norm": 5.177340030670166, + "learning_rate": 0.000489945155393053, + "loss": 1.2292, + "step": 269 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 5.413723945617676, + "learning_rate": 0.0004917733089579525, + "loss": 1.1249, + "step": 270 + }, + { + "epoch": 0.3717421124828532, + "grad_norm": 4.922053337097168, + "learning_rate": 0.0004936014625228519, + "loss": 1.0863, + "step": 271 + }, + { + "epoch": 0.3731138545953361, + "grad_norm": 4.433996677398682, + "learning_rate": 0.0004954296160877514, + "loss": 0.9361, + "step": 272 + }, + { + "epoch": 0.37448559670781895, + "grad_norm": 5.205246448516846, + "learning_rate": 0.0004972577696526508, + "loss": 0.7965, + "step": 273 + }, + { + "epoch": 0.37585733882030176, + "grad_norm": 4.139344215393066, + "learning_rate": 0.0004990859232175503, + "loss": 0.7914, + "step": 274 + }, + { + "epoch": 0.3772290809327846, + "grad_norm": 0.1397838592529297, + "learning_rate": 0.0005009140767824497, + "loss": 0.0027, + "step": 275 + }, + { + "epoch": 0.3786008230452675, + "grad_norm": 3.3880808353424072, + "learning_rate": 0.0005027422303473492, + "loss": 0.6585, + "step": 276 + }, + { + "epoch": 0.37997256515775035, + "grad_norm": 4.524999141693115, + "learning_rate": 0.0005045703839122486, + "loss": 0.5388, + "step": 277 + }, + { + "epoch": 0.3813443072702332, + "grad_norm": 6.445588111877441, + "learning_rate": 0.000506398537477148, + "loss": 1.238, + "step": 278 + }, + { + "epoch": 0.38271604938271603, + "grad_norm": 2.8341169357299805, + "learning_rate": 0.0005082266910420476, + "loss": 0.3782, + "step": 279 + }, + { + "epoch": 0.3840877914951989, + "grad_norm": 5.0054240226745605, + "learning_rate": 0.0005100548446069469, + "loss": 1.1769, + "step": 280 + }, + { + "epoch": 0.38545953360768176, + "grad_norm": 3.6371870040893555, + "learning_rate": 0.0005118829981718465, + "loss": 0.4808, + "step": 281 + }, + { + "epoch": 0.3868312757201646, + "grad_norm": 8.928922653198242, + "learning_rate": 0.0005137111517367459, + "loss": 3.2104, + "step": 282 + }, + { + "epoch": 0.38820301783264743, + "grad_norm": 2.7686235904693604, + "learning_rate": 0.0005155393053016454, + "loss": 0.3027, + "step": 283 + }, + { + "epoch": 0.3895747599451303, + "grad_norm": 1.498235821723938, + "learning_rate": 0.0005173674588665448, + "loss": 0.1422, + "step": 284 + }, + { + "epoch": 0.39094650205761317, + "grad_norm": 3.578543186187744, + "learning_rate": 0.0005191956124314442, + "loss": 0.6059, + "step": 285 + }, + { + "epoch": 0.39231824417009603, + "grad_norm": 2.900531053543091, + "learning_rate": 0.0005210237659963437, + "loss": 0.3491, + "step": 286 + }, + { + "epoch": 0.3936899862825789, + "grad_norm": 5.693866729736328, + "learning_rate": 0.0005228519195612431, + "loss": 1.1603, + "step": 287 + }, + { + "epoch": 0.3950617283950617, + "grad_norm": 3.7944750785827637, + "learning_rate": 0.0005246800731261426, + "loss": 0.5784, + "step": 288 + }, + { + "epoch": 0.39643347050754457, + "grad_norm": 4.433256149291992, + "learning_rate": 0.000526508226691042, + "loss": 0.6532, + "step": 289 + }, + { + "epoch": 0.39780521262002744, + "grad_norm": 3.986520290374756, + "learning_rate": 0.0005283363802559416, + "loss": 0.5613, + "step": 290 + }, + { + "epoch": 0.3991769547325103, + "grad_norm": 4.399818420410156, + "learning_rate": 0.000530164533820841, + "loss": 0.8469, + "step": 291 + }, + { + "epoch": 0.40054869684499317, + "grad_norm": 3.0586366653442383, + "learning_rate": 0.0005319926873857404, + "loss": 0.4484, + "step": 292 + }, + { + "epoch": 0.401920438957476, + "grad_norm": 0.10376634448766708, + "learning_rate": 0.0005338208409506399, + "loss": 0.0034, + "step": 293 + }, + { + "epoch": 0.40329218106995884, + "grad_norm": 8.303990364074707, + "learning_rate": 0.0005356489945155393, + "loss": 3.0798, + "step": 294 + }, + { + "epoch": 0.4046639231824417, + "grad_norm": 4.507086277008057, + "learning_rate": 0.0005374771480804388, + "loss": 0.8632, + "step": 295 + }, + { + "epoch": 0.4060356652949246, + "grad_norm": 5.233419895172119, + "learning_rate": 0.0005393053016453382, + "loss": 1.2788, + "step": 296 + }, + { + "epoch": 0.4074074074074074, + "grad_norm": 4.281100273132324, + "learning_rate": 0.0005411334552102377, + "loss": 1.2439, + "step": 297 + }, + { + "epoch": 0.40877914951989025, + "grad_norm": 1.219199299812317, + "learning_rate": 0.0005429616087751371, + "loss": 0.1067, + "step": 298 + }, + { + "epoch": 0.4101508916323731, + "grad_norm": 3.0503711700439453, + "learning_rate": 0.0005447897623400365, + "loss": 0.4197, + "step": 299 + }, + { + "epoch": 0.411522633744856, + "grad_norm": 3.5557351112365723, + "learning_rate": 0.000546617915904936, + "loss": 0.56, + "step": 300 + }, + { + "epoch": 0.41289437585733885, + "grad_norm": 4.1112470626831055, + "learning_rate": 0.0005484460694698354, + "loss": 0.7235, + "step": 301 + }, + { + "epoch": 0.41426611796982166, + "grad_norm": 2.915947675704956, + "learning_rate": 0.000550274223034735, + "loss": 0.3506, + "step": 302 + }, + { + "epoch": 0.4156378600823045, + "grad_norm": 4.125770568847656, + "learning_rate": 0.0005521023765996344, + "loss": 0.6808, + "step": 303 + }, + { + "epoch": 0.4170096021947874, + "grad_norm": 5.084654808044434, + "learning_rate": 0.0005539305301645339, + "loss": 0.9406, + "step": 304 + }, + { + "epoch": 0.41838134430727025, + "grad_norm": 4.542891025543213, + "learning_rate": 0.0005557586837294333, + "loss": 0.6707, + "step": 305 + }, + { + "epoch": 0.41975308641975306, + "grad_norm": 4.285159587860107, + "learning_rate": 0.0005575868372943327, + "loss": 1.0718, + "step": 306 + }, + { + "epoch": 0.42112482853223593, + "grad_norm": 5.053350925445557, + "learning_rate": 0.0005594149908592322, + "loss": 0.9847, + "step": 307 + }, + { + "epoch": 0.4224965706447188, + "grad_norm": 1.4923471212387085, + "learning_rate": 0.0005612431444241316, + "loss": 0.122, + "step": 308 + }, + { + "epoch": 0.42386831275720166, + "grad_norm": 4.332481384277344, + "learning_rate": 0.0005630712979890311, + "loss": 0.8221, + "step": 309 + }, + { + "epoch": 0.4252400548696845, + "grad_norm": 4.23899507522583, + "learning_rate": 0.0005648994515539305, + "loss": 0.9891, + "step": 310 + }, + { + "epoch": 0.42661179698216734, + "grad_norm": 4.370994567871094, + "learning_rate": 0.00056672760511883, + "loss": 0.926, + "step": 311 + }, + { + "epoch": 0.4279835390946502, + "grad_norm": 0.23886460065841675, + "learning_rate": 0.0005685557586837294, + "loss": 0.0151, + "step": 312 + }, + { + "epoch": 0.42935528120713307, + "grad_norm": 7.701839447021484, + "learning_rate": 0.0005703839122486289, + "loss": 2.8429, + "step": 313 + }, + { + "epoch": 0.43072702331961593, + "grad_norm": 4.738073825836182, + "learning_rate": 0.0005722120658135283, + "loss": 0.9917, + "step": 314 + }, + { + "epoch": 0.43209876543209874, + "grad_norm": 3.7907347679138184, + "learning_rate": 0.0005740402193784278, + "loss": 0.9199, + "step": 315 + }, + { + "epoch": 0.4334705075445816, + "grad_norm": 2.7892837524414062, + "learning_rate": 0.0005758683729433273, + "loss": 0.3931, + "step": 316 + }, + { + "epoch": 0.4348422496570645, + "grad_norm": 3.414641857147217, + "learning_rate": 0.0005776965265082267, + "loss": 0.7119, + "step": 317 + }, + { + "epoch": 0.43621399176954734, + "grad_norm": 0.023860761895775795, + "learning_rate": 0.0005795246800731262, + "loss": 0.0008, + "step": 318 + }, + { + "epoch": 0.4375857338820302, + "grad_norm": 3.0215470790863037, + "learning_rate": 0.0005813528336380256, + "loss": 0.4985, + "step": 319 + }, + { + "epoch": 0.438957475994513, + "grad_norm": 3.1884336471557617, + "learning_rate": 0.000583180987202925, + "loss": 0.7732, + "step": 320 + }, + { + "epoch": 0.4403292181069959, + "grad_norm": 3.4848649501800537, + "learning_rate": 0.0005850091407678245, + "loss": 0.7515, + "step": 321 + }, + { + "epoch": 0.44170096021947874, + "grad_norm": 0.9857578873634338, + "learning_rate": 0.0005868372943327239, + "loss": 0.0885, + "step": 322 + }, + { + "epoch": 0.4430727023319616, + "grad_norm": 4.590999126434326, + "learning_rate": 0.0005886654478976234, + "loss": 1.1677, + "step": 323 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 4.161310195922852, + "learning_rate": 0.0005904936014625229, + "loss": 0.7439, + "step": 324 + }, + { + "epoch": 0.4458161865569273, + "grad_norm": 4.842323303222656, + "learning_rate": 0.0005923217550274223, + "loss": 1.3473, + "step": 325 + }, + { + "epoch": 0.44718792866941015, + "grad_norm": 3.332562208175659, + "learning_rate": 0.0005941499085923218, + "loss": 0.7273, + "step": 326 + }, + { + "epoch": 0.448559670781893, + "grad_norm": 4.295160293579102, + "learning_rate": 0.0005959780621572211, + "loss": 0.843, + "step": 327 + }, + { + "epoch": 0.4499314128943759, + "grad_norm": 3.641636848449707, + "learning_rate": 0.0005978062157221207, + "loss": 0.7881, + "step": 328 + }, + { + "epoch": 0.4513031550068587, + "grad_norm": 2.767233371734619, + "learning_rate": 0.0005996343692870201, + "loss": 0.5319, + "step": 329 + }, + { + "epoch": 0.45267489711934156, + "grad_norm": 2.925886869430542, + "learning_rate": 0.0006014625228519196, + "loss": 0.4826, + "step": 330 + }, + { + "epoch": 0.45267489711934156, + "eval_Qnli-dev_cosine_accuracy": 0.708984375, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.8169091939926147, + "eval_Qnli-dev_cosine_ap": 0.7472481805376167, + "eval_Qnli-dev_cosine_f1": 0.7189781021897811, + "eval_Qnli-dev_cosine_f1_threshold": 0.7571755051612854, + "eval_Qnli-dev_cosine_precision": 0.6314102564102564, + "eval_Qnli-dev_cosine_recall": 0.8347457627118644, + "eval_allNLI-dev_cosine_accuracy": 0.748046875, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.8621972799301147, + "eval_allNLI-dev_cosine_ap": 0.6527104471447597, + "eval_allNLI-dev_cosine_f1": 0.6606334841628959, + "eval_allNLI-dev_cosine_f1_threshold": 0.7864561080932617, + "eval_allNLI-dev_cosine_precision": 0.5427509293680297, + "eval_allNLI-dev_cosine_recall": 0.8439306358381503, + "eval_sequential_score": 0.7472481805376167, + "eval_sts-test_pearson_cosine": 0.8465015878560311, + "eval_sts-test_spearman_cosine": 0.8833058569973334, + "eval_vitaminc-pairs_loss": 2.5387091636657715, + "eval_vitaminc-pairs_runtime": 14.3065, + "eval_vitaminc-pairs_samples_per_second": 8.947, + "eval_vitaminc-pairs_steps_per_second": 0.07, + "step": 330 + }, + { + "epoch": 0.45267489711934156, + "eval_negation-triplets_loss": 1.8608626127243042, + "eval_negation-triplets_runtime": 1.1519, + "eval_negation-triplets_samples_per_second": 111.119, + "eval_negation-triplets_steps_per_second": 0.868, + "step": 330 + }, + { + "epoch": 0.45267489711934156, + "eval_scitail-pairs-pos_loss": 0.07322188466787338, + "eval_scitail-pairs-pos_runtime": 1.5539, + "eval_scitail-pairs-pos_samples_per_second": 82.372, + "eval_scitail-pairs-pos_steps_per_second": 0.644, + "step": 330 + }, + { + "epoch": 0.45267489711934156, + "eval_scitail-pairs-qa_loss": 0.026681702584028244, + "eval_scitail-pairs-qa_runtime": 1.2098, + "eval_scitail-pairs-qa_samples_per_second": 105.799, + "eval_scitail-pairs-qa_steps_per_second": 0.827, + "step": 330 + }, + { + "epoch": 0.45267489711934156, + "eval_xsum-pairs_loss": 0.6542444825172424, + "eval_xsum-pairs_runtime": 6.2906, + "eval_xsum-pairs_samples_per_second": 20.348, + "eval_xsum-pairs_steps_per_second": 0.159, + "step": 330 + }, + { + "epoch": 0.45267489711934156, + "eval_sciq_pairs_loss": 0.06421030312776566, + "eval_sciq_pairs_runtime": 8.8514, + "eval_sciq_pairs_samples_per_second": 14.461, + "eval_sciq_pairs_steps_per_second": 0.113, + "step": 330 + }, + { + "epoch": 0.45267489711934156, + "eval_qasc_pairs_loss": 0.8813464641571045, + "eval_qasc_pairs_runtime": 1.3875, + "eval_qasc_pairs_samples_per_second": 92.249, + "eval_qasc_pairs_steps_per_second": 0.721, + "step": 330 + }, + { + "epoch": 0.45267489711934156, + "eval_openbookqa_pairs_loss": 1.4074363708496094, + "eval_openbookqa_pairs_runtime": 1.1826, + "eval_openbookqa_pairs_samples_per_second": 108.237, + "eval_openbookqa_pairs_steps_per_second": 0.846, + "step": 330 + }, + { + "epoch": 0.45267489711934156, + "eval_nq_pairs_loss": 0.62897789478302, + "eval_nq_pairs_runtime": 7.8684, + "eval_nq_pairs_samples_per_second": 16.268, + "eval_nq_pairs_steps_per_second": 0.127, + "step": 330 + }, + { + "epoch": 0.45267489711934156, + "eval_trivia_pairs_loss": 1.084182620048523, + "eval_trivia_pairs_runtime": 8.9262, + "eval_trivia_pairs_samples_per_second": 14.34, + "eval_trivia_pairs_steps_per_second": 0.112, + "step": 330 + }, + { + "epoch": 0.45267489711934156, + "eval_gooaq_pairs_loss": 0.6594768762588501, + "eval_gooaq_pairs_runtime": 2.0651, + "eval_gooaq_pairs_samples_per_second": 61.982, + "eval_gooaq_pairs_steps_per_second": 0.484, + "step": 330 + }, + { + "epoch": 0.45267489711934156, + "eval_paws-pos_loss": 0.03268549218773842, + "eval_paws-pos_runtime": 1.5103, + "eval_paws-pos_samples_per_second": 84.754, + "eval_paws-pos_steps_per_second": 0.662, + "step": 330 + }, + { + "epoch": 0.45267489711934156, + "eval_global_dataset_loss": 0.21171291172504425, + "eval_global_dataset_runtime": 33.2988, + "eval_global_dataset_samples_per_second": 11.532, + "eval_global_dataset_steps_per_second": 0.06, + "step": 330 + }, + { + "epoch": 0.4540466392318244, + "grad_norm": 2.6243412494659424, + "learning_rate": 0.000603290676416819, + "loss": 0.6096, + "step": 331 + }, + { + "epoch": 0.4554183813443073, + "grad_norm": 2.776013135910034, + "learning_rate": 0.0006051188299817185, + "loss": 0.3687, + "step": 332 + }, + { + "epoch": 0.4567901234567901, + "grad_norm": 4.418542385101318, + "learning_rate": 0.0006069469835466179, + "loss": 0.9713, + "step": 333 + }, + { + "epoch": 0.45816186556927296, + "grad_norm": 5.112300872802734, + "learning_rate": 0.0006087751371115173, + "loss": 1.3203, + "step": 334 + }, + { + "epoch": 0.45953360768175583, + "grad_norm": 2.1659023761749268, + "learning_rate": 0.0006106032906764169, + "loss": 0.3443, + "step": 335 + }, + { + "epoch": 0.4609053497942387, + "grad_norm": 4.783431529998779, + "learning_rate": 0.0006124314442413162, + "loss": 1.4592, + "step": 336 + }, + { + "epoch": 0.46227709190672156, + "grad_norm": 3.507357597351074, + "learning_rate": 0.0006142595978062158, + "loss": 0.8, + "step": 337 + }, + { + "epoch": 0.46364883401920437, + "grad_norm": 3.118370771408081, + "learning_rate": 0.0006160877513711151, + "loss": 0.5481, + "step": 338 + }, + { + "epoch": 0.46502057613168724, + "grad_norm": 4.21981143951416, + "learning_rate": 0.0006179159049360147, + "loss": 1.4286, + "step": 339 + }, + { + "epoch": 0.4663923182441701, + "grad_norm": 2.675670862197876, + "learning_rate": 0.0006197440585009141, + "loss": 0.4012, + "step": 340 + }, + { + "epoch": 0.46776406035665297, + "grad_norm": 1.6964771747589111, + "learning_rate": 0.0006215722120658135, + "loss": 0.1474, + "step": 341 + }, + { + "epoch": 0.4691358024691358, + "grad_norm": 4.83234167098999, + "learning_rate": 0.000623400365630713, + "loss": 1.2755, + "step": 342 + }, + { + "epoch": 0.47050754458161864, + "grad_norm": 2.7494046688079834, + "learning_rate": 0.0006252285191956124, + "loss": 0.4935, + "step": 343 + }, + { + "epoch": 0.4718792866941015, + "grad_norm": 4.708520412445068, + "learning_rate": 0.0006270566727605119, + "loss": 1.0101, + "step": 344 + }, + { + "epoch": 0.4732510288065844, + "grad_norm": 2.6878857612609863, + "learning_rate": 0.0006288848263254113, + "loss": 0.4529, + "step": 345 + }, + { + "epoch": 0.47462277091906724, + "grad_norm": 2.571988105773926, + "learning_rate": 0.0006307129798903109, + "loss": 0.3516, + "step": 346 + }, + { + "epoch": 0.47599451303155005, + "grad_norm": 3.210439443588257, + "learning_rate": 0.0006325411334552102, + "loss": 0.4045, + "step": 347 + }, + { + "epoch": 0.4773662551440329, + "grad_norm": 4.047224521636963, + "learning_rate": 0.0006343692870201097, + "loss": 1.2326, + "step": 348 + }, + { + "epoch": 0.4787379972565158, + "grad_norm": 4.121635437011719, + "learning_rate": 0.0006361974405850091, + "loss": 0.8951, + "step": 349 + }, + { + "epoch": 0.48010973936899864, + "grad_norm": 3.480602741241455, + "learning_rate": 0.0006380255941499086, + "loss": 0.6783, + "step": 350 + }, + { + "epoch": 0.48148148148148145, + "grad_norm": 4.286612033843994, + "learning_rate": 0.000639853747714808, + "loss": 1.1821, + "step": 351 + }, + { + "epoch": 0.4828532235939643, + "grad_norm": 3.077362537384033, + "learning_rate": 0.0006416819012797075, + "loss": 0.5101, + "step": 352 + }, + { + "epoch": 0.4842249657064472, + "grad_norm": 3.127272844314575, + "learning_rate": 0.000643510054844607, + "loss": 0.844, + "step": 353 + }, + { + "epoch": 0.48559670781893005, + "grad_norm": 2.6955454349517822, + "learning_rate": 0.0006453382084095064, + "loss": 0.5413, + "step": 354 + }, + { + "epoch": 0.4869684499314129, + "grad_norm": 3.285903215408325, + "learning_rate": 0.0006471663619744058, + "loss": 0.9982, + "step": 355 + }, + { + "epoch": 0.4883401920438957, + "grad_norm": 3.6762568950653076, + "learning_rate": 0.0006489945155393053, + "loss": 0.937, + "step": 356 + }, + { + "epoch": 0.4897119341563786, + "grad_norm": 3.365633964538574, + "learning_rate": 0.0006508226691042048, + "loss": 0.7977, + "step": 357 + }, + { + "epoch": 0.49108367626886146, + "grad_norm": 3.7572262287139893, + "learning_rate": 0.0006526508226691042, + "loss": 0.8697, + "step": 358 + }, + { + "epoch": 0.4924554183813443, + "grad_norm": 3.309539794921875, + "learning_rate": 0.0006544789762340037, + "loss": 1.1136, + "step": 359 + }, + { + "epoch": 0.49382716049382713, + "grad_norm": 4.250339508056641, + "learning_rate": 0.0006563071297989031, + "loss": 1.3018, + "step": 360 + }, + { + "epoch": 0.49519890260631, + "grad_norm": 0.07554444670677185, + "learning_rate": 0.0006581352833638026, + "loss": 0.0023, + "step": 361 + }, + { + "epoch": 0.49657064471879286, + "grad_norm": 3.6443750858306885, + "learning_rate": 0.0006599634369287019, + "loss": 0.7019, + "step": 362 + }, + { + "epoch": 0.49794238683127573, + "grad_norm": 0.19442614912986755, + "learning_rate": 0.0006617915904936015, + "loss": 0.0481, + "step": 363 + }, + { + "epoch": 0.4993141289437586, + "grad_norm": 3.7148313522338867, + "learning_rate": 0.0006636197440585009, + "loss": 0.891, + "step": 364 + }, + { + "epoch": 0.5006858710562414, + "grad_norm": 2.7239511013031006, + "learning_rate": 0.0006654478976234004, + "loss": 0.6353, + "step": 365 + }, + { + "epoch": 0.5020576131687243, + "grad_norm": 2.5572762489318848, + "learning_rate": 0.0006672760511882999, + "loss": 0.5181, + "step": 366 + }, + { + "epoch": 0.5034293552812071, + "grad_norm": 3.162834405899048, + "learning_rate": 0.0006691042047531993, + "loss": 0.8311, + "step": 367 + }, + { + "epoch": 0.50480109739369, + "grad_norm": 0.06661587208509445, + "learning_rate": 0.0006709323583180988, + "loss": 0.0026, + "step": 368 + }, + { + "epoch": 0.5061728395061729, + "grad_norm": 2.4806196689605713, + "learning_rate": 0.0006727605118829981, + "loss": 0.5004, + "step": 369 + }, + { + "epoch": 0.5075445816186557, + "grad_norm": 2.5774953365325928, + "learning_rate": 0.0006745886654478977, + "loss": 0.4511, + "step": 370 + }, + { + "epoch": 0.5089163237311386, + "grad_norm": 2.2276206016540527, + "learning_rate": 0.000676416819012797, + "loss": 0.4558, + "step": 371 + }, + { + "epoch": 0.5102880658436214, + "grad_norm": 2.652674674987793, + "learning_rate": 0.0006782449725776966, + "loss": 0.5073, + "step": 372 + }, + { + "epoch": 0.5116598079561042, + "grad_norm": 2.2147669792175293, + "learning_rate": 0.0006800731261425959, + "loss": 0.3979, + "step": 373 + }, + { + "epoch": 0.5130315500685871, + "grad_norm": 1.5278068780899048, + "learning_rate": 0.0006819012797074955, + "loss": 0.1665, + "step": 374 + }, + { + "epoch": 0.51440329218107, + "grad_norm": 2.9153432846069336, + "learning_rate": 0.0006837294332723948, + "loss": 0.8231, + "step": 375 + }, + { + "epoch": 0.5157750342935528, + "grad_norm": 4.252976894378662, + "learning_rate": 0.0006855575868372943, + "loss": 1.0406, + "step": 376 + }, + { + "epoch": 0.5171467764060357, + "grad_norm": 3.768838405609131, + "learning_rate": 0.0006873857404021939, + "loss": 0.725, + "step": 377 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 1.8634569644927979, + "learning_rate": 0.0006892138939670932, + "loss": 0.2603, + "step": 378 + }, + { + "epoch": 0.5198902606310014, + "grad_norm": 5.095807075500488, + "learning_rate": 0.0006910420475319928, + "loss": 1.7357, + "step": 379 + }, + { + "epoch": 0.5212620027434842, + "grad_norm": 2.8866710662841797, + "learning_rate": 0.0006928702010968921, + "loss": 0.9147, + "step": 380 + }, + { + "epoch": 0.522633744855967, + "grad_norm": 2.369819402694702, + "learning_rate": 0.0006946983546617917, + "loss": 0.4277, + "step": 381 + }, + { + "epoch": 0.52400548696845, + "grad_norm": 1.7865092754364014, + "learning_rate": 0.000696526508226691, + "loss": 0.4788, + "step": 382 + }, + { + "epoch": 0.5253772290809328, + "grad_norm": 3.0729360580444336, + "learning_rate": 0.0006983546617915905, + "loss": 0.7666, + "step": 383 + }, + { + "epoch": 0.5267489711934157, + "grad_norm": 3.023810386657715, + "learning_rate": 0.0007001828153564899, + "loss": 0.7728, + "step": 384 + }, + { + "epoch": 0.5281207133058985, + "grad_norm": 2.5811986923217773, + "learning_rate": 0.0007020109689213894, + "loss": 0.4744, + "step": 385 + }, + { + "epoch": 0.5294924554183813, + "grad_norm": 2.748720407485962, + "learning_rate": 0.0007038391224862888, + "loss": 1.1014, + "step": 386 + }, + { + "epoch": 0.5308641975308642, + "grad_norm": 3.950869560241699, + "learning_rate": 0.0007056672760511883, + "loss": 1.5588, + "step": 387 + }, + { + "epoch": 0.532235939643347, + "grad_norm": 1.595503330230713, + "learning_rate": 0.0007074954296160879, + "loss": 0.3185, + "step": 388 + }, + { + "epoch": 0.53360768175583, + "grad_norm": 3.505636692047119, + "learning_rate": 0.0007093235831809872, + "loss": 1.3348, + "step": 389 + }, + { + "epoch": 0.5349794238683128, + "grad_norm": 1.0456945896148682, + "learning_rate": 0.0007111517367458867, + "loss": 0.1656, + "step": 390 + }, + { + "epoch": 0.5363511659807956, + "grad_norm": 2.747938871383667, + "learning_rate": 0.0007129798903107861, + "loss": 0.9375, + "step": 391 + }, + { + "epoch": 0.5377229080932785, + "grad_norm": 4.741049289703369, + "learning_rate": 0.0007148080438756856, + "loss": 1.4665, + "step": 392 + }, + { + "epoch": 0.5390946502057613, + "grad_norm": 4.830301284790039, + "learning_rate": 0.000716636197440585, + "loss": 1.4635, + "step": 393 + }, + { + "epoch": 0.5404663923182441, + "grad_norm": 2.938199758529663, + "learning_rate": 0.0007184643510054845, + "loss": 0.8677, + "step": 394 + }, + { + "epoch": 0.541838134430727, + "grad_norm": 6.291453838348389, + "learning_rate": 0.0007202925045703839, + "loss": 3.033, + "step": 395 + }, + { + "epoch": 0.5432098765432098, + "grad_norm": 3.0533947944641113, + "learning_rate": 0.0007221206581352834, + "loss": 1.4375, + "step": 396 + }, + { + "epoch": 0.5445816186556928, + "grad_norm": 3.13800048828125, + "learning_rate": 0.0007239488117001827, + "loss": 0.9762, + "step": 397 + }, + { + "epoch": 0.5459533607681756, + "grad_norm": 1.1184616088867188, + "learning_rate": 0.0007257769652650823, + "loss": 0.1333, + "step": 398 + }, + { + "epoch": 0.5473251028806584, + "grad_norm": 3.610217571258545, + "learning_rate": 0.0007276051188299818, + "loss": 1.1823, + "step": 399 + }, + { + "epoch": 0.5486968449931413, + "grad_norm": 3.6696395874023438, + "learning_rate": 0.0007294332723948812, + "loss": 1.2443, + "step": 400 + }, + { + "epoch": 0.5500685871056241, + "grad_norm": 2.967648506164551, + "learning_rate": 0.0007312614259597807, + "loss": 0.662, + "step": 401 + }, + { + "epoch": 0.551440329218107, + "grad_norm": 0.8750410079956055, + "learning_rate": 0.0007330895795246801, + "loss": 0.0709, + "step": 402 + }, + { + "epoch": 0.5528120713305898, + "grad_norm": 3.0801315307617188, + "learning_rate": 0.0007349177330895796, + "loss": 0.5822, + "step": 403 + }, + { + "epoch": 0.5541838134430727, + "grad_norm": 3.701993227005005, + "learning_rate": 0.0007367458866544789, + "loss": 1.0826, + "step": 404 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 3.439502716064453, + "learning_rate": 0.0007385740402193785, + "loss": 0.7953, + "step": 405 + }, + { + "epoch": 0.5569272976680384, + "grad_norm": 4.8917341232299805, + "learning_rate": 0.0007404021937842778, + "loss": 1.6109, + "step": 406 + }, + { + "epoch": 0.5582990397805213, + "grad_norm": 3.793834924697876, + "learning_rate": 0.0007422303473491774, + "loss": 1.2505, + "step": 407 + }, + { + "epoch": 0.5596707818930041, + "grad_norm": 3.0604796409606934, + "learning_rate": 0.0007440585009140767, + "loss": 1.0019, + "step": 408 + }, + { + "epoch": 0.5610425240054869, + "grad_norm": 2.872400999069214, + "learning_rate": 0.0007458866544789763, + "loss": 0.8224, + "step": 409 + }, + { + "epoch": 0.5624142661179699, + "grad_norm": 2.631157398223877, + "learning_rate": 0.0007477148080438758, + "loss": 0.6592, + "step": 410 + }, + { + "epoch": 0.5637860082304527, + "grad_norm": 2.843379020690918, + "learning_rate": 0.0007495429616087751, + "loss": 0.7099, + "step": 411 + }, + { + "epoch": 0.5651577503429356, + "grad_norm": 2.3366591930389404, + "learning_rate": 0.0007513711151736747, + "loss": 0.5484, + "step": 412 + }, + { + "epoch": 0.5665294924554184, + "grad_norm": 3.4202780723571777, + "learning_rate": 0.000753199268738574, + "loss": 1.3019, + "step": 413 + }, + { + "epoch": 0.5679012345679012, + "grad_norm": 0.9125491976737976, + "learning_rate": 0.0007550274223034736, + "loss": 0.1266, + "step": 414 + }, + { + "epoch": 0.5692729766803841, + "grad_norm": 2.8945682048797607, + "learning_rate": 0.0007568555758683729, + "loss": 0.8932, + "step": 415 + }, + { + "epoch": 0.5706447187928669, + "grad_norm": 2.898399591445923, + "learning_rate": 0.0007586837294332725, + "loss": 0.7547, + "step": 416 + }, + { + "epoch": 0.5720164609053497, + "grad_norm": 2.9406332969665527, + "learning_rate": 0.0007605118829981718, + "loss": 0.7614, + "step": 417 + }, + { + "epoch": 0.5733882030178327, + "grad_norm": 2.1753182411193848, + "learning_rate": 0.0007623400365630713, + "loss": 0.7364, + "step": 418 + }, + { + "epoch": 0.5747599451303155, + "grad_norm": 2.5140652656555176, + "learning_rate": 0.0007641681901279707, + "loss": 0.6539, + "step": 419 + }, + { + "epoch": 0.5761316872427984, + "grad_norm": 2.0174620151519775, + "learning_rate": 0.0007659963436928702, + "loss": 0.3848, + "step": 420 + }, + { + "epoch": 0.5775034293552812, + "grad_norm": 0.11105114966630936, + "learning_rate": 0.0007678244972577697, + "loss": 0.0394, + "step": 421 + }, + { + "epoch": 0.578875171467764, + "grad_norm": 1.8194284439086914, + "learning_rate": 0.0007696526508226691, + "loss": 0.4623, + "step": 422 + }, + { + "epoch": 0.5802469135802469, + "grad_norm": 0.7781994342803955, + "learning_rate": 0.0007714808043875687, + "loss": 0.0783, + "step": 423 + }, + { + "epoch": 0.5816186556927297, + "grad_norm": 3.7422642707824707, + "learning_rate": 0.000773308957952468, + "loss": 1.4366, + "step": 424 + }, + { + "epoch": 0.5829903978052127, + "grad_norm": 3.9761717319488525, + "learning_rate": 0.0007751371115173675, + "loss": 1.3568, + "step": 425 + }, + { + "epoch": 0.5843621399176955, + "grad_norm": 2.948404550552368, + "learning_rate": 0.0007769652650822669, + "loss": 0.9065, + "step": 426 + }, + { + "epoch": 0.5857338820301783, + "grad_norm": 2.7700140476226807, + "learning_rate": 0.0007787934186471664, + "loss": 0.6723, + "step": 427 + }, + { + "epoch": 0.5871056241426612, + "grad_norm": 2.8618569374084473, + "learning_rate": 0.0007806215722120658, + "loss": 0.7596, + "step": 428 + }, + { + "epoch": 0.588477366255144, + "grad_norm": 0.956656813621521, + "learning_rate": 0.0007824497257769653, + "loss": 0.1426, + "step": 429 + }, + { + "epoch": 0.5898491083676269, + "grad_norm": 2.8430604934692383, + "learning_rate": 0.0007842778793418648, + "loss": 1.3264, + "step": 430 + }, + { + "epoch": 0.5912208504801097, + "grad_norm": 0.11261007934808731, + "learning_rate": 0.0007861060329067642, + "loss": 0.0442, + "step": 431 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 2.7201974391937256, + "learning_rate": 0.0007879341864716636, + "loss": 0.6046, + "step": 432 + }, + { + "epoch": 0.5939643347050755, + "grad_norm": 2.5923287868499756, + "learning_rate": 0.0007897623400365631, + "loss": 0.6173, + "step": 433 + }, + { + "epoch": 0.5953360768175583, + "grad_norm": 4.77182674407959, + "learning_rate": 0.0007915904936014625, + "loss": 2.8892, + "step": 434 + }, + { + "epoch": 0.5967078189300411, + "grad_norm": 3.1731245517730713, + "learning_rate": 0.000793418647166362, + "loss": 1.3149, + "step": 435 + }, + { + "epoch": 0.598079561042524, + "grad_norm": 2.849473237991333, + "learning_rate": 0.0007952468007312615, + "loss": 1.1886, + "step": 436 + }, + { + "epoch": 0.5994513031550068, + "grad_norm": 3.5986573696136475, + "learning_rate": 0.0007970749542961609, + "loss": 1.2799, + "step": 437 + }, + { + "epoch": 0.6008230452674898, + "grad_norm": 2.668875217437744, + "learning_rate": 0.0007989031078610604, + "loss": 0.7527, + "step": 438 + }, + { + "epoch": 0.6021947873799726, + "grad_norm": 2.7584633827209473, + "learning_rate": 0.0008007312614259597, + "loss": 0.7853, + "step": 439 + }, + { + "epoch": 0.6035665294924554, + "grad_norm": 2.521331787109375, + "learning_rate": 0.0008025594149908593, + "loss": 0.7291, + "step": 440 + }, + { + "epoch": 0.6035665294924554, + "eval_Qnli-dev_cosine_accuracy": 0.71875, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.8342527151107788, + "eval_Qnli-dev_cosine_ap": 0.7590651825705997, + "eval_Qnli-dev_cosine_f1": 0.6824034334763949, + "eval_Qnli-dev_cosine_f1_threshold": 0.830267071723938, + "eval_Qnli-dev_cosine_precision": 0.691304347826087, + "eval_Qnli-dev_cosine_recall": 0.673728813559322, + "eval_allNLI-dev_cosine_accuracy": 0.75, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.8865965604782104, + "eval_allNLI-dev_cosine_ap": 0.6185049182990069, + "eval_allNLI-dev_cosine_f1": 0.6239669421487604, + "eval_allNLI-dev_cosine_f1_threshold": 0.7850924730300903, + "eval_allNLI-dev_cosine_precision": 0.4855305466237942, + "eval_allNLI-dev_cosine_recall": 0.8728323699421965, + "eval_sequential_score": 0.7590651825705997, + "eval_sts-test_pearson_cosine": 0.8447585777536966, + "eval_sts-test_spearman_cosine": 0.8940651790173606, + "eval_vitaminc-pairs_loss": 3.1624042987823486, + "eval_vitaminc-pairs_runtime": 14.3967, + "eval_vitaminc-pairs_samples_per_second": 8.891, + "eval_vitaminc-pairs_steps_per_second": 0.069, + "step": 440 + }, + { + "epoch": 0.6035665294924554, + "eval_negation-triplets_loss": 2.1101934909820557, + "eval_negation-triplets_runtime": 1.1572, + "eval_negation-triplets_samples_per_second": 110.608, + "eval_negation-triplets_steps_per_second": 0.864, + "step": 440 + }, + { + "epoch": 0.6035665294924554, + "eval_scitail-pairs-pos_loss": 0.14219464361667633, + "eval_scitail-pairs-pos_runtime": 1.5903, + "eval_scitail-pairs-pos_samples_per_second": 80.488, + "eval_scitail-pairs-pos_steps_per_second": 0.629, + "step": 440 + }, + { + "epoch": 0.6035665294924554, + "eval_scitail-pairs-qa_loss": 0.03170065954327583, + "eval_scitail-pairs-qa_runtime": 1.2167, + "eval_scitail-pairs-qa_samples_per_second": 105.199, + "eval_scitail-pairs-qa_steps_per_second": 0.822, + "step": 440 + }, + { + "epoch": 0.6035665294924554, + "eval_xsum-pairs_loss": 0.8414545655250549, + "eval_xsum-pairs_runtime": 6.3264, + "eval_xsum-pairs_samples_per_second": 20.233, + "eval_xsum-pairs_steps_per_second": 0.158, + "step": 440 + }, + { + "epoch": 0.6035665294924554, + "eval_sciq_pairs_loss": 0.06330692023038864, + "eval_sciq_pairs_runtime": 8.8936, + "eval_sciq_pairs_samples_per_second": 14.392, + "eval_sciq_pairs_steps_per_second": 0.112, + "step": 440 + }, + { + "epoch": 0.6035665294924554, + "eval_qasc_pairs_loss": 0.9395630359649658, + "eval_qasc_pairs_runtime": 1.3863, + "eval_qasc_pairs_samples_per_second": 92.332, + "eval_qasc_pairs_steps_per_second": 0.721, + "step": 440 + }, + { + "epoch": 0.6035665294924554, + "eval_openbookqa_pairs_loss": 1.5656604766845703, + "eval_openbookqa_pairs_runtime": 1.1858, + "eval_openbookqa_pairs_samples_per_second": 107.947, + "eval_openbookqa_pairs_steps_per_second": 0.843, + "step": 440 + }, + { + "epoch": 0.6035665294924554, + "eval_nq_pairs_loss": 0.8197879195213318, + "eval_nq_pairs_runtime": 7.9153, + "eval_nq_pairs_samples_per_second": 16.171, + "eval_nq_pairs_steps_per_second": 0.126, + "step": 440 + }, + { + "epoch": 0.6035665294924554, + "eval_trivia_pairs_loss": 1.1605956554412842, + "eval_trivia_pairs_runtime": 8.9789, + "eval_trivia_pairs_samples_per_second": 14.256, + "eval_trivia_pairs_steps_per_second": 0.111, + "step": 440 + }, + { + "epoch": 0.6035665294924554, + "eval_gooaq_pairs_loss": 0.7199620604515076, + "eval_gooaq_pairs_runtime": 2.0759, + "eval_gooaq_pairs_samples_per_second": 61.66, + "eval_gooaq_pairs_steps_per_second": 0.482, + "step": 440 + }, + { + "epoch": 0.6035665294924554, + "eval_paws-pos_loss": 0.034769680351018906, + "eval_paws-pos_runtime": 1.5142, + "eval_paws-pos_samples_per_second": 84.536, + "eval_paws-pos_steps_per_second": 0.66, + "step": 440 + }, + { + "epoch": 0.6035665294924554, + "eval_global_dataset_loss": 0.2699156403541565, + "eval_global_dataset_runtime": 33.4517, + "eval_global_dataset_samples_per_second": 11.479, + "eval_global_dataset_steps_per_second": 0.06, + "step": 440 + }, + { + "epoch": 0.6049382716049383, + "grad_norm": 2.2343204021453857, + "learning_rate": 0.0008043875685557587, + "loss": 0.6532, + "step": 441 + }, + { + "epoch": 0.6063100137174211, + "grad_norm": 2.0353541374206543, + "learning_rate": 0.0008062157221206582, + "loss": 0.5846, + "step": 442 + }, + { + "epoch": 0.607681755829904, + "grad_norm": 2.2103233337402344, + "learning_rate": 0.0008080438756855576, + "loss": 0.6766, + "step": 443 + }, + { + "epoch": 0.6090534979423868, + "grad_norm": 0.8574596047401428, + "learning_rate": 0.0008098720292504571, + "loss": 0.1179, + "step": 444 + }, + { + "epoch": 0.6104252400548696, + "grad_norm": 2.2213659286499023, + "learning_rate": 0.0008117001828153565, + "loss": 0.5953, + "step": 445 + }, + { + "epoch": 0.6117969821673526, + "grad_norm": 2.4394009113311768, + "learning_rate": 0.0008135283363802559, + "loss": 0.7219, + "step": 446 + }, + { + "epoch": 0.6131687242798354, + "grad_norm": 2.216285228729248, + "learning_rate": 0.0008153564899451555, + "loss": 0.9445, + "step": 447 + }, + { + "epoch": 0.6145404663923183, + "grad_norm": 2.296591281890869, + "learning_rate": 0.0008171846435100548, + "loss": 0.5726, + "step": 448 + }, + { + "epoch": 0.6159122085048011, + "grad_norm": 2.837359666824341, + "learning_rate": 0.0008190127970749544, + "loss": 0.8687, + "step": 449 + }, + { + "epoch": 0.6172839506172839, + "grad_norm": 2.9849822521209717, + "learning_rate": 0.0008208409506398537, + "loss": 0.7864, + "step": 450 + }, + { + "epoch": 0.6186556927297668, + "grad_norm": 2.474256753921509, "learning_rate": 0.0008226691042047533, - "loss": 0.3255, + "loss": 0.6835, "step": 451 }, + { + "epoch": 0.6200274348422496, + "grad_norm": 2.815581798553467, + "learning_rate": 0.0008244972577696527, + "loss": 1.2349, + "step": 452 + }, + { + "epoch": 0.6213991769547325, + "grad_norm": 2.5145702362060547, + "learning_rate": 0.0008263254113345521, + "loss": 0.5894, + "step": 453 + }, + { + "epoch": 0.6227709190672154, + "grad_norm": 3.4624481201171875, + "learning_rate": 0.0008281535648994516, + "loss": 1.2882, + "step": 454 + }, + { + "epoch": 0.6241426611796982, + "grad_norm": 3.0444629192352295, + "learning_rate": 0.000829981718464351, + "loss": 1.1063, + "step": 455 + }, + { + "epoch": 0.6255144032921811, + "grad_norm": 3.4558730125427246, + "learning_rate": 0.0008318098720292505, + "loss": 1.6217, + "step": 456 + }, + { + "epoch": 0.6268861454046639, + "grad_norm": 1.9938262701034546, + "learning_rate": 0.0008336380255941499, + "loss": 0.6361, + "step": 457 + }, + { + "epoch": 0.6282578875171467, + "grad_norm": 2.8914825916290283, + "learning_rate": 0.0008354661791590493, + "loss": 1.2991, + "step": 458 + }, + { + "epoch": 0.6296296296296297, + "grad_norm": 1.987092137336731, + "learning_rate": 0.0008372943327239488, + "loss": 0.6638, + "step": 459 + }, + { + "epoch": 0.6310013717421125, + "grad_norm": 1.9341247081756592, + "learning_rate": 0.0008391224862888483, + "loss": 0.6688, + "step": 460 + }, + { + "epoch": 0.6323731138545954, + "grad_norm": 2.6572768688201904, + "learning_rate": 0.0008409506398537477, + "loss": 0.9422, + "step": 461 + }, { "epoch": 0.6337448559670782, - "grad_norm": 1.707679033279419, + "grad_norm": 2.504377841949463, "learning_rate": 0.0008427787934186472, - "loss": 0.4211, + "loss": 0.8578, "step": 462 }, { - "epoch": 0.6488340192043895, - "grad_norm": 1.942112922668457, - "learning_rate": 0.0008628884826325412, - "loss": 0.5372, - "step": 473 + "epoch": 0.635116598079561, + "grad_norm": 1.6810640096664429, + "learning_rate": 0.0008446069469835467, + "loss": 0.4707, + "step": 463 + }, + { + "epoch": 0.6364883401920439, + "grad_norm": 2.0036683082580566, + "learning_rate": 0.0008464351005484461, + "loss": 0.4281, + "step": 464 + }, + { + "epoch": 0.6378600823045267, + "grad_norm": 2.055009365081787, + "learning_rate": 0.0008482632541133455, + "loss": 0.5323, + "step": 465 + }, + { + "epoch": 0.6392318244170097, + "grad_norm": 1.8697012662887573, + "learning_rate": 0.000850091407678245, + "loss": 0.4676, + "step": 466 + }, + { + "epoch": 0.6406035665294925, + "grad_norm": 0.3099338114261627, + "learning_rate": 0.0008519195612431444, + "loss": 0.0323, + "step": 467 + }, + { + "epoch": 0.6419753086419753, + "grad_norm": 2.843210458755493, + "learning_rate": 0.0008537477148080439, + "loss": 1.0748, + "step": 468 + }, + { + "epoch": 0.6433470507544582, + "grad_norm": 5.503166198730469, + "learning_rate": 0.0008555758683729433, + "loss": 3.3347, + "step": 469 + }, + { + "epoch": 0.644718792866941, + "grad_norm": 3.0612828731536865, + "learning_rate": 0.0008574040219378428, + "loss": 1.4944, + "step": 470 + }, + { + "epoch": 0.6460905349794238, + "grad_norm": 1.8079982995986938, + "learning_rate": 0.0008592321755027423, + "loss": 0.666, + "step": 471 + }, + { + "epoch": 0.6474622770919067, + "grad_norm": 2.734410285949707, + "learning_rate": 0.0008610603290676416, + "loss": 1.1204, + "step": 472 + }, + { + "epoch": 0.6488340192043895, + "grad_norm": 2.612880229949951, + "learning_rate": 0.0008628884826325412, + "loss": 1.452, + "step": 473 + }, + { + "epoch": 0.6502057613168725, + "grad_norm": 1.7585951089859009, + "learning_rate": 0.0008647166361974406, + "loss": 0.5587, + "step": 474 + }, + { + "epoch": 0.6515775034293553, + "grad_norm": 2.7308504581451416, + "learning_rate": 0.0008665447897623401, + "loss": 1.189, + "step": 475 + }, + { + "epoch": 0.6529492455418381, + "grad_norm": 1.9969093799591064, + "learning_rate": 0.0008683729433272395, + "loss": 0.5736, + "step": 476 + }, + { + "epoch": 0.654320987654321, + "grad_norm": 2.545562505722046, + "learning_rate": 0.000870201096892139, + "loss": 0.7508, + "step": 477 + }, + { + "epoch": 0.6556927297668038, + "grad_norm": 2.3430142402648926, + "learning_rate": 0.0008720292504570384, + "loss": 0.7138, + "step": 478 + }, + { + "epoch": 0.6570644718792867, + "grad_norm": 2.081550359725952, + "learning_rate": 0.0008738574040219378, + "loss": 0.5152, + "step": 479 + }, + { + "epoch": 0.6584362139917695, + "grad_norm": 3.031339645385742, + "learning_rate": 0.0008756855575868373, + "loss": 1.0291, + "step": 480 + }, + { + "epoch": 0.6598079561042524, + "grad_norm": 3.382559299468994, + "learning_rate": 0.0008775137111517367, + "loss": 1.2339, + "step": 481 + }, + { + "epoch": 0.6611796982167353, + "grad_norm": 2.266045570373535, + "learning_rate": 0.0008793418647166362, + "loss": 0.6402, + "step": 482 + }, + { + "epoch": 0.6625514403292181, + "grad_norm": 4.375522136688232, + "learning_rate": 0.0008811700182815356, + "loss": 3.3593, + "step": 483 + }, + { + "epoch": 0.663923182441701, + "grad_norm": 1.7218207120895386, + "learning_rate": 0.0008829981718464352, + "loss": 0.4424, + "step": 484 + }, + { + "epoch": 0.6652949245541838, + "grad_norm": 1.930618405342102, + "learning_rate": 0.0008848263254113346, + "loss": 0.79, + "step": 485 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 2.2200028896331787, + "learning_rate": 0.000886654478976234, + "loss": 0.9445, + "step": 486 + }, + { + "epoch": 0.6680384087791496, + "grad_norm": 1.67755925655365, + "learning_rate": 0.0008884826325411335, + "loss": 0.5046, + "step": 487 + }, + { + "epoch": 0.6694101508916324, + "grad_norm": 1.119525671005249, + "learning_rate": 0.0008903107861060329, + "loss": 0.2215, + "step": 488 + }, + { + "epoch": 0.6707818930041153, + "grad_norm": 1.9932385683059692, + "learning_rate": 0.0008921389396709324, + "loss": 0.9744, + "step": 489 + }, + { + "epoch": 0.6721536351165981, + "grad_norm": 1.830539345741272, + "learning_rate": 0.0008939670932358318, + "loss": 0.5102, + "step": 490 + }, + { + "epoch": 0.6735253772290809, + "grad_norm": 2.152487277984619, + "learning_rate": 0.0008957952468007313, + "loss": 0.5452, + "step": 491 + }, + { + "epoch": 0.6748971193415638, + "grad_norm": 2.4515275955200195, + "learning_rate": 0.0008976234003656307, + "loss": 1.1853, + "step": 492 + }, + { + "epoch": 0.6762688614540466, + "grad_norm": 2.1000306606292725, + "learning_rate": 0.0008994515539305301, + "loss": 0.5773, + "step": 493 + }, + { + "epoch": 0.6776406035665294, + "grad_norm": 2.6680173873901367, + "learning_rate": 0.0009012797074954296, + "loss": 0.8209, + "step": 494 + }, + { + "epoch": 0.6790123456790124, + "grad_norm": 2.963197708129883, + "learning_rate": 0.000903107861060329, + "loss": 0.961, + "step": 495 + }, + { + "epoch": 0.6803840877914952, + "grad_norm": 1.574116826057434, + "learning_rate": 0.0009049360146252286, + "loss": 0.3303, + "step": 496 + }, + { + "epoch": 0.6817558299039781, + "grad_norm": 2.57684063911438, + "learning_rate": 0.000906764168190128, + "loss": 0.8192, + "step": 497 + }, + { + "epoch": 0.6831275720164609, + "grad_norm": 3.053014039993286, + "learning_rate": 0.0009085923217550275, + "loss": 1.3271, + "step": 498 + }, + { + "epoch": 0.6844993141289437, + "grad_norm": 2.6430559158325195, + "learning_rate": 0.0009104204753199269, + "loss": 0.9859, + "step": 499 + }, + { + "epoch": 0.6858710562414266, + "grad_norm": 0.7626333236694336, + "learning_rate": 0.0009122486288848263, + "loss": 0.1419, + "step": 500 + }, + { + "epoch": 0.6872427983539094, + "grad_norm": 0.8805200457572937, + "learning_rate": 0.0009140767824497258, + "loss": 0.1522, + "step": 501 + }, + { + "epoch": 0.6886145404663924, + "grad_norm": 1.4140968322753906, + "learning_rate": 0.0009159049360146252, + "loss": 0.3983, + "step": 502 + }, + { + "epoch": 0.6899862825788752, + "grad_norm": 0.16096271574497223, + "learning_rate": 0.0009177330895795247, + "loss": 0.0167, + "step": 503 + }, + { + "epoch": 0.691358024691358, + "grad_norm": 2.1687278747558594, + "learning_rate": 0.0009195612431444241, + "loss": 0.7314, + "step": 504 + }, + { + "epoch": 0.6927297668038409, + "grad_norm": 2.1703848838806152, + "learning_rate": 0.0009213893967093237, + "loss": 0.5175, + "step": 505 + }, + { + "epoch": 0.6941015089163237, + "grad_norm": 3.221038341522217, + "learning_rate": 0.000923217550274223, + "loss": 1.6745, + "step": 506 + }, + { + "epoch": 0.6954732510288066, + "grad_norm": 2.9030301570892334, + "learning_rate": 0.0009250457038391225, + "loss": 1.2944, + "step": 507 + }, + { + "epoch": 0.6968449931412894, + "grad_norm": 2.1682050228118896, + "learning_rate": 0.000926873857404022, + "loss": 0.5509, + "step": 508 + }, + { + "epoch": 0.6982167352537723, + "grad_norm": 2.4524831771850586, + "learning_rate": 0.0009287020109689214, + "loss": 0.8585, + "step": 509 + }, + { + "epoch": 0.6995884773662552, + "grad_norm": 2.954036235809326, + "learning_rate": 0.0009305301645338209, + "loss": 1.1137, + "step": 510 + }, + { + "epoch": 0.700960219478738, + "grad_norm": 0.5421282052993774, + "learning_rate": 0.0009323583180987203, + "loss": 0.08, + "step": 511 + }, + { + "epoch": 0.7023319615912208, + "grad_norm": 1.943343997001648, + "learning_rate": 0.0009341864716636198, + "loss": 0.6881, + "step": 512 + }, + { + "epoch": 0.7037037037037037, + "grad_norm": 2.1507110595703125, + "learning_rate": 0.0009360146252285192, + "loss": 0.8449, + "step": 513 + }, + { + "epoch": 0.7050754458161865, + "grad_norm": 2.5957894325256348, + "learning_rate": 0.0009378427787934186, + "loss": 1.3807, + "step": 514 + }, + { + "epoch": 0.7064471879286695, + "grad_norm": 2.7923741340637207, + "learning_rate": 0.0009396709323583181, + "loss": 1.2345, + "step": 515 + }, + { + "epoch": 0.7078189300411523, + "grad_norm": 2.737544298171997, + "learning_rate": 0.0009414990859232176, + "loss": 1.3458, + "step": 516 + }, + { + "epoch": 0.7091906721536351, + "grad_norm": 3.436458110809326, + "learning_rate": 0.000943327239488117, + "loss": 1.7297, + "step": 517 + }, + { + "epoch": 0.710562414266118, + "grad_norm": 0.19496861100196838, + "learning_rate": 0.0009451553930530165, + "loss": 0.0337, + "step": 518 + }, + { + "epoch": 0.7119341563786008, + "grad_norm": 1.9844475984573364, + "learning_rate": 0.000946983546617916, + "loss": 0.7494, + "step": 519 + }, + { + "epoch": 0.7133058984910837, + "grad_norm": 2.368131637573242, + "learning_rate": 0.0009488117001828154, + "loss": 1.2839, + "step": 520 + }, + { + "epoch": 0.7146776406035665, + "grad_norm": 1.8333654403686523, + "learning_rate": 0.0009506398537477148, + "loss": 0.7679, + "step": 521 + }, + { + "epoch": 0.7160493827160493, + "grad_norm": 2.5262162685394287, + "learning_rate": 0.0009524680073126143, + "loss": 1.1101, + "step": 522 + }, + { + "epoch": 0.7174211248285323, + "grad_norm": 1.7962846755981445, + "learning_rate": 0.0009542961608775137, + "loss": 0.7259, + "step": 523 }, { - "epoch": 0.663923182441701, - "grad_norm": 1.4559303522109985, - "learning_rate": 0.0008829981718464352, - "loss": 0.5747, - "step": 484 + "epoch": 0.7187928669410151, + "grad_norm": 1.6876013278961182, + "learning_rate": 0.0009561243144424132, + "loss": 0.8531, + "step": 524 }, { - "epoch": 0.6790123456790124, - "grad_norm": 1.95571768283844, - "learning_rate": 0.000903107861060329, - "loss": 0.4851, - "step": 495 + "epoch": 0.720164609053498, + "grad_norm": 0.15522260963916779, + "learning_rate": 0.0009579524680073126, + "loss": 0.0376, + "step": 525 }, { - "epoch": 0.6941015089163237, - "grad_norm": 2.090074300765991, - "learning_rate": 0.000923217550274223, - "loss": 0.243, - "step": 506 + "epoch": 0.7215363511659808, + "grad_norm": 0.10816041380167007, + "learning_rate": 0.0009597806215722121, + "loss": 0.0368, + "step": 526 }, { - "epoch": 0.7091906721536351, - "grad_norm": 2.7206690311431885, - "learning_rate": 0.000943327239488117, - "loss": 0.4899, - "step": 517 + "epoch": 0.7229080932784636, + "grad_norm": 0.5582392811775208, + "learning_rate": 0.0009616087751371116, + "loss": 0.0679, + "step": 527 }, { "epoch": 0.7242798353909465, - "grad_norm": 0.44097644090652466, + "grad_norm": 0.790245532989502, "learning_rate": 0.0009634369287020109, - "loss": 0.2475, + "loss": 0.1483, "step": 528 }, + { + "epoch": 0.7256515775034293, + "grad_norm": 0.5912861227989197, + "learning_rate": 0.0009652650822669105, + "loss": 0.086, + "step": 529 + }, + { + "epoch": 0.7270233196159122, + "grad_norm": 1.9863473176956177, + "learning_rate": 0.0009670932358318098, + "loss": 0.4599, + "step": 530 + }, + { + "epoch": 0.7283950617283951, + "grad_norm": 2.432748794555664, + "learning_rate": 0.0009689213893967094, + "loss": 0.6046, + "step": 531 + }, + { + "epoch": 0.7297668038408779, + "grad_norm": 2.2815091609954834, + "learning_rate": 0.0009707495429616088, + "loss": 0.5801, + "step": 532 + }, + { + "epoch": 0.7311385459533608, + "grad_norm": 6.126629829406738, + "learning_rate": 0.0009725776965265083, + "loss": 4.0638, + "step": 533 + }, + { + "epoch": 0.7325102880658436, + "grad_norm": 2.440173864364624, + "learning_rate": 0.0009744058500914077, + "loss": 0.9248, + "step": 534 + }, + { + "epoch": 0.7338820301783264, + "grad_norm": 2.2785732746124268, + "learning_rate": 0.0009762340036563071, + "loss": 0.7769, + "step": 535 + }, + { + "epoch": 0.7352537722908093, + "grad_norm": 1.7889460325241089, + "learning_rate": 0.0009780621572212066, + "loss": 0.5365, + "step": 536 + }, + { + "epoch": 0.7366255144032922, + "grad_norm": 1.9316608905792236, + "learning_rate": 0.000979890310786106, + "loss": 0.7444, + "step": 537 + }, + { + "epoch": 0.7379972565157751, + "grad_norm": 1.945172905921936, + "learning_rate": 0.0009817184643510055, + "loss": 0.7193, + "step": 538 + }, { "epoch": 0.7393689986282579, - "grad_norm": 2.089887857437134, + "grad_norm": 2.1771185398101807, "learning_rate": 0.000983546617915905, - "loss": 0.5144, + "loss": 1.5464, "step": 539 }, + { + "epoch": 0.7407407407407407, + "grad_norm": 2.8104841709136963, + "learning_rate": 0.0009853747714808044, + "loss": 1.7129, + "step": 540 + }, + { + "epoch": 0.7421124828532236, + "grad_norm": 2.2292134761810303, + "learning_rate": 0.0009872029250457038, + "loss": 1.0137, + "step": 541 + }, + { + "epoch": 0.7434842249657064, + "grad_norm": 1.7931193113327026, + "learning_rate": 0.0009890310786106033, + "loss": 0.6174, + "step": 542 + }, + { + "epoch": 0.7448559670781894, + "grad_norm": 1.5730267763137817, + "learning_rate": 0.0009908592321755027, + "loss": 1.1872, + "step": 543 + }, + { + "epoch": 0.7462277091906722, + "grad_norm": 1.9772863388061523, + "learning_rate": 0.0009926873857404022, + "loss": 1.2298, + "step": 544 + }, + { + "epoch": 0.747599451303155, + "grad_norm": 3.7602641582489014, + "learning_rate": 0.0009945155393053017, + "loss": 3.6099, + "step": 545 + }, + { + "epoch": 0.7489711934156379, + "grad_norm": 1.6600819826126099, + "learning_rate": 0.0009963436928702011, + "loss": 0.9129, + "step": 546 + }, + { + "epoch": 0.7503429355281207, + "grad_norm": 1.4769471883773804, + "learning_rate": 0.0009981718464351006, + "loss": 0.4851, + "step": 547 + }, + { + "epoch": 0.7517146776406035, + "grad_norm": 1.5102423429489136, + "learning_rate": 0.001, + "loss": 0.5724, + "step": 548 + }, + { + "epoch": 0.7530864197530864, + "grad_norm": 1.6490839719772339, + "learning_rate": 0.000999999174352948, + "loss": 0.6958, + "step": 549 + }, { "epoch": 0.7544581618655692, - "grad_norm": 1.4973657131195068, + "grad_norm": 1.4960081577301025, "learning_rate": 0.0009999966974148216, - "loss": 0.7029, + "loss": 0.9115, "step": 550 }, { "epoch": 0.7544581618655692, - "eval_Qnli-dev_cosine_accuracy": 0.7177734375, - "eval_Qnli-dev_cosine_accuracy_threshold": 0.7129597663879395, - "eval_Qnli-dev_cosine_ap": 0.7715845808061284, - "eval_Qnli-dev_cosine_f1": 0.7221719457013575, - "eval_Qnli-dev_cosine_f1_threshold": 0.6851584911346436, - "eval_Qnli-dev_cosine_precision": 0.6435483870967742, - "eval_Qnli-dev_cosine_recall": 0.822680412371134, - "eval_allNLI-dev_cosine_accuracy": 0.73828125, - "eval_allNLI-dev_cosine_accuracy_threshold": 0.8462234139442444, - "eval_allNLI-dev_cosine_ap": 0.6244503911184303, - "eval_allNLI-dev_cosine_f1": 0.6362545018007203, - "eval_allNLI-dev_cosine_f1_threshold": 0.7372293472290039, - "eval_allNLI-dev_cosine_precision": 0.5364372469635628, - "eval_allNLI-dev_cosine_recall": 0.7817109144542773, - "eval_sequential_score": 0.7715845808061284, - "eval_sts-test_pearson_cosine": 0.8245725507043073, - "eval_sts-test_spearman_cosine": 0.8556805260032072, - "eval_vitaminc-pairs_loss": 2.2667033672332764, - "eval_vitaminc-pairs_runtime": 24.1991, - "eval_vitaminc-pairs_samples_per_second": 10.579, - "eval_vitaminc-pairs_steps_per_second": 0.041, + "eval_Qnli-dev_cosine_accuracy": 0.685546875, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.8639271259307861, + "eval_Qnli-dev_cosine_ap": 0.7427350367644162, + "eval_Qnli-dev_cosine_f1": 0.6996336996336996, + "eval_Qnli-dev_cosine_f1_threshold": 0.7733485698699951, + "eval_Qnli-dev_cosine_precision": 0.6161290322580645, + "eval_Qnli-dev_cosine_recall": 0.809322033898305, + "eval_allNLI-dev_cosine_accuracy": 0.755859375, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.8583443760871887, + "eval_allNLI-dev_cosine_ap": 0.6513030662407349, + "eval_allNLI-dev_cosine_f1": 0.6559633027522936, + "eval_allNLI-dev_cosine_f1_threshold": 0.8035047054290771, + "eval_allNLI-dev_cosine_precision": 0.5437262357414449, + "eval_allNLI-dev_cosine_recall": 0.8265895953757225, + "eval_sequential_score": 0.7427350367644162, + "eval_sts-test_pearson_cosine": 0.8397477513546239, + "eval_sts-test_spearman_cosine": 0.8842177086994444, + "eval_vitaminc-pairs_loss": 2.9589016437530518, + "eval_vitaminc-pairs_runtime": 14.2988, + "eval_vitaminc-pairs_samples_per_second": 8.952, + "eval_vitaminc-pairs_steps_per_second": 0.07, "step": 550 }, { "epoch": 0.7544581618655692, - "eval_negation-triplets_loss": 0.8184330463409424, - "eval_negation-triplets_runtime": 4.2711, - "eval_negation-triplets_samples_per_second": 59.938, - "eval_negation-triplets_steps_per_second": 0.234, + "eval_negation-triplets_loss": 2.106090784072876, + "eval_negation-triplets_runtime": 1.1528, + "eval_negation-triplets_samples_per_second": 111.031, + "eval_negation-triplets_steps_per_second": 0.867, "step": 550 }, { "epoch": 0.7544581618655692, - "eval_scitail-pairs-pos_loss": 0.11127087473869324, - "eval_scitail-pairs-pos_runtime": 3.0897, - "eval_scitail-pairs-pos_samples_per_second": 82.855, - "eval_scitail-pairs-pos_steps_per_second": 0.324, + "eval_scitail-pairs-pos_loss": 0.25539278984069824, + "eval_scitail-pairs-pos_runtime": 1.5749, + "eval_scitail-pairs-pos_samples_per_second": 81.276, + "eval_scitail-pairs-pos_steps_per_second": 0.635, "step": 550 }, { "epoch": 0.7544581618655692, - "eval_scitail-pairs-qa_loss": 0.018279315903782845, - "eval_scitail-pairs-qa_runtime": 2.3553, - "eval_scitail-pairs-qa_samples_per_second": 108.691, - "eval_scitail-pairs-qa_steps_per_second": 0.425, + "eval_scitail-pairs-qa_loss": 0.02696998417377472, + "eval_scitail-pairs-qa_runtime": 1.2219, + "eval_scitail-pairs-qa_samples_per_second": 104.753, + "eval_scitail-pairs-qa_steps_per_second": 0.818, "step": 550 }, { "epoch": 0.7544581618655692, - "eval_xsum-pairs_loss": 0.42622849345207214, - "eval_xsum-pairs_runtime": 12.8556, - "eval_xsum-pairs_samples_per_second": 19.914, - "eval_xsum-pairs_steps_per_second": 0.078, + "eval_xsum-pairs_loss": 1.035770297050476, + "eval_xsum-pairs_runtime": 6.3034, + "eval_xsum-pairs_samples_per_second": 20.307, + "eval_xsum-pairs_steps_per_second": 0.159, "step": 550 }, { "epoch": 0.7544581618655692, - "eval_sciq_pairs_loss": 0.039594996720552444, - "eval_sciq_pairs_runtime": 20.7135, - "eval_sciq_pairs_samples_per_second": 12.359, - "eval_sciq_pairs_steps_per_second": 0.048, + "eval_sciq_pairs_loss": 0.07481610774993896, + "eval_sciq_pairs_runtime": 8.8401, + "eval_sciq_pairs_samples_per_second": 14.48, + "eval_sciq_pairs_steps_per_second": 0.113, "step": 550 }, { "epoch": 0.7544581618655692, - "eval_qasc_pairs_loss": 0.88427734375, - "eval_qasc_pairs_runtime": 3.014, - "eval_qasc_pairs_samples_per_second": 84.937, - "eval_qasc_pairs_steps_per_second": 0.332, + "eval_qasc_pairs_loss": 0.9381294846534729, + "eval_qasc_pairs_runtime": 1.3844, + "eval_qasc_pairs_samples_per_second": 92.461, + "eval_qasc_pairs_steps_per_second": 0.722, "step": 550 }, { "epoch": 0.7544581618655692, - "eval_openbookqa_pairs_loss": 1.3467748165130615, - "eval_openbookqa_pairs_runtime": 2.245, - "eval_openbookqa_pairs_samples_per_second": 114.032, - "eval_openbookqa_pairs_steps_per_second": 0.445, + "eval_openbookqa_pairs_loss": 1.4043291807174683, + "eval_openbookqa_pairs_runtime": 1.1842, + "eval_openbookqa_pairs_samples_per_second": 108.088, + "eval_openbookqa_pairs_steps_per_second": 0.844, "step": 550 }, { "epoch": 0.7544581618655692, - "eval_nq_pairs_loss": 0.1950540691614151, - "eval_nq_pairs_runtime": 18.1193, - "eval_nq_pairs_samples_per_second": 14.129, - "eval_nq_pairs_steps_per_second": 0.055, + "eval_nq_pairs_loss": 0.8262844085693359, + "eval_nq_pairs_runtime": 7.8627, + "eval_nq_pairs_samples_per_second": 16.279, + "eval_nq_pairs_steps_per_second": 0.127, "step": 550 }, { "epoch": 0.7544581618655692, - "eval_trivia_pairs_loss": 0.8971078395843506, - "eval_trivia_pairs_runtime": 16.9594, - "eval_trivia_pairs_samples_per_second": 15.095, - "eval_trivia_pairs_steps_per_second": 0.059, + "eval_trivia_pairs_loss": 1.2341746091842651, + "eval_trivia_pairs_runtime": 8.9261, + "eval_trivia_pairs_samples_per_second": 14.34, + "eval_trivia_pairs_steps_per_second": 0.112, "step": 550 }, { "epoch": 0.7544581618655692, - "eval_gooaq_pairs_loss": 0.5568686127662659, - "eval_gooaq_pairs_runtime": 3.6329, - "eval_gooaq_pairs_samples_per_second": 70.468, - "eval_gooaq_pairs_steps_per_second": 0.275, + "eval_gooaq_pairs_loss": 0.8736229538917542, + "eval_gooaq_pairs_runtime": 2.07, + "eval_gooaq_pairs_samples_per_second": 61.836, + "eval_gooaq_pairs_steps_per_second": 0.483, "step": 550 }, { "epoch": 0.7544581618655692, - "eval_paws-pos_loss": 0.04390861093997955, - "eval_paws-pos_runtime": 2.9399, - "eval_paws-pos_samples_per_second": 87.078, - "eval_paws-pos_steps_per_second": 0.34, + "eval_paws-pos_loss": 0.03411751613020897, + "eval_paws-pos_runtime": 1.5071, + "eval_paws-pos_samples_per_second": 84.931, + "eval_paws-pos_steps_per_second": 0.664, "step": 550 }, { "epoch": 0.7544581618655692, - "eval_global_dataset_loss": 0.2895234227180481, - "eval_global_dataset_runtime": 125.42, - "eval_global_dataset_samples_per_second": 9.759, - "eval_global_dataset_steps_per_second": 0.04, + "eval_global_dataset_loss": 0.26876401901245117, + "eval_global_dataset_runtime": 33.2849, + "eval_global_dataset_samples_per_second": 11.537, + "eval_global_dataset_steps_per_second": 0.06, "step": 550 }, + { + "epoch": 0.7558299039780522, + "grad_norm": 1.4109474420547485, + "learning_rate": 0.0009999925691947101, + "loss": 0.6115, + "step": 551 + }, + { + "epoch": 0.757201646090535, + "grad_norm": 0.21552617847919464, + "learning_rate": 0.000999986789707762, + "loss": 0.0239, + "step": 552 + }, + { + "epoch": 0.7585733882030178, + "grad_norm": 2.0374319553375244, + "learning_rate": 0.000999979358975186, + "loss": 0.8398, + "step": 553 + }, + { + "epoch": 0.7599451303155007, + "grad_norm": 2.699413776397705, + "learning_rate": 0.0009999702770242486, + "loss": 1.0964, + "step": 554 + }, + { + "epoch": 0.7613168724279835, + "grad_norm": 1.9054522514343262, + "learning_rate": 0.000999959543888277, + "loss": 0.5098, + "step": 555 + }, + { + "epoch": 0.7626886145404664, + "grad_norm": 1.9262897968292236, + "learning_rate": 0.0009999471596066566, + "loss": 0.5578, + "step": 556 + }, + { + "epoch": 0.7640603566529492, + "grad_norm": 2.377450942993164, + "learning_rate": 0.0009999331242248324, + "loss": 1.3982, + "step": 557 + }, + { + "epoch": 0.7654320987654321, + "grad_norm": 2.6483099460601807, + "learning_rate": 0.0009999174377943074, + "loss": 1.1118, + "step": 558 + }, + { + "epoch": 0.766803840877915, + "grad_norm": 1.7998634576797485, + "learning_rate": 0.000999900100372644, + "loss": 0.508, + "step": 559 + }, + { + "epoch": 0.7681755829903978, + "grad_norm": 2.284386157989502, + "learning_rate": 0.0009998811120234623, + "loss": 1.4645, + "step": 560 + }, { "epoch": 0.7695473251028807, - "grad_norm": 1.4781888723373413, + "grad_norm": 2.0943026542663574, "learning_rate": 0.0009998604728164411, - "loss": 0.4442, + "loss": 0.8549, "step": 561 }, + { + "epoch": 0.7709190672153635, + "grad_norm": 1.3964064121246338, + "learning_rate": 0.0009998381828273167, + "loss": 0.4917, + "step": 562 + }, + { + "epoch": 0.7722908093278463, + "grad_norm": 1.6305673122406006, + "learning_rate": 0.0009998142421378832, + "loss": 0.6253, + "step": 563 + }, + { + "epoch": 0.7736625514403292, + "grad_norm": 1.6184250116348267, + "learning_rate": 0.0009997886508359918, + "loss": 0.8867, + "step": 564 + }, + { + "epoch": 0.7750342935528121, + "grad_norm": 1.9399840831756592, + "learning_rate": 0.0009997614090155513, + "loss": 0.7135, + "step": 565 + }, + { + "epoch": 0.7764060356652949, + "grad_norm": 2.2604219913482666, + "learning_rate": 0.0009997325167765264, + "loss": 1.222, + "step": 566 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 1.4148451089859009, + "learning_rate": 0.0009997019742249383, + "loss": 0.4528, + "step": 567 + }, + { + "epoch": 0.7791495198902606, + "grad_norm": 0.7912073731422424, + "learning_rate": 0.0009996697814728644, + "loss": 0.1328, + "step": 568 + }, + { + "epoch": 0.7805212620027435, + "grad_norm": 2.348604917526245, + "learning_rate": 0.0009996359386384374, + "loss": 0.8546, + "step": 569 + }, + { + "epoch": 0.7818930041152263, + "grad_norm": 1.507812261581421, + "learning_rate": 0.000999600445845845, + "loss": 0.4279, + "step": 570 + }, + { + "epoch": 0.7832647462277091, + "grad_norm": 2.647885799407959, + "learning_rate": 0.0009995633032253294, + "loss": 1.425, + "step": 571 + }, { "epoch": 0.7846364883401921, - "grad_norm": 1.0631585121154785, + "grad_norm": 1.7565921545028687, "learning_rate": 0.0009995245109131869, - "loss": 0.3803, + "loss": 0.4775, "step": 572 }, + { + "epoch": 0.7860082304526749, + "grad_norm": 2.1025984287261963, + "learning_rate": 0.000999484069051768, + "loss": 0.6325, + "step": 573 + }, + { + "epoch": 0.7873799725651578, + "grad_norm": 2.177631378173828, + "learning_rate": 0.0009994419777894753, + "loss": 0.7343, + "step": 574 + }, + { + "epoch": 0.7887517146776406, + "grad_norm": 2.1677980422973633, + "learning_rate": 0.0009993982372807648, + "loss": 0.8758, + "step": 575 + }, + { + "epoch": 0.7901234567901234, + "grad_norm": 2.5000686645507812, + "learning_rate": 0.000999352847686144, + "loss": 1.4886, + "step": 576 + }, + { + "epoch": 0.7914951989026063, + "grad_norm": 2.379132032394409, + "learning_rate": 0.000999305809172172, + "loss": 1.1258, + "step": 577 + }, + { + "epoch": 0.7928669410150891, + "grad_norm": 2.348276376724243, + "learning_rate": 0.0009992571219114585, + "loss": 1.0294, + "step": 578 + }, + { + "epoch": 0.7942386831275721, + "grad_norm": 2.1541614532470703, + "learning_rate": 0.000999206786082664, + "loss": 1.0091, + "step": 579 + }, + { + "epoch": 0.7956104252400549, + "grad_norm": 2.221259593963623, + "learning_rate": 0.000999154801870497, + "loss": 1.4008, + "step": 580 + }, + { + "epoch": 0.7969821673525377, + "grad_norm": 1.3551172018051147, + "learning_rate": 0.0009991011694657167, + "loss": 0.4384, + "step": 581 + }, + { + "epoch": 0.7983539094650206, + "grad_norm": 2.4766149520874023, + "learning_rate": 0.000999045889065129, + "loss": 1.2668, + "step": 582 + }, { "epoch": 0.7997256515775034, - "grad_norm": 1.074621319770813, + "grad_norm": 1.6486444473266602, "learning_rate": 0.000998988960871588, - "loss": 0.5295, + "loss": 0.7217, "step": 583 }, + { + "epoch": 0.8010973936899863, + "grad_norm": 2.368161201477051, + "learning_rate": 0.0009989303850939937, + "loss": 1.242, + "step": 584 + }, + { + "epoch": 0.8024691358024691, + "grad_norm": 1.684938907623291, + "learning_rate": 0.0009988701619472926, + "loss": 0.8439, + "step": 585 + }, + { + "epoch": 0.803840877914952, + "grad_norm": 2.2783966064453125, + "learning_rate": 0.0009988082916524762, + "loss": 0.876, + "step": 586 + }, + { + "epoch": 0.8052126200274349, + "grad_norm": 2.324449300765991, + "learning_rate": 0.00099874477443658, + "loss": 1.194, + "step": 587 + }, + { + "epoch": 0.8065843621399177, + "grad_norm": 0.8597037196159363, + "learning_rate": 0.0009986796105326831, + "loss": 0.2045, + "step": 588 + }, + { + "epoch": 0.8079561042524005, + "grad_norm": 2.2256205081939697, + "learning_rate": 0.0009986128001799076, + "loss": 1.2798, + "step": 589 + }, + { + "epoch": 0.8093278463648834, + "grad_norm": 2.1167430877685547, + "learning_rate": 0.0009985443436234167, + "loss": 0.8574, + "step": 590 + }, + { + "epoch": 0.8106995884773662, + "grad_norm": 0.10849010199308395, + "learning_rate": 0.0009984742411144143, + "loss": 0.0082, + "step": 591 + }, + { + "epoch": 0.8120713305898491, + "grad_norm": 1.1498011350631714, + "learning_rate": 0.0009984024929101448, + "loss": 0.4044, + "step": 592 + }, + { + "epoch": 0.813443072702332, + "grad_norm": 1.5021452903747559, + "learning_rate": 0.0009983290992738915, + "loss": 0.4919, + "step": 593 + }, { "epoch": 0.8148148148148148, - "grad_norm": 1.6665034294128418, + "grad_norm": 2.5541698932647705, "learning_rate": 0.0009982540604749751, - "loss": 0.3499, + "loss": 1.2524, "step": 594 }, + { + "epoch": 0.8161865569272977, + "grad_norm": 2.190497636795044, + "learning_rate": 0.000998177376788754, + "loss": 0.795, + "step": 595 + }, + { + "epoch": 0.8175582990397805, + "grad_norm": 2.115995407104492, + "learning_rate": 0.000998099048496622, + "loss": 0.8756, + "step": 596 + }, + { + "epoch": 0.8189300411522634, + "grad_norm": 0.6140339374542236, + "learning_rate": 0.0009980190758860081, + "loss": 0.1396, + "step": 597 + }, + { + "epoch": 0.8203017832647462, + "grad_norm": 1.3098580837249756, + "learning_rate": 0.0009979374592503753, + "loss": 0.37, + "step": 598 + }, + { + "epoch": 0.821673525377229, + "grad_norm": 1.9168777465820312, + "learning_rate": 0.0009978541988892192, + "loss": 0.8108, + "step": 599 + }, + { + "epoch": 0.823045267489712, + "grad_norm": 1.329971432685852, + "learning_rate": 0.0009977692951080672, + "loss": 0.4655, + "step": 600 + }, + { + "epoch": 0.8244170096021948, + "grad_norm": 1.5312412977218628, + "learning_rate": 0.0009976827482184776, + "loss": 0.6358, + "step": 601 + }, + { + "epoch": 0.8257887517146777, + "grad_norm": 2.3435771465301514, + "learning_rate": 0.0009975945585380377, + "loss": 1.4994, + "step": 602 + }, + { + "epoch": 0.8271604938271605, + "grad_norm": 1.913093090057373, + "learning_rate": 0.0009975047263903632, + "loss": 0.7349, + "step": 603 + }, + { + "epoch": 0.8285322359396433, + "grad_norm": 2.202099323272705, + "learning_rate": 0.000997413252105097, + "loss": 1.3005, + "step": 604 + }, { "epoch": 0.8299039780521262, - "grad_norm": 1.859250545501709, + "grad_norm": 1.8408820629119873, "learning_rate": 0.000997320136017908, - "loss": 0.4366, + "loss": 1.019, "step": 605 }, + { + "epoch": 0.831275720164609, + "grad_norm": 2.17401385307312, + "learning_rate": 0.0009972253784704896, + "loss": 1.0097, + "step": 606 + }, + { + "epoch": 0.8326474622770919, + "grad_norm": 3.5524911880493164, + "learning_rate": 0.0009971289798105585, + "loss": 3.3517, + "step": 607 + }, + { + "epoch": 0.8340192043895748, + "grad_norm": 2.2348220348358154, + "learning_rate": 0.0009970309403918537, + "loss": 1.3093, + "step": 608 + }, + { + "epoch": 0.8353909465020576, + "grad_norm": 2.220083713531494, + "learning_rate": 0.0009969312605741353, + "loss": 1.4763, + "step": 609 + }, + { + "epoch": 0.8367626886145405, + "grad_norm": 1.5978267192840576, + "learning_rate": 0.0009968299407231824, + "loss": 0.8629, + "step": 610 + }, + { + "epoch": 0.8381344307270233, + "grad_norm": 1.6459195613861084, + "learning_rate": 0.0009967269812107924, + "loss": 0.8401, + "step": 611 + }, + { + "epoch": 0.8395061728395061, + "grad_norm": 1.993955373764038, + "learning_rate": 0.0009966223824147797, + "loss": 1.1366, + "step": 612 + }, + { + "epoch": 0.840877914951989, + "grad_norm": 2.3398549556732178, + "learning_rate": 0.0009965161447189742, + "loss": 1.4177, + "step": 613 + }, + { + "epoch": 0.8422496570644719, + "grad_norm": 2.820521116256714, + "learning_rate": 0.0009964082685132193, + "loss": 2.9207, + "step": 614 + }, + { + "epoch": 0.8436213991769548, + "grad_norm": 1.454741358757019, + "learning_rate": 0.0009962987541933717, + "loss": 0.6808, + "step": 615 + }, { "epoch": 0.8449931412894376, - "grad_norm": 1.294621467590332, + "grad_norm": 1.4422433376312256, "learning_rate": 0.0009961876021612984, - "loss": 0.941, + "loss": 0.5965, "step": 616 }, + { + "epoch": 0.8463648834019204, + "grad_norm": 0.2472972720861435, + "learning_rate": 0.0009960748128248769, + "loss": 0.0313, + "step": 617 + }, + { + "epoch": 0.8477366255144033, + "grad_norm": 1.2811288833618164, + "learning_rate": 0.000995960386597992, + "loss": 0.8359, + "step": 618 + }, + { + "epoch": 0.8491083676268861, + "grad_norm": 1.7258317470550537, + "learning_rate": 0.0009958443239005362, + "loss": 0.9549, + "step": 619 + }, + { + "epoch": 0.850480109739369, + "grad_norm": 2.052578926086426, + "learning_rate": 0.000995726625158406, + "loss": 1.2424, + "step": 620 + }, + { + "epoch": 0.8518518518518519, + "grad_norm": 2.598790407180786, + "learning_rate": 0.0009956072908035023, + "loss": 1.5772, + "step": 621 + }, + { + "epoch": 0.8532235939643347, + "grad_norm": 1.8372243642807007, + "learning_rate": 0.000995486321273727, + "loss": 0.8351, + "step": 622 + }, + { + "epoch": 0.8545953360768176, + "grad_norm": 2.4052348136901855, + "learning_rate": 0.0009953637170129837, + "loss": 0.726, + "step": 623 + }, + { + "epoch": 0.8559670781893004, + "grad_norm": 1.429192066192627, + "learning_rate": 0.0009952394784711735, + "loss": 0.4982, + "step": 624 + }, + { + "epoch": 0.8573388203017832, + "grad_norm": 1.8847312927246094, + "learning_rate": 0.000995113606104195, + "loss": 1.0102, + "step": 625 + }, + { + "epoch": 0.8587105624142661, + "grad_norm": 1.5976717472076416, + "learning_rate": 0.0009949861003739422, + "loss": 0.7013, + "step": 626 + }, { "epoch": 0.8600823045267489, "grad_norm": 0.0, "learning_rate": 0.000994856961748303, - "loss": 0.3918, + "loss": 0.0, "step": 627 }, + { + "epoch": 0.8614540466392319, + "grad_norm": 2.0918030738830566, + "learning_rate": 0.000994726190701157, + "loss": 0.9771, + "step": 628 + }, + { + "epoch": 0.8628257887517147, + "grad_norm": 1.5623385906219482, + "learning_rate": 0.0009945937877123738, + "loss": 0.6401, + "step": 629 + }, + { + "epoch": 0.8641975308641975, + "grad_norm": 1.988359808921814, + "learning_rate": 0.0009944597532678121, + "loss": 0.8453, + "step": 630 + }, + { + "epoch": 0.8655692729766804, + "grad_norm": 1.7483891248703003, + "learning_rate": 0.0009943240878593168, + "loss": 0.6376, + "step": 631 + }, + { + "epoch": 0.8669410150891632, + "grad_norm": 1.747367024421692, + "learning_rate": 0.000994186791984718, + "loss": 0.8945, + "step": 632 + }, + { + "epoch": 0.8683127572016461, + "grad_norm": 1.7018707990646362, + "learning_rate": 0.0009940478661478289, + "loss": 0.7919, + "step": 633 + }, + { + "epoch": 0.869684499314129, + "grad_norm": 2.589437484741211, + "learning_rate": 0.0009939073108584432, + "loss": 2.7374, + "step": 634 + }, + { + "epoch": 0.8710562414266118, + "grad_norm": 1.5699372291564941, + "learning_rate": 0.0009937651266323352, + "loss": 0.6152, + "step": 635 + }, + { + "epoch": 0.8724279835390947, + "grad_norm": 1.5574616193771362, + "learning_rate": 0.0009936213139912556, + "loss": 0.7544, + "step": 636 + }, + { + "epoch": 0.8737997256515775, + "grad_norm": 1.4045825004577637, + "learning_rate": 0.0009934758734629308, + "loss": 0.764, + "step": 637 + }, { "epoch": 0.8751714677640604, "grad_norm": 0.0, "learning_rate": 0.0009933288055810616, - "loss": 0.5476, + "loss": 0.0, "step": 638 }, + { + "epoch": 0.8765432098765432, + "grad_norm": 0.1518259048461914, + "learning_rate": 0.0009931801108853191, + "loss": 0.0341, + "step": 639 + }, + { + "epoch": 0.877914951989026, + "grad_norm": 1.7389847040176392, + "learning_rate": 0.0009930297899213452, + "loss": 0.8661, + "step": 640 + }, + { + "epoch": 0.879286694101509, + "grad_norm": 1.4264923334121704, + "learning_rate": 0.0009928778432407492, + "loss": 0.5543, + "step": 641 + }, + { + "epoch": 0.8806584362139918, + "grad_norm": 1.8644518852233887, + "learning_rate": 0.0009927242714011055, + "loss": 0.7215, + "step": 642 + }, + { + "epoch": 0.8820301783264746, + "grad_norm": 1.5277243852615356, + "learning_rate": 0.0009925690749659527, + "loss": 0.5738, + "step": 643 + }, + { + "epoch": 0.8834019204389575, + "grad_norm": 1.8840594291687012, + "learning_rate": 0.0009924122545047907, + "loss": 1.2774, + "step": 644 + }, + { + "epoch": 0.8847736625514403, + "grad_norm": 1.5248987674713135, + "learning_rate": 0.000992253810593079, + "loss": 0.5194, + "step": 645 + }, + { + "epoch": 0.8861454046639232, + "grad_norm": 1.2852954864501953, + "learning_rate": 0.000992093743812234, + "loss": 0.4957, + "step": 646 + }, + { + "epoch": 0.887517146776406, + "grad_norm": 1.4410088062286377, + "learning_rate": 0.0009919320547496276, + "loss": 0.6336, + "step": 647 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 2.2397258281707764, + "learning_rate": 0.0009917687439985848, + "loss": 1.3606, + "step": 648 + }, { "epoch": 0.8902606310013718, - "grad_norm": 0.9047508835792542, + "grad_norm": 1.6791517734527588, "learning_rate": 0.000991603812158381, - "loss": 0.3118, + "loss": 0.5841, "step": 649 + }, + { + "epoch": 0.8916323731138546, + "grad_norm": 1.8580983877182007, + "learning_rate": 0.000991437259834241, + "loss": 0.7388, + "step": 650 + }, + { + "epoch": 0.8930041152263375, + "grad_norm": 0.7454937696456909, + "learning_rate": 0.0009912690876373357, + "loss": 0.1784, + "step": 651 + }, + { + "epoch": 0.8943758573388203, + "grad_norm": 1.311299443244934, + "learning_rate": 0.0009910992961847797, + "loss": 0.8583, + "step": 652 + }, + { + "epoch": 0.8957475994513031, + "grad_norm": 0.6724503040313721, + "learning_rate": 0.00099092788609963, + "loss": 0.16, + "step": 653 + }, + { + "epoch": 0.897119341563786, + "grad_norm": 0.3248959481716156, + "learning_rate": 0.0009907548580108834, + "loss": 0.0596, + "step": 654 + }, + { + "epoch": 0.8984910836762688, + "grad_norm": 1.593714714050293, + "learning_rate": 0.0009905802125534738, + "loss": 0.5839, + "step": 655 + }, + { + "epoch": 0.8998628257887518, + "grad_norm": 1.5883684158325195, + "learning_rate": 0.0009904039503682701, + "loss": 0.5885, + "step": 656 + }, + { + "epoch": 0.9012345679012346, + "grad_norm": 2.72161602973938, + "learning_rate": 0.0009902260721020737, + "loss": 1.712, + "step": 657 } ], - "logging_steps": 11, + "logging_steps": 1, "max_steps": 2187, "num_input_tokens_seen": 0, "num_train_epochs": 3,