{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0078277886497065, "eval_steps": 5, "global_step": 515, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0019569471624266144, "grad_norm": 3.932948112487793, "learning_rate": 7.8125e-08, "loss": 0.107, "step": 1 }, { "epoch": 0.003913894324853229, "grad_norm": 4.482716083526611, "learning_rate": 1.5625e-07, "loss": 0.1529, "step": 2 }, { "epoch": 0.005870841487279843, "grad_norm": 4.672689437866211, "learning_rate": 2.3437500000000003e-07, "loss": 0.1874, "step": 3 }, { "epoch": 0.007827788649706457, "grad_norm": 4.226949214935303, "learning_rate": 3.125e-07, "loss": 0.1682, "step": 4 }, { "epoch": 0.009784735812133072, "grad_norm": 4.327479362487793, "learning_rate": 3.90625e-07, "loss": 0.1438, "step": 5 }, { "epoch": 0.009784735812133072, "eval_loss": 0.1470455378293991, "eval_runtime": 107.3614, "eval_samples_per_second": 28.427, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8861388036460539, "eval_sts-test_pearson_dot": 0.8769528313548112, "eval_sts-test_pearson_euclidean": 0.9079831987750276, "eval_sts-test_pearson_manhattan": 0.9086786527495163, "eval_sts-test_pearson_max": 0.9086786527495163, "eval_sts-test_spearman_cosine": 0.9077902566323186, "eval_sts-test_spearman_dot": 0.8794770733264693, "eval_sts-test_spearman_euclidean": 0.903967335376697, "eval_sts-test_spearman_manhattan": 0.9043498244078092, "eval_sts-test_spearman_max": 0.9077902566323186, "step": 5 }, { "epoch": 0.011741682974559686, "grad_norm": 5.27250337600708, "learning_rate": 4.6875000000000006e-07, "loss": 0.2961, "step": 6 }, { "epoch": 0.0136986301369863, "grad_norm": 5.903276443481445, "learning_rate": 5.468750000000001e-07, "loss": 0.3019, "step": 7 }, { "epoch": 0.015655577299412915, "grad_norm": 4.000335693359375, "learning_rate": 6.25e-07, "loss": 0.1184, "step": 8 }, { "epoch": 0.01761252446183953, "grad_norm": 5.876769065856934, "learning_rate": 7.03125e-07, "loss": 0.3176, "step": 9 }, { "epoch": 0.019569471624266144, "grad_norm": 4.8437933921813965, "learning_rate": 7.8125e-07, "loss": 0.2234, "step": 10 }, { "epoch": 0.019569471624266144, "eval_loss": 0.1467687040567398, "eval_runtime": 107.2549, "eval_samples_per_second": 28.456, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8861409457129842, "eval_sts-test_pearson_dot": 0.876972814890145, "eval_sts-test_pearson_euclidean": 0.9080268416052204, "eval_sts-test_pearson_manhattan": 0.9087444298597203, "eval_sts-test_pearson_max": 0.9087444298597203, "eval_sts-test_spearman_cosine": 0.9078342918735278, "eval_sts-test_spearman_dot": 0.8794190309404447, "eval_sts-test_spearman_euclidean": 0.9039501508923226, "eval_sts-test_spearman_manhattan": 0.9044244247605487, "eval_sts-test_spearman_max": 0.9078342918735278, "step": 10 }, { "epoch": 0.021526418786692758, "grad_norm": 4.726498603820801, "learning_rate": 8.59375e-07, "loss": 0.1881, "step": 11 }, { "epoch": 0.023483365949119372, "grad_norm": 4.818070411682129, "learning_rate": 9.375000000000001e-07, "loss": 0.1593, "step": 12 }, { "epoch": 0.025440313111545987, "grad_norm": 4.98201322555542, "learning_rate": 1.0156250000000001e-06, "loss": 0.1833, "step": 13 }, { "epoch": 0.0273972602739726, "grad_norm": 4.269514560699463, "learning_rate": 1.0937500000000001e-06, "loss": 0.1352, "step": 14 }, { "epoch": 0.029354207436399216, "grad_norm": 6.1525492668151855, "learning_rate": 1.1718750000000001e-06, "loss": 0.3143, "step": 15 }, { "epoch": 0.029354207436399216, "eval_loss": 0.1462097316980362, "eval_runtime": 107.0721, "eval_samples_per_second": 28.504, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8860829119688085, "eval_sts-test_pearson_dot": 0.8768990080043222, "eval_sts-test_pearson_euclidean": 0.9080646402781543, "eval_sts-test_pearson_manhattan": 0.9088063929836994, "eval_sts-test_pearson_max": 0.9088063929836994, "eval_sts-test_spearman_cosine": 0.907713597721555, "eval_sts-test_spearman_dot": 0.8795110842851269, "eval_sts-test_spearman_euclidean": 0.9040110126078148, "eval_sts-test_spearman_manhattan": 0.9045081991218733, "eval_sts-test_spearman_max": 0.907713597721555, "step": 15 }, { "epoch": 0.03131115459882583, "grad_norm": 4.751354694366455, "learning_rate": 1.25e-06, "loss": 0.1583, "step": 16 }, { "epoch": 0.033268101761252444, "grad_norm": 5.435980319976807, "learning_rate": 1.328125e-06, "loss": 0.2015, "step": 17 }, { "epoch": 0.03522504892367906, "grad_norm": 4.1765851974487305, "learning_rate": 1.40625e-06, "loss": 0.1476, "step": 18 }, { "epoch": 0.03718199608610567, "grad_norm": 4.689794540405273, "learning_rate": 1.484375e-06, "loss": 0.1676, "step": 19 }, { "epoch": 0.03913894324853229, "grad_norm": 4.203744888305664, "learning_rate": 1.5625e-06, "loss": 0.1525, "step": 20 }, { "epoch": 0.03913894324853229, "eval_loss": 0.14544810354709625, "eval_runtime": 107.1845, "eval_samples_per_second": 28.474, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8861436293943533, "eval_sts-test_pearson_dot": 0.8769239163708102, "eval_sts-test_pearson_euclidean": 0.9082269545633608, "eval_sts-test_pearson_manhattan": 0.9089828403051001, "eval_sts-test_pearson_max": 0.9089828403051001, "eval_sts-test_spearman_cosine": 0.907929343552723, "eval_sts-test_spearman_dot": 0.8796122221358714, "eval_sts-test_spearman_euclidean": 0.9043074002120102, "eval_sts-test_spearman_manhattan": 0.9047217521412333, "eval_sts-test_spearman_max": 0.907929343552723, "step": 20 }, { "epoch": 0.0410958904109589, "grad_norm": 5.152130603790283, "learning_rate": 1.640625e-06, "loss": 0.1717, "step": 21 }, { "epoch": 0.043052837573385516, "grad_norm": 5.343059062957764, "learning_rate": 1.71875e-06, "loss": 0.198, "step": 22 }, { "epoch": 0.04500978473581213, "grad_norm": 5.224748134613037, "learning_rate": 1.796875e-06, "loss": 0.3062, "step": 23 }, { "epoch": 0.046966731898238745, "grad_norm": 4.6179423332214355, "learning_rate": 1.8750000000000003e-06, "loss": 0.1241, "step": 24 }, { "epoch": 0.04892367906066536, "grad_norm": 4.200148105621338, "learning_rate": 1.953125e-06, "loss": 0.1087, "step": 25 }, { "epoch": 0.04892367906066536, "eval_loss": 0.14457188546657562, "eval_runtime": 107.3809, "eval_samples_per_second": 28.422, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8862905994058754, "eval_sts-test_pearson_dot": 0.877015249192232, "eval_sts-test_pearson_euclidean": 0.9085054742522269, "eval_sts-test_pearson_manhattan": 0.9092575877809899, "eval_sts-test_pearson_max": 0.9092575877809899, "eval_sts-test_spearman_cosine": 0.9082294902628751, "eval_sts-test_spearman_dot": 0.8798810429630494, "eval_sts-test_spearman_euclidean": 0.9047149499495015, "eval_sts-test_spearman_manhattan": 0.9051023616193669, "eval_sts-test_spearman_max": 0.9082294902628751, "step": 25 }, { "epoch": 0.050880626223091974, "grad_norm": 4.890737533569336, "learning_rate": 2.0312500000000002e-06, "loss": 0.1767, "step": 26 }, { "epoch": 0.05283757338551859, "grad_norm": 4.683767795562744, "learning_rate": 2.109375e-06, "loss": 0.1951, "step": 27 }, { "epoch": 0.0547945205479452, "grad_norm": 4.656280040740967, "learning_rate": 2.1875000000000002e-06, "loss": 0.1621, "step": 28 }, { "epoch": 0.05675146771037182, "grad_norm": 4.446409702301025, "learning_rate": 2.265625e-06, "loss": 0.221, "step": 29 }, { "epoch": 0.05870841487279843, "grad_norm": 5.765133857727051, "learning_rate": 2.3437500000000002e-06, "loss": 0.2241, "step": 30 }, { "epoch": 0.05870841487279843, "eval_loss": 0.14350731670856476, "eval_runtime": 107.3747, "eval_samples_per_second": 28.424, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8863784941826807, "eval_sts-test_pearson_dot": 0.8768948467465629, "eval_sts-test_pearson_euclidean": 0.9088066170487232, "eval_sts-test_pearson_manhattan": 0.9095658568102677, "eval_sts-test_pearson_max": 0.9095658568102677, "eval_sts-test_spearman_cosine": 0.9082580415676429, "eval_sts-test_spearman_dot": 0.8801849487791585, "eval_sts-test_spearman_euclidean": 0.9051721735871375, "eval_sts-test_spearman_manhattan": 0.9054862826908437, "eval_sts-test_spearman_max": 0.9082580415676429, "step": 30 }, { "epoch": 0.060665362035225046, "grad_norm": 5.359245777130127, "learning_rate": 2.421875e-06, "loss": 0.2093, "step": 31 }, { "epoch": 0.06262230919765166, "grad_norm": 4.439486503601074, "learning_rate": 2.5e-06, "loss": 0.1615, "step": 32 }, { "epoch": 0.06457925636007827, "grad_norm": 3.689824342727661, "learning_rate": 2.5781250000000004e-06, "loss": 0.1615, "step": 33 }, { "epoch": 0.06653620352250489, "grad_norm": 4.842885494232178, "learning_rate": 2.65625e-06, "loss": 0.1772, "step": 34 }, { "epoch": 0.0684931506849315, "grad_norm": 5.209301948547363, "learning_rate": 2.7343750000000004e-06, "loss": 0.2324, "step": 35 }, { "epoch": 0.0684931506849315, "eval_loss": 0.14226235449314117, "eval_runtime": 107.3108, "eval_samples_per_second": 28.441, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8863574366132135, "eval_sts-test_pearson_dot": 0.8765683077424664, "eval_sts-test_pearson_euclidean": 0.9091012263251723, "eval_sts-test_pearson_manhattan": 0.9098631032540263, "eval_sts-test_pearson_max": 0.9098631032540263, "eval_sts-test_spearman_cosine": 0.9083728733043733, "eval_sts-test_spearman_dot": 0.8800282746130272, "eval_sts-test_spearman_euclidean": 0.9052579170039636, "eval_sts-test_spearman_manhattan": 0.9059997586640487, "eval_sts-test_spearman_max": 0.9083728733043733, "step": 35 }, { "epoch": 0.07045009784735812, "grad_norm": 4.740983009338379, "learning_rate": 2.8125e-06, "loss": 0.2611, "step": 36 }, { "epoch": 0.07240704500978473, "grad_norm": 5.090059757232666, "learning_rate": 2.8906250000000004e-06, "loss": 0.214, "step": 37 }, { "epoch": 0.07436399217221135, "grad_norm": 5.123153209686279, "learning_rate": 2.96875e-06, "loss": 0.1985, "step": 38 }, { "epoch": 0.07632093933463796, "grad_norm": 5.401946067810059, "learning_rate": 3.0468750000000004e-06, "loss": 0.1855, "step": 39 }, { "epoch": 0.07827788649706457, "grad_norm": 4.838700294494629, "learning_rate": 3.125e-06, "loss": 0.1234, "step": 40 }, { "epoch": 0.07827788649706457, "eval_loss": 0.14100149273872375, "eval_runtime": 107.3059, "eval_samples_per_second": 28.442, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8864265749012155, "eval_sts-test_pearson_dot": 0.8764612424174422, "eval_sts-test_pearson_euclidean": 0.9094092487009695, "eval_sts-test_pearson_manhattan": 0.9101707626021143, "eval_sts-test_pearson_max": 0.9101707626021143, "eval_sts-test_spearman_cosine": 0.908505695048183, "eval_sts-test_spearman_dot": 0.8802103674956289, "eval_sts-test_spearman_euclidean": 0.9054564783507572, "eval_sts-test_spearman_manhattan": 0.9063046490079084, "eval_sts-test_spearman_max": 0.908505695048183, "step": 40 }, { "epoch": 0.08023483365949119, "grad_norm": 3.8856801986694336, "learning_rate": 3.2031250000000004e-06, "loss": 0.1492, "step": 41 }, { "epoch": 0.0821917808219178, "grad_norm": 5.678151607513428, "learning_rate": 3.28125e-06, "loss": 0.2022, "step": 42 }, { "epoch": 0.08414872798434442, "grad_norm": 5.104148864746094, "learning_rate": 3.3593750000000003e-06, "loss": 0.2146, "step": 43 }, { "epoch": 0.08610567514677103, "grad_norm": 4.76043701171875, "learning_rate": 3.4375e-06, "loss": 0.1688, "step": 44 }, { "epoch": 0.08806262230919765, "grad_norm": 5.128803730010986, "learning_rate": 3.5156250000000003e-06, "loss": 0.175, "step": 45 }, { "epoch": 0.08806262230919765, "eval_loss": 0.13962982594966888, "eval_runtime": 107.4144, "eval_samples_per_second": 28.413, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.886410908658177, "eval_sts-test_pearson_dot": 0.8762836795862763, "eval_sts-test_pearson_euclidean": 0.9096890242379734, "eval_sts-test_pearson_manhattan": 0.9104590803642174, "eval_sts-test_pearson_max": 0.9104590803642174, "eval_sts-test_spearman_cosine": 0.9086694846648755, "eval_sts-test_spearman_dot": 0.8801346931126159, "eval_sts-test_spearman_euclidean": 0.9057376952773407, "eval_sts-test_spearman_manhattan": 0.9064708999439774, "eval_sts-test_spearman_max": 0.9086694846648755, "step": 45 }, { "epoch": 0.09001956947162426, "grad_norm": 4.968522548675537, "learning_rate": 3.59375e-06, "loss": 0.2123, "step": 46 }, { "epoch": 0.09197651663405088, "grad_norm": 4.343472957611084, "learning_rate": 3.6718750000000003e-06, "loss": 0.1118, "step": 47 }, { "epoch": 0.09393346379647749, "grad_norm": 6.252938270568848, "learning_rate": 3.7500000000000005e-06, "loss": 0.3009, "step": 48 }, { "epoch": 0.0958904109589041, "grad_norm": 3.411029815673828, "learning_rate": 3.828125000000001e-06, "loss": 0.1071, "step": 49 }, { "epoch": 0.09784735812133072, "grad_norm": 5.379226207733154, "learning_rate": 3.90625e-06, "loss": 0.2608, "step": 50 }, { "epoch": 0.09784735812133072, "eval_loss": 0.13823722302913666, "eval_runtime": 107.3656, "eval_samples_per_second": 28.426, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8863074884351817, "eval_sts-test_pearson_dot": 0.8763122134205692, "eval_sts-test_pearson_euclidean": 0.9097700018848961, "eval_sts-test_pearson_manhattan": 0.9105724410858811, "eval_sts-test_pearson_max": 0.9105724410858811, "eval_sts-test_spearman_cosine": 0.9085105281844131, "eval_sts-test_spearman_dot": 0.8801239975611433, "eval_sts-test_spearman_euclidean": 0.9059798443527296, "eval_sts-test_spearman_manhattan": 0.9065691737139927, "eval_sts-test_spearman_max": 0.9085105281844131, "step": 50 }, { "epoch": 0.09980430528375733, "grad_norm": 4.599095821380615, "learning_rate": 3.984375e-06, "loss": 0.1368, "step": 51 }, { "epoch": 0.10176125244618395, "grad_norm": 5.634761333465576, "learning_rate": 4.0625000000000005e-06, "loss": 0.2307, "step": 52 }, { "epoch": 0.10371819960861056, "grad_norm": 4.678525924682617, "learning_rate": 4.140625000000001e-06, "loss": 0.1366, "step": 53 }, { "epoch": 0.10567514677103718, "grad_norm": 4.931070327758789, "learning_rate": 4.21875e-06, "loss": 0.1857, "step": 54 }, { "epoch": 0.10763209393346379, "grad_norm": 4.903087139129639, "learning_rate": 4.296875e-06, "loss": 0.2155, "step": 55 }, { "epoch": 0.10763209393346379, "eval_loss": 0.1367325782775879, "eval_runtime": 107.3012, "eval_samples_per_second": 28.443, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.88603017002284, "eval_sts-test_pearson_dot": 0.8761626193697236, "eval_sts-test_pearson_euclidean": 0.9096799681812165, "eval_sts-test_pearson_manhattan": 0.9104977957475867, "eval_sts-test_pearson_max": 0.9104977957475867, "eval_sts-test_spearman_cosine": 0.9084685067499666, "eval_sts-test_spearman_dot": 0.8802836700617878, "eval_sts-test_spearman_euclidean": 0.9058409364373706, "eval_sts-test_spearman_manhattan": 0.9064240006220393, "eval_sts-test_spearman_max": 0.9084685067499666, "step": 55 }, { "epoch": 0.1095890410958904, "grad_norm": 5.408311367034912, "learning_rate": 4.3750000000000005e-06, "loss": 0.2022, "step": 56 }, { "epoch": 0.11154598825831702, "grad_norm": 4.5926713943481445, "learning_rate": 4.453125000000001e-06, "loss": 0.2076, "step": 57 }, { "epoch": 0.11350293542074363, "grad_norm": 6.475535869598389, "learning_rate": 4.53125e-06, "loss": 0.4133, "step": 58 }, { "epoch": 0.11545988258317025, "grad_norm": 4.997581481933594, "learning_rate": 4.609375e-06, "loss": 0.1823, "step": 59 }, { "epoch": 0.11741682974559686, "grad_norm": 3.899284601211548, "learning_rate": 4.6875000000000004e-06, "loss": 0.1136, "step": 60 }, { "epoch": 0.11741682974559686, "eval_loss": 0.13528631627559662, "eval_runtime": 107.3435, "eval_samples_per_second": 28.432, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8860224650016398, "eval_sts-test_pearson_dot": 0.8762739756970772, "eval_sts-test_pearson_euclidean": 0.9099016820022997, "eval_sts-test_pearson_manhattan": 0.9107281338135995, "eval_sts-test_pearson_max": 0.9107281338135995, "eval_sts-test_spearman_cosine": 0.9087510214631306, "eval_sts-test_spearman_dot": 0.8808623486228402, "eval_sts-test_spearman_euclidean": 0.9060555634870038, "eval_sts-test_spearman_manhattan": 0.9067256241238172, "eval_sts-test_spearman_max": 0.9087510214631306, "step": 60 }, { "epoch": 0.11937377690802348, "grad_norm": 4.476404190063477, "learning_rate": 4.765625000000001e-06, "loss": 0.1687, "step": 61 }, { "epoch": 0.12133072407045009, "grad_norm": 4.893277168273926, "learning_rate": 4.84375e-06, "loss": 0.1591, "step": 62 }, { "epoch": 0.1232876712328767, "grad_norm": 4.510354042053223, "learning_rate": 4.921875e-06, "loss": 0.1653, "step": 63 }, { "epoch": 0.12524461839530332, "grad_norm": 4.400285243988037, "learning_rate": 5e-06, "loss": 0.1799, "step": 64 }, { "epoch": 0.12720156555772993, "grad_norm": 4.631839752197266, "learning_rate": 5.078125000000001e-06, "loss": 0.1578, "step": 65 }, { "epoch": 0.12720156555772993, "eval_loss": 0.1336735188961029, "eval_runtime": 107.4984, "eval_samples_per_second": 28.391, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.886014179849858, "eval_sts-test_pearson_dot": 0.8762492282837839, "eval_sts-test_pearson_euclidean": 0.9101155794045166, "eval_sts-test_pearson_manhattan": 0.9109538919103571, "eval_sts-test_pearson_max": 0.9109538919103571, "eval_sts-test_spearman_cosine": 0.9089514176116413, "eval_sts-test_spearman_dot": 0.8810853441583534, "eval_sts-test_spearman_euclidean": 0.9061670836303911, "eval_sts-test_spearman_manhattan": 0.9072153371772234, "eval_sts-test_spearman_max": 0.9089514176116413, "step": 65 }, { "epoch": 0.12915851272015655, "grad_norm": 4.043459415435791, "learning_rate": 5.156250000000001e-06, "loss": 0.1844, "step": 66 }, { "epoch": 0.13111545988258316, "grad_norm": 4.447835922241211, "learning_rate": 5.234375e-06, "loss": 0.1489, "step": 67 }, { "epoch": 0.13307240704500978, "grad_norm": 5.372109889984131, "learning_rate": 5.3125e-06, "loss": 0.1845, "step": 68 }, { "epoch": 0.1350293542074364, "grad_norm": 3.5112483501434326, "learning_rate": 5.390625000000001e-06, "loss": 0.1364, "step": 69 }, { "epoch": 0.136986301369863, "grad_norm": 4.305239200592041, "learning_rate": 5.468750000000001e-06, "loss": 0.1584, "step": 70 }, { "epoch": 0.136986301369863, "eval_loss": 0.1320798397064209, "eval_runtime": 107.505, "eval_samples_per_second": 28.389, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.88578311613969, "eval_sts-test_pearson_dot": 0.875928774505713, "eval_sts-test_pearson_euclidean": 0.91024619729973, "eval_sts-test_pearson_manhattan": 0.9110959495329505, "eval_sts-test_pearson_max": 0.9110959495329505, "eval_sts-test_spearman_cosine": 0.9086066538938818, "eval_sts-test_spearman_dot": 0.8801235500485294, "eval_sts-test_spearman_euclidean": 0.9060052183179386, "eval_sts-test_spearman_manhattan": 0.907439182986703, "eval_sts-test_spearman_max": 0.9086066538938818, "step": 70 }, { "epoch": 0.13894324853228962, "grad_norm": 5.093306064605713, "learning_rate": 5.546875e-06, "loss": 0.2279, "step": 71 }, { "epoch": 0.14090019569471623, "grad_norm": 4.953585147857666, "learning_rate": 5.625e-06, "loss": 0.2028, "step": 72 }, { "epoch": 0.14285714285714285, "grad_norm": 4.1561102867126465, "learning_rate": 5.7031250000000006e-06, "loss": 0.2291, "step": 73 }, { "epoch": 0.14481409001956946, "grad_norm": 5.00941801071167, "learning_rate": 5.781250000000001e-06, "loss": 0.2419, "step": 74 }, { "epoch": 0.14677103718199608, "grad_norm": 3.6476099491119385, "learning_rate": 5.859375e-06, "loss": 0.1329, "step": 75 }, { "epoch": 0.14677103718199608, "eval_loss": 0.13061992824077606, "eval_runtime": 107.3395, "eval_samples_per_second": 28.433, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8854112983780439, "eval_sts-test_pearson_dot": 0.8752625071185561, "eval_sts-test_pearson_euclidean": 0.9103378320010516, "eval_sts-test_pearson_manhattan": 0.9112261622276095, "eval_sts-test_pearson_max": 0.9112261622276095, "eval_sts-test_spearman_cosine": 0.9082604133844965, "eval_sts-test_spearman_dot": 0.8794192099454903, "eval_sts-test_spearman_euclidean": 0.9060063370994732, "eval_sts-test_spearman_manhattan": 0.90766132824825, "eval_sts-test_spearman_max": 0.9082604133844965, "step": 75 }, { "epoch": 0.1487279843444227, "grad_norm": 4.10636568069458, "learning_rate": 5.9375e-06, "loss": 0.204, "step": 76 }, { "epoch": 0.1506849315068493, "grad_norm": 4.767779350280762, "learning_rate": 6.0156250000000005e-06, "loss": 0.2239, "step": 77 }, { "epoch": 0.15264187866927592, "grad_norm": 5.366302490234375, "learning_rate": 6.093750000000001e-06, "loss": 0.2181, "step": 78 }, { "epoch": 0.15459882583170254, "grad_norm": 4.087960720062256, "learning_rate": 6.171875e-06, "loss": 0.1285, "step": 79 }, { "epoch": 0.15655577299412915, "grad_norm": 3.7557668685913086, "learning_rate": 6.25e-06, "loss": 0.1067, "step": 80 }, { "epoch": 0.15655577299412915, "eval_loss": 0.12924787402153015, "eval_runtime": 107.2528, "eval_samples_per_second": 28.456, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8850894038300653, "eval_sts-test_pearson_dot": 0.874941916465686, "eval_sts-test_pearson_euclidean": 0.9101863990952803, "eval_sts-test_pearson_manhattan": 0.9110826056950171, "eval_sts-test_pearson_max": 0.9110826056950171, "eval_sts-test_spearman_cosine": 0.9078700928826409, "eval_sts-test_spearman_dot": 0.8792947566875607, "eval_sts-test_spearman_euclidean": 0.9059290069197888, "eval_sts-test_spearman_manhattan": 0.9075206750336968, "eval_sts-test_spearman_max": 0.9078700928826409, "step": 80 }, { "epoch": 0.15851272015655576, "grad_norm": 3.5708839893341064, "learning_rate": 6.3281250000000005e-06, "loss": 0.1189, "step": 81 }, { "epoch": 0.16046966731898238, "grad_norm": 4.602839469909668, "learning_rate": 6.406250000000001e-06, "loss": 0.236, "step": 82 }, { "epoch": 0.162426614481409, "grad_norm": 4.304513931274414, "learning_rate": 6.484375000000001e-06, "loss": 0.1584, "step": 83 }, { "epoch": 0.1643835616438356, "grad_norm": 4.165163516998291, "learning_rate": 6.5625e-06, "loss": 0.1925, "step": 84 }, { "epoch": 0.16634050880626222, "grad_norm": 3.9157192707061768, "learning_rate": 6.6406250000000005e-06, "loss": 0.129, "step": 85 }, { "epoch": 0.16634050880626222, "eval_loss": 0.1278335303068161, "eval_runtime": 107.1978, "eval_samples_per_second": 28.471, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8845993101894516, "eval_sts-test_pearson_dot": 0.8740701762146532, "eval_sts-test_pearson_euclidean": 0.9100055922999684, "eval_sts-test_pearson_manhattan": 0.9108899080028133, "eval_sts-test_pearson_max": 0.9108899080028133, "eval_sts-test_spearman_cosine": 0.9078923342595523, "eval_sts-test_spearman_dot": 0.8788126513485913, "eval_sts-test_spearman_euclidean": 0.9057257466905491, "eval_sts-test_spearman_manhattan": 0.9070083178420268, "eval_sts-test_spearman_max": 0.9078923342595523, "step": 85 }, { "epoch": 0.16829745596868884, "grad_norm": 4.233823776245117, "learning_rate": 6.718750000000001e-06, "loss": 0.1376, "step": 86 }, { "epoch": 0.17025440313111545, "grad_norm": 4.670790195465088, "learning_rate": 6.796875000000001e-06, "loss": 0.1691, "step": 87 }, { "epoch": 0.17221135029354206, "grad_norm": 3.742030382156372, "learning_rate": 6.875e-06, "loss": 0.1045, "step": 88 }, { "epoch": 0.17416829745596868, "grad_norm": 4.242702960968018, "learning_rate": 6.9531250000000004e-06, "loss": 0.165, "step": 89 }, { "epoch": 0.1761252446183953, "grad_norm": 5.499476909637451, "learning_rate": 7.031250000000001e-06, "loss": 0.2926, "step": 90 }, { "epoch": 0.1761252446183953, "eval_loss": 0.12669824063777924, "eval_runtime": 107.2778, "eval_samples_per_second": 28.45, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8844194771150324, "eval_sts-test_pearson_dot": 0.873458365713796, "eval_sts-test_pearson_euclidean": 0.9099396625521212, "eval_sts-test_pearson_manhattan": 0.910745898918033, "eval_sts-test_pearson_max": 0.910745898918033, "eval_sts-test_spearman_cosine": 0.907622707909669, "eval_sts-test_spearman_dot": 0.8783740442356941, "eval_sts-test_spearman_euclidean": 0.9058808545625318, "eval_sts-test_spearman_manhattan": 0.906889458491771, "eval_sts-test_spearman_max": 0.907622707909669, "step": 90 }, { "epoch": 0.1780821917808219, "grad_norm": 2.992021083831787, "learning_rate": 7.109375000000001e-06, "loss": 0.1048, "step": 91 }, { "epoch": 0.18003913894324852, "grad_norm": 4.298286437988281, "learning_rate": 7.1875e-06, "loss": 0.1596, "step": 92 }, { "epoch": 0.18199608610567514, "grad_norm": 5.210509300231934, "learning_rate": 7.265625e-06, "loss": 0.2474, "step": 93 }, { "epoch": 0.18395303326810175, "grad_norm": 4.527407169342041, "learning_rate": 7.343750000000001e-06, "loss": 0.1652, "step": 94 }, { "epoch": 0.18590998043052837, "grad_norm": 5.302050590515137, "learning_rate": 7.421875000000001e-06, "loss": 0.2483, "step": 95 }, { "epoch": 0.18590998043052837, "eval_loss": 0.1252526491880417, "eval_runtime": 107.5519, "eval_samples_per_second": 28.377, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.884272350180128, "eval_sts-test_pearson_dot": 0.8727334938335432, "eval_sts-test_pearson_euclidean": 0.9099441972021025, "eval_sts-test_pearson_manhattan": 0.9106991509833859, "eval_sts-test_pearson_max": 0.9106991509833859, "eval_sts-test_spearman_cosine": 0.9075948278738224, "eval_sts-test_spearman_dot": 0.87780624023116, "eval_sts-test_spearman_euclidean": 0.9060086194138042, "eval_sts-test_spearman_manhattan": 0.9069788267607697, "eval_sts-test_spearman_max": 0.9075948278738224, "step": 95 }, { "epoch": 0.18786692759295498, "grad_norm": 3.690441608428955, "learning_rate": 7.500000000000001e-06, "loss": 0.1623, "step": 96 }, { "epoch": 0.1898238747553816, "grad_norm": 4.585984706878662, "learning_rate": 7.578125e-06, "loss": 0.1955, "step": 97 }, { "epoch": 0.1917808219178082, "grad_norm": 4.493942737579346, "learning_rate": 7.656250000000001e-06, "loss": 0.2023, "step": 98 }, { "epoch": 0.19373776908023482, "grad_norm": 4.569936275482178, "learning_rate": 7.734375e-06, "loss": 0.1886, "step": 99 }, { "epoch": 0.19569471624266144, "grad_norm": 3.7703664302825928, "learning_rate": 7.8125e-06, "loss": 0.1284, "step": 100 }, { "epoch": 0.19569471624266144, "eval_loss": 0.12290485948324203, "eval_runtime": 107.6958, "eval_samples_per_second": 28.339, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8836376979322419, "eval_sts-test_pearson_dot": 0.8710695777275684, "eval_sts-test_pearson_euclidean": 0.9098265834859519, "eval_sts-test_pearson_manhattan": 0.9106248996071287, "eval_sts-test_pearson_max": 0.9106248996071287, "eval_sts-test_spearman_cosine": 0.9078868298544011, "eval_sts-test_spearman_dot": 0.8773200625274038, "eval_sts-test_spearman_euclidean": 0.9063156130669492, "eval_sts-test_spearman_manhattan": 0.9071474495136926, "eval_sts-test_spearman_max": 0.9078868298544011, "step": 100 }, { "epoch": 0.19765166340508805, "grad_norm": 4.356619358062744, "learning_rate": 7.890625e-06, "loss": 0.2005, "step": 101 }, { "epoch": 0.19960861056751467, "grad_norm": 4.293449878692627, "learning_rate": 7.96875e-06, "loss": 0.2301, "step": 102 }, { "epoch": 0.20156555772994128, "grad_norm": 4.654509544372559, "learning_rate": 8.046875e-06, "loss": 0.2249, "step": 103 }, { "epoch": 0.2035225048923679, "grad_norm": 4.510340213775635, "learning_rate": 8.125000000000001e-06, "loss": 0.214, "step": 104 }, { "epoch": 0.2054794520547945, "grad_norm": 3.880908489227295, "learning_rate": 8.203125000000001e-06, "loss": 0.1429, "step": 105 }, { "epoch": 0.2054794520547945, "eval_loss": 0.12076468020677567, "eval_runtime": 107.7074, "eval_samples_per_second": 28.336, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8828542959864998, "eval_sts-test_pearson_dot": 0.8689355363147886, "eval_sts-test_pearson_euclidean": 0.9096459762354197, "eval_sts-test_pearson_manhattan": 0.9104979967855148, "eval_sts-test_pearson_max": 0.9104979967855148, "eval_sts-test_spearman_cosine": 0.9076751563880199, "eval_sts-test_spearman_dot": 0.8750991469270715, "eval_sts-test_spearman_euclidean": 0.906379383614432, "eval_sts-test_spearman_manhattan": 0.9071111562407043, "eval_sts-test_spearman_max": 0.9076751563880199, "step": 105 }, { "epoch": 0.20743639921722112, "grad_norm": 3.8524463176727295, "learning_rate": 8.281250000000001e-06, "loss": 0.17, "step": 106 }, { "epoch": 0.20939334637964774, "grad_norm": 4.660905838012695, "learning_rate": 8.359375e-06, "loss": 0.1955, "step": 107 }, { "epoch": 0.21135029354207435, "grad_norm": 4.391407012939453, "learning_rate": 8.4375e-06, "loss": 0.1964, "step": 108 }, { "epoch": 0.21330724070450097, "grad_norm": 3.908740758895874, "learning_rate": 8.515625e-06, "loss": 0.1246, "step": 109 }, { "epoch": 0.21526418786692758, "grad_norm": 3.295600414276123, "learning_rate": 8.59375e-06, "loss": 0.1295, "step": 110 }, { "epoch": 0.21526418786692758, "eval_loss": 0.11901199817657471, "eval_runtime": 107.5373, "eval_samples_per_second": 28.381, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8820675142963768, "eval_sts-test_pearson_dot": 0.8664913359514981, "eval_sts-test_pearson_euclidean": 0.9093761405951237, "eval_sts-test_pearson_manhattan": 0.910248319457324, "eval_sts-test_pearson_max": 0.910248319457324, "eval_sts-test_spearman_cosine": 0.9071699146469111, "eval_sts-test_spearman_dot": 0.8726812810253556, "eval_sts-test_spearman_euclidean": 0.9064896954737618, "eval_sts-test_spearman_manhattan": 0.9068174537121922, "eval_sts-test_spearman_max": 0.9071699146469111, "step": 110 }, { "epoch": 0.2172211350293542, "grad_norm": 5.0308518409729, "learning_rate": 8.671875e-06, "loss": 0.2203, "step": 111 }, { "epoch": 0.2191780821917808, "grad_norm": 4.501624584197998, "learning_rate": 8.750000000000001e-06, "loss": 0.2195, "step": 112 }, { "epoch": 0.22113502935420742, "grad_norm": 4.200097560882568, "learning_rate": 8.828125000000001e-06, "loss": 0.1823, "step": 113 }, { "epoch": 0.22309197651663404, "grad_norm": 3.6750545501708984, "learning_rate": 8.906250000000001e-06, "loss": 0.174, "step": 114 }, { "epoch": 0.22504892367906065, "grad_norm": 4.105295181274414, "learning_rate": 8.984375000000002e-06, "loss": 0.207, "step": 115 }, { "epoch": 0.22504892367906065, "eval_loss": 0.11745984107255936, "eval_runtime": 107.5979, "eval_samples_per_second": 28.365, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.882042560326929, "eval_sts-test_pearson_dot": 0.8653067979173212, "eval_sts-test_pearson_euclidean": 0.9095832495385563, "eval_sts-test_pearson_manhattan": 0.9103602950988618, "eval_sts-test_pearson_max": 0.9103602950988618, "eval_sts-test_spearman_cosine": 0.9068824772949942, "eval_sts-test_spearman_dot": 0.8714208617482668, "eval_sts-test_spearman_euclidean": 0.906395180809703, "eval_sts-test_spearman_manhattan": 0.9068741088091138, "eval_sts-test_spearman_max": 0.9068824772949942, "step": 115 }, { "epoch": 0.22700587084148727, "grad_norm": 4.654273509979248, "learning_rate": 9.0625e-06, "loss": 0.2156, "step": 116 }, { "epoch": 0.22896281800391388, "grad_norm": 4.661588191986084, "learning_rate": 9.140625e-06, "loss": 0.2202, "step": 117 }, { "epoch": 0.2309197651663405, "grad_norm": 5.366416931152344, "learning_rate": 9.21875e-06, "loss": 0.2718, "step": 118 }, { "epoch": 0.2328767123287671, "grad_norm": 3.672802448272705, "learning_rate": 9.296875e-06, "loss": 0.1387, "step": 119 }, { "epoch": 0.23483365949119372, "grad_norm": 3.7878501415252686, "learning_rate": 9.375000000000001e-06, "loss": 0.1506, "step": 120 }, { "epoch": 0.23483365949119372, "eval_loss": 0.11679373681545258, "eval_runtime": 107.6687, "eval_samples_per_second": 28.346, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.882107468031623, "eval_sts-test_pearson_dot": 0.8647556765462645, "eval_sts-test_pearson_euclidean": 0.9099443435071429, "eval_sts-test_pearson_manhattan": 0.9105934104125866, "eval_sts-test_pearson_max": 0.9105934104125866, "eval_sts-test_spearman_cosine": 0.9068624287298908, "eval_sts-test_spearman_dot": 0.8710628964083971, "eval_sts-test_spearman_euclidean": 0.906624531024334, "eval_sts-test_spearman_manhattan": 0.9069254385059298, "eval_sts-test_spearman_max": 0.9069254385059298, "step": 120 }, { "epoch": 0.23679060665362034, "grad_norm": 3.4761197566986084, "learning_rate": 9.453125000000001e-06, "loss": 0.1185, "step": 121 }, { "epoch": 0.23874755381604695, "grad_norm": 3.9917871952056885, "learning_rate": 9.531250000000001e-06, "loss": 0.1681, "step": 122 }, { "epoch": 0.24070450097847357, "grad_norm": 4.491674423217773, "learning_rate": 9.609375000000001e-06, "loss": 0.2321, "step": 123 }, { "epoch": 0.24266144814090018, "grad_norm": 3.903496503829956, "learning_rate": 9.6875e-06, "loss": 0.1457, "step": 124 }, { "epoch": 0.2446183953033268, "grad_norm": 5.046339988708496, "learning_rate": 9.765625e-06, "loss": 0.2027, "step": 125 }, { "epoch": 0.2446183953033268, "eval_loss": 0.11647585779428482, "eval_runtime": 107.5396, "eval_samples_per_second": 28.38, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8824938293263067, "eval_sts-test_pearson_dot": 0.8653100788410637, "eval_sts-test_pearson_euclidean": 0.9104636052712812, "eval_sts-test_pearson_manhattan": 0.9109341151161342, "eval_sts-test_pearson_max": 0.9109341151161342, "eval_sts-test_spearman_cosine": 0.9070702535877924, "eval_sts-test_spearman_dot": 0.8716920543922986, "eval_sts-test_spearman_euclidean": 0.9070027239343528, "eval_sts-test_spearman_manhattan": 0.9073061822378479, "eval_sts-test_spearman_max": 0.9073061822378479, "step": 125 }, { "epoch": 0.2465753424657534, "grad_norm": 4.304446697235107, "learning_rate": 9.84375e-06, "loss": 0.1821, "step": 126 }, { "epoch": 0.24853228962818003, "grad_norm": 3.208357810974121, "learning_rate": 9.921875e-06, "loss": 0.1258, "step": 127 }, { "epoch": 0.25048923679060664, "grad_norm": 4.275379657745361, "learning_rate": 1e-05, "loss": 0.184, "step": 128 }, { "epoch": 0.25244618395303325, "grad_norm": 4.408608436584473, "learning_rate": 1.0078125000000001e-05, "loss": 0.2015, "step": 129 }, { "epoch": 0.25440313111545987, "grad_norm": 3.565253973007202, "learning_rate": 1.0156250000000001e-05, "loss": 0.1323, "step": 130 }, { "epoch": 0.25440313111545987, "eval_loss": 0.1154385656118393, "eval_runtime": 107.5442, "eval_samples_per_second": 28.379, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8820850631122565, "eval_sts-test_pearson_dot": 0.8648589750662984, "eval_sts-test_pearson_euclidean": 0.9105884442785888, "eval_sts-test_pearson_manhattan": 0.9109040210291837, "eval_sts-test_pearson_max": 0.9109040210291837, "eval_sts-test_spearman_cosine": 0.9074317095260507, "eval_sts-test_spearman_dot": 0.8710452196601474, "eval_sts-test_spearman_euclidean": 0.9070635408985837, "eval_sts-test_spearman_manhattan": 0.9074422260724778, "eval_sts-test_spearman_max": 0.9074422260724778, "step": 130 }, { "epoch": 0.2563600782778865, "grad_norm": 4.261953353881836, "learning_rate": 1.0234375000000001e-05, "loss": 0.1939, "step": 131 }, { "epoch": 0.2583170254403131, "grad_norm": 3.806480646133423, "learning_rate": 1.0312500000000002e-05, "loss": 0.1428, "step": 132 }, { "epoch": 0.2602739726027397, "grad_norm": 2.824733257293701, "learning_rate": 1.0390625e-05, "loss": 0.1063, "step": 133 }, { "epoch": 0.2622309197651663, "grad_norm": 4.076455116271973, "learning_rate": 1.046875e-05, "loss": 0.1602, "step": 134 }, { "epoch": 0.26418786692759294, "grad_norm": 3.7571659088134766, "learning_rate": 1.0546875e-05, "loss": 0.1814, "step": 135 }, { "epoch": 0.26418786692759294, "eval_loss": 0.11387230455875397, "eval_runtime": 107.5968, "eval_samples_per_second": 28.365, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8812889193869892, "eval_sts-test_pearson_dot": 0.8634898982579755, "eval_sts-test_pearson_euclidean": 0.9104977472627025, "eval_sts-test_pearson_manhattan": 0.9107178140804983, "eval_sts-test_pearson_max": 0.9107178140804983, "eval_sts-test_spearman_cosine": 0.9066986391131981, "eval_sts-test_spearman_dot": 0.870129116588204, "eval_sts-test_spearman_euclidean": 0.9070359293703052, "eval_sts-test_spearman_manhattan": 0.9073414909830857, "eval_sts-test_spearman_max": 0.9073414909830857, "step": 135 }, { "epoch": 0.26614481409001955, "grad_norm": 3.864948034286499, "learning_rate": 1.0625e-05, "loss": 0.1518, "step": 136 }, { "epoch": 0.26810176125244617, "grad_norm": 3.5900001525878906, "learning_rate": 1.0703125000000001e-05, "loss": 0.1379, "step": 137 }, { "epoch": 0.2700587084148728, "grad_norm": 4.291954517364502, "learning_rate": 1.0781250000000001e-05, "loss": 0.1708, "step": 138 }, { "epoch": 0.2720156555772994, "grad_norm": 3.8340342044830322, "learning_rate": 1.0859375000000001e-05, "loss": 0.2046, "step": 139 }, { "epoch": 0.273972602739726, "grad_norm": 3.749396562576294, "learning_rate": 1.0937500000000002e-05, "loss": 0.1259, "step": 140 }, { "epoch": 0.273972602739726, "eval_loss": 0.1124362125992775, "eval_runtime": 107.5142, "eval_samples_per_second": 28.387, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8805714116282963, "eval_sts-test_pearson_dot": 0.8618911680351633, "eval_sts-test_pearson_euclidean": 0.9102979980912764, "eval_sts-test_pearson_manhattan": 0.9105232760600299, "eval_sts-test_pearson_max": 0.9105232760600299, "eval_sts-test_spearman_cosine": 0.9063180743863257, "eval_sts-test_spearman_dot": 0.8687826406354595, "eval_sts-test_spearman_euclidean": 0.9070556199253175, "eval_sts-test_spearman_manhattan": 0.9073570196707885, "eval_sts-test_spearman_max": 0.9073570196707885, "step": 140 }, { "epoch": 0.2759295499021526, "grad_norm": 2.8815276622772217, "learning_rate": 1.1015625e-05, "loss": 0.1181, "step": 141 }, { "epoch": 0.27788649706457924, "grad_norm": 3.766554355621338, "learning_rate": 1.109375e-05, "loss": 0.2144, "step": 142 }, { "epoch": 0.27984344422700586, "grad_norm": 4.289268493652344, "learning_rate": 1.1171875e-05, "loss": 0.1822, "step": 143 }, { "epoch": 0.28180039138943247, "grad_norm": 3.9036617279052734, "learning_rate": 1.125e-05, "loss": 0.1667, "step": 144 }, { "epoch": 0.2837573385518591, "grad_norm": 3.321366786956787, "learning_rate": 1.1328125000000001e-05, "loss": 0.0779, "step": 145 }, { "epoch": 0.2837573385518591, "eval_loss": 0.1118142157793045, "eval_runtime": 107.3173, "eval_samples_per_second": 28.439, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8796044904115364, "eval_sts-test_pearson_dot": 0.8607678603166254, "eval_sts-test_pearson_euclidean": 0.9097479995877322, "eval_sts-test_pearson_manhattan": 0.9098650580518599, "eval_sts-test_pearson_max": 0.9098650580518599, "eval_sts-test_spearman_cosine": 0.9059690592987342, "eval_sts-test_spearman_dot": 0.8685229490656053, "eval_sts-test_spearman_euclidean": 0.90680836920613, "eval_sts-test_spearman_manhattan": 0.9069437865231001, "eval_sts-test_spearman_max": 0.9069437865231001, "step": 145 }, { "epoch": 0.2857142857142857, "grad_norm": 3.460301160812378, "learning_rate": 1.1406250000000001e-05, "loss": 0.147, "step": 146 }, { "epoch": 0.2876712328767123, "grad_norm": 3.8999266624450684, "learning_rate": 1.1484375000000001e-05, "loss": 0.1913, "step": 147 }, { "epoch": 0.2896281800391389, "grad_norm": 3.539788007736206, "learning_rate": 1.1562500000000002e-05, "loss": 0.1357, "step": 148 }, { "epoch": 0.29158512720156554, "grad_norm": 3.499439001083374, "learning_rate": 1.1640625000000002e-05, "loss": 0.1128, "step": 149 }, { "epoch": 0.29354207436399216, "grad_norm": 3.2960240840911865, "learning_rate": 1.171875e-05, "loss": 0.0996, "step": 150 }, { "epoch": 0.29354207436399216, "eval_loss": 0.11132737249135971, "eval_runtime": 107.5867, "eval_samples_per_second": 28.368, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8787852416493207, "eval_sts-test_pearson_dot": 0.8593025559452621, "eval_sts-test_pearson_euclidean": 0.9091617970047303, "eval_sts-test_pearson_manhattan": 0.9091664157178929, "eval_sts-test_pearson_max": 0.9091664157178929, "eval_sts-test_spearman_cosine": 0.9054375485671886, "eval_sts-test_spearman_dot": 0.867029912731804, "eval_sts-test_spearman_euclidean": 0.9062253050214613, "eval_sts-test_spearman_manhattan": 0.9062610165280517, "eval_sts-test_spearman_max": 0.9062610165280517, "step": 150 }, { "epoch": 0.29549902152641877, "grad_norm": 4.271719932556152, "learning_rate": 1.1796875e-05, "loss": 0.1956, "step": 151 }, { "epoch": 0.2974559686888454, "grad_norm": 3.168663501739502, "learning_rate": 1.1875e-05, "loss": 0.0942, "step": 152 }, { "epoch": 0.299412915851272, "grad_norm": 3.816993236541748, "learning_rate": 1.1953125000000001e-05, "loss": 0.1406, "step": 153 }, { "epoch": 0.3013698630136986, "grad_norm": 5.383023738861084, "learning_rate": 1.2031250000000001e-05, "loss": 0.2868, "step": 154 }, { "epoch": 0.30332681017612523, "grad_norm": 3.123462677001953, "learning_rate": 1.2109375000000001e-05, "loss": 0.1102, "step": 155 }, { "epoch": 0.30332681017612523, "eval_loss": 0.11142811924219131, "eval_runtime": 107.3019, "eval_samples_per_second": 28.443, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8780761726881443, "eval_sts-test_pearson_dot": 0.8581767032057357, "eval_sts-test_pearson_euclidean": 0.9081534036571242, "eval_sts-test_pearson_manhattan": 0.9081724370385316, "eval_sts-test_pearson_max": 0.9081724370385316, "eval_sts-test_spearman_cosine": 0.9048428490545583, "eval_sts-test_spearman_dot": 0.8670075818523697, "eval_sts-test_spearman_euclidean": 0.9052714766361651, "eval_sts-test_spearman_manhattan": 0.9054467225757737, "eval_sts-test_spearman_max": 0.9054467225757737, "step": 155 }, { "epoch": 0.30528375733855184, "grad_norm": 4.1034979820251465, "learning_rate": 1.2187500000000001e-05, "loss": 0.1659, "step": 156 }, { "epoch": 0.30724070450097846, "grad_norm": 3.60249400138855, "learning_rate": 1.2265625000000002e-05, "loss": 0.1645, "step": 157 }, { "epoch": 0.30919765166340507, "grad_norm": 3.771853446960449, "learning_rate": 1.234375e-05, "loss": 0.151, "step": 158 }, { "epoch": 0.3111545988258317, "grad_norm": 4.291686058044434, "learning_rate": 1.2421875e-05, "loss": 0.158, "step": 159 }, { "epoch": 0.3131115459882583, "grad_norm": 5.1689453125, "learning_rate": 1.25e-05, "loss": 0.2323, "step": 160 }, { "epoch": 0.3131115459882583, "eval_loss": 0.11126424372196198, "eval_runtime": 107.301, "eval_samples_per_second": 28.443, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8777597983330929, "eval_sts-test_pearson_dot": 0.8577739588604719, "eval_sts-test_pearson_euclidean": 0.9075483317216817, "eval_sts-test_pearson_manhattan": 0.9075908461381532, "eval_sts-test_pearson_max": 0.9075908461381532, "eval_sts-test_spearman_cosine": 0.9047649818597372, "eval_sts-test_spearman_dot": 0.867389712873391, "eval_sts-test_spearman_euclidean": 0.9048189966322366, "eval_sts-test_spearman_manhattan": 0.9049692713679889, "eval_sts-test_spearman_max": 0.9049692713679889, "step": 160 }, { "epoch": 0.3150684931506849, "grad_norm": 3.304703712463379, "learning_rate": 1.2578125e-05, "loss": 0.1157, "step": 161 }, { "epoch": 0.31702544031311153, "grad_norm": 4.064731121063232, "learning_rate": 1.2656250000000001e-05, "loss": 0.1507, "step": 162 }, { "epoch": 0.31898238747553814, "grad_norm": 4.615545749664307, "learning_rate": 1.2734375000000001e-05, "loss": 0.1879, "step": 163 }, { "epoch": 0.32093933463796476, "grad_norm": 3.767533540725708, "learning_rate": 1.2812500000000001e-05, "loss": 0.143, "step": 164 }, { "epoch": 0.32289628180039137, "grad_norm": 4.727967262268066, "learning_rate": 1.2890625000000002e-05, "loss": 0.2227, "step": 165 }, { "epoch": 0.32289628180039137, "eval_loss": 0.11155427247285843, "eval_runtime": 107.2898, "eval_samples_per_second": 28.446, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8775899700998113, "eval_sts-test_pearson_dot": 0.8571711542435376, "eval_sts-test_pearson_euclidean": 0.907399950708088, "eval_sts-test_pearson_manhattan": 0.9073879045697356, "eval_sts-test_pearson_max": 0.907399950708088, "eval_sts-test_spearman_cosine": 0.9049959431197784, "eval_sts-test_spearman_dot": 0.8667648957618442, "eval_sts-test_spearman_euclidean": 0.9048916279294749, "eval_sts-test_spearman_manhattan": 0.9050786882020909, "eval_sts-test_spearman_max": 0.9050786882020909, "step": 165 }, { "epoch": 0.324853228962818, "grad_norm": 4.0150017738342285, "learning_rate": 1.2968750000000002e-05, "loss": 0.1624, "step": 166 }, { "epoch": 0.3268101761252446, "grad_norm": 3.021153450012207, "learning_rate": 1.3046875e-05, "loss": 0.1345, "step": 167 }, { "epoch": 0.3287671232876712, "grad_norm": 3.869710922241211, "learning_rate": 1.3125e-05, "loss": 0.1765, "step": 168 }, { "epoch": 0.33072407045009783, "grad_norm": 3.538076162338257, "learning_rate": 1.3203125e-05, "loss": 0.1368, "step": 169 }, { "epoch": 0.33268101761252444, "grad_norm": 3.378551483154297, "learning_rate": 1.3281250000000001e-05, "loss": 0.0962, "step": 170 }, { "epoch": 0.33268101761252444, "eval_loss": 0.11131894588470459, "eval_runtime": 107.3532, "eval_samples_per_second": 28.43, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8782576778514848, "eval_sts-test_pearson_dot": 0.8576530243239538, "eval_sts-test_pearson_euclidean": 0.9077401564122008, "eval_sts-test_pearson_manhattan": 0.907609849534313, "eval_sts-test_pearson_max": 0.9077401564122008, "eval_sts-test_spearman_cosine": 0.9055560946586144, "eval_sts-test_spearman_dot": 0.8666707838591381, "eval_sts-test_spearman_euclidean": 0.9054064016892602, "eval_sts-test_spearman_manhattan": 0.9054834186101147, "eval_sts-test_spearman_max": 0.9055560946586144, "step": 170 }, { "epoch": 0.33463796477495106, "grad_norm": 4.588249683380127, "learning_rate": 1.3359375000000001e-05, "loss": 0.1783, "step": 171 }, { "epoch": 0.33659491193737767, "grad_norm": 4.370199680328369, "learning_rate": 1.3437500000000001e-05, "loss": 0.2019, "step": 172 }, { "epoch": 0.3385518590998043, "grad_norm": 4.000157356262207, "learning_rate": 1.3515625000000002e-05, "loss": 0.1761, "step": 173 }, { "epoch": 0.3405088062622309, "grad_norm": 4.3335862159729, "learning_rate": 1.3593750000000002e-05, "loss": 0.1855, "step": 174 }, { "epoch": 0.3424657534246575, "grad_norm": 4.247244358062744, "learning_rate": 1.3671875e-05, "loss": 0.1922, "step": 175 }, { "epoch": 0.3424657534246575, "eval_loss": 0.1105586364865303, "eval_runtime": 107.3507, "eval_samples_per_second": 28.43, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8775475016000731, "eval_sts-test_pearson_dot": 0.8543732981082479, "eval_sts-test_pearson_euclidean": 0.9076643456809551, "eval_sts-test_pearson_manhattan": 0.9075054089199206, "eval_sts-test_pearson_max": 0.9076643456809551, "eval_sts-test_spearman_cosine": 0.905357578063082, "eval_sts-test_spearman_dot": 0.8628476388472094, "eval_sts-test_spearman_euclidean": 0.9054710672619708, "eval_sts-test_spearman_manhattan": 0.9055309444497123, "eval_sts-test_spearman_max": 0.9055309444497123, "step": 175 }, { "epoch": 0.34442270058708413, "grad_norm": 3.881108522415161, "learning_rate": 1.375e-05, "loss": 0.1538, "step": 176 }, { "epoch": 0.34637964774951074, "grad_norm": 3.4271416664123535, "learning_rate": 1.3828125e-05, "loss": 0.1049, "step": 177 }, { "epoch": 0.34833659491193736, "grad_norm": 3.7847940921783447, "learning_rate": 1.3906250000000001e-05, "loss": 0.1619, "step": 178 }, { "epoch": 0.350293542074364, "grad_norm": 2.3725311756134033, "learning_rate": 1.3984375000000001e-05, "loss": 0.0731, "step": 179 }, { "epoch": 0.3522504892367906, "grad_norm": 3.6820032596588135, "learning_rate": 1.4062500000000001e-05, "loss": 0.1205, "step": 180 }, { "epoch": 0.3522504892367906, "eval_loss": 0.10974939167499542, "eval_runtime": 107.353, "eval_samples_per_second": 28.43, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8782123578217031, "eval_sts-test_pearson_dot": 0.852106566478191, "eval_sts-test_pearson_euclidean": 0.9088860377565003, "eval_sts-test_pearson_manhattan": 0.9087269620613702, "eval_sts-test_pearson_max": 0.9088860377565003, "eval_sts-test_spearman_cosine": 0.9058966517578029, "eval_sts-test_spearman_dot": 0.8595467858069799, "eval_sts-test_spearman_euclidean": 0.9064047128283795, "eval_sts-test_spearman_manhattan": 0.9067846510375924, "eval_sts-test_spearman_max": 0.9067846510375924, "step": 180 }, { "epoch": 0.3542074363992172, "grad_norm": 3.7714688777923584, "learning_rate": 1.4140625000000002e-05, "loss": 0.169, "step": 181 }, { "epoch": 0.3561643835616438, "grad_norm": 3.7113559246063232, "learning_rate": 1.4218750000000002e-05, "loss": 0.1688, "step": 182 }, { "epoch": 0.35812133072407043, "grad_norm": 3.1639597415924072, "learning_rate": 1.4296875000000002e-05, "loss": 0.1274, "step": 183 }, { "epoch": 0.36007827788649704, "grad_norm": 4.144288539886475, "learning_rate": 1.4375e-05, "loss": 0.1477, "step": 184 }, { "epoch": 0.36203522504892366, "grad_norm": 3.4342098236083984, "learning_rate": 1.4453125e-05, "loss": 0.1418, "step": 185 }, { "epoch": 0.36203522504892366, "eval_loss": 0.10942607372999191, "eval_runtime": 107.2679, "eval_samples_per_second": 28.452, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8778855142398189, "eval_sts-test_pearson_dot": 0.8501658695420333, "eval_sts-test_pearson_euclidean": 0.9088432870055996, "eval_sts-test_pearson_manhattan": 0.9086435133118579, "eval_sts-test_pearson_max": 0.9088432870055996, "eval_sts-test_spearman_cosine": 0.9055185931015683, "eval_sts-test_spearman_dot": 0.8575025481866207, "eval_sts-test_spearman_euclidean": 0.9063994321795352, "eval_sts-test_spearman_manhattan": 0.9064969899293684, "eval_sts-test_spearman_max": 0.9064969899293684, "step": 185 }, { "epoch": 0.3639921722113503, "grad_norm": 4.744626045227051, "learning_rate": 1.453125e-05, "loss": 0.2477, "step": 186 }, { "epoch": 0.3659491193737769, "grad_norm": 4.062248229980469, "learning_rate": 1.4609375000000001e-05, "loss": 0.1713, "step": 187 }, { "epoch": 0.3679060665362035, "grad_norm": 3.989694833755493, "learning_rate": 1.4687500000000001e-05, "loss": 0.1703, "step": 188 }, { "epoch": 0.3698630136986301, "grad_norm": 3.3543660640716553, "learning_rate": 1.4765625000000001e-05, "loss": 0.1176, "step": 189 }, { "epoch": 0.37181996086105673, "grad_norm": 4.307045936584473, "learning_rate": 1.4843750000000002e-05, "loss": 0.1811, "step": 190 }, { "epoch": 0.37181996086105673, "eval_loss": 0.10837770998477936, "eval_runtime": 107.3429, "eval_samples_per_second": 28.432, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8774103555789884, "eval_sts-test_pearson_dot": 0.84942827650618, "eval_sts-test_pearson_euclidean": 0.9086430009253119, "eval_sts-test_pearson_manhattan": 0.9084642534632353, "eval_sts-test_pearson_max": 0.9086430009253119, "eval_sts-test_spearman_cosine": 0.9048482639571866, "eval_sts-test_spearman_dot": 0.8562155914115267, "eval_sts-test_spearman_euclidean": 0.9060070531196555, "eval_sts-test_spearman_manhattan": 0.9061608184537963, "eval_sts-test_spearman_max": 0.9061608184537963, "step": 190 }, { "epoch": 0.37377690802348335, "grad_norm": 4.140930652618408, "learning_rate": 1.4921875000000002e-05, "loss": 0.162, "step": 191 }, { "epoch": 0.37573385518590996, "grad_norm": 2.7555642127990723, "learning_rate": 1.5000000000000002e-05, "loss": 0.1141, "step": 192 }, { "epoch": 0.3776908023483366, "grad_norm": 4.070343017578125, "learning_rate": 1.5078125e-05, "loss": 0.154, "step": 193 }, { "epoch": 0.3796477495107632, "grad_norm": 4.453440189361572, "learning_rate": 1.515625e-05, "loss": 0.2461, "step": 194 }, { "epoch": 0.3816046966731898, "grad_norm": 3.7656772136688232, "learning_rate": 1.5234375000000001e-05, "loss": 0.1573, "step": 195 }, { "epoch": 0.3816046966731898, "eval_loss": 0.10762027651071548, "eval_runtime": 107.299, "eval_samples_per_second": 28.444, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8779461080888007, "eval_sts-test_pearson_dot": 0.8521074278329072, "eval_sts-test_pearson_euclidean": 0.9087045359990432, "eval_sts-test_pearson_manhattan": 0.9086340705654771, "eval_sts-test_pearson_max": 0.9087045359990432, "eval_sts-test_spearman_cosine": 0.9045706718827756, "eval_sts-test_spearman_dot": 0.8584340456924826, "eval_sts-test_spearman_euclidean": 0.9055143864829975, "eval_sts-test_spearman_manhattan": 0.9058283613329196, "eval_sts-test_spearman_max": 0.9058283613329196, "step": 195 }, { "epoch": 0.3835616438356164, "grad_norm": 3.063400983810425, "learning_rate": 1.5312500000000003e-05, "loss": 0.1197, "step": 196 }, { "epoch": 0.38551859099804303, "grad_norm": 3.893153429031372, "learning_rate": 1.5390625e-05, "loss": 0.1395, "step": 197 }, { "epoch": 0.38747553816046965, "grad_norm": 2.95540714263916, "learning_rate": 1.546875e-05, "loss": 0.0847, "step": 198 }, { "epoch": 0.38943248532289626, "grad_norm": 3.4665300846099854, "learning_rate": 1.5546875e-05, "loss": 0.1848, "step": 199 }, { "epoch": 0.3913894324853229, "grad_norm": 3.6926543712615967, "learning_rate": 1.5625e-05, "loss": 0.1377, "step": 200 }, { "epoch": 0.3913894324853229, "eval_loss": 0.10723523795604706, "eval_runtime": 107.245, "eval_samples_per_second": 28.458, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.877994665901344, "eval_sts-test_pearson_dot": 0.854134605280733, "eval_sts-test_pearson_euclidean": 0.9085191117850383, "eval_sts-test_pearson_manhattan": 0.9086424100414001, "eval_sts-test_pearson_max": 0.9086424100414001, "eval_sts-test_spearman_cosine": 0.904685279863199, "eval_sts-test_spearman_dot": 0.8598855528557127, "eval_sts-test_spearman_euclidean": 0.9052407772708506, "eval_sts-test_spearman_manhattan": 0.9058868959828196, "eval_sts-test_spearman_max": 0.9058868959828196, "step": 200 }, { "epoch": 0.3933463796477495, "grad_norm": 3.303112268447876, "learning_rate": 1.5703125e-05, "loss": 0.1109, "step": 201 }, { "epoch": 0.3953033268101761, "grad_norm": 3.4490058422088623, "learning_rate": 1.578125e-05, "loss": 0.1051, "step": 202 }, { "epoch": 0.3972602739726027, "grad_norm": 2.6598286628723145, "learning_rate": 1.5859375e-05, "loss": 0.0975, "step": 203 }, { "epoch": 0.39921722113502933, "grad_norm": 3.373512029647827, "learning_rate": 1.59375e-05, "loss": 0.127, "step": 204 }, { "epoch": 0.40117416829745595, "grad_norm": 3.1471354961395264, "learning_rate": 1.6015625e-05, "loss": 0.1297, "step": 205 }, { "epoch": 0.40117416829745595, "eval_loss": 0.10685314983129501, "eval_runtime": 107.3321, "eval_samples_per_second": 28.435, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8785914848590666, "eval_sts-test_pearson_dot": 0.8570818659891223, "eval_sts-test_pearson_euclidean": 0.9086611488562145, "eval_sts-test_pearson_manhattan": 0.9087606701935215, "eval_sts-test_pearson_max": 0.9087606701935215, "eval_sts-test_spearman_cosine": 0.9048987433800361, "eval_sts-test_spearman_dot": 0.8616398023022556, "eval_sts-test_spearman_euclidean": 0.9052247563192726, "eval_sts-test_spearman_manhattan": 0.9056138237858093, "eval_sts-test_spearman_max": 0.9056138237858093, "step": 205 }, { "epoch": 0.40313111545988256, "grad_norm": 2.6924684047698975, "learning_rate": 1.609375e-05, "loss": 0.0783, "step": 206 }, { "epoch": 0.4050880626223092, "grad_norm": 2.1100542545318604, "learning_rate": 1.6171875000000002e-05, "loss": 0.053, "step": 207 }, { "epoch": 0.4070450097847358, "grad_norm": 3.7984156608581543, "learning_rate": 1.6250000000000002e-05, "loss": 0.1916, "step": 208 }, { "epoch": 0.4090019569471624, "grad_norm": 4.329834461212158, "learning_rate": 1.6328125000000002e-05, "loss": 0.178, "step": 209 }, { "epoch": 0.410958904109589, "grad_norm": 4.427723407745361, "learning_rate": 1.6406250000000002e-05, "loss": 0.2343, "step": 210 }, { "epoch": 0.410958904109589, "eval_loss": 0.10670512914657593, "eval_runtime": 107.2313, "eval_samples_per_second": 28.462, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8788965355860006, "eval_sts-test_pearson_dot": 0.8580075676260999, "eval_sts-test_pearson_euclidean": 0.908776492246521, "eval_sts-test_pearson_manhattan": 0.9089340980301853, "eval_sts-test_pearson_max": 0.9089340980301853, "eval_sts-test_spearman_cosine": 0.90530862018312, "eval_sts-test_spearman_dot": 0.8630207814775328, "eval_sts-test_spearman_euclidean": 0.905449362900196, "eval_sts-test_spearman_manhattan": 0.9056519071092534, "eval_sts-test_spearman_max": 0.9056519071092534, "step": 210 }, { "epoch": 0.41291585127201563, "grad_norm": 3.890899419784546, "learning_rate": 1.6484375000000003e-05, "loss": 0.1816, "step": 211 }, { "epoch": 0.41487279843444225, "grad_norm": 4.071934700012207, "learning_rate": 1.6562500000000003e-05, "loss": 0.2522, "step": 212 }, { "epoch": 0.41682974559686886, "grad_norm": 3.8046796321868896, "learning_rate": 1.6640625000000003e-05, "loss": 0.1787, "step": 213 }, { "epoch": 0.4187866927592955, "grad_norm": 3.357276201248169, "learning_rate": 1.671875e-05, "loss": 0.1913, "step": 214 }, { "epoch": 0.4207436399217221, "grad_norm": 3.8679873943328857, "learning_rate": 1.6796875e-05, "loss": 0.175, "step": 215 }, { "epoch": 0.4207436399217221, "eval_loss": 0.10552908480167389, "eval_runtime": 107.6412, "eval_samples_per_second": 28.353, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8791676817178924, "eval_sts-test_pearson_dot": 0.8573342496118925, "eval_sts-test_pearson_euclidean": 0.909475190469058, "eval_sts-test_pearson_manhattan": 0.9097533727394405, "eval_sts-test_pearson_max": 0.9097533727394405, "eval_sts-test_spearman_cosine": 0.9056468502167161, "eval_sts-test_spearman_dot": 0.8624976392318674, "eval_sts-test_spearman_euclidean": 0.9066117769148375, "eval_sts-test_spearman_manhattan": 0.9069566301351195, "eval_sts-test_spearman_max": 0.9069566301351195, "step": 215 }, { "epoch": 0.4227005870841487, "grad_norm": 3.436488389968872, "learning_rate": 1.6875e-05, "loss": 0.1533, "step": 216 }, { "epoch": 0.4246575342465753, "grad_norm": 3.891040563583374, "learning_rate": 1.6953125e-05, "loss": 0.1819, "step": 217 }, { "epoch": 0.42661448140900193, "grad_norm": 4.554884910583496, "learning_rate": 1.703125e-05, "loss": 0.2541, "step": 218 }, { "epoch": 0.42857142857142855, "grad_norm": 3.4431850910186768, "learning_rate": 1.7109375e-05, "loss": 0.1103, "step": 219 }, { "epoch": 0.43052837573385516, "grad_norm": 3.5396361351013184, "learning_rate": 1.71875e-05, "loss": 0.1693, "step": 220 }, { "epoch": 0.43052837573385516, "eval_loss": 0.10396925359964371, "eval_runtime": 107.33, "eval_samples_per_second": 28.436, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8794186180626897, "eval_sts-test_pearson_dot": 0.8555325369075935, "eval_sts-test_pearson_euclidean": 0.9099071011406157, "eval_sts-test_pearson_manhattan": 0.9104095617945829, "eval_sts-test_pearson_max": 0.9104095617945829, "eval_sts-test_spearman_cosine": 0.9061536582519738, "eval_sts-test_spearman_dot": 0.8609769018672648, "eval_sts-test_spearman_euclidean": 0.9068523149448162, "eval_sts-test_spearman_manhattan": 0.9075606826613808, "eval_sts-test_spearman_max": 0.9075606826613808, "step": 220 }, { "epoch": 0.4324853228962818, "grad_norm": 3.4416589736938477, "learning_rate": 1.7265625e-05, "loss": 0.1233, "step": 221 }, { "epoch": 0.4344422700587084, "grad_norm": 2.9554316997528076, "learning_rate": 1.734375e-05, "loss": 0.0922, "step": 222 }, { "epoch": 0.436399217221135, "grad_norm": 3.1570141315460205, "learning_rate": 1.7421875e-05, "loss": 0.1243, "step": 223 }, { "epoch": 0.4383561643835616, "grad_norm": 3.8479344844818115, "learning_rate": 1.7500000000000002e-05, "loss": 0.1613, "step": 224 }, { "epoch": 0.44031311154598823, "grad_norm": 3.004990339279175, "learning_rate": 1.7578125000000002e-05, "loss": 0.1188, "step": 225 }, { "epoch": 0.44031311154598823, "eval_loss": 0.1029738187789917, "eval_runtime": 107.2661, "eval_samples_per_second": 28.453, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8796765477862789, "eval_sts-test_pearson_dot": 0.8576485629522204, "eval_sts-test_pearson_euclidean": 0.9098263075403831, "eval_sts-test_pearson_manhattan": 0.9104321398639006, "eval_sts-test_pearson_max": 0.9104321398639006, "eval_sts-test_spearman_cosine": 0.9064603386462892, "eval_sts-test_spearman_dot": 0.8635142088856343, "eval_sts-test_spearman_euclidean": 0.9066103896257344, "eval_sts-test_spearman_manhattan": 0.9076328216947436, "eval_sts-test_spearman_max": 0.9076328216947436, "step": 225 }, { "epoch": 0.44227005870841485, "grad_norm": 3.595667839050293, "learning_rate": 1.7656250000000002e-05, "loss": 0.196, "step": 226 }, { "epoch": 0.44422700587084146, "grad_norm": 3.9599428176879883, "learning_rate": 1.7734375000000002e-05, "loss": 0.2254, "step": 227 }, { "epoch": 0.4461839530332681, "grad_norm": 3.2490875720977783, "learning_rate": 1.7812500000000003e-05, "loss": 0.1162, "step": 228 }, { "epoch": 0.4481409001956947, "grad_norm": 4.811342239379883, "learning_rate": 1.7890625000000003e-05, "loss": 0.2579, "step": 229 }, { "epoch": 0.4500978473581213, "grad_norm": 2.993255138397217, "learning_rate": 1.7968750000000003e-05, "loss": 0.1203, "step": 230 }, { "epoch": 0.4500978473581213, "eval_loss": 0.102933868765831, "eval_runtime": 107.2515, "eval_samples_per_second": 28.456, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8799758353085696, "eval_sts-test_pearson_dot": 0.8592997081846103, "eval_sts-test_pearson_euclidean": 0.9101945793558552, "eval_sts-test_pearson_manhattan": 0.9106837055219174, "eval_sts-test_pearson_max": 0.9106837055219174, "eval_sts-test_spearman_cosine": 0.9071432428951217, "eval_sts-test_spearman_dot": 0.865314059867535, "eval_sts-test_spearman_euclidean": 0.9072587906520344, "eval_sts-test_spearman_manhattan": 0.9077949555147645, "eval_sts-test_spearman_max": 0.9077949555147645, "step": 230 }, { "epoch": 0.4520547945205479, "grad_norm": 3.654191017150879, "learning_rate": 1.8046875e-05, "loss": 0.1654, "step": 231 }, { "epoch": 0.45401174168297453, "grad_norm": 3.429565668106079, "learning_rate": 1.8125e-05, "loss": 0.1808, "step": 232 }, { "epoch": 0.45596868884540115, "grad_norm": 3.5679566860198975, "learning_rate": 1.8203125e-05, "loss": 0.1397, "step": 233 }, { "epoch": 0.45792563600782776, "grad_norm": 3.9862124919891357, "learning_rate": 1.828125e-05, "loss": 0.2177, "step": 234 }, { "epoch": 0.4598825831702544, "grad_norm": 3.536984443664551, "learning_rate": 1.8359375e-05, "loss": 0.162, "step": 235 }, { "epoch": 0.4598825831702544, "eval_loss": 0.10404225438833237, "eval_runtime": 107.254, "eval_samples_per_second": 28.456, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8802088610554777, "eval_sts-test_pearson_dot": 0.8618209119350905, "eval_sts-test_pearson_euclidean": 0.9103461475031536, "eval_sts-test_pearson_manhattan": 0.9106782364335553, "eval_sts-test_pearson_max": 0.9106782364335553, "eval_sts-test_spearman_cosine": 0.9077748174471387, "eval_sts-test_spearman_dot": 0.8686349167216066, "eval_sts-test_spearman_euclidean": 0.907571109705285, "eval_sts-test_spearman_manhattan": 0.9080472631264893, "eval_sts-test_spearman_max": 0.9080472631264893, "step": 235 }, { "epoch": 0.461839530332681, "grad_norm": 3.2987570762634277, "learning_rate": 1.84375e-05, "loss": 0.177, "step": 236 }, { "epoch": 0.4637964774951076, "grad_norm": 1.792919397354126, "learning_rate": 1.8515625e-05, "loss": 0.0556, "step": 237 }, { "epoch": 0.4657534246575342, "grad_norm": 3.8270483016967773, "learning_rate": 1.859375e-05, "loss": 0.2285, "step": 238 }, { "epoch": 0.46771037181996084, "grad_norm": 3.2458577156066895, "learning_rate": 1.8671875e-05, "loss": 0.1657, "step": 239 }, { "epoch": 0.46966731898238745, "grad_norm": 4.352839469909668, "learning_rate": 1.8750000000000002e-05, "loss": 0.2555, "step": 240 }, { "epoch": 0.46966731898238745, "eval_loss": 0.10528620332479477, "eval_runtime": 107.3201, "eval_samples_per_second": 28.438, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8794205585889476, "eval_sts-test_pearson_dot": 0.8616236846471828, "eval_sts-test_pearson_euclidean": 0.9100171674371834, "eval_sts-test_pearson_manhattan": 0.9102120642982687, "eval_sts-test_pearson_max": 0.9102120642982687, "eval_sts-test_spearman_cosine": 0.9076779309662261, "eval_sts-test_spearman_dot": 0.8702396969551023, "eval_sts-test_spearman_euclidean": 0.9078436896384199, "eval_sts-test_spearman_manhattan": 0.9080407741935878, "eval_sts-test_spearman_max": 0.9080407741935878, "step": 240 }, { "epoch": 0.47162426614481406, "grad_norm": 3.644327163696289, "learning_rate": 1.8828125000000002e-05, "loss": 0.1606, "step": 241 }, { "epoch": 0.4735812133072407, "grad_norm": 3.0316474437713623, "learning_rate": 1.8906250000000002e-05, "loss": 0.1257, "step": 242 }, { "epoch": 0.4755381604696673, "grad_norm": 3.8527326583862305, "learning_rate": 1.8984375000000002e-05, "loss": 0.1898, "step": 243 }, { "epoch": 0.4774951076320939, "grad_norm": 3.91603422164917, "learning_rate": 1.9062500000000003e-05, "loss": 0.1621, "step": 244 }, { "epoch": 0.4794520547945205, "grad_norm": 3.6845171451568604, "learning_rate": 1.9140625000000003e-05, "loss": 0.1606, "step": 245 }, { "epoch": 0.4794520547945205, "eval_loss": 0.10541080683469772, "eval_runtime": 107.3443, "eval_samples_per_second": 28.432, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8782579310286232, "eval_sts-test_pearson_dot": 0.8596847230641689, "eval_sts-test_pearson_euclidean": 0.909741577402618, "eval_sts-test_pearson_manhattan": 0.9098438643121189, "eval_sts-test_pearson_max": 0.9098438643121189, "eval_sts-test_spearman_cosine": 0.9078928712746891, "eval_sts-test_spearman_dot": 0.8682800392187727, "eval_sts-test_spearman_euclidean": 0.9083291960732551, "eval_sts-test_spearman_manhattan": 0.908423397478484, "eval_sts-test_spearman_max": 0.908423397478484, "step": 245 }, { "epoch": 0.48140900195694714, "grad_norm": 3.31758451461792, "learning_rate": 1.9218750000000003e-05, "loss": 0.0983, "step": 246 }, { "epoch": 0.48336594911937375, "grad_norm": 3.8613622188568115, "learning_rate": 1.9296875000000003e-05, "loss": 0.2028, "step": 247 }, { "epoch": 0.48532289628180036, "grad_norm": 2.792924165725708, "learning_rate": 1.9375e-05, "loss": 0.0997, "step": 248 }, { "epoch": 0.487279843444227, "grad_norm": 3.4162261486053467, "learning_rate": 1.9453125e-05, "loss": 0.1582, "step": 249 }, { "epoch": 0.4892367906066536, "grad_norm": 4.499621391296387, "learning_rate": 1.953125e-05, "loss": 0.2394, "step": 250 }, { "epoch": 0.4892367906066536, "eval_loss": 0.10517927259206772, "eval_runtime": 107.2761, "eval_samples_per_second": 28.45, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8775291091187776, "eval_sts-test_pearson_dot": 0.8591957018286404, "eval_sts-test_pearson_euclidean": 0.9092406666480166, "eval_sts-test_pearson_manhattan": 0.909395200356788, "eval_sts-test_pearson_max": 0.909395200356788, "eval_sts-test_spearman_cosine": 0.9073655224104529, "eval_sts-test_spearman_dot": 0.866218124850164, "eval_sts-test_spearman_euclidean": 0.9077081380676655, "eval_sts-test_spearman_manhattan": 0.907968321901395, "eval_sts-test_spearman_max": 0.907968321901395, "step": 250 }, { "epoch": 0.4911937377690802, "grad_norm": 4.491675853729248, "learning_rate": 1.9609375e-05, "loss": 0.2186, "step": 251 }, { "epoch": 0.4931506849315068, "grad_norm": 2.9051578044891357, "learning_rate": 1.96875e-05, "loss": 0.0993, "step": 252 }, { "epoch": 0.49510763209393344, "grad_norm": 3.53365421295166, "learning_rate": 1.9765625e-05, "loss": 0.1805, "step": 253 }, { "epoch": 0.49706457925636005, "grad_norm": 3.2181098461151123, "learning_rate": 1.984375e-05, "loss": 0.1178, "step": 254 }, { "epoch": 0.49902152641878667, "grad_norm": 4.045453071594238, "learning_rate": 1.9921875e-05, "loss": 0.2198, "step": 255 }, { "epoch": 0.49902152641878667, "eval_loss": 0.10428859293460846, "eval_runtime": 107.2698, "eval_samples_per_second": 28.452, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8777129617944619, "eval_sts-test_pearson_dot": 0.8588391652180615, "eval_sts-test_pearson_euclidean": 0.9093230964292308, "eval_sts-test_pearson_manhattan": 0.9095932968076137, "eval_sts-test_pearson_max": 0.9095932968076137, "eval_sts-test_spearman_cosine": 0.9069800350448274, "eval_sts-test_spearman_dot": 0.8639776976998651, "eval_sts-test_spearman_euclidean": 0.9072912800678044, "eval_sts-test_spearman_manhattan": 0.9080281095866138, "eval_sts-test_spearman_max": 0.9080281095866138, "step": 255 }, { "epoch": 0.5009784735812133, "grad_norm": 2.8251521587371826, "learning_rate": 2e-05, "loss": 0.1064, "step": 256 }, { "epoch": 0.50293542074364, "grad_norm": 3.3597464561462402, "learning_rate": 1.999924308128909e-05, "loss": 0.1436, "step": 257 }, { "epoch": 0.5048923679060665, "grad_norm": 2.580488920211792, "learning_rate": 1.9996972439741537e-05, "loss": 0.0859, "step": 258 }, { "epoch": 0.5068493150684932, "grad_norm": 3.937856674194336, "learning_rate": 1.9993188419095562e-05, "loss": 0.2157, "step": 259 }, { "epoch": 0.5088062622309197, "grad_norm": 3.344531774520874, "learning_rate": 1.9987891592190367e-05, "loss": 0.1455, "step": 260 }, { "epoch": 0.5088062622309197, "eval_loss": 0.10292962938547134, "eval_runtime": 107.2285, "eval_samples_per_second": 28.463, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8767515459977318, "eval_sts-test_pearson_dot": 0.8564862360521637, "eval_sts-test_pearson_euclidean": 0.9083760527634203, "eval_sts-test_pearson_manhattan": 0.9086626400377007, "eval_sts-test_pearson_max": 0.9086626400377007, "eval_sts-test_spearman_cosine": 0.9057508521481897, "eval_sts-test_spearman_dot": 0.8601081456298736, "eval_sts-test_spearman_euclidean": 0.9063700753520626, "eval_sts-test_spearman_manhattan": 0.9068438122051519, "eval_sts-test_spearman_max": 0.9068438122051519, "step": 260 }, { "epoch": 0.5107632093933464, "grad_norm": 3.7637484073638916, "learning_rate": 1.9981082760879432e-05, "loss": 0.1974, "step": 261 }, { "epoch": 0.512720156555773, "grad_norm": 3.182102918624878, "learning_rate": 1.997276295590912e-05, "loss": 0.1667, "step": 262 }, { "epoch": 0.5146771037181996, "grad_norm": 3.7908170223236084, "learning_rate": 1.9962933436762644e-05, "loss": 0.1512, "step": 263 }, { "epoch": 0.5166340508806262, "grad_norm": 3.4492650032043457, "learning_rate": 1.9951595691469397e-05, "loss": 0.1684, "step": 264 }, { "epoch": 0.5185909980430529, "grad_norm": 3.816772222518921, "learning_rate": 1.9938751436379684e-05, "loss": 0.2132, "step": 265 }, { "epoch": 0.5185909980430529, "eval_loss": 0.10117975622415543, "eval_runtime": 107.3212, "eval_samples_per_second": 28.438, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8770393502752714, "eval_sts-test_pearson_dot": 0.8567524208989885, "eval_sts-test_pearson_euclidean": 0.9080912956763092, "eval_sts-test_pearson_manhattan": 0.908247948105785, "eval_sts-test_pearson_max": 0.908247948105785, "eval_sts-test_spearman_cosine": 0.9053279079767796, "eval_sts-test_spearman_dot": 0.8598375795035011, "eval_sts-test_spearman_euclidean": 0.9057662913333698, "eval_sts-test_spearman_manhattan": 0.9061448870047409, "eval_sts-test_spearman_max": 0.9061448870047409, "step": 265 }, { "epoch": 0.5205479452054794, "grad_norm": 3.5570499897003174, "learning_rate": 1.992440261590491e-05, "loss": 0.1645, "step": 266 }, { "epoch": 0.5225048923679061, "grad_norm": 4.160579681396484, "learning_rate": 1.9908551402223218e-05, "loss": 0.203, "step": 267 }, { "epoch": 0.5244618395303327, "grad_norm": 3.5718774795532227, "learning_rate": 1.9891200194950644e-05, "loss": 0.1539, "step": 268 }, { "epoch": 0.5264187866927593, "grad_norm": 3.604438066482544, "learning_rate": 1.9872351620777883e-05, "loss": 0.1445, "step": 269 }, { "epoch": 0.5283757338551859, "grad_norm": 3.4854915142059326, "learning_rate": 1.9852008533072627e-05, "loss": 0.1377, "step": 270 }, { "epoch": 0.5283757338551859, "eval_loss": 0.09936786442995071, "eval_runtime": 107.3119, "eval_samples_per_second": 28.44, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8772155384897071, "eval_sts-test_pearson_dot": 0.8577040756637748, "eval_sts-test_pearson_euclidean": 0.9081962404777727, "eval_sts-test_pearson_manhattan": 0.9082660411148933, "eval_sts-test_pearson_max": 0.9082660411148933, "eval_sts-test_spearman_cosine": 0.9056296657323417, "eval_sts-test_spearman_dot": 0.8627456954737598, "eval_sts-test_spearman_euclidean": 0.9061553587999066, "eval_sts-test_spearman_manhattan": 0.9063870360801298, "eval_sts-test_spearman_max": 0.9063870360801298, "step": 270 }, { "epoch": 0.5303326810176126, "grad_norm": 3.662992238998413, "learning_rate": 1.9830174011447617e-05, "loss": 0.1719, "step": 271 }, { "epoch": 0.5322896281800391, "grad_norm": 3.5594613552093506, "learning_rate": 1.980685136129445e-05, "loss": 0.1896, "step": 272 }, { "epoch": 0.5342465753424658, "grad_norm": 3.257335662841797, "learning_rate": 1.978204411328318e-05, "loss": 0.1452, "step": 273 }, { "epoch": 0.5362035225048923, "grad_norm": 3.292863368988037, "learning_rate": 1.9755756022827847e-05, "loss": 0.1275, "step": 274 }, { "epoch": 0.538160469667319, "grad_norm": 4.065443515777588, "learning_rate": 1.972799106951796e-05, "loss": 0.1883, "step": 275 }, { "epoch": 0.538160469667319, "eval_loss": 0.09800439327955246, "eval_runtime": 107.2596, "eval_samples_per_second": 28.454, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8773598628753827, "eval_sts-test_pearson_dot": 0.8578251655808844, "eval_sts-test_pearson_euclidean": 0.9082603623937704, "eval_sts-test_pearson_manhattan": 0.9081101963076783, "eval_sts-test_pearson_max": 0.9082603623937704, "eval_sts-test_spearman_cosine": 0.9056689328319392, "eval_sts-test_spearman_dot": 0.8647132741833555, "eval_sts-test_spearman_euclidean": 0.9063065285608867, "eval_sts-test_spearman_manhattan": 0.9067770433231558, "eval_sts-test_spearman_max": 0.9067770433231558, "step": 275 }, { "epoch": 0.5401174168297456, "grad_norm": 3.7186553478240967, "learning_rate": 1.9698753456516047e-05, "loss": 0.1462, "step": 276 }, { "epoch": 0.5420743639921722, "grad_norm": 3.5399951934814453, "learning_rate": 1.9668047609921382e-05, "loss": 0.1595, "step": 277 }, { "epoch": 0.5440313111545988, "grad_norm": 3.6143035888671875, "learning_rate": 1.963587817809993e-05, "loss": 0.1693, "step": 278 }, { "epoch": 0.5459882583170255, "grad_norm": 4.133859634399414, "learning_rate": 1.9602250030980657e-05, "loss": 0.1929, "step": 279 }, { "epoch": 0.547945205479452, "grad_norm": 3.6929726600646973, "learning_rate": 1.9567168259318324e-05, "loss": 0.154, "step": 280 }, { "epoch": 0.547945205479452, "eval_loss": 0.0969705730676651, "eval_runtime": 107.333, "eval_samples_per_second": 28.435, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8772326487842304, "eval_sts-test_pearson_dot": 0.8584362564160372, "eval_sts-test_pearson_euclidean": 0.9077579223693962, "eval_sts-test_pearson_manhattan": 0.9072827835669532, "eval_sts-test_pearson_max": 0.9077579223693962, "eval_sts-test_spearman_cosine": 0.9052923754752349, "eval_sts-test_spearman_dot": 0.866326959917868, "eval_sts-test_spearman_euclidean": 0.9057464665245734, "eval_sts-test_spearman_manhattan": 0.9059635996448444, "eval_sts-test_spearman_max": 0.9059635996448444, "step": 280 }, { "epoch": 0.5499021526418787, "grad_norm": 3.515667200088501, "learning_rate": 1.953063817392281e-05, "loss": 0.1468, "step": 281 }, { "epoch": 0.5518590998043053, "grad_norm": 2.3627371788024902, "learning_rate": 1.949266530485513e-05, "loss": 0.0898, "step": 282 }, { "epoch": 0.5538160469667319, "grad_norm": 3.26710844039917, "learning_rate": 1.945325540059032e-05, "loss": 0.1425, "step": 283 }, { "epoch": 0.5557729941291585, "grad_norm": 3.6672258377075195, "learning_rate": 1.941241442714716e-05, "loss": 0.1362, "step": 284 }, { "epoch": 0.5577299412915852, "grad_norm": 3.306119203567505, "learning_rate": 1.9370148567185043e-05, "loss": 0.1025, "step": 285 }, { "epoch": 0.5577299412915852, "eval_loss": 0.09782103449106216, "eval_runtime": 107.2147, "eval_samples_per_second": 28.466, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8780373092719852, "eval_sts-test_pearson_dot": 0.861128415219923, "eval_sts-test_pearson_euclidean": 0.9076094585437832, "eval_sts-test_pearson_manhattan": 0.9068707688162918, "eval_sts-test_pearson_max": 0.9076094585437832, "eval_sts-test_spearman_cosine": 0.9052606468309083, "eval_sts-test_spearman_dot": 0.868469739815811, "eval_sts-test_spearman_euclidean": 0.9051151604801249, "eval_sts-test_spearman_manhattan": 0.9048908224067698, "eval_sts-test_spearman_max": 0.9052606468309083, "step": 285 }, { "epoch": 0.5596868884540117, "grad_norm": 3.2606685161590576, "learning_rate": 1.9326464219068023e-05, "loss": 0.1578, "step": 286 }, { "epoch": 0.5616438356164384, "grad_norm": 3.5152740478515625, "learning_rate": 1.9281367995896187e-05, "loss": 0.1235, "step": 287 }, { "epoch": 0.5636007827788649, "grad_norm": 2.8671882152557373, "learning_rate": 1.9234866724504554e-05, "loss": 0.1109, "step": 288 }, { "epoch": 0.5655577299412916, "grad_norm": 2.315185785293579, "learning_rate": 1.9186967444429613e-05, "loss": 0.0746, "step": 289 }, { "epoch": 0.5675146771037182, "grad_norm": 3.4961392879486084, "learning_rate": 1.913767740684362e-05, "loss": 0.1471, "step": 290 }, { "epoch": 0.5675146771037182, "eval_loss": 0.09924904257059097, "eval_runtime": 107.3422, "eval_samples_per_second": 28.432, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8783681424807572, "eval_sts-test_pearson_dot": 0.861120631953773, "eval_sts-test_pearson_euclidean": 0.9077238606316402, "eval_sts-test_pearson_manhattan": 0.9069786963498391, "eval_sts-test_pearson_max": 0.9077238606316402, "eval_sts-test_spearman_cosine": 0.9052591700392825, "eval_sts-test_spearman_dot": 0.8684268233561366, "eval_sts-test_spearman_euclidean": 0.9046835793152661, "eval_sts-test_spearman_manhattan": 0.9045985071673613, "eval_sts-test_spearman_max": 0.9052591700392825, "step": 290 }, { "epoch": 0.5694716242661448, "grad_norm": 4.221432209014893, "learning_rate": 1.9087004073456926e-05, "loss": 0.2631, "step": 291 }, { "epoch": 0.5714285714285714, "grad_norm": 3.4570438861846924, "learning_rate": 1.9034955115388364e-05, "loss": 0.11, "step": 292 }, { "epoch": 0.5733855185909981, "grad_norm": 3.6059136390686035, "learning_rate": 1.898153841200398e-05, "loss": 0.1834, "step": 293 }, { "epoch": 0.5753424657534246, "grad_norm": 3.3278088569641113, "learning_rate": 1.892676204972423e-05, "loss": 0.1277, "step": 294 }, { "epoch": 0.5772994129158513, "grad_norm": 4.314577579498291, "learning_rate": 1.8870634320799822e-05, "loss": 0.2104, "step": 295 }, { "epoch": 0.5772994129158513, "eval_loss": 0.09903673827648163, "eval_runtime": 107.6434, "eval_samples_per_second": 28.353, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8781909348259072, "eval_sts-test_pearson_dot": 0.8596231931866185, "eval_sts-test_pearson_euclidean": 0.9076411156234586, "eval_sts-test_pearson_manhattan": 0.9069147632233857, "eval_sts-test_pearson_max": 0.9076411156234586, "eval_sts-test_spearman_cosine": 0.9042011607174669, "eval_sts-test_spearman_dot": 0.8660264551976247, "eval_sts-test_spearman_euclidean": 0.9044265280698341, "eval_sts-test_spearman_manhattan": 0.9041656729671835, "eval_sts-test_spearman_max": 0.9044265280698341, "step": 295 }, { "epoch": 0.5792563600782779, "grad_norm": 3.195991039276123, "learning_rate": 1.8813163722056397e-05, "loss": 0.1294, "step": 296 }, { "epoch": 0.5812133072407045, "grad_norm": 3.6352145671844482, "learning_rate": 1.875435895360826e-05, "loss": 0.1672, "step": 297 }, { "epoch": 0.5831702544031311, "grad_norm": 3.7248518466949463, "learning_rate": 1.8694228917541313e-05, "loss": 0.2171, "step": 298 }, { "epoch": 0.5851272015655578, "grad_norm": 3.459801435470581, "learning_rate": 1.8632782716565438e-05, "loss": 0.1451, "step": 299 }, { "epoch": 0.5870841487279843, "grad_norm": 2.6911542415618896, "learning_rate": 1.857002965263648e-05, "loss": 0.0871, "step": 300 }, { "epoch": 0.5870841487279843, "eval_loss": 0.09800251573324203, "eval_runtime": 107.2338, "eval_samples_per_second": 28.461, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8772925626670083, "eval_sts-test_pearson_dot": 0.8566016359384749, "eval_sts-test_pearson_euclidean": 0.9070931796775764, "eval_sts-test_pearson_manhattan": 0.9064105714529896, "eval_sts-test_pearson_max": 0.9070931796775764, "eval_sts-test_spearman_cosine": 0.9032592361677008, "eval_sts-test_spearman_dot": 0.8623085204012272, "eval_sts-test_spearman_euclidean": 0.9038942565668446, "eval_sts-test_spearman_manhattan": 0.9033954590073763, "eval_sts-test_spearman_max": 0.9038942565668446, "step": 300 }, { "epoch": 0.589041095890411, "grad_norm": 2.913508653640747, "learning_rate": 1.850597922554809e-05, "loss": 0.0897, "step": 301 }, { "epoch": 0.5909980430528375, "grad_norm": 3.2928783893585205, "learning_rate": 1.844064113149361e-05, "loss": 0.1296, "step": 302 }, { "epoch": 0.5929549902152642, "grad_norm": 3.2551913261413574, "learning_rate": 1.8374025261598224e-05, "loss": 0.1206, "step": 303 }, { "epoch": 0.5949119373776908, "grad_norm": 3.246716022491455, "learning_rate": 1.8306141700421606e-05, "loss": 0.1665, "step": 304 }, { "epoch": 0.5968688845401174, "grad_norm": 3.980085611343384, "learning_rate": 1.8237000724431283e-05, "loss": 0.1511, "step": 305 }, { "epoch": 0.5968688845401174, "eval_loss": 0.09785618633031845, "eval_runtime": 107.3879, "eval_samples_per_second": 28.42, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8767817266460346, "eval_sts-test_pearson_dot": 0.8544828510438696, "eval_sts-test_pearson_euclidean": 0.9070553577944469, "eval_sts-test_pearson_manhattan": 0.9065146784679962, "eval_sts-test_pearson_max": 0.9070553577944469, "eval_sts-test_spearman_cosine": 0.9032290290662617, "eval_sts-test_spearman_dot": 0.8599922398628699, "eval_sts-test_spearman_euclidean": 0.9039456310149221, "eval_sts-test_spearman_manhattan": 0.9035283702537087, "eval_sts-test_spearman_max": 0.9039456310149221, "step": 305 }, { "epoch": 0.598825831702544, "grad_norm": 3.576425790786743, "learning_rate": 1.8166612800446927e-05, "loss": 0.1566, "step": 306 }, { "epoch": 0.6007827788649707, "grad_norm": 3.3370437622070312, "learning_rate": 1.809498858405589e-05, "loss": 0.1339, "step": 307 }, { "epoch": 0.6027397260273972, "grad_norm": 3.3882863521575928, "learning_rate": 1.802213891800007e-05, "loss": 0.1474, "step": 308 }, { "epoch": 0.6046966731898239, "grad_norm": 2.9576971530914307, "learning_rate": 1.7948074830534535e-05, "loss": 0.1022, "step": 309 }, { "epoch": 0.6066536203522505, "grad_norm": 3.737396001815796, "learning_rate": 1.7872807533758007e-05, "loss": 0.1263, "step": 310 }, { "epoch": 0.6066536203522505, "eval_loss": 0.09827280789613724, "eval_runtime": 107.5911, "eval_samples_per_second": 28.367, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8774447336518928, "eval_sts-test_pearson_dot": 0.8526883126161577, "eval_sts-test_pearson_euclidean": 0.9083025051320742, "eval_sts-test_pearson_manhattan": 0.9079128512948802, "eval_sts-test_pearson_max": 0.9083025051320742, "eval_sts-test_spearman_cosine": 0.9043404713941784, "eval_sts-test_spearman_dot": 0.8595169367156317, "eval_sts-test_spearman_euclidean": 0.9055969973115261, "eval_sts-test_spearman_manhattan": 0.9051065234866762, "eval_sts-test_spearman_max": 0.9055969973115261, "step": 310 }, { "epoch": 0.6086105675146771, "grad_norm": 3.6634974479675293, "learning_rate": 1.7796348421915536e-05, "loss": 0.1713, "step": 311 }, { "epoch": 0.6105675146771037, "grad_norm": 4.3175225257873535, "learning_rate": 1.7718709069673595e-05, "loss": 0.1628, "step": 312 }, { "epoch": 0.6125244618395304, "grad_norm": 3.73574161529541, "learning_rate": 1.763990123036787e-05, "loss": 0.1585, "step": 313 }, { "epoch": 0.6144814090019569, "grad_norm": 3.8439183235168457, "learning_rate": 1.7559936834223982e-05, "loss": 0.1419, "step": 314 }, { "epoch": 0.6164383561643836, "grad_norm": 2.908531904220581, "learning_rate": 1.747882798655147e-05, "loss": 0.1136, "step": 315 }, { "epoch": 0.6164383561643836, "eval_loss": 0.09831386059522629, "eval_runtime": 107.369, "eval_samples_per_second": 28.425, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8766201899554078, "eval_sts-test_pearson_dot": 0.8489007626542151, "eval_sts-test_pearson_euclidean": 0.907814904603313, "eval_sts-test_pearson_manhattan": 0.9075136258935672, "eval_sts-test_pearson_max": 0.907814904603313, "eval_sts-test_spearman_cosine": 0.9040450683177336, "eval_sts-test_spearman_dot": 0.856758468963466, "eval_sts-test_spearman_euclidean": 0.9053801326988233, "eval_sts-test_spearman_manhattan": 0.9047017483273913, "eval_sts-test_spearman_max": 0.9053801326988233, "step": 315 }, { "epoch": 0.6183953033268101, "grad_norm": 4.284037113189697, "learning_rate": 1.739658696591121e-05, "loss": 0.255, "step": 316 }, { "epoch": 0.6203522504892368, "grad_norm": 3.051182270050049, "learning_rate": 1.7313226222256675e-05, "loss": 0.1262, "step": 317 }, { "epoch": 0.6223091976516634, "grad_norm": 3.270893096923828, "learning_rate": 1.7228758375049186e-05, "loss": 0.1393, "step": 318 }, { "epoch": 0.62426614481409, "grad_norm": 3.4940428733825684, "learning_rate": 1.714319621134755e-05, "loss": 0.1134, "step": 319 }, { "epoch": 0.6262230919765166, "grad_norm": 3.899348258972168, "learning_rate": 1.705655268387229e-05, "loss": 0.1441, "step": 320 }, { "epoch": 0.6262230919765166, "eval_loss": 0.09844871610403061, "eval_runtime": 107.3824, "eval_samples_per_second": 28.422, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8758902555910153, "eval_sts-test_pearson_dot": 0.8487463478874753, "eval_sts-test_pearson_euclidean": 0.9067076042499689, "eval_sts-test_pearson_manhattan": 0.9065947885559749, "eval_sts-test_pearson_max": 0.9067076042499689, "eval_sts-test_spearman_cosine": 0.9032847891379552, "eval_sts-test_spearman_dot": 0.8557776108162892, "eval_sts-test_spearman_euclidean": 0.9042920057780914, "eval_sts-test_spearman_manhattan": 0.9038587688165614, "eval_sts-test_spearman_max": 0.9042920057780914, "step": 320 }, { "epoch": 0.6281800391389433, "grad_norm": 4.422016143798828, "learning_rate": 1.696884090904484e-05, "loss": 0.1744, "step": 321 }, { "epoch": 0.6301369863013698, "grad_norm": 3.950225353240967, "learning_rate": 1.6880074165001906e-05, "loss": 0.2124, "step": 322 }, { "epoch": 0.6320939334637965, "grad_norm": 3.2186155319213867, "learning_rate": 1.6790265889585377e-05, "loss": 0.1267, "step": 323 }, { "epoch": 0.6340508806262231, "grad_norm": 3.156022548675537, "learning_rate": 1.669942967830807e-05, "loss": 0.1435, "step": 324 }, { "epoch": 0.6360078277886497, "grad_norm": 3.511422634124756, "learning_rate": 1.6607579282295572e-05, "loss": 0.1705, "step": 325 }, { "epoch": 0.6360078277886497, "eval_loss": 0.09894353151321411, "eval_runtime": 107.3913, "eval_samples_per_second": 28.419, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8748236722620941, "eval_sts-test_pearson_dot": 0.8489660029264176, "eval_sts-test_pearson_euclidean": 0.9056717167596496, "eval_sts-test_pearson_manhattan": 0.9057306950198961, "eval_sts-test_pearson_max": 0.9057306950198961, "eval_sts-test_spearman_cosine": 0.9023375391880836, "eval_sts-test_spearman_dot": 0.8556132394331987, "eval_sts-test_spearman_euclidean": 0.9032693499527753, "eval_sts-test_spearman_manhattan": 0.9032065639330431, "eval_sts-test_spearman_max": 0.9032693499527753, "step": 325 }, { "epoch": 0.6379647749510763, "grad_norm": 3.2097976207733154, "learning_rate": 1.651472860620455e-05, "loss": 0.1441, "step": 326 }, { "epoch": 0.639921722113503, "grad_norm": 3.0201833248138428, "learning_rate": 1.6420891706117818e-05, "loss": 0.118, "step": 327 }, { "epoch": 0.6418786692759295, "grad_norm": 3.370908737182617, "learning_rate": 1.6326082787416465e-05, "loss": 0.1956, "step": 328 }, { "epoch": 0.6438356164383562, "grad_norm": 2.768566131591797, "learning_rate": 1.6230316202629393e-05, "loss": 0.0803, "step": 329 }, { "epoch": 0.6457925636007827, "grad_norm": 3.2455928325653076, "learning_rate": 1.613360644926059e-05, "loss": 0.1651, "step": 330 }, { "epoch": 0.6457925636007827, "eval_loss": 0.09914453327655792, "eval_runtime": 107.2995, "eval_samples_per_second": 28.444, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8743455111936658, "eval_sts-test_pearson_dot": 0.8479229638933452, "eval_sts-test_pearson_euclidean": 0.9055198964101038, "eval_sts-test_pearson_manhattan": 0.9055992524553022, "eval_sts-test_pearson_max": 0.9055992524553022, "eval_sts-test_spearman_cosine": 0.9022275405875834, "eval_sts-test_spearman_dot": 0.8543106197166178, "eval_sts-test_spearman_euclidean": 0.9029672341871219, "eval_sts-test_spearman_manhattan": 0.9028108285285591, "eval_sts-test_spearman_max": 0.9029672341871219, "step": 330 }, { "epoch": 0.6477495107632094, "grad_norm": 3.465236186981201, "learning_rate": 1.603596816759442e-05, "loss": 0.1498, "step": 331 }, { "epoch": 0.649706457925636, "grad_norm": 3.303255558013916, "learning_rate": 1.5937416138479344e-05, "loss": 0.1171, "step": 332 }, { "epoch": 0.6516634050880626, "grad_norm": 3.893554449081421, "learning_rate": 1.5837965281090334e-05, "loss": 0.1976, "step": 333 }, { "epoch": 0.6536203522504892, "grad_norm": 2.688338041305542, "learning_rate": 1.5737630650670336e-05, "loss": 0.0926, "step": 334 }, { "epoch": 0.6555772994129159, "grad_norm": 3.4313673973083496, "learning_rate": 1.5636427436251182e-05, "loss": 0.1496, "step": 335 }, { "epoch": 0.6555772994129159, "eval_loss": 0.09911184757947922, "eval_runtime": 107.4795, "eval_samples_per_second": 28.396, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8746652537201496, "eval_sts-test_pearson_dot": 0.8492828619583224, "eval_sts-test_pearson_euclidean": 0.906047319803132, "eval_sts-test_pearson_manhattan": 0.9060247174283395, "eval_sts-test_pearson_max": 0.906047319803132, "eval_sts-test_spearman_cosine": 0.9026310179602884, "eval_sts-test_spearman_dot": 0.856069836553175, "eval_sts-test_spearman_euclidean": 0.9034376594468683, "eval_sts-test_spearman_manhattan": 0.9036356837785253, "eval_sts-test_spearman_max": 0.9036356837785253, "step": 335 }, { "epoch": 0.6575342465753424, "grad_norm": 3.2240829467773438, "learning_rate": 1.5534370958354184e-05, "loss": 0.1131, "step": 336 }, { "epoch": 0.6594911937377691, "grad_norm": 3.2019200325012207, "learning_rate": 1.5431476666670885e-05, "loss": 0.1352, "step": 337 }, { "epoch": 0.6614481409001957, "grad_norm": 3.5696215629577637, "learning_rate": 1.5327760137724213e-05, "loss": 0.1608, "step": 338 }, { "epoch": 0.6634050880626223, "grad_norm": 3.2444350719451904, "learning_rate": 1.5223237072510433e-05, "loss": 0.1239, "step": 339 }, { "epoch": 0.6653620352250489, "grad_norm": 3.1613712310791016, "learning_rate": 1.5117923294122312e-05, "loss": 0.1227, "step": 340 }, { "epoch": 0.6653620352250489, "eval_loss": 0.09929565340280533, "eval_runtime": 107.2604, "eval_samples_per_second": 28.454, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.874603261003018, "eval_sts-test_pearson_dot": 0.8513553588561518, "eval_sts-test_pearson_euclidean": 0.9056124511024704, "eval_sts-test_pearson_manhattan": 0.9053134930024975, "eval_sts-test_pearson_max": 0.9056124511024704, "eval_sts-test_spearman_cosine": 0.9019408192558488, "eval_sts-test_spearman_dot": 0.8587178581922269, "eval_sts-test_spearman_euclidean": 0.9026150865112329, "eval_sts-test_spearman_manhattan": 0.9023037967369943, "eval_sts-test_spearman_max": 0.9026150865112329, "step": 340 }, { "epoch": 0.6673189823874756, "grad_norm": 3.3650641441345215, "learning_rate": 1.5011834745353725e-05, "loss": 0.1452, "step": 341 }, { "epoch": 0.6692759295499021, "grad_norm": 3.7061643600463867, "learning_rate": 1.4904987486286184e-05, "loss": 0.1992, "step": 342 }, { "epoch": 0.6712328767123288, "grad_norm": 3.262500286102295, "learning_rate": 1.4797397691857614e-05, "loss": 0.1349, "step": 343 }, { "epoch": 0.6731898238747553, "grad_norm": 3.4780774116516113, "learning_rate": 1.468908164941371e-05, "loss": 0.1702, "step": 344 }, { "epoch": 0.675146771037182, "grad_norm": 2.908043146133423, "learning_rate": 1.4580055756242315e-05, "loss": 0.1033, "step": 345 }, { "epoch": 0.675146771037182, "eval_loss": 0.09903653711080551, "eval_runtime": 107.5298, "eval_samples_per_second": 28.383, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8748590023241125, "eval_sts-test_pearson_dot": 0.8538817887560792, "eval_sts-test_pearson_euclidean": 0.905694726781384, "eval_sts-test_pearson_manhattan": 0.9051916896005284, "eval_sts-test_pearson_max": 0.905694726781384, "eval_sts-test_spearman_cosine": 0.9022368936012142, "eval_sts-test_spearman_dot": 0.86127843586652, "eval_sts-test_spearman_euclidean": 0.9024703161806319, "eval_sts-test_spearman_manhattan": 0.9023726241770144, "eval_sts-test_spearman_max": 0.9024703161806319, "step": 345 }, { "epoch": 0.6771037181996086, "grad_norm": 3.7118523120880127, "learning_rate": 1.4470336517091139e-05, "loss": 0.1788, "step": 346 }, { "epoch": 0.6790606653620352, "grad_norm": 3.106895923614502, "learning_rate": 1.435994054166919e-05, "loss": 0.1084, "step": 347 }, { "epoch": 0.6810176125244618, "grad_norm": 3.782027244567871, "learning_rate": 1.4248884542132348e-05, "loss": 0.1325, "step": 348 }, { "epoch": 0.6829745596868885, "grad_norm": 3.8729352951049805, "learning_rate": 1.4137185330553416e-05, "loss": 0.1537, "step": 349 }, { "epoch": 0.684931506849315, "grad_norm": 3.7617311477661133, "learning_rate": 1.4024859816377046e-05, "loss": 0.2099, "step": 350 }, { "epoch": 0.684931506849315, "eval_loss": 0.09886621683835983, "eval_runtime": 107.402, "eval_samples_per_second": 28.417, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8751440351237663, "eval_sts-test_pearson_dot": 0.8544171062513735, "eval_sts-test_pearson_euclidean": 0.906084032839116, "eval_sts-test_pearson_manhattan": 0.9052674845128671, "eval_sts-test_pearson_max": 0.906084032839116, "eval_sts-test_spearman_cosine": 0.9021938876390173, "eval_sts-test_spearman_dot": 0.861093657908235, "eval_sts-test_spearman_euclidean": 0.902945753581654, "eval_sts-test_spearman_manhattan": 0.9021002232489249, "eval_sts-test_spearman_max": 0.902945753581654, "step": 350 }, { "epoch": 0.6868884540117417, "grad_norm": 3.419968366622925, "learning_rate": 1.3911925003859907e-05, "loss": 0.1603, "step": 351 }, { "epoch": 0.6888454011741683, "grad_norm": 3.050192356109619, "learning_rate": 1.3798397989496549e-05, "loss": 0.0982, "step": 352 }, { "epoch": 0.6908023483365949, "grad_norm": 3.8518471717834473, "learning_rate": 1.3684295959431241e-05, "loss": 0.1537, "step": 353 }, { "epoch": 0.6927592954990215, "grad_norm": 3.516019582748413, "learning_rate": 1.3569636186856286e-05, "loss": 0.1758, "step": 354 }, { "epoch": 0.6947162426614482, "grad_norm": 3.678056240081787, "learning_rate": 1.3454436029397135e-05, "loss": 0.1521, "step": 355 }, { "epoch": 0.6947162426614482, "eval_loss": 0.09901077300310135, "eval_runtime": 107.3981, "eval_samples_per_second": 28.418, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8757904015245911, "eval_sts-test_pearson_dot": 0.8540145186864471, "eval_sts-test_pearson_euclidean": 0.9064035478905541, "eval_sts-test_pearson_manhattan": 0.9052325995566524, "eval_sts-test_pearson_max": 0.9064035478905541, "eval_sts-test_spearman_cosine": 0.9019130734737861, "eval_sts-test_spearman_dot": 0.859719212417121, "eval_sts-test_spearman_euclidean": 0.9030878388365718, "eval_sts-test_spearman_manhattan": 0.9015811981193075, "eval_sts-test_spearman_max": 0.9030878388365718, "step": 355 }, { "epoch": 0.6966731898238747, "grad_norm": 2.466977119445801, "learning_rate": 1.3338712926484722e-05, "loss": 0.089, "step": 356 }, { "epoch": 0.6986301369863014, "grad_norm": 3.8731167316436768, "learning_rate": 1.322248439671543e-05, "loss": 0.1509, "step": 357 }, { "epoch": 0.700587084148728, "grad_norm": 4.406742572784424, "learning_rate": 1.3105768035199033e-05, "loss": 0.1943, "step": 358 }, { "epoch": 0.7025440313111546, "grad_norm": 3.6811671257019043, "learning_rate": 1.2988581510895118e-05, "loss": 0.1582, "step": 359 }, { "epoch": 0.7045009784735812, "grad_norm": 3.6019861698150635, "learning_rate": 1.2870942563938265e-05, "loss": 0.1527, "step": 360 }, { "epoch": 0.7045009784735812, "eval_loss": 0.09933393448591232, "eval_runtime": 107.4727, "eval_samples_per_second": 28.398, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8759901613929976, "eval_sts-test_pearson_dot": 0.8539317550786957, "eval_sts-test_pearson_euclidean": 0.9058198972317745, "eval_sts-test_pearson_manhattan": 0.9045671303429671, "eval_sts-test_pearson_max": 0.9058198972317745, "eval_sts-test_spearman_cosine": 0.9007536577936579, "eval_sts-test_spearman_dot": 0.8583788673871872, "eval_sts-test_spearman_euclidean": 0.9019342408204244, "eval_sts-test_spearman_manhattan": 0.9006401238435077, "eval_sts-test_spearman_max": 0.9019342408204244, "step": 360 }, { "epoch": 0.7064579256360078, "grad_norm": 2.7014875411987305, "learning_rate": 1.2752869002952492e-05, "loss": 0.0754, "step": 361 }, { "epoch": 0.7084148727984344, "grad_norm": 3.4292407035827637, "learning_rate": 1.2634378702355317e-05, "loss": 0.122, "step": 362 }, { "epoch": 0.7103718199608611, "grad_norm": 3.9553112983703613, "learning_rate": 1.2515489599651846e-05, "loss": 0.1727, "step": 363 }, { "epoch": 0.7123287671232876, "grad_norm": 2.3133935928344727, "learning_rate": 1.2396219692719364e-05, "loss": 0.074, "step": 364 }, { "epoch": 0.7142857142857143, "grad_norm": 4.136401176452637, "learning_rate": 1.2276587037082707e-05, "loss": 0.1822, "step": 365 }, { "epoch": 0.7142857142857143, "eval_loss": 0.09977750480175018, "eval_runtime": 107.5146, "eval_samples_per_second": 28.387, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8763860642424627, "eval_sts-test_pearson_dot": 0.8548955898076025, "eval_sts-test_pearson_euclidean": 0.9053617182443816, "eval_sts-test_pearson_manhattan": 0.9041710686717819, "eval_sts-test_pearson_max": 0.9053617182443816, "eval_sts-test_spearman_cosine": 0.9004566884230645, "eval_sts-test_spearman_dot": 0.8588638368068857, "eval_sts-test_spearman_euclidean": 0.90101585543415, "eval_sts-test_spearman_manhattan": 0.9002594696141124, "eval_sts-test_spearman_max": 0.90101585543415, "step": 365 }, { "epoch": 0.7162426614481409, "grad_norm": 3.4787533283233643, "learning_rate": 1.215660974318097e-05, "loss": 0.1344, "step": 366 }, { "epoch": 0.7181996086105675, "grad_norm": 3.3478775024414062, "learning_rate": 1.2036305973625881e-05, "loss": 0.1819, "step": 367 }, { "epoch": 0.7201565557729941, "grad_norm": 3.435234546661377, "learning_rate": 1.191569394045228e-05, "loss": 0.1811, "step": 368 }, { "epoch": 0.7221135029354208, "grad_norm": 3.827272653579712, "learning_rate": 1.1794791902361095e-05, "loss": 0.1564, "step": 369 }, { "epoch": 0.7240704500978473, "grad_norm": 4.088834762573242, "learning_rate": 1.1673618161955288e-05, "loss": 0.1522, "step": 370 }, { "epoch": 0.7240704500978473, "eval_loss": 0.09972013533115387, "eval_runtime": 107.5001, "eval_samples_per_second": 28.391, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8760486874181325, "eval_sts-test_pearson_dot": 0.8547648526750222, "eval_sts-test_pearson_euclidean": 0.9048215352945358, "eval_sts-test_pearson_manhattan": 0.9037328672460638, "eval_sts-test_pearson_max": 0.9048215352945358, "eval_sts-test_spearman_cosine": 0.8997958912973589, "eval_sts-test_spearman_dot": 0.859370958100973, "eval_sts-test_spearman_euclidean": 0.9005396125104228, "eval_sts-test_spearman_manhattan": 0.8996819545858564, "eval_sts-test_spearman_max": 0.9005396125104228, "step": 370 }, { "epoch": 0.726027397260274, "grad_norm": 3.117750406265259, "learning_rate": 1.1552191062969147e-05, "loss": 0.1379, "step": 371 }, { "epoch": 0.7279843444227005, "grad_norm": 2.870415449142456, "learning_rate": 1.1430528987491303e-05, "loss": 0.082, "step": 372 }, { "epoch": 0.7299412915851272, "grad_norm": 3.0934267044067383, "learning_rate": 1.1308650353182036e-05, "loss": 0.1288, "step": 373 }, { "epoch": 0.7318982387475538, "grad_norm": 4.175031661987305, "learning_rate": 1.1186573610485099e-05, "loss": 0.1809, "step": 374 }, { "epoch": 0.7338551859099804, "grad_norm": 4.580765724182129, "learning_rate": 1.1064317239834628e-05, "loss": 0.2418, "step": 375 }, { "epoch": 0.7338551859099804, "eval_loss": 0.0990942195057869, "eval_runtime": 107.4134, "eval_samples_per_second": 28.414, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8759927956331243, "eval_sts-test_pearson_dot": 0.8545842177056733, "eval_sts-test_pearson_euclidean": 0.9050231511967292, "eval_sts-test_pearson_manhattan": 0.9040000709018708, "eval_sts-test_pearson_max": 0.9050231511967292, "eval_sts-test_spearman_cosine": 0.9005535301527153, "eval_sts-test_spearman_dot": 0.8591386543030902, "eval_sts-test_spearman_euclidean": 0.9009244733583888, "eval_sts-test_spearman_manhattan": 0.9002575453098726, "eval_sts-test_spearman_max": 0.9009244733583888, "step": 375 }, { "epoch": 0.735812133072407, "grad_norm": 2.8520517349243164, "learning_rate": 1.094189974885752e-05, "loss": 0.0789, "step": 376 }, { "epoch": 0.7377690802348337, "grad_norm": 3.5163254737854004, "learning_rate": 1.081933966957167e-05, "loss": 0.132, "step": 377 }, { "epoch": 0.7397260273972602, "grad_norm": 3.160409688949585, "learning_rate": 1.0696655555580527e-05, "loss": 0.1425, "step": 378 }, { "epoch": 0.7416829745596869, "grad_norm": 3.013707160949707, "learning_rate": 1.0573865979264362e-05, "loss": 0.1514, "step": 379 }, { "epoch": 0.7436399217221135, "grad_norm": 2.8930296897888184, "learning_rate": 1.0450989528968747e-05, "loss": 0.0997, "step": 380 }, { "epoch": 0.7436399217221135, "eval_loss": 0.09842444956302643, "eval_runtime": 107.304, "eval_samples_per_second": 28.443, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8764482210558782, "eval_sts-test_pearson_dot": 0.8556595611954538, "eval_sts-test_pearson_euclidean": 0.9055523774056253, "eval_sts-test_pearson_manhattan": 0.9046224074077234, "eval_sts-test_pearson_max": 0.9055523774056253, "eval_sts-test_spearman_cosine": 0.9017774324005088, "eval_sts-test_spearman_dot": 0.860274840931642, "eval_sts-test_spearman_euclidean": 0.9016713719110112, "eval_sts-test_spearman_manhattan": 0.9011951289872838, "eval_sts-test_spearman_max": 0.9017774324005088, "step": 380 }, { "epoch": 0.7455968688845401, "grad_norm": 4.028687477111816, "learning_rate": 1.0328044806190547e-05, "loss": 0.2002, "step": 381 }, { "epoch": 0.7475538160469667, "grad_norm": 3.8006536960601807, "learning_rate": 1.0205050422761989e-05, "loss": 0.1943, "step": 382 }, { "epoch": 0.7495107632093934, "grad_norm": 2.689953088760376, "learning_rate": 1.0082024998033092e-05, "loss": 0.1198, "step": 383 }, { "epoch": 0.7514677103718199, "grad_norm": 3.1326684951782227, "learning_rate": 9.95898715605304e-06, "loss": 0.1171, "step": 384 }, { "epoch": 0.7534246575342466, "grad_norm": 2.8200089931488037, "learning_rate": 9.835955522750789e-06, "loss": 0.0872, "step": 385 }, { "epoch": 0.7534246575342466, "eval_loss": 0.09781364351511002, "eval_runtime": 107.4026, "eval_samples_per_second": 28.416, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8764091455338202, "eval_sts-test_pearson_dot": 0.8558042892931453, "eval_sts-test_pearson_euclidean": 0.9058212793054227, "eval_sts-test_pearson_manhattan": 0.9049414370234095, "eval_sts-test_pearson_max": 0.9058212793054227, "eval_sts-test_spearman_cosine": 0.902642877044557, "eval_sts-test_spearman_dot": 0.861618590204356, "eval_sts-test_spearman_euclidean": 0.9022327317339048, "eval_sts-test_spearman_manhattan": 0.9017312490987527, "eval_sts-test_spearman_max": 0.902642877044557, "step": 385 }, { "epoch": 0.7553816046966731, "grad_norm": 3.0478882789611816, "learning_rate": 9.712948723115384e-06, "loss": 0.0937, "step": 386 }, { "epoch": 0.7573385518590998, "grad_norm": 3.0198819637298584, "learning_rate": 9.589985378376474e-06, "loss": 0.0933, "step": 387 }, { "epoch": 0.7592954990215264, "grad_norm": 3.319575786590576, "learning_rate": 9.46708410318533e-06, "loss": 0.1109, "step": 388 }, { "epoch": 0.761252446183953, "grad_norm": 3.1134960651397705, "learning_rate": 9.344263502796918e-06, "loss": 0.0999, "step": 389 }, { "epoch": 0.7632093933463796, "grad_norm": 3.596510887145996, "learning_rate": 9.221542170253334e-06, "loss": 0.1625, "step": 390 }, { "epoch": 0.7632093933463796, "eval_loss": 0.09730728715658188, "eval_runtime": 107.3398, "eval_samples_per_second": 28.433, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8764763861944456, "eval_sts-test_pearson_dot": 0.8561556513692123, "eval_sts-test_pearson_euclidean": 0.9062056976600675, "eval_sts-test_pearson_manhattan": 0.9053202975786792, "eval_sts-test_pearson_max": 0.9062056976600675, "eval_sts-test_spearman_cosine": 0.9034457146739187, "eval_sts-test_spearman_dot": 0.8626341305791112, "eval_sts-test_spearman_euclidean": 0.9030662687285811, "eval_sts-test_spearman_manhattan": 0.9025959777226189, "eval_sts-test_spearman_max": 0.9034457146739187, "step": 390 }, { "epoch": 0.7651663405088063, "grad_norm": 2.9087250232696533, "learning_rate": 9.098938683569155e-06, "loss": 0.1357, "step": 391 }, { "epoch": 0.7671232876712328, "grad_norm": 3.40970516204834, "learning_rate": 8.97647160291899e-06, "loss": 0.1202, "step": 392 }, { "epoch": 0.7690802348336595, "grad_norm": 2.962822437286377, "learning_rate": 8.854159467827808e-06, "loss": 0.116, "step": 393 }, { "epoch": 0.7710371819960861, "grad_norm": 3.2842860221862793, "learning_rate": 8.732020794364327e-06, "loss": 0.1256, "step": 394 }, { "epoch": 0.7729941291585127, "grad_norm": 3.8854291439056396, "learning_rate": 8.610074072338006e-06, "loss": 0.2402, "step": 395 }, { "epoch": 0.7729941291585127, "eval_loss": 0.09690071642398834, "eval_runtime": 107.3243, "eval_samples_per_second": 28.437, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8764916753522476, "eval_sts-test_pearson_dot": 0.8559063099369891, "eval_sts-test_pearson_euclidean": 0.906553160229111, "eval_sts-test_pearson_manhattan": 0.9056446394888255, "eval_sts-test_pearson_max": 0.906553160229111, "eval_sts-test_spearman_cosine": 0.9036960084788808, "eval_sts-test_spearman_dot": 0.8628471465833341, "eval_sts-test_spearman_euclidean": 0.9032961559583487, "eval_sts-test_spearman_manhattan": 0.9026765747443849, "eval_sts-test_spearman_max": 0.9036960084788808, "step": 395 }, { "epoch": 0.7749510763209393, "grad_norm": 4.216285705566406, "learning_rate": 8.488337762499971e-06, "loss": 0.2413, "step": 396 }, { "epoch": 0.776908023483366, "grad_norm": 3.007631540298462, "learning_rate": 8.366830293748364e-06, "loss": 0.1144, "step": 397 }, { "epoch": 0.7788649706457925, "grad_norm": 3.3882946968078613, "learning_rate": 8.245570060338511e-06, "loss": 0.1198, "step": 398 }, { "epoch": 0.7808219178082192, "grad_norm": 3.6439969539642334, "learning_rate": 8.124575419098321e-06, "loss": 0.1361, "step": 399 }, { "epoch": 0.7827788649706457, "grad_norm": 3.3682761192321777, "learning_rate": 8.003864686649369e-06, "loss": 0.1496, "step": 400 }, { "epoch": 0.7827788649706457, "eval_loss": 0.09637484699487686, "eval_runtime": 107.2784, "eval_samples_per_second": 28.449, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8766720014290486, "eval_sts-test_pearson_dot": 0.8562700614904787, "eval_sts-test_pearson_euclidean": 0.9067249528004879, "eval_sts-test_pearson_manhattan": 0.905876462094101, "eval_sts-test_pearson_max": 0.9067249528004879, "eval_sts-test_spearman_cosine": 0.9038214015132995, "eval_sts-test_spearman_dot": 0.8632292328530939, "eval_sts-test_spearman_euclidean": 0.9036244959631774, "eval_sts-test_spearman_manhattan": 0.9027354226531145, "eval_sts-test_spearman_max": 0.9038214015132995, "step": 400 }, { "epoch": 0.7847358121330724, "grad_norm": 3.5519731044769287, "learning_rate": 7.883456136634053e-06, "loss": 0.1606, "step": 401 }, { "epoch": 0.786692759295499, "grad_norm": 4.0106329917907715, "learning_rate": 7.763367996949262e-06, "loss": 0.1739, "step": 402 }, { "epoch": 0.7886497064579256, "grad_norm": 3.347114086151123, "learning_rate": 7.64361844698699e-06, "loss": 0.1121, "step": 403 }, { "epoch": 0.7906066536203522, "grad_norm": 3.429165840148926, "learning_rate": 7.524225614882216e-06, "loss": 0.1176, "step": 404 }, { "epoch": 0.7925636007827789, "grad_norm": 3.169438362121582, "learning_rate": 7.4052075747686625e-06, "loss": 0.1024, "step": 405 }, { "epoch": 0.7925636007827789, "eval_loss": 0.09552557021379471, "eval_runtime": 107.3797, "eval_samples_per_second": 28.423, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8765829897566032, "eval_sts-test_pearson_dot": 0.856005218083783, "eval_sts-test_pearson_euclidean": 0.9066659750196452, "eval_sts-test_pearson_manhattan": 0.9058351541345429, "eval_sts-test_pearson_max": 0.9066659750196452, "eval_sts-test_spearman_cosine": 0.9033677132253135, "eval_sts-test_spearman_dot": 0.8628391808588065, "eval_sts-test_spearman_euclidean": 0.9033363425910781, "eval_sts-test_spearman_manhattan": 0.9027725662000693, "eval_sts-test_spearman_max": 0.9033677132253135, "step": 405 }, { "epoch": 0.7945205479452054, "grad_norm": 3.571244239807129, "learning_rate": 7.286582344042625e-06, "loss": 0.1256, "step": 406 }, { "epoch": 0.7964774951076321, "grad_norm": 3.144022226333618, "learning_rate": 7.168367880635454e-06, "loss": 0.1424, "step": 407 }, { "epoch": 0.7984344422700587, "grad_norm": 3.899695634841919, "learning_rate": 7.050582080294996e-06, "loss": 0.181, "step": 408 }, { "epoch": 0.8003913894324853, "grad_norm": 2.7152762413024902, "learning_rate": 6.933242773876481e-06, "loss": 0.0829, "step": 409 }, { "epoch": 0.8023483365949119, "grad_norm": 4.248819351196289, "learning_rate": 6.816367724643225e-06, "loss": 0.2329, "step": 410 }, { "epoch": 0.8023483365949119, "eval_loss": 0.09485543519258499, "eval_runtime": 107.35, "eval_samples_per_second": 28.43, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.876662263871305, "eval_sts-test_pearson_dot": 0.8565606973853341, "eval_sts-test_pearson_euclidean": 0.906503720630583, "eval_sts-test_pearson_manhattan": 0.9056869432887309, "eval_sts-test_pearson_max": 0.906503720630583, "eval_sts-test_spearman_cosine": 0.9031667800616662, "eval_sts-test_spearman_dot": 0.8635160884386128, "eval_sts-test_spearman_euclidean": 0.9027914064811151, "eval_sts-test_spearman_manhattan": 0.9026432798059095, "eval_sts-test_spearman_max": 0.9031667800616662, "step": 410 }, { "epoch": 0.8043052837573386, "grad_norm": 2.724637508392334, "learning_rate": 6.699974625577545e-06, "loss": 0.075, "step": 411 }, { "epoch": 0.8062622309197651, "grad_norm": 2.825380325317383, "learning_rate": 6.584081096702343e-06, "loss": 0.1157, "step": 412 }, { "epoch": 0.8082191780821918, "grad_norm": 3.5397439002990723, "learning_rate": 6.4687046824137115e-06, "loss": 0.1383, "step": 413 }, { "epoch": 0.8101761252446184, "grad_norm": 2.7954063415527344, "learning_rate": 6.353862848825011e-06, "loss": 0.1042, "step": 414 }, { "epoch": 0.812133072407045, "grad_norm": 3.3894357681274414, "learning_rate": 6.2395729811227635e-06, "loss": 0.1352, "step": 415 }, { "epoch": 0.812133072407045, "eval_loss": 0.09433061629533768, "eval_runtime": 107.4406, "eval_samples_per_second": 28.406, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8762374184553781, "eval_sts-test_pearson_dot": 0.8565984619458994, "eval_sts-test_pearson_euclidean": 0.9059975243362595, "eval_sts-test_pearson_manhattan": 0.9052369107641345, "eval_sts-test_pearson_max": 0.9059975243362595, "eval_sts-test_spearman_cosine": 0.9025091804482506, "eval_sts-test_spearman_dot": 0.8638915067704253, "eval_sts-test_spearman_euclidean": 0.9023325717980691, "eval_sts-test_spearman_manhattan": 0.9021176314896061, "eval_sts-test_spearman_max": 0.9025091804482506, "step": 415 }, { "epoch": 0.8140900195694716, "grad_norm": 2.5251049995422363, "learning_rate": 6.125852380934841e-06, "loss": 0.0778, "step": 416 }, { "epoch": 0.8160469667318982, "grad_norm": 2.7683308124542236, "learning_rate": 6.012718263711261e-06, "loss": 0.1006, "step": 417 }, { "epoch": 0.8180039138943248, "grad_norm": 4.330955982208252, "learning_rate": 5.900187756118055e-06, "loss": 0.2188, "step": 418 }, { "epoch": 0.8199608610567515, "grad_norm": 4.001932621002197, "learning_rate": 5.788277893444574e-06, "loss": 0.1338, "step": 419 }, { "epoch": 0.821917808219178, "grad_norm": 3.1434426307678223, "learning_rate": 5.677005617024618e-06, "loss": 0.1314, "step": 420 }, { "epoch": 0.821917808219178, "eval_loss": 0.09401828795671463, "eval_runtime": 107.3518, "eval_samples_per_second": 28.43, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.875859101651175, "eval_sts-test_pearson_dot": 0.85667243561015, "eval_sts-test_pearson_euclidean": 0.9054683468020812, "eval_sts-test_pearson_manhattan": 0.9047678088467972, "eval_sts-test_pearson_max": 0.9054683468020812, "eval_sts-test_spearman_cosine": 0.9022520195275646, "eval_sts-test_spearman_dot": 0.8640136777140237, "eval_sts-test_spearman_euclidean": 0.9018411134454687, "eval_sts-test_spearman_manhattan": 0.9017817285216024, "eval_sts-test_spearman_max": 0.9022520195275646, "step": 420 }, { "epoch": 0.8238747553816047, "grad_norm": 3.4323763847351074, "learning_rate": 5.566387771671788e-06, "loss": 0.1298, "step": 421 }, { "epoch": 0.8258317025440313, "grad_norm": 3.4278817176818848, "learning_rate": 5.4564411031294695e-06, "loss": 0.1573, "step": 422 }, { "epoch": 0.8277886497064579, "grad_norm": 3.182588577270508, "learning_rate": 5.34718225553579e-06, "loss": 0.1283, "step": 423 }, { "epoch": 0.8297455968688845, "grad_norm": 3.8493266105651855, "learning_rate": 5.238627768903952e-06, "loss": 0.1998, "step": 424 }, { "epoch": 0.8317025440313112, "grad_norm": 2.763507127761841, "learning_rate": 5.130794076618395e-06, "loss": 0.0747, "step": 425 }, { "epoch": 0.8317025440313112, "eval_loss": 0.09391138702630997, "eval_runtime": 107.4165, "eval_samples_per_second": 28.413, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8759008831174315, "eval_sts-test_pearson_dot": 0.8568367129286429, "eval_sts-test_pearson_euclidean": 0.9053146972146215, "eval_sts-test_pearson_manhattan": 0.9046055003987086, "eval_sts-test_pearson_max": 0.9053146972146215, "eval_sts-test_spearman_cosine": 0.9019464579147841, "eval_sts-test_spearman_dot": 0.8642364494932302, "eval_sts-test_spearman_euclidean": 0.9015885820774372, "eval_sts-test_spearman_manhattan": 0.9014194670606389, "eval_sts-test_spearman_max": 0.9019464579147841, "step": 425 }, { "epoch": 0.8336594911937377, "grad_norm": 3.613516092300415, "learning_rate": 5.02369750294697e-06, "loss": 0.142, "step": 426 }, { "epoch": 0.8356164383561644, "grad_norm": 3.1161856651306152, "learning_rate": 4.917354260569775e-06, "loss": 0.131, "step": 427 }, { "epoch": 0.837573385518591, "grad_norm": 2.7324466705322266, "learning_rate": 4.811780448124812e-06, "loss": 0.1037, "step": 428 }, { "epoch": 0.8395303326810176, "grad_norm": 3.0681588649749756, "learning_rate": 4.706992047770877e-06, "loss": 0.1145, "step": 429 }, { "epoch": 0.8414872798434442, "grad_norm": 3.202045202255249, "learning_rate": 4.6030049227681484e-06, "loss": 0.1371, "step": 430 }, { "epoch": 0.8414872798434442, "eval_loss": 0.09390870481729507, "eval_runtime": 107.3691, "eval_samples_per_second": 28.425, "eval_steps_per_second": 0.224, "eval_sts-test_pearson_cosine": 0.8761008129035103, "eval_sts-test_pearson_dot": 0.8571168858418021, "eval_sts-test_pearson_euclidean": 0.9053023363687696, "eval_sts-test_pearson_manhattan": 0.9045322392753761, "eval_sts-test_pearson_max": 0.9053023363687696, "eval_sts-test_spearman_cosine": 0.9020062008487415, "eval_sts-test_spearman_dot": 0.8644168418278989, "eval_sts-test_spearman_euclidean": 0.9017023845351555, "eval_sts-test_spearman_manhattan": 0.9012797088713137, "eval_sts-test_spearman_max": 0.9020062008487415, "step": 430 }, { "epoch": 0.8434442270058709, "grad_norm": 3.2915422916412354, "learning_rate": 4.4998348150767525e-06, "loss": 0.1457, "step": 431 }, { "epoch": 0.8454011741682974, "grad_norm": 2.914283514022827, "learning_rate": 4.397497342973677e-06, "loss": 0.123, "step": 432 }, { "epoch": 0.8473581213307241, "grad_norm": 2.803455114364624, "learning_rate": 4.296007998688405e-06, "loss": 0.1312, "step": 433 }, { "epoch": 0.8493150684931506, "grad_norm": 3.1922266483306885, "learning_rate": 4.195382146057672e-06, "loss": 0.1301, "step": 434 }, { "epoch": 0.8512720156555773, "grad_norm": 3.4597818851470947, "learning_rate": 4.095635018199612e-06, "loss": 0.1324, "step": 435 }, { "epoch": 0.8512720156555773, "eval_loss": 0.09401452541351318, "eval_runtime": 107.4416, "eval_samples_per_second": 28.406, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8763389911185249, "eval_sts-test_pearson_dot": 0.8577101143502727, "eval_sts-test_pearson_euclidean": 0.9054263092988353, "eval_sts-test_pearson_manhattan": 0.904625946001658, "eval_sts-test_pearson_max": 0.9054263092988353, "eval_sts-test_spearman_cosine": 0.902128819304954, "eval_sts-test_spearman_dot": 0.8650113175842223, "eval_sts-test_spearman_euclidean": 0.9018843879152342, "eval_sts-test_spearman_manhattan": 0.9012396564923683, "eval_sts-test_spearman_max": 0.902128819304954, "step": 435 }, { "epoch": 0.8532289628180039, "grad_norm": 4.107654571533203, "learning_rate": 3.996781715207706e-06, "loss": 0.1826, "step": 436 }, { "epoch": 0.8551859099804305, "grad_norm": 3.786817789077759, "learning_rate": 3.898837201864893e-06, "loss": 0.2145, "step": 437 }, { "epoch": 0.8571428571428571, "grad_norm": 3.1700594425201416, "learning_rate": 3.8018163053781243e-06, "loss": 0.132, "step": 438 }, { "epoch": 0.8590998043052838, "grad_norm": 3.451833963394165, "learning_rate": 3.7057337131337822e-06, "loss": 0.1263, "step": 439 }, { "epoch": 0.8610567514677103, "grad_norm": 3.5724422931671143, "learning_rate": 3.610603970474239e-06, "loss": 0.1461, "step": 440 }, { "epoch": 0.8610567514677103, "eval_loss": 0.09419582784175873, "eval_runtime": 107.4099, "eval_samples_per_second": 28.415, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8765822162435379, "eval_sts-test_pearson_dot": 0.858518150878541, "eval_sts-test_pearson_euclidean": 0.9054994480778518, "eval_sts-test_pearson_manhattan": 0.9046991633393977, "eval_sts-test_pearson_max": 0.9054994480778518, "eval_sts-test_spearman_cosine": 0.9021761661395061, "eval_sts-test_spearman_dot": 0.8659527051188517, "eval_sts-test_spearman_euclidean": 0.9019459656509089, "eval_sts-test_spearman_manhattan": 0.9013235203562157, "eval_sts-test_spearman_max": 0.9021761661395061, "step": 440 }, { "epoch": 0.863013698630137, "grad_norm": 3.693023204803467, "learning_rate": 3.5164414784959368e-06, "loss": 0.1349, "step": 441 }, { "epoch": 0.8649706457925636, "grad_norm": 2.6913726329803467, "learning_rate": 3.423260491869276e-06, "loss": 0.0824, "step": 442 }, { "epoch": 0.8669275929549902, "grad_norm": 3.2380688190460205, "learning_rate": 3.3310751166807186e-06, "loss": 0.1639, "step": 443 }, { "epoch": 0.8688845401174168, "grad_norm": 3.5086116790771484, "learning_rate": 3.2398993082973294e-06, "loss": 0.1403, "step": 444 }, { "epoch": 0.8708414872798435, "grad_norm": 3.3980696201324463, "learning_rate": 3.1497468692541812e-06, "loss": 0.1766, "step": 445 }, { "epoch": 0.8708414872798435, "eval_loss": 0.09430497139692307, "eval_runtime": 107.5105, "eval_samples_per_second": 28.388, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8769023883634329, "eval_sts-test_pearson_dot": 0.8592785166761266, "eval_sts-test_pearson_euclidean": 0.9057520441250233, "eval_sts-test_pearson_manhattan": 0.9049586591092414, "eval_sts-test_pearson_max": 0.9057520441250233, "eval_sts-test_spearman_cosine": 0.9023938810261752, "eval_sts-test_spearman_dot": 0.8662211679359387, "eval_sts-test_spearman_euclidean": 0.9022854934710853, "eval_sts-test_spearman_manhattan": 0.9016112262157012, "eval_sts-test_spearman_max": 0.9023938810261752, "step": 445 }, { "epoch": 0.87279843444227, "grad_norm": 3.6057686805725098, "learning_rate": 3.0606314471648667e-06, "loss": 0.1402, "step": 446 }, { "epoch": 0.8747553816046967, "grad_norm": 3.3049042224884033, "learning_rate": 2.972566532655462e-06, "loss": 0.1203, "step": 447 }, { "epoch": 0.8767123287671232, "grad_norm": 3.3699026107788086, "learning_rate": 2.8855654573222824e-06, "loss": 0.1398, "step": 448 }, { "epoch": 0.8786692759295499, "grad_norm": 4.141995906829834, "learning_rate": 2.79964139171369e-06, "loss": 0.2226, "step": 449 }, { "epoch": 0.8806262230919765, "grad_norm": 3.005411386489868, "learning_rate": 2.7148073433362732e-06, "loss": 0.0943, "step": 450 }, { "epoch": 0.8806262230919765, "eval_loss": 0.09433107823133469, "eval_runtime": 107.5997, "eval_samples_per_second": 28.364, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8770850204583311, "eval_sts-test_pearson_dot": 0.859396639463007, "eval_sts-test_pearson_euclidean": 0.9058757986724881, "eval_sts-test_pearson_manhattan": 0.9050657797722572, "eval_sts-test_pearson_max": 0.9058757986724881, "eval_sts-test_spearman_cosine": 0.9023927622446406, "eval_sts-test_spearman_dot": 0.8658192568573825, "eval_sts-test_spearman_euclidean": 0.9024802062093993, "eval_sts-test_spearman_manhattan": 0.9016976856527094, "eval_sts-test_spearman_max": 0.9024802062093993, "step": 450 }, { "epoch": 0.8825831702544031, "grad_norm": 3.1855523586273193, "learning_rate": 2.6310761546857433e-06, "loss": 0.1101, "step": 451 }, { "epoch": 0.8845401174168297, "grad_norm": 3.945880651473999, "learning_rate": 2.5484605013027753e-06, "loss": 0.1536, "step": 452 }, { "epoch": 0.8864970645792564, "grad_norm": 3.068859100341797, "learning_rate": 2.4669728898541456e-06, "loss": 0.1159, "step": 453 }, { "epoch": 0.8884540117416829, "grad_norm": 2.9130120277404785, "learning_rate": 2.3866256562394084e-06, "loss": 0.1373, "step": 454 }, { "epoch": 0.8904109589041096, "grad_norm": 3.2793118953704834, "learning_rate": 2.3074309637234673e-06, "loss": 0.1412, "step": 455 }, { "epoch": 0.8904109589041096, "eval_loss": 0.0943305641412735, "eval_runtime": 107.5498, "eval_samples_per_second": 28.378, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8773045053021702, "eval_sts-test_pearson_dot": 0.8595737170903098, "eval_sts-test_pearson_euclidean": 0.9060339759187682, "eval_sts-test_pearson_manhattan": 0.9052200019449477, "eval_sts-test_pearson_max": 0.9060339759187682, "eval_sts-test_spearman_cosine": 0.902556909871424, "eval_sts-test_spearman_dot": 0.8656299142704356, "eval_sts-test_spearman_euclidean": 0.9027095564240303, "eval_sts-test_spearman_manhattan": 0.9019418485348608, "eval_sts-test_spearman_max": 0.9027095564240303, "step": 455 }, { "epoch": 0.8923679060665362, "grad_norm": 2.4695982933044434, "learning_rate": 2.229400801095235e-06, "loss": 0.0626, "step": 456 }, { "epoch": 0.8943248532289628, "grad_norm": 3.314206838607788, "learning_rate": 2.15254698085274e-06, "loss": 0.1447, "step": 457 }, { "epoch": 0.8962818003913894, "grad_norm": 2.9935507774353027, "learning_rate": 2.07688113741488e-06, "loss": 0.1296, "step": 458 }, { "epoch": 0.898238747553816, "grad_norm": 3.59999942779541, "learning_rate": 2.0024147253601957e-06, "loss": 0.1407, "step": 459 }, { "epoch": 0.9001956947162426, "grad_norm": 3.781248092651367, "learning_rate": 1.92915901769281e-06, "loss": 0.1966, "step": 460 }, { "epoch": 0.9001956947162426, "eval_loss": 0.0943402424454689, "eval_runtime": 107.5535, "eval_samples_per_second": 28.377, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8773605385802614, "eval_sts-test_pearson_dot": 0.8594795998035439, "eval_sts-test_pearson_euclidean": 0.906101891634431, "eval_sts-test_pearson_manhattan": 0.9052843548013115, "eval_sts-test_pearson_max": 0.906101891634431, "eval_sts-test_spearman_cosine": 0.9026339267922787, "eval_sts-test_spearman_dot": 0.8655708873566604, "eval_sts-test_spearman_euclidean": 0.9027678225663618, "eval_sts-test_spearman_manhattan": 0.9020037842806264, "eval_sts-test_spearman_max": 0.9027678225663618, "step": 460 }, { "epoch": 0.9021526418786693, "grad_norm": 3.1703717708587646, "learning_rate": 1.8571251041358895e-06, "loss": 0.1019, "step": 461 }, { "epoch": 0.9041095890410958, "grad_norm": 3.5208213329315186, "learning_rate": 1.786323889452828e-06, "loss": 0.1332, "step": 462 }, { "epoch": 0.9060665362035225, "grad_norm": 3.7119576930999756, "learning_rate": 1.7167660917964557e-06, "loss": 0.1417, "step": 463 }, { "epoch": 0.9080234833659491, "grad_norm": 3.853672981262207, "learning_rate": 1.6484622410864815e-06, "loss": 0.1782, "step": 464 }, { "epoch": 0.9099804305283757, "grad_norm": 3.186000347137451, "learning_rate": 1.5814226774154351e-06, "loss": 0.1068, "step": 465 }, { "epoch": 0.9099804305283757, "eval_loss": 0.09426674991846085, "eval_runtime": 107.6056, "eval_samples_per_second": 28.363, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8773874434779254, "eval_sts-test_pearson_dot": 0.8592176171000234, "eval_sts-test_pearson_euclidean": 0.9061215425720242, "eval_sts-test_pearson_manhattan": 0.9052975958981246, "eval_sts-test_pearson_max": 0.9061215425720242, "eval_sts-test_spearman_cosine": 0.9025369508088437, "eval_sts-test_spearman_dot": 0.8649846905836943, "eval_sts-test_spearman_euclidean": 0.9027074083634833, "eval_sts-test_spearman_manhattan": 0.9020343941434181, "eval_sts-test_spearman_max": 0.9027074083634833, "step": 465 }, { "epoch": 0.9119373776908023, "grad_norm": 3.7655446529388428, "learning_rate": 1.515657549483328e-06, "loss": 0.1292, "step": 466 }, { "epoch": 0.913894324853229, "grad_norm": 4.406928539276123, "learning_rate": 1.4511768130613434e-06, "loss": 0.1896, "step": 467 }, { "epoch": 0.9158512720156555, "grad_norm": 3.7682666778564453, "learning_rate": 1.3879902294846559e-06, "loss": 0.1597, "step": 468 }, { "epoch": 0.9178082191780822, "grad_norm": 3.596527338027954, "learning_rate": 1.3261073641747358e-06, "loss": 0.1588, "step": 469 }, { "epoch": 0.9197651663405088, "grad_norm": 3.1445114612579346, "learning_rate": 1.2655375851913209e-06, "loss": 0.13, "step": 470 }, { "epoch": 0.9197651663405088, "eval_loss": 0.09422677010297775, "eval_runtime": 107.5943, "eval_samples_per_second": 28.366, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8774123494705477, "eval_sts-test_pearson_dot": 0.8591397738989044, "eval_sts-test_pearson_euclidean": 0.9061478507699354, "eval_sts-test_pearson_manhattan": 0.905330010345305, "eval_sts-test_pearson_max": 0.9061478507699354, "eval_sts-test_spearman_cosine": 0.9025525689990692, "eval_sts-test_spearman_dot": 0.8648119954659849, "eval_sts-test_spearman_euclidean": 0.9027778468489135, "eval_sts-test_spearman_manhattan": 0.9020008754486362, "eval_sts-test_spearman_max": 0.9027778468489135, "step": 470 }, { "epoch": 0.9217221135029354, "grad_norm": 3.6816020011901855, "learning_rate": 1.2062900618142136e-06, "loss": 0.1593, "step": 471 }, { "epoch": 0.923679060665362, "grad_norm": 3.369281530380249, "learning_rate": 1.1483737631552161e-06, "loss": 0.1137, "step": 472 }, { "epoch": 0.9256360078277887, "grad_norm": 3.330564260482788, "learning_rate": 1.0917974568003531e-06, "loss": 0.1295, "step": 473 }, { "epoch": 0.9275929549902152, "grad_norm": 3.9396634101867676, "learning_rate": 1.0365697074826043e-06, "loss": 0.1367, "step": 474 }, { "epoch": 0.9295499021526419, "grad_norm": 3.385772228240967, "learning_rate": 9.82698875785325e-07, "loss": 0.107, "step": 475 }, { "epoch": 0.9295499021526419, "eval_loss": 0.09417176246643066, "eval_runtime": 107.4989, "eval_samples_per_second": 28.391, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8774075619796129, "eval_sts-test_pearson_dot": 0.8590208674593851, "eval_sts-test_pearson_euclidean": 0.9061610232160783, "eval_sts-test_pearson_manhattan": 0.9053388130087265, "eval_sts-test_pearson_max": 0.9061610232160783, "eval_sts-test_spearman_cosine": 0.9025310883936013, "eval_sts-test_spearman_dot": 0.8647318907080943, "eval_sts-test_spearman_euclidean": 0.9027404347943903, "eval_sts-test_spearman_manhattan": 0.9020158671212021, "eval_sts-test_spearman_max": 0.9027404347943903, "step": 475 }, { "epoch": 0.9315068493150684, "grad_norm": 2.921172618865967, "learning_rate": 9.301931168766165e-07, "loss": 0.1442, "step": 476 }, { "epoch": 0.9334637964774951, "grad_norm": 4.05568790435791, "learning_rate": 8.790603792747499e-07, "loss": 0.1841, "step": 477 }, { "epoch": 0.9354207436399217, "grad_norm": 1.999931812286377, "learning_rate": 8.293084036448895e-07, "loss": 0.0436, "step": 478 }, { "epoch": 0.9373776908023483, "grad_norm": 2.164313793182373, "learning_rate": 7.809447216272892e-07, "loss": 0.0908, "step": 479 }, { "epoch": 0.9393346379647749, "grad_norm": 3.318042755126953, "learning_rate": 7.33976654697115e-07, "loss": 0.1233, "step": 480 }, { "epoch": 0.9393346379647749, "eval_loss": 0.09411043673753738, "eval_runtime": 107.5241, "eval_samples_per_second": 28.384, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8774878626138263, "eval_sts-test_pearson_dot": 0.8591304690756056, "eval_sts-test_pearson_euclidean": 0.9062269621782642, "eval_sts-test_pearson_manhattan": 0.9054142728411033, "eval_sts-test_pearson_max": 0.9062269621782642, "eval_sts-test_spearman_cosine": 0.9025523452427622, "eval_sts-test_spearman_dot": 0.8649809762289989, "eval_sts-test_spearman_euclidean": 0.9027271436697569, "eval_sts-test_spearman_manhattan": 0.9021333391823545, "eval_sts-test_spearman_max": 0.9027271436697569, "step": 480 }, { "epoch": 0.9412915851272016, "grad_norm": 3.7038323879241943, "learning_rate": 6.884113130561043e-07, "loss": 0.1676, "step": 481 }, { "epoch": 0.9432485322896281, "grad_norm": 3.2897257804870605, "learning_rate": 6.442555945561923e-07, "loss": 0.1449, "step": 482 }, { "epoch": 0.9452054794520548, "grad_norm": 3.172556161880493, "learning_rate": 6.015161836552764e-07, "loss": 0.1234, "step": 483 }, { "epoch": 0.9471624266144814, "grad_norm": 2.7821803092956543, "learning_rate": 5.601995504053193e-07, "loss": 0.076, "step": 484 }, { "epoch": 0.949119373776908, "grad_norm": 3.796891927719116, "learning_rate": 5.203119494728826e-07, "loss": 0.1369, "step": 485 }, { "epoch": 0.949119373776908, "eval_loss": 0.09407136589288712, "eval_runtime": 107.5103, "eval_samples_per_second": 28.388, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8775371989941387, "eval_sts-test_pearson_dot": 0.8591526007541124, "eval_sts-test_pearson_euclidean": 0.9062640446951766, "eval_sts-test_pearson_manhattan": 0.905456867436393, "eval_sts-test_pearson_max": 0.9062640446951766, "eval_sts-test_spearman_cosine": 0.902595753966312, "eval_sts-test_spearman_dot": 0.8649717127178908, "eval_sts-test_spearman_euclidean": 0.9027548894518198, "eval_sts-test_spearman_manhattan": 0.9021964384609165, "eval_sts-test_spearman_max": 0.9027548894518198, "step": 485 }, { "epoch": 0.9510763209393346, "grad_norm": 3.227461576461792, "learning_rate": 4.818594191922577e-07, "loss": 0.144, "step": 486 }, { "epoch": 0.9530332681017613, "grad_norm": 2.9072837829589844, "learning_rate": 4.448477806513729e-07, "loss": 0.0874, "step": 487 }, { "epoch": 0.9549902152641878, "grad_norm": 4.316675186157227, "learning_rate": 4.0928263681057845e-07, "loss": 0.195, "step": 488 }, { "epoch": 0.9569471624266145, "grad_norm": 3.2540643215179443, "learning_rate": 3.7516937165444025e-07, "loss": 0.1585, "step": 489 }, { "epoch": 0.958904109589041, "grad_norm": 3.0417003631591797, "learning_rate": 3.4251314937669313e-07, "loss": 0.1152, "step": 490 }, { "epoch": 0.958904109589041, "eval_loss": 0.09405405074357986, "eval_runtime": 107.5662, "eval_samples_per_second": 28.373, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8775483547748348, "eval_sts-test_pearson_dot": 0.8591693308199357, "eval_sts-test_pearson_euclidean": 0.9062789492300676, "eval_sts-test_pearson_manhattan": 0.9054771724847893, "eval_sts-test_pearson_max": 0.9062789492300676, "eval_sts-test_spearman_cosine": 0.9026681615072432, "eval_sts-test_spearman_dot": 0.8650296656013927, "eval_sts-test_spearman_euclidean": 0.9028729432793703, "eval_sts-test_spearman_manhattan": 0.9022379228802262, "eval_sts-test_spearman_max": 0.9028729432793703, "step": 490 }, { "epoch": 0.9608610567514677, "grad_norm": 2.9117610454559326, "learning_rate": 3.1131891359847397e-07, "loss": 0.0862, "step": 491 }, { "epoch": 0.9628180039138943, "grad_norm": 4.31304407119751, "learning_rate": 2.8159138661992824e-07, "loss": 0.2244, "step": 492 }, { "epoch": 0.9647749510763209, "grad_norm": 3.088024854660034, "learning_rate": 2.5333506870533484e-07, "loss": 0.0987, "step": 493 }, { "epoch": 0.9667318982387475, "grad_norm": 3.0071840286254883, "learning_rate": 2.2655423740183925e-07, "loss": 0.105, "step": 494 }, { "epoch": 0.9686888454011742, "grad_norm": 3.72670316696167, "learning_rate": 2.0125294689190555e-07, "loss": 0.1777, "step": 495 }, { "epoch": 0.9686888454011742, "eval_loss": 0.09404183179140091, "eval_runtime": 107.5451, "eval_samples_per_second": 28.379, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8775779600592797, "eval_sts-test_pearson_dot": 0.8592464798748846, "eval_sts-test_pearson_euclidean": 0.9062982054856592, "eval_sts-test_pearson_manhattan": 0.905493676973923, "eval_sts-test_pearson_max": 0.9062982054856592, "eval_sts-test_spearman_cosine": 0.9026481576934011, "eval_sts-test_spearman_dot": 0.8650861416932687, "eval_sts-test_spearman_euclidean": 0.9028581753631112, "eval_sts-test_spearman_manhattan": 0.9022116538897895, "eval_sts-test_spearman_max": 0.9028581753631112, "step": 495 }, { "epoch": 0.9706457925636007, "grad_norm": 3.807492256164551, "learning_rate": 1.7743502737957107e-07, "loss": 0.1838, "step": 496 }, { "epoch": 0.9726027397260274, "grad_norm": 3.7314093112945557, "learning_rate": 1.5510408451062552e-07, "loss": 0.1813, "step": 497 }, { "epoch": 0.974559686888454, "grad_norm": 3.0683677196502686, "learning_rate": 1.3426349882676326e-07, "loss": 0.0994, "step": 498 }, { "epoch": 0.9765166340508806, "grad_norm": 3.496849298477173, "learning_rate": 1.1491642525383595e-07, "loss": 0.1576, "step": 499 }, { "epoch": 0.9784735812133072, "grad_norm": 3.3572986125946045, "learning_rate": 9.706579262424243e-08, "loss": 0.1298, "step": 500 }, { "epoch": 0.9784735812133072, "eval_loss": 0.09404946863651276, "eval_runtime": 107.5442, "eval_samples_per_second": 28.379, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8775676935231487, "eval_sts-test_pearson_dot": 0.8592566533355803, "eval_sts-test_pearson_euclidean": 0.9062872900289646, "eval_sts-test_pearson_manhattan": 0.9054889721563202, "eval_sts-test_pearson_max": 0.9062872900289646, "eval_sts-test_spearman_cosine": 0.9026514245354827, "eval_sts-test_spearman_dot": 0.8649815132441356, "eval_sts-test_spearman_euclidean": 0.9028325328903337, "eval_sts-test_spearman_manhattan": 0.9022372963625668, "eval_sts-test_spearman_max": 0.9028325328903337, "step": 500 }, { "epoch": 0.9804305283757339, "grad_norm": 3.594297170639038, "learning_rate": 8.071430323354778e-08, "loss": 0.1884, "step": 501 }, { "epoch": 0.9823874755381604, "grad_norm": 2.890631914138794, "learning_rate": 6.586443243140839e-08, "loss": 0.1032, "step": 502 }, { "epoch": 0.9843444227005871, "grad_norm": 2.5980706214904785, "learning_rate": 5.251842824683717e-08, "loss": 0.1164, "step": 503 }, { "epoch": 0.9863013698630136, "grad_norm": 3.6865243911743164, "learning_rate": 4.067831104789033e-08, "loss": 0.1466, "step": 504 }, { "epoch": 0.9882583170254403, "grad_norm": 4.4932637214660645, "learning_rate": 3.034587323581639e-08, "loss": 0.2192, "step": 505 }, { "epoch": 0.9882583170254403, "eval_loss": 0.09403973817825317, "eval_runtime": 107.6061, "eval_samples_per_second": 28.363, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8775751164978143, "eval_sts-test_pearson_dot": 0.859260290269255, "eval_sts-test_pearson_euclidean": 0.906298388123929, "eval_sts-test_pearson_manhattan": 0.9055001359483166, "eval_sts-test_pearson_max": 0.906298388123929, "eval_sts-test_spearman_cosine": 0.9026775592721354, "eval_sts-test_spearman_dot": 0.8650394661276374, "eval_sts-test_spearman_euclidean": 0.9028324881390722, "eval_sts-test_spearman_manhattan": 0.9022903713585769, "eval_sts-test_spearman_max": 0.9028324881390722, "step": 505 }, { "epoch": 0.9902152641878669, "grad_norm": 2.892927408218384, "learning_rate": 2.1522678973718848e-08, "loss": 0.1302, "step": 506 }, { "epoch": 0.9921722113502935, "grad_norm": 3.516493320465088, "learning_rate": 1.421006394976221e-08, "loss": 0.1371, "step": 507 }, { "epoch": 0.9941291585127201, "grad_norm": 2.9676320552825928, "learning_rate": 8.40913517497377e-09, "loss": 0.1543, "step": 508 }, { "epoch": 0.9960861056751468, "grad_norm": Infinity, "learning_rate": 8.40913517497377e-09, "loss": 0.1084, "step": 509 }, { "epoch": 0.9980430528375733, "grad_norm": 0.0, "learning_rate": 4.120770815659869e-09, "loss": 0.0, "step": 510 }, { "epoch": 0.9980430528375733, "eval_loss": 0.09404201060533524, "eval_runtime": 107.6151, "eval_samples_per_second": 28.36, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8775732893584032, "eval_sts-test_pearson_dot": 0.8592601719833955, "eval_sts-test_pearson_euclidean": 0.9062985160748658, "eval_sts-test_pearson_manhattan": 0.9054996976916896, "eval_sts-test_pearson_max": 0.9062985160748658, "eval_sts-test_spearman_cosine": 0.9026670874769698, "eval_sts-test_spearman_dot": 0.8650449257815273, "eval_sts-test_spearman_euclidean": 0.9028166014412784, "eval_sts-test_spearman_manhattan": 0.9022697410270756, "eval_sts-test_spearman_max": 0.9028166014412784, "step": 510 }, { "epoch": 1.0, "grad_norm": 0.0, "learning_rate": 1.345620060465569e-09, "loss": 0.0, "step": 511 }, { "epoch": 1.0019569471624266, "grad_norm": 0.0, "learning_rate": 8.410302209660437e-11, "loss": 0.0, "step": 512 }, { "epoch": 1.0039138943248533, "grad_norm": 0.0, "learning_rate": 1.999966358932628e-05, "loss": 0.0, "step": 513 }, { "epoch": 1.00587084148728, "grad_norm": 0.0, "learning_rate": 1.9997897495179932e-05, "loss": 0.0, "step": 514 }, { "epoch": 1.0078277886497065, "grad_norm": 0.0, "learning_rate": 1.999461788189681e-05, "loss": 0.0, "step": 515 }, { "epoch": 1.0078277886497065, "eval_loss": 0.09377636015415192, "eval_runtime": 107.6229, "eval_samples_per_second": 28.358, "eval_steps_per_second": 0.223, "eval_sts-test_pearson_cosine": 0.8778303207829065, "eval_sts-test_pearson_dot": 0.8593106541530915, "eval_sts-test_pearson_euclidean": 0.9066522007596656, "eval_sts-test_pearson_manhattan": 0.9058577790072662, "eval_sts-test_pearson_max": 0.9066522007596656, "eval_sts-test_spearman_cosine": 0.9030437588441012, "eval_sts-test_spearman_dot": 0.865145302860828, "eval_sts-test_spearman_euclidean": 0.9032262992393169, "eval_sts-test_spearman_manhattan": 0.9024990017391836, "eval_sts-test_spearman_max": 0.9032262992393169, "step": 515 } ], "logging_steps": 1, "max_steps": 1022, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 103, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 320, "trial_name": null, "trial_params": null }