bobox's picture
Training in progress, step 103, checkpoint
18b1190 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.20156555772994128,
"eval_steps": 5,
"global_step": 103,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0019569471624266144,
"grad_norm": 3.932948112487793,
"learning_rate": 7.8125e-08,
"loss": 0.107,
"step": 1
},
{
"epoch": 0.003913894324853229,
"grad_norm": 4.482716083526611,
"learning_rate": 1.5625e-07,
"loss": 0.1529,
"step": 2
},
{
"epoch": 0.005870841487279843,
"grad_norm": 4.672689437866211,
"learning_rate": 2.3437500000000003e-07,
"loss": 0.1874,
"step": 3
},
{
"epoch": 0.007827788649706457,
"grad_norm": 4.226949214935303,
"learning_rate": 3.125e-07,
"loss": 0.1682,
"step": 4
},
{
"epoch": 0.009784735812133072,
"grad_norm": 4.327479362487793,
"learning_rate": 3.90625e-07,
"loss": 0.1438,
"step": 5
},
{
"epoch": 0.009784735812133072,
"eval_loss": 0.1470455378293991,
"eval_runtime": 107.3614,
"eval_samples_per_second": 28.427,
"eval_steps_per_second": 0.224,
"eval_sts-test_pearson_cosine": 0.8861388036460539,
"eval_sts-test_pearson_dot": 0.8769528313548112,
"eval_sts-test_pearson_euclidean": 0.9079831987750276,
"eval_sts-test_pearson_manhattan": 0.9086786527495163,
"eval_sts-test_pearson_max": 0.9086786527495163,
"eval_sts-test_spearman_cosine": 0.9077902566323186,
"eval_sts-test_spearman_dot": 0.8794770733264693,
"eval_sts-test_spearman_euclidean": 0.903967335376697,
"eval_sts-test_spearman_manhattan": 0.9043498244078092,
"eval_sts-test_spearman_max": 0.9077902566323186,
"step": 5
},
{
"epoch": 0.011741682974559686,
"grad_norm": 5.27250337600708,
"learning_rate": 4.6875000000000006e-07,
"loss": 0.2961,
"step": 6
},
{
"epoch": 0.0136986301369863,
"grad_norm": 5.903276443481445,
"learning_rate": 5.468750000000001e-07,
"loss": 0.3019,
"step": 7
},
{
"epoch": 0.015655577299412915,
"grad_norm": 4.000335693359375,
"learning_rate": 6.25e-07,
"loss": 0.1184,
"step": 8
},
{
"epoch": 0.01761252446183953,
"grad_norm": 5.876769065856934,
"learning_rate": 7.03125e-07,
"loss": 0.3176,
"step": 9
},
{
"epoch": 0.019569471624266144,
"grad_norm": 4.8437933921813965,
"learning_rate": 7.8125e-07,
"loss": 0.2234,
"step": 10
},
{
"epoch": 0.019569471624266144,
"eval_loss": 0.1467687040567398,
"eval_runtime": 107.2549,
"eval_samples_per_second": 28.456,
"eval_steps_per_second": 0.224,
"eval_sts-test_pearson_cosine": 0.8861409457129842,
"eval_sts-test_pearson_dot": 0.876972814890145,
"eval_sts-test_pearson_euclidean": 0.9080268416052204,
"eval_sts-test_pearson_manhattan": 0.9087444298597203,
"eval_sts-test_pearson_max": 0.9087444298597203,
"eval_sts-test_spearman_cosine": 0.9078342918735278,
"eval_sts-test_spearman_dot": 0.8794190309404447,
"eval_sts-test_spearman_euclidean": 0.9039501508923226,
"eval_sts-test_spearman_manhattan": 0.9044244247605487,
"eval_sts-test_spearman_max": 0.9078342918735278,
"step": 10
},
{
"epoch": 0.021526418786692758,
"grad_norm": 4.726498603820801,
"learning_rate": 8.59375e-07,
"loss": 0.1881,
"step": 11
},
{
"epoch": 0.023483365949119372,
"grad_norm": 4.818070411682129,
"learning_rate": 9.375000000000001e-07,
"loss": 0.1593,
"step": 12
},
{
"epoch": 0.025440313111545987,
"grad_norm": 4.98201322555542,
"learning_rate": 1.0156250000000001e-06,
"loss": 0.1833,
"step": 13
},
{
"epoch": 0.0273972602739726,
"grad_norm": 4.269514560699463,
"learning_rate": 1.0937500000000001e-06,
"loss": 0.1352,
"step": 14
},
{
"epoch": 0.029354207436399216,
"grad_norm": 6.1525492668151855,
"learning_rate": 1.1718750000000001e-06,
"loss": 0.3143,
"step": 15
},
{
"epoch": 0.029354207436399216,
"eval_loss": 0.1462097316980362,
"eval_runtime": 107.0721,
"eval_samples_per_second": 28.504,
"eval_steps_per_second": 0.224,
"eval_sts-test_pearson_cosine": 0.8860829119688085,
"eval_sts-test_pearson_dot": 0.8768990080043222,
"eval_sts-test_pearson_euclidean": 0.9080646402781543,
"eval_sts-test_pearson_manhattan": 0.9088063929836994,
"eval_sts-test_pearson_max": 0.9088063929836994,
"eval_sts-test_spearman_cosine": 0.907713597721555,
"eval_sts-test_spearman_dot": 0.8795110842851269,
"eval_sts-test_spearman_euclidean": 0.9040110126078148,
"eval_sts-test_spearman_manhattan": 0.9045081991218733,
"eval_sts-test_spearman_max": 0.907713597721555,
"step": 15
},
{
"epoch": 0.03131115459882583,
"grad_norm": 4.751354694366455,
"learning_rate": 1.25e-06,
"loss": 0.1583,
"step": 16
},
{
"epoch": 0.033268101761252444,
"grad_norm": 5.435980319976807,
"learning_rate": 1.328125e-06,
"loss": 0.2015,
"step": 17
},
{
"epoch": 0.03522504892367906,
"grad_norm": 4.1765851974487305,
"learning_rate": 1.40625e-06,
"loss": 0.1476,
"step": 18
},
{
"epoch": 0.03718199608610567,
"grad_norm": 4.689794540405273,
"learning_rate": 1.484375e-06,
"loss": 0.1676,
"step": 19
},
{
"epoch": 0.03913894324853229,
"grad_norm": 4.203744888305664,
"learning_rate": 1.5625e-06,
"loss": 0.1525,
"step": 20
},
{
"epoch": 0.03913894324853229,
"eval_loss": 0.14544810354709625,
"eval_runtime": 107.1845,
"eval_samples_per_second": 28.474,
"eval_steps_per_second": 0.224,
"eval_sts-test_pearson_cosine": 0.8861436293943533,
"eval_sts-test_pearson_dot": 0.8769239163708102,
"eval_sts-test_pearson_euclidean": 0.9082269545633608,
"eval_sts-test_pearson_manhattan": 0.9089828403051001,
"eval_sts-test_pearson_max": 0.9089828403051001,
"eval_sts-test_spearman_cosine": 0.907929343552723,
"eval_sts-test_spearman_dot": 0.8796122221358714,
"eval_sts-test_spearman_euclidean": 0.9043074002120102,
"eval_sts-test_spearman_manhattan": 0.9047217521412333,
"eval_sts-test_spearman_max": 0.907929343552723,
"step": 20
},
{
"epoch": 0.0410958904109589,
"grad_norm": 5.152130603790283,
"learning_rate": 1.640625e-06,
"loss": 0.1717,
"step": 21
},
{
"epoch": 0.043052837573385516,
"grad_norm": 5.343059062957764,
"learning_rate": 1.71875e-06,
"loss": 0.198,
"step": 22
},
{
"epoch": 0.04500978473581213,
"grad_norm": 5.224748134613037,
"learning_rate": 1.796875e-06,
"loss": 0.3062,
"step": 23
},
{
"epoch": 0.046966731898238745,
"grad_norm": 4.6179423332214355,
"learning_rate": 1.8750000000000003e-06,
"loss": 0.1241,
"step": 24
},
{
"epoch": 0.04892367906066536,
"grad_norm": 4.200148105621338,
"learning_rate": 1.953125e-06,
"loss": 0.1087,
"step": 25
},
{
"epoch": 0.04892367906066536,
"eval_loss": 0.14457188546657562,
"eval_runtime": 107.3809,
"eval_samples_per_second": 28.422,
"eval_steps_per_second": 0.224,
"eval_sts-test_pearson_cosine": 0.8862905994058754,
"eval_sts-test_pearson_dot": 0.877015249192232,
"eval_sts-test_pearson_euclidean": 0.9085054742522269,
"eval_sts-test_pearson_manhattan": 0.9092575877809899,
"eval_sts-test_pearson_max": 0.9092575877809899,
"eval_sts-test_spearman_cosine": 0.9082294902628751,
"eval_sts-test_spearman_dot": 0.8798810429630494,
"eval_sts-test_spearman_euclidean": 0.9047149499495015,
"eval_sts-test_spearman_manhattan": 0.9051023616193669,
"eval_sts-test_spearman_max": 0.9082294902628751,
"step": 25
},
{
"epoch": 0.050880626223091974,
"grad_norm": 4.890737533569336,
"learning_rate": 2.0312500000000002e-06,
"loss": 0.1767,
"step": 26
},
{
"epoch": 0.05283757338551859,
"grad_norm": 4.683767795562744,
"learning_rate": 2.109375e-06,
"loss": 0.1951,
"step": 27
},
{
"epoch": 0.0547945205479452,
"grad_norm": 4.656280040740967,
"learning_rate": 2.1875000000000002e-06,
"loss": 0.1621,
"step": 28
},
{
"epoch": 0.05675146771037182,
"grad_norm": 4.446409702301025,
"learning_rate": 2.265625e-06,
"loss": 0.221,
"step": 29
},
{
"epoch": 0.05870841487279843,
"grad_norm": 5.765133857727051,
"learning_rate": 2.3437500000000002e-06,
"loss": 0.2241,
"step": 30
},
{
"epoch": 0.05870841487279843,
"eval_loss": 0.14350731670856476,
"eval_runtime": 107.3747,
"eval_samples_per_second": 28.424,
"eval_steps_per_second": 0.224,
"eval_sts-test_pearson_cosine": 0.8863784941826807,
"eval_sts-test_pearson_dot": 0.8768948467465629,
"eval_sts-test_pearson_euclidean": 0.9088066170487232,
"eval_sts-test_pearson_manhattan": 0.9095658568102677,
"eval_sts-test_pearson_max": 0.9095658568102677,
"eval_sts-test_spearman_cosine": 0.9082580415676429,
"eval_sts-test_spearman_dot": 0.8801849487791585,
"eval_sts-test_spearman_euclidean": 0.9051721735871375,
"eval_sts-test_spearman_manhattan": 0.9054862826908437,
"eval_sts-test_spearman_max": 0.9082580415676429,
"step": 30
},
{
"epoch": 0.060665362035225046,
"grad_norm": 5.359245777130127,
"learning_rate": 2.421875e-06,
"loss": 0.2093,
"step": 31
},
{
"epoch": 0.06262230919765166,
"grad_norm": 4.439486503601074,
"learning_rate": 2.5e-06,
"loss": 0.1615,
"step": 32
},
{
"epoch": 0.06457925636007827,
"grad_norm": 3.689824342727661,
"learning_rate": 2.5781250000000004e-06,
"loss": 0.1615,
"step": 33
},
{
"epoch": 0.06653620352250489,
"grad_norm": 4.842885494232178,
"learning_rate": 2.65625e-06,
"loss": 0.1772,
"step": 34
},
{
"epoch": 0.0684931506849315,
"grad_norm": 5.209301948547363,
"learning_rate": 2.7343750000000004e-06,
"loss": 0.2324,
"step": 35
},
{
"epoch": 0.0684931506849315,
"eval_loss": 0.14226235449314117,
"eval_runtime": 107.3108,
"eval_samples_per_second": 28.441,
"eval_steps_per_second": 0.224,
"eval_sts-test_pearson_cosine": 0.8863574366132135,
"eval_sts-test_pearson_dot": 0.8765683077424664,
"eval_sts-test_pearson_euclidean": 0.9091012263251723,
"eval_sts-test_pearson_manhattan": 0.9098631032540263,
"eval_sts-test_pearson_max": 0.9098631032540263,
"eval_sts-test_spearman_cosine": 0.9083728733043733,
"eval_sts-test_spearman_dot": 0.8800282746130272,
"eval_sts-test_spearman_euclidean": 0.9052579170039636,
"eval_sts-test_spearman_manhattan": 0.9059997586640487,
"eval_sts-test_spearman_max": 0.9083728733043733,
"step": 35
},
{
"epoch": 0.07045009784735812,
"grad_norm": 4.740983009338379,
"learning_rate": 2.8125e-06,
"loss": 0.2611,
"step": 36
},
{
"epoch": 0.07240704500978473,
"grad_norm": 5.090059757232666,
"learning_rate": 2.8906250000000004e-06,
"loss": 0.214,
"step": 37
},
{
"epoch": 0.07436399217221135,
"grad_norm": 5.123153209686279,
"learning_rate": 2.96875e-06,
"loss": 0.1985,
"step": 38
},
{
"epoch": 0.07632093933463796,
"grad_norm": 5.401946067810059,
"learning_rate": 3.0468750000000004e-06,
"loss": 0.1855,
"step": 39
},
{
"epoch": 0.07827788649706457,
"grad_norm": 4.838700294494629,
"learning_rate": 3.125e-06,
"loss": 0.1234,
"step": 40
},
{
"epoch": 0.07827788649706457,
"eval_loss": 0.14100149273872375,
"eval_runtime": 107.3059,
"eval_samples_per_second": 28.442,
"eval_steps_per_second": 0.224,
"eval_sts-test_pearson_cosine": 0.8864265749012155,
"eval_sts-test_pearson_dot": 0.8764612424174422,
"eval_sts-test_pearson_euclidean": 0.9094092487009695,
"eval_sts-test_pearson_manhattan": 0.9101707626021143,
"eval_sts-test_pearson_max": 0.9101707626021143,
"eval_sts-test_spearman_cosine": 0.908505695048183,
"eval_sts-test_spearman_dot": 0.8802103674956289,
"eval_sts-test_spearman_euclidean": 0.9054564783507572,
"eval_sts-test_spearman_manhattan": 0.9063046490079084,
"eval_sts-test_spearman_max": 0.908505695048183,
"step": 40
},
{
"epoch": 0.08023483365949119,
"grad_norm": 3.8856801986694336,
"learning_rate": 3.2031250000000004e-06,
"loss": 0.1492,
"step": 41
},
{
"epoch": 0.0821917808219178,
"grad_norm": 5.678151607513428,
"learning_rate": 3.28125e-06,
"loss": 0.2022,
"step": 42
},
{
"epoch": 0.08414872798434442,
"grad_norm": 5.104148864746094,
"learning_rate": 3.3593750000000003e-06,
"loss": 0.2146,
"step": 43
},
{
"epoch": 0.08610567514677103,
"grad_norm": 4.76043701171875,
"learning_rate": 3.4375e-06,
"loss": 0.1688,
"step": 44
},
{
"epoch": 0.08806262230919765,
"grad_norm": 5.128803730010986,
"learning_rate": 3.5156250000000003e-06,
"loss": 0.175,
"step": 45
},
{
"epoch": 0.08806262230919765,
"eval_loss": 0.13962982594966888,
"eval_runtime": 107.4144,
"eval_samples_per_second": 28.413,
"eval_steps_per_second": 0.223,
"eval_sts-test_pearson_cosine": 0.886410908658177,
"eval_sts-test_pearson_dot": 0.8762836795862763,
"eval_sts-test_pearson_euclidean": 0.9096890242379734,
"eval_sts-test_pearson_manhattan": 0.9104590803642174,
"eval_sts-test_pearson_max": 0.9104590803642174,
"eval_sts-test_spearman_cosine": 0.9086694846648755,
"eval_sts-test_spearman_dot": 0.8801346931126159,
"eval_sts-test_spearman_euclidean": 0.9057376952773407,
"eval_sts-test_spearman_manhattan": 0.9064708999439774,
"eval_sts-test_spearman_max": 0.9086694846648755,
"step": 45
},
{
"epoch": 0.09001956947162426,
"grad_norm": 4.968522548675537,
"learning_rate": 3.59375e-06,
"loss": 0.2123,
"step": 46
},
{
"epoch": 0.09197651663405088,
"grad_norm": 4.343472957611084,
"learning_rate": 3.6718750000000003e-06,
"loss": 0.1118,
"step": 47
},
{
"epoch": 0.09393346379647749,
"grad_norm": 6.252938270568848,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.3009,
"step": 48
},
{
"epoch": 0.0958904109589041,
"grad_norm": 3.411029815673828,
"learning_rate": 3.828125000000001e-06,
"loss": 0.1071,
"step": 49
},
{
"epoch": 0.09784735812133072,
"grad_norm": 5.379226207733154,
"learning_rate": 3.90625e-06,
"loss": 0.2608,
"step": 50
},
{
"epoch": 0.09784735812133072,
"eval_loss": 0.13823722302913666,
"eval_runtime": 107.3656,
"eval_samples_per_second": 28.426,
"eval_steps_per_second": 0.224,
"eval_sts-test_pearson_cosine": 0.8863074884351817,
"eval_sts-test_pearson_dot": 0.8763122134205692,
"eval_sts-test_pearson_euclidean": 0.9097700018848961,
"eval_sts-test_pearson_manhattan": 0.9105724410858811,
"eval_sts-test_pearson_max": 0.9105724410858811,
"eval_sts-test_spearman_cosine": 0.9085105281844131,
"eval_sts-test_spearman_dot": 0.8801239975611433,
"eval_sts-test_spearman_euclidean": 0.9059798443527296,
"eval_sts-test_spearman_manhattan": 0.9065691737139927,
"eval_sts-test_spearman_max": 0.9085105281844131,
"step": 50
},
{
"epoch": 0.09980430528375733,
"grad_norm": 4.599095821380615,
"learning_rate": 3.984375e-06,
"loss": 0.1368,
"step": 51
},
{
"epoch": 0.10176125244618395,
"grad_norm": 5.634761333465576,
"learning_rate": 4.0625000000000005e-06,
"loss": 0.2307,
"step": 52
},
{
"epoch": 0.10371819960861056,
"grad_norm": 4.678525924682617,
"learning_rate": 4.140625000000001e-06,
"loss": 0.1366,
"step": 53
},
{
"epoch": 0.10567514677103718,
"grad_norm": 4.931070327758789,
"learning_rate": 4.21875e-06,
"loss": 0.1857,
"step": 54
},
{
"epoch": 0.10763209393346379,
"grad_norm": 4.903087139129639,
"learning_rate": 4.296875e-06,
"loss": 0.2155,
"step": 55
},
{
"epoch": 0.10763209393346379,
"eval_loss": 0.1367325782775879,
"eval_runtime": 107.3012,
"eval_samples_per_second": 28.443,
"eval_steps_per_second": 0.224,
"eval_sts-test_pearson_cosine": 0.88603017002284,
"eval_sts-test_pearson_dot": 0.8761626193697236,
"eval_sts-test_pearson_euclidean": 0.9096799681812165,
"eval_sts-test_pearson_manhattan": 0.9104977957475867,
"eval_sts-test_pearson_max": 0.9104977957475867,
"eval_sts-test_spearman_cosine": 0.9084685067499666,
"eval_sts-test_spearman_dot": 0.8802836700617878,
"eval_sts-test_spearman_euclidean": 0.9058409364373706,
"eval_sts-test_spearman_manhattan": 0.9064240006220393,
"eval_sts-test_spearman_max": 0.9084685067499666,
"step": 55
},
{
"epoch": 0.1095890410958904,
"grad_norm": 5.408311367034912,
"learning_rate": 4.3750000000000005e-06,
"loss": 0.2022,
"step": 56
},
{
"epoch": 0.11154598825831702,
"grad_norm": 4.5926713943481445,
"learning_rate": 4.453125000000001e-06,
"loss": 0.2076,
"step": 57
},
{
"epoch": 0.11350293542074363,
"grad_norm": 6.475535869598389,
"learning_rate": 4.53125e-06,
"loss": 0.4133,
"step": 58
},
{
"epoch": 0.11545988258317025,
"grad_norm": 4.997581481933594,
"learning_rate": 4.609375e-06,
"loss": 0.1823,
"step": 59
},
{
"epoch": 0.11741682974559686,
"grad_norm": 3.899284601211548,
"learning_rate": 4.6875000000000004e-06,
"loss": 0.1136,
"step": 60
},
{
"epoch": 0.11741682974559686,
"eval_loss": 0.13528631627559662,
"eval_runtime": 107.3435,
"eval_samples_per_second": 28.432,
"eval_steps_per_second": 0.224,
"eval_sts-test_pearson_cosine": 0.8860224650016398,
"eval_sts-test_pearson_dot": 0.8762739756970772,
"eval_sts-test_pearson_euclidean": 0.9099016820022997,
"eval_sts-test_pearson_manhattan": 0.9107281338135995,
"eval_sts-test_pearson_max": 0.9107281338135995,
"eval_sts-test_spearman_cosine": 0.9087510214631306,
"eval_sts-test_spearman_dot": 0.8808623486228402,
"eval_sts-test_spearman_euclidean": 0.9060555634870038,
"eval_sts-test_spearman_manhattan": 0.9067256241238172,
"eval_sts-test_spearman_max": 0.9087510214631306,
"step": 60
},
{
"epoch": 0.11937377690802348,
"grad_norm": 4.476404190063477,
"learning_rate": 4.765625000000001e-06,
"loss": 0.1687,
"step": 61
},
{
"epoch": 0.12133072407045009,
"grad_norm": 4.893277168273926,
"learning_rate": 4.84375e-06,
"loss": 0.1591,
"step": 62
},
{
"epoch": 0.1232876712328767,
"grad_norm": 4.510354042053223,
"learning_rate": 4.921875e-06,
"loss": 0.1653,
"step": 63
},
{
"epoch": 0.12524461839530332,
"grad_norm": 4.400285243988037,
"learning_rate": 5e-06,
"loss": 0.1799,
"step": 64
},
{
"epoch": 0.12720156555772993,
"grad_norm": 4.631839752197266,
"learning_rate": 5.078125000000001e-06,
"loss": 0.1578,
"step": 65
},
{
"epoch": 0.12720156555772993,
"eval_loss": 0.1336735188961029,
"eval_runtime": 107.4984,
"eval_samples_per_second": 28.391,
"eval_steps_per_second": 0.223,
"eval_sts-test_pearson_cosine": 0.886014179849858,
"eval_sts-test_pearson_dot": 0.8762492282837839,
"eval_sts-test_pearson_euclidean": 0.9101155794045166,
"eval_sts-test_pearson_manhattan": 0.9109538919103571,
"eval_sts-test_pearson_max": 0.9109538919103571,
"eval_sts-test_spearman_cosine": 0.9089514176116413,
"eval_sts-test_spearman_dot": 0.8810853441583534,
"eval_sts-test_spearman_euclidean": 0.9061670836303911,
"eval_sts-test_spearman_manhattan": 0.9072153371772234,
"eval_sts-test_spearman_max": 0.9089514176116413,
"step": 65
},
{
"epoch": 0.12915851272015655,
"grad_norm": 4.043459415435791,
"learning_rate": 5.156250000000001e-06,
"loss": 0.1844,
"step": 66
},
{
"epoch": 0.13111545988258316,
"grad_norm": 4.447835922241211,
"learning_rate": 5.234375e-06,
"loss": 0.1489,
"step": 67
},
{
"epoch": 0.13307240704500978,
"grad_norm": 5.372109889984131,
"learning_rate": 5.3125e-06,
"loss": 0.1845,
"step": 68
},
{
"epoch": 0.1350293542074364,
"grad_norm": 3.5112483501434326,
"learning_rate": 5.390625000000001e-06,
"loss": 0.1364,
"step": 69
},
{
"epoch": 0.136986301369863,
"grad_norm": 4.305239200592041,
"learning_rate": 5.468750000000001e-06,
"loss": 0.1584,
"step": 70
},
{
"epoch": 0.136986301369863,
"eval_loss": 0.1320798397064209,
"eval_runtime": 107.505,
"eval_samples_per_second": 28.389,
"eval_steps_per_second": 0.223,
"eval_sts-test_pearson_cosine": 0.88578311613969,
"eval_sts-test_pearson_dot": 0.875928774505713,
"eval_sts-test_pearson_euclidean": 0.91024619729973,
"eval_sts-test_pearson_manhattan": 0.9110959495329505,
"eval_sts-test_pearson_max": 0.9110959495329505,
"eval_sts-test_spearman_cosine": 0.9086066538938818,
"eval_sts-test_spearman_dot": 0.8801235500485294,
"eval_sts-test_spearman_euclidean": 0.9060052183179386,
"eval_sts-test_spearman_manhattan": 0.907439182986703,
"eval_sts-test_spearman_max": 0.9086066538938818,
"step": 70
},
{
"epoch": 0.13894324853228962,
"grad_norm": 5.093306064605713,
"learning_rate": 5.546875e-06,
"loss": 0.2279,
"step": 71
},
{
"epoch": 0.14090019569471623,
"grad_norm": 4.953585147857666,
"learning_rate": 5.625e-06,
"loss": 0.2028,
"step": 72
},
{
"epoch": 0.14285714285714285,
"grad_norm": 4.1561102867126465,
"learning_rate": 5.7031250000000006e-06,
"loss": 0.2291,
"step": 73
},
{
"epoch": 0.14481409001956946,
"grad_norm": 5.00941801071167,
"learning_rate": 5.781250000000001e-06,
"loss": 0.2419,
"step": 74
},
{
"epoch": 0.14677103718199608,
"grad_norm": 3.6476099491119385,
"learning_rate": 5.859375e-06,
"loss": 0.1329,
"step": 75
},
{
"epoch": 0.14677103718199608,
"eval_loss": 0.13061992824077606,
"eval_runtime": 107.3395,
"eval_samples_per_second": 28.433,
"eval_steps_per_second": 0.224,
"eval_sts-test_pearson_cosine": 0.8854112983780439,
"eval_sts-test_pearson_dot": 0.8752625071185561,
"eval_sts-test_pearson_euclidean": 0.9103378320010516,
"eval_sts-test_pearson_manhattan": 0.9112261622276095,
"eval_sts-test_pearson_max": 0.9112261622276095,
"eval_sts-test_spearman_cosine": 0.9082604133844965,
"eval_sts-test_spearman_dot": 0.8794192099454903,
"eval_sts-test_spearman_euclidean": 0.9060063370994732,
"eval_sts-test_spearman_manhattan": 0.90766132824825,
"eval_sts-test_spearman_max": 0.9082604133844965,
"step": 75
},
{
"epoch": 0.1487279843444227,
"grad_norm": 4.10636568069458,
"learning_rate": 5.9375e-06,
"loss": 0.204,
"step": 76
},
{
"epoch": 0.1506849315068493,
"grad_norm": 4.767779350280762,
"learning_rate": 6.0156250000000005e-06,
"loss": 0.2239,
"step": 77
},
{
"epoch": 0.15264187866927592,
"grad_norm": 5.366302490234375,
"learning_rate": 6.093750000000001e-06,
"loss": 0.2181,
"step": 78
},
{
"epoch": 0.15459882583170254,
"grad_norm": 4.087960720062256,
"learning_rate": 6.171875e-06,
"loss": 0.1285,
"step": 79
},
{
"epoch": 0.15655577299412915,
"grad_norm": 3.7557668685913086,
"learning_rate": 6.25e-06,
"loss": 0.1067,
"step": 80
},
{
"epoch": 0.15655577299412915,
"eval_loss": 0.12924787402153015,
"eval_runtime": 107.2528,
"eval_samples_per_second": 28.456,
"eval_steps_per_second": 0.224,
"eval_sts-test_pearson_cosine": 0.8850894038300653,
"eval_sts-test_pearson_dot": 0.874941916465686,
"eval_sts-test_pearson_euclidean": 0.9101863990952803,
"eval_sts-test_pearson_manhattan": 0.9110826056950171,
"eval_sts-test_pearson_max": 0.9110826056950171,
"eval_sts-test_spearman_cosine": 0.9078700928826409,
"eval_sts-test_spearman_dot": 0.8792947566875607,
"eval_sts-test_spearman_euclidean": 0.9059290069197888,
"eval_sts-test_spearman_manhattan": 0.9075206750336968,
"eval_sts-test_spearman_max": 0.9078700928826409,
"step": 80
},
{
"epoch": 0.15851272015655576,
"grad_norm": 3.5708839893341064,
"learning_rate": 6.3281250000000005e-06,
"loss": 0.1189,
"step": 81
},
{
"epoch": 0.16046966731898238,
"grad_norm": 4.602839469909668,
"learning_rate": 6.406250000000001e-06,
"loss": 0.236,
"step": 82
},
{
"epoch": 0.162426614481409,
"grad_norm": 4.304513931274414,
"learning_rate": 6.484375000000001e-06,
"loss": 0.1584,
"step": 83
},
{
"epoch": 0.1643835616438356,
"grad_norm": 4.165163516998291,
"learning_rate": 6.5625e-06,
"loss": 0.1925,
"step": 84
},
{
"epoch": 0.16634050880626222,
"grad_norm": 3.9157192707061768,
"learning_rate": 6.6406250000000005e-06,
"loss": 0.129,
"step": 85
},
{
"epoch": 0.16634050880626222,
"eval_loss": 0.1278335303068161,
"eval_runtime": 107.1978,
"eval_samples_per_second": 28.471,
"eval_steps_per_second": 0.224,
"eval_sts-test_pearson_cosine": 0.8845993101894516,
"eval_sts-test_pearson_dot": 0.8740701762146532,
"eval_sts-test_pearson_euclidean": 0.9100055922999684,
"eval_sts-test_pearson_manhattan": 0.9108899080028133,
"eval_sts-test_pearson_max": 0.9108899080028133,
"eval_sts-test_spearman_cosine": 0.9078923342595523,
"eval_sts-test_spearman_dot": 0.8788126513485913,
"eval_sts-test_spearman_euclidean": 0.9057257466905491,
"eval_sts-test_spearman_manhattan": 0.9070083178420268,
"eval_sts-test_spearman_max": 0.9078923342595523,
"step": 85
},
{
"epoch": 0.16829745596868884,
"grad_norm": 4.233823776245117,
"learning_rate": 6.718750000000001e-06,
"loss": 0.1376,
"step": 86
},
{
"epoch": 0.17025440313111545,
"grad_norm": 4.670790195465088,
"learning_rate": 6.796875000000001e-06,
"loss": 0.1691,
"step": 87
},
{
"epoch": 0.17221135029354206,
"grad_norm": 3.742030382156372,
"learning_rate": 6.875e-06,
"loss": 0.1045,
"step": 88
},
{
"epoch": 0.17416829745596868,
"grad_norm": 4.242702960968018,
"learning_rate": 6.9531250000000004e-06,
"loss": 0.165,
"step": 89
},
{
"epoch": 0.1761252446183953,
"grad_norm": 5.499476909637451,
"learning_rate": 7.031250000000001e-06,
"loss": 0.2926,
"step": 90
},
{
"epoch": 0.1761252446183953,
"eval_loss": 0.12669824063777924,
"eval_runtime": 107.2778,
"eval_samples_per_second": 28.45,
"eval_steps_per_second": 0.224,
"eval_sts-test_pearson_cosine": 0.8844194771150324,
"eval_sts-test_pearson_dot": 0.873458365713796,
"eval_sts-test_pearson_euclidean": 0.9099396625521212,
"eval_sts-test_pearson_manhattan": 0.910745898918033,
"eval_sts-test_pearson_max": 0.910745898918033,
"eval_sts-test_spearman_cosine": 0.907622707909669,
"eval_sts-test_spearman_dot": 0.8783740442356941,
"eval_sts-test_spearman_euclidean": 0.9058808545625318,
"eval_sts-test_spearman_manhattan": 0.906889458491771,
"eval_sts-test_spearman_max": 0.907622707909669,
"step": 90
},
{
"epoch": 0.1780821917808219,
"grad_norm": 2.992021083831787,
"learning_rate": 7.109375000000001e-06,
"loss": 0.1048,
"step": 91
},
{
"epoch": 0.18003913894324852,
"grad_norm": 4.298286437988281,
"learning_rate": 7.1875e-06,
"loss": 0.1596,
"step": 92
},
{
"epoch": 0.18199608610567514,
"grad_norm": 5.210509300231934,
"learning_rate": 7.265625e-06,
"loss": 0.2474,
"step": 93
},
{
"epoch": 0.18395303326810175,
"grad_norm": 4.527407169342041,
"learning_rate": 7.343750000000001e-06,
"loss": 0.1652,
"step": 94
},
{
"epoch": 0.18590998043052837,
"grad_norm": 5.302050590515137,
"learning_rate": 7.421875000000001e-06,
"loss": 0.2483,
"step": 95
},
{
"epoch": 0.18590998043052837,
"eval_loss": 0.1252526491880417,
"eval_runtime": 107.5519,
"eval_samples_per_second": 28.377,
"eval_steps_per_second": 0.223,
"eval_sts-test_pearson_cosine": 0.884272350180128,
"eval_sts-test_pearson_dot": 0.8727334938335432,
"eval_sts-test_pearson_euclidean": 0.9099441972021025,
"eval_sts-test_pearson_manhattan": 0.9106991509833859,
"eval_sts-test_pearson_max": 0.9106991509833859,
"eval_sts-test_spearman_cosine": 0.9075948278738224,
"eval_sts-test_spearman_dot": 0.87780624023116,
"eval_sts-test_spearman_euclidean": 0.9060086194138042,
"eval_sts-test_spearman_manhattan": 0.9069788267607697,
"eval_sts-test_spearman_max": 0.9075948278738224,
"step": 95
},
{
"epoch": 0.18786692759295498,
"grad_norm": 3.690441608428955,
"learning_rate": 7.500000000000001e-06,
"loss": 0.1623,
"step": 96
},
{
"epoch": 0.1898238747553816,
"grad_norm": 4.585984706878662,
"learning_rate": 7.578125e-06,
"loss": 0.1955,
"step": 97
},
{
"epoch": 0.1917808219178082,
"grad_norm": 4.493942737579346,
"learning_rate": 7.656250000000001e-06,
"loss": 0.2023,
"step": 98
},
{
"epoch": 0.19373776908023482,
"grad_norm": 4.569936275482178,
"learning_rate": 7.734375e-06,
"loss": 0.1886,
"step": 99
},
{
"epoch": 0.19569471624266144,
"grad_norm": 3.7703664302825928,
"learning_rate": 7.8125e-06,
"loss": 0.1284,
"step": 100
},
{
"epoch": 0.19569471624266144,
"eval_loss": 0.12290485948324203,
"eval_runtime": 107.6958,
"eval_samples_per_second": 28.339,
"eval_steps_per_second": 0.223,
"eval_sts-test_pearson_cosine": 0.8836376979322419,
"eval_sts-test_pearson_dot": 0.8710695777275684,
"eval_sts-test_pearson_euclidean": 0.9098265834859519,
"eval_sts-test_pearson_manhattan": 0.9106248996071287,
"eval_sts-test_pearson_max": 0.9106248996071287,
"eval_sts-test_spearman_cosine": 0.9078868298544011,
"eval_sts-test_spearman_dot": 0.8773200625274038,
"eval_sts-test_spearman_euclidean": 0.9063156130669492,
"eval_sts-test_spearman_manhattan": 0.9071474495136926,
"eval_sts-test_spearman_max": 0.9078868298544011,
"step": 100
},
{
"epoch": 0.19765166340508805,
"grad_norm": 4.356619358062744,
"learning_rate": 7.890625e-06,
"loss": 0.2005,
"step": 101
},
{
"epoch": 0.19960861056751467,
"grad_norm": 4.293449878692627,
"learning_rate": 7.96875e-06,
"loss": 0.2301,
"step": 102
},
{
"epoch": 0.20156555772994128,
"grad_norm": 4.654509544372559,
"learning_rate": 8.046875e-06,
"loss": 0.2249,
"step": 103
}
],
"logging_steps": 1,
"max_steps": 1022,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 103,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 320,
"trial_name": null,
"trial_params": null
}