|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.20156555772994128, |
|
"eval_steps": 5, |
|
"global_step": 103, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0019569471624266144, |
|
"grad_norm": 3.932948112487793, |
|
"learning_rate": 7.8125e-08, |
|
"loss": 0.107, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.003913894324853229, |
|
"grad_norm": 4.482716083526611, |
|
"learning_rate": 1.5625e-07, |
|
"loss": 0.1529, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.005870841487279843, |
|
"grad_norm": 4.672689437866211, |
|
"learning_rate": 2.3437500000000003e-07, |
|
"loss": 0.1874, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.007827788649706457, |
|
"grad_norm": 4.226949214935303, |
|
"learning_rate": 3.125e-07, |
|
"loss": 0.1682, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.009784735812133072, |
|
"grad_norm": 4.327479362487793, |
|
"learning_rate": 3.90625e-07, |
|
"loss": 0.1438, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.009784735812133072, |
|
"eval_loss": 0.1470455378293991, |
|
"eval_runtime": 107.3614, |
|
"eval_samples_per_second": 28.427, |
|
"eval_steps_per_second": 0.224, |
|
"eval_sts-test_pearson_cosine": 0.8861388036460539, |
|
"eval_sts-test_pearson_dot": 0.8769528313548112, |
|
"eval_sts-test_pearson_euclidean": 0.9079831987750276, |
|
"eval_sts-test_pearson_manhattan": 0.9086786527495163, |
|
"eval_sts-test_pearson_max": 0.9086786527495163, |
|
"eval_sts-test_spearman_cosine": 0.9077902566323186, |
|
"eval_sts-test_spearman_dot": 0.8794770733264693, |
|
"eval_sts-test_spearman_euclidean": 0.903967335376697, |
|
"eval_sts-test_spearman_manhattan": 0.9043498244078092, |
|
"eval_sts-test_spearman_max": 0.9077902566323186, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.011741682974559686, |
|
"grad_norm": 5.27250337600708, |
|
"learning_rate": 4.6875000000000006e-07, |
|
"loss": 0.2961, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0136986301369863, |
|
"grad_norm": 5.903276443481445, |
|
"learning_rate": 5.468750000000001e-07, |
|
"loss": 0.3019, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.015655577299412915, |
|
"grad_norm": 4.000335693359375, |
|
"learning_rate": 6.25e-07, |
|
"loss": 0.1184, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.01761252446183953, |
|
"grad_norm": 5.876769065856934, |
|
"learning_rate": 7.03125e-07, |
|
"loss": 0.3176, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.019569471624266144, |
|
"grad_norm": 4.8437933921813965, |
|
"learning_rate": 7.8125e-07, |
|
"loss": 0.2234, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.019569471624266144, |
|
"eval_loss": 0.1467687040567398, |
|
"eval_runtime": 107.2549, |
|
"eval_samples_per_second": 28.456, |
|
"eval_steps_per_second": 0.224, |
|
"eval_sts-test_pearson_cosine": 0.8861409457129842, |
|
"eval_sts-test_pearson_dot": 0.876972814890145, |
|
"eval_sts-test_pearson_euclidean": 0.9080268416052204, |
|
"eval_sts-test_pearson_manhattan": 0.9087444298597203, |
|
"eval_sts-test_pearson_max": 0.9087444298597203, |
|
"eval_sts-test_spearman_cosine": 0.9078342918735278, |
|
"eval_sts-test_spearman_dot": 0.8794190309404447, |
|
"eval_sts-test_spearman_euclidean": 0.9039501508923226, |
|
"eval_sts-test_spearman_manhattan": 0.9044244247605487, |
|
"eval_sts-test_spearman_max": 0.9078342918735278, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.021526418786692758, |
|
"grad_norm": 4.726498603820801, |
|
"learning_rate": 8.59375e-07, |
|
"loss": 0.1881, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.023483365949119372, |
|
"grad_norm": 4.818070411682129, |
|
"learning_rate": 9.375000000000001e-07, |
|
"loss": 0.1593, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.025440313111545987, |
|
"grad_norm": 4.98201322555542, |
|
"learning_rate": 1.0156250000000001e-06, |
|
"loss": 0.1833, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0273972602739726, |
|
"grad_norm": 4.269514560699463, |
|
"learning_rate": 1.0937500000000001e-06, |
|
"loss": 0.1352, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.029354207436399216, |
|
"grad_norm": 6.1525492668151855, |
|
"learning_rate": 1.1718750000000001e-06, |
|
"loss": 0.3143, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.029354207436399216, |
|
"eval_loss": 0.1462097316980362, |
|
"eval_runtime": 107.0721, |
|
"eval_samples_per_second": 28.504, |
|
"eval_steps_per_second": 0.224, |
|
"eval_sts-test_pearson_cosine": 0.8860829119688085, |
|
"eval_sts-test_pearson_dot": 0.8768990080043222, |
|
"eval_sts-test_pearson_euclidean": 0.9080646402781543, |
|
"eval_sts-test_pearson_manhattan": 0.9088063929836994, |
|
"eval_sts-test_pearson_max": 0.9088063929836994, |
|
"eval_sts-test_spearman_cosine": 0.907713597721555, |
|
"eval_sts-test_spearman_dot": 0.8795110842851269, |
|
"eval_sts-test_spearman_euclidean": 0.9040110126078148, |
|
"eval_sts-test_spearman_manhattan": 0.9045081991218733, |
|
"eval_sts-test_spearman_max": 0.907713597721555, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.03131115459882583, |
|
"grad_norm": 4.751354694366455, |
|
"learning_rate": 1.25e-06, |
|
"loss": 0.1583, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.033268101761252444, |
|
"grad_norm": 5.435980319976807, |
|
"learning_rate": 1.328125e-06, |
|
"loss": 0.2015, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.03522504892367906, |
|
"grad_norm": 4.1765851974487305, |
|
"learning_rate": 1.40625e-06, |
|
"loss": 0.1476, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.03718199608610567, |
|
"grad_norm": 4.689794540405273, |
|
"learning_rate": 1.484375e-06, |
|
"loss": 0.1676, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.03913894324853229, |
|
"grad_norm": 4.203744888305664, |
|
"learning_rate": 1.5625e-06, |
|
"loss": 0.1525, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03913894324853229, |
|
"eval_loss": 0.14544810354709625, |
|
"eval_runtime": 107.1845, |
|
"eval_samples_per_second": 28.474, |
|
"eval_steps_per_second": 0.224, |
|
"eval_sts-test_pearson_cosine": 0.8861436293943533, |
|
"eval_sts-test_pearson_dot": 0.8769239163708102, |
|
"eval_sts-test_pearson_euclidean": 0.9082269545633608, |
|
"eval_sts-test_pearson_manhattan": 0.9089828403051001, |
|
"eval_sts-test_pearson_max": 0.9089828403051001, |
|
"eval_sts-test_spearman_cosine": 0.907929343552723, |
|
"eval_sts-test_spearman_dot": 0.8796122221358714, |
|
"eval_sts-test_spearman_euclidean": 0.9043074002120102, |
|
"eval_sts-test_spearman_manhattan": 0.9047217521412333, |
|
"eval_sts-test_spearman_max": 0.907929343552723, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0410958904109589, |
|
"grad_norm": 5.152130603790283, |
|
"learning_rate": 1.640625e-06, |
|
"loss": 0.1717, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.043052837573385516, |
|
"grad_norm": 5.343059062957764, |
|
"learning_rate": 1.71875e-06, |
|
"loss": 0.198, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.04500978473581213, |
|
"grad_norm": 5.224748134613037, |
|
"learning_rate": 1.796875e-06, |
|
"loss": 0.3062, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.046966731898238745, |
|
"grad_norm": 4.6179423332214355, |
|
"learning_rate": 1.8750000000000003e-06, |
|
"loss": 0.1241, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.04892367906066536, |
|
"grad_norm": 4.200148105621338, |
|
"learning_rate": 1.953125e-06, |
|
"loss": 0.1087, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.04892367906066536, |
|
"eval_loss": 0.14457188546657562, |
|
"eval_runtime": 107.3809, |
|
"eval_samples_per_second": 28.422, |
|
"eval_steps_per_second": 0.224, |
|
"eval_sts-test_pearson_cosine": 0.8862905994058754, |
|
"eval_sts-test_pearson_dot": 0.877015249192232, |
|
"eval_sts-test_pearson_euclidean": 0.9085054742522269, |
|
"eval_sts-test_pearson_manhattan": 0.9092575877809899, |
|
"eval_sts-test_pearson_max": 0.9092575877809899, |
|
"eval_sts-test_spearman_cosine": 0.9082294902628751, |
|
"eval_sts-test_spearman_dot": 0.8798810429630494, |
|
"eval_sts-test_spearman_euclidean": 0.9047149499495015, |
|
"eval_sts-test_spearman_manhattan": 0.9051023616193669, |
|
"eval_sts-test_spearman_max": 0.9082294902628751, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.050880626223091974, |
|
"grad_norm": 4.890737533569336, |
|
"learning_rate": 2.0312500000000002e-06, |
|
"loss": 0.1767, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.05283757338551859, |
|
"grad_norm": 4.683767795562744, |
|
"learning_rate": 2.109375e-06, |
|
"loss": 0.1951, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.0547945205479452, |
|
"grad_norm": 4.656280040740967, |
|
"learning_rate": 2.1875000000000002e-06, |
|
"loss": 0.1621, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.05675146771037182, |
|
"grad_norm": 4.446409702301025, |
|
"learning_rate": 2.265625e-06, |
|
"loss": 0.221, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.05870841487279843, |
|
"grad_norm": 5.765133857727051, |
|
"learning_rate": 2.3437500000000002e-06, |
|
"loss": 0.2241, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.05870841487279843, |
|
"eval_loss": 0.14350731670856476, |
|
"eval_runtime": 107.3747, |
|
"eval_samples_per_second": 28.424, |
|
"eval_steps_per_second": 0.224, |
|
"eval_sts-test_pearson_cosine": 0.8863784941826807, |
|
"eval_sts-test_pearson_dot": 0.8768948467465629, |
|
"eval_sts-test_pearson_euclidean": 0.9088066170487232, |
|
"eval_sts-test_pearson_manhattan": 0.9095658568102677, |
|
"eval_sts-test_pearson_max": 0.9095658568102677, |
|
"eval_sts-test_spearman_cosine": 0.9082580415676429, |
|
"eval_sts-test_spearman_dot": 0.8801849487791585, |
|
"eval_sts-test_spearman_euclidean": 0.9051721735871375, |
|
"eval_sts-test_spearman_manhattan": 0.9054862826908437, |
|
"eval_sts-test_spearman_max": 0.9082580415676429, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.060665362035225046, |
|
"grad_norm": 5.359245777130127, |
|
"learning_rate": 2.421875e-06, |
|
"loss": 0.2093, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.06262230919765166, |
|
"grad_norm": 4.439486503601074, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.1615, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.06457925636007827, |
|
"grad_norm": 3.689824342727661, |
|
"learning_rate": 2.5781250000000004e-06, |
|
"loss": 0.1615, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.06653620352250489, |
|
"grad_norm": 4.842885494232178, |
|
"learning_rate": 2.65625e-06, |
|
"loss": 0.1772, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0684931506849315, |
|
"grad_norm": 5.209301948547363, |
|
"learning_rate": 2.7343750000000004e-06, |
|
"loss": 0.2324, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0684931506849315, |
|
"eval_loss": 0.14226235449314117, |
|
"eval_runtime": 107.3108, |
|
"eval_samples_per_second": 28.441, |
|
"eval_steps_per_second": 0.224, |
|
"eval_sts-test_pearson_cosine": 0.8863574366132135, |
|
"eval_sts-test_pearson_dot": 0.8765683077424664, |
|
"eval_sts-test_pearson_euclidean": 0.9091012263251723, |
|
"eval_sts-test_pearson_manhattan": 0.9098631032540263, |
|
"eval_sts-test_pearson_max": 0.9098631032540263, |
|
"eval_sts-test_spearman_cosine": 0.9083728733043733, |
|
"eval_sts-test_spearman_dot": 0.8800282746130272, |
|
"eval_sts-test_spearman_euclidean": 0.9052579170039636, |
|
"eval_sts-test_spearman_manhattan": 0.9059997586640487, |
|
"eval_sts-test_spearman_max": 0.9083728733043733, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.07045009784735812, |
|
"grad_norm": 4.740983009338379, |
|
"learning_rate": 2.8125e-06, |
|
"loss": 0.2611, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.07240704500978473, |
|
"grad_norm": 5.090059757232666, |
|
"learning_rate": 2.8906250000000004e-06, |
|
"loss": 0.214, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.07436399217221135, |
|
"grad_norm": 5.123153209686279, |
|
"learning_rate": 2.96875e-06, |
|
"loss": 0.1985, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.07632093933463796, |
|
"grad_norm": 5.401946067810059, |
|
"learning_rate": 3.0468750000000004e-06, |
|
"loss": 0.1855, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.07827788649706457, |
|
"grad_norm": 4.838700294494629, |
|
"learning_rate": 3.125e-06, |
|
"loss": 0.1234, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.07827788649706457, |
|
"eval_loss": 0.14100149273872375, |
|
"eval_runtime": 107.3059, |
|
"eval_samples_per_second": 28.442, |
|
"eval_steps_per_second": 0.224, |
|
"eval_sts-test_pearson_cosine": 0.8864265749012155, |
|
"eval_sts-test_pearson_dot": 0.8764612424174422, |
|
"eval_sts-test_pearson_euclidean": 0.9094092487009695, |
|
"eval_sts-test_pearson_manhattan": 0.9101707626021143, |
|
"eval_sts-test_pearson_max": 0.9101707626021143, |
|
"eval_sts-test_spearman_cosine": 0.908505695048183, |
|
"eval_sts-test_spearman_dot": 0.8802103674956289, |
|
"eval_sts-test_spearman_euclidean": 0.9054564783507572, |
|
"eval_sts-test_spearman_manhattan": 0.9063046490079084, |
|
"eval_sts-test_spearman_max": 0.908505695048183, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.08023483365949119, |
|
"grad_norm": 3.8856801986694336, |
|
"learning_rate": 3.2031250000000004e-06, |
|
"loss": 0.1492, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.0821917808219178, |
|
"grad_norm": 5.678151607513428, |
|
"learning_rate": 3.28125e-06, |
|
"loss": 0.2022, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.08414872798434442, |
|
"grad_norm": 5.104148864746094, |
|
"learning_rate": 3.3593750000000003e-06, |
|
"loss": 0.2146, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.08610567514677103, |
|
"grad_norm": 4.76043701171875, |
|
"learning_rate": 3.4375e-06, |
|
"loss": 0.1688, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.08806262230919765, |
|
"grad_norm": 5.128803730010986, |
|
"learning_rate": 3.5156250000000003e-06, |
|
"loss": 0.175, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.08806262230919765, |
|
"eval_loss": 0.13962982594966888, |
|
"eval_runtime": 107.4144, |
|
"eval_samples_per_second": 28.413, |
|
"eval_steps_per_second": 0.223, |
|
"eval_sts-test_pearson_cosine": 0.886410908658177, |
|
"eval_sts-test_pearson_dot": 0.8762836795862763, |
|
"eval_sts-test_pearson_euclidean": 0.9096890242379734, |
|
"eval_sts-test_pearson_manhattan": 0.9104590803642174, |
|
"eval_sts-test_pearson_max": 0.9104590803642174, |
|
"eval_sts-test_spearman_cosine": 0.9086694846648755, |
|
"eval_sts-test_spearman_dot": 0.8801346931126159, |
|
"eval_sts-test_spearman_euclidean": 0.9057376952773407, |
|
"eval_sts-test_spearman_manhattan": 0.9064708999439774, |
|
"eval_sts-test_spearman_max": 0.9086694846648755, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.09001956947162426, |
|
"grad_norm": 4.968522548675537, |
|
"learning_rate": 3.59375e-06, |
|
"loss": 0.2123, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.09197651663405088, |
|
"grad_norm": 4.343472957611084, |
|
"learning_rate": 3.6718750000000003e-06, |
|
"loss": 0.1118, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.09393346379647749, |
|
"grad_norm": 6.252938270568848, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 0.3009, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.0958904109589041, |
|
"grad_norm": 3.411029815673828, |
|
"learning_rate": 3.828125000000001e-06, |
|
"loss": 0.1071, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.09784735812133072, |
|
"grad_norm": 5.379226207733154, |
|
"learning_rate": 3.90625e-06, |
|
"loss": 0.2608, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.09784735812133072, |
|
"eval_loss": 0.13823722302913666, |
|
"eval_runtime": 107.3656, |
|
"eval_samples_per_second": 28.426, |
|
"eval_steps_per_second": 0.224, |
|
"eval_sts-test_pearson_cosine": 0.8863074884351817, |
|
"eval_sts-test_pearson_dot": 0.8763122134205692, |
|
"eval_sts-test_pearson_euclidean": 0.9097700018848961, |
|
"eval_sts-test_pearson_manhattan": 0.9105724410858811, |
|
"eval_sts-test_pearson_max": 0.9105724410858811, |
|
"eval_sts-test_spearman_cosine": 0.9085105281844131, |
|
"eval_sts-test_spearman_dot": 0.8801239975611433, |
|
"eval_sts-test_spearman_euclidean": 0.9059798443527296, |
|
"eval_sts-test_spearman_manhattan": 0.9065691737139927, |
|
"eval_sts-test_spearman_max": 0.9085105281844131, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.09980430528375733, |
|
"grad_norm": 4.599095821380615, |
|
"learning_rate": 3.984375e-06, |
|
"loss": 0.1368, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.10176125244618395, |
|
"grad_norm": 5.634761333465576, |
|
"learning_rate": 4.0625000000000005e-06, |
|
"loss": 0.2307, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.10371819960861056, |
|
"grad_norm": 4.678525924682617, |
|
"learning_rate": 4.140625000000001e-06, |
|
"loss": 0.1366, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.10567514677103718, |
|
"grad_norm": 4.931070327758789, |
|
"learning_rate": 4.21875e-06, |
|
"loss": 0.1857, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.10763209393346379, |
|
"grad_norm": 4.903087139129639, |
|
"learning_rate": 4.296875e-06, |
|
"loss": 0.2155, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.10763209393346379, |
|
"eval_loss": 0.1367325782775879, |
|
"eval_runtime": 107.3012, |
|
"eval_samples_per_second": 28.443, |
|
"eval_steps_per_second": 0.224, |
|
"eval_sts-test_pearson_cosine": 0.88603017002284, |
|
"eval_sts-test_pearson_dot": 0.8761626193697236, |
|
"eval_sts-test_pearson_euclidean": 0.9096799681812165, |
|
"eval_sts-test_pearson_manhattan": 0.9104977957475867, |
|
"eval_sts-test_pearson_max": 0.9104977957475867, |
|
"eval_sts-test_spearman_cosine": 0.9084685067499666, |
|
"eval_sts-test_spearman_dot": 0.8802836700617878, |
|
"eval_sts-test_spearman_euclidean": 0.9058409364373706, |
|
"eval_sts-test_spearman_manhattan": 0.9064240006220393, |
|
"eval_sts-test_spearman_max": 0.9084685067499666, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.1095890410958904, |
|
"grad_norm": 5.408311367034912, |
|
"learning_rate": 4.3750000000000005e-06, |
|
"loss": 0.2022, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.11154598825831702, |
|
"grad_norm": 4.5926713943481445, |
|
"learning_rate": 4.453125000000001e-06, |
|
"loss": 0.2076, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.11350293542074363, |
|
"grad_norm": 6.475535869598389, |
|
"learning_rate": 4.53125e-06, |
|
"loss": 0.4133, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.11545988258317025, |
|
"grad_norm": 4.997581481933594, |
|
"learning_rate": 4.609375e-06, |
|
"loss": 0.1823, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.11741682974559686, |
|
"grad_norm": 3.899284601211548, |
|
"learning_rate": 4.6875000000000004e-06, |
|
"loss": 0.1136, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.11741682974559686, |
|
"eval_loss": 0.13528631627559662, |
|
"eval_runtime": 107.3435, |
|
"eval_samples_per_second": 28.432, |
|
"eval_steps_per_second": 0.224, |
|
"eval_sts-test_pearson_cosine": 0.8860224650016398, |
|
"eval_sts-test_pearson_dot": 0.8762739756970772, |
|
"eval_sts-test_pearson_euclidean": 0.9099016820022997, |
|
"eval_sts-test_pearson_manhattan": 0.9107281338135995, |
|
"eval_sts-test_pearson_max": 0.9107281338135995, |
|
"eval_sts-test_spearman_cosine": 0.9087510214631306, |
|
"eval_sts-test_spearman_dot": 0.8808623486228402, |
|
"eval_sts-test_spearman_euclidean": 0.9060555634870038, |
|
"eval_sts-test_spearman_manhattan": 0.9067256241238172, |
|
"eval_sts-test_spearman_max": 0.9087510214631306, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.11937377690802348, |
|
"grad_norm": 4.476404190063477, |
|
"learning_rate": 4.765625000000001e-06, |
|
"loss": 0.1687, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.12133072407045009, |
|
"grad_norm": 4.893277168273926, |
|
"learning_rate": 4.84375e-06, |
|
"loss": 0.1591, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.1232876712328767, |
|
"grad_norm": 4.510354042053223, |
|
"learning_rate": 4.921875e-06, |
|
"loss": 0.1653, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.12524461839530332, |
|
"grad_norm": 4.400285243988037, |
|
"learning_rate": 5e-06, |
|
"loss": 0.1799, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.12720156555772993, |
|
"grad_norm": 4.631839752197266, |
|
"learning_rate": 5.078125000000001e-06, |
|
"loss": 0.1578, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.12720156555772993, |
|
"eval_loss": 0.1336735188961029, |
|
"eval_runtime": 107.4984, |
|
"eval_samples_per_second": 28.391, |
|
"eval_steps_per_second": 0.223, |
|
"eval_sts-test_pearson_cosine": 0.886014179849858, |
|
"eval_sts-test_pearson_dot": 0.8762492282837839, |
|
"eval_sts-test_pearson_euclidean": 0.9101155794045166, |
|
"eval_sts-test_pearson_manhattan": 0.9109538919103571, |
|
"eval_sts-test_pearson_max": 0.9109538919103571, |
|
"eval_sts-test_spearman_cosine": 0.9089514176116413, |
|
"eval_sts-test_spearman_dot": 0.8810853441583534, |
|
"eval_sts-test_spearman_euclidean": 0.9061670836303911, |
|
"eval_sts-test_spearman_manhattan": 0.9072153371772234, |
|
"eval_sts-test_spearman_max": 0.9089514176116413, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.12915851272015655, |
|
"grad_norm": 4.043459415435791, |
|
"learning_rate": 5.156250000000001e-06, |
|
"loss": 0.1844, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.13111545988258316, |
|
"grad_norm": 4.447835922241211, |
|
"learning_rate": 5.234375e-06, |
|
"loss": 0.1489, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.13307240704500978, |
|
"grad_norm": 5.372109889984131, |
|
"learning_rate": 5.3125e-06, |
|
"loss": 0.1845, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.1350293542074364, |
|
"grad_norm": 3.5112483501434326, |
|
"learning_rate": 5.390625000000001e-06, |
|
"loss": 0.1364, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.136986301369863, |
|
"grad_norm": 4.305239200592041, |
|
"learning_rate": 5.468750000000001e-06, |
|
"loss": 0.1584, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.136986301369863, |
|
"eval_loss": 0.1320798397064209, |
|
"eval_runtime": 107.505, |
|
"eval_samples_per_second": 28.389, |
|
"eval_steps_per_second": 0.223, |
|
"eval_sts-test_pearson_cosine": 0.88578311613969, |
|
"eval_sts-test_pearson_dot": 0.875928774505713, |
|
"eval_sts-test_pearson_euclidean": 0.91024619729973, |
|
"eval_sts-test_pearson_manhattan": 0.9110959495329505, |
|
"eval_sts-test_pearson_max": 0.9110959495329505, |
|
"eval_sts-test_spearman_cosine": 0.9086066538938818, |
|
"eval_sts-test_spearman_dot": 0.8801235500485294, |
|
"eval_sts-test_spearman_euclidean": 0.9060052183179386, |
|
"eval_sts-test_spearman_manhattan": 0.907439182986703, |
|
"eval_sts-test_spearman_max": 0.9086066538938818, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.13894324853228962, |
|
"grad_norm": 5.093306064605713, |
|
"learning_rate": 5.546875e-06, |
|
"loss": 0.2279, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.14090019569471623, |
|
"grad_norm": 4.953585147857666, |
|
"learning_rate": 5.625e-06, |
|
"loss": 0.2028, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.14285714285714285, |
|
"grad_norm": 4.1561102867126465, |
|
"learning_rate": 5.7031250000000006e-06, |
|
"loss": 0.2291, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.14481409001956946, |
|
"grad_norm": 5.00941801071167, |
|
"learning_rate": 5.781250000000001e-06, |
|
"loss": 0.2419, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.14677103718199608, |
|
"grad_norm": 3.6476099491119385, |
|
"learning_rate": 5.859375e-06, |
|
"loss": 0.1329, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.14677103718199608, |
|
"eval_loss": 0.13061992824077606, |
|
"eval_runtime": 107.3395, |
|
"eval_samples_per_second": 28.433, |
|
"eval_steps_per_second": 0.224, |
|
"eval_sts-test_pearson_cosine": 0.8854112983780439, |
|
"eval_sts-test_pearson_dot": 0.8752625071185561, |
|
"eval_sts-test_pearson_euclidean": 0.9103378320010516, |
|
"eval_sts-test_pearson_manhattan": 0.9112261622276095, |
|
"eval_sts-test_pearson_max": 0.9112261622276095, |
|
"eval_sts-test_spearman_cosine": 0.9082604133844965, |
|
"eval_sts-test_spearman_dot": 0.8794192099454903, |
|
"eval_sts-test_spearman_euclidean": 0.9060063370994732, |
|
"eval_sts-test_spearman_manhattan": 0.90766132824825, |
|
"eval_sts-test_spearman_max": 0.9082604133844965, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.1487279843444227, |
|
"grad_norm": 4.10636568069458, |
|
"learning_rate": 5.9375e-06, |
|
"loss": 0.204, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.1506849315068493, |
|
"grad_norm": 4.767779350280762, |
|
"learning_rate": 6.0156250000000005e-06, |
|
"loss": 0.2239, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.15264187866927592, |
|
"grad_norm": 5.366302490234375, |
|
"learning_rate": 6.093750000000001e-06, |
|
"loss": 0.2181, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.15459882583170254, |
|
"grad_norm": 4.087960720062256, |
|
"learning_rate": 6.171875e-06, |
|
"loss": 0.1285, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.15655577299412915, |
|
"grad_norm": 3.7557668685913086, |
|
"learning_rate": 6.25e-06, |
|
"loss": 0.1067, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.15655577299412915, |
|
"eval_loss": 0.12924787402153015, |
|
"eval_runtime": 107.2528, |
|
"eval_samples_per_second": 28.456, |
|
"eval_steps_per_second": 0.224, |
|
"eval_sts-test_pearson_cosine": 0.8850894038300653, |
|
"eval_sts-test_pearson_dot": 0.874941916465686, |
|
"eval_sts-test_pearson_euclidean": 0.9101863990952803, |
|
"eval_sts-test_pearson_manhattan": 0.9110826056950171, |
|
"eval_sts-test_pearson_max": 0.9110826056950171, |
|
"eval_sts-test_spearman_cosine": 0.9078700928826409, |
|
"eval_sts-test_spearman_dot": 0.8792947566875607, |
|
"eval_sts-test_spearman_euclidean": 0.9059290069197888, |
|
"eval_sts-test_spearman_manhattan": 0.9075206750336968, |
|
"eval_sts-test_spearman_max": 0.9078700928826409, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.15851272015655576, |
|
"grad_norm": 3.5708839893341064, |
|
"learning_rate": 6.3281250000000005e-06, |
|
"loss": 0.1189, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.16046966731898238, |
|
"grad_norm": 4.602839469909668, |
|
"learning_rate": 6.406250000000001e-06, |
|
"loss": 0.236, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.162426614481409, |
|
"grad_norm": 4.304513931274414, |
|
"learning_rate": 6.484375000000001e-06, |
|
"loss": 0.1584, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.1643835616438356, |
|
"grad_norm": 4.165163516998291, |
|
"learning_rate": 6.5625e-06, |
|
"loss": 0.1925, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.16634050880626222, |
|
"grad_norm": 3.9157192707061768, |
|
"learning_rate": 6.6406250000000005e-06, |
|
"loss": 0.129, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.16634050880626222, |
|
"eval_loss": 0.1278335303068161, |
|
"eval_runtime": 107.1978, |
|
"eval_samples_per_second": 28.471, |
|
"eval_steps_per_second": 0.224, |
|
"eval_sts-test_pearson_cosine": 0.8845993101894516, |
|
"eval_sts-test_pearson_dot": 0.8740701762146532, |
|
"eval_sts-test_pearson_euclidean": 0.9100055922999684, |
|
"eval_sts-test_pearson_manhattan": 0.9108899080028133, |
|
"eval_sts-test_pearson_max": 0.9108899080028133, |
|
"eval_sts-test_spearman_cosine": 0.9078923342595523, |
|
"eval_sts-test_spearman_dot": 0.8788126513485913, |
|
"eval_sts-test_spearman_euclidean": 0.9057257466905491, |
|
"eval_sts-test_spearman_manhattan": 0.9070083178420268, |
|
"eval_sts-test_spearman_max": 0.9078923342595523, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.16829745596868884, |
|
"grad_norm": 4.233823776245117, |
|
"learning_rate": 6.718750000000001e-06, |
|
"loss": 0.1376, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.17025440313111545, |
|
"grad_norm": 4.670790195465088, |
|
"learning_rate": 6.796875000000001e-06, |
|
"loss": 0.1691, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.17221135029354206, |
|
"grad_norm": 3.742030382156372, |
|
"learning_rate": 6.875e-06, |
|
"loss": 0.1045, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.17416829745596868, |
|
"grad_norm": 4.242702960968018, |
|
"learning_rate": 6.9531250000000004e-06, |
|
"loss": 0.165, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.1761252446183953, |
|
"grad_norm": 5.499476909637451, |
|
"learning_rate": 7.031250000000001e-06, |
|
"loss": 0.2926, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1761252446183953, |
|
"eval_loss": 0.12669824063777924, |
|
"eval_runtime": 107.2778, |
|
"eval_samples_per_second": 28.45, |
|
"eval_steps_per_second": 0.224, |
|
"eval_sts-test_pearson_cosine": 0.8844194771150324, |
|
"eval_sts-test_pearson_dot": 0.873458365713796, |
|
"eval_sts-test_pearson_euclidean": 0.9099396625521212, |
|
"eval_sts-test_pearson_manhattan": 0.910745898918033, |
|
"eval_sts-test_pearson_max": 0.910745898918033, |
|
"eval_sts-test_spearman_cosine": 0.907622707909669, |
|
"eval_sts-test_spearman_dot": 0.8783740442356941, |
|
"eval_sts-test_spearman_euclidean": 0.9058808545625318, |
|
"eval_sts-test_spearman_manhattan": 0.906889458491771, |
|
"eval_sts-test_spearman_max": 0.907622707909669, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1780821917808219, |
|
"grad_norm": 2.992021083831787, |
|
"learning_rate": 7.109375000000001e-06, |
|
"loss": 0.1048, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.18003913894324852, |
|
"grad_norm": 4.298286437988281, |
|
"learning_rate": 7.1875e-06, |
|
"loss": 0.1596, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.18199608610567514, |
|
"grad_norm": 5.210509300231934, |
|
"learning_rate": 7.265625e-06, |
|
"loss": 0.2474, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.18395303326810175, |
|
"grad_norm": 4.527407169342041, |
|
"learning_rate": 7.343750000000001e-06, |
|
"loss": 0.1652, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.18590998043052837, |
|
"grad_norm": 5.302050590515137, |
|
"learning_rate": 7.421875000000001e-06, |
|
"loss": 0.2483, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.18590998043052837, |
|
"eval_loss": 0.1252526491880417, |
|
"eval_runtime": 107.5519, |
|
"eval_samples_per_second": 28.377, |
|
"eval_steps_per_second": 0.223, |
|
"eval_sts-test_pearson_cosine": 0.884272350180128, |
|
"eval_sts-test_pearson_dot": 0.8727334938335432, |
|
"eval_sts-test_pearson_euclidean": 0.9099441972021025, |
|
"eval_sts-test_pearson_manhattan": 0.9106991509833859, |
|
"eval_sts-test_pearson_max": 0.9106991509833859, |
|
"eval_sts-test_spearman_cosine": 0.9075948278738224, |
|
"eval_sts-test_spearman_dot": 0.87780624023116, |
|
"eval_sts-test_spearman_euclidean": 0.9060086194138042, |
|
"eval_sts-test_spearman_manhattan": 0.9069788267607697, |
|
"eval_sts-test_spearman_max": 0.9075948278738224, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.18786692759295498, |
|
"grad_norm": 3.690441608428955, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 0.1623, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.1898238747553816, |
|
"grad_norm": 4.585984706878662, |
|
"learning_rate": 7.578125e-06, |
|
"loss": 0.1955, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.1917808219178082, |
|
"grad_norm": 4.493942737579346, |
|
"learning_rate": 7.656250000000001e-06, |
|
"loss": 0.2023, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.19373776908023482, |
|
"grad_norm": 4.569936275482178, |
|
"learning_rate": 7.734375e-06, |
|
"loss": 0.1886, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.19569471624266144, |
|
"grad_norm": 3.7703664302825928, |
|
"learning_rate": 7.8125e-06, |
|
"loss": 0.1284, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.19569471624266144, |
|
"eval_loss": 0.12290485948324203, |
|
"eval_runtime": 107.6958, |
|
"eval_samples_per_second": 28.339, |
|
"eval_steps_per_second": 0.223, |
|
"eval_sts-test_pearson_cosine": 0.8836376979322419, |
|
"eval_sts-test_pearson_dot": 0.8710695777275684, |
|
"eval_sts-test_pearson_euclidean": 0.9098265834859519, |
|
"eval_sts-test_pearson_manhattan": 0.9106248996071287, |
|
"eval_sts-test_pearson_max": 0.9106248996071287, |
|
"eval_sts-test_spearman_cosine": 0.9078868298544011, |
|
"eval_sts-test_spearman_dot": 0.8773200625274038, |
|
"eval_sts-test_spearman_euclidean": 0.9063156130669492, |
|
"eval_sts-test_spearman_manhattan": 0.9071474495136926, |
|
"eval_sts-test_spearman_max": 0.9078868298544011, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.19765166340508805, |
|
"grad_norm": 4.356619358062744, |
|
"learning_rate": 7.890625e-06, |
|
"loss": 0.2005, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.19960861056751467, |
|
"grad_norm": 4.293449878692627, |
|
"learning_rate": 7.96875e-06, |
|
"loss": 0.2301, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.20156555772994128, |
|
"grad_norm": 4.654509544372559, |
|
"learning_rate": 8.046875e-06, |
|
"loss": 0.2249, |
|
"step": 103 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1022, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 103, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 320, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|