{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8042328042328042, "eval_steps": 5, "global_step": 152, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005291005291005291, "grad_norm": 4.4978814125061035, "learning_rate": 3.684210526315789e-07, "loss": 0.3768, "step": 1 }, { "epoch": 0.010582010582010581, "grad_norm": 4.62701416015625, "learning_rate": 7.368421052631578e-07, "loss": 0.3162, "step": 2 }, { "epoch": 0.015873015873015872, "grad_norm": 4.304025650024414, "learning_rate": 1.1052631578947367e-06, "loss": 0.275, "step": 3 }, { "epoch": 0.021164021164021163, "grad_norm": 4.237359046936035, "learning_rate": 1.4736842105263156e-06, "loss": 0.293, "step": 4 }, { "epoch": 0.026455026455026454, "grad_norm": 3.979759693145752, "learning_rate": 1.8421052631578944e-06, "loss": 0.2437, "step": 5 }, { "epoch": 0.026455026455026454, "eval_loss": 0.2190357893705368, "eval_runtime": 116.0092, "eval_samples_per_second": 26.308, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8862434303901607, "eval_sts-test_pearson_dot": 0.8769941125545049, "eval_sts-test_pearson_euclidean": 0.9081330515608292, "eval_sts-test_pearson_manhattan": 0.9088207208262792, "eval_sts-test_pearson_max": 0.9088207208262792, "eval_sts-test_spearman_cosine": 0.9078630669346023, "eval_sts-test_spearman_dot": 0.8793787995564538, "eval_sts-test_spearman_euclidean": 0.9040411749579927, "eval_sts-test_spearman_manhattan": 0.9044944604846262, "eval_sts-test_spearman_max": 0.9078630669346023, "step": 5 }, { "epoch": 0.031746031746031744, "grad_norm": 4.23664665222168, "learning_rate": 2.2105263157894734e-06, "loss": 0.3681, "step": 6 }, { "epoch": 0.037037037037037035, "grad_norm": 3.7219350337982178, "learning_rate": 2.5789473684210523e-06, "loss": 0.2314, "step": 7 }, { "epoch": 0.042328042328042326, "grad_norm": 4.135527610778809, "learning_rate": 2.9473684210526313e-06, "loss": 0.2481, "step": 8 }, { "epoch": 0.047619047619047616, "grad_norm": 3.6268699169158936, "learning_rate": 3.3157894736842103e-06, "loss": 0.2403, "step": 9 }, { "epoch": 0.05291005291005291, "grad_norm": 4.4451069831848145, "learning_rate": 3.684210526315789e-06, "loss": 0.2966, "step": 10 }, { "epoch": 0.05291005291005291, "eval_loss": 0.21254141628742218, "eval_runtime": 115.9559, "eval_samples_per_second": 26.32, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8863175059264351, "eval_sts-test_pearson_dot": 0.8768417645238664, "eval_sts-test_pearson_euclidean": 0.908487750463006, "eval_sts-test_pearson_manhattan": 0.9091691821321777, "eval_sts-test_pearson_max": 0.9091691821321777, "eval_sts-test_spearman_cosine": 0.9079303280804736, "eval_sts-test_spearman_dot": 0.8797622731153167, "eval_sts-test_spearman_euclidean": 0.9046558335332037, "eval_sts-test_spearman_manhattan": 0.905077211410465, "eval_sts-test_spearman_max": 0.9079303280804736, "step": 10 }, { "epoch": 0.0582010582010582, "grad_norm": 4.20427942276001, "learning_rate": 4.052631578947368e-06, "loss": 0.2867, "step": 11 }, { "epoch": 0.06349206349206349, "grad_norm": 4.593740463256836, "learning_rate": 4.421052631578947e-06, "loss": 0.3413, "step": 12 }, { "epoch": 0.06878306878306878, "grad_norm": 4.476904392242432, "learning_rate": 4.789473684210526e-06, "loss": 0.4119, "step": 13 }, { "epoch": 0.07407407407407407, "grad_norm": 4.580070495605469, "learning_rate": 5.157894736842105e-06, "loss": 0.3118, "step": 14 }, { "epoch": 0.07936507936507936, "grad_norm": 4.245151042938232, "learning_rate": 5.526315789473683e-06, "loss": 0.327, "step": 15 }, { "epoch": 0.07936507936507936, "eval_loss": 0.20313723385334015, "eval_runtime": 115.9643, "eval_samples_per_second": 26.318, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.886401463284574, "eval_sts-test_pearson_dot": 0.8764131384801311, "eval_sts-test_pearson_euclidean": 0.9089963447148983, "eval_sts-test_pearson_manhattan": 0.9096769425767649, "eval_sts-test_pearson_max": 0.9096769425767649, "eval_sts-test_spearman_cosine": 0.9081527866008503, "eval_sts-test_spearman_dot": 0.8798801031865603, "eval_sts-test_spearman_euclidean": 0.9051157869977845, "eval_sts-test_spearman_manhattan": 0.9058301961346367, "eval_sts-test_spearman_max": 0.9081527866008503, "step": 15 }, { "epoch": 0.08465608465608465, "grad_norm": 4.45465087890625, "learning_rate": 5.894736842105263e-06, "loss": 0.3389, "step": 16 }, { "epoch": 0.08994708994708994, "grad_norm": 3.7058451175689697, "learning_rate": 6.263157894736842e-06, "loss": 0.2018, "step": 17 }, { "epoch": 0.09523809523809523, "grad_norm": 4.276317119598389, "learning_rate": 6.6315789473684205e-06, "loss": 0.2861, "step": 18 }, { "epoch": 0.10052910052910052, "grad_norm": 4.1567511558532715, "learning_rate": 7e-06, "loss": 0.2848, "step": 19 }, { "epoch": 0.10582010582010581, "grad_norm": 3.727539300918579, "learning_rate": 7.368421052631578e-06, "loss": 0.2563, "step": 20 }, { "epoch": 0.10582010582010581, "eval_loss": 0.19427762925624847, "eval_runtime": 115.9622, "eval_samples_per_second": 26.319, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8859728422141003, "eval_sts-test_pearson_dot": 0.8747321172801832, "eval_sts-test_pearson_euclidean": 0.9094621598863318, "eval_sts-test_pearson_manhattan": 0.9101781902466943, "eval_sts-test_pearson_max": 0.9101781902466943, "eval_sts-test_spearman_cosine": 0.9082097102053401, "eval_sts-test_spearman_dot": 0.8789482924218687, "eval_sts-test_spearman_euclidean": 0.9059236815196834, "eval_sts-test_spearman_manhattan": 0.9063636311704222, "eval_sts-test_spearman_max": 0.9082097102053401, "step": 20 }, { "epoch": 0.1111111111111111, "grad_norm": 4.300179958343506, "learning_rate": 7.736842105263158e-06, "loss": 0.3058, "step": 21 }, { "epoch": 0.1164021164021164, "grad_norm": 4.133388042449951, "learning_rate": 8.105263157894736e-06, "loss": 0.285, "step": 22 }, { "epoch": 0.12169312169312169, "grad_norm": 4.270442962646484, "learning_rate": 8.473684210526315e-06, "loss": 0.3151, "step": 23 }, { "epoch": 0.12698412698412698, "grad_norm": 3.83665132522583, "learning_rate": 8.842105263157893e-06, "loss": 0.2716, "step": 24 }, { "epoch": 0.13227513227513227, "grad_norm": 3.7526190280914307, "learning_rate": 9.210526315789472e-06, "loss": 0.2422, "step": 25 }, { "epoch": 0.13227513227513227, "eval_loss": 0.17938196659088135, "eval_runtime": 115.932, "eval_samples_per_second": 26.326, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8855121875537549, "eval_sts-test_pearson_dot": 0.8729669422265561, "eval_sts-test_pearson_euclidean": 0.9099991783491139, "eval_sts-test_pearson_manhattan": 0.9106856714841502, "eval_sts-test_pearson_max": 0.9106856714841502, "eval_sts-test_spearman_cosine": 0.9081626318783566, "eval_sts-test_spearman_dot": 0.8777849833819992, "eval_sts-test_spearman_euclidean": 0.9059934934874541, "eval_sts-test_spearman_manhattan": 0.9070222354843194, "eval_sts-test_spearman_max": 0.9081626318783566, "step": 25 }, { "epoch": 0.13756613756613756, "grad_norm": 4.082998275756836, "learning_rate": 9.578947368421052e-06, "loss": 0.2858, "step": 26 }, { "epoch": 0.14285714285714285, "grad_norm": 4.099833965301514, "learning_rate": 9.94736842105263e-06, "loss": 0.3211, "step": 27 }, { "epoch": 0.14814814814814814, "grad_norm": 3.2643215656280518, "learning_rate": 1.031578947368421e-05, "loss": 0.2158, "step": 28 }, { "epoch": 0.15343915343915343, "grad_norm": 3.7243423461914062, "learning_rate": 1.068421052631579e-05, "loss": 0.2811, "step": 29 }, { "epoch": 0.15873015873015872, "grad_norm": 3.2406320571899414, "learning_rate": 1.1052631578947366e-05, "loss": 0.2063, "step": 30 }, { "epoch": 0.15873015873015872, "eval_loss": 0.16362369060516357, "eval_runtime": 115.8583, "eval_samples_per_second": 26.343, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.884331218823937, "eval_sts-test_pearson_dot": 0.8704189891244698, "eval_sts-test_pearson_euclidean": 0.9099375497606286, "eval_sts-test_pearson_manhattan": 0.9105955358876667, "eval_sts-test_pearson_max": 0.9105955358876667, "eval_sts-test_spearman_cosine": 0.907708540829018, "eval_sts-test_spearman_dot": 0.8752376073298165, "eval_sts-test_spearman_euclidean": 0.9062101343438497, "eval_sts-test_spearman_manhattan": 0.9067411975627813, "eval_sts-test_spearman_max": 0.907708540829018, "step": 30 }, { "epoch": 0.164021164021164, "grad_norm": 3.6572649478912354, "learning_rate": 1.1421052631578947e-05, "loss": 0.2492, "step": 31 }, { "epoch": 0.1693121693121693, "grad_norm": 3.982050657272339, "learning_rate": 1.1789473684210525e-05, "loss": 0.3096, "step": 32 }, { "epoch": 0.1746031746031746, "grad_norm": 3.694265365600586, "learning_rate": 1.2157894736842105e-05, "loss": 0.2914, "step": 33 }, { "epoch": 0.17989417989417988, "grad_norm": 3.955336809158325, "learning_rate": 1.2526315789473684e-05, "loss": 0.2888, "step": 34 }, { "epoch": 0.18518518518518517, "grad_norm": 3.3689701557159424, "learning_rate": 1.289473684210526e-05, "loss": 0.223, "step": 35 }, { "epoch": 0.18518518518518517, "eval_loss": 0.15322057902812958, "eval_runtime": 115.8758, "eval_samples_per_second": 26.339, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8831736003329786, "eval_sts-test_pearson_dot": 0.8675011167098496, "eval_sts-test_pearson_euclidean": 0.9098679645586549, "eval_sts-test_pearson_manhattan": 0.9104851057704236, "eval_sts-test_pearson_max": 0.9104851057704236, "eval_sts-test_spearman_cosine": 0.9071846378119088, "eval_sts-test_spearman_dot": 0.8722759240996724, "eval_sts-test_spearman_euclidean": 0.9062134214414588, "eval_sts-test_spearman_manhattan": 0.9068589381315022, "eval_sts-test_spearman_max": 0.9071846378119088, "step": 35 }, { "epoch": 0.19047619047619047, "grad_norm": 3.665268898010254, "learning_rate": 1.3263157894736841e-05, "loss": 0.2595, "step": 36 }, { "epoch": 0.19576719576719576, "grad_norm": 3.816418409347534, "learning_rate": 1.363157894736842e-05, "loss": 0.3122, "step": 37 }, { "epoch": 0.20105820105820105, "grad_norm": 3.2825276851654053, "learning_rate": 1.4e-05, "loss": 0.2327, "step": 38 }, { "epoch": 0.20634920634920634, "grad_norm": 2.9109766483306885, "learning_rate": 1.4368421052631578e-05, "loss": 0.1718, "step": 39 }, { "epoch": 0.21164021164021163, "grad_norm": 3.6622438430786133, "learning_rate": 1.4736842105263155e-05, "loss": 0.3162, "step": 40 }, { "epoch": 0.21164021164021163, "eval_loss": 0.14427155256271362, "eval_runtime": 115.8732, "eval_samples_per_second": 26.339, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8815373372232074, "eval_sts-test_pearson_dot": 0.8638584960095411, "eval_sts-test_pearson_euclidean": 0.909867067075244, "eval_sts-test_pearson_manhattan": 0.9104161328856638, "eval_sts-test_pearson_max": 0.9104161328856638, "eval_sts-test_spearman_cosine": 0.9066622115864256, "eval_sts-test_spearman_dot": 0.8693618114603872, "eval_sts-test_spearman_euclidean": 0.9060806241933829, "eval_sts-test_spearman_manhattan": 0.907040852009058, "eval_sts-test_spearman_max": 0.907040852009058, "step": 40 }, { "epoch": 0.21693121693121692, "grad_norm": 3.7417047023773193, "learning_rate": 1.5105263157894735e-05, "loss": 0.296, "step": 41 }, { "epoch": 0.2222222222222222, "grad_norm": 3.4067814350128174, "learning_rate": 1.5473684210526316e-05, "loss": 0.2821, "step": 42 }, { "epoch": 0.2275132275132275, "grad_norm": 3.0177342891693115, "learning_rate": 1.5842105263157892e-05, "loss": 0.2069, "step": 43 }, { "epoch": 0.2328042328042328, "grad_norm": 3.4185049533843994, "learning_rate": 1.6210526315789473e-05, "loss": 0.2573, "step": 44 }, { "epoch": 0.23809523809523808, "grad_norm": 3.918159246444702, "learning_rate": 1.657894736842105e-05, "loss": 0.3119, "step": 45 }, { "epoch": 0.23809523809523808, "eval_loss": 0.13426049053668976, "eval_runtime": 115.9898, "eval_samples_per_second": 26.313, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8805625986040617, "eval_sts-test_pearson_dot": 0.8616132264481751, "eval_sts-test_pearson_euclidean": 0.9101879546714963, "eval_sts-test_pearson_manhattan": 0.9107296923568013, "eval_sts-test_pearson_max": 0.9107296923568013, "eval_sts-test_spearman_cosine": 0.9064269094540296, "eval_sts-test_spearman_dot": 0.8677769455382106, "eval_sts-test_spearman_euclidean": 0.9069313904236947, "eval_sts-test_spearman_manhattan": 0.9076995905767397, "eval_sts-test_spearman_max": 0.9076995905767397, "step": 45 }, { "epoch": 0.24338624338624337, "grad_norm": 3.0733871459960938, "learning_rate": 1.694736842105263e-05, "loss": 0.2743, "step": 46 }, { "epoch": 0.24867724867724866, "grad_norm": 3.3528106212615967, "learning_rate": 1.731578947368421e-05, "loss": 0.2666, "step": 47 }, { "epoch": 0.25396825396825395, "grad_norm": 3.0437304973602295, "learning_rate": 1.7684210526315787e-05, "loss": 0.2414, "step": 48 }, { "epoch": 0.25925925925925924, "grad_norm": 3.451265573501587, "learning_rate": 1.8052631578947367e-05, "loss": 0.2793, "step": 49 }, { "epoch": 0.26455026455026454, "grad_norm": 2.9664905071258545, "learning_rate": 1.8421052631578944e-05, "loss": 0.2212, "step": 50 }, { "epoch": 0.26455026455026454, "eval_loss": 0.12510398030281067, "eval_runtime": 116.2617, "eval_samples_per_second": 26.251, "eval_steps_per_second": 0.206, "eval_sts-test_pearson_cosine": 0.8803749979605222, "eval_sts-test_pearson_dot": 0.8597944232348109, "eval_sts-test_pearson_euclidean": 0.9105906609339514, "eval_sts-test_pearson_manhattan": 0.9111264139452835, "eval_sts-test_pearson_max": 0.9111264139452835, "eval_sts-test_spearman_cosine": 0.9067783858609977, "eval_sts-test_spearman_dot": 0.8664887804790594, "eval_sts-test_spearman_euclidean": 0.9077306479521453, "eval_sts-test_spearman_manhattan": 0.9082951851145977, "eval_sts-test_spearman_max": 0.9082951851145977, "step": 50 }, { "epoch": 0.2698412698412698, "grad_norm": 3.051281690597534, "learning_rate": 1.8789473684210524e-05, "loss": 0.2071, "step": 51 }, { "epoch": 0.2751322751322751, "grad_norm": 3.539724826812744, "learning_rate": 1.9157894736842104e-05, "loss": 0.296, "step": 52 }, { "epoch": 0.2804232804232804, "grad_norm": 3.101149559020996, "learning_rate": 1.952631578947368e-05, "loss": 0.2061, "step": 53 }, { "epoch": 0.2857142857142857, "grad_norm": 3.113757848739624, "learning_rate": 1.989473684210526e-05, "loss": 0.2164, "step": 54 }, { "epoch": 0.291005291005291, "grad_norm": 2.9952962398529053, "learning_rate": 2.0263157894736842e-05, "loss": 0.188, "step": 55 }, { "epoch": 0.291005291005291, "eval_loss": 0.11972539871931076, "eval_runtime": 115.8621, "eval_samples_per_second": 26.342, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.881021505301097, "eval_sts-test_pearson_dot": 0.8597228411065384, "eval_sts-test_pearson_euclidean": 0.9108125110302233, "eval_sts-test_pearson_manhattan": 0.9112746469255748, "eval_sts-test_pearson_max": 0.9112746469255748, "eval_sts-test_spearman_cosine": 0.9072042388643983, "eval_sts-test_spearman_dot": 0.8659804956521757, "eval_sts-test_spearman_euclidean": 0.9079662185921096, "eval_sts-test_spearman_manhattan": 0.9082669470686597, "eval_sts-test_spearman_max": 0.9082669470686597, "step": 55 }, { "epoch": 0.2962962962962963, "grad_norm": 3.065542697906494, "learning_rate": 2.063157894736842e-05, "loss": 0.2411, "step": 56 }, { "epoch": 0.30158730158730157, "grad_norm": 2.8689684867858887, "learning_rate": 2.1e-05, "loss": 0.2031, "step": 57 }, { "epoch": 0.30687830687830686, "grad_norm": 2.9906833171844482, "learning_rate": 2.136842105263158e-05, "loss": 0.2438, "step": 58 }, { "epoch": 0.31216931216931215, "grad_norm": 3.040048837661743, "learning_rate": 2.1736842105263156e-05, "loss": 0.2417, "step": 59 }, { "epoch": 0.31746031746031744, "grad_norm": 2.7314910888671875, "learning_rate": 2.2105263157894733e-05, "loss": 0.1515, "step": 60 }, { "epoch": 0.31746031746031744, "eval_loss": 0.12329312413930893, "eval_runtime": 115.9779, "eval_samples_per_second": 26.315, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8813346129633393, "eval_sts-test_pearson_dot": 0.8610768249798144, "eval_sts-test_pearson_euclidean": 0.9107487658386748, "eval_sts-test_pearson_manhattan": 0.9112378269812353, "eval_sts-test_pearson_max": 0.9112378269812353, "eval_sts-test_spearman_cosine": 0.9065531975136762, "eval_sts-test_spearman_dot": 0.867737743433232, "eval_sts-test_spearman_euclidean": 0.9073249330163708, "eval_sts-test_spearman_manhattan": 0.9077088988391091, "eval_sts-test_spearman_max": 0.9077088988391091, "step": 60 }, { "epoch": 0.32275132275132273, "grad_norm": 3.0552306175231934, "learning_rate": 2.2473684210526313e-05, "loss": 0.21, "step": 61 }, { "epoch": 0.328042328042328, "grad_norm": 3.1015548706054688, "learning_rate": 2.2842105263157893e-05, "loss": 0.21, "step": 62 }, { "epoch": 0.3333333333333333, "grad_norm": 3.154890537261963, "learning_rate": 2.321052631578947e-05, "loss": 0.2157, "step": 63 }, { "epoch": 0.3386243386243386, "grad_norm": 3.384970188140869, "learning_rate": 2.357894736842105e-05, "loss": 0.2138, "step": 64 }, { "epoch": 0.3439153439153439, "grad_norm": 3.58601713180542, "learning_rate": 2.394736842105263e-05, "loss": 0.2403, "step": 65 }, { "epoch": 0.3439153439153439, "eval_loss": 0.12730959057807922, "eval_runtime": 115.9959, "eval_samples_per_second": 26.311, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8797954361206015, "eval_sts-test_pearson_dot": 0.8587071220533238, "eval_sts-test_pearson_euclidean": 0.9093975785151563, "eval_sts-test_pearson_manhattan": 0.9102944590146977, "eval_sts-test_pearson_max": 0.9102944590146977, "eval_sts-test_spearman_cosine": 0.9058099238132263, "eval_sts-test_spearman_dot": 0.8649206962799048, "eval_sts-test_spearman_euclidean": 0.906062455181258, "eval_sts-test_spearman_manhattan": 0.9070602293052407, "eval_sts-test_spearman_max": 0.9070602293052407, "step": 65 }, { "epoch": 0.3492063492063492, "grad_norm": 3.397090435028076, "learning_rate": 2.431578947368421e-05, "loss": 0.2808, "step": 66 }, { "epoch": 0.3544973544973545, "grad_norm": 2.786862373352051, "learning_rate": 2.4684210526315788e-05, "loss": 0.1891, "step": 67 }, { "epoch": 0.35978835978835977, "grad_norm": 3.0845346450805664, "learning_rate": 2.5052631578947368e-05, "loss": 0.1991, "step": 68 }, { "epoch": 0.36507936507936506, "grad_norm": 3.1166787147521973, "learning_rate": 2.5421052631578948e-05, "loss": 0.2121, "step": 69 }, { "epoch": 0.37037037037037035, "grad_norm": 3.181523323059082, "learning_rate": 2.578947368421052e-05, "loss": 0.2039, "step": 70 }, { "epoch": 0.37037037037037035, "eval_loss": 0.13109122216701508, "eval_runtime": 115.9695, "eval_samples_per_second": 26.317, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8802169762825238, "eval_sts-test_pearson_dot": 0.8598037708245074, "eval_sts-test_pearson_euclidean": 0.9094245832802006, "eval_sts-test_pearson_manhattan": 0.9105289688155764, "eval_sts-test_pearson_max": 0.9105289688155764, "eval_sts-test_spearman_cosine": 0.906595934968305, "eval_sts-test_spearman_dot": 0.8654662141562657, "eval_sts-test_spearman_euclidean": 0.906532611933436, "eval_sts-test_spearman_manhattan": 0.9073325407308075, "eval_sts-test_spearman_max": 0.9073325407308075, "step": 70 }, { "epoch": 0.37566137566137564, "grad_norm": 3.032498598098755, "learning_rate": 2.6157894736842102e-05, "loss": 0.1986, "step": 71 }, { "epoch": 0.38095238095238093, "grad_norm": 3.486334800720215, "learning_rate": 2.6526315789473682e-05, "loss": 0.2925, "step": 72 }, { "epoch": 0.3862433862433862, "grad_norm": 3.424903631210327, "learning_rate": 2.689473684210526e-05, "loss": 0.2527, "step": 73 }, { "epoch": 0.3915343915343915, "grad_norm": 3.587599992752075, "learning_rate": 2.726315789473684e-05, "loss": 0.279, "step": 74 }, { "epoch": 0.3968253968253968, "grad_norm": 3.3220415115356445, "learning_rate": 2.763157894736842e-05, "loss": 0.2419, "step": 75 }, { "epoch": 0.3968253968253968, "eval_loss": 0.13154849410057068, "eval_runtime": 115.989, "eval_samples_per_second": 26.313, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.879906095889138, "eval_sts-test_pearson_dot": 0.8607649105164801, "eval_sts-test_pearson_euclidean": 0.9088378228597751, "eval_sts-test_pearson_manhattan": 0.9102620649137054, "eval_sts-test_pearson_max": 0.9102620649137054, "eval_sts-test_spearman_cosine": 0.9066359425959888, "eval_sts-test_spearman_dot": 0.8668292928269864, "eval_sts-test_spearman_euclidean": 0.9061907122964057, "eval_sts-test_spearman_manhattan": 0.9072107277972998, "eval_sts-test_spearman_max": 0.9072107277972998, "step": 75 }, { "epoch": 0.4021164021164021, "grad_norm": 2.994913101196289, "learning_rate": 2.8e-05, "loss": 0.2228, "step": 76 }, { "epoch": 0.4074074074074074, "grad_norm": 3.613633632659912, "learning_rate": 2.8368421052631576e-05, "loss": 0.2242, "step": 77 }, { "epoch": 0.4126984126984127, "grad_norm": 3.4389431476593018, "learning_rate": 2.8736842105263157e-05, "loss": 0.2737, "step": 78 }, { "epoch": 0.41798941798941797, "grad_norm": 3.348423957824707, "learning_rate": 2.9105263157894737e-05, "loss": 0.2328, "step": 79 }, { "epoch": 0.42328042328042326, "grad_norm": 3.3796963691711426, "learning_rate": 2.947368421052631e-05, "loss": 0.2802, "step": 80 }, { "epoch": 0.42328042328042326, "eval_loss": 0.12620440125465393, "eval_runtime": 115.88, "eval_samples_per_second": 26.338, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8786083890603608, "eval_sts-test_pearson_dot": 0.8607044568869332, "eval_sts-test_pearson_euclidean": 0.9083064997932102, "eval_sts-test_pearson_manhattan": 0.909785902193573, "eval_sts-test_pearson_max": 0.909785902193573, "eval_sts-test_spearman_cosine": 0.9058450088021573, "eval_sts-test_spearman_dot": 0.8683225529170947, "eval_sts-test_spearman_euclidean": 0.9058583894293133, "eval_sts-test_spearman_manhattan": 0.9067757007853141, "eval_sts-test_spearman_max": 0.9067757007853141, "step": 80 }, { "epoch": 0.42857142857142855, "grad_norm": 2.940751314163208, "learning_rate": 2.984210526315789e-05, "loss": 0.2044, "step": 81 }, { "epoch": 0.43386243386243384, "grad_norm": 2.742959499359131, "learning_rate": 3.021052631578947e-05, "loss": 0.1828, "step": 82 }, { "epoch": 0.43915343915343913, "grad_norm": 3.1117758750915527, "learning_rate": 3.0578947368421054e-05, "loss": 0.2372, "step": 83 }, { "epoch": 0.4444444444444444, "grad_norm": 3.11297869682312, "learning_rate": 3.094736842105263e-05, "loss": 0.2241, "step": 84 }, { "epoch": 0.4497354497354497, "grad_norm": 3.2359349727630615, "learning_rate": 3.131578947368421e-05, "loss": 0.2782, "step": 85 }, { "epoch": 0.4497354497354497, "eval_loss": 0.12065601348876953, "eval_runtime": 115.9874, "eval_samples_per_second": 26.313, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8792266747813421, "eval_sts-test_pearson_dot": 0.8635914020857025, "eval_sts-test_pearson_euclidean": 0.909050966414346, "eval_sts-test_pearson_manhattan": 0.9104391562008682, "eval_sts-test_pearson_max": 0.9104391562008682, "eval_sts-test_spearman_cosine": 0.9062812888494619, "eval_sts-test_spearman_dot": 0.8713958905444104, "eval_sts-test_spearman_euclidean": 0.9064880396770901, "eval_sts-test_spearman_manhattan": 0.9073885693100694, "eval_sts-test_spearman_max": 0.9073885693100694, "step": 85 }, { "epoch": 0.455026455026455, "grad_norm": 3.3813822269439697, "learning_rate": 3.1684210526315785e-05, "loss": 0.3244, "step": 86 }, { "epoch": 0.4603174603174603, "grad_norm": 2.657428503036499, "learning_rate": 3.205263157894736e-05, "loss": 0.2102, "step": 87 }, { "epoch": 0.4656084656084656, "grad_norm": 2.7600059509277344, "learning_rate": 3.2421052631578945e-05, "loss": 0.2265, "step": 88 }, { "epoch": 0.4708994708994709, "grad_norm": 3.10657000541687, "learning_rate": 3.278947368421052e-05, "loss": 0.2666, "step": 89 }, { "epoch": 0.47619047619047616, "grad_norm": 3.0764777660369873, "learning_rate": 3.31578947368421e-05, "loss": 0.23, "step": 90 }, { "epoch": 0.47619047619047616, "eval_loss": 0.11863212287425995, "eval_runtime": 115.9856, "eval_samples_per_second": 26.314, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8796178782254354, "eval_sts-test_pearson_dot": 0.8632853604720685, "eval_sts-test_pearson_euclidean": 0.9106195949839817, "eval_sts-test_pearson_manhattan": 0.9115246839048733, "eval_sts-test_pearson_max": 0.9115246839048733, "eval_sts-test_spearman_cosine": 0.9078120057453548, "eval_sts-test_spearman_dot": 0.8720862682538955, "eval_sts-test_spearman_euclidean": 0.9083378778179652, "eval_sts-test_spearman_manhattan": 0.9089643954774449, "eval_sts-test_spearman_max": 0.9089643954774449, "step": 90 }, { "epoch": 0.48148148148148145, "grad_norm": 3.020212411880493, "learning_rate": 3.352631578947368e-05, "loss": 0.2358, "step": 91 }, { "epoch": 0.48677248677248675, "grad_norm": 3.419471502304077, "learning_rate": 3.389473684210526e-05, "loss": 0.2896, "step": 92 }, { "epoch": 0.49206349206349204, "grad_norm": 3.062520980834961, "learning_rate": 3.426315789473684e-05, "loss": 0.2126, "step": 93 }, { "epoch": 0.4973544973544973, "grad_norm": 3.3430864810943604, "learning_rate": 3.463157894736842e-05, "loss": 0.2669, "step": 94 }, { "epoch": 0.5026455026455027, "grad_norm": 3.1061036586761475, "learning_rate": 3.5e-05, "loss": 0.2375, "step": 95 }, { "epoch": 0.5026455026455027, "eval_loss": 0.11278577148914337, "eval_runtime": 115.9371, "eval_samples_per_second": 26.325, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8790475705877597, "eval_sts-test_pearson_dot": 0.8631845510913274, "eval_sts-test_pearson_euclidean": 0.9107267591418424, "eval_sts-test_pearson_manhattan": 0.911801845501315, "eval_sts-test_pearson_max": 0.911801845501315, "eval_sts-test_spearman_cosine": 0.9087454723067182, "eval_sts-test_spearman_dot": 0.8721337493422319, "eval_sts-test_spearman_euclidean": 0.9085557269584185, "eval_sts-test_spearman_manhattan": 0.9095949855017109, "eval_sts-test_spearman_max": 0.9095949855017109, "step": 95 }, { "epoch": 0.5079365079365079, "grad_norm": 2.760911703109741, "learning_rate": 3.4999101435573165e-05, "loss": 0.1903, "step": 96 }, { "epoch": 0.5132275132275133, "grad_norm": 2.9554224014282227, "learning_rate": 3.499640585302429e-05, "loss": 0.2507, "step": 97 }, { "epoch": 0.5185185185185185, "grad_norm": 2.9563987255096436, "learning_rate": 3.4991913584534566e-05, "loss": 0.1897, "step": 98 }, { "epoch": 0.5238095238095238, "grad_norm": 3.0752642154693604, "learning_rate": 3.4985625183693854e-05, "loss": 0.2775, "step": 99 }, { "epoch": 0.5291005291005291, "grad_norm": 2.8446006774902344, "learning_rate": 3.4977541425432415e-05, "loss": 0.2098, "step": 100 }, { "epoch": 0.5291005291005291, "eval_loss": 0.11681391298770905, "eval_runtime": 115.9468, "eval_samples_per_second": 26.322, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8772591964347106, "eval_sts-test_pearson_dot": 0.8585096145266702, "eval_sts-test_pearson_euclidean": 0.9091316759212603, "eval_sts-test_pearson_manhattan": 0.9099210072486108, "eval_sts-test_pearson_max": 0.9099210072486108, "eval_sts-test_spearman_cosine": 0.9064237321144708, "eval_sts-test_spearman_dot": 0.8648877146002593, "eval_sts-test_spearman_euclidean": 0.9063920482214057, "eval_sts-test_spearman_manhattan": 0.9075959914066186, "eval_sts-test_spearman_max": 0.9075959914066186, "step": 100 }, { "epoch": 0.5343915343915344, "grad_norm": 2.7542779445648193, "learning_rate": 3.496766330592549e-05, "loss": 0.1628, "step": 101 }, { "epoch": 0.5396825396825397, "grad_norm": 2.7565367221832275, "learning_rate": 3.4955992042470456e-05, "loss": 0.2158, "step": 102 }, { "epoch": 0.544973544973545, "grad_norm": 2.4343347549438477, "learning_rate": 3.494252907333686e-05, "loss": 0.1552, "step": 103 }, { "epoch": 0.5502645502645502, "grad_norm": 3.1311893463134766, "learning_rate": 3.492727605758918e-05, "loss": 0.2364, "step": 104 }, { "epoch": 0.5555555555555556, "grad_norm": 3.080811023712158, "learning_rate": 3.491023487488238e-05, "loss": 0.272, "step": 105 }, { "epoch": 0.5555555555555556, "eval_loss": 0.1177554726600647, "eval_runtime": 115.9887, "eval_samples_per_second": 26.313, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8775298085625183, "eval_sts-test_pearson_dot": 0.8593352032357447, "eval_sts-test_pearson_euclidean": 0.9089120653468052, "eval_sts-test_pearson_manhattan": 0.908955609090125, "eval_sts-test_pearson_max": 0.908955609090125, "eval_sts-test_spearman_cosine": 0.9056424645930997, "eval_sts-test_spearman_dot": 0.8631139983550111, "eval_sts-test_spearman_euclidean": 0.9054400993890878, "eval_sts-test_spearman_manhattan": 0.9059316024929496, "eval_sts-test_spearman_max": 0.9059316024929496, "step": 105 }, { "epoch": 0.5608465608465608, "grad_norm": 2.8664119243621826, "learning_rate": 3.489140762523023e-05, "loss": 0.2271, "step": 106 }, { "epoch": 0.5661375661375662, "grad_norm": 2.7945520877838135, "learning_rate": 3.4870796628746594e-05, "loss": 0.2132, "step": 107 }, { "epoch": 0.5714285714285714, "grad_norm": 2.8966565132141113, "learning_rate": 3.4848404425359456e-05, "loss": 0.1782, "step": 108 }, { "epoch": 0.5767195767195767, "grad_norm": 2.6960151195526123, "learning_rate": 3.4824233774497946e-05, "loss": 0.1598, "step": 109 }, { "epoch": 0.582010582010582, "grad_norm": 3.1113312244415283, "learning_rate": 3.4798287654752294e-05, "loss": 0.2472, "step": 110 }, { "epoch": 0.582010582010582, "eval_loss": 0.12694795429706573, "eval_runtime": 115.9427, "eval_samples_per_second": 26.323, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8781162735100789, "eval_sts-test_pearson_dot": 0.8634318341628767, "eval_sts-test_pearson_euclidean": 0.9086688422059872, "eval_sts-test_pearson_manhattan": 0.9085119625190576, "eval_sts-test_pearson_max": 0.9086688422059872, "eval_sts-test_spearman_cosine": 0.9050255684548193, "eval_sts-test_spearman_dot": 0.8664220563483247, "eval_sts-test_spearman_euclidean": 0.9042466279990407, "eval_sts-test_spearman_manhattan": 0.9047099825594873, "eval_sts-test_spearman_max": 0.9050255684548193, "step": 110 }, { "epoch": 0.5873015873015873, "grad_norm": 3.0204665660858154, "learning_rate": 3.477056926350673e-05, "loss": 0.2041, "step": 111 }, { "epoch": 0.5925925925925926, "grad_norm": 3.3232171535491943, "learning_rate": 3.474108201654556e-05, "loss": 0.2426, "step": 112 }, { "epoch": 0.5978835978835979, "grad_norm": 3.1148412227630615, "learning_rate": 3.4709829547632134e-05, "loss": 0.2105, "step": 113 }, { "epoch": 0.6031746031746031, "grad_norm": 2.886481523513794, "learning_rate": 3.467681570806109e-05, "loss": 0.1923, "step": 114 }, { "epoch": 0.6084656084656085, "grad_norm": 3.2116329669952393, "learning_rate": 3.464204456618378e-05, "loss": 0.2271, "step": 115 }, { "epoch": 0.6084656084656085, "eval_loss": 0.12330599874258041, "eval_runtime": 115.9423, "eval_samples_per_second": 26.323, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8798272374408064, "eval_sts-test_pearson_dot": 0.867285673702976, "eval_sts-test_pearson_euclidean": 0.909363327246087, "eval_sts-test_pearson_manhattan": 0.9091246101967716, "eval_sts-test_pearson_max": 0.909363327246087, "eval_sts-test_spearman_cosine": 0.9060899772070137, "eval_sts-test_spearman_dot": 0.8704560245526682, "eval_sts-test_spearman_euclidean": 0.9054406364042247, "eval_sts-test_spearman_manhattan": 0.9051852857067252, "eval_sts-test_spearman_max": 0.9060899772070137, "step": 115 }, { "epoch": 0.6137566137566137, "grad_norm": 3.385223627090454, "learning_rate": 3.460552040690687e-05, "loss": 0.3029, "step": 116 }, { "epoch": 0.6190476190476191, "grad_norm": 3.150688409805298, "learning_rate": 3.456724773116434e-05, "loss": 0.2554, "step": 117 }, { "epoch": 0.6243386243386243, "grad_norm": 3.0149803161621094, "learning_rate": 3.452723125536281e-05, "loss": 0.2182, "step": 118 }, { "epoch": 0.6296296296296297, "grad_norm": 3.19419002532959, "learning_rate": 3.448547591080033e-05, "loss": 0.2852, "step": 119 }, { "epoch": 0.6349206349206349, "grad_norm": 2.99879789352417, "learning_rate": 3.444198684305871e-05, "loss": 0.2285, "step": 120 }, { "epoch": 0.6349206349206349, "eval_loss": 0.12802907824516296, "eval_runtime": 115.8977, "eval_samples_per_second": 26.334, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8798175425010222, "eval_sts-test_pearson_dot": 0.863763595764963, "eval_sts-test_pearson_euclidean": 0.9094261191010624, "eval_sts-test_pearson_manhattan": 0.9090772657963855, "eval_sts-test_pearson_max": 0.9094261191010624, "eval_sts-test_spearman_cosine": 0.9053311300675998, "eval_sts-test_spearman_dot": 0.8654653191310379, "eval_sts-test_spearman_euclidean": 0.9053121555327698, "eval_sts-test_spearman_manhattan": 0.9052343330892101, "eval_sts-test_spearman_max": 0.9053311300675998, "step": 120 }, { "epoch": 0.6402116402116402, "grad_norm": 3.01249361038208, "learning_rate": 3.439676941136937e-05, "loss": 0.218, "step": 121 }, { "epoch": 0.6455026455026455, "grad_norm": 2.7809016704559326, "learning_rate": 3.4349829187952966e-05, "loss": 0.1841, "step": 122 }, { "epoch": 0.6507936507936508, "grad_norm": 3.183194398880005, "learning_rate": 3.4301171957332694e-05, "loss": 0.2629, "step": 123 }, { "epoch": 0.656084656084656, "grad_norm": 3.006030797958374, "learning_rate": 3.425080371562145e-05, "loss": 0.1749, "step": 124 }, { "epoch": 0.6613756613756614, "grad_norm": 2.9949681758880615, "learning_rate": 3.4198730669782925e-05, "loss": 0.2417, "step": 125 }, { "epoch": 0.6613756613756614, "eval_loss": 0.14149585366249084, "eval_runtime": 115.9037, "eval_samples_per_second": 26.332, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8801193496474387, "eval_sts-test_pearson_dot": 0.8614489114996757, "eval_sts-test_pearson_euclidean": 0.9096775446763254, "eval_sts-test_pearson_manhattan": 0.909541166275706, "eval_sts-test_pearson_max": 0.9096775446763254, "eval_sts-test_spearman_cosine": 0.9056506540739345, "eval_sts-test_spearman_dot": 0.8627393855459037, "eval_sts-test_spearman_euclidean": 0.905229410450457, "eval_sts-test_spearman_manhattan": 0.9051265720517796, "eval_sts-test_spearman_max": 0.9056506540739345, "step": 125 }, { "epoch": 0.6666666666666666, "grad_norm": 2.9404184818267822, "learning_rate": 3.414495923686671e-05, "loss": 0.2305, "step": 126 }, { "epoch": 0.671957671957672, "grad_norm": 3.515538215637207, "learning_rate": 3.408949604321751e-05, "loss": 0.2841, "step": 127 }, { "epoch": 0.6772486772486772, "grad_norm": 2.7193102836608887, "learning_rate": 3.4032347923658554e-05, "loss": 0.1785, "step": 128 }, { "epoch": 0.6825396825396826, "grad_norm": 2.949784755706787, "learning_rate": 3.397352192064938e-05, "loss": 0.2153, "step": 129 }, { "epoch": 0.6878306878306878, "grad_norm": 3.263798475265503, "learning_rate": 3.39130252834179e-05, "loss": 0.2548, "step": 130 }, { "epoch": 0.6878306878306878, "eval_loss": 0.1412901133298874, "eval_runtime": 115.8261, "eval_samples_per_second": 26.35, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8817491042172992, "eval_sts-test_pearson_dot": 0.8653158778803413, "eval_sts-test_pearson_euclidean": 0.9109017540975314, "eval_sts-test_pearson_manhattan": 0.9110420593534589, "eval_sts-test_pearson_max": 0.9110420593534589, "eval_sts-test_spearman_cosine": 0.9079463490320518, "eval_sts-test_spearman_dot": 0.8671617499478633, "eval_sts-test_spearman_euclidean": 0.9076926541312239, "eval_sts-test_spearman_manhattan": 0.9074005626481224, "eval_sts-test_spearman_max": 0.9079463490320518, "step": 130 }, { "epoch": 0.6931216931216931, "grad_norm": 2.7200613021850586, "learning_rate": 3.385086546706714e-05, "loss": 0.2059, "step": 131 }, { "epoch": 0.6984126984126984, "grad_norm": 2.916797399520874, "learning_rate": 3.3787050131656496e-05, "loss": 0.2073, "step": 132 }, { "epoch": 0.7037037037037037, "grad_norm": 3.0047614574432373, "learning_rate": 3.372158714125777e-05, "loss": 0.191, "step": 133 }, { "epoch": 0.708994708994709, "grad_norm": 2.702082633972168, "learning_rate": 3.36544845629861e-05, "loss": 0.1633, "step": 134 }, { "epoch": 0.7142857142857143, "grad_norm": 3.3000283241271973, "learning_rate": 3.358575066600578e-05, "loss": 0.2627, "step": 135 }, { "epoch": 0.7142857142857143, "eval_loss": 0.13328148424625397, "eval_runtime": 115.7947, "eval_samples_per_second": 26.357, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8814270028194036, "eval_sts-test_pearson_dot": 0.8654755263554079, "eval_sts-test_pearson_euclidean": 0.9100944689124159, "eval_sts-test_pearson_manhattan": 0.9103746959208892, "eval_sts-test_pearson_max": 0.9103746959208892, "eval_sts-test_spearman_cosine": 0.9076914905984277, "eval_sts-test_spearman_dot": 0.8685219197865933, "eval_sts-test_spearman_euclidean": 0.9077337805404427, "eval_sts-test_spearman_manhattan": 0.9075425136492559, "eval_sts-test_spearman_max": 0.9077337805404427, "step": 135 }, { "epoch": 0.7195767195767195, "grad_norm": 3.1306369304656982, "learning_rate": 3.351539392051131e-05, "loss": 0.2451, "step": 136 }, { "epoch": 0.7248677248677249, "grad_norm": 2.528568983078003, "learning_rate": 3.344342299668353e-05, "loss": 0.1441, "step": 137 }, { "epoch": 0.7301587301587301, "grad_norm": 2.893615484237671, "learning_rate": 3.33698467636212e-05, "loss": 0.2138, "step": 138 }, { "epoch": 0.7354497354497355, "grad_norm": 3.235039472579956, "learning_rate": 3.32946742882481e-05, "loss": 0.2564, "step": 139 }, { "epoch": 0.7407407407407407, "grad_norm": 2.8678057193756104, "learning_rate": 3.321791483419561e-05, "loss": 0.1524, "step": 140 }, { "epoch": 0.7407407407407407, "eval_loss": 0.13225606083869934, "eval_runtime": 115.8232, "eval_samples_per_second": 26.351, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8781919392624582, "eval_sts-test_pearson_dot": 0.8608413217193998, "eval_sts-test_pearson_euclidean": 0.9068541902117857, "eval_sts-test_pearson_manhattan": 0.9075012201786494, "eval_sts-test_pearson_max": 0.9075012201786494, "eval_sts-test_spearman_cosine": 0.9049418835960173, "eval_sts-test_spearman_dot": 0.8634087749137961, "eval_sts-test_spearman_euclidean": 0.9045396592586316, "eval_sts-test_spearman_manhattan": 0.9048969533295804, "eval_sts-test_spearman_max": 0.9049418835960173, "step": 140 }, { "epoch": 0.746031746031746, "grad_norm": 2.5006282329559326, "learning_rate": 3.313957786066119e-05, "loss": 0.1786, "step": 141 }, { "epoch": 0.7513227513227513, "grad_norm": 3.081282377243042, "learning_rate": 3.3059673021242716e-05, "loss": 0.2104, "step": 142 }, { "epoch": 0.7566137566137566, "grad_norm": 3.044879198074341, "learning_rate": 3.297821016274879e-05, "loss": 0.2512, "step": 143 }, { "epoch": 0.7619047619047619, "grad_norm": 2.87981915473938, "learning_rate": 3.2895199323985386e-05, "loss": 0.1889, "step": 144 }, { "epoch": 0.7671957671957672, "grad_norm": 2.799271821975708, "learning_rate": 3.281065073451867e-05, "loss": 0.2127, "step": 145 }, { "epoch": 0.7671957671957672, "eval_loss": 0.12909488379955292, "eval_runtime": 115.7825, "eval_samples_per_second": 26.36, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.8735553111785674, "eval_sts-test_pearson_dot": 0.8567353468688738, "eval_sts-test_pearson_euclidean": 0.9031851032382168, "eval_sts-test_pearson_manhattan": 0.9046152117713773, "eval_sts-test_pearson_max": 0.9046152117713773, "eval_sts-test_spearman_cosine": 0.9015403849689184, "eval_sts-test_spearman_dot": 0.8596978213141758, "eval_sts-test_spearman_euclidean": 0.9003828040905072, "eval_sts-test_spearman_manhattan": 0.9019434148290095, "eval_sts-test_spearman_max": 0.9019434148290095, "step": 145 }, { "epoch": 0.7724867724867724, "grad_norm": 2.8767826557159424, "learning_rate": 3.2724574813414464e-05, "loss": 0.2115, "step": 146 }, { "epoch": 0.7777777777777778, "grad_norm": 2.861135244369507, "learning_rate": 3.2636982167954236e-05, "loss": 0.179, "step": 147 }, { "epoch": 0.783068783068783, "grad_norm": 2.881589651107788, "learning_rate": 3.254788359232795e-05, "loss": 0.2188, "step": 148 }, { "epoch": 0.7883597883597884, "grad_norm": 2.7440359592437744, "learning_rate": 3.245729006630393e-05, "loss": 0.1687, "step": 149 }, { "epoch": 0.7936507936507936, "grad_norm": 2.965336561203003, "learning_rate": 3.236521275387573e-05, "loss": 0.2265, "step": 150 }, { "epoch": 0.7936507936507936, "eval_loss": 0.11447698622941971, "eval_runtime": 115.8332, "eval_samples_per_second": 26.348, "eval_steps_per_second": 0.207, "eval_sts-test_pearson_cosine": 0.874015974599682, "eval_sts-test_pearson_dot": 0.8572630785995417, "eval_sts-test_pearson_euclidean": 0.9037766701587291, "eval_sts-test_pearson_manhattan": 0.905395978358323, "eval_sts-test_pearson_max": 0.905395978358323, "eval_sts-test_spearman_cosine": 0.9014261797498476, "eval_sts-test_spearman_dot": 0.8600332320183043, "eval_sts-test_spearman_euclidean": 0.9009870356218139, "eval_sts-test_spearman_manhattan": 0.9018416504606055, "eval_sts-test_spearman_max": 0.9018416504606055, "step": 150 }, { "epoch": 0.798941798941799, "grad_norm": 2.718717098236084, "learning_rate": 3.227166300188643e-05, "loss": 0.182, "step": 151 }, { "epoch": 0.8042328042328042, "grad_norm": 2.84299373626709, "learning_rate": 3.2176652338630344e-05, "loss": 0.1789, "step": 152 } ], "logging_steps": 1, "max_steps": 378, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 38, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 640, "trial_name": null, "trial_params": null }