{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.1008610086100861,
  "eval_steps": 21,
  "global_step": 82,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0012300123001230013,
      "grad_norm": 6.540346145629883,
      "learning_rate": 9.803921568627452e-08,
      "loss": 0.3208,
      "step": 1
    },
    {
      "epoch": 0.0024600246002460025,
      "grad_norm": 5.055933475494385,
      "learning_rate": 1.9607843137254904e-07,
      "loss": 0.1703,
      "step": 2
    },
    {
      "epoch": 0.0036900369003690036,
      "grad_norm": 6.361550331115723,
      "learning_rate": 2.9411764705882356e-07,
      "loss": 0.3362,
      "step": 3
    },
    {
      "epoch": 0.004920049200492005,
      "grad_norm": 6.709433078765869,
      "learning_rate": 3.921568627450981e-07,
      "loss": 0.3346,
      "step": 4
    },
    {
      "epoch": 0.006150061500615006,
      "grad_norm": 5.4415154457092285,
      "learning_rate": 4.901960784313725e-07,
      "loss": 0.2484,
      "step": 5
    },
    {
      "epoch": 0.007380073800738007,
      "grad_norm": 5.709558010101318,
      "learning_rate": 5.882352941176471e-07,
      "loss": 0.2249,
      "step": 6
    },
    {
      "epoch": 0.008610086100861008,
      "grad_norm": 6.553178787231445,
      "learning_rate": 6.862745098039217e-07,
      "loss": 0.2724,
      "step": 7
    },
    {
      "epoch": 0.00984009840098401,
      "grad_norm": 5.640111446380615,
      "learning_rate": 7.843137254901962e-07,
      "loss": 0.251,
      "step": 8
    },
    {
      "epoch": 0.01107011070110701,
      "grad_norm": 5.696380615234375,
      "learning_rate": 8.823529411764707e-07,
      "loss": 0.2413,
      "step": 9
    },
    {
      "epoch": 0.012300123001230012,
      "grad_norm": 6.983877182006836,
      "learning_rate": 9.80392156862745e-07,
      "loss": 0.382,
      "step": 10
    },
    {
      "epoch": 0.013530135301353014,
      "grad_norm": 6.066723346710205,
      "learning_rate": 1.0784313725490197e-06,
      "loss": 0.2695,
      "step": 11
    },
    {
      "epoch": 0.014760147601476014,
      "grad_norm": 5.643115520477295,
      "learning_rate": 1.1764705882352942e-06,
      "loss": 0.2392,
      "step": 12
    },
    {
      "epoch": 0.015990159901599015,
      "grad_norm": 6.062892436981201,
      "learning_rate": 1.2745098039215686e-06,
      "loss": 0.3603,
      "step": 13
    },
    {
      "epoch": 0.017220172201722016,
      "grad_norm": 6.2491655349731445,
      "learning_rate": 1.3725490196078434e-06,
      "loss": 0.3282,
      "step": 14
    },
    {
      "epoch": 0.01845018450184502,
      "grad_norm": 6.1164398193359375,
      "learning_rate": 1.4705882352941177e-06,
      "loss": 0.2878,
      "step": 15
    },
    {
      "epoch": 0.01968019680196802,
      "grad_norm": 5.676611423492432,
      "learning_rate": 1.5686274509803923e-06,
      "loss": 0.3046,
      "step": 16
    },
    {
      "epoch": 0.020910209102091022,
      "grad_norm": 7.181272983551025,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 0.3946,
      "step": 17
    },
    {
      "epoch": 0.02214022140221402,
      "grad_norm": 5.430984020233154,
      "learning_rate": 1.7647058823529414e-06,
      "loss": 0.2038,
      "step": 18
    },
    {
      "epoch": 0.023370233702337023,
      "grad_norm": 7.2283220291137695,
      "learning_rate": 1.8627450980392158e-06,
      "loss": 0.3542,
      "step": 19
    },
    {
      "epoch": 0.024600246002460024,
      "grad_norm": 5.587338924407959,
      "learning_rate": 1.96078431372549e-06,
      "loss": 0.2369,
      "step": 20
    },
    {
      "epoch": 0.025830258302583026,
      "grad_norm": 4.456090927124023,
      "learning_rate": 2.058823529411765e-06,
      "loss": 0.1967,
      "step": 21
    },
    {
      "epoch": 0.025830258302583026,
      "eval_loss": 0.14506277441978455,
      "eval_runtime": 54.872,
      "eval_samples_per_second": 27.446,
      "eval_steps_per_second": 0.219,
      "eval_sts-test_pearson_cosine": 0.8860152816653839,
      "eval_sts-test_pearson_dot": 0.8766503125978379,
      "eval_sts-test_pearson_euclidean": 0.9084101290541164,
      "eval_sts-test_pearson_manhattan": 0.909121525028934,
      "eval_sts-test_pearson_max": 0.909121525028934,
      "eval_sts-test_spearman_cosine": 0.9080919696366193,
      "eval_sts-test_spearman_dot": 0.8799434709726907,
      "eval_sts-test_spearman_euclidean": 0.9044399981995129,
      "eval_sts-test_spearman_manhattan": 0.9048055712538192,
      "eval_sts-test_spearman_max": 0.9080919696366193,
      "step": 21
    },
    {
      "epoch": 0.02706027060270603,
      "grad_norm": 6.088884353637695,
      "learning_rate": 2.1568627450980393e-06,
      "loss": 0.2368,
      "step": 22
    },
    {
      "epoch": 0.028290282902829027,
      "grad_norm": 5.354013919830322,
      "learning_rate": 2.254901960784314e-06,
      "loss": 0.263,
      "step": 23
    },
    {
      "epoch": 0.02952029520295203,
      "grad_norm": 7.822023391723633,
      "learning_rate": 2.3529411764705885e-06,
      "loss": 0.3595,
      "step": 24
    },
    {
      "epoch": 0.03075030750307503,
      "grad_norm": 6.401333332061768,
      "learning_rate": 2.450980392156863e-06,
      "loss": 0.3073,
      "step": 25
    },
    {
      "epoch": 0.03198031980319803,
      "grad_norm": 5.567343235015869,
      "learning_rate": 2.549019607843137e-06,
      "loss": 0.2232,
      "step": 26
    },
    {
      "epoch": 0.033210332103321034,
      "grad_norm": 4.244979381561279,
      "learning_rate": 2.647058823529412e-06,
      "loss": 0.1822,
      "step": 27
    },
    {
      "epoch": 0.03444034440344403,
      "grad_norm": 5.674376964569092,
      "learning_rate": 2.7450980392156867e-06,
      "loss": 0.251,
      "step": 28
    },
    {
      "epoch": 0.03567035670356704,
      "grad_norm": 6.017494201660156,
      "learning_rate": 2.843137254901961e-06,
      "loss": 0.2677,
      "step": 29
    },
    {
      "epoch": 0.03690036900369004,
      "grad_norm": 6.415028095245361,
      "learning_rate": 2.9411764705882355e-06,
      "loss": 0.3252,
      "step": 30
    },
    {
      "epoch": 0.038130381303813035,
      "grad_norm": 5.484204292297363,
      "learning_rate": 3.03921568627451e-06,
      "loss": 0.2058,
      "step": 31
    },
    {
      "epoch": 0.03936039360393604,
      "grad_norm": 5.997295379638672,
      "learning_rate": 3.1372549019607846e-06,
      "loss": 0.3083,
      "step": 32
    },
    {
      "epoch": 0.04059040590405904,
      "grad_norm": 5.527047157287598,
      "learning_rate": 3.2352941176470594e-06,
      "loss": 0.2109,
      "step": 33
    },
    {
      "epoch": 0.041820418204182044,
      "grad_norm": 5.817302227020264,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.2751,
      "step": 34
    },
    {
      "epoch": 0.04305043050430504,
      "grad_norm": 5.476433753967285,
      "learning_rate": 3.431372549019608e-06,
      "loss": 0.2269,
      "step": 35
    },
    {
      "epoch": 0.04428044280442804,
      "grad_norm": 5.363610744476318,
      "learning_rate": 3.529411764705883e-06,
      "loss": 0.2333,
      "step": 36
    },
    {
      "epoch": 0.04551045510455105,
      "grad_norm": 6.07395601272583,
      "learning_rate": 3.6274509803921573e-06,
      "loss": 0.2747,
      "step": 37
    },
    {
      "epoch": 0.046740467404674045,
      "grad_norm": 4.726163864135742,
      "learning_rate": 3.7254901960784316e-06,
      "loss": 0.1285,
      "step": 38
    },
    {
      "epoch": 0.04797047970479705,
      "grad_norm": 5.783392906188965,
      "learning_rate": 3.8235294117647055e-06,
      "loss": 0.3659,
      "step": 39
    },
    {
      "epoch": 0.04920049200492005,
      "grad_norm": 6.566931247711182,
      "learning_rate": 3.92156862745098e-06,
      "loss": 0.3991,
      "step": 40
    },
    {
      "epoch": 0.05043050430504305,
      "grad_norm": 5.311452388763428,
      "learning_rate": 4.019607843137255e-06,
      "loss": 0.2647,
      "step": 41
    },
    {
      "epoch": 0.05166051660516605,
      "grad_norm": 6.0737152099609375,
      "learning_rate": 4.11764705882353e-06,
      "loss": 0.3627,
      "step": 42
    },
    {
      "epoch": 0.05166051660516605,
      "eval_loss": 0.1373225301504135,
      "eval_runtime": 54.8187,
      "eval_samples_per_second": 27.472,
      "eval_steps_per_second": 0.219,
      "eval_sts-test_pearson_cosine": 0.8846111050777101,
      "eval_sts-test_pearson_dot": 0.8747554197498655,
      "eval_sts-test_pearson_euclidean": 0.9089352149126115,
      "eval_sts-test_pearson_manhattan": 0.9098483550214526,
      "eval_sts-test_pearson_max": 0.9098483550214526,
      "eval_sts-test_spearman_cosine": 0.9084485029361248,
      "eval_sts-test_spearman_dot": 0.8796038088987298,
      "eval_sts-test_spearman_euclidean": 0.9055790073044468,
      "eval_sts-test_spearman_manhattan": 0.9063848432683216,
      "eval_sts-test_spearman_max": 0.9084485029361248,
      "step": 42
    },
    {
      "epoch": 0.05289052890528905,
      "grad_norm": 4.857839584350586,
      "learning_rate": 4.215686274509805e-06,
      "loss": 0.2026,
      "step": 43
    },
    {
      "epoch": 0.05412054120541206,
      "grad_norm": 5.248873233795166,
      "learning_rate": 4.313725490196079e-06,
      "loss": 0.1923,
      "step": 44
    },
    {
      "epoch": 0.055350553505535055,
      "grad_norm": 5.329862117767334,
      "learning_rate": 4.411764705882353e-06,
      "loss": 0.2369,
      "step": 45
    },
    {
      "epoch": 0.056580565805658053,
      "grad_norm": 5.581146240234375,
      "learning_rate": 4.509803921568628e-06,
      "loss": 0.2268,
      "step": 46
    },
    {
      "epoch": 0.05781057810578106,
      "grad_norm": 5.818411350250244,
      "learning_rate": 4.607843137254902e-06,
      "loss": 0.2975,
      "step": 47
    },
    {
      "epoch": 0.05904059040590406,
      "grad_norm": 5.096602916717529,
      "learning_rate": 4.705882352941177e-06,
      "loss": 0.1922,
      "step": 48
    },
    {
      "epoch": 0.06027060270602706,
      "grad_norm": 5.256355285644531,
      "learning_rate": 4.803921568627452e-06,
      "loss": 0.1906,
      "step": 49
    },
    {
      "epoch": 0.06150061500615006,
      "grad_norm": 5.3927388191223145,
      "learning_rate": 4.901960784313726e-06,
      "loss": 0.2379,
      "step": 50
    },
    {
      "epoch": 0.06273062730627306,
      "grad_norm": 6.2723846435546875,
      "learning_rate": 5e-06,
      "loss": 0.3796,
      "step": 51
    },
    {
      "epoch": 0.06396063960639606,
      "grad_norm": 4.595238208770752,
      "learning_rate": 5.098039215686274e-06,
      "loss": 0.1821,
      "step": 52
    },
    {
      "epoch": 0.06519065190651907,
      "grad_norm": 4.342020511627197,
      "learning_rate": 5.19607843137255e-06,
      "loss": 0.1257,
      "step": 53
    },
    {
      "epoch": 0.06642066420664207,
      "grad_norm": 4.998225212097168,
      "learning_rate": 5.294117647058824e-06,
      "loss": 0.2368,
      "step": 54
    },
    {
      "epoch": 0.06765067650676507,
      "grad_norm": 5.510946273803711,
      "learning_rate": 5.392156862745098e-06,
      "loss": 0.294,
      "step": 55
    },
    {
      "epoch": 0.06888068880688807,
      "grad_norm": 4.788788318634033,
      "learning_rate": 5.4901960784313735e-06,
      "loss": 0.2594,
      "step": 56
    },
    {
      "epoch": 0.07011070110701106,
      "grad_norm": 5.827020645141602,
      "learning_rate": 5.588235294117647e-06,
      "loss": 0.2972,
      "step": 57
    },
    {
      "epoch": 0.07134071340713408,
      "grad_norm": 4.821737289428711,
      "learning_rate": 5.686274509803922e-06,
      "loss": 0.2297,
      "step": 58
    },
    {
      "epoch": 0.07257072570725707,
      "grad_norm": 4.880247592926025,
      "learning_rate": 5.784313725490197e-06,
      "loss": 0.1487,
      "step": 59
    },
    {
      "epoch": 0.07380073800738007,
      "grad_norm": 4.447835445404053,
      "learning_rate": 5.882352941176471e-06,
      "loss": 0.182,
      "step": 60
    },
    {
      "epoch": 0.07503075030750307,
      "grad_norm": 5.5556640625,
      "learning_rate": 5.980392156862746e-06,
      "loss": 0.2516,
      "step": 61
    },
    {
      "epoch": 0.07626076260762607,
      "grad_norm": 5.217922687530518,
      "learning_rate": 6.07843137254902e-06,
      "loss": 0.2809,
      "step": 62
    },
    {
      "epoch": 0.07749077490774908,
      "grad_norm": 4.436608791351318,
      "learning_rate": 6.176470588235295e-06,
      "loss": 0.1371,
      "step": 63
    },
    {
      "epoch": 0.07749077490774908,
      "eval_loss": 0.13080179691314697,
      "eval_runtime": 54.9188,
      "eval_samples_per_second": 27.422,
      "eval_steps_per_second": 0.219,
      "eval_sts-test_pearson_cosine": 0.882074513745531,
      "eval_sts-test_pearson_dot": 0.8709046425878566,
      "eval_sts-test_pearson_euclidean": 0.9081794284297221,
      "eval_sts-test_pearson_manhattan": 0.9093974331692458,
      "eval_sts-test_pearson_max": 0.9093974331692458,
      "eval_sts-test_spearman_cosine": 0.9067824582257844,
      "eval_sts-test_spearman_dot": 0.8757477717096785,
      "eval_sts-test_spearman_euclidean": 0.9051085820447002,
      "eval_sts-test_spearman_manhattan": 0.9064308923162935,
      "eval_sts-test_spearman_max": 0.9067824582257844,
      "step": 63
    },
    {
      "epoch": 0.07872078720787208,
      "grad_norm": 5.6947021484375,
      "learning_rate": 6.274509803921569e-06,
      "loss": 0.2149,
      "step": 64
    },
    {
      "epoch": 0.07995079950799508,
      "grad_norm": 4.272282600402832,
      "learning_rate": 6.372549019607843e-06,
      "loss": 0.1806,
      "step": 65
    },
    {
      "epoch": 0.08118081180811808,
      "grad_norm": 4.575979232788086,
      "learning_rate": 6.470588235294119e-06,
      "loss": 0.1458,
      "step": 66
    },
    {
      "epoch": 0.08241082410824108,
      "grad_norm": 4.315216541290283,
      "learning_rate": 6.568627450980393e-06,
      "loss": 0.249,
      "step": 67
    },
    {
      "epoch": 0.08364083640836409,
      "grad_norm": 5.67277193069458,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.2787,
      "step": 68
    },
    {
      "epoch": 0.08487084870848709,
      "grad_norm": 5.964886665344238,
      "learning_rate": 6.764705882352942e-06,
      "loss": 0.288,
      "step": 69
    },
    {
      "epoch": 0.08610086100861009,
      "grad_norm": 4.218502521514893,
      "learning_rate": 6.862745098039216e-06,
      "loss": 0.1461,
      "step": 70
    },
    {
      "epoch": 0.08733087330873308,
      "grad_norm": 5.179543972015381,
      "learning_rate": 6.96078431372549e-06,
      "loss": 0.2304,
      "step": 71
    },
    {
      "epoch": 0.08856088560885608,
      "grad_norm": 5.720668792724609,
      "learning_rate": 7.058823529411766e-06,
      "loss": 0.3505,
      "step": 72
    },
    {
      "epoch": 0.0897908979089791,
      "grad_norm": 5.2965497970581055,
      "learning_rate": 7.15686274509804e-06,
      "loss": 0.2227,
      "step": 73
    },
    {
      "epoch": 0.0910209102091021,
      "grad_norm": 4.685606956481934,
      "learning_rate": 7.2549019607843145e-06,
      "loss": 0.1746,
      "step": 74
    },
    {
      "epoch": 0.09225092250922509,
      "grad_norm": 4.2930145263671875,
      "learning_rate": 7.352941176470589e-06,
      "loss": 0.1484,
      "step": 75
    },
    {
      "epoch": 0.09348093480934809,
      "grad_norm": 3.764916181564331,
      "learning_rate": 7.450980392156863e-06,
      "loss": 0.1346,
      "step": 76
    },
    {
      "epoch": 0.09471094710947109,
      "grad_norm": 5.033151626586914,
      "learning_rate": 7.549019607843138e-06,
      "loss": 0.2112,
      "step": 77
    },
    {
      "epoch": 0.0959409594095941,
      "grad_norm": 5.817330837249756,
      "learning_rate": 7.647058823529411e-06,
      "loss": 0.3138,
      "step": 78
    },
    {
      "epoch": 0.0971709717097171,
      "grad_norm": 6.147035121917725,
      "learning_rate": 7.745098039215687e-06,
      "loss": 0.2675,
      "step": 79
    },
    {
      "epoch": 0.0984009840098401,
      "grad_norm": 5.131881237030029,
      "learning_rate": 7.84313725490196e-06,
      "loss": 0.2849,
      "step": 80
    },
    {
      "epoch": 0.0996309963099631,
      "grad_norm": 4.2269368171691895,
      "learning_rate": 7.941176470588236e-06,
      "loss": 0.1719,
      "step": 81
    },
    {
      "epoch": 0.1008610086100861,
      "grad_norm": 5.200590133666992,
      "learning_rate": 8.03921568627451e-06,
      "loss": 0.2749,
      "step": 82
    }
  ],
  "logging_steps": 1,
  "max_steps": 813,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 82,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 320,
  "trial_name": null,
  "trial_params": null
}