bobox committed on
Commit
74ccd24
·
verified ·
1 Parent(s): 2dc6366

Training in progress, step 16, checkpoint

Browse files
checkpoint-16/1_WeightedPooling/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 768,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false,
7
+ "pooling_mode_weightedmean_tokens": false,
8
+ "pooling_mode_lasttoken": false,
9
+ "include_prompt": true
10
+ }
checkpoint-16/README.md ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-16/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
checkpoint-16/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "bobox/DeBERTa-small-ST-v1-test-step3",
3
+ "architectures": [
4
+ "DebertaV2Model"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 768,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 3072,
12
+ "layer_norm_eps": 1e-07,
13
+ "max_position_embeddings": 512,
14
+ "max_relative_positions": -1,
15
+ "model_type": "deberta-v2",
16
+ "norm_rel_ebd": "layer_norm",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 6,
19
+ "pad_token_id": 0,
20
+ "pooler_dropout": 0,
21
+ "pooler_hidden_act": "gelu",
22
+ "pooler_hidden_size": 768,
23
+ "pos_att_type": [
24
+ "p2c",
25
+ "c2p"
26
+ ],
27
+ "position_biased_input": false,
28
+ "position_buckets": 256,
29
+ "relative_attention": true,
30
+ "share_att_key": true,
31
+ "torch_dtype": "float32",
32
+ "transformers_version": "4.44.0",
33
+ "type_vocab_size": 0,
34
+ "vocab_size": 128100
35
+ }
checkpoint-16/config_sentence_transformers.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "3.0.1",
4
+ "transformers": "4.44.0",
5
+ "pytorch": "2.4.0"
6
+ },
7
+ "prompts": {},
8
+ "default_prompt_name": null,
9
+ "similarity_fn_name": null
10
+ }
checkpoint-16/modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_WeightedPooling",
12
+ "type": "__main__"
13
+ }
14
+ ]
checkpoint-16/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0228fcfe6bd17ac2e3d96fa8d77027b57317e1f6b01900264495260ab456fb78
3
+ size 9475334
checkpoint-16/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1886a286545822d70cd17732b6025a52902e7429a8ab7eec28bf9c1a4f143b6c
3
+ size 565251810
checkpoint-16/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16a5d40a52c77f32b888879c57abab15e00b792547e1b742b08bc549b488afdc
3
+ size 14244
checkpoint-16/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ce25298e4ba9c66fbb9b17ad0654fa9bed5e59f0a04e77e53fb8ac523b4a69c
3
+ size 1064
checkpoint-16/sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 512,
3
+ "do_lower_case": false
4
+ }
checkpoint-16/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "[CLS]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "[SEP]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "[MASK]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "[SEP]",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "[UNK]",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
checkpoint-16/spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
3
+ size 2464616
checkpoint-16/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-16/tokenizer_config.json ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "128000": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "[CLS]",
45
+ "clean_up_tokenization_spaces": true,
46
+ "cls_token": "[CLS]",
47
+ "do_lower_case": false,
48
+ "eos_token": "[SEP]",
49
+ "mask_token": "[MASK]",
50
+ "max_length": 512,
51
+ "model_max_length": 512,
52
+ "pad_to_multiple_of": null,
53
+ "pad_token": "[PAD]",
54
+ "pad_token_type_id": 0,
55
+ "padding_side": "right",
56
+ "sep_token": "[SEP]",
57
+ "sp_model_kwargs": {},
58
+ "split_by_punct": false,
59
+ "stride": 0,
60
+ "tokenizer_class": "DebertaV2Tokenizer",
61
+ "truncation_side": "right",
62
+ "truncation_strategy": "longest_first",
63
+ "unk_token": "[UNK]",
64
+ "vocab_type": "spm"
65
+ }
checkpoint-16/trainer_state.json ADDED
@@ -0,0 +1,531 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.1553398058252427,
5
+ "eval_steps": 8,
6
+ "global_step": 16,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.009708737864077669,
13
+ "grad_norm": 0.38736554980278015,
14
+ "learning_rate": 4.411764705882353e-07,
15
+ "loss": 0.1607,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.019417475728155338,
20
+ "grad_norm": 0.38099536299705505,
21
+ "learning_rate": 8.823529411764706e-07,
22
+ "loss": 0.1664,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.02912621359223301,
27
+ "grad_norm": 0.2045232504606247,
28
+ "learning_rate": 1.323529411764706e-06,
29
+ "loss": 0.2686,
30
+ "step": 3
31
+ },
32
+ {
33
+ "epoch": 0.038834951456310676,
34
+ "grad_norm": 0.370655357837677,
35
+ "learning_rate": 1.7647058823529412e-06,
36
+ "loss": 0.1656,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.04854368932038835,
41
+ "grad_norm": 0.35929468274116516,
42
+ "learning_rate": 2.2058823529411767e-06,
43
+ "loss": 0.1269,
44
+ "step": 5
45
+ },
46
+ {
47
+ "epoch": 0.05825242718446602,
48
+ "grad_norm": 0.2196977585554123,
49
+ "learning_rate": 2.647058823529412e-06,
50
+ "loss": 0.1066,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 0.06796116504854369,
55
+ "grad_norm": 0.6120501756668091,
56
+ "learning_rate": 3.0882352941176476e-06,
57
+ "loss": 0.1936,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.07766990291262135,
62
+ "grad_norm": 0.7004581689834595,
63
+ "learning_rate": 3.5294117647058825e-06,
64
+ "loss": 0.087,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.07766990291262135,
69
+ "eval_Qnli-dev_cosine_accuracy": 0.6953125,
70
+ "eval_Qnli-dev_cosine_accuracy_threshold": 0.6691694855690002,
71
+ "eval_Qnli-dev_cosine_ap": 0.7181099755520133,
72
+ "eval_Qnli-dev_cosine_f1": 0.6764705882352942,
73
+ "eval_Qnli-dev_cosine_f1_threshold": 0.49676012992858887,
74
+ "eval_Qnli-dev_cosine_precision": 0.550531914893617,
75
+ "eval_Qnli-dev_cosine_recall": 0.8771186440677966,
76
+ "eval_Qnli-dev_dot_accuracy": 0.669921875,
77
+ "eval_Qnli-dev_dot_accuracy_threshold": 308.413330078125,
78
+ "eval_Qnli-dev_dot_ap": 0.678634155184842,
79
+ "eval_Qnli-dev_dot_f1": 0.6698872785829306,
80
+ "eval_Qnli-dev_dot_f1_threshold": 208.48468017578125,
81
+ "eval_Qnli-dev_dot_precision": 0.5402597402597402,
82
+ "eval_Qnli-dev_dot_recall": 0.8813559322033898,
83
+ "eval_Qnli-dev_euclidean_accuracy": 0.708984375,
84
+ "eval_Qnli-dev_euclidean_accuracy_threshold": 17.320636749267578,
85
+ "eval_Qnli-dev_euclidean_ap": 0.7302104938839877,
86
+ "eval_Qnli-dev_euclidean_f1": 0.6845637583892616,
87
+ "eval_Qnli-dev_euclidean_f1_threshold": 20.530838012695312,
88
+ "eval_Qnli-dev_euclidean_precision": 0.5666666666666667,
89
+ "eval_Qnli-dev_euclidean_recall": 0.864406779661017,
90
+ "eval_Qnli-dev_manhattan_accuracy": 0.7109375,
91
+ "eval_Qnli-dev_manhattan_accuracy_threshold": 369.90869140625,
92
+ "eval_Qnli-dev_manhattan_ap": 0.732538057005555,
93
+ "eval_Qnli-dev_manhattan_f1": 0.6836518046709129,
94
+ "eval_Qnli-dev_manhattan_f1_threshold": 372.4708251953125,
95
+ "eval_Qnli-dev_manhattan_precision": 0.6851063829787234,
96
+ "eval_Qnli-dev_manhattan_recall": 0.6822033898305084,
97
+ "eval_Qnli-dev_max_accuracy": 0.7109375,
98
+ "eval_Qnli-dev_max_accuracy_threshold": 369.90869140625,
99
+ "eval_Qnli-dev_max_ap": 0.732538057005555,
100
+ "eval_Qnli-dev_max_f1": 0.6845637583892616,
101
+ "eval_Qnli-dev_max_f1_threshold": 372.4708251953125,
102
+ "eval_Qnli-dev_max_precision": 0.6851063829787234,
103
+ "eval_Qnli-dev_max_recall": 0.8813559322033898,
104
+ "eval_allNLI-dev_cosine_accuracy": 0.740234375,
105
+ "eval_allNLI-dev_cosine_accuracy_threshold": 0.7844594717025757,
106
+ "eval_allNLI-dev_cosine_ap": 0.6356383940181094,
107
+ "eval_allNLI-dev_cosine_f1": 0.6369710467706012,
108
+ "eval_allNLI-dev_cosine_f1_threshold": 0.5889398455619812,
109
+ "eval_allNLI-dev_cosine_precision": 0.5181159420289855,
110
+ "eval_allNLI-dev_cosine_recall": 0.8265895953757225,
111
+ "eval_allNLI-dev_dot_accuracy": 0.72265625,
112
+ "eval_allNLI-dev_dot_accuracy_threshold": 294.92120361328125,
113
+ "eval_allNLI-dev_dot_ap": 0.5914839688879726,
114
+ "eval_allNLI-dev_dot_f1": 0.6099290780141845,
115
+ "eval_allNLI-dev_dot_f1_threshold": 246.92991638183594,
116
+ "eval_allNLI-dev_dot_precision": 0.516,
117
+ "eval_allNLI-dev_dot_recall": 0.7456647398843931,
118
+ "eval_allNLI-dev_euclidean_accuracy": 0.744140625,
119
+ "eval_allNLI-dev_euclidean_accuracy_threshold": 13.74991226196289,
120
+ "eval_allNLI-dev_euclidean_ap": 0.6412989964435845,
121
+ "eval_allNLI-dev_euclidean_f1": 0.6464208242950108,
122
+ "eval_allNLI-dev_euclidean_f1_threshold": 18.407054901123047,
123
+ "eval_allNLI-dev_euclidean_precision": 0.5173611111111112,
124
+ "eval_allNLI-dev_euclidean_recall": 0.861271676300578,
125
+ "eval_allNLI-dev_manhattan_accuracy": 0.744140625,
126
+ "eval_allNLI-dev_manhattan_accuracy_threshold": 313.8516540527344,
127
+ "eval_allNLI-dev_manhattan_ap": 0.6416934437743762,
128
+ "eval_allNLI-dev_manhattan_f1": 0.655,
129
+ "eval_allNLI-dev_manhattan_f1_threshold": 361.0255126953125,
130
+ "eval_allNLI-dev_manhattan_precision": 0.5770925110132159,
131
+ "eval_allNLI-dev_manhattan_recall": 0.7572254335260116,
132
+ "eval_allNLI-dev_max_accuracy": 0.744140625,
133
+ "eval_allNLI-dev_max_accuracy_threshold": 313.8516540527344,
134
+ "eval_allNLI-dev_max_ap": 0.6416934437743762,
135
+ "eval_allNLI-dev_max_f1": 0.655,
136
+ "eval_allNLI-dev_max_f1_threshold": 361.0255126953125,
137
+ "eval_allNLI-dev_max_precision": 0.5770925110132159,
138
+ "eval_allNLI-dev_max_recall": 0.861271676300578,
139
+ "eval_sequential_score": 0.732538057005555,
140
+ "eval_sts-test_pearson_cosine": 0.8858982884210256,
141
+ "eval_sts-test_pearson_dot": 0.8750202743648935,
142
+ "eval_sts-test_pearson_euclidean": 0.9073481145616336,
143
+ "eval_sts-test_pearson_manhattan": 0.908032029485236,
144
+ "eval_sts-test_pearson_max": 0.908032029485236,
145
+ "eval_sts-test_spearman_cosine": 0.9075027297778789,
146
+ "eval_sts-test_spearman_dot": 0.8759972599919353,
147
+ "eval_sts-test_spearman_euclidean": 0.9040041656648221,
148
+ "eval_sts-test_spearman_manhattan": 0.9045863348242628,
149
+ "eval_sts-test_spearman_max": 0.9075027297778789,
150
+ "eval_vitaminc-pairs_loss": 2.801919460296631,
151
+ "eval_vitaminc-pairs_runtime": 3.8986,
152
+ "eval_vitaminc-pairs_samples_per_second": 32.832,
153
+ "eval_vitaminc-pairs_steps_per_second": 0.257,
154
+ "step": 8
155
+ },
156
+ {
157
+ "epoch": 0.07766990291262135,
158
+ "eval_negation-triplets_loss": 1.355710744857788,
159
+ "eval_negation-triplets_runtime": 0.639,
160
+ "eval_negation-triplets_samples_per_second": 200.304,
161
+ "eval_negation-triplets_steps_per_second": 1.565,
162
+ "step": 8
163
+ },
164
+ {
165
+ "epoch": 0.07766990291262135,
166
+ "eval_scitail-pairs-pos_loss": 0.06071745231747627,
167
+ "eval_scitail-pairs-pos_runtime": 0.9322,
168
+ "eval_scitail-pairs-pos_samples_per_second": 137.311,
169
+ "eval_scitail-pairs-pos_steps_per_second": 1.073,
170
+ "step": 8
171
+ },
172
+ {
173
+ "epoch": 0.07766990291262135,
174
+ "eval_scitail-pairs-qa_loss": 1.0544098927312007e-07,
175
+ "eval_scitail-pairs-qa_runtime": 0.5629,
176
+ "eval_scitail-pairs-qa_samples_per_second": 227.378,
177
+ "eval_scitail-pairs-qa_steps_per_second": 1.776,
178
+ "step": 8
179
+ },
180
+ {
181
+ "epoch": 0.07766990291262135,
182
+ "eval_xsum-pairs_loss": 0.00016850717656780034,
183
+ "eval_xsum-pairs_runtime": 2.9256,
184
+ "eval_xsum-pairs_samples_per_second": 43.751,
185
+ "eval_xsum-pairs_steps_per_second": 0.342,
186
+ "step": 8
187
+ },
188
+ {
189
+ "epoch": 0.07766990291262135,
190
+ "eval_sciq_pairs_loss": 0.0582379512488842,
191
+ "eval_sciq_pairs_runtime": 2.5278,
192
+ "eval_sciq_pairs_samples_per_second": 34.418,
193
+ "eval_sciq_pairs_steps_per_second": 0.396,
194
+ "step": 8
195
+ },
196
+ {
197
+ "epoch": 0.07766990291262135,
198
+ "eval_qasc_pairs_loss": 0.07362987846136093,
199
+ "eval_qasc_pairs_runtime": 0.3723,
200
+ "eval_qasc_pairs_samples_per_second": 233.691,
201
+ "eval_qasc_pairs_steps_per_second": 2.686,
202
+ "step": 8
203
+ },
204
+ {
205
+ "epoch": 0.07766990291262135,
206
+ "eval_openbookqa_pairs_loss": 1.3787189722061157,
207
+ "eval_openbookqa_pairs_runtime": 0.5744,
208
+ "eval_openbookqa_pairs_samples_per_second": 222.849,
209
+ "eval_openbookqa_pairs_steps_per_second": 1.741,
210
+ "step": 8
211
+ },
212
+ {
213
+ "epoch": 0.07766990291262135,
214
+ "eval_msmarco_pairs_loss": 0.1278172731399536,
215
+ "eval_msmarco_pairs_runtime": 1.0983,
216
+ "eval_msmarco_pairs_samples_per_second": 79.215,
217
+ "eval_msmarco_pairs_steps_per_second": 0.911,
218
+ "step": 8
219
+ },
220
+ {
221
+ "epoch": 0.07766990291262135,
222
+ "eval_nq_pairs_loss": 0.0051126074977219105,
223
+ "eval_nq_pairs_runtime": 2.2469,
224
+ "eval_nq_pairs_samples_per_second": 38.719,
225
+ "eval_nq_pairs_steps_per_second": 0.445,
226
+ "step": 8
227
+ },
228
+ {
229
+ "epoch": 0.07766990291262135,
230
+ "eval_trivia_pairs_loss": 0.26755890250205994,
231
+ "eval_trivia_pairs_runtime": 2.4342,
232
+ "eval_trivia_pairs_samples_per_second": 41.081,
233
+ "eval_trivia_pairs_steps_per_second": 0.411,
234
+ "step": 8
235
+ },
236
+ {
237
+ "epoch": 0.07766990291262135,
238
+ "eval_gooaq_pairs_loss": 0.05755230411887169,
239
+ "eval_gooaq_pairs_runtime": 0.5805,
240
+ "eval_gooaq_pairs_samples_per_second": 149.876,
241
+ "eval_gooaq_pairs_steps_per_second": 1.723,
242
+ "step": 8
243
+ },
244
+ {
245
+ "epoch": 0.07766990291262135,
246
+ "eval_paws-pos_loss": 0.04592936113476753,
247
+ "eval_paws-pos_runtime": 0.6903,
248
+ "eval_paws-pos_samples_per_second": 185.415,
249
+ "eval_paws-pos_steps_per_second": 1.449,
250
+ "step": 8
251
+ },
252
+ {
253
+ "epoch": 0.07766990291262135,
254
+ "eval_global_dataset_loss": 0.4901905357837677,
255
+ "eval_global_dataset_runtime": 25.0307,
256
+ "eval_global_dataset_samples_per_second": 26.487,
257
+ "eval_global_dataset_steps_per_second": 0.24,
258
+ "step": 8
259
+ },
260
+ {
261
+ "epoch": 0.08737864077669903,
262
+ "grad_norm": 1.2741845846176147,
263
+ "learning_rate": 3.970588235294118e-06,
264
+ "loss": 0.1952,
265
+ "step": 9
266
+ },
267
+ {
268
+ "epoch": 0.0970873786407767,
269
+ "grad_norm": 0.46481379866600037,
270
+ "learning_rate": 4.411764705882353e-06,
271
+ "loss": 0.4167,
272
+ "step": 10
273
+ },
274
+ {
275
+ "epoch": 0.10679611650485436,
276
+ "grad_norm": 0.7963629961013794,
277
+ "learning_rate": 4.852941176470589e-06,
278
+ "loss": 0.7876,
279
+ "step": 11
280
+ },
281
+ {
282
+ "epoch": 0.11650485436893204,
283
+ "grad_norm": 0.6939969658851624,
284
+ "learning_rate": 5.294117647058824e-06,
285
+ "loss": 0.3714,
286
+ "step": 12
287
+ },
288
+ {
289
+ "epoch": 0.1262135922330097,
290
+ "grad_norm": 0.44538313150405884,
291
+ "learning_rate": 5.735294117647059e-06,
292
+ "loss": 0.1852,
293
+ "step": 13
294
+ },
295
+ {
296
+ "epoch": 0.13592233009708737,
297
+ "grad_norm": 0.3318534791469574,
298
+ "learning_rate": 6.176470588235295e-06,
299
+ "loss": 0.1144,
300
+ "step": 14
301
+ },
302
+ {
303
+ "epoch": 0.14563106796116504,
304
+ "grad_norm": 0.4242781698703766,
305
+ "learning_rate": 6.61764705882353e-06,
306
+ "loss": 0.1234,
307
+ "step": 15
308
+ },
309
+ {
310
+ "epoch": 0.1553398058252427,
311
+ "grad_norm": 0.2060050666332245,
312
+ "learning_rate": 7.058823529411765e-06,
313
+ "loss": 0.0569,
314
+ "step": 16
315
+ },
316
+ {
317
+ "epoch": 0.1553398058252427,
318
+ "eval_Qnli-dev_cosine_accuracy": 0.6953125,
319
+ "eval_Qnli-dev_cosine_accuracy_threshold": 0.6684989929199219,
320
+ "eval_Qnli-dev_cosine_ap": 0.7189160511154118,
321
+ "eval_Qnli-dev_cosine_f1": 0.6764705882352942,
322
+ "eval_Qnli-dev_cosine_f1_threshold": 0.49889329075813293,
323
+ "eval_Qnli-dev_cosine_precision": 0.550531914893617,
324
+ "eval_Qnli-dev_cosine_recall": 0.8771186440677966,
325
+ "eval_Qnli-dev_dot_accuracy": 0.66796875,
326
+ "eval_Qnli-dev_dot_accuracy_threshold": 303.81427001953125,
327
+ "eval_Qnli-dev_dot_ap": 0.6791091476516111,
328
+ "eval_Qnli-dev_dot_f1": 0.6697247706422018,
329
+ "eval_Qnli-dev_dot_f1_threshold": 184.08914184570312,
330
+ "eval_Qnli-dev_dot_precision": 0.5239234449760766,
331
+ "eval_Qnli-dev_dot_recall": 0.9279661016949152,
332
+ "eval_Qnli-dev_euclidean_accuracy": 0.7109375,
333
+ "eval_Qnli-dev_euclidean_accuracy_threshold": 17.21223258972168,
334
+ "eval_Qnli-dev_euclidean_ap": 0.7305713530476123,
335
+ "eval_Qnli-dev_euclidean_f1": 0.6848739495798319,
336
+ "eval_Qnli-dev_euclidean_f1_threshold": 17.62983512878418,
337
+ "eval_Qnli-dev_euclidean_precision": 0.6791666666666667,
338
+ "eval_Qnli-dev_euclidean_recall": 0.690677966101695,
339
+ "eval_Qnli-dev_manhattan_accuracy": 0.712890625,
340
+ "eval_Qnli-dev_manhattan_accuracy_threshold": 367.2698974609375,
341
+ "eval_Qnli-dev_manhattan_ap": 0.7327153618467641,
342
+ "eval_Qnli-dev_manhattan_f1": 0.6824034334763949,
343
+ "eval_Qnli-dev_manhattan_f1_threshold": 368.1672058105469,
344
+ "eval_Qnli-dev_manhattan_precision": 0.691304347826087,
345
+ "eval_Qnli-dev_manhattan_recall": 0.673728813559322,
346
+ "eval_Qnli-dev_max_accuracy": 0.712890625,
347
+ "eval_Qnli-dev_max_accuracy_threshold": 367.2698974609375,
348
+ "eval_Qnli-dev_max_ap": 0.7327153618467641,
349
+ "eval_Qnli-dev_max_f1": 0.6848739495798319,
350
+ "eval_Qnli-dev_max_f1_threshold": 368.1672058105469,
351
+ "eval_Qnli-dev_max_precision": 0.691304347826087,
352
+ "eval_Qnli-dev_max_recall": 0.9279661016949152,
353
+ "eval_allNLI-dev_cosine_accuracy": 0.7421875,
354
+ "eval_allNLI-dev_cosine_accuracy_threshold": 0.7840416431427002,
355
+ "eval_allNLI-dev_cosine_ap": 0.6349487371750362,
356
+ "eval_allNLI-dev_cosine_f1": 0.6367713004484306,
357
+ "eval_allNLI-dev_cosine_f1_threshold": 0.5954304337501526,
358
+ "eval_allNLI-dev_cosine_precision": 0.5201465201465202,
359
+ "eval_allNLI-dev_cosine_recall": 0.8208092485549133,
360
+ "eval_allNLI-dev_dot_accuracy": 0.724609375,
361
+ "eval_allNLI-dev_dot_accuracy_threshold": 291.18670654296875,
362
+ "eval_allNLI-dev_dot_ap": 0.5913303711668598,
363
+ "eval_allNLI-dev_dot_f1": 0.6070588235294118,
364
+ "eval_allNLI-dev_dot_f1_threshold": 242.49884033203125,
365
+ "eval_allNLI-dev_dot_precision": 0.5119047619047619,
366
+ "eval_allNLI-dev_dot_recall": 0.7456647398843931,
367
+ "eval_allNLI-dev_euclidean_accuracy": 0.744140625,
368
+ "eval_allNLI-dev_euclidean_accuracy_threshold": 13.559854507446289,
369
+ "eval_allNLI-dev_euclidean_ap": 0.6408865358113234,
370
+ "eval_allNLI-dev_euclidean_f1": 0.6450116009280741,
371
+ "eval_allNLI-dev_euclidean_f1_threshold": 17.65105628967285,
372
+ "eval_allNLI-dev_euclidean_precision": 0.5387596899224806,
373
+ "eval_allNLI-dev_euclidean_recall": 0.8034682080924855,
374
+ "eval_allNLI-dev_manhattan_accuracy": 0.7421875,
375
+ "eval_allNLI-dev_manhattan_accuracy_threshold": 307.83563232421875,
376
+ "eval_allNLI-dev_manhattan_ap": 0.6414745653251337,
377
+ "eval_allNLI-dev_manhattan_f1": 0.6533665835411472,
378
+ "eval_allNLI-dev_manhattan_f1_threshold": 357.5172119140625,
379
+ "eval_allNLI-dev_manhattan_precision": 0.5745614035087719,
380
+ "eval_allNLI-dev_manhattan_recall": 0.7572254335260116,
381
+ "eval_allNLI-dev_max_accuracy": 0.744140625,
382
+ "eval_allNLI-dev_max_accuracy_threshold": 307.83563232421875,
383
+ "eval_allNLI-dev_max_ap": 0.6414745653251337,
384
+ "eval_allNLI-dev_max_f1": 0.6533665835411472,
385
+ "eval_allNLI-dev_max_f1_threshold": 357.5172119140625,
386
+ "eval_allNLI-dev_max_precision": 0.5745614035087719,
387
+ "eval_allNLI-dev_max_recall": 0.8208092485549133,
388
+ "eval_sequential_score": 0.7327153618467641,
389
+ "eval_sts-test_pearson_cosine": 0.8857317653883374,
390
+ "eval_sts-test_pearson_dot": 0.8740754819923455,
391
+ "eval_sts-test_pearson_euclidean": 0.9064480556463691,
392
+ "eval_sts-test_pearson_manhattan": 0.9071116458208927,
393
+ "eval_sts-test_pearson_max": 0.9071116458208927,
394
+ "eval_sts-test_spearman_cosine": 0.9078527293932209,
395
+ "eval_sts-test_spearman_dot": 0.8744967501974819,
396
+ "eval_sts-test_spearman_euclidean": 0.9030452803869885,
397
+ "eval_sts-test_spearman_manhattan": 0.903667904686727,
398
+ "eval_sts-test_spearman_max": 0.9078527293932209,
399
+ "eval_vitaminc-pairs_loss": 2.799455165863037,
400
+ "eval_vitaminc-pairs_runtime": 3.8555,
401
+ "eval_vitaminc-pairs_samples_per_second": 33.2,
402
+ "eval_vitaminc-pairs_steps_per_second": 0.259,
403
+ "step": 16
404
+ },
405
+ {
406
+ "epoch": 0.1553398058252427,
407
+ "eval_negation-triplets_loss": 1.3501245975494385,
408
+ "eval_negation-triplets_runtime": 0.6331,
409
+ "eval_negation-triplets_samples_per_second": 202.181,
410
+ "eval_negation-triplets_steps_per_second": 1.58,
411
+ "step": 16
412
+ },
413
+ {
414
+ "epoch": 0.1553398058252427,
415
+ "eval_scitail-pairs-pos_loss": 0.060843899846076965,
416
+ "eval_scitail-pairs-pos_runtime": 0.9131,
417
+ "eval_scitail-pairs-pos_samples_per_second": 140.183,
418
+ "eval_scitail-pairs-pos_steps_per_second": 1.095,
419
+ "step": 16
420
+ },
421
+ {
422
+ "epoch": 0.1553398058252427,
423
+ "eval_scitail-pairs-qa_loss": 1.1229322893768767e-07,
424
+ "eval_scitail-pairs-qa_runtime": 0.5691,
425
+ "eval_scitail-pairs-qa_samples_per_second": 224.9,
426
+ "eval_scitail-pairs-qa_steps_per_second": 1.757,
427
+ "step": 16
428
+ },
429
+ {
430
+ "epoch": 0.1553398058252427,
431
+ "eval_xsum-pairs_loss": 0.0001665251620579511,
432
+ "eval_xsum-pairs_runtime": 2.9369,
433
+ "eval_xsum-pairs_samples_per_second": 43.584,
434
+ "eval_xsum-pairs_steps_per_second": 0.34,
435
+ "step": 16
436
+ },
437
+ {
438
+ "epoch": 0.1553398058252427,
439
+ "eval_sciq_pairs_loss": 0.05696266517043114,
440
+ "eval_sciq_pairs_runtime": 2.5397,
441
+ "eval_sciq_pairs_samples_per_second": 34.256,
442
+ "eval_sciq_pairs_steps_per_second": 0.394,
443
+ "step": 16
444
+ },
445
+ {
446
+ "epoch": 0.1553398058252427,
447
+ "eval_qasc_pairs_loss": 0.07378190755844116,
448
+ "eval_qasc_pairs_runtime": 0.3731,
449
+ "eval_qasc_pairs_samples_per_second": 233.155,
450
+ "eval_qasc_pairs_steps_per_second": 2.68,
451
+ "step": 16
452
+ },
453
+ {
454
+ "epoch": 0.1553398058252427,
455
+ "eval_openbookqa_pairs_loss": 1.3709176778793335,
456
+ "eval_openbookqa_pairs_runtime": 0.5704,
457
+ "eval_openbookqa_pairs_samples_per_second": 224.402,
458
+ "eval_openbookqa_pairs_steps_per_second": 1.753,
459
+ "step": 16
460
+ },
461
+ {
462
+ "epoch": 0.1553398058252427,
463
+ "eval_msmarco_pairs_loss": 0.13161690533161163,
464
+ "eval_msmarco_pairs_runtime": 1.0994,
465
+ "eval_msmarco_pairs_samples_per_second": 79.136,
466
+ "eval_msmarco_pairs_steps_per_second": 0.91,
467
+ "step": 16
468
+ },
469
+ {
470
+ "epoch": 0.1553398058252427,
471
+ "eval_nq_pairs_loss": 0.005713976453989744,
472
+ "eval_nq_pairs_runtime": 2.2521,
473
+ "eval_nq_pairs_samples_per_second": 38.631,
474
+ "eval_nq_pairs_steps_per_second": 0.444,
475
+ "step": 16
476
+ },
477
+ {
478
+ "epoch": 0.1553398058252427,
479
+ "eval_trivia_pairs_loss": 0.2678990364074707,
480
+ "eval_trivia_pairs_runtime": 2.4403,
481
+ "eval_trivia_pairs_samples_per_second": 40.979,
482
+ "eval_trivia_pairs_steps_per_second": 0.41,
483
+ "step": 16
484
+ },
485
+ {
486
+ "epoch": 0.1553398058252427,
487
+ "eval_gooaq_pairs_loss": 0.05776378884911537,
488
+ "eval_gooaq_pairs_runtime": 0.5822,
489
+ "eval_gooaq_pairs_samples_per_second": 149.421,
490
+ "eval_gooaq_pairs_steps_per_second": 1.717,
491
+ "step": 16
492
+ },
493
+ {
494
+ "epoch": 0.1553398058252427,
495
+ "eval_paws-pos_loss": 0.04598955065011978,
496
+ "eval_paws-pos_runtime": 0.6747,
497
+ "eval_paws-pos_samples_per_second": 189.727,
498
+ "eval_paws-pos_steps_per_second": 1.482,
499
+ "step": 16
500
+ },
501
+ {
502
+ "epoch": 0.1553398058252427,
503
+ "eval_global_dataset_loss": 0.48036298155784607,
504
+ "eval_global_dataset_runtime": 25.0731,
505
+ "eval_global_dataset_samples_per_second": 26.443,
506
+ "eval_global_dataset_steps_per_second": 0.239,
507
+ "step": 16
508
+ }
509
+ ],
510
+ "logging_steps": 1,
511
+ "max_steps": 309,
512
+ "num_input_tokens_seen": 0,
513
+ "num_train_epochs": 3,
514
+ "save_steps": 16,
515
+ "stateful_callbacks": {
516
+ "TrainerControl": {
517
+ "args": {
518
+ "should_epoch_stop": false,
519
+ "should_evaluate": false,
520
+ "should_log": false,
521
+ "should_save": true,
522
+ "should_training_stop": false
523
+ },
524
+ "attributes": {}
525
+ }
526
+ },
527
+ "total_flos": 0.0,
528
+ "train_batch_size": 96,
529
+ "trial_name": null,
530
+ "trial_params": null
531
+ }
checkpoint-16/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00d891109b3776bbe6fe5299eaca62fc99db8369e44011108f84f516eaa2f0e0
3
+ size 5624