bobox committed on
Commit
a47a121
·
verified ·
1 Parent(s): ac65dff

Training in progress, step 1277, checkpoint

Browse files
checkpoint-1277/1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 768,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false,
7
+ "pooling_mode_weightedmean_tokens": false,
8
+ "pooling_mode_lasttoken": false,
9
+ "include_prompt": true
10
+ }
checkpoint-1277/README.md ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1277/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
checkpoint-1277/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "bobox/DeBERTa-ST-AllLayers-v3-checkpoints-tmp",
3
+ "architectures": [
4
+ "DebertaV2Model"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 768,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 3072,
12
+ "layer_norm_eps": 1e-07,
13
+ "max_position_embeddings": 512,
14
+ "max_relative_positions": -1,
15
+ "model_type": "deberta-v2",
16
+ "norm_rel_ebd": "layer_norm",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 6,
19
+ "pad_token_id": 0,
20
+ "pooler_dropout": 0,
21
+ "pooler_hidden_act": "gelu",
22
+ "pooler_hidden_size": 768,
23
+ "pos_att_type": [
24
+ "p2c",
25
+ "c2p"
26
+ ],
27
+ "position_biased_input": false,
28
+ "position_buckets": 256,
29
+ "relative_attention": true,
30
+ "share_att_key": true,
31
+ "torch_dtype": "float32",
32
+ "transformers_version": "4.42.3",
33
+ "type_vocab_size": 0,
34
+ "vocab_size": 128100
35
+ }
checkpoint-1277/config_sentence_transformers.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "3.0.1",
4
+ "transformers": "4.42.3",
5
+ "pytorch": "2.1.2"
6
+ },
7
+ "prompts": {},
8
+ "default_prompt_name": null,
9
+ "similarity_fn_name": null
10
+ }
checkpoint-1277/modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ }
14
+ ]
checkpoint-1277/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b66768a28b86324483203ba043198f6ac68a378812eb969e67ae8bc5742740f7
3
+ size 1130520122
checkpoint-1277/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:245c446fb6b5f9870ce15c53db1c2f7dc77eba396de8f439a5f04b416f04a4b7
3
+ size 565251810
checkpoint-1277/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e81aa0420b3efec215a5263b93b80ac3bb9668600016c42183f22470430eb2b5
3
+ size 14244
checkpoint-1277/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7cc8f7c085f832821f9b924f3037f401136be640cc4aeec8246709bb9e6a339
3
+ size 1064
checkpoint-1277/sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 512,
3
+ "do_lower_case": false
4
+ }
checkpoint-1277/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "[CLS]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "[SEP]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "[MASK]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "[SEP]",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "[UNK]",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
checkpoint-1277/spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
3
+ size 2464616
checkpoint-1277/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1277/tokenizer_config.json ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "128000": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "[CLS]",
45
+ "clean_up_tokenization_spaces": true,
46
+ "cls_token": "[CLS]",
47
+ "do_lower_case": false,
48
+ "eos_token": "[SEP]",
49
+ "mask_token": "[MASK]",
50
+ "max_length": 512,
51
+ "model_max_length": 1000000000000000019884624838656,
52
+ "pad_to_multiple_of": null,
53
+ "pad_token": "[PAD]",
54
+ "pad_token_type_id": 0,
55
+ "padding_side": "right",
56
+ "sep_token": "[SEP]",
57
+ "sp_model_kwargs": {},
58
+ "split_by_punct": false,
59
+ "stride": 0,
60
+ "tokenizer_class": "DebertaV2Tokenizer",
61
+ "truncation_side": "right",
62
+ "truncation_strategy": "longest_first",
63
+ "unk_token": "[UNK]",
64
+ "vocab_type": "spm"
65
+ }
checkpoint-1277/trainer_state.json ADDED
@@ -0,0 +1,744 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.20012537219871493,
5
+ "eval_steps": 320,
6
+ "global_step": 1277,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0050148879485973985,
13
+ "grad_norm": 14.771158218383789,
14
+ "learning_rate": 9.707724425887265e-07,
15
+ "loss": 0.6329,
16
+ "step": 32
17
+ },
18
+ {
19
+ "epoch": 0.010029775897194797,
20
+ "grad_norm": 11.052021980285645,
21
+ "learning_rate": 1.9728601252609606e-06,
22
+ "loss": 0.9693,
23
+ "step": 64
24
+ },
25
+ {
26
+ "epoch": 0.015044663845792195,
27
+ "grad_norm": 20.26296615600586,
28
+ "learning_rate": 2.9749478079331944e-06,
29
+ "loss": 0.6548,
30
+ "step": 96
31
+ },
32
+ {
33
+ "epoch": 0.020059551794389594,
34
+ "grad_norm": 12.62913703918457,
35
+ "learning_rate": 3.945720250521921e-06,
36
+ "loss": 1.1279,
37
+ "step": 128
38
+ },
39
+ {
40
+ "epoch": 0.025074439742986992,
41
+ "grad_norm": 12.316486358642578,
42
+ "learning_rate": 4.916492693110647e-06,
43
+ "loss": 1.0017,
44
+ "step": 160
45
+ },
46
+ {
47
+ "epoch": 0.03008932769158439,
48
+ "grad_norm": 64.25923919677734,
49
+ "learning_rate": 5.918580375782881e-06,
50
+ "loss": 0.7571,
51
+ "step": 192
52
+ },
53
+ {
54
+ "epoch": 0.03510421564018179,
55
+ "grad_norm": 0.8205029368400574,
56
+ "learning_rate": 6.920668058455115e-06,
57
+ "loss": 0.7304,
58
+ "step": 224
59
+ },
60
+ {
61
+ "epoch": 0.04011910358877919,
62
+ "grad_norm": 6.598870754241943,
63
+ "learning_rate": 7.922755741127349e-06,
64
+ "loss": 0.7636,
65
+ "step": 256
66
+ },
67
+ {
68
+ "epoch": 0.045133991537376586,
69
+ "grad_norm": 8.728073120117188,
70
+ "learning_rate": 8.924843423799583e-06,
71
+ "loss": 0.482,
72
+ "step": 288
73
+ },
74
+ {
75
+ "epoch": 0.050148879485973984,
76
+ "grad_norm": 7.645521640777588,
77
+ "learning_rate": 9.926931106471817e-06,
78
+ "loss": 0.6312,
79
+ "step": 320
80
+ },
81
+ {
82
+ "epoch": 0.050148879485973984,
83
+ "eval_nli-pairs_loss": 1.0158467292785645,
84
+ "eval_nli-pairs_runtime": 3.7267,
85
+ "eval_nli-pairs_samples_per_second": 26.833,
86
+ "eval_nli-pairs_steps_per_second": 1.073,
87
+ "eval_sts-test_pearson_cosine": 0.7848265412179125,
88
+ "eval_sts-test_pearson_dot": 0.5437080705284749,
89
+ "eval_sts-test_pearson_euclidean": 0.7445845076364892,
90
+ "eval_sts-test_pearson_manhattan": 0.7429239204432232,
91
+ "eval_sts-test_pearson_max": 0.7848265412179125,
92
+ "eval_sts-test_spearman_cosine": 0.7989504707258924,
93
+ "eval_sts-test_spearman_dot": 0.5206855421174118,
94
+ "eval_sts-test_spearman_euclidean": 0.733568982260844,
95
+ "eval_sts-test_spearman_manhattan": 0.7349407257944446,
96
+ "eval_sts-test_spearman_max": 0.7989504707258924,
97
+ "step": 320
98
+ },
99
+ {
100
+ "epoch": 0.050148879485973984,
101
+ "eval_vitaminc-pairs_loss": 4.692601680755615,
102
+ "eval_vitaminc-pairs_runtime": 1.1397,
103
+ "eval_vitaminc-pairs_samples_per_second": 74.578,
104
+ "eval_vitaminc-pairs_steps_per_second": 2.632,
105
+ "step": 320
106
+ },
107
+ {
108
+ "epoch": 0.050148879485973984,
109
+ "eval_sts-label_loss": 3.5502490997314453,
110
+ "eval_sts-label_runtime": 0.28,
111
+ "eval_sts-label_samples_per_second": 357.117,
112
+ "eval_sts-label_steps_per_second": 14.285,
113
+ "step": 320
114
+ },
115
+ {
116
+ "epoch": 0.050148879485973984,
117
+ "eval_qnli-contrastive_loss": 0.16079513728618622,
118
+ "eval_qnli-contrastive_runtime": 0.3646,
119
+ "eval_qnli-contrastive_samples_per_second": 274.299,
120
+ "eval_qnli-contrastive_steps_per_second": 10.972,
121
+ "step": 320
122
+ },
123
+ {
124
+ "epoch": 0.050148879485973984,
125
+ "eval_scitail-pairs-qa_loss": 0.07610582560300827,
126
+ "eval_scitail-pairs-qa_runtime": 0.8885,
127
+ "eval_scitail-pairs-qa_samples_per_second": 112.548,
128
+ "eval_scitail-pairs-qa_steps_per_second": 4.502,
129
+ "step": 320
130
+ },
131
+ {
132
+ "epoch": 0.050148879485973984,
133
+ "eval_scitail-pairs-pos_loss": 0.5141278505325317,
134
+ "eval_scitail-pairs-pos_runtime": 1.3498,
135
+ "eval_scitail-pairs-pos_samples_per_second": 74.085,
136
+ "eval_scitail-pairs-pos_steps_per_second": 2.963,
137
+ "step": 320
138
+ },
139
+ {
140
+ "epoch": 0.050148879485973984,
141
+ "eval_xsum-pairs_loss": 0.25581496953964233,
142
+ "eval_xsum-pairs_runtime": 0.9407,
143
+ "eval_xsum-pairs_samples_per_second": 106.304,
144
+ "eval_xsum-pairs_steps_per_second": 4.252,
145
+ "step": 320
146
+ },
147
+ {
148
+ "epoch": 0.050148879485973984,
149
+ "eval_compression-pairs_loss": 0.09814296662807465,
150
+ "eval_compression-pairs_runtime": 0.2758,
151
+ "eval_compression-pairs_samples_per_second": 362.517,
152
+ "eval_compression-pairs_steps_per_second": 14.501,
153
+ "step": 320
154
+ },
155
+ {
156
+ "epoch": 0.050148879485973984,
157
+ "eval_sciq_pairs_loss": 0.25620242953300476,
158
+ "eval_sciq_pairs_runtime": 4.1155,
159
+ "eval_sciq_pairs_samples_per_second": 24.298,
160
+ "eval_sciq_pairs_steps_per_second": 0.972,
161
+ "step": 320
162
+ },
163
+ {
164
+ "epoch": 0.050148879485973984,
165
+ "eval_qasc_pairs_loss": 0.2044612169265747,
166
+ "eval_qasc_pairs_runtime": 1.1029,
167
+ "eval_qasc_pairs_samples_per_second": 90.672,
168
+ "eval_qasc_pairs_steps_per_second": 3.627,
169
+ "step": 320
170
+ },
171
+ {
172
+ "epoch": 0.050148879485973984,
173
+ "eval_openbookqa_pairs_loss": 1.7537646293640137,
174
+ "eval_openbookqa_pairs_runtime": 0.9037,
175
+ "eval_openbookqa_pairs_samples_per_second": 110.653,
176
+ "eval_openbookqa_pairs_steps_per_second": 4.426,
177
+ "step": 320
178
+ },
179
+ {
180
+ "epoch": 0.050148879485973984,
181
+ "eval_msmarco_pairs_loss": 0.5138561725616455,
182
+ "eval_msmarco_pairs_runtime": 2.0511,
183
+ "eval_msmarco_pairs_samples_per_second": 48.754,
184
+ "eval_msmarco_pairs_steps_per_second": 1.95,
185
+ "step": 320
186
+ },
187
+ {
188
+ "epoch": 0.050148879485973984,
189
+ "eval_nq_pairs_loss": 0.23510317504405975,
190
+ "eval_nq_pairs_runtime": 4.5293,
191
+ "eval_nq_pairs_samples_per_second": 22.078,
192
+ "eval_nq_pairs_steps_per_second": 0.883,
193
+ "step": 320
194
+ },
195
+ {
196
+ "epoch": 0.050148879485973984,
197
+ "eval_trivia_pairs_loss": 0.7808571457862854,
198
+ "eval_trivia_pairs_runtime": 6.5065,
199
+ "eval_trivia_pairs_samples_per_second": 15.369,
200
+ "eval_trivia_pairs_steps_per_second": 0.615,
201
+ "step": 320
202
+ },
203
+ {
204
+ "epoch": 0.050148879485973984,
205
+ "eval_quora_pairs_loss": 0.0392119362950325,
206
+ "eval_quora_pairs_runtime": 0.675,
207
+ "eval_quora_pairs_samples_per_second": 148.153,
208
+ "eval_quora_pairs_steps_per_second": 5.926,
209
+ "step": 320
210
+ },
211
+ {
212
+ "epoch": 0.050148879485973984,
213
+ "eval_gooaq_pairs_loss": 0.4712902009487152,
214
+ "eval_gooaq_pairs_runtime": 1.4079,
215
+ "eval_gooaq_pairs_samples_per_second": 71.028,
216
+ "eval_gooaq_pairs_steps_per_second": 2.841,
217
+ "step": 320
218
+ },
219
+ {
220
+ "epoch": 0.050148879485973984,
221
+ "eval_mrpc_pairs_loss": 0.05498996376991272,
222
+ "eval_mrpc_pairs_runtime": 0.2623,
223
+ "eval_mrpc_pairs_samples_per_second": 381.172,
224
+ "eval_mrpc_pairs_steps_per_second": 15.247,
225
+ "step": 320
226
+ },
227
+ {
228
+ "epoch": 0.05516376743457138,
229
+ "grad_norm": 0.34924012422561646,
230
+ "learning_rate": 1.092901878914405e-05,
231
+ "loss": 0.5791,
232
+ "step": 352
233
+ },
234
+ {
235
+ "epoch": 0.06017865538316878,
236
+ "grad_norm": 0.36700841784477234,
237
+ "learning_rate": 1.1931106471816284e-05,
238
+ "loss": 0.6413,
239
+ "step": 384
240
+ },
241
+ {
242
+ "epoch": 0.06519354333176618,
243
+ "grad_norm": 7.559622764587402,
244
+ "learning_rate": 1.2933194154488518e-05,
245
+ "loss": 0.4319,
246
+ "step": 416
247
+ },
248
+ {
249
+ "epoch": 0.07020843128036358,
250
+ "grad_norm": 7.982416152954102,
251
+ "learning_rate": 1.3935281837160753e-05,
252
+ "loss": 0.6672,
253
+ "step": 448
254
+ },
255
+ {
256
+ "epoch": 0.07522331922896097,
257
+ "grad_norm": 0.6726166009902954,
258
+ "learning_rate": 1.4937369519832987e-05,
259
+ "loss": 0.459,
260
+ "step": 480
261
+ },
262
+ {
263
+ "epoch": 0.08023820717755838,
264
+ "grad_norm": 14.846123695373535,
265
+ "learning_rate": 1.593945720250522e-05,
266
+ "loss": 0.7621,
267
+ "step": 512
268
+ },
269
+ {
270
+ "epoch": 0.08525309512615578,
271
+ "grad_norm": 0.7846627831459045,
272
+ "learning_rate": 1.6941544885177454e-05,
273
+ "loss": 0.864,
274
+ "step": 544
275
+ },
276
+ {
277
+ "epoch": 0.09026798307475317,
278
+ "grad_norm": 0.8993583917617798,
279
+ "learning_rate": 1.7943632567849688e-05,
280
+ "loss": 0.5081,
281
+ "step": 576
282
+ },
283
+ {
284
+ "epoch": 0.09528287102335058,
285
+ "grad_norm": 1.4990565776824951,
286
+ "learning_rate": 1.894572025052192e-05,
287
+ "loss": 0.654,
288
+ "step": 608
289
+ },
290
+ {
291
+ "epoch": 0.10029775897194797,
292
+ "grad_norm": 15.647976875305176,
293
+ "learning_rate": 1.9947807933194157e-05,
294
+ "loss": 0.6372,
295
+ "step": 640
296
+ },
297
+ {
298
+ "epoch": 0.10029775897194797,
299
+ "eval_nli-pairs_loss": 1.0652996301651,
300
+ "eval_nli-pairs_runtime": 3.6326,
301
+ "eval_nli-pairs_samples_per_second": 27.528,
302
+ "eval_nli-pairs_steps_per_second": 1.101,
303
+ "eval_sts-test_pearson_cosine": 0.785263018402905,
304
+ "eval_sts-test_pearson_dot": 0.5290450141477089,
305
+ "eval_sts-test_pearson_euclidean": 0.7433756286425983,
306
+ "eval_sts-test_pearson_manhattan": 0.7411097274300102,
307
+ "eval_sts-test_pearson_max": 0.785263018402905,
308
+ "eval_sts-test_spearman_cosine": 0.7996928912411947,
309
+ "eval_sts-test_spearman_dot": 0.5102571497667188,
310
+ "eval_sts-test_spearman_euclidean": 0.7338969723324641,
311
+ "eval_sts-test_spearman_manhattan": 0.7343494860194358,
312
+ "eval_sts-test_spearman_max": 0.7996928912411947,
313
+ "step": 640
314
+ },
315
+ {
316
+ "epoch": 0.10029775897194797,
317
+ "eval_vitaminc-pairs_loss": 4.719416618347168,
318
+ "eval_vitaminc-pairs_runtime": 1.1268,
319
+ "eval_vitaminc-pairs_samples_per_second": 75.437,
320
+ "eval_vitaminc-pairs_steps_per_second": 2.662,
321
+ "step": 640
322
+ },
323
+ {
324
+ "epoch": 0.10029775897194797,
325
+ "eval_sts-label_loss": 3.612347364425659,
326
+ "eval_sts-label_runtime": 0.2683,
327
+ "eval_sts-label_samples_per_second": 372.651,
328
+ "eval_sts-label_steps_per_second": 14.906,
329
+ "step": 640
330
+ },
331
+ {
332
+ "epoch": 0.10029775897194797,
333
+ "eval_qnli-contrastive_loss": 0.15202775597572327,
334
+ "eval_qnli-contrastive_runtime": 0.3528,
335
+ "eval_qnli-contrastive_samples_per_second": 283.457,
336
+ "eval_qnli-contrastive_steps_per_second": 11.338,
337
+ "step": 640
338
+ },
339
+ {
340
+ "epoch": 0.10029775897194797,
341
+ "eval_scitail-pairs-qa_loss": 0.07544919103384018,
342
+ "eval_scitail-pairs-qa_runtime": 0.8732,
343
+ "eval_scitail-pairs-qa_samples_per_second": 114.517,
344
+ "eval_scitail-pairs-qa_steps_per_second": 4.581,
345
+ "step": 640
346
+ },
347
+ {
348
+ "epoch": 0.10029775897194797,
349
+ "eval_scitail-pairs-pos_loss": 0.5404170751571655,
350
+ "eval_scitail-pairs-pos_runtime": 1.3146,
351
+ "eval_scitail-pairs-pos_samples_per_second": 76.067,
352
+ "eval_scitail-pairs-pos_steps_per_second": 3.043,
353
+ "step": 640
354
+ },
355
+ {
356
+ "epoch": 0.10029775897194797,
357
+ "eval_xsum-pairs_loss": 0.25958582758903503,
358
+ "eval_xsum-pairs_runtime": 0.9287,
359
+ "eval_xsum-pairs_samples_per_second": 107.679,
360
+ "eval_xsum-pairs_steps_per_second": 4.307,
361
+ "step": 640
362
+ },
363
+ {
364
+ "epoch": 0.10029775897194797,
365
+ "eval_compression-pairs_loss": 0.10066353529691696,
366
+ "eval_compression-pairs_runtime": 0.2732,
367
+ "eval_compression-pairs_samples_per_second": 366.076,
368
+ "eval_compression-pairs_steps_per_second": 14.643,
369
+ "step": 640
370
+ },
371
+ {
372
+ "epoch": 0.10029775897194797,
373
+ "eval_sciq_pairs_loss": 0.2645374834537506,
374
+ "eval_sciq_pairs_runtime": 4.0725,
375
+ "eval_sciq_pairs_samples_per_second": 24.555,
376
+ "eval_sciq_pairs_steps_per_second": 0.982,
377
+ "step": 640
378
+ },
379
+ {
380
+ "epoch": 0.10029775897194797,
381
+ "eval_qasc_pairs_loss": 0.21021947264671326,
382
+ "eval_qasc_pairs_runtime": 1.0743,
383
+ "eval_qasc_pairs_samples_per_second": 93.084,
384
+ "eval_qasc_pairs_steps_per_second": 3.723,
385
+ "step": 640
386
+ },
387
+ {
388
+ "epoch": 0.10029775897194797,
389
+ "eval_openbookqa_pairs_loss": 1.7905032634735107,
390
+ "eval_openbookqa_pairs_runtime": 0.8886,
391
+ "eval_openbookqa_pairs_samples_per_second": 112.532,
392
+ "eval_openbookqa_pairs_steps_per_second": 4.501,
393
+ "step": 640
394
+ },
395
+ {
396
+ "epoch": 0.10029775897194797,
397
+ "eval_msmarco_pairs_loss": 0.5102832913398743,
398
+ "eval_msmarco_pairs_runtime": 2.0529,
399
+ "eval_msmarco_pairs_samples_per_second": 48.712,
400
+ "eval_msmarco_pairs_steps_per_second": 1.948,
401
+ "step": 640
402
+ },
403
+ {
404
+ "epoch": 0.10029775897194797,
405
+ "eval_nq_pairs_loss": 0.24466972053050995,
406
+ "eval_nq_pairs_runtime": 4.4973,
407
+ "eval_nq_pairs_samples_per_second": 22.235,
408
+ "eval_nq_pairs_steps_per_second": 0.889,
409
+ "step": 640
410
+ },
411
+ {
412
+ "epoch": 0.10029775897194797,
413
+ "eval_trivia_pairs_loss": 0.8748095631599426,
414
+ "eval_trivia_pairs_runtime": 6.4825,
415
+ "eval_trivia_pairs_samples_per_second": 15.426,
416
+ "eval_trivia_pairs_steps_per_second": 0.617,
417
+ "step": 640
418
+ },
419
+ {
420
+ "epoch": 0.10029775897194797,
421
+ "eval_quora_pairs_loss": 0.07820220291614532,
422
+ "eval_quora_pairs_runtime": 0.6944,
423
+ "eval_quora_pairs_samples_per_second": 144.008,
424
+ "eval_quora_pairs_steps_per_second": 5.76,
425
+ "step": 640
426
+ },
427
+ {
428
+ "epoch": 0.10029775897194797,
429
+ "eval_gooaq_pairs_loss": 0.5236212611198425,
430
+ "eval_gooaq_pairs_runtime": 1.3899,
431
+ "eval_gooaq_pairs_samples_per_second": 71.949,
432
+ "eval_gooaq_pairs_steps_per_second": 2.878,
433
+ "step": 640
434
+ },
435
+ {
436
+ "epoch": 0.10029775897194797,
437
+ "eval_mrpc_pairs_loss": 0.05494727939367294,
438
+ "eval_mrpc_pairs_runtime": 0.2598,
439
+ "eval_mrpc_pairs_samples_per_second": 384.941,
440
+ "eval_mrpc_pairs_steps_per_second": 15.398,
441
+ "step": 640
442
+ },
443
+ {
444
+ "epoch": 0.10531264692054537,
445
+ "grad_norm": 11.01974105834961,
446
+ "learning_rate": 2.0949895615866387e-05,
447
+ "loss": 0.9292,
448
+ "step": 672
449
+ },
450
+ {
451
+ "epoch": 0.11032753486914276,
452
+ "grad_norm": 0.5542309284210205,
453
+ "learning_rate": 2.1951983298538625e-05,
454
+ "loss": 1.3108,
455
+ "step": 704
456
+ },
457
+ {
458
+ "epoch": 0.11534242281774017,
459
+ "grad_norm": 15.458569526672363,
460
+ "learning_rate": 2.2954070981210856e-05,
461
+ "loss": 0.9674,
462
+ "step": 736
463
+ },
464
+ {
465
+ "epoch": 0.12035731076633756,
466
+ "grad_norm": 2.7814478874206543,
467
+ "learning_rate": 2.395615866388309e-05,
468
+ "loss": 0.9226,
469
+ "step": 768
470
+ },
471
+ {
472
+ "epoch": 0.12537219871493496,
473
+ "grad_norm": 11.393244743347168,
474
+ "learning_rate": 2.4958246346555324e-05,
475
+ "loss": 0.789,
476
+ "step": 800
477
+ },
478
+ {
479
+ "epoch": 0.13038708666353235,
480
+ "grad_norm": 9.288290977478027,
481
+ "learning_rate": 2.596033402922756e-05,
482
+ "loss": 0.5186,
483
+ "step": 832
484
+ },
485
+ {
486
+ "epoch": 0.13540197461212977,
487
+ "grad_norm": 47.65571212768555,
488
+ "learning_rate": 2.6962421711899793e-05,
489
+ "loss": 0.6726,
490
+ "step": 864
491
+ },
492
+ {
493
+ "epoch": 0.14041686256072716,
494
+ "grad_norm": 12.908064842224121,
495
+ "learning_rate": 2.7964509394572024e-05,
496
+ "loss": 0.5381,
497
+ "step": 896
498
+ },
499
+ {
500
+ "epoch": 0.14543175050932455,
501
+ "grad_norm": 14.951742172241211,
502
+ "learning_rate": 2.896659707724426e-05,
503
+ "loss": 0.581,
504
+ "step": 928
505
+ },
506
+ {
507
+ "epoch": 0.15044663845792194,
508
+ "grad_norm": 20.12006187438965,
509
+ "learning_rate": 2.9968684759916492e-05,
510
+ "loss": 0.9038,
511
+ "step": 960
512
+ },
513
+ {
514
+ "epoch": 0.15044663845792194,
515
+ "eval_nli-pairs_loss": 1.2173175811767578,
516
+ "eval_nli-pairs_runtime": 3.7098,
517
+ "eval_nli-pairs_samples_per_second": 26.955,
518
+ "eval_nli-pairs_steps_per_second": 1.078,
519
+ "eval_sts-test_pearson_cosine": 0.7840992835675669,
520
+ "eval_sts-test_pearson_dot": 0.5220462136106129,
521
+ "eval_sts-test_pearson_euclidean": 0.7457350047351855,
522
+ "eval_sts-test_pearson_manhattan": 0.7425970830541657,
523
+ "eval_sts-test_pearson_max": 0.7840992835675669,
524
+ "eval_sts-test_spearman_cosine": 0.8006376809572144,
525
+ "eval_sts-test_spearman_dot": 0.5020544543992158,
526
+ "eval_sts-test_spearman_euclidean": 0.7369257710408655,
527
+ "eval_sts-test_spearman_manhattan": 0.7362649758012406,
528
+ "eval_sts-test_spearman_max": 0.8006376809572144,
529
+ "step": 960
530
+ },
531
+ {
532
+ "epoch": 0.15044663845792194,
533
+ "eval_vitaminc-pairs_loss": 4.774902820587158,
534
+ "eval_vitaminc-pairs_runtime": 1.1212,
535
+ "eval_vitaminc-pairs_samples_per_second": 75.809,
536
+ "eval_vitaminc-pairs_steps_per_second": 2.676,
537
+ "step": 960
538
+ },
539
+ {
540
+ "epoch": 0.15044663845792194,
541
+ "eval_sts-label_loss": 3.198556900024414,
542
+ "eval_sts-label_runtime": 0.2678,
543
+ "eval_sts-label_samples_per_second": 373.382,
544
+ "eval_sts-label_steps_per_second": 14.935,
545
+ "step": 960
546
+ },
547
+ {
548
+ "epoch": 0.15044663845792194,
549
+ "eval_qnli-contrastive_loss": 0.1943340301513672,
550
+ "eval_qnli-contrastive_runtime": 0.3511,
551
+ "eval_qnli-contrastive_samples_per_second": 284.789,
552
+ "eval_qnli-contrastive_steps_per_second": 11.392,
553
+ "step": 960
554
+ },
555
+ {
556
+ "epoch": 0.15044663845792194,
557
+ "eval_scitail-pairs-qa_loss": 0.08060617744922638,
558
+ "eval_scitail-pairs-qa_runtime": 0.8778,
559
+ "eval_scitail-pairs-qa_samples_per_second": 113.92,
560
+ "eval_scitail-pairs-qa_steps_per_second": 4.557,
561
+ "step": 960
562
+ },
563
+ {
564
+ "epoch": 0.15044663845792194,
565
+ "eval_scitail-pairs-pos_loss": 0.4759831428527832,
566
+ "eval_scitail-pairs-pos_runtime": 1.3609,
567
+ "eval_scitail-pairs-pos_samples_per_second": 73.48,
568
+ "eval_scitail-pairs-pos_steps_per_second": 2.939,
569
+ "step": 960
570
+ },
571
+ {
572
+ "epoch": 0.15044663845792194,
573
+ "eval_xsum-pairs_loss": 0.27583304047584534,
574
+ "eval_xsum-pairs_runtime": 0.9343,
575
+ "eval_xsum-pairs_samples_per_second": 107.035,
576
+ "eval_xsum-pairs_steps_per_second": 4.281,
577
+ "step": 960
578
+ },
579
+ {
580
+ "epoch": 0.15044663845792194,
581
+ "eval_compression-pairs_loss": 0.10094660520553589,
582
+ "eval_compression-pairs_runtime": 0.2739,
583
+ "eval_compression-pairs_samples_per_second": 365.047,
584
+ "eval_compression-pairs_steps_per_second": 14.602,
585
+ "step": 960
586
+ },
587
+ {
588
+ "epoch": 0.15044663845792194,
589
+ "eval_sciq_pairs_loss": 0.2688131630420685,
590
+ "eval_sciq_pairs_runtime": 4.0582,
591
+ "eval_sciq_pairs_samples_per_second": 24.641,
592
+ "eval_sciq_pairs_steps_per_second": 0.986,
593
+ "step": 960
594
+ },
595
+ {
596
+ "epoch": 0.15044663845792194,
597
+ "eval_qasc_pairs_loss": 0.23267821967601776,
598
+ "eval_qasc_pairs_runtime": 1.0554,
599
+ "eval_qasc_pairs_samples_per_second": 94.75,
600
+ "eval_qasc_pairs_steps_per_second": 3.79,
601
+ "step": 960
602
+ },
603
+ {
604
+ "epoch": 0.15044663845792194,
605
+ "eval_openbookqa_pairs_loss": 1.8053069114685059,
606
+ "eval_openbookqa_pairs_runtime": 0.8871,
607
+ "eval_openbookqa_pairs_samples_per_second": 112.727,
608
+ "eval_openbookqa_pairs_steps_per_second": 4.509,
609
+ "step": 960
610
+ },
611
+ {
612
+ "epoch": 0.15044663845792194,
613
+ "eval_msmarco_pairs_loss": 0.5809260606765747,
614
+ "eval_msmarco_pairs_runtime": 2.0498,
615
+ "eval_msmarco_pairs_samples_per_second": 48.786,
616
+ "eval_msmarco_pairs_steps_per_second": 1.951,
617
+ "step": 960
618
+ },
619
+ {
620
+ "epoch": 0.15044663845792194,
621
+ "eval_nq_pairs_loss": 0.2808491885662079,
622
+ "eval_nq_pairs_runtime": 4.4982,
623
+ "eval_nq_pairs_samples_per_second": 22.231,
624
+ "eval_nq_pairs_steps_per_second": 0.889,
625
+ "step": 960
626
+ },
627
+ {
628
+ "epoch": 0.15044663845792194,
629
+ "eval_trivia_pairs_loss": 0.9379808902740479,
630
+ "eval_trivia_pairs_runtime": 6.4578,
631
+ "eval_trivia_pairs_samples_per_second": 15.485,
632
+ "eval_trivia_pairs_steps_per_second": 0.619,
633
+ "step": 960
634
+ },
635
+ {
636
+ "epoch": 0.15044663845792194,
637
+ "eval_quora_pairs_loss": 0.0913279801607132,
638
+ "eval_quora_pairs_runtime": 0.6721,
639
+ "eval_quora_pairs_samples_per_second": 148.79,
640
+ "eval_quora_pairs_steps_per_second": 5.952,
641
+ "step": 960
642
+ },
643
+ {
644
+ "epoch": 0.15044663845792194,
645
+ "eval_gooaq_pairs_loss": 0.5807955265045166,
646
+ "eval_gooaq_pairs_runtime": 1.3915,
647
+ "eval_gooaq_pairs_samples_per_second": 71.865,
648
+ "eval_gooaq_pairs_steps_per_second": 2.875,
649
+ "step": 960
650
+ },
651
+ {
652
+ "epoch": 0.15044663845792194,
653
+ "eval_mrpc_pairs_loss": 0.05799216777086258,
654
+ "eval_mrpc_pairs_runtime": 0.2571,
655
+ "eval_mrpc_pairs_samples_per_second": 388.998,
656
+ "eval_mrpc_pairs_steps_per_second": 15.56,
657
+ "step": 960
658
+ },
659
+ {
660
+ "epoch": 0.15546152640651936,
661
+ "grad_norm": 9.773286819458008,
662
+ "learning_rate": 2.9997957904107625e-05,
663
+ "loss": 0.7964,
664
+ "step": 992
665
+ },
666
+ {
667
+ "epoch": 0.16047641435511675,
668
+ "grad_norm": 19.411075592041016,
669
+ "learning_rate": 2.9991566594209126e-05,
670
+ "loss": 0.8213,
671
+ "step": 1024
672
+ },
673
+ {
674
+ "epoch": 0.16549130230371414,
675
+ "grad_norm": 3.5282175540924072,
676
+ "learning_rate": 2.9980825799589488e-05,
677
+ "loss": 0.5396,
678
+ "step": 1056
679
+ },
680
+ {
681
+ "epoch": 0.17050619025231156,
682
+ "grad_norm": 62.66339874267578,
683
+ "learning_rate": 2.996573863646219e-05,
684
+ "loss": 0.9297,
685
+ "step": 1088
686
+ },
687
+ {
688
+ "epoch": 0.17552107820090895,
689
+ "grad_norm": 8.785274505615234,
690
+ "learning_rate": 2.994630948204727e-05,
691
+ "loss": 1.169,
692
+ "step": 1120
693
+ },
694
+ {
695
+ "epoch": 0.18053596614950634,
696
+ "grad_norm": 24.10859489440918,
697
+ "learning_rate": 2.992254397330132e-05,
698
+ "loss": 0.7486,
699
+ "step": 1152
700
+ },
701
+ {
702
+ "epoch": 0.18555085409810373,
703
+ "grad_norm": 25.545284271240234,
704
+ "learning_rate": 2.9894449005282077e-05,
705
+ "loss": 0.6821,
706
+ "step": 1184
707
+ },
708
+ {
709
+ "epoch": 0.19056574204670115,
710
+ "grad_norm": 0.8675521016120911,
711
+ "learning_rate": 2.9862032729147954e-05,
712
+ "loss": 0.6125,
713
+ "step": 1216
714
+ },
715
+ {
716
+ "epoch": 0.19558062999529854,
717
+ "grad_norm": 16.122114181518555,
718
+ "learning_rate": 2.9825304549793153e-05,
719
+ "loss": 0.8061,
720
+ "step": 1248
721
+ }
722
+ ],
723
+ "logging_steps": 32,
724
+ "max_steps": 12762,
725
+ "num_input_tokens_seen": 0,
726
+ "num_train_epochs": 2,
727
+ "save_steps": 1277,
728
+ "stateful_callbacks": {
729
+ "TrainerControl": {
730
+ "args": {
731
+ "should_epoch_stop": false,
732
+ "should_evaluate": false,
733
+ "should_log": false,
734
+ "should_save": true,
735
+ "should_training_stop": false
736
+ },
737
+ "attributes": {}
738
+ }
739
+ },
740
+ "total_flos": 0.0,
741
+ "train_batch_size": 32,
742
+ "trial_name": null,
743
+ "trial_params": null
744
+ }
checkpoint-1277/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:383c5bf6513da1dfbcd1294f0c8e85ce43118bc61e2de49d9b5d1e28eb653003
3
+ size 5624