bobox commited on
Commit
0e7ce92
·
verified ·
1 Parent(s): af3a5fa

Training in progress, step 32, checkpoint

Browse files
checkpoint-32/1_WeightedPooling/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 768,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false,
7
+ "pooling_mode_weightedmean_tokens": false,
8
+ "pooling_mode_lasttoken": false,
9
+ "include_prompt": true
10
+ }
checkpoint-32/README.md ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-32/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
checkpoint-32/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "bobox/DeBERTa-small-ST-v1-test-step3",
3
+ "architectures": [
4
+ "DebertaV2Model"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 768,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 3072,
12
+ "layer_norm_eps": 1e-07,
13
+ "max_position_embeddings": 512,
14
+ "max_relative_positions": -1,
15
+ "model_type": "deberta-v2",
16
+ "norm_rel_ebd": "layer_norm",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 6,
19
+ "pad_token_id": 0,
20
+ "pooler_dropout": 0,
21
+ "pooler_hidden_act": "gelu",
22
+ "pooler_hidden_size": 768,
23
+ "pos_att_type": [
24
+ "p2c",
25
+ "c2p"
26
+ ],
27
+ "position_biased_input": false,
28
+ "position_buckets": 256,
29
+ "relative_attention": true,
30
+ "share_att_key": true,
31
+ "torch_dtype": "float32",
32
+ "transformers_version": "4.44.0",
33
+ "type_vocab_size": 0,
34
+ "vocab_size": 128100
35
+ }
checkpoint-32/config_sentence_transformers.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "3.0.1",
4
+ "transformers": "4.44.0",
5
+ "pytorch": "2.4.0"
6
+ },
7
+ "prompts": {},
8
+ "default_prompt_name": null,
9
+ "similarity_fn_name": null
10
+ }
checkpoint-32/modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_WeightedPooling",
12
+ "type": "__main__"
13
+ }
14
+ ]
checkpoint-32/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5314df8adf2482eb5616d726c5ef81403b4312c5bc648f134c007ce3e862fab6
3
+ size 9475334
checkpoint-32/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1886a286545822d70cd17732b6025a52902e7429a8ab7eec28bf9c1a4f143b6c
3
+ size 565251810
checkpoint-32/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d87b490a91f5d78577727f0e410e58bda72ba61bb39ea319e4c9101937278068
3
+ size 14244
checkpoint-32/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1157d899077c408d4326b2533ea1455e16e7554d636785a95beffeb5f2cd152d
3
+ size 1064
checkpoint-32/sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 512,
3
+ "do_lower_case": false
4
+ }
checkpoint-32/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "[CLS]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "[SEP]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "[MASK]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "[SEP]",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "[UNK]",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
checkpoint-32/spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
3
+ size 2464616
checkpoint-32/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-32/tokenizer_config.json ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "128000": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "[CLS]",
45
+ "clean_up_tokenization_spaces": true,
46
+ "cls_token": "[CLS]",
47
+ "do_lower_case": false,
48
+ "eos_token": "[SEP]",
49
+ "mask_token": "[MASK]",
50
+ "max_length": 512,
51
+ "model_max_length": 512,
52
+ "pad_to_multiple_of": null,
53
+ "pad_token": "[PAD]",
54
+ "pad_token_type_id": 0,
55
+ "padding_side": "right",
56
+ "sep_token": "[SEP]",
57
+ "sp_model_kwargs": {},
58
+ "split_by_punct": false,
59
+ "stride": 0,
60
+ "tokenizer_class": "DebertaV2Tokenizer",
61
+ "truncation_side": "right",
62
+ "truncation_strategy": "longest_first",
63
+ "unk_token": "[UNK]",
64
+ "vocab_type": "spm"
65
+ }
checkpoint-32/trainer_state.json ADDED
@@ -0,0 +1,1029 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.3106796116504854,
5
+ "eval_steps": 8,
6
+ "global_step": 32,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.009708737864077669,
13
+ "grad_norm": 0.38736554980278015,
14
+ "learning_rate": 4.411764705882353e-07,
15
+ "loss": 0.1607,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.019417475728155338,
20
+ "grad_norm": 0.38099536299705505,
21
+ "learning_rate": 8.823529411764706e-07,
22
+ "loss": 0.1664,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.02912621359223301,
27
+ "grad_norm": 0.2045232504606247,
28
+ "learning_rate": 1.323529411764706e-06,
29
+ "loss": 0.2686,
30
+ "step": 3
31
+ },
32
+ {
33
+ "epoch": 0.038834951456310676,
34
+ "grad_norm": 0.370655357837677,
35
+ "learning_rate": 1.7647058823529412e-06,
36
+ "loss": 0.1656,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.04854368932038835,
41
+ "grad_norm": 0.35929468274116516,
42
+ "learning_rate": 2.2058823529411767e-06,
43
+ "loss": 0.1269,
44
+ "step": 5
45
+ },
46
+ {
47
+ "epoch": 0.05825242718446602,
48
+ "grad_norm": 0.2196977585554123,
49
+ "learning_rate": 2.647058823529412e-06,
50
+ "loss": 0.1066,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 0.06796116504854369,
55
+ "grad_norm": 0.6120501756668091,
56
+ "learning_rate": 3.0882352941176476e-06,
57
+ "loss": 0.1936,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.07766990291262135,
62
+ "grad_norm": 0.7004581689834595,
63
+ "learning_rate": 3.5294117647058825e-06,
64
+ "loss": 0.087,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.07766990291262135,
69
+ "eval_Qnli-dev_cosine_accuracy": 0.6953125,
70
+ "eval_Qnli-dev_cosine_accuracy_threshold": 0.6691694855690002,
71
+ "eval_Qnli-dev_cosine_ap": 0.7181099755520133,
72
+ "eval_Qnli-dev_cosine_f1": 0.6764705882352942,
73
+ "eval_Qnli-dev_cosine_f1_threshold": 0.49676012992858887,
74
+ "eval_Qnli-dev_cosine_precision": 0.550531914893617,
75
+ "eval_Qnli-dev_cosine_recall": 0.8771186440677966,
76
+ "eval_Qnli-dev_dot_accuracy": 0.669921875,
77
+ "eval_Qnli-dev_dot_accuracy_threshold": 308.413330078125,
78
+ "eval_Qnli-dev_dot_ap": 0.678634155184842,
79
+ "eval_Qnli-dev_dot_f1": 0.6698872785829306,
80
+ "eval_Qnli-dev_dot_f1_threshold": 208.48468017578125,
81
+ "eval_Qnli-dev_dot_precision": 0.5402597402597402,
82
+ "eval_Qnli-dev_dot_recall": 0.8813559322033898,
83
+ "eval_Qnli-dev_euclidean_accuracy": 0.708984375,
84
+ "eval_Qnli-dev_euclidean_accuracy_threshold": 17.320636749267578,
85
+ "eval_Qnli-dev_euclidean_ap": 0.7302104938839877,
86
+ "eval_Qnli-dev_euclidean_f1": 0.6845637583892616,
87
+ "eval_Qnli-dev_euclidean_f1_threshold": 20.530838012695312,
88
+ "eval_Qnli-dev_euclidean_precision": 0.5666666666666667,
89
+ "eval_Qnli-dev_euclidean_recall": 0.864406779661017,
90
+ "eval_Qnli-dev_manhattan_accuracy": 0.7109375,
91
+ "eval_Qnli-dev_manhattan_accuracy_threshold": 369.90869140625,
92
+ "eval_Qnli-dev_manhattan_ap": 0.732538057005555,
93
+ "eval_Qnli-dev_manhattan_f1": 0.6836518046709129,
94
+ "eval_Qnli-dev_manhattan_f1_threshold": 372.4708251953125,
95
+ "eval_Qnli-dev_manhattan_precision": 0.6851063829787234,
96
+ "eval_Qnli-dev_manhattan_recall": 0.6822033898305084,
97
+ "eval_Qnli-dev_max_accuracy": 0.7109375,
98
+ "eval_Qnli-dev_max_accuracy_threshold": 369.90869140625,
99
+ "eval_Qnli-dev_max_ap": 0.732538057005555,
100
+ "eval_Qnli-dev_max_f1": 0.6845637583892616,
101
+ "eval_Qnli-dev_max_f1_threshold": 372.4708251953125,
102
+ "eval_Qnli-dev_max_precision": 0.6851063829787234,
103
+ "eval_Qnli-dev_max_recall": 0.8813559322033898,
104
+ "eval_allNLI-dev_cosine_accuracy": 0.740234375,
105
+ "eval_allNLI-dev_cosine_accuracy_threshold": 0.7844594717025757,
106
+ "eval_allNLI-dev_cosine_ap": 0.6356383940181094,
107
+ "eval_allNLI-dev_cosine_f1": 0.6369710467706012,
108
+ "eval_allNLI-dev_cosine_f1_threshold": 0.5889398455619812,
109
+ "eval_allNLI-dev_cosine_precision": 0.5181159420289855,
110
+ "eval_allNLI-dev_cosine_recall": 0.8265895953757225,
111
+ "eval_allNLI-dev_dot_accuracy": 0.72265625,
112
+ "eval_allNLI-dev_dot_accuracy_threshold": 294.92120361328125,
113
+ "eval_allNLI-dev_dot_ap": 0.5914839688879726,
114
+ "eval_allNLI-dev_dot_f1": 0.6099290780141845,
115
+ "eval_allNLI-dev_dot_f1_threshold": 246.92991638183594,
116
+ "eval_allNLI-dev_dot_precision": 0.516,
117
+ "eval_allNLI-dev_dot_recall": 0.7456647398843931,
118
+ "eval_allNLI-dev_euclidean_accuracy": 0.744140625,
119
+ "eval_allNLI-dev_euclidean_accuracy_threshold": 13.74991226196289,
120
+ "eval_allNLI-dev_euclidean_ap": 0.6412989964435845,
121
+ "eval_allNLI-dev_euclidean_f1": 0.6464208242950108,
122
+ "eval_allNLI-dev_euclidean_f1_threshold": 18.407054901123047,
123
+ "eval_allNLI-dev_euclidean_precision": 0.5173611111111112,
124
+ "eval_allNLI-dev_euclidean_recall": 0.861271676300578,
125
+ "eval_allNLI-dev_manhattan_accuracy": 0.744140625,
126
+ "eval_allNLI-dev_manhattan_accuracy_threshold": 313.8516540527344,
127
+ "eval_allNLI-dev_manhattan_ap": 0.6416934437743762,
128
+ "eval_allNLI-dev_manhattan_f1": 0.655,
129
+ "eval_allNLI-dev_manhattan_f1_threshold": 361.0255126953125,
130
+ "eval_allNLI-dev_manhattan_precision": 0.5770925110132159,
131
+ "eval_allNLI-dev_manhattan_recall": 0.7572254335260116,
132
+ "eval_allNLI-dev_max_accuracy": 0.744140625,
133
+ "eval_allNLI-dev_max_accuracy_threshold": 313.8516540527344,
134
+ "eval_allNLI-dev_max_ap": 0.6416934437743762,
135
+ "eval_allNLI-dev_max_f1": 0.655,
136
+ "eval_allNLI-dev_max_f1_threshold": 361.0255126953125,
137
+ "eval_allNLI-dev_max_precision": 0.5770925110132159,
138
+ "eval_allNLI-dev_max_recall": 0.861271676300578,
139
+ "eval_sequential_score": 0.732538057005555,
140
+ "eval_sts-test_pearson_cosine": 0.8858982884210256,
141
+ "eval_sts-test_pearson_dot": 0.8750202743648935,
142
+ "eval_sts-test_pearson_euclidean": 0.9073481145616336,
143
+ "eval_sts-test_pearson_manhattan": 0.908032029485236,
144
+ "eval_sts-test_pearson_max": 0.908032029485236,
145
+ "eval_sts-test_spearman_cosine": 0.9075027297778789,
146
+ "eval_sts-test_spearman_dot": 0.8759972599919353,
147
+ "eval_sts-test_spearman_euclidean": 0.9040041656648221,
148
+ "eval_sts-test_spearman_manhattan": 0.9045863348242628,
149
+ "eval_sts-test_spearman_max": 0.9075027297778789,
150
+ "eval_vitaminc-pairs_loss": 2.801919460296631,
151
+ "eval_vitaminc-pairs_runtime": 3.8986,
152
+ "eval_vitaminc-pairs_samples_per_second": 32.832,
153
+ "eval_vitaminc-pairs_steps_per_second": 0.257,
154
+ "step": 8
155
+ },
156
+ {
157
+ "epoch": 0.07766990291262135,
158
+ "eval_negation-triplets_loss": 1.355710744857788,
159
+ "eval_negation-triplets_runtime": 0.639,
160
+ "eval_negation-triplets_samples_per_second": 200.304,
161
+ "eval_negation-triplets_steps_per_second": 1.565,
162
+ "step": 8
163
+ },
164
+ {
165
+ "epoch": 0.07766990291262135,
166
+ "eval_scitail-pairs-pos_loss": 0.06071745231747627,
167
+ "eval_scitail-pairs-pos_runtime": 0.9322,
168
+ "eval_scitail-pairs-pos_samples_per_second": 137.311,
169
+ "eval_scitail-pairs-pos_steps_per_second": 1.073,
170
+ "step": 8
171
+ },
172
+ {
173
+ "epoch": 0.07766990291262135,
174
+ "eval_scitail-pairs-qa_loss": 1.0544098927312007e-07,
175
+ "eval_scitail-pairs-qa_runtime": 0.5629,
176
+ "eval_scitail-pairs-qa_samples_per_second": 227.378,
177
+ "eval_scitail-pairs-qa_steps_per_second": 1.776,
178
+ "step": 8
179
+ },
180
+ {
181
+ "epoch": 0.07766990291262135,
182
+ "eval_xsum-pairs_loss": 0.00016850717656780034,
183
+ "eval_xsum-pairs_runtime": 2.9256,
184
+ "eval_xsum-pairs_samples_per_second": 43.751,
185
+ "eval_xsum-pairs_steps_per_second": 0.342,
186
+ "step": 8
187
+ },
188
+ {
189
+ "epoch": 0.07766990291262135,
190
+ "eval_sciq_pairs_loss": 0.0582379512488842,
191
+ "eval_sciq_pairs_runtime": 2.5278,
192
+ "eval_sciq_pairs_samples_per_second": 34.418,
193
+ "eval_sciq_pairs_steps_per_second": 0.396,
194
+ "step": 8
195
+ },
196
+ {
197
+ "epoch": 0.07766990291262135,
198
+ "eval_qasc_pairs_loss": 0.07362987846136093,
199
+ "eval_qasc_pairs_runtime": 0.3723,
200
+ "eval_qasc_pairs_samples_per_second": 233.691,
201
+ "eval_qasc_pairs_steps_per_second": 2.686,
202
+ "step": 8
203
+ },
204
+ {
205
+ "epoch": 0.07766990291262135,
206
+ "eval_openbookqa_pairs_loss": 1.3787189722061157,
207
+ "eval_openbookqa_pairs_runtime": 0.5744,
208
+ "eval_openbookqa_pairs_samples_per_second": 222.849,
209
+ "eval_openbookqa_pairs_steps_per_second": 1.741,
210
+ "step": 8
211
+ },
212
+ {
213
+ "epoch": 0.07766990291262135,
214
+ "eval_msmarco_pairs_loss": 0.1278172731399536,
215
+ "eval_msmarco_pairs_runtime": 1.0983,
216
+ "eval_msmarco_pairs_samples_per_second": 79.215,
217
+ "eval_msmarco_pairs_steps_per_second": 0.911,
218
+ "step": 8
219
+ },
220
+ {
221
+ "epoch": 0.07766990291262135,
222
+ "eval_nq_pairs_loss": 0.0051126074977219105,
223
+ "eval_nq_pairs_runtime": 2.2469,
224
+ "eval_nq_pairs_samples_per_second": 38.719,
225
+ "eval_nq_pairs_steps_per_second": 0.445,
226
+ "step": 8
227
+ },
228
+ {
229
+ "epoch": 0.07766990291262135,
230
+ "eval_trivia_pairs_loss": 0.26755890250205994,
231
+ "eval_trivia_pairs_runtime": 2.4342,
232
+ "eval_trivia_pairs_samples_per_second": 41.081,
233
+ "eval_trivia_pairs_steps_per_second": 0.411,
234
+ "step": 8
235
+ },
236
+ {
237
+ "epoch": 0.07766990291262135,
238
+ "eval_gooaq_pairs_loss": 0.05755230411887169,
239
+ "eval_gooaq_pairs_runtime": 0.5805,
240
+ "eval_gooaq_pairs_samples_per_second": 149.876,
241
+ "eval_gooaq_pairs_steps_per_second": 1.723,
242
+ "step": 8
243
+ },
244
+ {
245
+ "epoch": 0.07766990291262135,
246
+ "eval_paws-pos_loss": 0.04592936113476753,
247
+ "eval_paws-pos_runtime": 0.6903,
248
+ "eval_paws-pos_samples_per_second": 185.415,
249
+ "eval_paws-pos_steps_per_second": 1.449,
250
+ "step": 8
251
+ },
252
+ {
253
+ "epoch": 0.07766990291262135,
254
+ "eval_global_dataset_loss": 0.4901905357837677,
255
+ "eval_global_dataset_runtime": 25.0307,
256
+ "eval_global_dataset_samples_per_second": 26.487,
257
+ "eval_global_dataset_steps_per_second": 0.24,
258
+ "step": 8
259
+ },
260
+ {
261
+ "epoch": 0.08737864077669903,
262
+ "grad_norm": 1.2741845846176147,
263
+ "learning_rate": 3.970588235294118e-06,
264
+ "loss": 0.1952,
265
+ "step": 9
266
+ },
267
+ {
268
+ "epoch": 0.0970873786407767,
269
+ "grad_norm": 0.46481379866600037,
270
+ "learning_rate": 4.411764705882353e-06,
271
+ "loss": 0.4167,
272
+ "step": 10
273
+ },
274
+ {
275
+ "epoch": 0.10679611650485436,
276
+ "grad_norm": 0.7963629961013794,
277
+ "learning_rate": 4.852941176470589e-06,
278
+ "loss": 0.7876,
279
+ "step": 11
280
+ },
281
+ {
282
+ "epoch": 0.11650485436893204,
283
+ "grad_norm": 0.6939969658851624,
284
+ "learning_rate": 5.294117647058824e-06,
285
+ "loss": 0.3714,
286
+ "step": 12
287
+ },
288
+ {
289
+ "epoch": 0.1262135922330097,
290
+ "grad_norm": 0.44538313150405884,
291
+ "learning_rate": 5.735294117647059e-06,
292
+ "loss": 0.1852,
293
+ "step": 13
294
+ },
295
+ {
296
+ "epoch": 0.13592233009708737,
297
+ "grad_norm": 0.3318534791469574,
298
+ "learning_rate": 6.176470588235295e-06,
299
+ "loss": 0.1144,
300
+ "step": 14
301
+ },
302
+ {
303
+ "epoch": 0.14563106796116504,
304
+ "grad_norm": 0.4242781698703766,
305
+ "learning_rate": 6.61764705882353e-06,
306
+ "loss": 0.1234,
307
+ "step": 15
308
+ },
309
+ {
310
+ "epoch": 0.1553398058252427,
311
+ "grad_norm": 0.2060050666332245,
312
+ "learning_rate": 7.058823529411765e-06,
313
+ "loss": 0.0569,
314
+ "step": 16
315
+ },
316
+ {
317
+ "epoch": 0.1553398058252427,
318
+ "eval_Qnli-dev_cosine_accuracy": 0.6953125,
319
+ "eval_Qnli-dev_cosine_accuracy_threshold": 0.6684989929199219,
320
+ "eval_Qnli-dev_cosine_ap": 0.7189160511154118,
321
+ "eval_Qnli-dev_cosine_f1": 0.6764705882352942,
322
+ "eval_Qnli-dev_cosine_f1_threshold": 0.49889329075813293,
323
+ "eval_Qnli-dev_cosine_precision": 0.550531914893617,
324
+ "eval_Qnli-dev_cosine_recall": 0.8771186440677966,
325
+ "eval_Qnli-dev_dot_accuracy": 0.66796875,
326
+ "eval_Qnli-dev_dot_accuracy_threshold": 303.81427001953125,
327
+ "eval_Qnli-dev_dot_ap": 0.6791091476516111,
328
+ "eval_Qnli-dev_dot_f1": 0.6697247706422018,
329
+ "eval_Qnli-dev_dot_f1_threshold": 184.08914184570312,
330
+ "eval_Qnli-dev_dot_precision": 0.5239234449760766,
331
+ "eval_Qnli-dev_dot_recall": 0.9279661016949152,
332
+ "eval_Qnli-dev_euclidean_accuracy": 0.7109375,
333
+ "eval_Qnli-dev_euclidean_accuracy_threshold": 17.21223258972168,
334
+ "eval_Qnli-dev_euclidean_ap": 0.7305713530476123,
335
+ "eval_Qnli-dev_euclidean_f1": 0.6848739495798319,
336
+ "eval_Qnli-dev_euclidean_f1_threshold": 17.62983512878418,
337
+ "eval_Qnli-dev_euclidean_precision": 0.6791666666666667,
338
+ "eval_Qnli-dev_euclidean_recall": 0.690677966101695,
339
+ "eval_Qnli-dev_manhattan_accuracy": 0.712890625,
340
+ "eval_Qnli-dev_manhattan_accuracy_threshold": 367.2698974609375,
341
+ "eval_Qnli-dev_manhattan_ap": 0.7327153618467641,
342
+ "eval_Qnli-dev_manhattan_f1": 0.6824034334763949,
343
+ "eval_Qnli-dev_manhattan_f1_threshold": 368.1672058105469,
344
+ "eval_Qnli-dev_manhattan_precision": 0.691304347826087,
345
+ "eval_Qnli-dev_manhattan_recall": 0.673728813559322,
346
+ "eval_Qnli-dev_max_accuracy": 0.712890625,
347
+ "eval_Qnli-dev_max_accuracy_threshold": 367.2698974609375,
348
+ "eval_Qnli-dev_max_ap": 0.7327153618467641,
349
+ "eval_Qnli-dev_max_f1": 0.6848739495798319,
350
+ "eval_Qnli-dev_max_f1_threshold": 368.1672058105469,
351
+ "eval_Qnli-dev_max_precision": 0.691304347826087,
352
+ "eval_Qnli-dev_max_recall": 0.9279661016949152,
353
+ "eval_allNLI-dev_cosine_accuracy": 0.7421875,
354
+ "eval_allNLI-dev_cosine_accuracy_threshold": 0.7840416431427002,
355
+ "eval_allNLI-dev_cosine_ap": 0.6349487371750362,
356
+ "eval_allNLI-dev_cosine_f1": 0.6367713004484306,
357
+ "eval_allNLI-dev_cosine_f1_threshold": 0.5954304337501526,
358
+ "eval_allNLI-dev_cosine_precision": 0.5201465201465202,
359
+ "eval_allNLI-dev_cosine_recall": 0.8208092485549133,
360
+ "eval_allNLI-dev_dot_accuracy": 0.724609375,
361
+ "eval_allNLI-dev_dot_accuracy_threshold": 291.18670654296875,
362
+ "eval_allNLI-dev_dot_ap": 0.5913303711668598,
363
+ "eval_allNLI-dev_dot_f1": 0.6070588235294118,
364
+ "eval_allNLI-dev_dot_f1_threshold": 242.49884033203125,
365
+ "eval_allNLI-dev_dot_precision": 0.5119047619047619,
366
+ "eval_allNLI-dev_dot_recall": 0.7456647398843931,
367
+ "eval_allNLI-dev_euclidean_accuracy": 0.744140625,
368
+ "eval_allNLI-dev_euclidean_accuracy_threshold": 13.559854507446289,
369
+ "eval_allNLI-dev_euclidean_ap": 0.6408865358113234,
370
+ "eval_allNLI-dev_euclidean_f1": 0.6450116009280741,
371
+ "eval_allNLI-dev_euclidean_f1_threshold": 17.65105628967285,
372
+ "eval_allNLI-dev_euclidean_precision": 0.5387596899224806,
373
+ "eval_allNLI-dev_euclidean_recall": 0.8034682080924855,
374
+ "eval_allNLI-dev_manhattan_accuracy": 0.7421875,
375
+ "eval_allNLI-dev_manhattan_accuracy_threshold": 307.83563232421875,
376
+ "eval_allNLI-dev_manhattan_ap": 0.6414745653251337,
377
+ "eval_allNLI-dev_manhattan_f1": 0.6533665835411472,
378
+ "eval_allNLI-dev_manhattan_f1_threshold": 357.5172119140625,
379
+ "eval_allNLI-dev_manhattan_precision": 0.5745614035087719,
380
+ "eval_allNLI-dev_manhattan_recall": 0.7572254335260116,
381
+ "eval_allNLI-dev_max_accuracy": 0.744140625,
382
+ "eval_allNLI-dev_max_accuracy_threshold": 307.83563232421875,
383
+ "eval_allNLI-dev_max_ap": 0.6414745653251337,
384
+ "eval_allNLI-dev_max_f1": 0.6533665835411472,
385
+ "eval_allNLI-dev_max_f1_threshold": 357.5172119140625,
386
+ "eval_allNLI-dev_max_precision": 0.5745614035087719,
387
+ "eval_allNLI-dev_max_recall": 0.8208092485549133,
388
+ "eval_sequential_score": 0.7327153618467641,
389
+ "eval_sts-test_pearson_cosine": 0.8857317653883374,
390
+ "eval_sts-test_pearson_dot": 0.8740754819923455,
391
+ "eval_sts-test_pearson_euclidean": 0.9064480556463691,
392
+ "eval_sts-test_pearson_manhattan": 0.9071116458208927,
393
+ "eval_sts-test_pearson_max": 0.9071116458208927,
394
+ "eval_sts-test_spearman_cosine": 0.9078527293932209,
395
+ "eval_sts-test_spearman_dot": 0.8744967501974819,
396
+ "eval_sts-test_spearman_euclidean": 0.9030452803869885,
397
+ "eval_sts-test_spearman_manhattan": 0.903667904686727,
398
+ "eval_sts-test_spearman_max": 0.9078527293932209,
399
+ "eval_vitaminc-pairs_loss": 2.799455165863037,
400
+ "eval_vitaminc-pairs_runtime": 3.8555,
401
+ "eval_vitaminc-pairs_samples_per_second": 33.2,
402
+ "eval_vitaminc-pairs_steps_per_second": 0.259,
403
+ "step": 16
404
+ },
405
+ {
406
+ "epoch": 0.1553398058252427,
407
+ "eval_negation-triplets_loss": 1.3501245975494385,
408
+ "eval_negation-triplets_runtime": 0.6331,
409
+ "eval_negation-triplets_samples_per_second": 202.181,
410
+ "eval_negation-triplets_steps_per_second": 1.58,
411
+ "step": 16
412
+ },
413
+ {
414
+ "epoch": 0.1553398058252427,
415
+ "eval_scitail-pairs-pos_loss": 0.060843899846076965,
416
+ "eval_scitail-pairs-pos_runtime": 0.9131,
417
+ "eval_scitail-pairs-pos_samples_per_second": 140.183,
418
+ "eval_scitail-pairs-pos_steps_per_second": 1.095,
419
+ "step": 16
420
+ },
421
+ {
422
+ "epoch": 0.1553398058252427,
423
+ "eval_scitail-pairs-qa_loss": 1.1229322893768767e-07,
424
+ "eval_scitail-pairs-qa_runtime": 0.5691,
425
+ "eval_scitail-pairs-qa_samples_per_second": 224.9,
426
+ "eval_scitail-pairs-qa_steps_per_second": 1.757,
427
+ "step": 16
428
+ },
429
+ {
430
+ "epoch": 0.1553398058252427,
431
+ "eval_xsum-pairs_loss": 0.0001665251620579511,
432
+ "eval_xsum-pairs_runtime": 2.9369,
433
+ "eval_xsum-pairs_samples_per_second": 43.584,
434
+ "eval_xsum-pairs_steps_per_second": 0.34,
435
+ "step": 16
436
+ },
437
+ {
438
+ "epoch": 0.1553398058252427,
439
+ "eval_sciq_pairs_loss": 0.05696266517043114,
440
+ "eval_sciq_pairs_runtime": 2.5397,
441
+ "eval_sciq_pairs_samples_per_second": 34.256,
442
+ "eval_sciq_pairs_steps_per_second": 0.394,
443
+ "step": 16
444
+ },
445
+ {
446
+ "epoch": 0.1553398058252427,
447
+ "eval_qasc_pairs_loss": 0.07378190755844116,
448
+ "eval_qasc_pairs_runtime": 0.3731,
449
+ "eval_qasc_pairs_samples_per_second": 233.155,
450
+ "eval_qasc_pairs_steps_per_second": 2.68,
451
+ "step": 16
452
+ },
453
+ {
454
+ "epoch": 0.1553398058252427,
455
+ "eval_openbookqa_pairs_loss": 1.3709176778793335,
456
+ "eval_openbookqa_pairs_runtime": 0.5704,
457
+ "eval_openbookqa_pairs_samples_per_second": 224.402,
458
+ "eval_openbookqa_pairs_steps_per_second": 1.753,
459
+ "step": 16
460
+ },
461
+ {
462
+ "epoch": 0.1553398058252427,
463
+ "eval_msmarco_pairs_loss": 0.13161690533161163,
464
+ "eval_msmarco_pairs_runtime": 1.0994,
465
+ "eval_msmarco_pairs_samples_per_second": 79.136,
466
+ "eval_msmarco_pairs_steps_per_second": 0.91,
467
+ "step": 16
468
+ },
469
+ {
470
+ "epoch": 0.1553398058252427,
471
+ "eval_nq_pairs_loss": 0.005713976453989744,
472
+ "eval_nq_pairs_runtime": 2.2521,
473
+ "eval_nq_pairs_samples_per_second": 38.631,
474
+ "eval_nq_pairs_steps_per_second": 0.444,
475
+ "step": 16
476
+ },
477
+ {
478
+ "epoch": 0.1553398058252427,
479
+ "eval_trivia_pairs_loss": 0.2678990364074707,
480
+ "eval_trivia_pairs_runtime": 2.4403,
481
+ "eval_trivia_pairs_samples_per_second": 40.979,
482
+ "eval_trivia_pairs_steps_per_second": 0.41,
483
+ "step": 16
484
+ },
485
+ {
486
+ "epoch": 0.1553398058252427,
487
+ "eval_gooaq_pairs_loss": 0.05776378884911537,
488
+ "eval_gooaq_pairs_runtime": 0.5822,
489
+ "eval_gooaq_pairs_samples_per_second": 149.421,
490
+ "eval_gooaq_pairs_steps_per_second": 1.717,
491
+ "step": 16
492
+ },
493
+ {
494
+ "epoch": 0.1553398058252427,
495
+ "eval_paws-pos_loss": 0.04598955065011978,
496
+ "eval_paws-pos_runtime": 0.6747,
497
+ "eval_paws-pos_samples_per_second": 189.727,
498
+ "eval_paws-pos_steps_per_second": 1.482,
499
+ "step": 16
500
+ },
501
+ {
502
+ "epoch": 0.1553398058252427,
503
+ "eval_global_dataset_loss": 0.48036298155784607,
504
+ "eval_global_dataset_runtime": 25.0731,
505
+ "eval_global_dataset_samples_per_second": 26.443,
506
+ "eval_global_dataset_steps_per_second": 0.239,
507
+ "step": 16
508
+ },
509
+ {
510
+ "epoch": 0.1650485436893204,
511
+ "grad_norm": 0.4544373154640198,
512
+ "learning_rate": 7.5e-06,
513
+ "loss": 0.3203,
514
+ "step": 17
515
+ },
516
+ {
517
+ "epoch": 0.17475728155339806,
518
+ "grad_norm": 0.4103369414806366,
519
+ "learning_rate": 7.941176470588236e-06,
520
+ "loss": 0.2113,
521
+ "step": 18
522
+ },
523
+ {
524
+ "epoch": 0.18446601941747573,
525
+ "grad_norm": 1.5407989025115967,
526
+ "learning_rate": 8.382352941176472e-06,
527
+ "loss": 0.3137,
528
+ "step": 19
529
+ },
530
+ {
531
+ "epoch": 0.1941747572815534,
532
+ "grad_norm": 0.46951910853385925,
533
+ "learning_rate": 8.823529411764707e-06,
534
+ "loss": 0.4048,
535
+ "step": 20
536
+ },
537
+ {
538
+ "epoch": 0.20388349514563106,
539
+ "grad_norm": 0.6390318870544434,
540
+ "learning_rate": 9.264705882352942e-06,
541
+ "loss": 0.2855,
542
+ "step": 21
543
+ },
544
+ {
545
+ "epoch": 0.21359223300970873,
546
+ "grad_norm": 0.22210820019245148,
547
+ "learning_rate": 9.705882352941177e-06,
548
+ "loss": 0.0593,
549
+ "step": 22
550
+ },
551
+ {
552
+ "epoch": 0.22330097087378642,
553
+ "grad_norm": 0.7516863346099854,
554
+ "learning_rate": 1.0147058823529413e-05,
555
+ "loss": 0.1228,
556
+ "step": 23
557
+ },
558
+ {
559
+ "epoch": 0.23300970873786409,
560
+ "grad_norm": 0.4010964035987854,
561
+ "learning_rate": 1.0588235294117648e-05,
562
+ "loss": 0.0811,
563
+ "step": 24
564
+ },
565
+ {
566
+ "epoch": 0.23300970873786409,
567
+ "eval_Qnli-dev_cosine_accuracy": 0.6953125,
568
+ "eval_Qnli-dev_cosine_accuracy_threshold": 0.651726484298706,
569
+ "eval_Qnli-dev_cosine_ap": 0.7197319450561322,
570
+ "eval_Qnli-dev_cosine_f1": 0.6766666666666667,
571
+ "eval_Qnli-dev_cosine_f1_threshold": 0.508252739906311,
572
+ "eval_Qnli-dev_cosine_precision": 0.5576923076923077,
573
+ "eval_Qnli-dev_cosine_recall": 0.8601694915254238,
574
+ "eval_Qnli-dev_dot_accuracy": 0.66796875,
575
+ "eval_Qnli-dev_dot_accuracy_threshold": 295.55169677734375,
576
+ "eval_Qnli-dev_dot_ap": 0.6760811849258039,
577
+ "eval_Qnli-dev_dot_f1": 0.6717791411042945,
578
+ "eval_Qnli-dev_dot_f1_threshold": 181.16976928710938,
579
+ "eval_Qnli-dev_dot_precision": 0.5264423076923077,
580
+ "eval_Qnli-dev_dot_recall": 0.9279661016949152,
581
+ "eval_Qnli-dev_euclidean_accuracy": 0.7109375,
582
+ "eval_Qnli-dev_euclidean_accuracy_threshold": 17.042926788330078,
583
+ "eval_Qnli-dev_euclidean_ap": 0.7310579368620943,
584
+ "eval_Qnli-dev_euclidean_f1": 0.6848739495798319,
585
+ "eval_Qnli-dev_euclidean_f1_threshold": 17.435243606567383,
586
+ "eval_Qnli-dev_euclidean_precision": 0.6791666666666667,
587
+ "eval_Qnli-dev_euclidean_recall": 0.690677966101695,
588
+ "eval_Qnli-dev_manhattan_accuracy": 0.708984375,
589
+ "eval_Qnli-dev_manhattan_accuracy_threshold": 364.541748046875,
590
+ "eval_Qnli-dev_manhattan_ap": 0.7326520634446195,
591
+ "eval_Qnli-dev_manhattan_f1": 0.6796747967479675,
592
+ "eval_Qnli-dev_manhattan_f1_threshold": 434.2391357421875,
593
+ "eval_Qnli-dev_manhattan_precision": 0.5514511873350924,
594
+ "eval_Qnli-dev_manhattan_recall": 0.885593220338983,
595
+ "eval_Qnli-dev_max_accuracy": 0.7109375,
596
+ "eval_Qnli-dev_max_accuracy_threshold": 364.541748046875,
597
+ "eval_Qnli-dev_max_ap": 0.7326520634446195,
598
+ "eval_Qnli-dev_max_f1": 0.6848739495798319,
599
+ "eval_Qnli-dev_max_f1_threshold": 434.2391357421875,
600
+ "eval_Qnli-dev_max_precision": 0.6791666666666667,
601
+ "eval_Qnli-dev_max_recall": 0.9279661016949152,
602
+ "eval_allNLI-dev_cosine_accuracy": 0.740234375,
603
+ "eval_allNLI-dev_cosine_accuracy_threshold": 0.7892441749572754,
604
+ "eval_allNLI-dev_cosine_ap": 0.6337074778927934,
605
+ "eval_allNLI-dev_cosine_f1": 0.6367713004484306,
606
+ "eval_allNLI-dev_cosine_f1_threshold": 0.5993767976760864,
607
+ "eval_allNLI-dev_cosine_precision": 0.5201465201465202,
608
+ "eval_allNLI-dev_cosine_recall": 0.8208092485549133,
609
+ "eval_allNLI-dev_dot_accuracy": 0.724609375,
610
+ "eval_allNLI-dev_dot_accuracy_threshold": 286.42156982421875,
611
+ "eval_allNLI-dev_dot_ap": 0.5908477549155566,
612
+ "eval_allNLI-dev_dot_f1": 0.6013667425968109,
613
+ "eval_allNLI-dev_dot_f1_threshold": 234.44680786132812,
614
+ "eval_allNLI-dev_dot_precision": 0.49624060150375937,
615
+ "eval_allNLI-dev_dot_recall": 0.7630057803468208,
616
+ "eval_allNLI-dev_euclidean_accuracy": 0.744140625,
617
+ "eval_allNLI-dev_euclidean_accuracy_threshold": 13.312193870544434,
618
+ "eval_allNLI-dev_euclidean_ap": 0.6395819541470485,
619
+ "eval_allNLI-dev_euclidean_f1": 0.6495327102803738,
620
+ "eval_allNLI-dev_euclidean_f1_threshold": 17.35163116455078,
621
+ "eval_allNLI-dev_euclidean_precision": 0.5450980392156862,
622
+ "eval_allNLI-dev_euclidean_recall": 0.8034682080924855,
623
+ "eval_allNLI-dev_manhattan_accuracy": 0.740234375,
624
+ "eval_allNLI-dev_manhattan_accuracy_threshold": 273.14263916015625,
625
+ "eval_allNLI-dev_manhattan_ap": 0.6396466602355837,
626
+ "eval_allNLI-dev_manhattan_f1": 0.6482758620689655,
627
+ "eval_allNLI-dev_manhattan_f1_threshold": 367.1748046875,
628
+ "eval_allNLI-dev_manhattan_precision": 0.5381679389312977,
629
+ "eval_allNLI-dev_manhattan_recall": 0.815028901734104,
630
+ "eval_allNLI-dev_max_accuracy": 0.744140625,
631
+ "eval_allNLI-dev_max_accuracy_threshold": 286.42156982421875,
632
+ "eval_allNLI-dev_max_ap": 0.6396466602355837,
633
+ "eval_allNLI-dev_max_f1": 0.6495327102803738,
634
+ "eval_allNLI-dev_max_f1_threshold": 367.1748046875,
635
+ "eval_allNLI-dev_max_precision": 0.5450980392156862,
636
+ "eval_allNLI-dev_max_recall": 0.8208092485549133,
637
+ "eval_sequential_score": 0.7326520634446195,
638
+ "eval_sts-test_pearson_cosine": 0.8852721127163659,
639
+ "eval_sts-test_pearson_dot": 0.8721890224510694,
640
+ "eval_sts-test_pearson_euclidean": 0.9051850094674976,
641
+ "eval_sts-test_pearson_manhattan": 0.9058527749212866,
642
+ "eval_sts-test_pearson_max": 0.9058527749212866,
643
+ "eval_sts-test_spearman_cosine": 0.9079463937833133,
644
+ "eval_sts-test_spearman_dot": 0.8718395993011061,
645
+ "eval_sts-test_spearman_euclidean": 0.901890876848136,
646
+ "eval_sts-test_spearman_manhattan": 0.902598976057132,
647
+ "eval_sts-test_spearman_max": 0.9079463937833133,
648
+ "eval_vitaminc-pairs_loss": 2.794250965118408,
649
+ "eval_vitaminc-pairs_runtime": 3.828,
650
+ "eval_vitaminc-pairs_samples_per_second": 33.438,
651
+ "eval_vitaminc-pairs_steps_per_second": 0.261,
652
+ "step": 24
653
+ },
654
+ {
655
+ "epoch": 0.23300970873786409,
656
+ "eval_negation-triplets_loss": 1.340283989906311,
657
+ "eval_negation-triplets_runtime": 0.6301,
658
+ "eval_negation-triplets_samples_per_second": 203.131,
659
+ "eval_negation-triplets_steps_per_second": 1.587,
660
+ "step": 24
661
+ },
662
+ {
663
+ "epoch": 0.23300970873786409,
664
+ "eval_scitail-pairs-pos_loss": 0.060825277119874954,
665
+ "eval_scitail-pairs-pos_runtime": 0.8911,
666
+ "eval_scitail-pairs-pos_samples_per_second": 143.641,
667
+ "eval_scitail-pairs-pos_steps_per_second": 1.122,
668
+ "step": 24
669
+ },
670
+ {
671
+ "epoch": 0.23300970873786409,
672
+ "eval_scitail-pairs-qa_loss": 1.2640072100111865e-07,
673
+ "eval_scitail-pairs-qa_runtime": 0.5679,
674
+ "eval_scitail-pairs-qa_samples_per_second": 225.406,
675
+ "eval_scitail-pairs-qa_steps_per_second": 1.761,
676
+ "step": 24
677
+ },
678
+ {
679
+ "epoch": 0.23300970873786409,
680
+ "eval_xsum-pairs_loss": 0.0001656880631344393,
681
+ "eval_xsum-pairs_runtime": 2.9303,
682
+ "eval_xsum-pairs_samples_per_second": 43.681,
683
+ "eval_xsum-pairs_steps_per_second": 0.341,
684
+ "step": 24
685
+ },
686
+ {
687
+ "epoch": 0.23300970873786409,
688
+ "eval_sciq_pairs_loss": 0.05494913458824158,
689
+ "eval_sciq_pairs_runtime": 2.53,
690
+ "eval_sciq_pairs_samples_per_second": 34.387,
691
+ "eval_sciq_pairs_steps_per_second": 0.395,
692
+ "step": 24
693
+ },
694
+ {
695
+ "epoch": 0.23300970873786409,
696
+ "eval_qasc_pairs_loss": 0.07522959262132645,
697
+ "eval_qasc_pairs_runtime": 0.3767,
698
+ "eval_qasc_pairs_samples_per_second": 230.935,
699
+ "eval_qasc_pairs_steps_per_second": 2.654,
700
+ "step": 24
701
+ },
702
+ {
703
+ "epoch": 0.23300970873786409,
704
+ "eval_openbookqa_pairs_loss": 1.3630236387252808,
705
+ "eval_openbookqa_pairs_runtime": 0.5729,
706
+ "eval_openbookqa_pairs_samples_per_second": 223.429,
707
+ "eval_openbookqa_pairs_steps_per_second": 1.746,
708
+ "step": 24
709
+ },
710
+ {
711
+ "epoch": 0.23300970873786409,
712
+ "eval_msmarco_pairs_loss": 0.14150576293468475,
713
+ "eval_msmarco_pairs_runtime": 1.095,
714
+ "eval_msmarco_pairs_samples_per_second": 79.449,
715
+ "eval_msmarco_pairs_steps_per_second": 0.913,
716
+ "step": 24
717
+ },
718
+ {
719
+ "epoch": 0.23300970873786409,
720
+ "eval_nq_pairs_loss": 0.0068158903159201145,
721
+ "eval_nq_pairs_runtime": 2.2497,
722
+ "eval_nq_pairs_samples_per_second": 38.672,
723
+ "eval_nq_pairs_steps_per_second": 0.445,
724
+ "step": 24
725
+ },
726
+ {
727
+ "epoch": 0.23300970873786409,
728
+ "eval_trivia_pairs_loss": 0.26929065585136414,
729
+ "eval_trivia_pairs_runtime": 2.4805,
730
+ "eval_trivia_pairs_samples_per_second": 40.314,
731
+ "eval_trivia_pairs_steps_per_second": 0.403,
732
+ "step": 24
733
+ },
734
+ {
735
+ "epoch": 0.23300970873786409,
736
+ "eval_gooaq_pairs_loss": 0.05852792412042618,
737
+ "eval_gooaq_pairs_runtime": 0.5795,
738
+ "eval_gooaq_pairs_samples_per_second": 150.128,
739
+ "eval_gooaq_pairs_steps_per_second": 1.726,
740
+ "step": 24
741
+ },
742
+ {
743
+ "epoch": 0.23300970873786409,
744
+ "eval_paws-pos_loss": 0.046059899032115936,
745
+ "eval_paws-pos_runtime": 0.668,
746
+ "eval_paws-pos_samples_per_second": 191.605,
747
+ "eval_paws-pos_steps_per_second": 1.497,
748
+ "step": 24
749
+ },
750
+ {
751
+ "epoch": 0.23300970873786409,
752
+ "eval_global_dataset_loss": 0.4663805067539215,
753
+ "eval_global_dataset_runtime": 25.0232,
754
+ "eval_global_dataset_samples_per_second": 26.495,
755
+ "eval_global_dataset_steps_per_second": 0.24,
756
+ "step": 24
757
+ },
758
+ {
759
+ "epoch": 0.24271844660194175,
760
+ "grad_norm": 0.5014179944992065,
761
+ "learning_rate": 1.1029411764705883e-05,
762
+ "loss": 0.2806,
763
+ "step": 25
764
+ },
765
+ {
766
+ "epoch": 0.2524271844660194,
767
+ "grad_norm": 0.320849746465683,
768
+ "learning_rate": 1.1470588235294118e-05,
769
+ "loss": 0.2202,
770
+ "step": 26
771
+ },
772
+ {
773
+ "epoch": 0.2621359223300971,
774
+ "grad_norm": 0.28120842576026917,
775
+ "learning_rate": 1.1911764705882354e-05,
776
+ "loss": 0.195,
777
+ "step": 27
778
+ },
779
+ {
780
+ "epoch": 0.27184466019417475,
781
+ "grad_norm": 0.755657434463501,
782
+ "learning_rate": 1.235294117647059e-05,
783
+ "loss": 0.1986,
784
+ "step": 28
785
+ },
786
+ {
787
+ "epoch": 0.2815533980582524,
788
+ "grad_norm": 0.5951756238937378,
789
+ "learning_rate": 1.2794117647058824e-05,
790
+ "loss": 0.0546,
791
+ "step": 29
792
+ },
793
+ {
794
+ "epoch": 0.2912621359223301,
795
+ "grad_norm": 0.5201478004455566,
796
+ "learning_rate": 1.323529411764706e-05,
797
+ "loss": 0.3083,
798
+ "step": 30
799
+ },
800
+ {
801
+ "epoch": 0.30097087378640774,
802
+ "grad_norm": 0.261674702167511,
803
+ "learning_rate": 1.3676470588235295e-05,
804
+ "loss": 0.127,
805
+ "step": 31
806
+ },
807
+ {
808
+ "epoch": 0.3106796116504854,
809
+ "grad_norm": 0.6115577220916748,
810
+ "learning_rate": 1.411764705882353e-05,
811
+ "loss": 0.2447,
812
+ "step": 32
813
+ },
814
+ {
815
+ "epoch": 0.3106796116504854,
816
+ "eval_Qnli-dev_cosine_accuracy": 0.6953125,
817
+ "eval_Qnli-dev_cosine_accuracy_threshold": 0.6684017181396484,
818
+ "eval_Qnli-dev_cosine_ap": 0.7203045959833639,
819
+ "eval_Qnli-dev_cosine_f1": 0.6755852842809364,
820
+ "eval_Qnli-dev_cosine_f1_threshold": 0.5110002756118774,
821
+ "eval_Qnli-dev_cosine_precision": 0.5580110497237569,
822
+ "eval_Qnli-dev_cosine_recall": 0.8559322033898306,
823
+ "eval_Qnli-dev_dot_accuracy": 0.66796875,
824
+ "eval_Qnli-dev_dot_accuracy_threshold": 294.4780578613281,
825
+ "eval_Qnli-dev_dot_ap": 0.676288649042553,
826
+ "eval_Qnli-dev_dot_f1": 0.6738794435857806,
827
+ "eval_Qnli-dev_dot_f1_threshold": 184.11688232421875,
828
+ "eval_Qnli-dev_dot_precision": 0.5304136253041363,
829
+ "eval_Qnli-dev_dot_recall": 0.923728813559322,
830
+ "eval_Qnli-dev_euclidean_accuracy": 0.7109375,
831
+ "eval_Qnli-dev_euclidean_accuracy_threshold": 16.85451889038086,
832
+ "eval_Qnli-dev_euclidean_ap": 0.7312433122162235,
833
+ "eval_Qnli-dev_euclidean_f1": 0.6848739495798319,
834
+ "eval_Qnli-dev_euclidean_f1_threshold": 17.339509963989258,
835
+ "eval_Qnli-dev_euclidean_precision": 0.6791666666666667,
836
+ "eval_Qnli-dev_euclidean_recall": 0.690677966101695,
837
+ "eval_Qnli-dev_manhattan_accuracy": 0.70703125,
838
+ "eval_Qnli-dev_manhattan_accuracy_threshold": 359.81439208984375,
839
+ "eval_Qnli-dev_manhattan_ap": 0.7325123059741169,
840
+ "eval_Qnli-dev_manhattan_f1": 0.6800804828973843,
841
+ "eval_Qnli-dev_manhattan_f1_threshold": 374.1423645019531,
842
+ "eval_Qnli-dev_manhattan_precision": 0.6475095785440613,
843
+ "eval_Qnli-dev_manhattan_recall": 0.7161016949152542,
844
+ "eval_Qnli-dev_max_accuracy": 0.7109375,
845
+ "eval_Qnli-dev_max_accuracy_threshold": 359.81439208984375,
846
+ "eval_Qnli-dev_max_ap": 0.7325123059741169,
847
+ "eval_Qnli-dev_max_f1": 0.6848739495798319,
848
+ "eval_Qnli-dev_max_f1_threshold": 374.1423645019531,
849
+ "eval_Qnli-dev_max_precision": 0.6791666666666667,
850
+ "eval_Qnli-dev_max_recall": 0.923728813559322,
851
+ "eval_allNLI-dev_cosine_accuracy": 0.7421875,
852
+ "eval_allNLI-dev_cosine_accuracy_threshold": 0.7891373634338379,
853
+ "eval_allNLI-dev_cosine_ap": 0.6322954079925401,
854
+ "eval_allNLI-dev_cosine_f1": 0.6337078651685394,
855
+ "eval_allNLI-dev_cosine_f1_threshold": 0.6023260354995728,
856
+ "eval_allNLI-dev_cosine_precision": 0.5183823529411765,
857
+ "eval_allNLI-dev_cosine_recall": 0.815028901734104,
858
+ "eval_allNLI-dev_dot_accuracy": 0.724609375,
859
+ "eval_allNLI-dev_dot_accuracy_threshold": 283.5550842285156,
860
+ "eval_allNLI-dev_dot_ap": 0.5884575729907496,
861
+ "eval_allNLI-dev_dot_f1": 0.5986394557823129,
862
+ "eval_allNLI-dev_dot_f1_threshold": 231.64846801757812,
863
+ "eval_allNLI-dev_dot_precision": 0.4925373134328358,
864
+ "eval_allNLI-dev_dot_recall": 0.7630057803468208,
865
+ "eval_allNLI-dev_euclidean_accuracy": 0.7421875,
866
+ "eval_allNLI-dev_euclidean_accuracy_threshold": 13.253250122070312,
867
+ "eval_allNLI-dev_euclidean_ap": 0.6378114586204184,
868
+ "eval_allNLI-dev_euclidean_f1": 0.6512702078521939,
869
+ "eval_allNLI-dev_euclidean_f1_threshold": 17.237594604492188,
870
+ "eval_allNLI-dev_euclidean_precision": 0.5423076923076923,
871
+ "eval_allNLI-dev_euclidean_recall": 0.815028901734104,
872
+ "eval_allNLI-dev_manhattan_accuracy": 0.740234375,
873
+ "eval_allNLI-dev_manhattan_accuracy_threshold": 267.76416015625,
874
+ "eval_allNLI-dev_manhattan_ap": 0.6391624752319756,
875
+ "eval_allNLI-dev_manhattan_f1": 0.6457831325301204,
876
+ "eval_allNLI-dev_manhattan_f1_threshold": 353.9185791015625,
877
+ "eval_allNLI-dev_manhattan_precision": 0.5537190082644629,
878
+ "eval_allNLI-dev_manhattan_recall": 0.7745664739884393,
879
+ "eval_allNLI-dev_max_accuracy": 0.7421875,
880
+ "eval_allNLI-dev_max_accuracy_threshold": 283.5550842285156,
881
+ "eval_allNLI-dev_max_ap": 0.6391624752319756,
882
+ "eval_allNLI-dev_max_f1": 0.6512702078521939,
883
+ "eval_allNLI-dev_max_f1_threshold": 353.9185791015625,
884
+ "eval_allNLI-dev_max_precision": 0.5537190082644629,
885
+ "eval_allNLI-dev_max_recall": 0.815028901734104,
886
+ "eval_sequential_score": 0.7325123059741169,
887
+ "eval_sts-test_pearson_cosine": 0.8848940721874675,
888
+ "eval_sts-test_pearson_dot": 0.8708637407541432,
889
+ "eval_sts-test_pearson_euclidean": 0.9044975746549173,
890
+ "eval_sts-test_pearson_manhattan": 0.905176437582178,
891
+ "eval_sts-test_pearson_max": 0.905176437582178,
892
+ "eval_sts-test_spearman_cosine": 0.9077582147291623,
893
+ "eval_sts-test_spearman_dot": 0.870147285600329,
894
+ "eval_sts-test_spearman_euclidean": 0.9012842734999755,
895
+ "eval_sts-test_spearman_manhattan": 0.9020171201565211,
896
+ "eval_sts-test_spearman_max": 0.9077582147291623,
897
+ "eval_vitaminc-pairs_loss": 2.790738105773926,
898
+ "eval_vitaminc-pairs_runtime": 3.8263,
899
+ "eval_vitaminc-pairs_samples_per_second": 33.453,
900
+ "eval_vitaminc-pairs_steps_per_second": 0.261,
901
+ "step": 32
902
+ },
903
+ {
904
+ "epoch": 0.3106796116504854,
905
+ "eval_negation-triplets_loss": 1.3332663774490356,
906
+ "eval_negation-triplets_runtime": 0.631,
907
+ "eval_negation-triplets_samples_per_second": 202.843,
908
+ "eval_negation-triplets_steps_per_second": 1.585,
909
+ "step": 32
910
+ },
911
+ {
912
+ "epoch": 0.3106796116504854,
913
+ "eval_scitail-pairs-pos_loss": 0.06070985272526741,
914
+ "eval_scitail-pairs-pos_runtime": 0.8894,
915
+ "eval_scitail-pairs-pos_samples_per_second": 143.912,
916
+ "eval_scitail-pairs-pos_steps_per_second": 1.124,
917
+ "step": 32
918
+ },
919
+ {
920
+ "epoch": 0.3106796116504854,
921
+ "eval_scitail-pairs-qa_loss": 1.3768674023140193e-07,
922
+ "eval_scitail-pairs-qa_runtime": 0.5735,
923
+ "eval_scitail-pairs-qa_samples_per_second": 223.193,
924
+ "eval_scitail-pairs-qa_steps_per_second": 1.744,
925
+ "step": 32
926
+ },
927
+ {
928
+ "epoch": 0.3106796116504854,
929
+ "eval_xsum-pairs_loss": 0.00016528123524039984,
930
+ "eval_xsum-pairs_runtime": 2.9269,
931
+ "eval_xsum-pairs_samples_per_second": 43.733,
932
+ "eval_xsum-pairs_steps_per_second": 0.342,
933
+ "step": 32
934
+ },
935
+ {
936
+ "epoch": 0.3106796116504854,
937
+ "eval_sciq_pairs_loss": 0.053717803210020065,
938
+ "eval_sciq_pairs_runtime": 2.507,
939
+ "eval_sciq_pairs_samples_per_second": 34.703,
940
+ "eval_sciq_pairs_steps_per_second": 0.399,
941
+ "step": 32
942
+ },
943
+ {
944
+ "epoch": 0.3106796116504854,
945
+ "eval_qasc_pairs_loss": 0.0767521858215332,
946
+ "eval_qasc_pairs_runtime": 0.3716,
947
+ "eval_qasc_pairs_samples_per_second": 234.106,
948
+ "eval_qasc_pairs_steps_per_second": 2.691,
949
+ "step": 32
950
+ },
951
+ {
952
+ "epoch": 0.3106796116504854,
953
+ "eval_openbookqa_pairs_loss": 1.3624300956726074,
954
+ "eval_openbookqa_pairs_runtime": 0.5694,
955
+ "eval_openbookqa_pairs_samples_per_second": 224.793,
956
+ "eval_openbookqa_pairs_steps_per_second": 1.756,
957
+ "step": 32
958
+ },
959
+ {
960
+ "epoch": 0.3106796116504854,
961
+ "eval_msmarco_pairs_loss": 0.1503019779920578,
962
+ "eval_msmarco_pairs_runtime": 1.0979,
963
+ "eval_msmarco_pairs_samples_per_second": 79.242,
964
+ "eval_msmarco_pairs_steps_per_second": 0.911,
965
+ "step": 32
966
+ },
967
+ {
968
+ "epoch": 0.3106796116504854,
969
+ "eval_nq_pairs_loss": 0.007631430868059397,
970
+ "eval_nq_pairs_runtime": 2.2394,
971
+ "eval_nq_pairs_samples_per_second": 38.849,
972
+ "eval_nq_pairs_steps_per_second": 0.447,
973
+ "step": 32
974
+ },
975
+ {
976
+ "epoch": 0.3106796116504854,
977
+ "eval_trivia_pairs_loss": 0.2704613208770752,
978
+ "eval_trivia_pairs_runtime": 2.4373,
979
+ "eval_trivia_pairs_samples_per_second": 41.029,
980
+ "eval_trivia_pairs_steps_per_second": 0.41,
981
+ "step": 32
982
+ },
983
+ {
984
+ "epoch": 0.3106796116504854,
985
+ "eval_gooaq_pairs_loss": 0.05935850366950035,
986
+ "eval_gooaq_pairs_runtime": 0.5791,
987
+ "eval_gooaq_pairs_samples_per_second": 150.232,
988
+ "eval_gooaq_pairs_steps_per_second": 1.727,
989
+ "step": 32
990
+ },
991
+ {
992
+ "epoch": 0.3106796116504854,
993
+ "eval_paws-pos_loss": 0.04611051082611084,
994
+ "eval_paws-pos_runtime": 0.6911,
995
+ "eval_paws-pos_samples_per_second": 185.199,
996
+ "eval_paws-pos_steps_per_second": 1.447,
997
+ "step": 32
998
+ },
999
+ {
1000
+ "epoch": 0.3106796116504854,
1001
+ "eval_global_dataset_loss": 0.4507196247577667,
1002
+ "eval_global_dataset_runtime": 25.0483,
1003
+ "eval_global_dataset_samples_per_second": 26.469,
1004
+ "eval_global_dataset_steps_per_second": 0.24,
1005
+ "step": 32
1006
+ }
1007
+ ],
1008
+ "logging_steps": 1,
1009
+ "max_steps": 309,
1010
+ "num_input_tokens_seen": 0,
1011
+ "num_train_epochs": 3,
1012
+ "save_steps": 16,
1013
+ "stateful_callbacks": {
1014
+ "TrainerControl": {
1015
+ "args": {
1016
+ "should_epoch_stop": false,
1017
+ "should_evaluate": false,
1018
+ "should_log": false,
1019
+ "should_save": true,
1020
+ "should_training_stop": false
1021
+ },
1022
+ "attributes": {}
1023
+ }
1024
+ },
1025
+ "total_flos": 0.0,
1026
+ "train_batch_size": 96,
1027
+ "trial_name": null,
1028
+ "trial_params": null
1029
+ }
checkpoint-32/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00d891109b3776bbe6fe5299eaca62fc99db8369e44011108f84f516eaa2f0e0
3
+ size 5624