bobox commited on
Commit
61bbb61
·
verified ·
1 Parent(s): 7c6b2f9

Training in progress, step 387, checkpoint

Browse files
checkpoint-387/1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 1536,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false,
7
+ "pooling_mode_weightedmean_tokens": false,
8
+ "pooling_mode_lasttoken": false,
9
+ "include_prompt": true
10
+ }
checkpoint-387/README.md ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-387/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
checkpoint-387/config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "microsoft/deberta-v2-xlarge",
3
+ "architectures": [
4
+ "DebertaV2Model"
5
+ ],
6
+ "attention_head_size": 64,
7
+ "attention_probs_dropout_prob": 0.1,
8
+ "conv_act": "gelu",
9
+ "conv_kernel_size": 3,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 1536,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 6144,
15
+ "layer_norm_eps": 1e-07,
16
+ "max_position_embeddings": 512,
17
+ "max_relative_positions": -1,
18
+ "model_type": "deberta-v2",
19
+ "norm_rel_ebd": "layer_norm",
20
+ "num_attention_heads": 24,
21
+ "num_hidden_layers": 24,
22
+ "pad_token_id": 0,
23
+ "pooler_dropout": 0,
24
+ "pooler_hidden_act": "gelu",
25
+ "pooler_hidden_size": 1536,
26
+ "pos_att_type": [
27
+ "p2c",
28
+ "c2p"
29
+ ],
30
+ "position_biased_input": false,
31
+ "position_buckets": 256,
32
+ "relative_attention": true,
33
+ "share_att_key": true,
34
+ "torch_dtype": "float32",
35
+ "transformers_version": "4.42.4",
36
+ "type_vocab_size": 0,
37
+ "vocab_size": 128100
38
+ }
checkpoint-387/config_sentence_transformers.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "3.0.1",
4
+ "transformers": "4.42.4",
5
+ "pytorch": "2.4.0+cu121"
6
+ },
7
+ "prompts": {},
8
+ "default_prompt_name": null,
9
+ "similarity_fn_name": null
10
+ }
checkpoint-387/modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ }
14
+ ]
checkpoint-387/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f44922bc02dadba770b21ec8da4096153d86f2a9174acde5e112b715a8adff71
3
+ size 7077084396
checkpoint-387/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbb393a78debafa192d30d98f9cbce6c8af8b145bc2e6747f2ebb41c0be3481b
3
+ size 3538506546
checkpoint-387/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6055e5d61ef1105743a47fab37b01a482caf029d5010a330b5ae074f081235d6
3
+ size 14244
checkpoint-387/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1688c9ad0e7b899d5c70cc2d961b11a0a2fe65e76a919390b1cfddca41437f4
3
+ size 1064
checkpoint-387/sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 512,
3
+ "do_lower_case": false
4
+ }
checkpoint-387/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": {
9
+ "content": "[UNK]",
10
+ "lstrip": false,
11
+ "normalized": true,
12
+ "rstrip": false,
13
+ "single_word": false
14
+ }
15
+ }
checkpoint-387/spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5598d5e96f339a8d980c15f9afd405a2e5e1be7db41de3ed13b0f03fac1e8c17
3
+ size 2447305
checkpoint-387/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-387/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "128000": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "[CLS]",
45
+ "clean_up_tokenization_spaces": true,
46
+ "cls_token": "[CLS]",
47
+ "do_lower_case": false,
48
+ "eos_token": "[SEP]",
49
+ "mask_token": "[MASK]",
50
+ "model_max_length": 1000000000000000019884624838656,
51
+ "pad_token": "[PAD]",
52
+ "sep_token": "[SEP]",
53
+ "sp_model_kwargs": {},
54
+ "split_by_punct": false,
55
+ "tokenizer_class": "DebertaV2Tokenizer",
56
+ "unk_token": "[UNK]",
57
+ "vocab_type": "spm"
58
+ }
checkpoint-387/trainer_state.json ADDED
@@ -0,0 +1,745 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.20041429311237702,
5
+ "eval_steps": 97,
6
+ "global_step": 387,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.010357327809425169,
13
+ "grad_norm": 160.78689575195312,
14
+ "learning_rate": 3.529411764705882e-07,
15
+ "loss": 10.2062,
16
+ "step": 20
17
+ },
18
+ {
19
+ "epoch": 0.020714655618850338,
20
+ "grad_norm": 303.3471374511719,
21
+ "learning_rate": 8.000000000000001e-07,
22
+ "loss": 7.9221,
23
+ "step": 40
24
+ },
25
+ {
26
+ "epoch": 0.031071983428275506,
27
+ "grad_norm": 81.0146484375,
28
+ "learning_rate": 1.2235294117647059e-06,
29
+ "loss": 5.9499,
30
+ "step": 60
31
+ },
32
+ {
33
+ "epoch": 0.041429311237700675,
34
+ "grad_norm": 88.74897003173828,
35
+ "learning_rate": 1.6470588235294118e-06,
36
+ "loss": 6.0555,
37
+ "step": 80
38
+ },
39
+ {
40
+ "epoch": 0.050233039875712066,
41
+ "eval_Qnli-dev_cosine_accuracy": 0.62109375,
42
+ "eval_Qnli-dev_cosine_accuracy_threshold": 0.9108127355575562,
43
+ "eval_Qnli-dev_cosine_ap": 0.6197524033200674,
44
+ "eval_Qnli-dev_cosine_f1": 0.6396396396396397,
45
+ "eval_Qnli-dev_cosine_f1_threshold": 0.8376526832580566,
46
+ "eval_Qnli-dev_cosine_precision": 0.49534883720930234,
47
+ "eval_Qnli-dev_cosine_recall": 0.902542372881356,
48
+ "eval_Qnli-dev_dot_accuracy": 0.58984375,
49
+ "eval_Qnli-dev_dot_accuracy_threshold": 865.2555541992188,
50
+ "eval_Qnli-dev_dot_ap": 0.5567642852275692,
51
+ "eval_Qnli-dev_dot_f1": 0.6363636363636364,
52
+ "eval_Qnli-dev_dot_f1_threshold": 691.9456787109375,
53
+ "eval_Qnli-dev_dot_precision": 0.47863247863247865,
54
+ "eval_Qnli-dev_dot_recall": 0.9491525423728814,
55
+ "eval_Qnli-dev_euclidean_accuracy": 0.609375,
56
+ "eval_Qnli-dev_euclidean_accuracy_threshold": 13.323524475097656,
57
+ "eval_Qnli-dev_euclidean_ap": 0.6115116478210071,
58
+ "eval_Qnli-dev_euclidean_f1": 0.6449612403100775,
59
+ "eval_Qnli-dev_euclidean_f1_threshold": 16.585830688476562,
60
+ "eval_Qnli-dev_euclidean_precision": 0.508557457212714,
61
+ "eval_Qnli-dev_euclidean_recall": 0.8813559322033898,
62
+ "eval_Qnli-dev_manhattan_accuracy": 0.619140625,
63
+ "eval_Qnli-dev_manhattan_accuracy_threshold": 406.1038818359375,
64
+ "eval_Qnli-dev_manhattan_ap": 0.609904024113499,
65
+ "eval_Qnli-dev_manhattan_f1": 0.6494345718901454,
66
+ "eval_Qnli-dev_manhattan_f1_threshold": 484.52716064453125,
67
+ "eval_Qnli-dev_manhattan_precision": 0.5248041775456919,
68
+ "eval_Qnli-dev_manhattan_recall": 0.8516949152542372,
69
+ "eval_Qnli-dev_max_accuracy": 0.62109375,
70
+ "eval_Qnli-dev_max_accuracy_threshold": 865.2555541992188,
71
+ "eval_Qnli-dev_max_ap": 0.6197524033200674,
72
+ "eval_Qnli-dev_max_f1": 0.6494345718901454,
73
+ "eval_Qnli-dev_max_f1_threshold": 691.9456787109375,
74
+ "eval_Qnli-dev_max_precision": 0.5248041775456919,
75
+ "eval_Qnli-dev_max_recall": 0.9491525423728814,
76
+ "eval_allNLI-dev_cosine_accuracy": 0.67578125,
77
+ "eval_allNLI-dev_cosine_accuracy_threshold": 0.9652533531188965,
78
+ "eval_allNLI-dev_cosine_ap": 0.4282858392784667,
79
+ "eval_allNLI-dev_cosine_f1": 0.515527950310559,
80
+ "eval_allNLI-dev_cosine_f1_threshold": 0.798592746257782,
81
+ "eval_allNLI-dev_cosine_precision": 0.3524416135881104,
82
+ "eval_allNLI-dev_cosine_recall": 0.9595375722543352,
83
+ "eval_allNLI-dev_dot_accuracy": 0.666015625,
84
+ "eval_allNLI-dev_dot_accuracy_threshold": 968.9529418945312,
85
+ "eval_allNLI-dev_dot_ap": 0.36425260705842155,
86
+ "eval_allNLI-dev_dot_f1": 0.5162287480680062,
87
+ "eval_allNLI-dev_dot_f1_threshold": 686.5814208984375,
88
+ "eval_allNLI-dev_dot_precision": 0.35232067510548526,
89
+ "eval_allNLI-dev_dot_recall": 0.9653179190751445,
90
+ "eval_allNLI-dev_euclidean_accuracy": 0.67578125,
91
+ "eval_allNLI-dev_euclidean_accuracy_threshold": 8.16073226928711,
92
+ "eval_allNLI-dev_euclidean_ap": 0.4333583117036793,
93
+ "eval_allNLI-dev_euclidean_f1": 0.5164319248826291,
94
+ "eval_allNLI-dev_euclidean_f1_threshold": 18.877037048339844,
95
+ "eval_allNLI-dev_euclidean_precision": 0.3540772532188841,
96
+ "eval_allNLI-dev_euclidean_recall": 0.953757225433526,
97
+ "eval_allNLI-dev_manhattan_accuracy": 0.67578125,
98
+ "eval_allNLI-dev_manhattan_accuracy_threshold": 226.18099975585938,
99
+ "eval_allNLI-dev_manhattan_ap": 0.4400955405569059,
100
+ "eval_allNLI-dev_manhattan_f1": 0.5179407176287052,
101
+ "eval_allNLI-dev_manhattan_f1_threshold": 570.2012329101562,
102
+ "eval_allNLI-dev_manhattan_precision": 0.3547008547008547,
103
+ "eval_allNLI-dev_manhattan_recall": 0.9595375722543352,
104
+ "eval_allNLI-dev_max_accuracy": 0.67578125,
105
+ "eval_allNLI-dev_max_accuracy_threshold": 968.9529418945312,
106
+ "eval_allNLI-dev_max_ap": 0.4400955405569059,
107
+ "eval_allNLI-dev_max_f1": 0.5179407176287052,
108
+ "eval_allNLI-dev_max_f1_threshold": 686.5814208984375,
109
+ "eval_allNLI-dev_max_precision": 0.3547008547008547,
110
+ "eval_allNLI-dev_max_recall": 0.9653179190751445,
111
+ "eval_sequential_score": 0.6197524033200674,
112
+ "eval_sts-test_pearson_cosine": 0.6170839897033953,
113
+ "eval_sts-test_pearson_dot": 0.43346770865150264,
114
+ "eval_sts-test_pearson_euclidean": 0.6474775644966124,
115
+ "eval_sts-test_pearson_manhattan": 0.6616828287248389,
116
+ "eval_sts-test_pearson_max": 0.6616828287248389,
117
+ "eval_sts-test_spearman_cosine": 0.6552392427969004,
118
+ "eval_sts-test_spearman_dot": 0.4585595522909849,
119
+ "eval_sts-test_spearman_euclidean": 0.652406174691995,
120
+ "eval_sts-test_spearman_manhattan": 0.6662387448368152,
121
+ "eval_sts-test_spearman_max": 0.6662387448368152,
122
+ "eval_vitaminc-pairs_loss": 3.7554073333740234,
123
+ "eval_vitaminc-pairs_runtime": 4.7418,
124
+ "eval_vitaminc-pairs_samples_per_second": 26.994,
125
+ "eval_vitaminc-pairs_steps_per_second": 0.211,
126
+ "step": 97
127
+ },
128
+ {
129
+ "epoch": 0.050233039875712066,
130
+ "eval_negation-triplets_loss": 3.6897997856140137,
131
+ "eval_negation-triplets_runtime": 3.1578,
132
+ "eval_negation-triplets_samples_per_second": 40.534,
133
+ "eval_negation-triplets_steps_per_second": 0.317,
134
+ "step": 97
135
+ },
136
+ {
137
+ "epoch": 0.050233039875712066,
138
+ "eval_scitail-pairs-pos_loss": 0.45563364028930664,
139
+ "eval_scitail-pairs-pos_runtime": 2.7223,
140
+ "eval_scitail-pairs-pos_samples_per_second": 47.02,
141
+ "eval_scitail-pairs-pos_steps_per_second": 0.367,
142
+ "step": 97
143
+ },
144
+ {
145
+ "epoch": 0.050233039875712066,
146
+ "eval_scitail-pairs-qa_loss": 1.115855097770691,
147
+ "eval_scitail-pairs-qa_runtime": 2.2597,
148
+ "eval_scitail-pairs-qa_samples_per_second": 56.646,
149
+ "eval_scitail-pairs-qa_steps_per_second": 0.443,
150
+ "step": 97
151
+ },
152
+ {
153
+ "epoch": 0.050233039875712066,
154
+ "eval_xsum-pairs_loss": 3.2118453979492188,
155
+ "eval_xsum-pairs_runtime": 3.0538,
156
+ "eval_xsum-pairs_samples_per_second": 41.915,
157
+ "eval_xsum-pairs_steps_per_second": 0.327,
158
+ "step": 97
159
+ },
160
+ {
161
+ "epoch": 0.050233039875712066,
162
+ "eval_sciq_pairs_loss": 0.484823614358902,
163
+ "eval_sciq_pairs_runtime": 3.959,
164
+ "eval_sciq_pairs_samples_per_second": 32.331,
165
+ "eval_sciq_pairs_steps_per_second": 0.253,
166
+ "step": 97
167
+ },
168
+ {
169
+ "epoch": 0.050233039875712066,
170
+ "eval_qasc_pairs_loss": 2.8566131591796875,
171
+ "eval_qasc_pairs_runtime": 2.1087,
172
+ "eval_qasc_pairs_samples_per_second": 60.701,
173
+ "eval_qasc_pairs_steps_per_second": 0.474,
174
+ "step": 97
175
+ },
176
+ {
177
+ "epoch": 0.050233039875712066,
178
+ "eval_openbookqa_pairs_loss": 2.1501104831695557,
179
+ "eval_openbookqa_pairs_runtime": 2.2555,
180
+ "eval_openbookqa_pairs_samples_per_second": 56.751,
181
+ "eval_openbookqa_pairs_steps_per_second": 0.443,
182
+ "step": 97
183
+ },
184
+ {
185
+ "epoch": 0.050233039875712066,
186
+ "eval_msmarco_pairs_loss": 4.395960807800293,
187
+ "eval_msmarco_pairs_runtime": 2.2407,
188
+ "eval_msmarco_pairs_samples_per_second": 57.125,
189
+ "eval_msmarco_pairs_steps_per_second": 0.446,
190
+ "step": 97
191
+ },
192
+ {
193
+ "epoch": 0.050233039875712066,
194
+ "eval_nq_pairs_loss": 4.488173484802246,
195
+ "eval_nq_pairs_runtime": 2.7484,
196
+ "eval_nq_pairs_samples_per_second": 46.572,
197
+ "eval_nq_pairs_steps_per_second": 0.364,
198
+ "step": 97
199
+ },
200
+ {
201
+ "epoch": 0.050233039875712066,
202
+ "eval_trivia_pairs_loss": 4.023955345153809,
203
+ "eval_trivia_pairs_runtime": 3.7908,
204
+ "eval_trivia_pairs_samples_per_second": 33.766,
205
+ "eval_trivia_pairs_steps_per_second": 0.264,
206
+ "step": 97
207
+ },
208
+ {
209
+ "epoch": 0.050233039875712066,
210
+ "eval_gooaq_pairs_loss": 3.383638858795166,
211
+ "eval_gooaq_pairs_runtime": 2.1349,
212
+ "eval_gooaq_pairs_samples_per_second": 59.957,
213
+ "eval_gooaq_pairs_steps_per_second": 0.468,
214
+ "step": 97
215
+ },
216
+ {
217
+ "epoch": 0.050233039875712066,
218
+ "eval_paws-pos_loss": 0.12275903671979904,
219
+ "eval_paws-pos_runtime": 2.2818,
220
+ "eval_paws-pos_samples_per_second": 56.095,
221
+ "eval_paws-pos_steps_per_second": 0.438,
222
+ "step": 97
223
+ },
224
+ {
225
+ "epoch": 0.050233039875712066,
226
+ "eval_global_dataset_loss": 1.9564138650894165,
227
+ "eval_global_dataset_runtime": 10.2217,
228
+ "eval_global_dataset_samples_per_second": 40.698,
229
+ "eval_global_dataset_steps_per_second": 0.391,
230
+ "step": 97
231
+ },
232
+ {
233
+ "epoch": 0.05178663904712584,
234
+ "grad_norm": 83.8360824584961,
235
+ "learning_rate": 2.1176470588235296e-06,
236
+ "loss": 4.0315,
237
+ "step": 100
238
+ },
239
+ {
240
+ "epoch": 0.06214396685655101,
241
+ "grad_norm": 325.5680236816406,
242
+ "learning_rate": 2.588235294117647e-06,
243
+ "loss": 1.6348,
244
+ "step": 120
245
+ },
246
+ {
247
+ "epoch": 0.07250129466597618,
248
+ "grad_norm": 106.99758911132812,
249
+ "learning_rate": 3.0588235294117647e-06,
250
+ "loss": 1.1866,
251
+ "step": 140
252
+ },
253
+ {
254
+ "epoch": 0.08285862247540135,
255
+ "grad_norm": 30.390771865844727,
256
+ "learning_rate": 3.5294117647058825e-06,
257
+ "loss": 0.6138,
258
+ "step": 160
259
+ },
260
+ {
261
+ "epoch": 0.09321595028482652,
262
+ "grad_norm": 39.691532135009766,
263
+ "learning_rate": 4e-06,
264
+ "loss": 0.5244,
265
+ "step": 180
266
+ },
267
+ {
268
+ "epoch": 0.10046607975142413,
269
+ "eval_Qnli-dev_cosine_accuracy": 0.666015625,
270
+ "eval_Qnli-dev_cosine_accuracy_threshold": 0.7696025967597961,
271
+ "eval_Qnli-dev_cosine_ap": 0.693851901846308,
272
+ "eval_Qnli-dev_cosine_f1": 0.6625000000000001,
273
+ "eval_Qnli-dev_cosine_f1_threshold": 0.6638460159301758,
274
+ "eval_Qnli-dev_cosine_precision": 0.5247524752475248,
275
+ "eval_Qnli-dev_cosine_recall": 0.8983050847457628,
276
+ "eval_Qnli-dev_dot_accuracy": 0.6796875,
277
+ "eval_Qnli-dev_dot_accuracy_threshold": 822.6981201171875,
278
+ "eval_Qnli-dev_dot_ap": 0.6603086879421342,
279
+ "eval_Qnli-dev_dot_f1": 0.657856093979442,
280
+ "eval_Qnli-dev_dot_f1_threshold": 618.4547119140625,
281
+ "eval_Qnli-dev_dot_precision": 0.503370786516854,
282
+ "eval_Qnli-dev_dot_recall": 0.9491525423728814,
283
+ "eval_Qnli-dev_euclidean_accuracy": 0.666015625,
284
+ "eval_Qnli-dev_euclidean_accuracy_threshold": 19.874027252197266,
285
+ "eval_Qnli-dev_euclidean_ap": 0.694158709095853,
286
+ "eval_Qnli-dev_euclidean_f1": 0.6630236794171221,
287
+ "eval_Qnli-dev_euclidean_f1_threshold": 23.005264282226562,
288
+ "eval_Qnli-dev_euclidean_precision": 0.5814696485623003,
289
+ "eval_Qnli-dev_euclidean_recall": 0.7711864406779662,
290
+ "eval_Qnli-dev_manhattan_accuracy": 0.66796875,
291
+ "eval_Qnli-dev_manhattan_accuracy_threshold": 624.285888671875,
292
+ "eval_Qnli-dev_manhattan_ap": 0.692633721446368,
293
+ "eval_Qnli-dev_manhattan_f1": 0.6641366223908918,
294
+ "eval_Qnli-dev_manhattan_f1_threshold": 696.709716796875,
295
+ "eval_Qnli-dev_manhattan_precision": 0.6013745704467354,
296
+ "eval_Qnli-dev_manhattan_recall": 0.7415254237288136,
297
+ "eval_Qnli-dev_max_accuracy": 0.6796875,
298
+ "eval_Qnli-dev_max_accuracy_threshold": 822.6981201171875,
299
+ "eval_Qnli-dev_max_ap": 0.694158709095853,
300
+ "eval_Qnli-dev_max_f1": 0.6641366223908918,
301
+ "eval_Qnli-dev_max_f1_threshold": 696.709716796875,
302
+ "eval_Qnli-dev_max_precision": 0.6013745704467354,
303
+ "eval_Qnli-dev_max_recall": 0.9491525423728814,
304
+ "eval_allNLI-dev_cosine_accuracy": 0.701171875,
305
+ "eval_allNLI-dev_cosine_accuracy_threshold": 0.854247510433197,
306
+ "eval_allNLI-dev_cosine_ap": 0.5504250327111149,
307
+ "eval_allNLI-dev_cosine_f1": 0.567287784679089,
308
+ "eval_allNLI-dev_cosine_f1_threshold": 0.7080726623535156,
309
+ "eval_allNLI-dev_cosine_precision": 0.44193548387096776,
310
+ "eval_allNLI-dev_cosine_recall": 0.791907514450867,
311
+ "eval_allNLI-dev_dot_accuracy": 0.69921875,
312
+ "eval_allNLI-dev_dot_accuracy_threshold": 885.8963623046875,
313
+ "eval_allNLI-dev_dot_ap": 0.5371398846089106,
314
+ "eval_allNLI-dev_dot_f1": 0.5720338983050848,
315
+ "eval_allNLI-dev_dot_f1_threshold": 732.1597290039062,
316
+ "eval_allNLI-dev_dot_precision": 0.451505016722408,
317
+ "eval_allNLI-dev_dot_recall": 0.7803468208092486,
318
+ "eval_allNLI-dev_euclidean_accuracy": 0.701171875,
319
+ "eval_allNLI-dev_euclidean_accuracy_threshold": 16.9801082611084,
320
+ "eval_allNLI-dev_euclidean_ap": 0.5503780840587245,
321
+ "eval_allNLI-dev_euclidean_f1": 0.5671641791044777,
322
+ "eval_allNLI-dev_euclidean_f1_threshold": 24.19074821472168,
323
+ "eval_allNLI-dev_euclidean_precision": 0.44932432432432434,
324
+ "eval_allNLI-dev_euclidean_recall": 0.7687861271676301,
325
+ "eval_allNLI-dev_manhattan_accuracy": 0.703125,
326
+ "eval_allNLI-dev_manhattan_accuracy_threshold": 529.9462280273438,
327
+ "eval_allNLI-dev_manhattan_ap": 0.5524969745859143,
328
+ "eval_allNLI-dev_manhattan_f1": 0.5638297872340425,
329
+ "eval_allNLI-dev_manhattan_f1_threshold": 826.8560791015625,
330
+ "eval_allNLI-dev_manhattan_precision": 0.40664961636828645,
331
+ "eval_allNLI-dev_manhattan_recall": 0.9190751445086706,
332
+ "eval_allNLI-dev_max_accuracy": 0.703125,
333
+ "eval_allNLI-dev_max_accuracy_threshold": 885.8963623046875,
334
+ "eval_allNLI-dev_max_ap": 0.5524969745859143,
335
+ "eval_allNLI-dev_max_f1": 0.5720338983050848,
336
+ "eval_allNLI-dev_max_f1_threshold": 826.8560791015625,
337
+ "eval_allNLI-dev_max_precision": 0.451505016722408,
338
+ "eval_allNLI-dev_max_recall": 0.9190751445086706,
339
+ "eval_sequential_score": 0.694158709095853,
340
+ "eval_sts-test_pearson_cosine": 0.8866994033223972,
341
+ "eval_sts-test_pearson_dot": 0.8712266973511624,
342
+ "eval_sts-test_pearson_euclidean": 0.9028053322103908,
343
+ "eval_sts-test_pearson_manhattan": 0.9029714248344419,
344
+ "eval_sts-test_pearson_max": 0.9029714248344419,
345
+ "eval_sts-test_spearman_cosine": 0.8941879764786184,
346
+ "eval_sts-test_spearman_dot": 0.8632849034222648,
347
+ "eval_sts-test_spearman_euclidean": 0.8944520984233506,
348
+ "eval_sts-test_spearman_manhattan": 0.8945218656398598,
349
+ "eval_sts-test_spearman_max": 0.8945218656398598,
350
+ "eval_vitaminc-pairs_loss": 3.507073163986206,
351
+ "eval_vitaminc-pairs_runtime": 4.4774,
352
+ "eval_vitaminc-pairs_samples_per_second": 28.588,
353
+ "eval_vitaminc-pairs_steps_per_second": 0.223,
354
+ "step": 194
355
+ },
356
+ {
357
+ "epoch": 0.10046607975142413,
358
+ "eval_negation-triplets_loss": 1.1223009824752808,
359
+ "eval_negation-triplets_runtime": 3.102,
360
+ "eval_negation-triplets_samples_per_second": 41.264,
361
+ "eval_negation-triplets_steps_per_second": 0.322,
362
+ "step": 194
363
+ },
364
+ {
365
+ "epoch": 0.10046607975142413,
366
+ "eval_scitail-pairs-pos_loss": 0.06560208648443222,
367
+ "eval_scitail-pairs-pos_runtime": 2.6151,
368
+ "eval_scitail-pairs-pos_samples_per_second": 48.946,
369
+ "eval_scitail-pairs-pos_steps_per_second": 0.382,
370
+ "step": 194
371
+ },
372
+ {
373
+ "epoch": 0.10046607975142413,
374
+ "eval_scitail-pairs-qa_loss": 0.044671397656202316,
375
+ "eval_scitail-pairs-qa_runtime": 2.2115,
376
+ "eval_scitail-pairs-qa_samples_per_second": 57.879,
377
+ "eval_scitail-pairs-qa_steps_per_second": 0.452,
378
+ "step": 194
379
+ },
380
+ {
381
+ "epoch": 0.10046607975142413,
382
+ "eval_xsum-pairs_loss": 0.07691845297813416,
383
+ "eval_xsum-pairs_runtime": 3.043,
384
+ "eval_xsum-pairs_samples_per_second": 42.064,
385
+ "eval_xsum-pairs_steps_per_second": 0.329,
386
+ "step": 194
387
+ },
388
+ {
389
+ "epoch": 0.10046607975142413,
390
+ "eval_sciq_pairs_loss": 0.12039273232221603,
391
+ "eval_sciq_pairs_runtime": 3.878,
392
+ "eval_sciq_pairs_samples_per_second": 33.007,
393
+ "eval_sciq_pairs_steps_per_second": 0.258,
394
+ "step": 194
395
+ },
396
+ {
397
+ "epoch": 0.10046607975142413,
398
+ "eval_qasc_pairs_loss": 0.36198654770851135,
399
+ "eval_qasc_pairs_runtime": 2.0543,
400
+ "eval_qasc_pairs_samples_per_second": 62.307,
401
+ "eval_qasc_pairs_steps_per_second": 0.487,
402
+ "step": 194
403
+ },
404
+ {
405
+ "epoch": 0.10046607975142413,
406
+ "eval_openbookqa_pairs_loss": 0.5711529850959778,
407
+ "eval_openbookqa_pairs_runtime": 2.2213,
408
+ "eval_openbookqa_pairs_samples_per_second": 57.624,
409
+ "eval_openbookqa_pairs_steps_per_second": 0.45,
410
+ "step": 194
411
+ },
412
+ {
413
+ "epoch": 0.10046607975142413,
414
+ "eval_msmarco_pairs_loss": 0.3250836133956909,
415
+ "eval_msmarco_pairs_runtime": 2.22,
416
+ "eval_msmarco_pairs_samples_per_second": 57.657,
417
+ "eval_msmarco_pairs_steps_per_second": 0.45,
418
+ "step": 194
419
+ },
420
+ {
421
+ "epoch": 0.10046607975142413,
422
+ "eval_nq_pairs_loss": 0.4249531030654907,
423
+ "eval_nq_pairs_runtime": 2.7189,
424
+ "eval_nq_pairs_samples_per_second": 47.079,
425
+ "eval_nq_pairs_steps_per_second": 0.368,
426
+ "step": 194
427
+ },
428
+ {
429
+ "epoch": 0.10046607975142413,
430
+ "eval_trivia_pairs_loss": 0.2965388894081116,
431
+ "eval_trivia_pairs_runtime": 3.7556,
432
+ "eval_trivia_pairs_samples_per_second": 34.082,
433
+ "eval_trivia_pairs_steps_per_second": 0.266,
434
+ "step": 194
435
+ },
436
+ {
437
+ "epoch": 0.10046607975142413,
438
+ "eval_gooaq_pairs_loss": 0.2151084989309311,
439
+ "eval_gooaq_pairs_runtime": 2.1122,
440
+ "eval_gooaq_pairs_samples_per_second": 60.601,
441
+ "eval_gooaq_pairs_steps_per_second": 0.473,
442
+ "step": 194
443
+ },
444
+ {
445
+ "epoch": 0.10046607975142413,
446
+ "eval_paws-pos_loss": 0.0295370165258646,
447
+ "eval_paws-pos_runtime": 2.2123,
448
+ "eval_paws-pos_samples_per_second": 57.86,
449
+ "eval_paws-pos_steps_per_second": 0.452,
450
+ "step": 194
451
+ },
452
+ {
453
+ "epoch": 0.10046607975142413,
454
+ "eval_global_dataset_loss": 0.35498398542404175,
455
+ "eval_global_dataset_runtime": 10.1407,
456
+ "eval_global_dataset_samples_per_second": 41.023,
457
+ "eval_global_dataset_steps_per_second": 0.394,
458
+ "step": 194
459
+ },
460
+ {
461
+ "epoch": 0.10357327809425168,
462
+ "grad_norm": 43.11693572998047,
463
+ "learning_rate": 4.470588235294118e-06,
464
+ "loss": 0.376,
465
+ "step": 200
466
+ },
467
+ {
468
+ "epoch": 0.11393060590367685,
469
+ "grad_norm": 31.125375747680664,
470
+ "learning_rate": 4.941176470588235e-06,
471
+ "loss": 0.2782,
472
+ "step": 220
473
+ },
474
+ {
475
+ "epoch": 0.12428793371310203,
476
+ "grad_norm": 14.048110961914062,
477
+ "learning_rate": 5.411764705882353e-06,
478
+ "loss": 0.2391,
479
+ "step": 240
480
+ },
481
+ {
482
+ "epoch": 0.13464526152252718,
483
+ "grad_norm": 5.956579685211182,
484
+ "learning_rate": 5.882352941176471e-06,
485
+ "loss": 0.2767,
486
+ "step": 260
487
+ },
488
+ {
489
+ "epoch": 0.14500258933195237,
490
+ "grad_norm": 14.470146179199219,
491
+ "learning_rate": 6.352941176470589e-06,
492
+ "loss": 0.2359,
493
+ "step": 280
494
+ },
495
+ {
496
+ "epoch": 0.1506991196271362,
497
+ "eval_Qnli-dev_cosine_accuracy": 0.6875,
498
+ "eval_Qnli-dev_cosine_accuracy_threshold": 0.7567152976989746,
499
+ "eval_Qnli-dev_cosine_ap": 0.7133123361631746,
500
+ "eval_Qnli-dev_cosine_f1": 0.6853146853146853,
501
+ "eval_Qnli-dev_cosine_f1_threshold": 0.6536699533462524,
502
+ "eval_Qnli-dev_cosine_precision": 0.5833333333333334,
503
+ "eval_Qnli-dev_cosine_recall": 0.8305084745762712,
504
+ "eval_Qnli-dev_dot_accuracy": 0.673828125,
505
+ "eval_Qnli-dev_dot_accuracy_threshold": 731.5150756835938,
506
+ "eval_Qnli-dev_dot_ap": 0.6890325242500185,
507
+ "eval_Qnli-dev_dot_f1": 0.6782006920415226,
508
+ "eval_Qnli-dev_dot_f1_threshold": 621.156982421875,
509
+ "eval_Qnli-dev_dot_precision": 0.5730994152046783,
510
+ "eval_Qnli-dev_dot_recall": 0.8305084745762712,
511
+ "eval_Qnli-dev_euclidean_accuracy": 0.6875,
512
+ "eval_Qnli-dev_euclidean_accuracy_threshold": 21.166996002197266,
513
+ "eval_Qnli-dev_euclidean_ap": 0.717782618584373,
514
+ "eval_Qnli-dev_euclidean_f1": 0.6832740213523131,
515
+ "eval_Qnli-dev_euclidean_f1_threshold": 25.534191131591797,
516
+ "eval_Qnli-dev_euclidean_precision": 0.588957055214724,
517
+ "eval_Qnli-dev_euclidean_recall": 0.8135593220338984,
518
+ "eval_Qnli-dev_manhattan_accuracy": 0.689453125,
519
+ "eval_Qnli-dev_manhattan_accuracy_threshold": 717.0855712890625,
520
+ "eval_Qnli-dev_manhattan_ap": 0.7178394918687495,
521
+ "eval_Qnli-dev_manhattan_f1": 0.6815068493150686,
522
+ "eval_Qnli-dev_manhattan_f1_threshold": 809.9966430664062,
523
+ "eval_Qnli-dev_manhattan_precision": 0.5718390804597702,
524
+ "eval_Qnli-dev_manhattan_recall": 0.8432203389830508,
525
+ "eval_Qnli-dev_max_accuracy": 0.689453125,
526
+ "eval_Qnli-dev_max_accuracy_threshold": 731.5150756835938,
527
+ "eval_Qnli-dev_max_ap": 0.7178394918687495,
528
+ "eval_Qnli-dev_max_f1": 0.6853146853146853,
529
+ "eval_Qnli-dev_max_f1_threshold": 809.9966430664062,
530
+ "eval_Qnli-dev_max_precision": 0.588957055214724,
531
+ "eval_Qnli-dev_max_recall": 0.8432203389830508,
532
+ "eval_allNLI-dev_cosine_accuracy": 0.71484375,
533
+ "eval_allNLI-dev_cosine_accuracy_threshold": 0.8485724329948425,
534
+ "eval_allNLI-dev_cosine_ap": 0.5777522094864251,
535
+ "eval_allNLI-dev_cosine_f1": 0.5925925925925926,
536
+ "eval_allNLI-dev_cosine_f1_threshold": 0.7124052047729492,
537
+ "eval_allNLI-dev_cosine_precision": 0.4942084942084942,
538
+ "eval_allNLI-dev_cosine_recall": 0.7398843930635838,
539
+ "eval_allNLI-dev_dot_accuracy": 0.71484375,
540
+ "eval_allNLI-dev_dot_accuracy_threshold": 835.6192016601562,
541
+ "eval_allNLI-dev_dot_ap": 0.5708546535940942,
542
+ "eval_allNLI-dev_dot_f1": 0.5931372549019609,
543
+ "eval_allNLI-dev_dot_f1_threshold": 712.94482421875,
544
+ "eval_allNLI-dev_dot_precision": 0.5148936170212766,
545
+ "eval_allNLI-dev_dot_recall": 0.6994219653179191,
546
+ "eval_allNLI-dev_euclidean_accuracy": 0.712890625,
547
+ "eval_allNLI-dev_euclidean_accuracy_threshold": 15.772256851196289,
548
+ "eval_allNLI-dev_euclidean_ap": 0.5773033114664347,
549
+ "eval_allNLI-dev_euclidean_f1": 0.5957446808510639,
550
+ "eval_allNLI-dev_euclidean_f1_threshold": 24.513042449951172,
551
+ "eval_allNLI-dev_euclidean_precision": 0.4713804713804714,
552
+ "eval_allNLI-dev_euclidean_recall": 0.8092485549132948,
553
+ "eval_allNLI-dev_manhattan_accuracy": 0.71484375,
554
+ "eval_allNLI-dev_manhattan_accuracy_threshold": 494.4720153808594,
555
+ "eval_allNLI-dev_manhattan_ap": 0.5787277750430182,
556
+ "eval_allNLI-dev_manhattan_f1": 0.597457627118644,
557
+ "eval_allNLI-dev_manhattan_f1_threshold": 764.1075439453125,
558
+ "eval_allNLI-dev_manhattan_precision": 0.47157190635451507,
559
+ "eval_allNLI-dev_manhattan_recall": 0.815028901734104,
560
+ "eval_allNLI-dev_max_accuracy": 0.71484375,
561
+ "eval_allNLI-dev_max_accuracy_threshold": 835.6192016601562,
562
+ "eval_allNLI-dev_max_ap": 0.5787277750430182,
563
+ "eval_allNLI-dev_max_f1": 0.597457627118644,
564
+ "eval_allNLI-dev_max_f1_threshold": 764.1075439453125,
565
+ "eval_allNLI-dev_max_precision": 0.5148936170212766,
566
+ "eval_allNLI-dev_max_recall": 0.815028901734104,
567
+ "eval_sequential_score": 0.7178394918687495,
568
+ "eval_sts-test_pearson_cosine": 0.9080888281681364,
569
+ "eval_sts-test_pearson_dot": 0.8993720999648187,
570
+ "eval_sts-test_pearson_euclidean": 0.9185021221297063,
571
+ "eval_sts-test_pearson_manhattan": 0.9182084064307341,
572
+ "eval_sts-test_pearson_max": 0.9185021221297063,
573
+ "eval_sts-test_spearman_cosine": 0.9145502926755805,
574
+ "eval_sts-test_spearman_dot": 0.8990795555767088,
575
+ "eval_sts-test_spearman_euclidean": 0.9143005806370166,
576
+ "eval_sts-test_spearman_manhattan": 0.9141107457861942,
577
+ "eval_sts-test_spearman_max": 0.9145502926755805,
578
+ "eval_vitaminc-pairs_loss": 3.4645299911499023,
579
+ "eval_vitaminc-pairs_runtime": 4.4497,
580
+ "eval_vitaminc-pairs_samples_per_second": 28.766,
581
+ "eval_vitaminc-pairs_steps_per_second": 0.225,
582
+ "step": 291
583
+ },
584
+ {
585
+ "epoch": 0.1506991196271362,
586
+ "eval_negation-triplets_loss": 0.8774887323379517,
587
+ "eval_negation-triplets_runtime": 3.1401,
588
+ "eval_negation-triplets_samples_per_second": 40.764,
589
+ "eval_negation-triplets_steps_per_second": 0.318,
590
+ "step": 291
591
+ },
592
+ {
593
+ "epoch": 0.1506991196271362,
594
+ "eval_scitail-pairs-pos_loss": 0.029673559591174126,
595
+ "eval_scitail-pairs-pos_runtime": 2.6642,
596
+ "eval_scitail-pairs-pos_samples_per_second": 48.044,
597
+ "eval_scitail-pairs-pos_steps_per_second": 0.375,
598
+ "step": 291
599
+ },
600
+ {
601
+ "epoch": 0.1506991196271362,
602
+ "eval_scitail-pairs-qa_loss": 0.011800204403698444,
603
+ "eval_scitail-pairs-qa_runtime": 2.1861,
604
+ "eval_scitail-pairs-qa_samples_per_second": 58.551,
605
+ "eval_scitail-pairs-qa_steps_per_second": 0.457,
606
+ "step": 291
607
+ },
608
+ {
609
+ "epoch": 0.1506991196271362,
610
+ "eval_xsum-pairs_loss": 0.017930012196302414,
611
+ "eval_xsum-pairs_runtime": 3.0255,
612
+ "eval_xsum-pairs_samples_per_second": 42.307,
613
+ "eval_xsum-pairs_steps_per_second": 0.331,
614
+ "step": 291
615
+ },
616
+ {
617
+ "epoch": 0.1506991196271362,
618
+ "eval_sciq_pairs_loss": 0.09765021502971649,
619
+ "eval_sciq_pairs_runtime": 3.8726,
620
+ "eval_sciq_pairs_samples_per_second": 33.053,
621
+ "eval_sciq_pairs_steps_per_second": 0.258,
622
+ "step": 291
623
+ },
624
+ {
625
+ "epoch": 0.1506991196271362,
626
+ "eval_qasc_pairs_loss": 0.3064229488372803,
627
+ "eval_qasc_pairs_runtime": 2.1307,
628
+ "eval_qasc_pairs_samples_per_second": 60.075,
629
+ "eval_qasc_pairs_steps_per_second": 0.469,
630
+ "step": 291
631
+ },
632
+ {
633
+ "epoch": 0.1506991196271362,
634
+ "eval_openbookqa_pairs_loss": 0.46111759543418884,
635
+ "eval_openbookqa_pairs_runtime": 2.2685,
636
+ "eval_openbookqa_pairs_samples_per_second": 56.424,
637
+ "eval_openbookqa_pairs_steps_per_second": 0.441,
638
+ "step": 291
639
+ },
640
+ {
641
+ "epoch": 0.1506991196271362,
642
+ "eval_msmarco_pairs_loss": 0.08168309926986694,
643
+ "eval_msmarco_pairs_runtime": 2.2657,
644
+ "eval_msmarco_pairs_samples_per_second": 56.495,
645
+ "eval_msmarco_pairs_steps_per_second": 0.441,
646
+ "step": 291
647
+ },
648
+ {
649
+ "epoch": 0.1506991196271362,
650
+ "eval_nq_pairs_loss": 0.13220462203025818,
651
+ "eval_nq_pairs_runtime": 2.7139,
652
+ "eval_nq_pairs_samples_per_second": 47.164,
653
+ "eval_nq_pairs_steps_per_second": 0.368,
654
+ "step": 291
655
+ },
656
+ {
657
+ "epoch": 0.1506991196271362,
658
+ "eval_trivia_pairs_loss": 0.1532345414161682,
659
+ "eval_trivia_pairs_runtime": 3.76,
660
+ "eval_trivia_pairs_samples_per_second": 34.043,
661
+ "eval_trivia_pairs_steps_per_second": 0.266,
662
+ "step": 291
663
+ },
664
+ {
665
+ "epoch": 0.1506991196271362,
666
+ "eval_gooaq_pairs_loss": 0.10126010328531265,
667
+ "eval_gooaq_pairs_runtime": 2.1372,
668
+ "eval_gooaq_pairs_samples_per_second": 59.892,
669
+ "eval_gooaq_pairs_steps_per_second": 0.468,
670
+ "step": 291
671
+ },
672
+ {
673
+ "epoch": 0.1506991196271362,
674
+ "eval_paws-pos_loss": 0.021147189661860466,
675
+ "eval_paws-pos_runtime": 2.2138,
676
+ "eval_paws-pos_samples_per_second": 57.819,
677
+ "eval_paws-pos_steps_per_second": 0.452,
678
+ "step": 291
679
+ },
680
+ {
681
+ "epoch": 0.1506991196271362,
682
+ "eval_global_dataset_loss": 0.2509276270866394,
683
+ "eval_global_dataset_runtime": 10.154,
684
+ "eval_global_dataset_samples_per_second": 40.969,
685
+ "eval_global_dataset_steps_per_second": 0.394,
686
+ "step": 291
687
+ },
688
+ {
689
+ "epoch": 0.15535991714137753,
690
+ "grad_norm": 6.319842338562012,
691
+ "learning_rate": 6.823529411764706e-06,
692
+ "loss": 0.1505,
693
+ "step": 300
694
+ },
695
+ {
696
+ "epoch": 0.1657172449508027,
697
+ "grad_norm": 16.11246109008789,
698
+ "learning_rate": 7.294117647058823e-06,
699
+ "loss": 0.1473,
700
+ "step": 320
701
+ },
702
+ {
703
+ "epoch": 0.17607457276022787,
704
+ "grad_norm": 9.587472915649414,
705
+ "learning_rate": 7.764705882352943e-06,
706
+ "loss": 0.1614,
707
+ "step": 340
708
+ },
709
+ {
710
+ "epoch": 0.18643190056965303,
711
+ "grad_norm": 21.551036834716797,
712
+ "learning_rate": 8.23529411764706e-06,
713
+ "loss": 0.1834,
714
+ "step": 360
715
+ },
716
+ {
717
+ "epoch": 0.1967892283790782,
718
+ "grad_norm": 7.1668548583984375,
719
+ "learning_rate": 8.705882352941177e-06,
720
+ "loss": 0.164,
721
+ "step": 380
722
+ }
723
+ ],
724
+ "logging_steps": 20,
725
+ "max_steps": 3862,
726
+ "num_input_tokens_seen": 0,
727
+ "num_train_epochs": 2,
728
+ "save_steps": 387,
729
+ "stateful_callbacks": {
730
+ "TrainerControl": {
731
+ "args": {
732
+ "should_epoch_stop": false,
733
+ "should_evaluate": false,
734
+ "should_log": false,
735
+ "should_save": true,
736
+ "should_training_stop": false
737
+ },
738
+ "attributes": {}
739
+ }
740
+ },
741
+ "total_flos": 0.0,
742
+ "train_batch_size": 64,
743
+ "trial_name": null,
744
+ "trial_params": null
745
+ }
checkpoint-387/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6afbd3ad35d0a5a9ae1e51fcec7df790b982c51eacf41bfffb4163061732d175
3
+ size 5560