allstax committed on
Commit
05f1864
·
verified ·
1 Parent(s): a68b472

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/bart-large-cnn",
3
+ "_num_labels": 3,
4
+ "activation_dropout": 0.0,
5
+ "activation_function": "gelu",
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "BartForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.0,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.0,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 1024,
15
+ "decoder_attention_heads": 16,
16
+ "decoder_ffn_dim": 4096,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 12,
19
+ "decoder_start_token_id": 2,
20
+ "dropout": 0.1,
21
+ "early_stopping": true,
22
+ "encoder_attention_heads": 16,
23
+ "encoder_ffn_dim": 4096,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 12,
26
+ "eos_token_id": 2,
27
+ "force_bos_token_to_be_generated": true,
28
+ "forced_bos_token_id": 0,
29
+ "forced_eos_token_id": 2,
30
+ "gradient_checkpointing": false,
31
+ "id2label": {
32
+ "0": "LABEL_0",
33
+ "1": "LABEL_1",
34
+ "2": "LABEL_2"
35
+ },
36
+ "init_std": 0.02,
37
+ "is_encoder_decoder": true,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1,
41
+ "LABEL_2": 2
42
+ },
43
+ "length_penalty": 2.0,
44
+ "max_length": 142,
45
+ "max_position_embeddings": 1024,
46
+ "min_length": 56,
47
+ "model_type": "bart",
48
+ "no_repeat_ngram_size": 3,
49
+ "normalize_before": false,
50
+ "num_beams": 4,
51
+ "num_hidden_layers": 12,
52
+ "output_past": true,
53
+ "pad_token_id": 1,
54
+ "prefix": " ",
55
+ "scale_embedding": false,
56
+ "task_specific_params": {
57
+ "summarization": {
58
+ "early_stopping": true,
59
+ "length_penalty": 2.0,
60
+ "max_length": 142,
61
+ "min_length": 56,
62
+ "no_repeat_ngram_size": 3,
63
+ "num_beams": 4
64
+ }
65
+ },
66
+ "torch_dtype": "float32",
67
+ "transformers_version": "4.39.0",
68
+ "use_cache": true,
69
+ "vocab_size": 50264
70
+ }
generation_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 0,
3
+ "decoder_start_token_id": 2,
4
+ "early_stopping": true,
5
+ "eos_token_id": 2,
6
+ "forced_bos_token_id": 0,
7
+ "forced_eos_token_id": 2,
8
+ "length_penalty": 2.0,
9
+ "max_length": 142,
10
+ "min_length": 56,
11
+ "no_repeat_ngram_size": 3,
12
+ "num_beams": 4,
13
+ "pad_token_id": 1,
14
+ "transformers_version": "4.39.0"
15
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1a046115e29d9fc4fc6f17d15acf273321331ea0487dfd2957837691e6cf87d
3
+ size 1625422896
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e93e6668d0d71e4a8bacc823bd506fd598fec95d54d98bec19533846e92ce874
3
+ size 3250751759
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3598de065a7686e00443ba710bed5e60aa5e2573d27b288cce4c65b4b90af763
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d1882094b3a5d792444fca74f847dc6fea714ecc5e8e0806baac71db6ac44e5
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": true,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": true,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "mask_token": "<mask>",
51
+ "model_max_length": 1024,
52
+ "pad_token": "<pad>",
53
+ "sep_token": "</s>",
54
+ "tokenizer_class": "BartTokenizer",
55
+ "trim_offsets": true,
56
+ "unk_token": "<unk>"
57
+ }
trainer_state.json ADDED
@@ -0,0 +1,821 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.6413997492305938,
5
+ "eval_steps": 720,
6
+ "global_step": 28800,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04,
13
+ "grad_norm": 0.535019040107727,
14
+ "learning_rate": 9.864353971540931e-05,
15
+ "loss": 0.3127,
16
+ "step": 720
17
+ },
18
+ {
19
+ "epoch": 0.04,
20
+ "eval_bertscore": 0.7034942507743835,
21
+ "eval_loss": 0.12081495672464371,
22
+ "eval_rouge1": 0.5240594026277678,
23
+ "eval_rouge2": 0.3067757525245376,
24
+ "eval_rougeL": 0.39633186458379827,
25
+ "eval_rougeLsum": 0.39613387865495875,
26
+ "eval_runtime": 80.0774,
27
+ "eval_samples_per_second": 0.674,
28
+ "eval_steps_per_second": 0.337,
29
+ "step": 720
30
+ },
31
+ {
32
+ "epoch": 0.08,
33
+ "grad_norm": 0.42372021079063416,
34
+ "learning_rate": 9.72756806048977e-05,
35
+ "loss": 0.1212,
36
+ "step": 1440
37
+ },
38
+ {
39
+ "epoch": 0.08,
40
+ "eval_bertscore": 0.7000990509986877,
41
+ "eval_loss": 0.11524277925491333,
42
+ "eval_rouge1": 0.5216793756151761,
43
+ "eval_rouge2": 0.2934274790368596,
44
+ "eval_rougeL": 0.3886043968581182,
45
+ "eval_rougeLsum": 0.38807827948983176,
46
+ "eval_runtime": 76.9989,
47
+ "eval_samples_per_second": 0.701,
48
+ "eval_steps_per_second": 0.351,
49
+ "step": 1440
50
+ },
51
+ {
52
+ "epoch": 0.12,
53
+ "grad_norm": 0.42828959226608276,
54
+ "learning_rate": 9.590782149438608e-05,
55
+ "loss": 0.1222,
56
+ "step": 2160
57
+ },
58
+ {
59
+ "epoch": 0.12,
60
+ "eval_bertscore": 0.6457424163818359,
61
+ "eval_loss": 0.1191863939166069,
62
+ "eval_rouge1": 0.4637245182501467,
63
+ "eval_rouge2": 0.25286895940302717,
64
+ "eval_rougeL": 0.34111914002345234,
65
+ "eval_rougeLsum": 0.3407154814401842,
66
+ "eval_runtime": 103.8897,
67
+ "eval_samples_per_second": 0.52,
68
+ "eval_steps_per_second": 0.26,
69
+ "step": 2160
70
+ },
71
+ {
72
+ "epoch": 0.16,
73
+ "grad_norm": 0.44599342346191406,
74
+ "learning_rate": 9.453996238387447e-05,
75
+ "loss": 0.1172,
76
+ "step": 2880
77
+ },
78
+ {
79
+ "epoch": 0.16,
80
+ "eval_bertscore": 0.6841260194778442,
81
+ "eval_loss": 0.11394956707954407,
82
+ "eval_rouge1": 0.48661400042430136,
83
+ "eval_rouge2": 0.27298564925044316,
84
+ "eval_rougeL": 0.371859286615548,
85
+ "eval_rougeLsum": 0.37045985133751724,
86
+ "eval_runtime": 67.7701,
87
+ "eval_samples_per_second": 0.797,
88
+ "eval_steps_per_second": 0.398,
89
+ "step": 2880
90
+ },
91
+ {
92
+ "epoch": 0.21,
93
+ "grad_norm": 0.4306412935256958,
94
+ "learning_rate": 9.317210327336284e-05,
95
+ "loss": 0.1265,
96
+ "step": 3600
97
+ },
98
+ {
99
+ "epoch": 0.21,
100
+ "eval_bertscore": 0.5902894139289856,
101
+ "eval_loss": 0.1217198297381401,
102
+ "eval_rouge1": 0.3684193429088964,
103
+ "eval_rouge2": 0.19503224358227592,
104
+ "eval_rougeL": 0.29399973276202795,
105
+ "eval_rougeLsum": 0.29348115386759427,
106
+ "eval_runtime": 58.6683,
107
+ "eval_samples_per_second": 0.92,
108
+ "eval_steps_per_second": 0.46,
109
+ "step": 3600
110
+ },
111
+ {
112
+ "epoch": 0.25,
113
+ "grad_norm": 0.4569800794124603,
114
+ "learning_rate": 9.180424416285122e-05,
115
+ "loss": 0.1201,
116
+ "step": 4320
117
+ },
118
+ {
119
+ "epoch": 0.25,
120
+ "eval_bertscore": 0.6664375066757202,
121
+ "eval_loss": 0.11453992873430252,
122
+ "eval_rouge1": 0.4849966834372761,
123
+ "eval_rouge2": 0.27797369581531306,
124
+ "eval_rougeL": 0.3629296708263681,
125
+ "eval_rougeLsum": 0.3623800251579623,
126
+ "eval_runtime": 81.2927,
127
+ "eval_samples_per_second": 0.664,
128
+ "eval_steps_per_second": 0.332,
129
+ "step": 4320
130
+ },
131
+ {
132
+ "epoch": 0.29,
133
+ "grad_norm": 0.4236726462841034,
134
+ "learning_rate": 9.043638505233961e-05,
135
+ "loss": 0.1171,
136
+ "step": 5040
137
+ },
138
+ {
139
+ "epoch": 0.29,
140
+ "eval_bertscore": 0.6511832475662231,
141
+ "eval_loss": 0.1140449047088623,
142
+ "eval_rouge1": 0.4622762558490229,
143
+ "eval_rouge2": 0.2720739144786822,
144
+ "eval_rougeL": 0.35563312683133363,
145
+ "eval_rougeLsum": 0.3553109181928958,
146
+ "eval_runtime": 96.9669,
147
+ "eval_samples_per_second": 0.557,
148
+ "eval_steps_per_second": 0.278,
149
+ "step": 5040
150
+ },
151
+ {
152
+ "epoch": 0.33,
153
+ "grad_norm": 0.4153118431568146,
154
+ "learning_rate": 8.9068525941828e-05,
155
+ "loss": 0.1182,
156
+ "step": 5760
157
+ },
158
+ {
159
+ "epoch": 0.33,
160
+ "eval_bertscore": 0.7056278586387634,
161
+ "eval_loss": 0.11447593569755554,
162
+ "eval_rouge1": 0.535980541670685,
163
+ "eval_rouge2": 0.3085487182685619,
164
+ "eval_rougeL": 0.3952747177668595,
165
+ "eval_rougeLsum": 0.39487374036594847,
166
+ "eval_runtime": 78.3273,
167
+ "eval_samples_per_second": 0.689,
168
+ "eval_steps_per_second": 0.345,
169
+ "step": 5760
170
+ },
171
+ {
172
+ "epoch": 0.37,
173
+ "grad_norm": 0.30859124660491943,
174
+ "learning_rate": 8.770066683131638e-05,
175
+ "loss": 0.1161,
176
+ "step": 6480
177
+ },
178
+ {
179
+ "epoch": 0.37,
180
+ "eval_bertscore": 0.717333197593689,
181
+ "eval_loss": 0.11316747963428497,
182
+ "eval_rouge1": 0.5395779598124536,
183
+ "eval_rouge2": 0.3235379995103774,
184
+ "eval_rougeL": 0.40115447322823283,
185
+ "eval_rougeLsum": 0.40261399344054405,
186
+ "eval_runtime": 77.4572,
187
+ "eval_samples_per_second": 0.697,
188
+ "eval_steps_per_second": 0.349,
189
+ "step": 6480
190
+ },
191
+ {
192
+ "epoch": 0.41,
193
+ "grad_norm": 0.3857922852039337,
194
+ "learning_rate": 8.633280772080476e-05,
195
+ "loss": 0.1151,
196
+ "step": 7200
197
+ },
198
+ {
199
+ "epoch": 0.41,
200
+ "eval_bertscore": 0.7019616365432739,
201
+ "eval_loss": 0.11000501364469528,
202
+ "eval_rouge1": 0.5236493302484355,
203
+ "eval_rouge2": 0.3068191529551719,
204
+ "eval_rougeL": 0.4036664284755191,
205
+ "eval_rougeLsum": 0.4040689187486951,
206
+ "eval_runtime": 76.1117,
207
+ "eval_samples_per_second": 0.709,
208
+ "eval_steps_per_second": 0.355,
209
+ "step": 7200
210
+ },
211
+ {
212
+ "epoch": 0.45,
213
+ "grad_norm": 0.3432803452014923,
214
+ "learning_rate": 8.496494861029315e-05,
215
+ "loss": 0.1144,
216
+ "step": 7920
217
+ },
218
+ {
219
+ "epoch": 0.45,
220
+ "eval_bertscore": 0.7042035460472107,
221
+ "eval_loss": 0.11131834983825684,
222
+ "eval_rouge1": 0.5294383728708514,
223
+ "eval_rouge2": 0.3003098501119716,
224
+ "eval_rougeL": 0.3967273111533241,
225
+ "eval_rougeLsum": 0.39619485281011757,
226
+ "eval_runtime": 77.3811,
227
+ "eval_samples_per_second": 0.698,
228
+ "eval_steps_per_second": 0.349,
229
+ "step": 7920
230
+ },
231
+ {
232
+ "epoch": 0.49,
233
+ "grad_norm": 0.3402859568595886,
234
+ "learning_rate": 8.359708949978152e-05,
235
+ "loss": 0.1126,
236
+ "step": 8640
237
+ },
238
+ {
239
+ "epoch": 0.49,
240
+ "eval_bertscore": 0.71971595287323,
241
+ "eval_loss": 0.11067274957895279,
242
+ "eval_rouge1": 0.551885774991056,
243
+ "eval_rouge2": 0.33499475588298316,
244
+ "eval_rougeL": 0.4160407628361842,
245
+ "eval_rougeLsum": 0.4164543392695917,
246
+ "eval_runtime": 77.4902,
247
+ "eval_samples_per_second": 0.697,
248
+ "eval_steps_per_second": 0.348,
249
+ "step": 8640
250
+ },
251
+ {
252
+ "epoch": 0.53,
253
+ "grad_norm": 0.4391550123691559,
254
+ "learning_rate": 8.223113019359007e-05,
255
+ "loss": 0.1116,
256
+ "step": 9360
257
+ },
258
+ {
259
+ "epoch": 0.53,
260
+ "eval_bertscore": 0.7158631086349487,
261
+ "eval_loss": 0.1099533885717392,
262
+ "eval_rouge1": 0.5557272797557176,
263
+ "eval_rouge2": 0.332779980249166,
264
+ "eval_rougeL": 0.4155444723963883,
265
+ "eval_rougeLsum": 0.41657130732656783,
266
+ "eval_runtime": 80.0828,
267
+ "eval_samples_per_second": 0.674,
268
+ "eval_steps_per_second": 0.337,
269
+ "step": 9360
270
+ },
271
+ {
272
+ "epoch": 0.57,
273
+ "grad_norm": 0.3907322287559509,
274
+ "learning_rate": 8.086327108307846e-05,
275
+ "loss": 0.1141,
276
+ "step": 10080
277
+ },
278
+ {
279
+ "epoch": 0.57,
280
+ "eval_bertscore": 0.7130799293518066,
281
+ "eval_loss": 0.11258435994386673,
282
+ "eval_rouge1": 0.5457292777447704,
283
+ "eval_rouge2": 0.3214033358835623,
284
+ "eval_rougeL": 0.40814606110656115,
285
+ "eval_rougeLsum": 0.4086806368595041,
286
+ "eval_runtime": 73.8407,
287
+ "eval_samples_per_second": 0.731,
288
+ "eval_steps_per_second": 0.366,
289
+ "step": 10080
290
+ },
291
+ {
292
+ "epoch": 0.62,
293
+ "grad_norm": 0.38848328590393066,
294
+ "learning_rate": 7.949541197256683e-05,
295
+ "loss": 0.1132,
296
+ "step": 10800
297
+ },
298
+ {
299
+ "epoch": 0.62,
300
+ "eval_bertscore": 0.7245057225227356,
301
+ "eval_loss": 0.11034353822469711,
302
+ "eval_rouge1": 0.5603983140826179,
303
+ "eval_rouge2": 0.3445987526625777,
304
+ "eval_rougeL": 0.43423536113182604,
305
+ "eval_rougeLsum": 0.43398163455334016,
306
+ "eval_runtime": 77.732,
307
+ "eval_samples_per_second": 0.695,
308
+ "eval_steps_per_second": 0.347,
309
+ "step": 10800
310
+ },
311
+ {
312
+ "epoch": 0.66,
313
+ "grad_norm": 0.37145286798477173,
314
+ "learning_rate": 7.812945266637537e-05,
315
+ "loss": 0.112,
316
+ "step": 11520
317
+ },
318
+ {
319
+ "epoch": 0.66,
320
+ "eval_bertscore": 0.7207842469215393,
321
+ "eval_loss": 0.11157318204641342,
322
+ "eval_rouge1": 0.5554178283606811,
323
+ "eval_rouge2": 0.3317069905744905,
324
+ "eval_rougeL": 0.4209451268922738,
325
+ "eval_rougeLsum": 0.42120272115590573,
326
+ "eval_runtime": 81.6277,
327
+ "eval_samples_per_second": 0.662,
328
+ "eval_steps_per_second": 0.331,
329
+ "step": 11520
330
+ },
331
+ {
332
+ "epoch": 0.7,
333
+ "grad_norm": 0.32327908277511597,
334
+ "learning_rate": 7.676349336018391e-05,
335
+ "loss": 0.1118,
336
+ "step": 12240
337
+ },
338
+ {
339
+ "epoch": 0.7,
340
+ "eval_bertscore": 0.7193225622177124,
341
+ "eval_loss": 0.11037024855613708,
342
+ "eval_rouge1": 0.5534709029702873,
343
+ "eval_rouge2": 0.33508595975393674,
344
+ "eval_rougeL": 0.4220660586810759,
345
+ "eval_rougeLsum": 0.42394444829473793,
346
+ "eval_runtime": 79.4778,
347
+ "eval_samples_per_second": 0.679,
348
+ "eval_steps_per_second": 0.34,
349
+ "step": 12240
350
+ },
351
+ {
352
+ "epoch": 0.74,
353
+ "grad_norm": 0.296165406703949,
354
+ "learning_rate": 7.539563424967229e-05,
355
+ "loss": 0.1096,
356
+ "step": 12960
357
+ },
358
+ {
359
+ "epoch": 0.74,
360
+ "eval_bertscore": 0.7183234691619873,
361
+ "eval_loss": 0.10668845474720001,
362
+ "eval_rouge1": 0.5527080110711662,
363
+ "eval_rouge2": 0.3304597058226536,
364
+ "eval_rougeL": 0.4176676998826935,
365
+ "eval_rougeLsum": 0.41906982236369805,
366
+ "eval_runtime": 74.609,
367
+ "eval_samples_per_second": 0.724,
368
+ "eval_steps_per_second": 0.362,
369
+ "step": 12960
370
+ },
371
+ {
372
+ "epoch": 0.78,
373
+ "grad_norm": 0.3004627525806427,
374
+ "learning_rate": 7.402777513916068e-05,
375
+ "loss": 0.1105,
376
+ "step": 13680
377
+ },
378
+ {
379
+ "epoch": 0.78,
380
+ "eval_bertscore": 0.7093863487243652,
381
+ "eval_loss": 0.1068594753742218,
382
+ "eval_rouge1": 0.5386027107080774,
383
+ "eval_rouge2": 0.3174670612173311,
384
+ "eval_rougeL": 0.4089464604886982,
385
+ "eval_rougeLsum": 0.40954043741634194,
386
+ "eval_runtime": 76.1174,
387
+ "eval_samples_per_second": 0.709,
388
+ "eval_steps_per_second": 0.355,
389
+ "step": 13680
390
+ },
391
+ {
392
+ "epoch": 0.82,
393
+ "grad_norm": 0.4122227132320404,
394
+ "learning_rate": 7.265991602864905e-05,
395
+ "loss": 0.1094,
396
+ "step": 14400
397
+ },
398
+ {
399
+ "epoch": 0.82,
400
+ "eval_bertscore": 0.7156451344490051,
401
+ "eval_loss": 0.10706545412540436,
402
+ "eval_rouge1": 0.5522097394348282,
403
+ "eval_rouge2": 0.3376815877629147,
404
+ "eval_rougeL": 0.41094798705443536,
405
+ "eval_rougeLsum": 0.41185755780524297,
406
+ "eval_runtime": 79.5068,
407
+ "eval_samples_per_second": 0.679,
408
+ "eval_steps_per_second": 0.34,
409
+ "step": 14400
410
+ },
411
+ {
412
+ "epoch": 0.86,
413
+ "grad_norm": 0.3019055128097534,
414
+ "learning_rate": 7.129205691813743e-05,
415
+ "loss": 0.1047,
416
+ "step": 15120
417
+ },
418
+ {
419
+ "epoch": 0.86,
420
+ "eval_bertscore": 0.723181962966919,
421
+ "eval_loss": 0.10513070970773697,
422
+ "eval_rouge1": 0.5597833953895566,
423
+ "eval_rouge2": 0.3368159976094224,
424
+ "eval_rougeL": 0.4251112326345452,
425
+ "eval_rougeLsum": 0.4271018761152323,
426
+ "eval_runtime": 72.9275,
427
+ "eval_samples_per_second": 0.74,
428
+ "eval_steps_per_second": 0.37,
429
+ "step": 15120
430
+ },
431
+ {
432
+ "epoch": 0.9,
433
+ "grad_norm": 0.39543616771698,
434
+ "learning_rate": 6.992609761194597e-05,
435
+ "loss": 0.106,
436
+ "step": 15840
437
+ },
438
+ {
439
+ "epoch": 0.9,
440
+ "eval_bertscore": 0.7264233231544495,
441
+ "eval_loss": 0.10471142083406448,
442
+ "eval_rouge1": 0.5607444186855683,
443
+ "eval_rouge2": 0.32933852525922336,
444
+ "eval_rougeL": 0.4164104876659622,
445
+ "eval_rougeLsum": 0.4178921783444509,
446
+ "eval_runtime": 79.4632,
447
+ "eval_samples_per_second": 0.68,
448
+ "eval_steps_per_second": 0.34,
449
+ "step": 15840
450
+ },
451
+ {
452
+ "epoch": 0.94,
453
+ "grad_norm": 0.17296220362186432,
454
+ "learning_rate": 6.855823850143436e-05,
455
+ "loss": 0.1085,
456
+ "step": 16560
457
+ },
458
+ {
459
+ "epoch": 0.94,
460
+ "eval_bertscore": 0.7186797261238098,
461
+ "eval_loss": 0.10288402438163757,
462
+ "eval_rouge1": 0.5499937534452628,
463
+ "eval_rouge2": 0.33202955320606253,
464
+ "eval_rougeL": 0.41109499153735635,
465
+ "eval_rougeLsum": 0.4129325173744952,
466
+ "eval_runtime": 74.2816,
467
+ "eval_samples_per_second": 0.727,
468
+ "eval_steps_per_second": 0.363,
469
+ "step": 16560
470
+ },
471
+ {
472
+ "epoch": 0.98,
473
+ "grad_norm": 0.34968239068984985,
474
+ "learning_rate": 6.719037939092274e-05,
475
+ "loss": 0.1064,
476
+ "step": 17280
477
+ },
478
+ {
479
+ "epoch": 0.98,
480
+ "eval_bertscore": 0.715437650680542,
481
+ "eval_loss": 0.10678575932979584,
482
+ "eval_rouge1": 0.5487884139639068,
483
+ "eval_rouge2": 0.3287484312214649,
484
+ "eval_rougeL": 0.4115546192599129,
485
+ "eval_rougeLsum": 0.4129129108454481,
486
+ "eval_runtime": 77.9977,
487
+ "eval_samples_per_second": 0.692,
488
+ "eval_steps_per_second": 0.346,
489
+ "step": 17280
490
+ },
491
+ {
492
+ "epoch": 1.03,
493
+ "grad_norm": 0.22770258784294128,
494
+ "learning_rate": 6.582252028041113e-05,
495
+ "loss": 0.094,
496
+ "step": 18000
497
+ },
498
+ {
499
+ "epoch": 1.03,
500
+ "eval_bertscore": 0.7268933653831482,
501
+ "eval_loss": 0.10910864919424057,
502
+ "eval_rouge1": 0.5644640432420631,
503
+ "eval_rouge2": 0.34856910757450765,
504
+ "eval_rougeL": 0.4334348850734425,
505
+ "eval_rougeLsum": 0.4322774316283801,
506
+ "eval_runtime": 70.7723,
507
+ "eval_samples_per_second": 0.763,
508
+ "eval_steps_per_second": 0.382,
509
+ "step": 18000
510
+ },
511
+ {
512
+ "epoch": 1.07,
513
+ "grad_norm": 0.2036217600107193,
514
+ "learning_rate": 6.44546611698995e-05,
515
+ "loss": 0.0864,
516
+ "step": 18720
517
+ },
518
+ {
519
+ "epoch": 1.07,
520
+ "eval_bertscore": 0.7298507690429688,
521
+ "eval_loss": 0.1051657572388649,
522
+ "eval_rouge1": 0.5693416283658175,
523
+ "eval_rouge2": 0.3547090481291705,
524
+ "eval_rougeL": 0.4367412765285528,
525
+ "eval_rougeLsum": 0.4370252833034207,
526
+ "eval_runtime": 74.7048,
527
+ "eval_samples_per_second": 0.723,
528
+ "eval_steps_per_second": 0.361,
529
+ "step": 18720
530
+ },
531
+ {
532
+ "epoch": 1.11,
533
+ "grad_norm": 0.3400803804397583,
534
+ "learning_rate": 6.308680205938788e-05,
535
+ "loss": 0.0846,
536
+ "step": 19440
537
+ },
538
+ {
539
+ "epoch": 1.11,
540
+ "eval_bertscore": 0.7288545966148376,
541
+ "eval_loss": 0.1069113239645958,
542
+ "eval_rouge1": 0.5633722381222108,
543
+ "eval_rouge2": 0.337377454492796,
544
+ "eval_rougeL": 0.4349115421710151,
545
+ "eval_rougeLsum": 0.43561356852158567,
546
+ "eval_runtime": 73.5552,
547
+ "eval_samples_per_second": 0.734,
548
+ "eval_steps_per_second": 0.367,
549
+ "step": 19440
550
+ },
551
+ {
552
+ "epoch": 1.15,
553
+ "grad_norm": 0.3360745310783386,
554
+ "learning_rate": 6.172084275319642e-05,
555
+ "loss": 0.0875,
556
+ "step": 20160
557
+ },
558
+ {
559
+ "epoch": 1.15,
560
+ "eval_bertscore": 0.715953528881073,
561
+ "eval_loss": 0.10425002127885818,
562
+ "eval_rouge1": 0.5548398737384996,
563
+ "eval_rouge2": 0.33589277481599067,
564
+ "eval_rougeL": 0.42137114864331937,
565
+ "eval_rougeLsum": 0.4231469615029759,
566
+ "eval_runtime": 78.0304,
567
+ "eval_samples_per_second": 0.692,
568
+ "eval_steps_per_second": 0.346,
569
+ "step": 20160
570
+ },
571
+ {
572
+ "epoch": 1.19,
573
+ "grad_norm": 0.4246189594268799,
574
+ "learning_rate": 6.03529836426848e-05,
575
+ "loss": 0.0868,
576
+ "step": 20880
577
+ },
578
+ {
579
+ "epoch": 1.19,
580
+ "eval_bertscore": 0.7299396395683289,
581
+ "eval_loss": 0.10365325212478638,
582
+ "eval_rouge1": 0.5715394315498017,
583
+ "eval_rouge2": 0.34427400662165897,
584
+ "eval_rougeL": 0.433027526044127,
585
+ "eval_rougeLsum": 0.4347450430032858,
586
+ "eval_runtime": 74.0473,
587
+ "eval_samples_per_second": 0.729,
588
+ "eval_steps_per_second": 0.365,
589
+ "step": 20880
590
+ },
591
+ {
592
+ "epoch": 1.23,
593
+ "grad_norm": 0.2776849865913391,
594
+ "learning_rate": 5.898512453217319e-05,
595
+ "loss": 0.0854,
596
+ "step": 21600
597
+ },
598
+ {
599
+ "epoch": 1.23,
600
+ "eval_bertscore": 0.7214290499687195,
601
+ "eval_loss": 0.10122980922460556,
602
+ "eval_rouge1": 0.5565823263453793,
603
+ "eval_rouge2": 0.3393375143994867,
604
+ "eval_rougeL": 0.4156140884756716,
605
+ "eval_rougeLsum": 0.41819540905867203,
606
+ "eval_runtime": 73.2235,
607
+ "eval_samples_per_second": 0.737,
608
+ "eval_steps_per_second": 0.369,
609
+ "step": 21600
610
+ },
611
+ {
612
+ "epoch": 1.27,
613
+ "grad_norm": 0.3710538446903229,
614
+ "learning_rate": 5.761726542166157e-05,
615
+ "loss": 0.0845,
616
+ "step": 22320
617
+ },
618
+ {
619
+ "epoch": 1.27,
620
+ "eval_bertscore": 0.7201518416404724,
621
+ "eval_loss": 0.10378885269165039,
622
+ "eval_rouge1": 0.5441295123980776,
623
+ "eval_rouge2": 0.33155064058257405,
624
+ "eval_rougeL": 0.42094247090226844,
625
+ "eval_rougeLsum": 0.42274633038817555,
626
+ "eval_runtime": 76.6313,
627
+ "eval_samples_per_second": 0.705,
628
+ "eval_steps_per_second": 0.352,
629
+ "step": 22320
630
+ },
631
+ {
632
+ "epoch": 1.31,
633
+ "grad_norm": 0.5819060206413269,
634
+ "learning_rate": 5.624940631114996e-05,
635
+ "loss": 0.0861,
636
+ "step": 23040
637
+ },
638
+ {
639
+ "epoch": 1.31,
640
+ "eval_bertscore": 0.7142701148986816,
641
+ "eval_loss": 0.10384026169776917,
642
+ "eval_rouge1": 0.5458184588249984,
643
+ "eval_rouge2": 0.3290922169442115,
644
+ "eval_rougeL": 0.4214855047650181,
645
+ "eval_rougeLsum": 0.4239018723206239,
646
+ "eval_runtime": 79.4541,
647
+ "eval_samples_per_second": 0.68,
648
+ "eval_steps_per_second": 0.34,
649
+ "step": 23040
650
+ },
651
+ {
652
+ "epoch": 1.35,
653
+ "grad_norm": 0.2952657639980316,
654
+ "learning_rate": 5.488154720063834e-05,
655
+ "loss": 0.0862,
656
+ "step": 23760
657
+ },
658
+ {
659
+ "epoch": 1.35,
660
+ "eval_bertscore": 0.7302463054656982,
661
+ "eval_loss": 0.10171066224575043,
662
+ "eval_rouge1": 0.564237466077122,
663
+ "eval_rouge2": 0.346632021192653,
664
+ "eval_rougeL": 0.44007571581541377,
665
+ "eval_rougeLsum": 0.4408434182223313,
666
+ "eval_runtime": 70.4368,
667
+ "eval_samples_per_second": 0.767,
668
+ "eval_steps_per_second": 0.383,
669
+ "step": 23760
670
+ },
671
+ {
672
+ "epoch": 1.4,
673
+ "grad_norm": 0.3152740001678467,
674
+ "learning_rate": 5.351368809012672e-05,
675
+ "loss": 0.0858,
676
+ "step": 24480
677
+ },
678
+ {
679
+ "epoch": 1.4,
680
+ "eval_bertscore": 0.7205690741539001,
681
+ "eval_loss": 0.10222817957401276,
682
+ "eval_rouge1": 0.561963492920594,
683
+ "eval_rouge2": 0.3366175149143015,
684
+ "eval_rougeL": 0.4370056834486044,
685
+ "eval_rougeLsum": 0.4383325343921459,
686
+ "eval_runtime": 70.7631,
687
+ "eval_samples_per_second": 0.763,
688
+ "eval_steps_per_second": 0.382,
689
+ "step": 24480
690
+ },
691
+ {
692
+ "epoch": 1.44,
693
+ "grad_norm": 0.3384862542152405,
694
+ "learning_rate": 5.214772878393526e-05,
695
+ "loss": 0.0868,
696
+ "step": 25200
697
+ },
698
+ {
699
+ "epoch": 1.44,
700
+ "eval_bertscore": 0.7201054096221924,
701
+ "eval_loss": 0.10058918595314026,
702
+ "eval_rouge1": 0.5506843793300468,
703
+ "eval_rouge2": 0.3305447880283259,
704
+ "eval_rougeL": 0.4221671281003694,
705
+ "eval_rougeLsum": 0.42405735392085775,
706
+ "eval_runtime": 73.3661,
707
+ "eval_samples_per_second": 0.736,
708
+ "eval_steps_per_second": 0.368,
709
+ "step": 25200
710
+ },
711
+ {
712
+ "epoch": 1.48,
713
+ "grad_norm": 0.2858143150806427,
714
+ "learning_rate": 5.078176947774379e-05,
715
+ "loss": 0.0851,
716
+ "step": 25920
717
+ },
718
+ {
719
+ "epoch": 1.48,
720
+ "eval_bertscore": 0.7324591875076294,
721
+ "eval_loss": 0.10030569136142731,
722
+ "eval_rouge1": 0.5711881175991272,
723
+ "eval_rouge2": 0.35036140380824915,
724
+ "eval_rougeL": 0.44736244718696055,
725
+ "eval_rougeLsum": 0.44882200145887735,
726
+ "eval_runtime": 73.2375,
727
+ "eval_samples_per_second": 0.737,
728
+ "eval_steps_per_second": 0.369,
729
+ "step": 25920
730
+ },
731
+ {
732
+ "epoch": 1.52,
733
+ "grad_norm": 0.34586507081985474,
734
+ "learning_rate": 4.941391036723218e-05,
735
+ "loss": 0.0839,
736
+ "step": 26640
737
+ },
738
+ {
739
+ "epoch": 1.52,
740
+ "eval_bertscore": 0.7323827147483826,
741
+ "eval_loss": 0.10078810900449753,
742
+ "eval_rouge1": 0.5643411922408847,
743
+ "eval_rouge2": 0.35335509416724475,
744
+ "eval_rougeL": 0.4412030311945061,
745
+ "eval_rougeLsum": 0.4423071630624772,
746
+ "eval_runtime": 78.0237,
747
+ "eval_samples_per_second": 0.692,
748
+ "eval_steps_per_second": 0.346,
749
+ "step": 26640
750
+ },
751
+ {
752
+ "epoch": 1.56,
753
+ "grad_norm": 0.35009488463401794,
754
+ "learning_rate": 4.804605125672056e-05,
755
+ "loss": 0.0843,
756
+ "step": 27360
757
+ },
758
+ {
759
+ "epoch": 1.56,
760
+ "eval_bertscore": 0.7391833662986755,
761
+ "eval_loss": 0.09879420697689056,
762
+ "eval_rouge1": 0.5744063451356638,
763
+ "eval_rouge2": 0.3631199161982914,
764
+ "eval_rougeL": 0.44665302719291095,
765
+ "eval_rougeLsum": 0.44897406269269213,
766
+ "eval_runtime": 80.5403,
767
+ "eval_samples_per_second": 0.67,
768
+ "eval_steps_per_second": 0.335,
769
+ "step": 27360
770
+ },
771
+ {
772
+ "epoch": 1.6,
773
+ "grad_norm": 0.3002821207046509,
774
+ "learning_rate": 4.667819214620894e-05,
775
+ "loss": 0.085,
776
+ "step": 28080
777
+ },
778
+ {
779
+ "epoch": 1.6,
780
+ "eval_bertscore": 0.7407130002975464,
781
+ "eval_loss": 0.09699860215187073,
782
+ "eval_rouge1": 0.5762741233248756,
783
+ "eval_rouge2": 0.3544722421313946,
784
+ "eval_rougeL": 0.4384246085216507,
785
+ "eval_rougeLsum": 0.4390526517186611,
786
+ "eval_runtime": 78.2681,
787
+ "eval_samples_per_second": 0.69,
788
+ "eval_steps_per_second": 0.345,
789
+ "step": 28080
790
+ },
791
+ {
792
+ "epoch": 1.64,
793
+ "grad_norm": 0.241718590259552,
794
+ "learning_rate": 4.5310333035697325e-05,
795
+ "loss": 0.0845,
796
+ "step": 28800
797
+ },
798
+ {
799
+ "epoch": 1.64,
800
+ "eval_bertscore": 0.7392789125442505,
801
+ "eval_loss": 0.09867523610591888,
802
+ "eval_rouge1": 0.580719658129176,
803
+ "eval_rouge2": 0.3694474172593357,
804
+ "eval_rougeL": 0.456964934995113,
805
+ "eval_rougeLsum": 0.45917370226539334,
806
+ "eval_runtime": 80.3113,
807
+ "eval_samples_per_second": 0.672,
808
+ "eval_steps_per_second": 0.336,
809
+ "step": 28800
810
+ }
811
+ ],
812
+ "logging_steps": 720,
813
+ "max_steps": 52638,
814
+ "num_input_tokens_seen": 0,
815
+ "num_train_epochs": 3,
816
+ "save_steps": 2880,
817
+ "total_flos": 2.496482830587003e+17,
818
+ "train_batch_size": 2,
819
+ "trial_name": null,
820
+ "trial_params": null
821
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8b0fe275c173c3356b48c1b740a2f996b54b571a59ced80218c8b705d57d7cd
3
+ size 5112
vocab.json ADDED
The diff for this file is too large to render. See raw diff