Sag1012 committed on
Commit
3e91389
1 Parent(s): 5eeea68

Upload EncoderDecoder model - ver5

Browse files
EncoderDecoder_5/config.json ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "EncoderDecoderModel"
4
+ ],
5
+ "decoder": {
6
+ "_attn_implementation_autoset": true,
7
+ "_name_or_path": "vinai/bartpho-word",
8
+ "activation_dropout": 0.0,
9
+ "activation_function": "gelu",
10
+ "add_cross_attention": true,
11
+ "architectures": [
12
+ "MBartModel"
13
+ ],
14
+ "attention_dropout": 0.0,
15
+ "bad_words_ids": null,
16
+ "begin_suppress_tokens": null,
17
+ "bos_token_id": 0,
18
+ "chunk_size_feed_forward": 0,
19
+ "classifier_dropout": 0.0,
20
+ "cross_attention_hidden_size": null,
21
+ "d_model": 1024,
22
+ "decoder_attention_heads": 16,
23
+ "decoder_ffn_dim": 4096,
24
+ "decoder_layerdrop": 0.0,
25
+ "decoder_layers": 12,
26
+ "decoder_start_token_id": 2,
27
+ "diversity_penalty": 0.0,
28
+ "do_sample": false,
29
+ "dropout": 0.1,
30
+ "early_stopping": false,
31
+ "encoder_attention_heads": 16,
32
+ "encoder_ffn_dim": 4096,
33
+ "encoder_layerdrop": 0.0,
34
+ "encoder_layers": 12,
35
+ "encoder_no_repeat_ngram_size": 0,
36
+ "eos_token_id": 2,
37
+ "exponential_decay_length_penalty": null,
38
+ "finetuning_task": null,
39
+ "forced_bos_token_id": null,
40
+ "forced_eos_token_id": 2,
41
+ "gradient_checkpointing": false,
42
+ "id2label": {
43
+ "0": "LABEL_0",
44
+ "1": "LABEL_1"
45
+ },
46
+ "init_std": 0.02,
47
+ "is_decoder": true,
48
+ "is_encoder_decoder": false,
49
+ "label2id": {
50
+ "LABEL_0": 0,
51
+ "LABEL_1": 1
52
+ },
53
+ "length_penalty": 1.0,
54
+ "max_length": 20,
55
+ "max_position_embeddings": 1024,
56
+ "min_length": 0,
57
+ "model_type": "mbart",
58
+ "no_repeat_ngram_size": 0,
59
+ "num_beam_groups": 1,
60
+ "num_beams": 1,
61
+ "num_hidden_layers": 12,
62
+ "num_return_sequences": 1,
63
+ "output_attentions": false,
64
+ "output_hidden_states": false,
65
+ "output_scores": false,
66
+ "pad_token_id": 1,
67
+ "prefix": null,
68
+ "problem_type": null,
69
+ "pruned_heads": {},
70
+ "remove_invalid_values": false,
71
+ "repetition_penalty": 1.0,
72
+ "return_dict": true,
73
+ "return_dict_in_generate": false,
74
+ "scale_embedding": false,
75
+ "sep_token_id": null,
76
+ "suppress_tokens": null,
77
+ "task_specific_params": null,
78
+ "temperature": 1.0,
79
+ "tf_legacy_loss": false,
80
+ "tie_encoder_decoder": false,
81
+ "tie_word_embeddings": true,
82
+ "tokenizer_class": "PhobertTokenizer",
83
+ "top_k": 50,
84
+ "top_p": 1.0,
85
+ "torch_dtype": "float32",
86
+ "torchscript": false,
87
+ "typical_p": 1.0,
88
+ "use_bfloat16": false,
89
+ "use_cache": true,
90
+ "vocab_size": 64001
91
+ },
92
+ "decoder_start_token_id": 0,
93
+ "encoder": {
94
+ "_attn_implementation_autoset": true,
95
+ "_name_or_path": "bert-base-uncased",
96
+ "add_cross_attention": false,
97
+ "architectures": [
98
+ "BertForMaskedLM"
99
+ ],
100
+ "attention_probs_dropout_prob": 0.1,
101
+ "bad_words_ids": null,
102
+ "begin_suppress_tokens": null,
103
+ "bos_token_id": null,
104
+ "chunk_size_feed_forward": 0,
105
+ "classifier_dropout": null,
106
+ "cross_attention_hidden_size": null,
107
+ "decoder_start_token_id": null,
108
+ "diversity_penalty": 0.0,
109
+ "do_sample": false,
110
+ "early_stopping": false,
111
+ "encoder_no_repeat_ngram_size": 0,
112
+ "eos_token_id": null,
113
+ "exponential_decay_length_penalty": null,
114
+ "finetuning_task": null,
115
+ "forced_bos_token_id": null,
116
+ "forced_eos_token_id": null,
117
+ "gradient_checkpointing": false,
118
+ "hidden_act": "gelu",
119
+ "hidden_dropout_prob": 0.1,
120
+ "hidden_size": 768,
121
+ "id2label": {
122
+ "0": "LABEL_0",
123
+ "1": "LABEL_1"
124
+ },
125
+ "initializer_range": 0.02,
126
+ "intermediate_size": 3072,
127
+ "is_decoder": false,
128
+ "is_encoder_decoder": false,
129
+ "label2id": {
130
+ "LABEL_0": 0,
131
+ "LABEL_1": 1
132
+ },
133
+ "layer_norm_eps": 1e-12,
134
+ "length_penalty": 1.0,
135
+ "max_length": 20,
136
+ "max_position_embeddings": 512,
137
+ "min_length": 0,
138
+ "model_type": "bert",
139
+ "no_repeat_ngram_size": 0,
140
+ "num_attention_heads": 12,
141
+ "num_beam_groups": 1,
142
+ "num_beams": 1,
143
+ "num_hidden_layers": 12,
144
+ "num_return_sequences": 1,
145
+ "output_attentions": false,
146
+ "output_hidden_states": false,
147
+ "output_scores": false,
148
+ "pad_token_id": 0,
149
+ "position_embedding_type": "absolute",
150
+ "prefix": null,
151
+ "problem_type": null,
152
+ "pruned_heads": {},
153
+ "remove_invalid_values": false,
154
+ "repetition_penalty": 1.0,
155
+ "return_dict": true,
156
+ "return_dict_in_generate": false,
157
+ "sep_token_id": null,
158
+ "suppress_tokens": null,
159
+ "task_specific_params": null,
160
+ "temperature": 1.0,
161
+ "tf_legacy_loss": false,
162
+ "tie_encoder_decoder": false,
163
+ "tie_word_embeddings": true,
164
+ "tokenizer_class": null,
165
+ "top_k": 50,
166
+ "top_p": 1.0,
167
+ "torch_dtype": null,
168
+ "torchscript": false,
169
+ "type_vocab_size": 2,
170
+ "typical_p": 1.0,
171
+ "use_bfloat16": false,
172
+ "use_cache": true,
173
+ "vocab_size": 30522
174
+ },
175
+ "is_encoder_decoder": true,
176
+ "model_type": "encoder-decoder",
177
+ "pad_token_id": 1,
178
+ "torch_dtype": "float32",
179
+ "transformers_version": "4.46.3"
180
+ }
EncoderDecoder_5/generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 0,
3
+ "decoder_start_token_id": 2,
4
+ "eos_token_id": 2,
5
+ "forced_eos_token_id": 2,
6
+ "pad_token_id": 1,
7
+ "transformers_version": "4.46.3"
8
+ }
EncoderDecoder_5/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93857aa4227c567faecf8a2f13dd429288bd364d508e9e8da3bb3d9da37141a2
3
+ size 1513750720
EncoderDecoder_5/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c40794d4e2bc8bd762b7c9731a6ffbf75e250eabc0df0681fb37835524e1ee86
3
+ size 3023080673
EncoderDecoder_5/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbcb97aba7b41f2d6abe9ef832a1926b7a86f73b19128bd88de6488ac27e81a3
3
+ size 14244
EncoderDecoder_5/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:466d9678b88de86639a0a0f85f8525b3f815a8068c10fc9b45ced453b6a8153c
3
+ size 1064
EncoderDecoder_5/trainer_state.json ADDED
@@ -0,0 +1,493 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 1.3951435089111328,
3
+ "best_model_checkpoint": "./results/checkpoint-18616",
4
+ "epoch": 6.0,
5
+ "eval_steps": 500,
6
+ "global_step": 27924,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.10743446497636441,
13
+ "grad_norm": 4.838443756103516,
14
+ "learning_rate": 4.9462827675118175e-05,
15
+ "loss": 3.2819,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.21486892995272883,
20
+ "grad_norm": 3.5501515865325928,
21
+ "learning_rate": 4.892565535023636e-05,
22
+ "loss": 2.3948,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.32230339492909327,
27
+ "grad_norm": 3.361682653427124,
28
+ "learning_rate": 4.8388483025354535e-05,
29
+ "loss": 2.1166,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 0.42973785990545765,
34
+ "grad_norm": 3.207756996154785,
35
+ "learning_rate": 4.7851310700472715e-05,
36
+ "loss": 1.983,
37
+ "step": 2000
38
+ },
39
+ {
40
+ "epoch": 0.5371723248818221,
41
+ "grad_norm": 3.38034725189209,
42
+ "learning_rate": 4.7314138375590894e-05,
43
+ "loss": 1.8785,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 0.6446067898581865,
48
+ "grad_norm": 3.3562798500061035,
49
+ "learning_rate": 4.677696605070907e-05,
50
+ "loss": 1.7957,
51
+ "step": 3000
52
+ },
53
+ {
54
+ "epoch": 0.752041254834551,
55
+ "grad_norm": 2.897372007369995,
56
+ "learning_rate": 4.623979372582725e-05,
57
+ "loss": 1.7387,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 0.8594757198109153,
62
+ "grad_norm": 2.9306259155273438,
63
+ "learning_rate": 4.570262140094543e-05,
64
+ "loss": 1.7103,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 0.9669101847872797,
69
+ "grad_norm": 2.7418951988220215,
70
+ "learning_rate": 4.5165449076063606e-05,
71
+ "loss": 1.6637,
72
+ "step": 4500
73
+ },
74
+ {
75
+ "epoch": 1.0,
76
+ "eval_loss": 1.5266185998916626,
77
+ "eval_rouge2_fmeasure": 0.4101,
78
+ "eval_rouge2_precision": 0.4101,
79
+ "eval_rouge2_recall": 0.4101,
80
+ "eval_runtime": 72.9922,
81
+ "eval_samples_per_second": 65.582,
82
+ "eval_steps_per_second": 1.028,
83
+ "step": 4654
84
+ },
85
+ {
86
+ "epoch": 1.0743446497636442,
87
+ "grad_norm": 3.1563010215759277,
88
+ "learning_rate": 4.462827675118178e-05,
89
+ "loss": 1.4927,
90
+ "step": 5000
91
+ },
92
+ {
93
+ "epoch": 1.1817791147400085,
94
+ "grad_norm": 2.845944404602051,
95
+ "learning_rate": 4.409110442629996e-05,
96
+ "loss": 1.4188,
97
+ "step": 5500
98
+ },
99
+ {
100
+ "epoch": 1.289213579716373,
101
+ "grad_norm": 2.7506182193756104,
102
+ "learning_rate": 4.355393210141814e-05,
103
+ "loss": 1.418,
104
+ "step": 6000
105
+ },
106
+ {
107
+ "epoch": 1.3966480446927374,
108
+ "grad_norm": 2.623769760131836,
109
+ "learning_rate": 4.301675977653631e-05,
110
+ "loss": 1.406,
111
+ "step": 6500
112
+ },
113
+ {
114
+ "epoch": 1.504082509669102,
115
+ "grad_norm": 2.671093225479126,
116
+ "learning_rate": 4.247958745165449e-05,
117
+ "loss": 1.4021,
118
+ "step": 7000
119
+ },
120
+ {
121
+ "epoch": 1.6115169746454663,
122
+ "grad_norm": 2.96425724029541,
123
+ "learning_rate": 4.194241512677267e-05,
124
+ "loss": 1.3883,
125
+ "step": 7500
126
+ },
127
+ {
128
+ "epoch": 1.7189514396218306,
129
+ "grad_norm": 2.6702911853790283,
130
+ "learning_rate": 4.140524280189085e-05,
131
+ "loss": 1.3883,
132
+ "step": 8000
133
+ },
134
+ {
135
+ "epoch": 1.8263859045981952,
136
+ "grad_norm": 2.6645655632019043,
137
+ "learning_rate": 4.0868070477009024e-05,
138
+ "loss": 1.3761,
139
+ "step": 8500
140
+ },
141
+ {
142
+ "epoch": 1.9338203695745595,
143
+ "grad_norm": 3.023545265197754,
144
+ "learning_rate": 4.0330898152127204e-05,
145
+ "loss": 1.376,
146
+ "step": 9000
147
+ },
148
+ {
149
+ "epoch": 2.0,
150
+ "eval_loss": 1.410889744758606,
151
+ "eval_rouge2_fmeasure": 0.4238,
152
+ "eval_rouge2_precision": 0.4238,
153
+ "eval_rouge2_recall": 0.4238,
154
+ "eval_runtime": 72.9371,
155
+ "eval_samples_per_second": 65.632,
156
+ "eval_steps_per_second": 1.028,
157
+ "step": 9308
158
+ },
159
+ {
160
+ "epoch": 2.041254834550924,
161
+ "grad_norm": 2.6128861904144287,
162
+ "learning_rate": 3.979372582724538e-05,
163
+ "loss": 1.2778,
164
+ "step": 9500
165
+ },
166
+ {
167
+ "epoch": 2.1486892995272884,
168
+ "grad_norm": 2.602187395095825,
169
+ "learning_rate": 3.9256553502363556e-05,
170
+ "loss": 1.1422,
171
+ "step": 10000
172
+ },
173
+ {
174
+ "epoch": 2.256123764503653,
175
+ "grad_norm": 2.725907802581787,
176
+ "learning_rate": 3.8719381177481736e-05,
177
+ "loss": 1.1449,
178
+ "step": 10500
179
+ },
180
+ {
181
+ "epoch": 2.363558229480017,
182
+ "grad_norm": 2.6348464488983154,
183
+ "learning_rate": 3.8182208852599916e-05,
184
+ "loss": 1.1529,
185
+ "step": 11000
186
+ },
187
+ {
188
+ "epoch": 2.4709926944563816,
189
+ "grad_norm": 2.8584792613983154,
190
+ "learning_rate": 3.764503652771809e-05,
191
+ "loss": 1.1552,
192
+ "step": 11500
193
+ },
194
+ {
195
+ "epoch": 2.578427159432746,
196
+ "grad_norm": 2.59346342086792,
197
+ "learning_rate": 3.7107864202836275e-05,
198
+ "loss": 1.1666,
199
+ "step": 12000
200
+ },
201
+ {
202
+ "epoch": 2.6858616244091102,
203
+ "grad_norm": 2.655982255935669,
204
+ "learning_rate": 3.657069187795445e-05,
205
+ "loss": 1.1658,
206
+ "step": 12500
207
+ },
208
+ {
209
+ "epoch": 2.793296089385475,
210
+ "grad_norm": 2.5818564891815186,
211
+ "learning_rate": 3.603351955307263e-05,
212
+ "loss": 1.1674,
213
+ "step": 13000
214
+ },
215
+ {
216
+ "epoch": 2.9007305543618394,
217
+ "grad_norm": 2.8041298389434814,
218
+ "learning_rate": 3.549634722819081e-05,
219
+ "loss": 1.1684,
220
+ "step": 13500
221
+ },
222
+ {
223
+ "epoch": 3.0,
224
+ "eval_loss": 1.3994433879852295,
225
+ "eval_rouge2_fmeasure": 0.4311,
226
+ "eval_rouge2_precision": 0.4311,
227
+ "eval_rouge2_recall": 0.4311,
228
+ "eval_runtime": 72.9642,
229
+ "eval_samples_per_second": 65.607,
230
+ "eval_steps_per_second": 1.028,
231
+ "step": 13962
232
+ },
233
+ {
234
+ "epoch": 3.008165019338204,
235
+ "grad_norm": 2.646003246307373,
236
+ "learning_rate": 3.495917490330898e-05,
237
+ "loss": 1.1485,
238
+ "step": 14000
239
+ },
240
+ {
241
+ "epoch": 3.115599484314568,
242
+ "grad_norm": 2.8257687091827393,
243
+ "learning_rate": 3.442200257842716e-05,
244
+ "loss": 0.9456,
245
+ "step": 14500
246
+ },
247
+ {
248
+ "epoch": 3.2230339492909326,
249
+ "grad_norm": 2.9319422245025635,
250
+ "learning_rate": 3.388483025354534e-05,
251
+ "loss": 0.961,
252
+ "step": 15000
253
+ },
254
+ {
255
+ "epoch": 3.330468414267297,
256
+ "grad_norm": 2.7773501873016357,
257
+ "learning_rate": 3.334765792866352e-05,
258
+ "loss": 0.9748,
259
+ "step": 15500
260
+ },
261
+ {
262
+ "epoch": 3.4379028792436612,
263
+ "grad_norm": 2.673140048980713,
264
+ "learning_rate": 3.281048560378169e-05,
265
+ "loss": 0.9822,
266
+ "step": 16000
267
+ },
268
+ {
269
+ "epoch": 3.5453373442200258,
270
+ "grad_norm": 2.7191991806030273,
271
+ "learning_rate": 3.227331327889987e-05,
272
+ "loss": 0.9808,
273
+ "step": 16500
274
+ },
275
+ {
276
+ "epoch": 3.6527718091963903,
277
+ "grad_norm": 2.717005491256714,
278
+ "learning_rate": 3.173614095401805e-05,
279
+ "loss": 0.9832,
280
+ "step": 17000
281
+ },
282
+ {
283
+ "epoch": 3.760206274172755,
284
+ "grad_norm": 2.696438789367676,
285
+ "learning_rate": 3.1198968629136225e-05,
286
+ "loss": 0.9954,
287
+ "step": 17500
288
+ },
289
+ {
290
+ "epoch": 3.867640739149119,
291
+ "grad_norm": 2.5392231941223145,
292
+ "learning_rate": 3.066179630425441e-05,
293
+ "loss": 0.9978,
294
+ "step": 18000
295
+ },
296
+ {
297
+ "epoch": 3.9750752041254835,
298
+ "grad_norm": 3.3882222175598145,
299
+ "learning_rate": 3.0124623979372585e-05,
300
+ "loss": 0.9957,
301
+ "step": 18500
302
+ },
303
+ {
304
+ "epoch": 4.0,
305
+ "eval_loss": 1.3951435089111328,
306
+ "eval_rouge2_fmeasure": 0.4323,
307
+ "eval_rouge2_precision": 0.4323,
308
+ "eval_rouge2_recall": 0.4323,
309
+ "eval_runtime": 72.9673,
310
+ "eval_samples_per_second": 65.605,
311
+ "eval_steps_per_second": 1.028,
312
+ "step": 18616
313
+ },
314
+ {
315
+ "epoch": 4.082509669101848,
316
+ "grad_norm": 2.655376672744751,
317
+ "learning_rate": 2.958745165449076e-05,
318
+ "loss": 0.8449,
319
+ "step": 19000
320
+ },
321
+ {
322
+ "epoch": 4.189944134078212,
323
+ "grad_norm": 2.5333900451660156,
324
+ "learning_rate": 2.9050279329608944e-05,
325
+ "loss": 0.8125,
326
+ "step": 19500
327
+ },
328
+ {
329
+ "epoch": 4.297378599054577,
330
+ "grad_norm": 2.4350857734680176,
331
+ "learning_rate": 2.8513107004727117e-05,
332
+ "loss": 0.8184,
333
+ "step": 20000
334
+ },
335
+ {
336
+ "epoch": 4.404813064030941,
337
+ "grad_norm": 2.595292329788208,
338
+ "learning_rate": 2.7975934679845293e-05,
339
+ "loss": 0.8317,
340
+ "step": 20500
341
+ },
342
+ {
343
+ "epoch": 4.512247529007306,
344
+ "grad_norm": 2.4971327781677246,
345
+ "learning_rate": 2.743876235496347e-05,
346
+ "loss": 0.833,
347
+ "step": 21000
348
+ },
349
+ {
350
+ "epoch": 4.61968199398367,
351
+ "grad_norm": 2.7136857509613037,
352
+ "learning_rate": 2.6901590030081653e-05,
353
+ "loss": 0.8361,
354
+ "step": 21500
355
+ },
356
+ {
357
+ "epoch": 4.727116458960034,
358
+ "grad_norm": 2.8456897735595703,
359
+ "learning_rate": 2.636441770519983e-05,
360
+ "loss": 0.8458,
361
+ "step": 22000
362
+ },
363
+ {
364
+ "epoch": 4.834550923936399,
365
+ "grad_norm": 2.601877212524414,
366
+ "learning_rate": 2.5827245380318005e-05,
367
+ "loss": 0.846,
368
+ "step": 22500
369
+ },
370
+ {
371
+ "epoch": 4.941985388912763,
372
+ "grad_norm": 2.6655149459838867,
373
+ "learning_rate": 2.5290073055436185e-05,
374
+ "loss": 0.848,
375
+ "step": 23000
376
+ },
377
+ {
378
+ "epoch": 5.0,
379
+ "eval_loss": 1.42265784740448,
380
+ "eval_rouge2_fmeasure": 0.4307,
381
+ "eval_rouge2_precision": 0.4307,
382
+ "eval_rouge2_recall": 0.4307,
383
+ "eval_runtime": 73.0163,
384
+ "eval_samples_per_second": 65.561,
385
+ "eval_steps_per_second": 1.027,
386
+ "step": 23270
387
+ },
388
+ {
389
+ "epoch": 5.049419853889128,
390
+ "grad_norm": 2.525017023086548,
391
+ "learning_rate": 2.475290073055436e-05,
392
+ "loss": 0.7735,
393
+ "step": 23500
394
+ },
395
+ {
396
+ "epoch": 5.156854318865492,
397
+ "grad_norm": 2.2720000743865967,
398
+ "learning_rate": 2.421572840567254e-05,
399
+ "loss": 0.6797,
400
+ "step": 24000
401
+ },
402
+ {
403
+ "epoch": 5.264288783841857,
404
+ "grad_norm": 2.633282423019409,
405
+ "learning_rate": 2.3678556080790718e-05,
406
+ "loss": 0.6955,
407
+ "step": 24500
408
+ },
409
+ {
410
+ "epoch": 5.3717232488182205,
411
+ "grad_norm": 2.6699864864349365,
412
+ "learning_rate": 2.3141383755908897e-05,
413
+ "loss": 0.7026,
414
+ "step": 25000
415
+ },
416
+ {
417
+ "epoch": 5.479157713794585,
418
+ "grad_norm": 2.765111207962036,
419
+ "learning_rate": 2.2604211431027074e-05,
420
+ "loss": 0.7045,
421
+ "step": 25500
422
+ },
423
+ {
424
+ "epoch": 5.58659217877095,
425
+ "grad_norm": 2.8248813152313232,
426
+ "learning_rate": 2.206703910614525e-05,
427
+ "loss": 0.7148,
428
+ "step": 26000
429
+ },
430
+ {
431
+ "epoch": 5.694026643747314,
432
+ "grad_norm": 2.7719056606292725,
433
+ "learning_rate": 2.152986678126343e-05,
434
+ "loss": 0.7218,
435
+ "step": 26500
436
+ },
437
+ {
438
+ "epoch": 5.801461108723679,
439
+ "grad_norm": 2.8628671169281006,
440
+ "learning_rate": 2.099269445638161e-05,
441
+ "loss": 0.717,
442
+ "step": 27000
443
+ },
444
+ {
445
+ "epoch": 5.908895573700043,
446
+ "grad_norm": 2.479224443435669,
447
+ "learning_rate": 2.0455522131499786e-05,
448
+ "loss": 0.7247,
449
+ "step": 27500
450
+ },
451
+ {
452
+ "epoch": 6.0,
453
+ "eval_loss": 1.479453682899475,
454
+ "eval_rouge2_fmeasure": 0.43,
455
+ "eval_rouge2_precision": 0.43,
456
+ "eval_rouge2_recall": 0.43,
457
+ "eval_runtime": 73.0025,
458
+ "eval_samples_per_second": 65.573,
459
+ "eval_steps_per_second": 1.027,
460
+ "step": 27924
461
+ }
462
+ ],
463
+ "logging_steps": 500,
464
+ "max_steps": 46540,
465
+ "num_input_tokens_seen": 0,
466
+ "num_train_epochs": 10,
467
+ "save_steps": 500,
468
+ "stateful_callbacks": {
469
+ "EarlyStoppingCallback": {
470
+ "args": {
471
+ "early_stopping_patience": 3,
472
+ "early_stopping_threshold": 0.0
473
+ },
474
+ "attributes": {
475
+ "early_stopping_patience_counter": 2
476
+ }
477
+ },
478
+ "TrainerControl": {
479
+ "args": {
480
+ "should_epoch_stop": false,
481
+ "should_evaluate": false,
482
+ "should_log": false,
483
+ "should_save": true,
484
+ "should_training_stop": false
485
+ },
486
+ "attributes": {}
487
+ }
488
+ },
489
+ "total_flos": 1.9760389250501837e+17,
490
+ "train_batch_size": 64,
491
+ "trial_name": null,
492
+ "trial_params": null
493
+ }
EncoderDecoder_5/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a91b80dad1b4298cd90ae4828e8412b3a192cc72a1512b2233246a3e4eba8376
3
+ size 5432