huhu233 committed
Commit 8cde61d
1 Parent(s): 1496fe4

Upload 13 files

config.json ADDED
@@ -0,0 +1,64 @@
+ {
+   "_name_or_path": "trans_model",
+   "activation_dropout": 0.0,
+   "activation_function": "swish",
+   "add_bias_logits": false,
+   "add_final_layer_norm": false,
+   "architectures": [
+     "MarianMTModel"
+   ],
+   "attention_dropout": 0.0,
+   "bad_words_ids": [
+     [
+       65000
+     ]
+   ],
+   "bos_token_id": 0,
+   "classif_dropout": 0.0,
+   "classifier_dropout": 0.0,
+   "d_model": 512,
+   "decoder_attention_heads": 8,
+   "decoder_ffn_dim": 2048,
+   "decoder_layerdrop": 0.0,
+   "decoder_layers": 6,
+   "decoder_start_token_id": 65000,
+   "decoder_vocab_size": 65001,
+   "do_blenderbot_90_layernorm": false,
+   "dropout": 0.1,
+   "encoder_attention_heads": 8,
+   "encoder_ffn_dim": 2048,
+   "encoder_layerdrop": 0.0,
+   "encoder_layers": 6,
+   "eos_token_id": 0,
+   "extra_pos_embeddings": 0,
+   "force_bos_token_to_be_generated": false,
+   "forced_eos_token_id": 0,
+   "gradient_checkpointing": false,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2"
+   },
+   "init_std": 0.02,
+   "is_encoder_decoder": true,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_2": 2
+   },
+   "max_length": 512,
+   "max_position_embeddings": 512,
+   "model_type": "marian",
+   "normalize_before": false,
+   "normalize_embedding": false,
+   "num_beams": 4,
+   "num_hidden_layers": 6,
+   "pad_token_id": 65000,
+   "scale_embedding": true,
+   "share_encoder_decoder_embeddings": true,
+   "static_position_embeddings": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.30.2",
+   "use_cache": true,
+   "vocab_size": 65001
+ }
generation_config.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "_from_model_config": true,
+   "bad_words_ids": [
+     [
+       65000
+     ]
+   ],
+   "bos_token_id": 0,
+   "decoder_start_token_id": 65000,
+   "eos_token_id": 0,
+   "forced_eos_token_id": 0,
+   "max_length": 512,
+   "num_beams": 4,
+   "pad_token_id": 65000,
+   "transformers_version": "4.30.2"
+ }
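
The file above stores the decoding defaults that generate() picks up automatically: beam search with 4 beams, max_length 512, and 65000 as both pad and decoder-start token. A minimal sketch of inspecting (and, if needed, overriding) these defaults with the transformers GenerationConfig API, assuming the repository has been downloaded to a hypothetical local folder ./trans_model:

from transformers import GenerationConfig

# Load the decoding defaults stored in generation_config.json
# ("./trans_model" is a hypothetical local copy of this repository).
gen_config = GenerationConfig.from_pretrained("./trans_model")
print(gen_config.num_beams, gen_config.max_length)  # -> 4 512

# Individual settings can still be overridden per call, e.g.:
# model.generate(**inputs, generation_config=gen_config, num_beams=1)
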
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:68b975db1caf46413da27809a7786e61eb2f3c4e9c2647f16de1c82a4281fea9
+ size 619501061
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:072ccbd6ec6adf71e38319f0c08fa691c51e79def9360c13ee9ad6d4e195a30d
+ size 310022533
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba189c7848e4096e993e8ab804f5bb42f8755bcda7a33135ed67461afe47ac38
+ size 14511
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:519e73e22f5a8054abe5b4756ca77bf6ddeb528bd80ca85e2b1e31b233f1adac
+ size 627
source.spm ADDED
Binary file (806 kB).
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "eos_token": "</s>",
+   "pad_token": "<pad>",
+   "unk_token": "<unk>"
+ }
target.spm ADDED
Binary file (805 kB).
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "</s>",
+   "model_max_length": 512,
+   "pad_token": "<pad>",
+   "separate_vocabs": false,
+   "source_lang": "eng",
+   "sp_model_kwargs": {},
+   "target_lang": "zho",
+   "tokenizer_class": "MarianTokenizer",
+   "unk_token": "<unk>"
+ }
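
Taken together, config.json (a MarianMTModel) and this tokenizer configuration (MarianTokenizer, source_lang "eng", target_lang "zho", with source.spm/target.spm as the SentencePiece models) describe an English-to-Chinese Marian translation checkpoint. A minimal usage sketch with the Hugging Face transformers API, assuming the files in this commit sit in a hypothetical local folder ./trans_model and using an arbitrary example sentence:

from transformers import MarianMTModel, MarianTokenizer

model_dir = "./trans_model"  # hypothetical local copy of the files in this commit
tokenizer = MarianTokenizer.from_pretrained(model_dir)
model = MarianMTModel.from_pretrained(model_dir)

# Tokenize an English sentence and translate it with the defaults
# from generation_config.json (beam search, num_beams=4, max_length=512).
inputs = tokenizer(["The weather is nice today."], return_tensors="pt", padding=True)
output_ids = model.generate(**inputs)
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True))
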
trainer_state.json ADDED
@@ -0,0 +1,1978 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 9.968985378821444,
5
+ "global_step": 157500,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.03,
12
+ "learning_rate": 1.9936704854737643e-05,
13
+ "loss": 2.1885,
14
+ "step": 500
15
+ },
16
+ {
17
+ "epoch": 0.06,
18
+ "learning_rate": 1.9873409709475284e-05,
19
+ "loss": 2.1503,
20
+ "step": 1000
21
+ },
22
+ {
23
+ "epoch": 0.09,
24
+ "learning_rate": 1.9810114564212926e-05,
25
+ "loss": 2.1207,
26
+ "step": 1500
27
+ },
28
+ {
29
+ "epoch": 0.13,
30
+ "learning_rate": 1.974681941895057e-05,
31
+ "loss": 2.093,
32
+ "step": 2000
33
+ },
34
+ {
35
+ "epoch": 0.16,
36
+ "learning_rate": 1.968352427368821e-05,
37
+ "loss": 2.0938,
38
+ "step": 2500
39
+ },
40
+ {
41
+ "epoch": 0.19,
42
+ "learning_rate": 1.9620229128425853e-05,
43
+ "loss": 2.0842,
44
+ "step": 3000
45
+ },
46
+ {
47
+ "epoch": 0.22,
48
+ "learning_rate": 1.9556933983163494e-05,
49
+ "loss": 2.0991,
50
+ "step": 3500
51
+ },
52
+ {
53
+ "epoch": 0.25,
54
+ "learning_rate": 1.9493638837901136e-05,
55
+ "loss": 2.0715,
56
+ "step": 4000
57
+ },
58
+ {
59
+ "epoch": 0.28,
60
+ "learning_rate": 1.9430343692638777e-05,
61
+ "loss": 2.073,
62
+ "step": 4500
63
+ },
64
+ {
65
+ "epoch": 0.32,
66
+ "learning_rate": 1.936704854737642e-05,
67
+ "loss": 2.08,
68
+ "step": 5000
69
+ },
70
+ {
71
+ "epoch": 0.35,
72
+ "learning_rate": 1.930375340211406e-05,
73
+ "loss": 2.0494,
74
+ "step": 5500
75
+ },
76
+ {
77
+ "epoch": 0.38,
78
+ "learning_rate": 1.92404582568517e-05,
79
+ "loss": 2.0616,
80
+ "step": 6000
81
+ },
82
+ {
83
+ "epoch": 0.41,
84
+ "learning_rate": 1.9177163111589342e-05,
85
+ "loss": 2.0571,
86
+ "step": 6500
87
+ },
88
+ {
89
+ "epoch": 0.44,
90
+ "learning_rate": 1.9113867966326984e-05,
91
+ "loss": 2.0561,
92
+ "step": 7000
93
+ },
94
+ {
95
+ "epoch": 0.47,
96
+ "learning_rate": 1.9050572821064625e-05,
97
+ "loss": 2.0658,
98
+ "step": 7500
99
+ },
100
+ {
101
+ "epoch": 0.51,
102
+ "learning_rate": 1.8987277675802266e-05,
103
+ "loss": 2.0275,
104
+ "step": 8000
105
+ },
106
+ {
107
+ "epoch": 0.54,
108
+ "learning_rate": 1.8923982530539908e-05,
109
+ "loss": 2.0335,
110
+ "step": 8500
111
+ },
112
+ {
113
+ "epoch": 0.57,
114
+ "learning_rate": 1.8860687385277552e-05,
115
+ "loss": 2.0312,
116
+ "step": 9000
117
+ },
118
+ {
119
+ "epoch": 0.6,
120
+ "learning_rate": 1.8797392240015194e-05,
121
+ "loss": 2.0138,
122
+ "step": 9500
123
+ },
124
+ {
125
+ "epoch": 0.63,
126
+ "learning_rate": 1.8734097094752835e-05,
127
+ "loss": 2.0005,
128
+ "step": 10000
129
+ },
130
+ {
131
+ "epoch": 0.66,
132
+ "learning_rate": 1.8670801949490476e-05,
133
+ "loss": 2.0248,
134
+ "step": 10500
135
+ },
136
+ {
137
+ "epoch": 0.7,
138
+ "learning_rate": 1.8607506804228118e-05,
139
+ "loss": 2.0152,
140
+ "step": 11000
141
+ },
142
+ {
143
+ "epoch": 0.73,
144
+ "learning_rate": 1.854421165896576e-05,
145
+ "loss": 2.0168,
146
+ "step": 11500
147
+ },
148
+ {
149
+ "epoch": 0.76,
150
+ "learning_rate": 1.84809165137034e-05,
151
+ "loss": 2.0196,
152
+ "step": 12000
153
+ },
154
+ {
155
+ "epoch": 0.79,
156
+ "learning_rate": 1.841762136844104e-05,
157
+ "loss": 2.0003,
158
+ "step": 12500
159
+ },
160
+ {
161
+ "epoch": 0.82,
162
+ "learning_rate": 1.8354326223178683e-05,
163
+ "loss": 2.0267,
164
+ "step": 13000
165
+ },
166
+ {
167
+ "epoch": 0.85,
168
+ "learning_rate": 1.8291031077916324e-05,
169
+ "loss": 2.0022,
170
+ "step": 13500
171
+ },
172
+ {
173
+ "epoch": 0.89,
174
+ "learning_rate": 1.8227735932653966e-05,
175
+ "loss": 2.0102,
176
+ "step": 14000
177
+ },
178
+ {
179
+ "epoch": 0.92,
180
+ "learning_rate": 1.8164440787391607e-05,
181
+ "loss": 2.0,
182
+ "step": 14500
183
+ },
184
+ {
185
+ "epoch": 0.95,
186
+ "learning_rate": 1.810114564212925e-05,
187
+ "loss": 1.9751,
188
+ "step": 15000
189
+ },
190
+ {
191
+ "epoch": 0.98,
192
+ "learning_rate": 1.803785049686689e-05,
193
+ "loss": 1.9912,
194
+ "step": 15500
195
+ },
196
+ {
197
+ "epoch": 1.0,
198
+ "eval_loss": 1.708198070526123,
199
+ "eval_runtime": 629.4341,
200
+ "eval_samples_per_second": 401.594,
201
+ "eval_steps_per_second": 25.1,
202
+ "step": 15799
203
+ },
204
+ {
205
+ "epoch": 1.01,
206
+ "learning_rate": 1.797455535160453e-05,
207
+ "loss": 1.9373,
208
+ "step": 16000
209
+ },
210
+ {
211
+ "epoch": 1.04,
212
+ "learning_rate": 1.7911260206342176e-05,
213
+ "loss": 1.8308,
214
+ "step": 16500
215
+ },
216
+ {
217
+ "epoch": 1.08,
218
+ "learning_rate": 1.7847965061079817e-05,
219
+ "loss": 1.8442,
220
+ "step": 17000
221
+ },
222
+ {
223
+ "epoch": 1.11,
224
+ "learning_rate": 1.778466991581746e-05,
225
+ "loss": 1.8558,
226
+ "step": 17500
227
+ },
228
+ {
229
+ "epoch": 1.14,
230
+ "learning_rate": 1.77213747705551e-05,
231
+ "loss": 1.8564,
232
+ "step": 18000
233
+ },
234
+ {
235
+ "epoch": 1.17,
236
+ "learning_rate": 1.765807962529274e-05,
237
+ "loss": 1.8578,
238
+ "step": 18500
239
+ },
240
+ {
241
+ "epoch": 1.2,
242
+ "learning_rate": 1.7594784480030382e-05,
243
+ "loss": 1.8312,
244
+ "step": 19000
245
+ },
246
+ {
247
+ "epoch": 1.23,
248
+ "learning_rate": 1.7531489334768024e-05,
249
+ "loss": 1.8784,
250
+ "step": 19500
251
+ },
252
+ {
253
+ "epoch": 1.27,
254
+ "learning_rate": 1.746819418950567e-05,
255
+ "loss": 1.8497,
256
+ "step": 20000
257
+ },
258
+ {
259
+ "epoch": 1.3,
260
+ "learning_rate": 1.740489904424331e-05,
261
+ "loss": 1.8528,
262
+ "step": 20500
263
+ },
264
+ {
265
+ "epoch": 1.33,
266
+ "learning_rate": 1.734160389898095e-05,
267
+ "loss": 1.8645,
268
+ "step": 21000
269
+ },
270
+ {
271
+ "epoch": 1.36,
272
+ "learning_rate": 1.7278308753718592e-05,
273
+ "loss": 1.8563,
274
+ "step": 21500
275
+ },
276
+ {
277
+ "epoch": 1.39,
278
+ "learning_rate": 1.7215013608456234e-05,
279
+ "loss": 1.8616,
280
+ "step": 22000
281
+ },
282
+ {
283
+ "epoch": 1.42,
284
+ "learning_rate": 1.7151718463193875e-05,
285
+ "loss": 1.8699,
286
+ "step": 22500
287
+ },
288
+ {
289
+ "epoch": 1.46,
290
+ "learning_rate": 1.7088423317931516e-05,
291
+ "loss": 1.8583,
292
+ "step": 23000
293
+ },
294
+ {
295
+ "epoch": 1.49,
296
+ "learning_rate": 1.7025128172669158e-05,
297
+ "loss": 1.868,
298
+ "step": 23500
299
+ },
300
+ {
301
+ "epoch": 1.52,
302
+ "learning_rate": 1.69618330274068e-05,
303
+ "loss": 1.8534,
304
+ "step": 24000
305
+ },
306
+ {
307
+ "epoch": 1.55,
308
+ "learning_rate": 1.689853788214444e-05,
309
+ "loss": 1.8557,
310
+ "step": 24500
311
+ },
312
+ {
313
+ "epoch": 1.58,
314
+ "learning_rate": 1.683524273688208e-05,
315
+ "loss": 1.8709,
316
+ "step": 25000
317
+ },
318
+ {
319
+ "epoch": 1.61,
320
+ "learning_rate": 1.6771947591619723e-05,
321
+ "loss": 1.8544,
322
+ "step": 25500
323
+ },
324
+ {
325
+ "epoch": 1.65,
326
+ "learning_rate": 1.6708652446357364e-05,
327
+ "loss": 1.8803,
328
+ "step": 26000
329
+ },
330
+ {
331
+ "epoch": 1.68,
332
+ "learning_rate": 1.6645357301095006e-05,
333
+ "loss": 1.8573,
334
+ "step": 26500
335
+ },
336
+ {
337
+ "epoch": 1.71,
338
+ "learning_rate": 1.658206215583265e-05,
339
+ "loss": 1.8668,
340
+ "step": 27000
341
+ },
342
+ {
343
+ "epoch": 1.74,
344
+ "learning_rate": 1.6518767010570292e-05,
345
+ "loss": 1.8592,
346
+ "step": 27500
347
+ },
348
+ {
349
+ "epoch": 1.77,
350
+ "learning_rate": 1.6455471865307933e-05,
351
+ "loss": 1.8551,
352
+ "step": 28000
353
+ },
354
+ {
355
+ "epoch": 1.8,
356
+ "learning_rate": 1.6392176720045574e-05,
357
+ "loss": 1.8504,
358
+ "step": 28500
359
+ },
360
+ {
361
+ "epoch": 1.84,
362
+ "learning_rate": 1.6328881574783216e-05,
363
+ "loss": 1.8578,
364
+ "step": 29000
365
+ },
366
+ {
367
+ "epoch": 1.87,
368
+ "learning_rate": 1.6265586429520857e-05,
369
+ "loss": 1.8614,
370
+ "step": 29500
371
+ },
372
+ {
373
+ "epoch": 1.9,
374
+ "learning_rate": 1.62022912842585e-05,
375
+ "loss": 1.8592,
376
+ "step": 30000
377
+ },
378
+ {
379
+ "epoch": 1.93,
380
+ "learning_rate": 1.613899613899614e-05,
381
+ "loss": 1.854,
382
+ "step": 30500
383
+ },
384
+ {
385
+ "epoch": 1.96,
386
+ "learning_rate": 1.607570099373378e-05,
387
+ "loss": 1.8536,
388
+ "step": 31000
389
+ },
390
+ {
391
+ "epoch": 1.99,
392
+ "learning_rate": 1.6012405848471422e-05,
393
+ "loss": 1.8687,
394
+ "step": 31500
395
+ },
396
+ {
397
+ "epoch": 2.0,
398
+ "eval_loss": 1.5787432193756104,
399
+ "eval_runtime": 629.6856,
400
+ "eval_samples_per_second": 401.434,
401
+ "eval_steps_per_second": 25.09,
402
+ "step": 31598
403
+ },
404
+ {
405
+ "epoch": 2.03,
406
+ "learning_rate": 1.5949110703209064e-05,
407
+ "loss": 1.7515,
408
+ "step": 32000
409
+ },
410
+ {
411
+ "epoch": 2.06,
412
+ "learning_rate": 1.5885815557946705e-05,
413
+ "loss": 1.7233,
414
+ "step": 32500
415
+ },
416
+ {
417
+ "epoch": 2.09,
418
+ "learning_rate": 1.5822520412684346e-05,
419
+ "loss": 1.754,
420
+ "step": 33000
421
+ },
422
+ {
423
+ "epoch": 2.12,
424
+ "learning_rate": 1.5759225267421988e-05,
425
+ "loss": 1.7302,
426
+ "step": 33500
427
+ },
428
+ {
429
+ "epoch": 2.15,
430
+ "learning_rate": 1.5695930122159632e-05,
431
+ "loss": 1.7369,
432
+ "step": 34000
433
+ },
434
+ {
435
+ "epoch": 2.18,
436
+ "learning_rate": 1.5632634976897274e-05,
437
+ "loss": 1.7294,
438
+ "step": 34500
439
+ },
440
+ {
441
+ "epoch": 2.22,
442
+ "learning_rate": 1.5569339831634915e-05,
443
+ "loss": 1.7317,
444
+ "step": 35000
445
+ },
446
+ {
447
+ "epoch": 2.25,
448
+ "learning_rate": 1.5506044686372556e-05,
449
+ "loss": 1.7457,
450
+ "step": 35500
451
+ },
452
+ {
453
+ "epoch": 2.28,
454
+ "learning_rate": 1.5442749541110198e-05,
455
+ "loss": 1.758,
456
+ "step": 36000
457
+ },
458
+ {
459
+ "epoch": 2.31,
460
+ "learning_rate": 1.537945439584784e-05,
461
+ "loss": 1.7442,
462
+ "step": 36500
463
+ },
464
+ {
465
+ "epoch": 2.34,
466
+ "learning_rate": 1.531615925058548e-05,
467
+ "loss": 1.7449,
468
+ "step": 37000
469
+ },
470
+ {
471
+ "epoch": 2.37,
472
+ "learning_rate": 1.5252864105323122e-05,
473
+ "loss": 1.7502,
474
+ "step": 37500
475
+ },
476
+ {
477
+ "epoch": 2.41,
478
+ "learning_rate": 1.5189568960060765e-05,
479
+ "loss": 1.7529,
480
+ "step": 38000
481
+ },
482
+ {
483
+ "epoch": 2.44,
484
+ "learning_rate": 1.5126273814798406e-05,
485
+ "loss": 1.7675,
486
+ "step": 38500
487
+ },
488
+ {
489
+ "epoch": 2.47,
490
+ "learning_rate": 1.5062978669536047e-05,
491
+ "loss": 1.7537,
492
+ "step": 39000
493
+ },
494
+ {
495
+ "epoch": 2.5,
496
+ "learning_rate": 1.4999683524273689e-05,
497
+ "loss": 1.7546,
498
+ "step": 39500
499
+ },
500
+ {
501
+ "epoch": 2.53,
502
+ "learning_rate": 1.493638837901133e-05,
503
+ "loss": 1.7409,
504
+ "step": 40000
505
+ },
506
+ {
507
+ "epoch": 2.56,
508
+ "learning_rate": 1.4873093233748971e-05,
509
+ "loss": 1.7599,
510
+ "step": 40500
511
+ },
512
+ {
513
+ "epoch": 2.6,
514
+ "learning_rate": 1.4809798088486613e-05,
515
+ "loss": 1.7467,
516
+ "step": 41000
517
+ },
518
+ {
519
+ "epoch": 2.63,
520
+ "learning_rate": 1.4746502943224257e-05,
521
+ "loss": 1.7426,
522
+ "step": 41500
523
+ },
524
+ {
525
+ "epoch": 2.66,
526
+ "learning_rate": 1.4683207797961899e-05,
527
+ "loss": 1.7421,
528
+ "step": 42000
529
+ },
530
+ {
531
+ "epoch": 2.69,
532
+ "learning_rate": 1.461991265269954e-05,
533
+ "loss": 1.7572,
534
+ "step": 42500
535
+ },
536
+ {
537
+ "epoch": 2.72,
538
+ "learning_rate": 1.4556617507437181e-05,
539
+ "loss": 1.7489,
540
+ "step": 43000
541
+ },
542
+ {
543
+ "epoch": 2.75,
544
+ "learning_rate": 1.4493322362174823e-05,
545
+ "loss": 1.7482,
546
+ "step": 43500
547
+ },
548
+ {
549
+ "epoch": 2.78,
550
+ "learning_rate": 1.4430027216912464e-05,
551
+ "loss": 1.7578,
552
+ "step": 44000
553
+ },
554
+ {
555
+ "epoch": 2.82,
556
+ "learning_rate": 1.4366732071650105e-05,
557
+ "loss": 1.7608,
558
+ "step": 44500
559
+ },
560
+ {
561
+ "epoch": 2.85,
562
+ "learning_rate": 1.4303436926387748e-05,
563
+ "loss": 1.7623,
564
+ "step": 45000
565
+ },
566
+ {
567
+ "epoch": 2.88,
568
+ "learning_rate": 1.424014178112539e-05,
569
+ "loss": 1.7534,
570
+ "step": 45500
571
+ },
572
+ {
573
+ "epoch": 2.91,
574
+ "learning_rate": 1.4176846635863031e-05,
575
+ "loss": 1.7513,
576
+ "step": 46000
577
+ },
578
+ {
579
+ "epoch": 2.94,
580
+ "learning_rate": 1.4113551490600672e-05,
581
+ "loss": 1.7539,
582
+ "step": 46500
583
+ },
584
+ {
585
+ "epoch": 2.97,
586
+ "learning_rate": 1.4050256345338314e-05,
587
+ "loss": 1.7529,
588
+ "step": 47000
589
+ },
590
+ {
591
+ "epoch": 3.0,
592
+ "eval_loss": 1.4882566928863525,
593
+ "eval_runtime": 671.3515,
594
+ "eval_samples_per_second": 376.52,
595
+ "eval_steps_per_second": 23.533,
596
+ "step": 47397
597
+ },
598
+ {
599
+ "epoch": 3.01,
600
+ "learning_rate": 1.3986961200075955e-05,
601
+ "loss": 1.7233,
602
+ "step": 47500
603
+ },
604
+ {
605
+ "epoch": 3.04,
606
+ "learning_rate": 1.3923666054813596e-05,
607
+ "loss": 1.6255,
608
+ "step": 48000
609
+ },
610
+ {
611
+ "epoch": 3.07,
612
+ "learning_rate": 1.386037090955124e-05,
613
+ "loss": 1.6566,
614
+ "step": 48500
615
+ },
616
+ {
617
+ "epoch": 3.1,
618
+ "learning_rate": 1.379707576428888e-05,
619
+ "loss": 1.6442,
620
+ "step": 49000
621
+ },
622
+ {
623
+ "epoch": 3.13,
624
+ "learning_rate": 1.3733780619026522e-05,
625
+ "loss": 1.6439,
626
+ "step": 49500
627
+ },
628
+ {
629
+ "epoch": 3.16,
630
+ "learning_rate": 1.3670485473764163e-05,
631
+ "loss": 1.6438,
632
+ "step": 50000
633
+ },
634
+ {
635
+ "epoch": 3.2,
636
+ "learning_rate": 1.3607190328501805e-05,
637
+ "loss": 1.6527,
638
+ "step": 50500
639
+ },
640
+ {
641
+ "epoch": 3.23,
642
+ "learning_rate": 1.3543895183239446e-05,
643
+ "loss": 1.6426,
644
+ "step": 51000
645
+ },
646
+ {
647
+ "epoch": 3.26,
648
+ "learning_rate": 1.3480600037977087e-05,
649
+ "loss": 1.6802,
650
+ "step": 51500
651
+ },
652
+ {
653
+ "epoch": 3.29,
654
+ "learning_rate": 1.341730489271473e-05,
655
+ "loss": 1.6568,
656
+ "step": 52000
657
+ },
658
+ {
659
+ "epoch": 3.32,
660
+ "learning_rate": 1.3354009747452372e-05,
661
+ "loss": 1.6657,
662
+ "step": 52500
663
+ },
664
+ {
665
+ "epoch": 3.35,
666
+ "learning_rate": 1.3290714602190013e-05,
667
+ "loss": 1.6734,
668
+ "step": 53000
669
+ },
670
+ {
671
+ "epoch": 3.39,
672
+ "learning_rate": 1.3227419456927654e-05,
673
+ "loss": 1.655,
674
+ "step": 53500
675
+ },
676
+ {
677
+ "epoch": 3.42,
678
+ "learning_rate": 1.3164124311665296e-05,
679
+ "loss": 1.6831,
680
+ "step": 54000
681
+ },
682
+ {
683
+ "epoch": 3.45,
684
+ "learning_rate": 1.3100829166402937e-05,
685
+ "loss": 1.6532,
686
+ "step": 54500
687
+ },
688
+ {
689
+ "epoch": 3.48,
690
+ "learning_rate": 1.3037534021140578e-05,
691
+ "loss": 1.6649,
692
+ "step": 55000
693
+ },
694
+ {
695
+ "epoch": 3.51,
696
+ "learning_rate": 1.2974238875878221e-05,
697
+ "loss": 1.6643,
698
+ "step": 55500
699
+ },
700
+ {
701
+ "epoch": 3.54,
702
+ "learning_rate": 1.2910943730615863e-05,
703
+ "loss": 1.6749,
704
+ "step": 56000
705
+ },
706
+ {
707
+ "epoch": 3.58,
708
+ "learning_rate": 1.2847648585353504e-05,
709
+ "loss": 1.6802,
710
+ "step": 56500
711
+ },
712
+ {
713
+ "epoch": 3.61,
714
+ "learning_rate": 1.2784353440091145e-05,
715
+ "loss": 1.6753,
716
+ "step": 57000
717
+ },
718
+ {
719
+ "epoch": 3.64,
720
+ "learning_rate": 1.2721058294828787e-05,
721
+ "loss": 1.6759,
722
+ "step": 57500
723
+ },
724
+ {
725
+ "epoch": 3.67,
726
+ "learning_rate": 1.2657763149566428e-05,
727
+ "loss": 1.6756,
728
+ "step": 58000
729
+ },
730
+ {
731
+ "epoch": 3.7,
732
+ "learning_rate": 1.259446800430407e-05,
733
+ "loss": 1.6733,
734
+ "step": 58500
735
+ },
736
+ {
737
+ "epoch": 3.73,
738
+ "learning_rate": 1.253117285904171e-05,
739
+ "loss": 1.671,
740
+ "step": 59000
741
+ },
742
+ {
743
+ "epoch": 3.77,
744
+ "learning_rate": 1.2467877713779355e-05,
745
+ "loss": 1.6697,
746
+ "step": 59500
747
+ },
748
+ {
749
+ "epoch": 3.8,
750
+ "learning_rate": 1.2404582568516997e-05,
751
+ "loss": 1.668,
752
+ "step": 60000
753
+ },
754
+ {
755
+ "epoch": 3.83,
756
+ "learning_rate": 1.2341287423254638e-05,
757
+ "loss": 1.6689,
758
+ "step": 60500
759
+ },
760
+ {
761
+ "epoch": 3.86,
762
+ "learning_rate": 1.227799227799228e-05,
763
+ "loss": 1.6874,
764
+ "step": 61000
765
+ },
766
+ {
767
+ "epoch": 3.89,
768
+ "learning_rate": 1.221469713272992e-05,
769
+ "loss": 1.6926,
770
+ "step": 61500
771
+ },
772
+ {
773
+ "epoch": 3.92,
774
+ "learning_rate": 1.2151401987467562e-05,
775
+ "loss": 1.6819,
776
+ "step": 62000
777
+ },
778
+ {
779
+ "epoch": 3.96,
780
+ "learning_rate": 1.2088106842205203e-05,
781
+ "loss": 1.6599,
782
+ "step": 62500
783
+ },
784
+ {
785
+ "epoch": 3.99,
786
+ "learning_rate": 1.2024811696942846e-05,
787
+ "loss": 1.6886,
788
+ "step": 63000
789
+ },
790
+ {
791
+ "epoch": 4.0,
792
+ "eval_loss": 1.417983055114746,
793
+ "eval_runtime": 634.8433,
794
+ "eval_samples_per_second": 398.172,
795
+ "eval_steps_per_second": 24.886,
796
+ "step": 63196
797
+ },
798
+ {
799
+ "epoch": 4.02,
800
+ "learning_rate": 1.1961516551680488e-05,
801
+ "loss": 1.6122,
802
+ "step": 63500
803
+ },
804
+ {
805
+ "epoch": 4.05,
806
+ "learning_rate": 1.1898221406418129e-05,
807
+ "loss": 1.578,
808
+ "step": 64000
809
+ },
810
+ {
811
+ "epoch": 4.08,
812
+ "learning_rate": 1.183492626115577e-05,
813
+ "loss": 1.5662,
814
+ "step": 64500
815
+ },
816
+ {
817
+ "epoch": 4.11,
818
+ "learning_rate": 1.1771631115893412e-05,
819
+ "loss": 1.5732,
820
+ "step": 65000
821
+ },
822
+ {
823
+ "epoch": 4.15,
824
+ "learning_rate": 1.1708335970631053e-05,
825
+ "loss": 1.5726,
826
+ "step": 65500
827
+ },
828
+ {
829
+ "epoch": 4.18,
830
+ "learning_rate": 1.1645040825368694e-05,
831
+ "loss": 1.5868,
832
+ "step": 66000
833
+ },
834
+ {
835
+ "epoch": 4.21,
836
+ "learning_rate": 1.1581745680106337e-05,
837
+ "loss": 1.5781,
838
+ "step": 66500
839
+ },
840
+ {
841
+ "epoch": 4.24,
842
+ "learning_rate": 1.1518450534843979e-05,
843
+ "loss": 1.5965,
844
+ "step": 67000
845
+ },
846
+ {
847
+ "epoch": 4.27,
848
+ "learning_rate": 1.145515538958162e-05,
849
+ "loss": 1.5934,
850
+ "step": 67500
851
+ },
852
+ {
853
+ "epoch": 4.3,
854
+ "learning_rate": 1.1391860244319261e-05,
855
+ "loss": 1.5791,
856
+ "step": 68000
857
+ },
858
+ {
859
+ "epoch": 4.34,
860
+ "learning_rate": 1.1328565099056903e-05,
861
+ "loss": 1.6037,
862
+ "step": 68500
863
+ },
864
+ {
865
+ "epoch": 4.37,
866
+ "learning_rate": 1.1265269953794544e-05,
867
+ "loss": 1.6046,
868
+ "step": 69000
869
+ },
870
+ {
871
+ "epoch": 4.4,
872
+ "learning_rate": 1.1201974808532185e-05,
873
+ "loss": 1.5903,
874
+ "step": 69500
875
+ },
876
+ {
877
+ "epoch": 4.43,
878
+ "learning_rate": 1.1138679663269828e-05,
879
+ "loss": 1.5837,
880
+ "step": 70000
881
+ },
882
+ {
883
+ "epoch": 4.46,
884
+ "learning_rate": 1.107538451800747e-05,
885
+ "loss": 1.6162,
886
+ "step": 70500
887
+ },
888
+ {
889
+ "epoch": 4.49,
890
+ "learning_rate": 1.1012089372745111e-05,
891
+ "loss": 1.5988,
892
+ "step": 71000
893
+ },
894
+ {
895
+ "epoch": 4.53,
896
+ "learning_rate": 1.0948794227482752e-05,
897
+ "loss": 1.6082,
898
+ "step": 71500
899
+ },
900
+ {
901
+ "epoch": 4.56,
902
+ "learning_rate": 1.0885499082220394e-05,
903
+ "loss": 1.5832,
904
+ "step": 72000
905
+ },
906
+ {
907
+ "epoch": 4.59,
908
+ "learning_rate": 1.0822203936958035e-05,
909
+ "loss": 1.6153,
910
+ "step": 72500
911
+ },
912
+ {
913
+ "epoch": 4.62,
914
+ "learning_rate": 1.0758908791695676e-05,
915
+ "loss": 1.6178,
916
+ "step": 73000
917
+ },
918
+ {
919
+ "epoch": 4.65,
920
+ "learning_rate": 1.0695613646433321e-05,
921
+ "loss": 1.5981,
922
+ "step": 73500
923
+ },
924
+ {
925
+ "epoch": 4.68,
926
+ "learning_rate": 1.0632318501170963e-05,
927
+ "loss": 1.6135,
928
+ "step": 74000
929
+ },
930
+ {
931
+ "epoch": 4.72,
932
+ "learning_rate": 1.0569023355908604e-05,
933
+ "loss": 1.6122,
934
+ "step": 74500
935
+ },
936
+ {
937
+ "epoch": 4.75,
938
+ "learning_rate": 1.0505728210646245e-05,
939
+ "loss": 1.5929,
940
+ "step": 75000
941
+ },
942
+ {
943
+ "epoch": 4.78,
944
+ "learning_rate": 1.0442433065383886e-05,
945
+ "loss": 1.6069,
946
+ "step": 75500
947
+ },
948
+ {
949
+ "epoch": 4.81,
950
+ "learning_rate": 1.0379137920121528e-05,
951
+ "loss": 1.6025,
952
+ "step": 76000
953
+ },
954
+ {
955
+ "epoch": 4.84,
956
+ "learning_rate": 1.0315842774859167e-05,
957
+ "loss": 1.6284,
958
+ "step": 76500
959
+ },
960
+ {
961
+ "epoch": 4.87,
962
+ "learning_rate": 1.0252547629596812e-05,
963
+ "loss": 1.6134,
964
+ "step": 77000
965
+ },
966
+ {
967
+ "epoch": 4.91,
968
+ "learning_rate": 1.0189252484334454e-05,
969
+ "loss": 1.6092,
970
+ "step": 77500
971
+ },
972
+ {
973
+ "epoch": 4.94,
974
+ "learning_rate": 1.0125957339072095e-05,
975
+ "loss": 1.6194,
976
+ "step": 78000
977
+ },
978
+ {
979
+ "epoch": 4.97,
980
+ "learning_rate": 1.0062662193809736e-05,
981
+ "loss": 1.6227,
982
+ "step": 78500
983
+ },
984
+ {
985
+ "epoch": 5.0,
986
+ "eval_loss": 1.3593807220458984,
987
+ "eval_runtime": 634.5713,
988
+ "eval_samples_per_second": 398.343,
989
+ "eval_steps_per_second": 24.897,
990
+ "step": 78995
991
+ },
992
+ {
993
+ "epoch": 5.0,
994
+ "learning_rate": 9.999367048547378e-06,
995
+ "loss": 1.6451,
996
+ "step": 79000
997
+ },
998
+ {
999
+ "epoch": 5.03,
1000
+ "learning_rate": 9.936071903285019e-06,
1001
+ "loss": 1.5186,
1002
+ "step": 79500
1003
+ },
1004
+ {
1005
+ "epoch": 5.06,
1006
+ "learning_rate": 9.87277675802266e-06,
1007
+ "loss": 1.5124,
1008
+ "step": 80000
1009
+ },
1010
+ {
1011
+ "epoch": 5.1,
1012
+ "learning_rate": 9.809481612760301e-06,
1013
+ "loss": 1.5223,
1014
+ "step": 80500
1015
+ },
1016
+ {
1017
+ "epoch": 5.13,
1018
+ "learning_rate": 9.746186467497943e-06,
1019
+ "loss": 1.5234,
1020
+ "step": 81000
1021
+ },
1022
+ {
1023
+ "epoch": 5.16,
1024
+ "learning_rate": 9.682891322235586e-06,
1025
+ "loss": 1.5298,
1026
+ "step": 81500
1027
+ },
1028
+ {
1029
+ "epoch": 5.19,
1030
+ "learning_rate": 9.619596176973227e-06,
1031
+ "loss": 1.5259,
1032
+ "step": 82000
1033
+ },
1034
+ {
1035
+ "epoch": 5.22,
1036
+ "learning_rate": 9.556301031710869e-06,
1037
+ "loss": 1.5463,
1038
+ "step": 82500
1039
+ },
1040
+ {
1041
+ "epoch": 5.25,
1042
+ "learning_rate": 9.49300588644851e-06,
1043
+ "loss": 1.5367,
1044
+ "step": 83000
1045
+ },
1046
+ {
1047
+ "epoch": 5.29,
1048
+ "learning_rate": 9.429710741186153e-06,
1049
+ "loss": 1.543,
1050
+ "step": 83500
1051
+ },
1052
+ {
1053
+ "epoch": 5.32,
1054
+ "learning_rate": 9.366415595923794e-06,
1055
+ "loss": 1.5379,
1056
+ "step": 84000
1057
+ },
1058
+ {
1059
+ "epoch": 5.35,
1060
+ "learning_rate": 9.303120450661436e-06,
1061
+ "loss": 1.5215,
1062
+ "step": 84500
1063
+ },
1064
+ {
1065
+ "epoch": 5.38,
1066
+ "learning_rate": 9.239825305399077e-06,
1067
+ "loss": 1.5339,
1068
+ "step": 85000
1069
+ },
1070
+ {
1071
+ "epoch": 5.41,
1072
+ "learning_rate": 9.176530160136718e-06,
1073
+ "loss": 1.5588,
1074
+ "step": 85500
1075
+ },
1076
+ {
1077
+ "epoch": 5.44,
1078
+ "learning_rate": 9.11323501487436e-06,
1079
+ "loss": 1.5522,
1080
+ "step": 86000
1081
+ },
1082
+ {
1083
+ "epoch": 5.48,
1084
+ "learning_rate": 9.049939869612e-06,
1085
+ "loss": 1.5516,
1086
+ "step": 86500
1087
+ },
1088
+ {
1089
+ "epoch": 5.51,
1090
+ "learning_rate": 8.986644724349644e-06,
1091
+ "loss": 1.5503,
1092
+ "step": 87000
1093
+ },
1094
+ {
1095
+ "epoch": 5.54,
1096
+ "learning_rate": 8.923349579087285e-06,
1097
+ "loss": 1.5459,
1098
+ "step": 87500
1099
+ },
1100
+ {
1101
+ "epoch": 5.57,
1102
+ "learning_rate": 8.860054433824927e-06,
1103
+ "loss": 1.5437,
1104
+ "step": 88000
1105
+ },
1106
+ {
1107
+ "epoch": 5.6,
1108
+ "learning_rate": 8.796759288562568e-06,
1109
+ "loss": 1.5452,
1110
+ "step": 88500
1111
+ },
1112
+ {
1113
+ "epoch": 5.63,
1114
+ "learning_rate": 8.73346414330021e-06,
1115
+ "loss": 1.5434,
1116
+ "step": 89000
1117
+ },
1118
+ {
1119
+ "epoch": 5.66,
1120
+ "learning_rate": 8.67016899803785e-06,
1121
+ "loss": 1.5633,
1122
+ "step": 89500
1123
+ },
1124
+ {
1125
+ "epoch": 5.7,
1126
+ "learning_rate": 8.606873852775492e-06,
1127
+ "loss": 1.5535,
1128
+ "step": 90000
1129
+ },
1130
+ {
1131
+ "epoch": 5.73,
1132
+ "learning_rate": 8.543578707513135e-06,
1133
+ "loss": 1.5692,
1134
+ "step": 90500
1135
+ },
1136
+ {
1137
+ "epoch": 5.76,
1138
+ "learning_rate": 8.480283562250776e-06,
1139
+ "loss": 1.5609,
1140
+ "step": 91000
1141
+ },
1142
+ {
1143
+ "epoch": 5.79,
1144
+ "learning_rate": 8.416988416988418e-06,
1145
+ "loss": 1.5529,
1146
+ "step": 91500
1147
+ },
1148
+ {
1149
+ "epoch": 5.82,
1150
+ "learning_rate": 8.353693271726059e-06,
1151
+ "loss": 1.5602,
1152
+ "step": 92000
1153
+ },
1154
+ {
1155
+ "epoch": 5.85,
1156
+ "learning_rate": 8.290398126463702e-06,
1157
+ "loss": 1.5547,
1158
+ "step": 92500
1159
+ },
1160
+ {
1161
+ "epoch": 5.89,
1162
+ "learning_rate": 8.227102981201343e-06,
1163
+ "loss": 1.5557,
1164
+ "step": 93000
1165
+ },
1166
+ {
1167
+ "epoch": 5.92,
1168
+ "learning_rate": 8.163807835938985e-06,
1169
+ "loss": 1.5488,
1170
+ "step": 93500
1171
+ },
1172
+ {
1173
+ "epoch": 5.95,
1174
+ "learning_rate": 8.100512690676626e-06,
1175
+ "loss": 1.5736,
1176
+ "step": 94000
1177
+ },
1178
+ {
1179
+ "epoch": 5.98,
1180
+ "learning_rate": 8.037217545414267e-06,
1181
+ "loss": 1.559,
1182
+ "step": 94500
1183
+ },
1184
+ {
1185
+ "epoch": 6.0,
1186
+ "eval_loss": 1.3149573802947998,
1187
+ "eval_runtime": 678.6783,
1188
+ "eval_samples_per_second": 372.455,
1189
+ "eval_steps_per_second": 23.279,
1190
+ "step": 94794
1191
+ },
1192
+ {
1193
+ "epoch": 6.01,
1194
+ "learning_rate": 7.973922400151909e-06,
1195
+ "loss": 1.5248,
1196
+ "step": 95000
1197
+ },
1198
+ {
1199
+ "epoch": 6.04,
1200
+ "learning_rate": 7.91062725488955e-06,
1201
+ "loss": 1.4873,
1202
+ "step": 95500
1203
+ },
1204
+ {
1205
+ "epoch": 6.08,
1206
+ "learning_rate": 7.847332109627193e-06,
1207
+ "loss": 1.4885,
1208
+ "step": 96000
1209
+ },
1210
+ {
1211
+ "epoch": 6.11,
1212
+ "learning_rate": 7.784036964364834e-06,
1213
+ "loss": 1.4882,
1214
+ "step": 96500
1215
+ },
1216
+ {
1217
+ "epoch": 6.14,
1218
+ "learning_rate": 7.720741819102476e-06,
1219
+ "loss": 1.499,
1220
+ "step": 97000
1221
+ },
1222
+ {
1223
+ "epoch": 6.17,
1224
+ "learning_rate": 7.657446673840117e-06,
1225
+ "loss": 1.493,
1226
+ "step": 97500
1227
+ },
1228
+ {
1229
+ "epoch": 6.2,
1230
+ "learning_rate": 7.594151528577759e-06,
1231
+ "loss": 1.4864,
1232
+ "step": 98000
1233
+ },
1234
+ {
1235
+ "epoch": 6.23,
1236
+ "learning_rate": 7.5308563833154e-06,
1237
+ "loss": 1.4889,
1238
+ "step": 98500
1239
+ },
1240
+ {
1241
+ "epoch": 6.27,
1242
+ "learning_rate": 7.467561238053042e-06,
1243
+ "loss": 1.5047,
1244
+ "step": 99000
1245
+ },
1246
+ {
1247
+ "epoch": 6.3,
1248
+ "learning_rate": 7.404266092790684e-06,
1249
+ "loss": 1.4828,
1250
+ "step": 99500
1251
+ },
1252
+ {
1253
+ "epoch": 6.33,
1254
+ "learning_rate": 7.340970947528325e-06,
1255
+ "loss": 1.4884,
1256
+ "step": 100000
1257
+ },
1258
+ {
1259
+ "epoch": 6.36,
1260
+ "learning_rate": 7.2776758022659665e-06,
1261
+ "loss": 1.4981,
1262
+ "step": 100500
1263
+ },
1264
+ {
1265
+ "epoch": 6.39,
1266
+ "learning_rate": 7.214380657003608e-06,
1267
+ "loss": 1.494,
1268
+ "step": 101000
1269
+ },
1270
+ {
1271
+ "epoch": 6.42,
1272
+ "learning_rate": 7.15108551174125e-06,
1273
+ "loss": 1.4798,
1274
+ "step": 101500
1275
+ },
1276
+ {
1277
+ "epoch": 6.46,
1278
+ "learning_rate": 7.087790366478891e-06,
1279
+ "loss": 1.498,
1280
+ "step": 102000
1281
+ },
1282
+ {
1283
+ "epoch": 6.49,
1284
+ "learning_rate": 7.024495221216533e-06,
1285
+ "loss": 1.496,
1286
+ "step": 102500
1287
+ },
1288
+ {
1289
+ "epoch": 6.52,
1290
+ "learning_rate": 6.961200075954176e-06,
1291
+ "loss": 1.5097,
1292
+ "step": 103000
1293
+ },
1294
+ {
1295
+ "epoch": 6.55,
1296
+ "learning_rate": 6.897904930691817e-06,
1297
+ "loss": 1.5032,
1298
+ "step": 103500
1299
+ },
1300
+ {
1301
+ "epoch": 6.58,
1302
+ "learning_rate": 6.8346097854294576e-06,
1303
+ "loss": 1.5001,
1304
+ "step": 104000
1305
+ },
1306
+ {
1307
+ "epoch": 6.61,
1308
+ "learning_rate": 6.771314640167099e-06,
1309
+ "loss": 1.5097,
1310
+ "step": 104500
1311
+ },
1312
+ {
1313
+ "epoch": 6.65,
1314
+ "learning_rate": 6.708019494904742e-06,
1315
+ "loss": 1.5065,
1316
+ "step": 105000
1317
+ },
1318
+ {
1319
+ "epoch": 6.68,
1320
+ "learning_rate": 6.644724349642383e-06,
1321
+ "loss": 1.4961,
1322
+ "step": 105500
1323
+ },
1324
+ {
1325
+ "epoch": 6.71,
1326
+ "learning_rate": 6.5814292043800246e-06,
1327
+ "loss": 1.5092,
1328
+ "step": 106000
1329
+ },
1330
+ {
1331
+ "epoch": 6.74,
1332
+ "learning_rate": 6.518134059117667e-06,
1333
+ "loss": 1.5079,
1334
+ "step": 106500
1335
+ },
1336
+ {
1337
+ "epoch": 6.77,
1338
+ "learning_rate": 6.454838913855308e-06,
1339
+ "loss": 1.513,
1340
+ "step": 107000
1341
+ },
1342
+ {
1343
+ "epoch": 6.8,
1344
+ "learning_rate": 6.391543768592949e-06,
1345
+ "loss": 1.5076,
1346
+ "step": 107500
1347
+ },
1348
+ {
1349
+ "epoch": 6.84,
1350
+ "learning_rate": 6.328248623330591e-06,
1351
+ "loss": 1.5123,
1352
+ "step": 108000
1353
+ },
1354
+ {
1355
+ "epoch": 6.87,
1356
+ "learning_rate": 6.264953478068233e-06,
1357
+ "loss": 1.5117,
1358
+ "step": 108500
1359
+ },
1360
+ {
1361
+ "epoch": 6.9,
1362
+ "learning_rate": 6.201658332805874e-06,
1363
+ "loss": 1.5056,
1364
+ "step": 109000
1365
+ },
1366
+ {
1367
+ "epoch": 6.93,
1368
+ "learning_rate": 6.1383631875435156e-06,
1369
+ "loss": 1.517,
1370
+ "step": 109500
1371
+ },
1372
+ {
1373
+ "epoch": 6.96,
1374
+ "learning_rate": 6.075068042281157e-06,
1375
+ "loss": 1.515,
1376
+ "step": 110000
1377
+ },
1378
+ {
1379
+ "epoch": 6.99,
1380
+ "learning_rate": 6.011772897018799e-06,
1381
+ "loss": 1.5193,
1382
+ "step": 110500
1383
+ },
1384
+ {
1385
+ "epoch": 7.0,
1386
+ "eval_loss": 1.2794440984725952,
1387
+ "eval_runtime": 637.2277,
1388
+ "eval_samples_per_second": 396.682,
1389
+ "eval_steps_per_second": 24.793,
1390
+ "step": 110593
1391
+ },
1392
+ {
1393
+ "epoch": 7.03,
1394
+ "learning_rate": 5.94847775175644e-06,
1395
+ "loss": 1.4557,
1396
+ "step": 111000
1397
+ },
1398
+ {
1399
+ "epoch": 7.06,
1400
+ "learning_rate": 5.885182606494082e-06,
1401
+ "loss": 1.4395,
1402
+ "step": 111500
1403
+ },
1404
+ {
1405
+ "epoch": 7.09,
1406
+ "learning_rate": 5.821887461231725e-06,
1407
+ "loss": 1.4518,
1408
+ "step": 112000
1409
+ },
1410
+ {
1411
+ "epoch": 7.12,
1412
+ "learning_rate": 5.758592315969366e-06,
1413
+ "loss": 1.4513,
1414
+ "step": 112500
1415
+ },
1416
+ {
1417
+ "epoch": 7.15,
1418
+ "learning_rate": 5.695297170707007e-06,
1419
+ "loss": 1.454,
1420
+ "step": 113000
1421
+ },
1422
+ {
1423
+ "epoch": 7.18,
1424
+ "learning_rate": 5.632002025444649e-06,
1425
+ "loss": 1.4597,
1426
+ "step": 113500
1427
+ },
1428
+ {
1429
+ "epoch": 7.22,
1430
+ "learning_rate": 5.568706880182291e-06,
1431
+ "loss": 1.4383,
1432
+ "step": 114000
1433
+ },
1434
+ {
1435
+ "epoch": 7.25,
1436
+ "learning_rate": 5.505411734919932e-06,
1437
+ "loss": 1.4529,
1438
+ "step": 114500
1439
+ },
1440
+ {
1441
+ "epoch": 7.28,
1442
+ "learning_rate": 5.442116589657574e-06,
1443
+ "loss": 1.4706,
1444
+ "step": 115000
1445
+ },
1446
+ {
1447
+ "epoch": 7.31,
1448
+ "learning_rate": 5.378821444395216e-06,
1449
+ "loss": 1.4576,
1450
+ "step": 115500
1451
+ },
1452
+ {
1453
+ "epoch": 7.34,
1454
+ "learning_rate": 5.315526299132857e-06,
1455
+ "loss": 1.4681,
1456
+ "step": 116000
1457
+ },
1458
+ {
1459
+ "epoch": 7.37,
1460
+ "learning_rate": 5.252231153870498e-06,
1461
+ "loss": 1.4537,
1462
+ "step": 116500
1463
+ },
1464
+ {
1465
+ "epoch": 7.41,
1466
+ "learning_rate": 5.18893600860814e-06,
1467
+ "loss": 1.4583,
1468
+ "step": 117000
1469
+ },
1470
+ {
1471
+ "epoch": 7.44,
1472
+ "learning_rate": 5.125640863345782e-06,
1473
+ "loss": 1.4645,
1474
+ "step": 117500
1475
+ },
1476
+ {
1477
+ "epoch": 7.47,
1478
+ "learning_rate": 5.062345718083423e-06,
1479
+ "loss": 1.455,
1480
+ "step": 118000
1481
+ },
1482
+ {
1483
+ "epoch": 7.5,
1484
+ "learning_rate": 4.999050572821065e-06,
1485
+ "loss": 1.4821,
1486
+ "step": 118500
1487
+ },
1488
+ {
1489
+ "epoch": 7.53,
1490
+ "learning_rate": 4.935755427558707e-06,
1491
+ "loss": 1.4605,
1492
+ "step": 119000
1493
+ },
1494
+ {
1495
+ "epoch": 7.56,
1496
+ "learning_rate": 4.872460282296348e-06,
1497
+ "loss": 1.4621,
1498
+ "step": 119500
1499
+ },
1500
+ {
1501
+ "epoch": 7.6,
1502
+ "learning_rate": 4.8091651370339894e-06,
1503
+ "loss": 1.4601,
1504
+ "step": 120000
1505
+ },
1506
+ {
1507
+ "epoch": 7.63,
1508
+ "learning_rate": 4.745869991771632e-06,
1509
+ "loss": 1.4648,
1510
+ "step": 120500
1511
+ },
1512
+ {
1513
+ "epoch": 7.66,
1514
+ "learning_rate": 4.682574846509273e-06,
1515
+ "loss": 1.4723,
1516
+ "step": 121000
1517
+ },
1518
+ {
1519
+ "epoch": 7.69,
1520
+ "learning_rate": 4.619279701246915e-06,
1521
+ "loss": 1.4733,
1522
+ "step": 121500
1523
+ },
1524
+ {
1525
+ "epoch": 7.72,
1526
+ "learning_rate": 4.5559845559845564e-06,
1527
+ "loss": 1.4723,
1528
+ "step": 122000
1529
+ },
1530
+ {
1531
+ "epoch": 7.75,
1532
+ "learning_rate": 4.492689410722198e-06,
1533
+ "loss": 1.4788,
1534
+ "step": 122500
1535
+ },
1536
+ {
1537
+ "epoch": 7.79,
1538
+ "learning_rate": 4.42939426545984e-06,
1539
+ "loss": 1.4665,
1540
+ "step": 123000
1541
+ },
1542
+ {
1543
+ "epoch": 7.82,
1544
+ "learning_rate": 4.366099120197481e-06,
1545
+ "loss": 1.4699,
1546
+ "step": 123500
1547
+ },
1548
+ {
1549
+ "epoch": 7.85,
1550
+ "learning_rate": 4.3028039749351235e-06,
1551
+ "loss": 1.4908,
1552
+ "step": 124000
1553
+ },
1554
+ {
1555
+ "epoch": 7.88,
1556
+ "learning_rate": 4.239508829672764e-06,
1557
+ "loss": 1.4712,
1558
+ "step": 124500
1559
+ },
1560
+ {
1561
+ "epoch": 7.91,
1562
+ "learning_rate": 4.176213684410406e-06,
1563
+ "loss": 1.4722,
1564
+ "step": 125000
1565
+ },
1566
+ {
1567
+ "epoch": 7.94,
1568
+ "learning_rate": 4.1129185391480474e-06,
1569
+ "loss": 1.4856,
1570
+ "step": 125500
1571
+ },
1572
+ {
1573
+ "epoch": 7.98,
1574
+ "learning_rate": 4.04962339388569e-06,
1575
+ "loss": 1.4793,
1576
+ "step": 126000
1577
+ },
1578
+ {
1579
+ "epoch": 8.0,
1580
+ "eval_loss": 1.2516653537750244,
1581
+ "eval_runtime": 654.6089,
1582
+ "eval_samples_per_second": 386.15,
1583
+ "eval_steps_per_second": 24.135,
1584
+ "step": 126392
1585
+ },
1586
+ {
1587
+ "epoch": 8.01,
1588
+ "learning_rate": 3.986328248623331e-06,
1589
+ "loss": 1.4563,
1590
+ "step": 126500
1591
+ },
1592
+ {
1593
+ "epoch": 8.04,
1594
+ "learning_rate": 3.923033103360972e-06,
1595
+ "loss": 1.4263,
1596
+ "step": 127000
1597
+ },
1598
+ {
1599
+ "epoch": 8.07,
1600
+ "learning_rate": 3.8597379580986145e-06,
1601
+ "loss": 1.4301,
1602
+ "step": 127500
1603
+ },
1604
+ {
1605
+ "epoch": 8.1,
1606
+ "learning_rate": 3.7964428128362558e-06,
1607
+ "loss": 1.43,
1608
+ "step": 128000
1609
+ },
1610
+ {
1611
+ "epoch": 8.13,
1612
+ "learning_rate": 3.7331476675738975e-06,
1613
+ "loss": 1.4355,
1614
+ "step": 128500
1615
+ },
1616
+ {
1617
+ "epoch": 8.17,
1618
+ "learning_rate": 3.669852522311539e-06,
1619
+ "loss": 1.4384,
1620
+ "step": 129000
1621
+ },
1622
+ {
1623
+ "epoch": 8.2,
1624
+ "learning_rate": 3.6065573770491806e-06,
1625
+ "loss": 1.4398,
1626
+ "step": 129500
1627
+ },
1628
+ {
1629
+ "epoch": 8.23,
1630
+ "learning_rate": 3.543262231786822e-06,
1631
+ "loss": 1.425,
1632
+ "step": 130000
1633
+ },
1634
+ {
1635
+ "epoch": 8.26,
1636
+ "learning_rate": 3.4799670865244637e-06,
1637
+ "loss": 1.423,
1638
+ "step": 130500
1639
+ },
1640
+ {
1641
+ "epoch": 8.29,
1642
+ "learning_rate": 3.416671941262105e-06,
1643
+ "loss": 1.4278,
1644
+ "step": 131000
1645
+ },
1646
+ {
1647
+ "epoch": 8.32,
1648
+ "learning_rate": 3.3533767959997472e-06,
1649
+ "loss": 1.4368,
1650
+ "step": 131500
1651
+ },
1652
+ {
1653
+ "epoch": 8.35,
1654
+ "learning_rate": 3.290081650737389e-06,
1655
+ "loss": 1.4351,
1656
+ "step": 132000
1657
+ },
1658
+ {
1659
+ "epoch": 8.39,
1660
+ "learning_rate": 3.2267865054750303e-06,
1661
+ "loss": 1.4351,
1662
+ "step": 132500
1663
+ },
1664
+ {
1665
+ "epoch": 8.42,
1666
+ "learning_rate": 3.163491360212672e-06,
1667
+ "loss": 1.4299,
1668
+ "step": 133000
1669
+ },
1670
+ {
1671
+ "epoch": 8.45,
1672
+ "learning_rate": 3.1001962149503134e-06,
1673
+ "loss": 1.4265,
1674
+ "step": 133500
1675
+ },
1676
+ {
1677
+ "epoch": 8.48,
1678
+ "learning_rate": 3.036901069687955e-06,
1679
+ "loss": 1.4468,
1680
+ "step": 134000
1681
+ },
1682
+ {
1683
+ "epoch": 8.51,
1684
+ "learning_rate": 2.9736059244255965e-06,
1685
+ "loss": 1.4389,
1686
+ "step": 134500
1687
+ },
1688
+ {
1689
+ "epoch": 8.54,
1690
+ "learning_rate": 2.9103107791632386e-06,
1691
+ "loss": 1.4199,
1692
+ "step": 135000
1693
+ },
1694
+ {
1695
+ "epoch": 8.58,
1696
+ "learning_rate": 2.84701563390088e-06,
1697
+ "loss": 1.4361,
1698
+ "step": 135500
1699
+ },
1700
+ {
1701
+ "epoch": 8.61,
1702
+ "learning_rate": 2.7837204886385217e-06,
1703
+ "loss": 1.4401,
1704
+ "step": 136000
1705
+ },
1706
+ {
1707
+ "epoch": 8.64,
1708
+ "learning_rate": 2.7204253433761635e-06,
1709
+ "loss": 1.4423,
1710
+ "step": 136500
1711
+ },
1712
+ {
1713
+ "epoch": 8.67,
1714
+ "learning_rate": 2.657130198113805e-06,
1715
+ "loss": 1.4266,
1716
+ "step": 137000
1717
+ },
1718
+ {
1719
+ "epoch": 8.7,
1720
+ "learning_rate": 2.5938350528514466e-06,
1721
+ "loss": 1.4406,
1722
+ "step": 137500
1723
+ },
1724
+ {
1725
+ "epoch": 8.73,
1726
+ "learning_rate": 2.530539907589088e-06,
1727
+ "loss": 1.441,
1728
+ "step": 138000
1729
+ },
1730
+ {
1731
+ "epoch": 8.77,
1732
+ "learning_rate": 2.4672447623267296e-06,
1733
+ "loss": 1.4551,
1734
+ "step": 138500
1735
+ },
1736
+ {
1737
+ "epoch": 8.8,
1738
+ "learning_rate": 2.4039496170643714e-06,
1739
+ "loss": 1.4452,
1740
+ "step": 139000
1741
+ },
1742
+ {
1743
+ "epoch": 8.83,
1744
+ "learning_rate": 2.340654471802013e-06,
1745
+ "loss": 1.4392,
1746
+ "step": 139500
1747
+ },
1748
+ {
1749
+ "epoch": 8.86,
1750
+ "learning_rate": 2.2773593265396545e-06,
1751
+ "loss": 1.4361,
1752
+ "step": 140000
1753
+ },
1754
+ {
1755
+ "epoch": 8.89,
1756
+ "learning_rate": 2.2140641812772962e-06,
1757
+ "loss": 1.4313,
1758
+ "step": 140500
1759
+ },
1760
+ {
1761
+ "epoch": 8.92,
1762
+ "learning_rate": 2.1507690360149376e-06,
1763
+ "loss": 1.4323,
1764
+ "step": 141000
1765
+ },
1766
+ {
1767
+ "epoch": 8.96,
1768
+ "learning_rate": 2.0874738907525793e-06,
1769
+ "loss": 1.4266,
1770
+ "step": 141500
1771
+ },
1772
+ {
1773
+ "epoch": 8.99,
1774
+ "learning_rate": 2.024178745490221e-06,
1775
+ "loss": 1.4354,
1776
+ "step": 142000
1777
+ },
1778
+ {
1779
+ "epoch": 9.0,
1780
+ "eval_loss": 1.2341375350952148,
1781
+ "eval_runtime": 642.6304,
1782
+ "eval_samples_per_second": 393.347,
1783
+ "eval_steps_per_second": 24.585,
1784
+ "step": 142191
1785
+ },
1786
+ {
1787
+ "epoch": 9.02,
1788
+ "learning_rate": 1.960883600227863e-06,
1789
+ "loss": 1.4034,
1790
+ "step": 142500
1791
+ },
1792
+ {
1793
+ "epoch": 9.05,
1794
+ "learning_rate": 1.8975884549655044e-06,
1795
+ "loss": 1.3966,
1796
+ "step": 143000
1797
+ },
1798
+ {
1799
+ "epoch": 9.08,
1800
+ "learning_rate": 1.834293309703146e-06,
1801
+ "loss": 1.3921,
1802
+ "step": 143500
1803
+ },
1804
+ {
1805
+ "epoch": 9.11,
1806
+ "learning_rate": 1.7709981644407874e-06,
1807
+ "loss": 1.396,
1808
+ "step": 144000
1809
+ },
1810
+ {
1811
+ "epoch": 9.15,
1812
+ "learning_rate": 1.7077030191784292e-06,
1813
+ "loss": 1.411,
1814
+ "step": 144500
1815
+ },
1816
+ {
1817
+ "epoch": 9.18,
1818
+ "learning_rate": 1.6444078739160707e-06,
1819
+ "loss": 1.406,
1820
+ "step": 145000
1821
+ },
1822
+ {
1823
+ "epoch": 9.21,
1824
+ "learning_rate": 1.5811127286537123e-06,
1825
+ "loss": 1.407,
1826
+ "step": 145500
1827
+ },
1828
+ {
1829
+ "epoch": 9.24,
1830
+ "learning_rate": 1.5178175833913538e-06,
1831
+ "loss": 1.4182,
1832
+ "step": 146000
1833
+ },
1834
+ {
1835
+ "epoch": 9.27,
1836
+ "learning_rate": 1.4545224381289958e-06,
1837
+ "loss": 1.4116,
1838
+ "step": 146500
1839
+ },
1840
+ {
1841
+ "epoch": 9.3,
1842
+ "learning_rate": 1.3912272928666373e-06,
1843
+ "loss": 1.4166,
1844
+ "step": 147000
1845
+ },
1846
+ {
1847
+ "epoch": 9.34,
1848
+ "learning_rate": 1.3279321476042789e-06,
1849
+ "loss": 1.4063,
1850
+ "step": 147500
1851
+ },
1852
+ {
1853
+ "epoch": 9.37,
1854
+ "learning_rate": 1.2646370023419204e-06,
1855
+ "loss": 1.4025,
1856
+ "step": 148000
1857
+ },
1858
+ {
1859
+ "epoch": 9.4,
1860
+ "learning_rate": 1.2013418570795622e-06,
1861
+ "loss": 1.4061,
1862
+ "step": 148500
1863
+ },
1864
+ {
1865
+ "epoch": 9.43,
1866
+ "learning_rate": 1.1380467118172037e-06,
1867
+ "loss": 1.4066,
1868
+ "step": 149000
1869
+ },
1870
+ {
1871
+ "epoch": 9.46,
1872
+ "learning_rate": 1.0747515665548455e-06,
1873
+ "loss": 1.4152,
1874
+ "step": 149500
1875
+ },
1876
+ {
1877
+ "epoch": 9.49,
1878
+ "learning_rate": 1.011456421292487e-06,
1879
+ "loss": 1.417,
1880
+ "step": 150000
1881
+ },
1882
+ {
1883
+ "epoch": 9.53,
1884
+ "learning_rate": 9.481612760301285e-07,
1885
+ "loss": 1.411,
1886
+ "step": 150500
1887
+ },
1888
+ {
1889
+ "epoch": 9.56,
1890
+ "learning_rate": 8.848661307677701e-07,
1891
+ "loss": 1.4162,
1892
+ "step": 151000
1893
+ },
1894
+ {
1895
+ "epoch": 9.59,
1896
+ "learning_rate": 8.215709855054118e-07,
1897
+ "loss": 1.4195,
1898
+ "step": 151500
1899
+ },
1900
+ {
1901
+ "epoch": 9.62,
1902
+ "learning_rate": 7.582758402430535e-07,
1903
+ "loss": 1.4226,
1904
+ "step": 152000
1905
+ },
1906
+ {
1907
+ "epoch": 9.65,
1908
+ "learning_rate": 6.94980694980695e-07,
1909
+ "loss": 1.4239,
1910
+ "step": 152500
1911
+ },
1912
+ {
1913
+ "epoch": 9.68,
1914
+ "learning_rate": 6.316855497183366e-07,
1915
+ "loss": 1.4078,
1916
+ "step": 153000
1917
+ },
1918
+ {
1919
+ "epoch": 9.72,
1920
+ "learning_rate": 5.683904044559782e-07,
1921
+ "loss": 1.4101,
1922
+ "step": 153500
1923
+ },
1924
+ {
1925
+ "epoch": 9.75,
1926
+ "learning_rate": 5.050952591936199e-07,
1927
+ "loss": 1.416,
1928
+ "step": 154000
1929
+ },
1930
+ {
1931
+ "epoch": 9.78,
1932
+ "learning_rate": 4.418001139312615e-07,
1933
+ "loss": 1.4182,
1934
+ "step": 154500
1935
+ },
1936
+ {
1937
+ "epoch": 9.81,
1938
+ "learning_rate": 3.785049686689031e-07,
1939
+ "loss": 1.4196,
1940
+ "step": 155000
1941
+ },
1942
+ {
1943
+ "epoch": 9.84,
1944
+ "learning_rate": 3.1520982340654476e-07,
1945
+ "loss": 1.4132,
1946
+ "step": 155500
1947
+ },
1948
+ {
1949
+ "epoch": 9.87,
1950
+ "learning_rate": 2.5191467814418635e-07,
1951
+ "loss": 1.4138,
1952
+ "step": 156000
1953
+ },
1954
+ {
1955
+ "epoch": 9.91,
1956
+ "learning_rate": 1.88619532881828e-07,
1957
+ "loss": 1.4333,
1958
+ "step": 156500
1959
+ },
1960
+ {
1961
+ "epoch": 9.94,
1962
+ "learning_rate": 1.253243876194696e-07,
1963
+ "loss": 1.413,
1964
+ "step": 157000
1965
+ },
1966
+ {
1967
+ "epoch": 9.97,
1968
+ "learning_rate": 6.202924235711122e-08,
1969
+ "loss": 1.4116,
1970
+ "step": 157500
1971
+ }
1972
+ ],
1973
+ "max_steps": 157990,
1974
+ "num_train_epochs": 10,
1975
+ "total_flos": 3.820454731815322e+16,
1976
+ "trial_name": null,
1977
+ "trial_params": null
1978
+ }
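
trainer_state.json above records the Trainer's log_history: a training-loss entry every 500 steps and an eval_loss entry at each completed epoch (eval_loss falls from about 1.71 after epoch 1 to about 1.23 after epoch 9). A minimal sketch for pulling the per-epoch validation losses back out of the file, assuming the same hypothetical ./trans_model folder:

import json

# Hypothetical local path to the trainer_state.json uploaded in this commit.
with open("./trans_model/trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; end-of-epoch evaluation entries carry "eval_loss".
for entry in state["log_history"]:
    if "eval_loss" in entry:
        print(f"epoch {entry['epoch']:.0f}  step {entry['step']}  eval_loss {entry['eval_loss']:.4f}")
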
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a7d624b7f61620cf3bd86485c3abbf851a0ba124edd2a7b3aec3c4a0d5076e32
+ size 4091
vocab.json ADDED
The diff for this file is too large to render. See raw diff