RaphaelMourad commited on
Commit
81fe4a6
·
verified ·
1 Parent(s): 17fb763

Upload 9 files

Browse files
config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "../MistralModels/models/Mixtral-8x7B-v0.1-small-4096",
3
+ "architectures": [
4
+ "MixtralForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 768,
13
+ "max_position_embeddings": 512,
14
+ "model_type": "mixtral",
15
+ "num_attention_heads": 8,
16
+ "num_experts_per_tok": 1,
17
+ "num_hidden_layers": 8,
18
+ "num_key_value_heads": 8,
19
+ "num_local_experts": 8,
20
+ "output_router_logits": false,
21
+ "rms_norm_eps": 1e-05,
22
+ "rope_theta": 1000000.0,
23
+ "router_aux_loss_coef": 0.02,
24
+ "router_jitter_noise": 0.0,
25
+ "sliding_window": null,
26
+ "tie_word_embeddings": false,
27
+ "torch_dtype": "bfloat16",
28
+ "transformers_version": "4.43.3",
29
+ "use_cache": true,
30
+ "vocab_size": 4096
31
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.43.3"
6
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6355a1c4e91fcba689a8bdc46b285b55614c70e7284c086c7074fd6b73a2f3b5
3
+ size 276979168
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f86f749dc73bc16a2502fef2f98f5c00b4400cb2c67fbe62653b7ed104d13779
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e020fcef19fdb2ed8b130f1961d42d4f3d32869aedf58e153ed40f1d0ef0385f
3
+ size 1064
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"tokenizer_class": "PreTrainedTokenizerFast", "unk_token": "[UNK]", "cls_token": "[CLS]", "sep_token": "[SEP]", "pad_token": "[PAD]", "mask_token": "[MASK]"}
trainer_state.json ADDED
@@ -0,0 +1,2090 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 2.4190404415130615,
3
+ "best_model_checkpoint": "./results/models/checkpoint-136458",
4
+ "epoch": 18.0,
5
+ "eval_steps": 500,
6
+ "global_step": 136458,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.06595435958316845,
13
+ "grad_norm": 0.63671875,
14
+ "learning_rate": 0.0009986809128083366,
15
+ "loss": 5.3104,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.1319087191663369,
20
+ "grad_norm": 0.640625,
21
+ "learning_rate": 0.0009973618256166733,
22
+ "loss": 4.3028,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.19786307874950534,
27
+ "grad_norm": 0.61328125,
28
+ "learning_rate": 0.0009960427384250099,
29
+ "loss": 3.9369,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 0.2638174383326738,
34
+ "grad_norm": 0.5390625,
35
+ "learning_rate": 0.0009947236512333464,
36
+ "loss": 3.7115,
37
+ "step": 2000
38
+ },
39
+ {
40
+ "epoch": 0.3297717979158422,
41
+ "grad_norm": 0.59375,
42
+ "learning_rate": 0.0009934045640416832,
43
+ "loss": 3.5589,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 0.3957261574990107,
48
+ "grad_norm": 0.62890625,
49
+ "learning_rate": 0.0009920854768500197,
50
+ "loss": 3.4636,
51
+ "step": 3000
52
+ },
53
+ {
54
+ "epoch": 0.4616805170821791,
55
+ "grad_norm": 0.53515625,
56
+ "learning_rate": 0.0009907663896583565,
57
+ "loss": 3.3838,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 0.5276348766653476,
62
+ "grad_norm": 0.5859375,
63
+ "learning_rate": 0.000989447302466693,
64
+ "loss": 3.3023,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 0.593589236248516,
69
+ "grad_norm": 0.6171875,
70
+ "learning_rate": 0.0009881282152750298,
71
+ "loss": 3.2445,
72
+ "step": 4500
73
+ },
74
+ {
75
+ "epoch": 0.6595435958316844,
76
+ "grad_norm": 0.76171875,
77
+ "learning_rate": 0.0009868091280833663,
78
+ "loss": 3.205,
79
+ "step": 5000
80
+ },
81
+ {
82
+ "epoch": 0.725497955414853,
83
+ "grad_norm": 0.5390625,
84
+ "learning_rate": 0.0009854900408917029,
85
+ "loss": 3.157,
86
+ "step": 5500
87
+ },
88
+ {
89
+ "epoch": 0.7914523149980214,
90
+ "grad_norm": 0.51953125,
91
+ "learning_rate": 0.0009841709537000397,
92
+ "loss": 3.126,
93
+ "step": 6000
94
+ },
95
+ {
96
+ "epoch": 0.8574066745811898,
97
+ "grad_norm": 0.59765625,
98
+ "learning_rate": 0.0009828518665083762,
99
+ "loss": 3.0838,
100
+ "step": 6500
101
+ },
102
+ {
103
+ "epoch": 0.9233610341643582,
104
+ "grad_norm": 0.5546875,
105
+ "learning_rate": 0.0009815327793167127,
106
+ "loss": 3.0584,
107
+ "step": 7000
108
+ },
109
+ {
110
+ "epoch": 0.9893153937475268,
111
+ "grad_norm": 0.5234375,
112
+ "learning_rate": 0.0009802136921250495,
113
+ "loss": 3.0325,
114
+ "step": 7500
115
+ },
116
+ {
117
+ "epoch": 1.0,
118
+ "eval_loss": 3.0351455211639404,
119
+ "eval_runtime": 4.9415,
120
+ "eval_samples_per_second": 404.738,
121
+ "eval_steps_per_second": 1.619,
122
+ "step": 7581
123
+ },
124
+ {
125
+ "epoch": 1.0552697533306952,
126
+ "grad_norm": 0.55859375,
127
+ "learning_rate": 0.000978894604933386,
128
+ "loss": 2.9581,
129
+ "step": 8000
130
+ },
131
+ {
132
+ "epoch": 1.1212241129138636,
133
+ "grad_norm": 0.625,
134
+ "learning_rate": 0.0009775755177417228,
135
+ "loss": 2.9426,
136
+ "step": 8500
137
+ },
138
+ {
139
+ "epoch": 1.187178472497032,
140
+ "grad_norm": 0.53125,
141
+ "learning_rate": 0.0009762564305500594,
142
+ "loss": 2.9122,
143
+ "step": 9000
144
+ },
145
+ {
146
+ "epoch": 1.2531328320802004,
147
+ "grad_norm": 0.58203125,
148
+ "learning_rate": 0.000974937343358396,
149
+ "loss": 2.8982,
150
+ "step": 9500
151
+ },
152
+ {
153
+ "epoch": 1.3190871916633689,
154
+ "grad_norm": 0.55859375,
155
+ "learning_rate": 0.0009736182561667326,
156
+ "loss": 2.891,
157
+ "step": 10000
158
+ },
159
+ {
160
+ "epoch": 1.3850415512465375,
161
+ "grad_norm": 0.53515625,
162
+ "learning_rate": 0.0009722991689750693,
163
+ "loss": 2.8751,
164
+ "step": 10500
165
+ },
166
+ {
167
+ "epoch": 1.450995910829706,
168
+ "grad_norm": 0.609375,
169
+ "learning_rate": 0.0009709800817834059,
170
+ "loss": 2.853,
171
+ "step": 11000
172
+ },
173
+ {
174
+ "epoch": 1.5169502704128743,
175
+ "grad_norm": 0.56640625,
176
+ "learning_rate": 0.0009696609945917425,
177
+ "loss": 2.842,
178
+ "step": 11500
179
+ },
180
+ {
181
+ "epoch": 1.5829046299960428,
182
+ "grad_norm": 0.5390625,
183
+ "learning_rate": 0.0009683419074000792,
184
+ "loss": 2.831,
185
+ "step": 12000
186
+ },
187
+ {
188
+ "epoch": 1.6488589895792112,
189
+ "grad_norm": 0.546875,
190
+ "learning_rate": 0.0009670228202084158,
191
+ "loss": 2.8257,
192
+ "step": 12500
193
+ },
194
+ {
195
+ "epoch": 1.7148133491623796,
196
+ "grad_norm": 0.58984375,
197
+ "learning_rate": 0.0009657037330167524,
198
+ "loss": 2.8069,
199
+ "step": 13000
200
+ },
201
+ {
202
+ "epoch": 1.780767708745548,
203
+ "grad_norm": 0.51171875,
204
+ "learning_rate": 0.0009643846458250891,
205
+ "loss": 2.7999,
206
+ "step": 13500
207
+ },
208
+ {
209
+ "epoch": 1.8467220683287167,
210
+ "grad_norm": 0.53125,
211
+ "learning_rate": 0.0009630655586334257,
212
+ "loss": 2.7877,
213
+ "step": 14000
214
+ },
215
+ {
216
+ "epoch": 1.9126764279118849,
217
+ "grad_norm": 0.625,
218
+ "learning_rate": 0.0009617464714417623,
219
+ "loss": 2.7914,
220
+ "step": 14500
221
+ },
222
+ {
223
+ "epoch": 1.9786307874950535,
224
+ "grad_norm": 0.546875,
225
+ "learning_rate": 0.0009604273842500989,
226
+ "loss": 2.7667,
227
+ "step": 15000
228
+ },
229
+ {
230
+ "epoch": 2.0,
231
+ "eval_loss": 2.802887201309204,
232
+ "eval_runtime": 5.0226,
233
+ "eval_samples_per_second": 398.199,
234
+ "eval_steps_per_second": 1.593,
235
+ "step": 15162
236
+ },
237
+ {
238
+ "epoch": 2.0445851470782217,
239
+ "grad_norm": 0.57421875,
240
+ "learning_rate": 0.0009591082970584356,
241
+ "loss": 2.6997,
242
+ "step": 15500
243
+ },
244
+ {
245
+ "epoch": 2.1105395066613903,
246
+ "grad_norm": 0.53125,
247
+ "learning_rate": 0.0009577892098667722,
248
+ "loss": 2.688,
249
+ "step": 16000
250
+ },
251
+ {
252
+ "epoch": 2.1764938662445585,
253
+ "grad_norm": 0.54296875,
254
+ "learning_rate": 0.0009564701226751088,
255
+ "loss": 2.6977,
256
+ "step": 16500
257
+ },
258
+ {
259
+ "epoch": 2.242448225827727,
260
+ "grad_norm": 0.578125,
261
+ "learning_rate": 0.0009551510354834455,
262
+ "loss": 2.6954,
263
+ "step": 17000
264
+ },
265
+ {
266
+ "epoch": 2.308402585410896,
267
+ "grad_norm": 0.53125,
268
+ "learning_rate": 0.000953831948291782,
269
+ "loss": 2.6841,
270
+ "step": 17500
271
+ },
272
+ {
273
+ "epoch": 2.374356944994064,
274
+ "grad_norm": 0.56640625,
275
+ "learning_rate": 0.0009525128611001187,
276
+ "loss": 2.6851,
277
+ "step": 18000
278
+ },
279
+ {
280
+ "epoch": 2.4403113045772327,
281
+ "grad_norm": 0.546875,
282
+ "learning_rate": 0.0009511937739084554,
283
+ "loss": 2.6696,
284
+ "step": 18500
285
+ },
286
+ {
287
+ "epoch": 2.506265664160401,
288
+ "grad_norm": 0.54296875,
289
+ "learning_rate": 0.000949874686716792,
290
+ "loss": 2.6717,
291
+ "step": 19000
292
+ },
293
+ {
294
+ "epoch": 2.5722200237435695,
295
+ "grad_norm": 0.53125,
296
+ "learning_rate": 0.0009485555995251286,
297
+ "loss": 2.6728,
298
+ "step": 19500
299
+ },
300
+ {
301
+ "epoch": 2.6381743833267377,
302
+ "grad_norm": 0.578125,
303
+ "learning_rate": 0.0009472365123334653,
304
+ "loss": 2.6487,
305
+ "step": 20000
306
+ },
307
+ {
308
+ "epoch": 2.7041287429099063,
309
+ "grad_norm": 0.51171875,
310
+ "learning_rate": 0.0009459174251418019,
311
+ "loss": 2.6549,
312
+ "step": 20500
313
+ },
314
+ {
315
+ "epoch": 2.770083102493075,
316
+ "grad_norm": 0.62109375,
317
+ "learning_rate": 0.0009445983379501385,
318
+ "loss": 2.6441,
319
+ "step": 21000
320
+ },
321
+ {
322
+ "epoch": 2.836037462076243,
323
+ "grad_norm": 0.57421875,
324
+ "learning_rate": 0.0009432792507584752,
325
+ "loss": 2.6374,
326
+ "step": 21500
327
+ },
328
+ {
329
+ "epoch": 2.901991821659412,
330
+ "grad_norm": 0.546875,
331
+ "learning_rate": 0.0009419601635668118,
332
+ "loss": 2.6291,
333
+ "step": 22000
334
+ },
335
+ {
336
+ "epoch": 2.96794618124258,
337
+ "grad_norm": 0.51953125,
338
+ "learning_rate": 0.0009406410763751484,
339
+ "loss": 2.6326,
340
+ "step": 22500
341
+ },
342
+ {
343
+ "epoch": 3.0,
344
+ "eval_loss": 2.696772575378418,
345
+ "eval_runtime": 5.0551,
346
+ "eval_samples_per_second": 395.639,
347
+ "eval_steps_per_second": 1.583,
348
+ "step": 22743
349
+ },
350
+ {
351
+ "epoch": 3.0339005408257487,
352
+ "grad_norm": 0.68359375,
353
+ "learning_rate": 0.000939321989183485,
354
+ "loss": 2.5858,
355
+ "step": 23000
356
+ },
357
+ {
358
+ "epoch": 3.099854900408917,
359
+ "grad_norm": 0.58984375,
360
+ "learning_rate": 0.0009380029019918217,
361
+ "loss": 2.5525,
362
+ "step": 23500
363
+ },
364
+ {
365
+ "epoch": 3.1658092599920855,
366
+ "grad_norm": 0.515625,
367
+ "learning_rate": 0.0009366838148001583,
368
+ "loss": 2.559,
369
+ "step": 24000
370
+ },
371
+ {
372
+ "epoch": 3.231763619575254,
373
+ "grad_norm": 0.53125,
374
+ "learning_rate": 0.0009353647276084949,
375
+ "loss": 2.5623,
376
+ "step": 24500
377
+ },
378
+ {
379
+ "epoch": 3.2977179791584224,
380
+ "grad_norm": 0.546875,
381
+ "learning_rate": 0.0009340456404168316,
382
+ "loss": 2.5558,
383
+ "step": 25000
384
+ },
385
+ {
386
+ "epoch": 3.363672338741591,
387
+ "grad_norm": 0.72265625,
388
+ "learning_rate": 0.0009327265532251682,
389
+ "loss": 2.5571,
390
+ "step": 25500
391
+ },
392
+ {
393
+ "epoch": 3.429626698324759,
394
+ "grad_norm": 0.55859375,
395
+ "learning_rate": 0.0009314074660335047,
396
+ "loss": 2.561,
397
+ "step": 26000
398
+ },
399
+ {
400
+ "epoch": 3.495581057907928,
401
+ "grad_norm": 0.55859375,
402
+ "learning_rate": 0.0009300883788418415,
403
+ "loss": 2.5557,
404
+ "step": 26500
405
+ },
406
+ {
407
+ "epoch": 3.561535417491096,
408
+ "grad_norm": 0.58203125,
409
+ "learning_rate": 0.000928769291650178,
410
+ "loss": 2.5473,
411
+ "step": 27000
412
+ },
413
+ {
414
+ "epoch": 3.6274897770742647,
415
+ "grad_norm": 0.578125,
416
+ "learning_rate": 0.0009274502044585147,
417
+ "loss": 2.5441,
418
+ "step": 27500
419
+ },
420
+ {
421
+ "epoch": 3.6934441366574333,
422
+ "grad_norm": 0.73046875,
423
+ "learning_rate": 0.0009261311172668514,
424
+ "loss": 2.5392,
425
+ "step": 28000
426
+ },
427
+ {
428
+ "epoch": 3.7593984962406015,
429
+ "grad_norm": 0.53515625,
430
+ "learning_rate": 0.000924812030075188,
431
+ "loss": 2.5421,
432
+ "step": 28500
433
+ },
434
+ {
435
+ "epoch": 3.8253528558237697,
436
+ "grad_norm": 0.59375,
437
+ "learning_rate": 0.0009234929428835246,
438
+ "loss": 2.5356,
439
+ "step": 29000
440
+ },
441
+ {
442
+ "epoch": 3.8913072154069384,
443
+ "grad_norm": 0.66015625,
444
+ "learning_rate": 0.0009221738556918613,
445
+ "loss": 2.5418,
446
+ "step": 29500
447
+ },
448
+ {
449
+ "epoch": 3.957261574990107,
450
+ "grad_norm": 0.55078125,
451
+ "learning_rate": 0.0009208547685001979,
452
+ "loss": 2.5229,
453
+ "step": 30000
454
+ },
455
+ {
456
+ "epoch": 4.0,
457
+ "eval_loss": 2.629683017730713,
458
+ "eval_runtime": 4.5326,
459
+ "eval_samples_per_second": 441.249,
460
+ "eval_steps_per_second": 1.765,
461
+ "step": 30324
462
+ },
463
+ {
464
+ "epoch": 4.023215934573275,
465
+ "grad_norm": 0.6953125,
466
+ "learning_rate": 0.0009195356813085345,
467
+ "loss": 2.4939,
468
+ "step": 30500
469
+ },
470
+ {
471
+ "epoch": 4.089170294156443,
472
+ "grad_norm": 0.546875,
473
+ "learning_rate": 0.0009182165941168712,
474
+ "loss": 2.4514,
475
+ "step": 31000
476
+ },
477
+ {
478
+ "epoch": 4.1551246537396125,
479
+ "grad_norm": 0.578125,
480
+ "learning_rate": 0.0009168975069252078,
481
+ "loss": 2.4666,
482
+ "step": 31500
483
+ },
484
+ {
485
+ "epoch": 4.221079013322781,
486
+ "grad_norm": 0.53125,
487
+ "learning_rate": 0.0009155784197335444,
488
+ "loss": 2.4649,
489
+ "step": 32000
490
+ },
491
+ {
492
+ "epoch": 4.287033372905949,
493
+ "grad_norm": 0.57421875,
494
+ "learning_rate": 0.000914259332541881,
495
+ "loss": 2.4642,
496
+ "step": 32500
497
+ },
498
+ {
499
+ "epoch": 4.352987732489117,
500
+ "grad_norm": 0.60546875,
501
+ "learning_rate": 0.0009129402453502177,
502
+ "loss": 2.4669,
503
+ "step": 33000
504
+ },
505
+ {
506
+ "epoch": 4.418942092072286,
507
+ "grad_norm": 0.5546875,
508
+ "learning_rate": 0.0009116211581585543,
509
+ "loss": 2.4686,
510
+ "step": 33500
511
+ },
512
+ {
513
+ "epoch": 4.484896451655454,
514
+ "grad_norm": 0.53125,
515
+ "learning_rate": 0.0009103020709668909,
516
+ "loss": 2.4809,
517
+ "step": 34000
518
+ },
519
+ {
520
+ "epoch": 4.550850811238623,
521
+ "grad_norm": 0.5546875,
522
+ "learning_rate": 0.0009089829837752276,
523
+ "loss": 2.4708,
524
+ "step": 34500
525
+ },
526
+ {
527
+ "epoch": 4.616805170821792,
528
+ "grad_norm": 0.5625,
529
+ "learning_rate": 0.0009076638965835642,
530
+ "loss": 2.4701,
531
+ "step": 35000
532
+ },
533
+ {
534
+ "epoch": 4.68275953040496,
535
+ "grad_norm": 0.56640625,
536
+ "learning_rate": 0.0009063448093919007,
537
+ "loss": 2.4714,
538
+ "step": 35500
539
+ },
540
+ {
541
+ "epoch": 4.748713889988128,
542
+ "grad_norm": 0.58203125,
543
+ "learning_rate": 0.0009050257222002375,
544
+ "loss": 2.4626,
545
+ "step": 36000
546
+ },
547
+ {
548
+ "epoch": 4.814668249571296,
549
+ "grad_norm": 0.57421875,
550
+ "learning_rate": 0.000903706635008574,
551
+ "loss": 2.4696,
552
+ "step": 36500
553
+ },
554
+ {
555
+ "epoch": 4.880622609154465,
556
+ "grad_norm": 0.55078125,
557
+ "learning_rate": 0.0009023875478169107,
558
+ "loss": 2.4614,
559
+ "step": 37000
560
+ },
561
+ {
562
+ "epoch": 4.9465769687376335,
563
+ "grad_norm": 0.5625,
564
+ "learning_rate": 0.0009010684606252473,
565
+ "loss": 2.4544,
566
+ "step": 37500
567
+ },
568
+ {
569
+ "epoch": 5.0,
570
+ "eval_loss": 2.593965768814087,
571
+ "eval_runtime": 4.0976,
572
+ "eval_samples_per_second": 488.092,
573
+ "eval_steps_per_second": 1.952,
574
+ "step": 37905
575
+ },
576
+ {
577
+ "epoch": 5.012531328320802,
578
+ "grad_norm": 0.55078125,
579
+ "learning_rate": 0.000899749373433584,
580
+ "loss": 2.4455,
581
+ "step": 38000
582
+ },
583
+ {
584
+ "epoch": 5.078485687903971,
585
+ "grad_norm": 0.60546875,
586
+ "learning_rate": 0.0008984302862419205,
587
+ "loss": 2.3874,
588
+ "step": 38500
589
+ },
590
+ {
591
+ "epoch": 5.144440047487139,
592
+ "grad_norm": 0.609375,
593
+ "learning_rate": 0.0008971111990502573,
594
+ "loss": 2.3907,
595
+ "step": 39000
596
+ },
597
+ {
598
+ "epoch": 5.210394407070307,
599
+ "grad_norm": 0.5703125,
600
+ "learning_rate": 0.0008957921118585939,
601
+ "loss": 2.3965,
602
+ "step": 39500
603
+ },
604
+ {
605
+ "epoch": 5.276348766653475,
606
+ "grad_norm": 0.5234375,
607
+ "learning_rate": 0.0008944730246669305,
608
+ "loss": 2.4055,
609
+ "step": 40000
610
+ },
611
+ {
612
+ "epoch": 5.3423031262366445,
613
+ "grad_norm": 0.6171875,
614
+ "learning_rate": 0.0008931539374752672,
615
+ "loss": 2.4024,
616
+ "step": 40500
617
+ },
618
+ {
619
+ "epoch": 5.408257485819813,
620
+ "grad_norm": 0.58203125,
621
+ "learning_rate": 0.0008918348502836038,
622
+ "loss": 2.402,
623
+ "step": 41000
624
+ },
625
+ {
626
+ "epoch": 5.474211845402981,
627
+ "grad_norm": 0.5390625,
628
+ "learning_rate": 0.0008905157630919404,
629
+ "loss": 2.4073,
630
+ "step": 41500
631
+ },
632
+ {
633
+ "epoch": 5.54016620498615,
634
+ "grad_norm": 0.58203125,
635
+ "learning_rate": 0.000889196675900277,
636
+ "loss": 2.4046,
637
+ "step": 42000
638
+ },
639
+ {
640
+ "epoch": 5.606120564569318,
641
+ "grad_norm": 0.57421875,
642
+ "learning_rate": 0.0008878775887086137,
643
+ "loss": 2.4127,
644
+ "step": 42500
645
+ },
646
+ {
647
+ "epoch": 5.672074924152486,
648
+ "grad_norm": 0.57421875,
649
+ "learning_rate": 0.0008865585015169503,
650
+ "loss": 2.4102,
651
+ "step": 43000
652
+ },
653
+ {
654
+ "epoch": 5.738029283735655,
655
+ "grad_norm": 0.546875,
656
+ "learning_rate": 0.0008852394143252869,
657
+ "loss": 2.4021,
658
+ "step": 43500
659
+ },
660
+ {
661
+ "epoch": 5.803983643318824,
662
+ "grad_norm": 0.7109375,
663
+ "learning_rate": 0.0008839203271336235,
664
+ "loss": 2.4041,
665
+ "step": 44000
666
+ },
667
+ {
668
+ "epoch": 5.869938002901992,
669
+ "grad_norm": 0.59375,
670
+ "learning_rate": 0.0008826012399419602,
671
+ "loss": 2.4034,
672
+ "step": 44500
673
+ },
674
+ {
675
+ "epoch": 5.93589236248516,
676
+ "grad_norm": 0.63671875,
677
+ "learning_rate": 0.0008812821527502967,
678
+ "loss": 2.4059,
679
+ "step": 45000
680
+ },
681
+ {
682
+ "epoch": 6.0,
683
+ "eval_loss": 2.5455453395843506,
684
+ "eval_runtime": 4.3171,
685
+ "eval_samples_per_second": 463.276,
686
+ "eval_steps_per_second": 1.853,
687
+ "step": 45486
688
+ },
689
+ {
690
+ "epoch": 6.001846722068329,
691
+ "grad_norm": 0.5234375,
692
+ "learning_rate": 0.0008799630655586335,
693
+ "loss": 2.3998,
694
+ "step": 45500
695
+ },
696
+ {
697
+ "epoch": 6.067801081651497,
698
+ "grad_norm": 0.546875,
699
+ "learning_rate": 0.00087864397836697,
700
+ "loss": 2.3343,
701
+ "step": 46000
702
+ },
703
+ {
704
+ "epoch": 6.1337554412346655,
705
+ "grad_norm": 0.55859375,
706
+ "learning_rate": 0.0008773248911753067,
707
+ "loss": 2.3341,
708
+ "step": 46500
709
+ },
710
+ {
711
+ "epoch": 6.199709800817834,
712
+ "grad_norm": 0.5546875,
713
+ "learning_rate": 0.0008760058039836433,
714
+ "loss": 2.3527,
715
+ "step": 47000
716
+ },
717
+ {
718
+ "epoch": 6.265664160401003,
719
+ "grad_norm": 0.58203125,
720
+ "learning_rate": 0.00087468671679198,
721
+ "loss": 2.3633,
722
+ "step": 47500
723
+ },
724
+ {
725
+ "epoch": 6.331618519984171,
726
+ "grad_norm": 0.55859375,
727
+ "learning_rate": 0.0008733676296003165,
728
+ "loss": 2.3615,
729
+ "step": 48000
730
+ },
731
+ {
732
+ "epoch": 6.397572879567339,
733
+ "grad_norm": 0.546875,
734
+ "learning_rate": 0.0008720485424086533,
735
+ "loss": 2.3654,
736
+ "step": 48500
737
+ },
738
+ {
739
+ "epoch": 6.463527239150508,
740
+ "grad_norm": 0.6015625,
741
+ "learning_rate": 0.0008707294552169899,
742
+ "loss": 2.3695,
743
+ "step": 49000
744
+ },
745
+ {
746
+ "epoch": 6.5294815987336765,
747
+ "grad_norm": 0.5546875,
748
+ "learning_rate": 0.0008694103680253265,
749
+ "loss": 2.3698,
750
+ "step": 49500
751
+ },
752
+ {
753
+ "epoch": 6.595435958316845,
754
+ "grad_norm": 0.65234375,
755
+ "learning_rate": 0.0008680912808336632,
756
+ "loss": 2.3671,
757
+ "step": 50000
758
+ },
759
+ {
760
+ "epoch": 6.661390317900013,
761
+ "grad_norm": 0.62109375,
762
+ "learning_rate": 0.0008667721936419998,
763
+ "loss": 2.3611,
764
+ "step": 50500
765
+ },
766
+ {
767
+ "epoch": 6.727344677483182,
768
+ "grad_norm": 0.55078125,
769
+ "learning_rate": 0.0008654531064503364,
770
+ "loss": 2.371,
771
+ "step": 51000
772
+ },
773
+ {
774
+ "epoch": 6.79329903706635,
775
+ "grad_norm": 0.6484375,
776
+ "learning_rate": 0.000864134019258673,
777
+ "loss": 2.3693,
778
+ "step": 51500
779
+ },
780
+ {
781
+ "epoch": 6.859253396649518,
782
+ "grad_norm": 0.59375,
783
+ "learning_rate": 0.0008628149320670097,
784
+ "loss": 2.3575,
785
+ "step": 52000
786
+ },
787
+ {
788
+ "epoch": 6.9252077562326875,
789
+ "grad_norm": 0.5546875,
790
+ "learning_rate": 0.0008614958448753462,
791
+ "loss": 2.3638,
792
+ "step": 52500
793
+ },
794
+ {
795
+ "epoch": 6.991162115815856,
796
+ "grad_norm": 0.55859375,
797
+ "learning_rate": 0.0008601767576836829,
798
+ "loss": 2.3601,
799
+ "step": 53000
800
+ },
801
+ {
802
+ "epoch": 7.0,
803
+ "eval_loss": 2.5313472747802734,
804
+ "eval_runtime": 4.3987,
805
+ "eval_samples_per_second": 454.683,
806
+ "eval_steps_per_second": 1.819,
807
+ "step": 53067
808
+ },
809
+ {
810
+ "epoch": 7.057116475399024,
811
+ "grad_norm": 0.62890625,
812
+ "learning_rate": 0.0008588576704920195,
813
+ "loss": 2.3004,
814
+ "step": 53500
815
+ },
816
+ {
817
+ "epoch": 7.123070834982192,
818
+ "grad_norm": 0.5859375,
819
+ "learning_rate": 0.0008575385833003562,
820
+ "loss": 2.2926,
821
+ "step": 54000
822
+ },
823
+ {
824
+ "epoch": 7.189025194565361,
825
+ "grad_norm": 0.734375,
826
+ "learning_rate": 0.0008562194961086927,
827
+ "loss": 2.2966,
828
+ "step": 54500
829
+ },
830
+ {
831
+ "epoch": 7.254979554148529,
832
+ "grad_norm": 0.6171875,
833
+ "learning_rate": 0.0008549004089170295,
834
+ "loss": 2.3004,
835
+ "step": 55000
836
+ },
837
+ {
838
+ "epoch": 7.3209339137316976,
839
+ "grad_norm": 0.5859375,
840
+ "learning_rate": 0.000853581321725366,
841
+ "loss": 2.3168,
842
+ "step": 55500
843
+ },
844
+ {
845
+ "epoch": 7.386888273314866,
846
+ "grad_norm": 0.58203125,
847
+ "learning_rate": 0.0008522622345337027,
848
+ "loss": 2.3264,
849
+ "step": 56000
850
+ },
851
+ {
852
+ "epoch": 7.452842632898035,
853
+ "grad_norm": 0.60546875,
854
+ "learning_rate": 0.0008509431473420393,
855
+ "loss": 2.3132,
856
+ "step": 56500
857
+ },
858
+ {
859
+ "epoch": 7.518796992481203,
860
+ "grad_norm": 0.625,
861
+ "learning_rate": 0.000849624060150376,
862
+ "loss": 2.3263,
863
+ "step": 57000
864
+ },
865
+ {
866
+ "epoch": 7.584751352064371,
867
+ "grad_norm": 0.60546875,
868
+ "learning_rate": 0.0008483049729587125,
869
+ "loss": 2.3318,
870
+ "step": 57500
871
+ },
872
+ {
873
+ "epoch": 7.650705711647539,
874
+ "grad_norm": 0.5703125,
875
+ "learning_rate": 0.0008469858857670493,
876
+ "loss": 2.3313,
877
+ "step": 58000
878
+ },
879
+ {
880
+ "epoch": 7.7166600712307085,
881
+ "grad_norm": 0.5859375,
882
+ "learning_rate": 0.0008456667985753858,
883
+ "loss": 2.3265,
884
+ "step": 58500
885
+ },
886
+ {
887
+ "epoch": 7.782614430813877,
888
+ "grad_norm": 0.5859375,
889
+ "learning_rate": 0.0008443477113837225,
890
+ "loss": 2.3401,
891
+ "step": 59000
892
+ },
893
+ {
894
+ "epoch": 7.848568790397045,
895
+ "grad_norm": 0.63671875,
896
+ "learning_rate": 0.0008430286241920592,
897
+ "loss": 2.3385,
898
+ "step": 59500
899
+ },
900
+ {
901
+ "epoch": 7.914523149980214,
902
+ "grad_norm": 0.578125,
903
+ "learning_rate": 0.0008417095370003958,
904
+ "loss": 2.3224,
905
+ "step": 60000
906
+ },
907
+ {
908
+ "epoch": 7.980477509563382,
909
+ "grad_norm": 0.6328125,
910
+ "learning_rate": 0.0008403904498087324,
911
+ "loss": 2.3394,
912
+ "step": 60500
913
+ },
914
+ {
915
+ "epoch": 8.0,
916
+ "eval_loss": 2.508885383605957,
917
+ "eval_runtime": 4.5718,
918
+ "eval_samples_per_second": 437.467,
919
+ "eval_steps_per_second": 1.75,
920
+ "step": 60648
921
+ },
922
+ {
923
+ "epoch": 8.04643186914655,
924
+ "grad_norm": 0.6171875,
925
+ "learning_rate": 0.0008390713626170689,
926
+ "loss": 2.2768,
927
+ "step": 61000
928
+ },
929
+ {
930
+ "epoch": 8.112386228729719,
931
+ "grad_norm": 0.6015625,
932
+ "learning_rate": 0.0008377522754254057,
933
+ "loss": 2.2642,
934
+ "step": 61500
935
+ },
936
+ {
937
+ "epoch": 8.178340588312887,
938
+ "grad_norm": 0.5859375,
939
+ "learning_rate": 0.0008364331882337422,
940
+ "loss": 2.2747,
941
+ "step": 62000
942
+ },
943
+ {
944
+ "epoch": 8.244294947896057,
945
+ "grad_norm": 0.64453125,
946
+ "learning_rate": 0.0008351141010420789,
947
+ "loss": 2.2778,
948
+ "step": 62500
949
+ },
950
+ {
951
+ "epoch": 8.310249307479225,
952
+ "grad_norm": 0.65234375,
953
+ "learning_rate": 0.0008337950138504155,
954
+ "loss": 2.2876,
955
+ "step": 63000
956
+ },
957
+ {
958
+ "epoch": 8.376203667062393,
959
+ "grad_norm": 0.6796875,
960
+ "learning_rate": 0.0008324759266587522,
961
+ "loss": 2.2983,
962
+ "step": 63500
963
+ },
964
+ {
965
+ "epoch": 8.442158026645561,
966
+ "grad_norm": 0.6171875,
967
+ "learning_rate": 0.0008311568394670887,
968
+ "loss": 2.3014,
969
+ "step": 64000
970
+ },
971
+ {
972
+ "epoch": 8.50811238622873,
973
+ "grad_norm": 0.67578125,
974
+ "learning_rate": 0.0008298377522754255,
975
+ "loss": 2.2997,
976
+ "step": 64500
977
+ },
978
+ {
979
+ "epoch": 8.574066745811898,
980
+ "grad_norm": 0.81640625,
981
+ "learning_rate": 0.000828518665083762,
982
+ "loss": 2.2962,
983
+ "step": 65000
984
+ },
985
+ {
986
+ "epoch": 8.640021105395066,
987
+ "grad_norm": 0.6171875,
988
+ "learning_rate": 0.0008271995778920987,
989
+ "loss": 2.2889,
990
+ "step": 65500
991
+ },
992
+ {
993
+ "epoch": 8.705975464978234,
994
+ "grad_norm": 0.59765625,
995
+ "learning_rate": 0.0008258804907004353,
996
+ "loss": 2.2984,
997
+ "step": 66000
998
+ },
999
+ {
1000
+ "epoch": 8.771929824561404,
1001
+ "grad_norm": 0.59375,
1002
+ "learning_rate": 0.000824561403508772,
1003
+ "loss": 2.3166,
1004
+ "step": 66500
1005
+ },
1006
+ {
1007
+ "epoch": 8.837884184144572,
1008
+ "grad_norm": 0.73828125,
1009
+ "learning_rate": 0.0008232423163171085,
1010
+ "loss": 2.3091,
1011
+ "step": 67000
1012
+ },
1013
+ {
1014
+ "epoch": 8.90383854372774,
1015
+ "grad_norm": 0.63671875,
1016
+ "learning_rate": 0.0008219232291254453,
1017
+ "loss": 2.2962,
1018
+ "step": 67500
1019
+ },
1020
+ {
1021
+ "epoch": 8.969792903310909,
1022
+ "grad_norm": 0.64453125,
1023
+ "learning_rate": 0.0008206041419337818,
1024
+ "loss": 2.306,
1025
+ "step": 68000
1026
+ },
1027
+ {
1028
+ "epoch": 9.0,
1029
+ "eval_loss": 2.5000669956207275,
1030
+ "eval_runtime": 4.9297,
1031
+ "eval_samples_per_second": 405.707,
1032
+ "eval_steps_per_second": 1.623,
1033
+ "step": 68229
1034
+ },
1035
+ {
1036
+ "epoch": 9.035747262894077,
1037
+ "grad_norm": 0.62890625,
1038
+ "learning_rate": 0.0008192850547421185,
1039
+ "loss": 2.2644,
1040
+ "step": 68500
1041
+ },
1042
+ {
1043
+ "epoch": 9.101701622477245,
1044
+ "grad_norm": 0.6015625,
1045
+ "learning_rate": 0.000817965967550455,
1046
+ "loss": 2.2261,
1047
+ "step": 69000
1048
+ },
1049
+ {
1050
+ "epoch": 9.167655982060413,
1051
+ "grad_norm": 0.6328125,
1052
+ "learning_rate": 0.0008166468803587918,
1053
+ "loss": 2.2395,
1054
+ "step": 69500
1055
+ },
1056
+ {
1057
+ "epoch": 9.233610341643583,
1058
+ "grad_norm": 0.625,
1059
+ "learning_rate": 0.0008153277931671284,
1060
+ "loss": 2.2461,
1061
+ "step": 70000
1062
+ },
1063
+ {
1064
+ "epoch": 9.299564701226751,
1065
+ "grad_norm": 0.66796875,
1066
+ "learning_rate": 0.0008140087059754649,
1067
+ "loss": 2.256,
1068
+ "step": 70500
1069
+ },
1070
+ {
1071
+ "epoch": 9.36551906080992,
1072
+ "grad_norm": 0.6015625,
1073
+ "learning_rate": 0.0008126896187838017,
1074
+ "loss": 2.2607,
1075
+ "step": 71000
1076
+ },
1077
+ {
1078
+ "epoch": 9.431473420393088,
1079
+ "grad_norm": 0.62109375,
1080
+ "learning_rate": 0.0008113705315921382,
1081
+ "loss": 2.2576,
1082
+ "step": 71500
1083
+ },
1084
+ {
1085
+ "epoch": 9.497427779976256,
1086
+ "grad_norm": 0.57421875,
1087
+ "learning_rate": 0.0008100514444004749,
1088
+ "loss": 2.2616,
1089
+ "step": 72000
1090
+ },
1091
+ {
1092
+ "epoch": 9.563382139559424,
1093
+ "grad_norm": 0.7578125,
1094
+ "learning_rate": 0.0008087323572088115,
1095
+ "loss": 2.2711,
1096
+ "step": 72500
1097
+ },
1098
+ {
1099
+ "epoch": 9.629336499142592,
1100
+ "grad_norm": 0.6328125,
1101
+ "learning_rate": 0.0008074132700171482,
1102
+ "loss": 2.2697,
1103
+ "step": 73000
1104
+ },
1105
+ {
1106
+ "epoch": 9.695290858725762,
1107
+ "grad_norm": 0.609375,
1108
+ "learning_rate": 0.0008060941828254847,
1109
+ "loss": 2.2548,
1110
+ "step": 73500
1111
+ },
1112
+ {
1113
+ "epoch": 9.76124521830893,
1114
+ "grad_norm": 0.75390625,
1115
+ "learning_rate": 0.0008047750956338215,
1116
+ "loss": 2.2583,
1117
+ "step": 74000
1118
+ },
1119
+ {
1120
+ "epoch": 9.827199577892099,
1121
+ "grad_norm": 0.62109375,
1122
+ "learning_rate": 0.000803456008442158,
1123
+ "loss": 2.2598,
1124
+ "step": 74500
1125
+ },
1126
+ {
1127
+ "epoch": 9.893153937475267,
1128
+ "grad_norm": 0.60546875,
1129
+ "learning_rate": 0.0008021369212504947,
1130
+ "loss": 2.2696,
1131
+ "step": 75000
1132
+ },
1133
+ {
1134
+ "epoch": 9.959108297058435,
1135
+ "grad_norm": 0.59765625,
1136
+ "learning_rate": 0.0008008178340588313,
1137
+ "loss": 2.2692,
1138
+ "step": 75500
1139
+ },
1140
+ {
1141
+ "epoch": 10.0,
1142
+ "eval_loss": 2.4777557849884033,
1143
+ "eval_runtime": 3.8042,
1144
+ "eval_samples_per_second": 525.734,
1145
+ "eval_steps_per_second": 2.103,
1146
+ "step": 75810
1147
+ },
1148
+ {
1149
+ "epoch": 10.025062656641603,
1150
+ "grad_norm": 0.625,
1151
+ "learning_rate": 0.000799498746867168,
1152
+ "loss": 2.2365,
1153
+ "step": 76000
1154
+ },
1155
+ {
1156
+ "epoch": 10.091017016224772,
1157
+ "grad_norm": 0.5859375,
1158
+ "learning_rate": 0.0007981796596755045,
1159
+ "loss": 2.1978,
1160
+ "step": 76500
1161
+ },
1162
+ {
1163
+ "epoch": 10.156971375807942,
1164
+ "grad_norm": 0.63671875,
1165
+ "learning_rate": 0.0007968605724838413,
1166
+ "loss": 2.2063,
1167
+ "step": 77000
1168
+ },
1169
+ {
1170
+ "epoch": 10.22292573539111,
1171
+ "grad_norm": 0.61328125,
1172
+ "learning_rate": 0.0007955414852921778,
1173
+ "loss": 2.215,
1174
+ "step": 77500
1175
+ },
1176
+ {
1177
+ "epoch": 10.288880094974278,
1178
+ "grad_norm": 0.70703125,
1179
+ "learning_rate": 0.0007942223981005145,
1180
+ "loss": 2.222,
1181
+ "step": 78000
1182
+ },
1183
+ {
1184
+ "epoch": 10.354834454557446,
1185
+ "grad_norm": 0.75,
1186
+ "learning_rate": 0.000792903310908851,
1187
+ "loss": 2.2226,
1188
+ "step": 78500
1189
+ },
1190
+ {
1191
+ "epoch": 10.420788814140614,
1192
+ "grad_norm": 0.6015625,
1193
+ "learning_rate": 0.0007915842237171877,
1194
+ "loss": 2.2292,
1195
+ "step": 79000
1196
+ },
1197
+ {
1198
+ "epoch": 10.486743173723783,
1199
+ "grad_norm": 0.609375,
1200
+ "learning_rate": 0.0007902651365255243,
1201
+ "loss": 2.2305,
1202
+ "step": 79500
1203
+ },
1204
+ {
1205
+ "epoch": 10.55269753330695,
1206
+ "grad_norm": 0.625,
1207
+ "learning_rate": 0.0007889460493338609,
1208
+ "loss": 2.2233,
1209
+ "step": 80000
1210
+ },
1211
+ {
1212
+ "epoch": 10.61865189289012,
1213
+ "grad_norm": 0.65625,
1214
+ "learning_rate": 0.0007876269621421977,
1215
+ "loss": 2.2378,
1216
+ "step": 80500
1217
+ },
1218
+ {
1219
+ "epoch": 10.684606252473289,
1220
+ "grad_norm": 0.66015625,
1221
+ "learning_rate": 0.0007863078749505342,
1222
+ "loss": 2.2273,
1223
+ "step": 81000
1224
+ },
1225
+ {
1226
+ "epoch": 10.750560612056457,
1227
+ "grad_norm": 0.69140625,
1228
+ "learning_rate": 0.0007849887877588709,
1229
+ "loss": 2.2363,
1230
+ "step": 81500
1231
+ },
1232
+ {
1233
+ "epoch": 10.816514971639625,
1234
+ "grad_norm": 0.63671875,
1235
+ "learning_rate": 0.0007836697005672075,
1236
+ "loss": 2.2386,
1237
+ "step": 82000
1238
+ },
1239
+ {
1240
+ "epoch": 10.882469331222794,
1241
+ "grad_norm": 0.9453125,
1242
+ "learning_rate": 0.0007823506133755442,
1243
+ "loss": 2.2348,
1244
+ "step": 82500
1245
+ },
1246
+ {
1247
+ "epoch": 10.948423690805962,
1248
+ "grad_norm": 0.64453125,
1249
+ "learning_rate": 0.0007810315261838807,
1250
+ "loss": 2.2359,
1251
+ "step": 83000
1252
+ },
1253
+ {
1254
+ "epoch": 11.0,
1255
+ "eval_loss": 2.4718410968780518,
1256
+ "eval_runtime": 4.0713,
1257
+ "eval_samples_per_second": 491.242,
1258
+ "eval_steps_per_second": 1.965,
1259
+ "step": 83391
1260
+ },
1261
+ {
1262
+ "epoch": 11.01437805038913,
1263
+ "grad_norm": 0.6796875,
1264
+ "learning_rate": 0.0007797124389922175,
1265
+ "loss": 2.2184,
1266
+ "step": 83500
1267
+ },
1268
+ {
1269
+ "epoch": 11.0803324099723,
1270
+ "grad_norm": 0.6015625,
1271
+ "learning_rate": 0.000778393351800554,
1272
+ "loss": 2.1719,
1273
+ "step": 84000
1274
+ },
1275
+ {
1276
+ "epoch": 11.146286769555468,
1277
+ "grad_norm": 0.67578125,
1278
+ "learning_rate": 0.0007770742646088907,
1279
+ "loss": 2.1817,
1280
+ "step": 84500
1281
+ },
1282
+ {
1283
+ "epoch": 11.212241129138636,
1284
+ "grad_norm": 0.8125,
1285
+ "learning_rate": 0.0007757551774172273,
1286
+ "loss": 2.185,
1287
+ "step": 85000
1288
+ },
1289
+ {
1290
+ "epoch": 11.278195488721805,
1291
+ "grad_norm": 0.62890625,
1292
+ "learning_rate": 0.000774436090225564,
1293
+ "loss": 2.1905,
1294
+ "step": 85500
1295
+ },
1296
+ {
1297
+ "epoch": 11.344149848304973,
1298
+ "grad_norm": 0.86328125,
1299
+ "learning_rate": 0.0007731170030339005,
1300
+ "loss": 2.1953,
1301
+ "step": 86000
1302
+ },
1303
+ {
1304
+ "epoch": 11.410104207888141,
1305
+ "grad_norm": 0.6015625,
1306
+ "learning_rate": 0.0007717979158422373,
1307
+ "loss": 2.1873,
1308
+ "step": 86500
1309
+ },
1310
+ {
1311
+ "epoch": 11.47605856747131,
1312
+ "grad_norm": 0.63671875,
1313
+ "learning_rate": 0.0007704788286505738,
1314
+ "loss": 2.2014,
1315
+ "step": 87000
1316
+ },
1317
+ {
1318
+ "epoch": 11.54201292705448,
1319
+ "grad_norm": 0.703125,
1320
+ "learning_rate": 0.0007691597414589104,
1321
+ "loss": 2.2029,
1322
+ "step": 87500
1323
+ },
1324
+ {
1325
+ "epoch": 11.607967286637647,
1326
+ "grad_norm": 0.69140625,
1327
+ "learning_rate": 0.000767840654267247,
1328
+ "loss": 2.2041,
1329
+ "step": 88000
1330
+ },
1331
+ {
1332
+ "epoch": 11.673921646220816,
1333
+ "grad_norm": 0.59375,
1334
+ "learning_rate": 0.0007665215670755837,
1335
+ "loss": 2.2031,
1336
+ "step": 88500
1337
+ },
1338
+ {
1339
+ "epoch": 11.739876005803984,
1340
+ "grad_norm": 0.71875,
1341
+ "learning_rate": 0.0007652024798839203,
1342
+ "loss": 2.2059,
1343
+ "step": 89000
1344
+ },
1345
+ {
1346
+ "epoch": 11.805830365387152,
1347
+ "grad_norm": 0.65625,
1348
+ "learning_rate": 0.0007638833926922569,
1349
+ "loss": 2.21,
1350
+ "step": 89500
1351
+ },
1352
+ {
1353
+ "epoch": 11.87178472497032,
1354
+ "grad_norm": 0.6875,
1355
+ "learning_rate": 0.0007625643055005937,
1356
+ "loss": 2.2105,
1357
+ "step": 90000
1358
+ },
1359
+ {
1360
+ "epoch": 11.937739084553488,
1361
+ "grad_norm": 0.6015625,
1362
+ "learning_rate": 0.0007612452183089302,
1363
+ "loss": 2.2089,
1364
+ "step": 90500
1365
+ },
1366
+ {
1367
+ "epoch": 12.0,
1368
+ "eval_loss": 2.453855514526367,
1369
+ "eval_runtime": 3.8561,
1370
+ "eval_samples_per_second": 518.662,
1371
+ "eval_steps_per_second": 2.075,
1372
+ "step": 90972
1373
+ },
1374
+ {
1375
+ "epoch": 12.003693444136658,
1376
+ "grad_norm": 0.66796875,
1377
+ "learning_rate": 0.0007599261311172669,
1378
+ "loss": 2.2131,
1379
+ "step": 91000
1380
+ },
1381
+ {
1382
+ "epoch": 12.069647803719826,
1383
+ "grad_norm": 0.61328125,
1384
+ "learning_rate": 0.0007586070439256035,
1385
+ "loss": 2.1413,
1386
+ "step": 91500
1387
+ },
1388
+ {
1389
+ "epoch": 12.135602163302995,
1390
+ "grad_norm": 0.640625,
1391
+ "learning_rate": 0.0007572879567339402,
1392
+ "loss": 2.1529,
1393
+ "step": 92000
1394
+ },
1395
+ {
1396
+ "epoch": 12.201556522886163,
1397
+ "grad_norm": 0.609375,
1398
+ "learning_rate": 0.0007559688695422767,
1399
+ "loss": 2.1531,
1400
+ "step": 92500
1401
+ },
1402
+ {
1403
+ "epoch": 12.267510882469331,
1404
+ "grad_norm": 0.6640625,
1405
+ "learning_rate": 0.0007546497823506135,
1406
+ "loss": 2.1672,
1407
+ "step": 93000
1408
+ },
1409
+ {
1410
+ "epoch": 12.3334652420525,
1411
+ "grad_norm": 0.6484375,
1412
+ "learning_rate": 0.00075333069515895,
1413
+ "loss": 2.1653,
1414
+ "step": 93500
1415
+ },
1416
+ {
1417
+ "epoch": 12.399419601635667,
1418
+ "grad_norm": 0.62109375,
1419
+ "learning_rate": 0.0007520116079672867,
1420
+ "loss": 2.1711,
1421
+ "step": 94000
1422
+ },
1423
+ {
1424
+ "epoch": 12.465373961218837,
1425
+ "grad_norm": 0.59375,
1426
+ "learning_rate": 0.0007506925207756233,
1427
+ "loss": 2.1787,
1428
+ "step": 94500
1429
+ },
1430
+ {
1431
+ "epoch": 12.531328320802006,
1432
+ "grad_norm": 0.60546875,
1433
+ "learning_rate": 0.00074937343358396,
1434
+ "loss": 2.1737,
1435
+ "step": 95000
1436
+ },
1437
+ {
1438
+ "epoch": 12.597282680385174,
1439
+ "grad_norm": 0.65625,
1440
+ "learning_rate": 0.0007480543463922965,
1441
+ "loss": 2.1787,
1442
+ "step": 95500
1443
+ },
1444
+ {
1445
+ "epoch": 12.663237039968342,
1446
+ "grad_norm": 0.66015625,
1447
+ "learning_rate": 0.0007467352592006333,
1448
+ "loss": 2.1764,
1449
+ "step": 96000
1450
+ },
1451
+ {
1452
+ "epoch": 12.72919139955151,
1453
+ "grad_norm": 0.6484375,
1454
+ "learning_rate": 0.0007454161720089698,
1455
+ "loss": 2.1902,
1456
+ "step": 96500
1457
+ },
1458
+ {
1459
+ "epoch": 12.795145759134678,
1460
+ "grad_norm": 0.62109375,
1461
+ "learning_rate": 0.0007440970848173064,
1462
+ "loss": 2.1778,
1463
+ "step": 97000
1464
+ },
1465
+ {
1466
+ "epoch": 12.861100118717847,
1467
+ "grad_norm": 0.61328125,
1468
+ "learning_rate": 0.000742777997625643,
1469
+ "loss": 2.1791,
1470
+ "step": 97500
1471
+ },
1472
+ {
1473
+ "epoch": 12.927054478301017,
1474
+ "grad_norm": 0.875,
1475
+ "learning_rate": 0.0007414589104339797,
1476
+ "loss": 2.194,
1477
+ "step": 98000
1478
+ },
1479
+ {
1480
+ "epoch": 12.993008837884185,
1481
+ "grad_norm": 0.62890625,
1482
+ "learning_rate": 0.0007401398232423163,
1483
+ "loss": 2.1869,
1484
+ "step": 98500
1485
+ },
1486
+ {
1487
+ "epoch": 13.0,
1488
+ "eval_loss": 2.4518699645996094,
1489
+ "eval_runtime": 4.8206,
1490
+ "eval_samples_per_second": 414.883,
1491
+ "eval_steps_per_second": 1.66,
1492
+ "step": 98553
1493
+ },
1494
+ {
1495
+ "epoch": 13.058963197467353,
1496
+ "grad_norm": 0.65234375,
1497
+ "learning_rate": 0.0007388207360506529,
1498
+ "loss": 2.1188,
1499
+ "step": 99000
1500
+ },
1501
+ {
1502
+ "epoch": 13.124917557050521,
1503
+ "grad_norm": 0.640625,
1504
+ "learning_rate": 0.0007375016488589896,
1505
+ "loss": 2.1277,
1506
+ "step": 99500
1507
+ },
1508
+ {
1509
+ "epoch": 13.19087191663369,
1510
+ "grad_norm": 0.734375,
1511
+ "learning_rate": 0.0007361825616673262,
1512
+ "loss": 2.139,
1513
+ "step": 100000
1514
+ },
1515
+ {
1516
+ "epoch": 13.256826276216858,
1517
+ "grad_norm": 0.6796875,
1518
+ "learning_rate": 0.0007348634744756628,
1519
+ "loss": 2.1469,
1520
+ "step": 100500
1521
+ },
1522
+ {
1523
+ "epoch": 13.322780635800026,
1524
+ "grad_norm": 0.67578125,
1525
+ "learning_rate": 0.0007335443872839995,
1526
+ "loss": 2.1406,
1527
+ "step": 101000
1528
+ },
1529
+ {
1530
+ "epoch": 13.388734995383194,
1531
+ "grad_norm": 0.6640625,
1532
+ "learning_rate": 0.0007322253000923362,
1533
+ "loss": 2.1448,
1534
+ "step": 101500
1535
+ },
1536
+ {
1537
+ "epoch": 13.454689354966364,
1538
+ "grad_norm": 0.67578125,
1539
+ "learning_rate": 0.0007309062129006727,
1540
+ "loss": 2.1424,
1541
+ "step": 102000
1542
+ },
1543
+ {
1544
+ "epoch": 13.520643714549532,
1545
+ "grad_norm": 0.66796875,
1546
+ "learning_rate": 0.0007295871257090095,
1547
+ "loss": 2.1505,
1548
+ "step": 102500
1549
+ },
1550
+ {
1551
+ "epoch": 13.5865980741327,
1552
+ "grad_norm": 0.7578125,
1553
+ "learning_rate": 0.000728268038517346,
1554
+ "loss": 2.1533,
1555
+ "step": 103000
1556
+ },
1557
+ {
1558
+ "epoch": 13.652552433715869,
1559
+ "grad_norm": 0.671875,
1560
+ "learning_rate": 0.0007269489513256827,
1561
+ "loss": 2.1655,
1562
+ "step": 103500
1563
+ },
1564
+ {
1565
+ "epoch": 13.718506793299037,
1566
+ "grad_norm": 0.609375,
1567
+ "learning_rate": 0.0007256298641340193,
1568
+ "loss": 2.1599,
1569
+ "step": 104000
1570
+ },
1571
+ {
1572
+ "epoch": 13.784461152882205,
1573
+ "grad_norm": 0.59765625,
1574
+ "learning_rate": 0.000724310776942356,
1575
+ "loss": 2.1582,
1576
+ "step": 104500
1577
+ },
1578
+ {
1579
+ "epoch": 13.850415512465373,
1580
+ "grad_norm": 0.62890625,
1581
+ "learning_rate": 0.0007229916897506925,
1582
+ "loss": 2.1664,
1583
+ "step": 105000
1584
+ },
1585
+ {
1586
+ "epoch": 13.916369872048543,
1587
+ "grad_norm": 0.65625,
1588
+ "learning_rate": 0.0007216726025590291,
1589
+ "loss": 2.1638,
1590
+ "step": 105500
1591
+ },
1592
+ {
1593
+ "epoch": 13.982324231631711,
1594
+ "grad_norm": 0.7109375,
1595
+ "learning_rate": 0.0007203535153673658,
1596
+ "loss": 2.1643,
1597
+ "step": 106000
1598
+ },
1599
+ {
1600
+ "epoch": 14.0,
1601
+ "eval_loss": 2.442619562149048,
1602
+ "eval_runtime": 5.7628,
1603
+ "eval_samples_per_second": 347.051,
1604
+ "eval_steps_per_second": 1.388,
1605
+ "step": 106134
1606
+ },
1607
+ {
1608
+ "epoch": 14.04827859121488,
1609
+ "grad_norm": 0.68359375,
1610
+ "learning_rate": 0.0007190344281757024,
1611
+ "loss": 2.1108,
1612
+ "step": 106500
1613
+ },
1614
+ {
1615
+ "epoch": 14.114232950798048,
1616
+ "grad_norm": 0.67578125,
1617
+ "learning_rate": 0.000717715340984039,
1618
+ "loss": 2.1076,
1619
+ "step": 107000
1620
+ },
1621
+ {
1622
+ "epoch": 14.180187310381216,
1623
+ "grad_norm": 0.6484375,
1624
+ "learning_rate": 0.0007163962537923757,
1625
+ "loss": 2.1202,
1626
+ "step": 107500
1627
+ },
1628
+ {
1629
+ "epoch": 14.246141669964384,
1630
+ "grad_norm": 0.625,
1631
+ "learning_rate": 0.0007150771666007123,
1632
+ "loss": 2.1221,
1633
+ "step": 108000
1634
+ },
1635
+ {
1636
+ "epoch": 14.312096029547552,
1637
+ "grad_norm": 0.625,
1638
+ "learning_rate": 0.0007137580794090489,
1639
+ "loss": 2.1262,
1640
+ "step": 108500
1641
+ },
1642
+ {
1643
+ "epoch": 14.378050389130722,
1644
+ "grad_norm": 0.62109375,
1645
+ "learning_rate": 0.0007124389922173856,
1646
+ "loss": 2.1376,
1647
+ "step": 109000
1648
+ },
1649
+ {
1650
+ "epoch": 14.44400474871389,
1651
+ "grad_norm": 0.65234375,
1652
+ "learning_rate": 0.0007111199050257222,
1653
+ "loss": 2.1344,
1654
+ "step": 109500
1655
+ },
1656
+ {
1657
+ "epoch": 14.509959108297059,
1658
+ "grad_norm": 0.65625,
1659
+ "learning_rate": 0.0007098008178340588,
1660
+ "loss": 2.1379,
1661
+ "step": 110000
1662
+ },
1663
+ {
1664
+ "epoch": 14.575913467880227,
1665
+ "grad_norm": 0.671875,
1666
+ "learning_rate": 0.0007084817306423955,
1667
+ "loss": 2.1354,
1668
+ "step": 110500
1669
+ },
1670
+ {
1671
+ "epoch": 14.641867827463395,
1672
+ "grad_norm": 0.66796875,
1673
+ "learning_rate": 0.0007071626434507322,
1674
+ "loss": 2.1339,
1675
+ "step": 111000
1676
+ },
1677
+ {
1678
+ "epoch": 14.707822187046563,
1679
+ "grad_norm": 0.65625,
1680
+ "learning_rate": 0.0007058435562590687,
1681
+ "loss": 2.1297,
1682
+ "step": 111500
1683
+ },
1684
+ {
1685
+ "epoch": 14.773776546629731,
1686
+ "grad_norm": 0.69921875,
1687
+ "learning_rate": 0.0007045244690674055,
1688
+ "loss": 2.1355,
1689
+ "step": 112000
1690
+ },
1691
+ {
1692
+ "epoch": 14.839730906212901,
1693
+ "grad_norm": 0.6640625,
1694
+ "learning_rate": 0.000703205381875742,
1695
+ "loss": 2.1385,
1696
+ "step": 112500
1697
+ },
1698
+ {
1699
+ "epoch": 14.90568526579607,
1700
+ "grad_norm": 0.83203125,
1701
+ "learning_rate": 0.0007018862946840787,
1702
+ "loss": 2.1399,
1703
+ "step": 113000
1704
+ },
1705
+ {
1706
+ "epoch": 14.971639625379238,
1707
+ "grad_norm": 0.62890625,
1708
+ "learning_rate": 0.0007005672074924153,
1709
+ "loss": 2.149,
1710
+ "step": 113500
1711
+ },
1712
+ {
1713
+ "epoch": 15.0,
1714
+ "eval_loss": 2.43693208694458,
1715
+ "eval_runtime": 4.8208,
1716
+ "eval_samples_per_second": 414.869,
1717
+ "eval_steps_per_second": 1.659,
1718
+ "step": 113715
1719
+ },
1720
+ {
1721
+ "epoch": 15.037593984962406,
1722
+ "grad_norm": 0.6171875,
1723
+ "learning_rate": 0.0006992481203007519,
1724
+ "loss": 2.1078,
1725
+ "step": 114000
1726
+ },
1727
+ {
1728
+ "epoch": 15.103548344545574,
1729
+ "grad_norm": 0.6875,
1730
+ "learning_rate": 0.0006979290331090885,
1731
+ "loss": 2.0965,
1732
+ "step": 114500
1733
+ },
1734
+ {
1735
+ "epoch": 15.169502704128742,
1736
+ "grad_norm": 0.66796875,
1737
+ "learning_rate": 0.0006966099459174251,
1738
+ "loss": 2.1037,
1739
+ "step": 115000
1740
+ },
1741
+ {
1742
+ "epoch": 15.23545706371191,
1743
+ "grad_norm": 0.6640625,
1744
+ "learning_rate": 0.0006952908587257618,
1745
+ "loss": 2.1074,
1746
+ "step": 115500
1747
+ },
1748
+ {
1749
+ "epoch": 15.30141142329508,
1750
+ "grad_norm": 0.734375,
1751
+ "learning_rate": 0.0006939717715340984,
1752
+ "loss": 2.1103,
1753
+ "step": 116000
1754
+ },
1755
+ {
1756
+ "epoch": 15.367365782878249,
1757
+ "grad_norm": 0.69140625,
1758
+ "learning_rate": 0.000692652684342435,
1759
+ "loss": 2.1089,
1760
+ "step": 116500
1761
+ },
1762
+ {
1763
+ "epoch": 15.433320142461417,
1764
+ "grad_norm": 0.6328125,
1765
+ "learning_rate": 0.0006913335971507717,
1766
+ "loss": 2.1088,
1767
+ "step": 117000
1768
+ },
1769
+ {
1770
+ "epoch": 15.499274502044585,
1771
+ "grad_norm": 0.6640625,
1772
+ "learning_rate": 0.0006900145099591083,
1773
+ "loss": 2.1112,
1774
+ "step": 117500
1775
+ },
1776
+ {
1777
+ "epoch": 15.565228861627753,
1778
+ "grad_norm": 0.609375,
1779
+ "learning_rate": 0.0006886954227674449,
1780
+ "loss": 2.1153,
1781
+ "step": 118000
1782
+ },
1783
+ {
1784
+ "epoch": 15.631183221210922,
1785
+ "grad_norm": 0.6328125,
1786
+ "learning_rate": 0.0006873763355757816,
1787
+ "loss": 2.1321,
1788
+ "step": 118500
1789
+ },
1790
+ {
1791
+ "epoch": 15.69713758079409,
1792
+ "grad_norm": 0.93359375,
1793
+ "learning_rate": 0.0006860572483841182,
1794
+ "loss": 2.1233,
1795
+ "step": 119000
1796
+ },
1797
+ {
1798
+ "epoch": 15.763091940377258,
1799
+ "grad_norm": 0.68359375,
1800
+ "learning_rate": 0.0006847381611924548,
1801
+ "loss": 2.1361,
1802
+ "step": 119500
1803
+ },
1804
+ {
1805
+ "epoch": 15.829046299960428,
1806
+ "grad_norm": 0.76171875,
1807
+ "learning_rate": 0.0006834190740007915,
1808
+ "loss": 2.1296,
1809
+ "step": 120000
1810
+ },
1811
+ {
1812
+ "epoch": 15.895000659543596,
1813
+ "grad_norm": 0.62890625,
1814
+ "learning_rate": 0.0006820999868091281,
1815
+ "loss": 2.1236,
1816
+ "step": 120500
1817
+ },
1818
+ {
1819
+ "epoch": 15.960955019126764,
1820
+ "grad_norm": 0.6484375,
1821
+ "learning_rate": 0.0006807808996174647,
1822
+ "loss": 2.1354,
1823
+ "step": 121000
1824
+ },
1825
+ {
1826
+ "epoch": 16.0,
1827
+ "eval_loss": 2.431525945663452,
1828
+ "eval_runtime": 5.3462,
1829
+ "eval_samples_per_second": 374.095,
1830
+ "eval_steps_per_second": 1.496,
1831
+ "step": 121296
1832
+ },
1833
+ {
1834
+ "epoch": 16.026909378709934,
1835
+ "grad_norm": 0.69921875,
1836
+ "learning_rate": 0.0006794618124258015,
1837
+ "loss": 2.1031,
1838
+ "step": 121500
1839
+ },
1840
+ {
1841
+ "epoch": 16.0928637382931,
1842
+ "grad_norm": 0.66015625,
1843
+ "learning_rate": 0.000678142725234138,
1844
+ "loss": 2.0653,
1845
+ "step": 122000
1846
+ },
1847
+ {
1848
+ "epoch": 16.15881809787627,
1849
+ "grad_norm": 0.6484375,
1850
+ "learning_rate": 0.0006768236380424745,
1851
+ "loss": 2.0796,
1852
+ "step": 122500
1853
+ },
1854
+ {
1855
+ "epoch": 16.224772457459437,
1856
+ "grad_norm": 0.796875,
1857
+ "learning_rate": 0.0006755045508508113,
1858
+ "loss": 2.0916,
1859
+ "step": 123000
1860
+ },
1861
+ {
1862
+ "epoch": 16.290726817042607,
1863
+ "grad_norm": 0.6796875,
1864
+ "learning_rate": 0.0006741854636591479,
1865
+ "loss": 2.0888,
1866
+ "step": 123500
1867
+ },
1868
+ {
1869
+ "epoch": 16.356681176625774,
1870
+ "grad_norm": 0.62890625,
1871
+ "learning_rate": 0.0006728663764674845,
1872
+ "loss": 2.0981,
1873
+ "step": 124000
1874
+ },
1875
+ {
1876
+ "epoch": 16.422635536208944,
1877
+ "grad_norm": 0.65625,
1878
+ "learning_rate": 0.0006715472892758211,
1879
+ "loss": 2.1025,
1880
+ "step": 124500
1881
+ },
1882
+ {
1883
+ "epoch": 16.488589895792114,
1884
+ "grad_norm": 0.640625,
1885
+ "learning_rate": 0.0006702282020841578,
1886
+ "loss": 2.0996,
1887
+ "step": 125000
1888
+ },
1889
+ {
1890
+ "epoch": 16.55454425537528,
1891
+ "grad_norm": 0.6640625,
1892
+ "learning_rate": 0.0006689091148924944,
1893
+ "loss": 2.103,
1894
+ "step": 125500
1895
+ },
1896
+ {
1897
+ "epoch": 16.62049861495845,
1898
+ "grad_norm": 0.73828125,
1899
+ "learning_rate": 0.000667590027700831,
1900
+ "loss": 2.1105,
1901
+ "step": 126000
1902
+ },
1903
+ {
1904
+ "epoch": 16.686452974541616,
1905
+ "grad_norm": 0.6796875,
1906
+ "learning_rate": 0.0006662709405091677,
1907
+ "loss": 2.1138,
1908
+ "step": 126500
1909
+ },
1910
+ {
1911
+ "epoch": 16.752407334124786,
1912
+ "grad_norm": 0.65625,
1913
+ "learning_rate": 0.0006649518533175043,
1914
+ "loss": 2.1095,
1915
+ "step": 127000
1916
+ },
1917
+ {
1918
+ "epoch": 16.818361693707953,
1919
+ "grad_norm": 0.640625,
1920
+ "learning_rate": 0.0006636327661258409,
1921
+ "loss": 2.1114,
1922
+ "step": 127500
1923
+ },
1924
+ {
1925
+ "epoch": 16.884316053291123,
1926
+ "grad_norm": 0.7421875,
1927
+ "learning_rate": 0.0006623136789341776,
1928
+ "loss": 2.1012,
1929
+ "step": 128000
1930
+ },
1931
+ {
1932
+ "epoch": 16.950270412874293,
1933
+ "grad_norm": 0.69140625,
1934
+ "learning_rate": 0.0006609945917425142,
1935
+ "loss": 2.109,
1936
+ "step": 128500
1937
+ },
1938
+ {
1939
+ "epoch": 17.0,
1940
+ "eval_loss": 2.4235010147094727,
1941
+ "eval_runtime": 5.2236,
1942
+ "eval_samples_per_second": 382.874,
1943
+ "eval_steps_per_second": 1.531,
1944
+ "step": 128877
1945
+ },
1946
+ {
1947
+ "epoch": 17.01622477245746,
1948
+ "grad_norm": 0.6640625,
1949
+ "learning_rate": 0.0006596755045508508,
1950
+ "loss": 2.0946,
1951
+ "step": 129000
1952
+ },
1953
+ {
1954
+ "epoch": 17.08217913204063,
1955
+ "grad_norm": 0.67578125,
1956
+ "learning_rate": 0.0006583564173591875,
1957
+ "loss": 2.0571,
1958
+ "step": 129500
1959
+ },
1960
+ {
1961
+ "epoch": 17.148133491623796,
1962
+ "grad_norm": 0.7578125,
1963
+ "learning_rate": 0.0006570373301675241,
1964
+ "loss": 2.0727,
1965
+ "step": 130000
1966
+ },
1967
+ {
1968
+ "epoch": 17.214087851206965,
1969
+ "grad_norm": 0.6484375,
1970
+ "learning_rate": 0.0006557182429758607,
1971
+ "loss": 2.0699,
1972
+ "step": 130500
1973
+ },
1974
+ {
1975
+ "epoch": 17.280042210790132,
1976
+ "grad_norm": 0.6953125,
1977
+ "learning_rate": 0.0006543991557841973,
1978
+ "loss": 2.0763,
1979
+ "step": 131000
1980
+ },
1981
+ {
1982
+ "epoch": 17.345996570373302,
1983
+ "grad_norm": 0.7265625,
1984
+ "learning_rate": 0.000653080068592534,
1985
+ "loss": 2.0804,
1986
+ "step": 131500
1987
+ },
1988
+ {
1989
+ "epoch": 17.411950929956472,
1990
+ "grad_norm": 0.69140625,
1991
+ "learning_rate": 0.0006517609814008705,
1992
+ "loss": 2.0832,
1993
+ "step": 132000
1994
+ },
1995
+ {
1996
+ "epoch": 17.47790528953964,
1997
+ "grad_norm": 0.67578125,
1998
+ "learning_rate": 0.0006504418942092073,
1999
+ "loss": 2.0821,
2000
+ "step": 132500
2001
+ },
2002
+ {
2003
+ "epoch": 17.54385964912281,
2004
+ "grad_norm": 0.6875,
2005
+ "learning_rate": 0.0006491228070175439,
2006
+ "loss": 2.0821,
2007
+ "step": 133000
2008
+ },
2009
+ {
2010
+ "epoch": 17.609814008705975,
2011
+ "grad_norm": 0.7109375,
2012
+ "learning_rate": 0.0006478037198258805,
2013
+ "loss": 2.0886,
2014
+ "step": 133500
2015
+ },
2016
+ {
2017
+ "epoch": 17.675768368289145,
2018
+ "grad_norm": 0.66015625,
2019
+ "learning_rate": 0.000646484632634217,
2020
+ "loss": 2.0945,
2021
+ "step": 134000
2022
+ },
2023
+ {
2024
+ "epoch": 17.74172272787231,
2025
+ "grad_norm": 0.64453125,
2026
+ "learning_rate": 0.0006451655454425538,
2027
+ "loss": 2.0918,
2028
+ "step": 134500
2029
+ },
2030
+ {
2031
+ "epoch": 17.80767708745548,
2032
+ "grad_norm": 0.72265625,
2033
+ "learning_rate": 0.0006438464582508904,
2034
+ "loss": 2.099,
2035
+ "step": 135000
2036
+ },
2037
+ {
2038
+ "epoch": 17.873631447038647,
2039
+ "grad_norm": 0.68359375,
2040
+ "learning_rate": 0.000642527371059227,
2041
+ "loss": 2.097,
2042
+ "step": 135500
2043
+ },
2044
+ {
2045
+ "epoch": 17.939585806621817,
2046
+ "grad_norm": 0.68359375,
2047
+ "learning_rate": 0.0006412082838675637,
2048
+ "loss": 2.0928,
2049
+ "step": 136000
2050
+ },
2051
+ {
2052
+ "epoch": 18.0,
2053
+ "eval_loss": 2.4190404415130615,
2054
+ "eval_runtime": 4.6336,
2055
+ "eval_samples_per_second": 431.626,
2056
+ "eval_steps_per_second": 1.727,
2057
+ "step": 136458
2058
+ }
2059
+ ],
2060
+ "logging_steps": 500,
2061
+ "max_steps": 379050,
2062
+ "num_input_tokens_seen": 0,
2063
+ "num_train_epochs": 50,
2064
+ "save_steps": 500,
2065
+ "stateful_callbacks": {
2066
+ "EarlyStoppingCallback": {
2067
+ "args": {
2068
+ "early_stopping_patience": 3,
2069
+ "early_stopping_threshold": 0.0
2070
+ },
2071
+ "attributes": {
2072
+ "early_stopping_patience_counter": 0
2073
+ }
2074
+ },
2075
+ "TrainerControl": {
2076
+ "args": {
2077
+ "should_epoch_stop": false,
2078
+ "should_evaluate": false,
2079
+ "should_log": false,
2080
+ "should_save": true,
2081
+ "should_training_stop": false
2082
+ },
2083
+ "attributes": {}
2084
+ }
2085
+ },
2086
+ "total_flos": 2.8932082563541893e+18,
2087
+ "train_batch_size": 256,
2088
+ "trial_name": null,
2089
+ "trial_params": null
2090
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eef93778f4df28bcb4b6b3504e3fda994c342be326fb8a4837a1a8cbf8b52b8a
3
+ size 5112