semran1 committed on
Commit
4ca5b93
·
verified ·
1 Parent(s): ded9bf5

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "pretrain_anneal/checkpoint-16912",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 1,
9
+ "eos_token_id": 2,
10
+ "head_dim": 96,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 1536,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 4096,
15
+ "max_position_embeddings": 8192,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 16,
19
+ "num_hidden_layers": 16,
20
+ "num_key_value_heads": 8,
21
+ "pad_token_id": 0,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-05,
24
+ "rope_scaling": null,
25
+ "rope_theta": 100000,
26
+ "sliding_window": null,
27
+ "tie_word_embeddings": false,
28
+ "torch_dtype": "bfloat16",
29
+ "transformers_version": "4.44.2",
30
+ "use_cache": true,
31
+ "vocab_size": 32000
32
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.44.2"
7
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2f543445fee4a911cf2f9559e152c15257f621dc69316a6fded9b4422f7270d
3
+ size 3519556840
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0c9995ff07baed3c0275b29bb5a07431bbc176b099582b7b5056065b277f538
3
+ size 3324071674
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:956c0353592a48f9d490d3289718276a14d3df1265878ef47b3b09958cb9577f
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0f0d9bac4f62dd1b5b2d75cd9a9eec01eca37b01cf938c058f2db3b929bbe13
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<s>",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "legacy": true,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "sp_model_kwargs": {},
39
+ "spaces_between_special_tokens": false,
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
trainer_state.json ADDED
@@ -0,0 +1,739 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
+ "eval_steps": 2000,
6
+ "global_step": 80,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "crossentropy": 2.487246513366699,
13
+ "epoch": 0.125,
14
+ "grad_norm": 0.03208793327212334,
15
+ "learning_rate": 0.001,
16
+ "loss": 2.4872,
17
+ "step": 1
18
+ },
19
+ {
20
+ "crossentropy": 2.379896402359009,
21
+ "epoch": 0.25,
22
+ "grad_norm": 0.03239322826266289,
23
+ "learning_rate": 0.002,
24
+ "loss": 2.3799,
25
+ "step": 2
26
+ },
27
+ {
28
+ "crossentropy": 2.5010085105895996,
29
+ "epoch": 0.375,
30
+ "grad_norm": 0.03320762887597084,
31
+ "learning_rate": 0.003,
32
+ "loss": 2.501,
33
+ "step": 3
34
+ },
35
+ {
36
+ "crossentropy": 2.688127040863037,
37
+ "epoch": 0.5,
38
+ "grad_norm": 0.03224330395460129,
39
+ "learning_rate": 0.004,
40
+ "loss": 2.6881,
41
+ "step": 4
42
+ },
43
+ {
44
+ "crossentropy": 2.4009199142456055,
45
+ "epoch": 0.625,
46
+ "grad_norm": 0.029966533184051514,
47
+ "learning_rate": 0.005,
48
+ "loss": 2.4009,
49
+ "step": 5
50
+ },
51
+ {
52
+ "crossentropy": 2.474385976791382,
53
+ "epoch": 0.75,
54
+ "grad_norm": 0.030283687636256218,
55
+ "learning_rate": 0.006,
56
+ "loss": 2.4744,
57
+ "step": 6
58
+ },
59
+ {
60
+ "crossentropy": 2.5291025638580322,
61
+ "epoch": 0.875,
62
+ "grad_norm": 0.03127186745405197,
63
+ "learning_rate": 0.006999999999999999,
64
+ "loss": 2.5291,
65
+ "step": 7
66
+ },
67
+ {
68
+ "crossentropy": 2.3309171199798584,
69
+ "epoch": 1.0,
70
+ "grad_norm": 0.030661335214972496,
71
+ "learning_rate": 0.008,
72
+ "loss": 2.3309,
73
+ "step": 8
74
+ },
75
+ {
76
+ "crossentropy": 2.4654123783111572,
77
+ "epoch": 1.125,
78
+ "grad_norm": 0.029608242213726044,
79
+ "learning_rate": 0.009000000000000001,
80
+ "loss": 2.4654,
81
+ "step": 9
82
+ },
83
+ {
84
+ "crossentropy": 2.468688488006592,
85
+ "epoch": 1.25,
86
+ "grad_norm": 0.03335599973797798,
87
+ "learning_rate": 0.01,
88
+ "loss": 2.4687,
89
+ "step": 10
90
+ },
91
+ {
92
+ "crossentropy": 2.3773512840270996,
93
+ "epoch": 1.375,
94
+ "grad_norm": 0.031801655888557434,
95
+ "learning_rate": 0.009994965332706574,
96
+ "loss": 2.3774,
97
+ "step": 11
98
+ },
99
+ {
100
+ "crossentropy": 2.5056920051574707,
101
+ "epoch": 1.5,
102
+ "grad_norm": 0.03860826417803764,
103
+ "learning_rate": 0.009979871469976196,
104
+ "loss": 2.5057,
105
+ "step": 12
106
+ },
107
+ {
108
+ "crossentropy": 2.4569733142852783,
109
+ "epoch": 1.625,
110
+ "grad_norm": 0.03568004071712494,
111
+ "learning_rate": 0.009954748808839673,
112
+ "loss": 2.457,
113
+ "step": 13
114
+ },
115
+ {
116
+ "crossentropy": 2.4127414226531982,
117
+ "epoch": 1.75,
118
+ "grad_norm": 0.03557576611638069,
119
+ "learning_rate": 0.009919647942993149,
120
+ "loss": 2.4127,
121
+ "step": 14
122
+ },
123
+ {
124
+ "crossentropy": 2.4089269638061523,
125
+ "epoch": 1.875,
126
+ "grad_norm": 0.0321757011115551,
127
+ "grad_norm_var": 5.793923908080815e-06,
128
+ "learning_rate": 0.009874639560909117,
129
+ "loss": 2.4089,
130
+ "step": 15
131
+ },
132
+ {
133
+ "crossentropy": 2.385178804397583,
134
+ "epoch": 2.0,
135
+ "grad_norm": 0.033555347472429276,
136
+ "grad_norm_var": 5.852987523136916e-06,
137
+ "learning_rate": 0.009819814303479266,
138
+ "loss": 2.3852,
139
+ "step": 16
140
+ },
141
+ {
142
+ "crossentropy": 2.4342517852783203,
143
+ "epoch": 2.125,
144
+ "grad_norm": 0.034557584673166275,
145
+ "grad_norm_var": 6.127065953599033e-06,
146
+ "learning_rate": 0.009755282581475769,
147
+ "loss": 2.4343,
148
+ "step": 17
149
+ },
150
+ {
151
+ "crossentropy": 2.4237754344940186,
152
+ "epoch": 2.25,
153
+ "grad_norm": 0.03371824324131012,
154
+ "grad_norm_var": 6.168768579761655e-06,
155
+ "learning_rate": 0.009681174353198686,
156
+ "loss": 2.4238,
157
+ "step": 18
158
+ },
159
+ {
160
+ "crossentropy": 2.4231510162353516,
161
+ "epoch": 2.375,
162
+ "grad_norm": 0.02907504141330719,
163
+ "grad_norm_var": 6.557002911025779e-06,
164
+ "learning_rate": 0.009597638862757255,
165
+ "loss": 2.4232,
166
+ "step": 19
167
+ },
168
+ {
169
+ "crossentropy": 2.3850157260894775,
170
+ "epoch": 2.5,
171
+ "grad_norm": 0.03278205543756485,
172
+ "grad_norm_var": 6.122522196004565e-06,
173
+ "learning_rate": 0.009504844339512096,
174
+ "loss": 2.385,
175
+ "step": 20
176
+ },
177
+ {
178
+ "crossentropy": 2.3287932872772217,
179
+ "epoch": 2.625,
180
+ "grad_norm": 0.03301858901977539,
181
+ "grad_norm_var": 5.930476905840337e-06,
182
+ "learning_rate": 0.00940297765928369,
183
+ "loss": 2.3288,
184
+ "step": 21
185
+ },
186
+ {
187
+ "crossentropy": 2.3058865070343018,
188
+ "epoch": 2.75,
189
+ "grad_norm": 0.03293571248650551,
190
+ "grad_norm_var": 5.5371734660507875e-06,
191
+ "learning_rate": 0.009292243968009331,
192
+ "loss": 2.3059,
193
+ "step": 22
194
+ },
195
+ {
196
+ "crossentropy": 2.2683637142181396,
197
+ "epoch": 2.875,
198
+ "grad_norm": 0.03541896864771843,
199
+ "grad_norm_var": 5.362674881745298e-06,
200
+ "learning_rate": 0.009172866268606514,
201
+ "loss": 2.2684,
202
+ "step": 23
203
+ },
204
+ {
205
+ "crossentropy": 2.411990165710449,
206
+ "epoch": 3.0,
207
+ "grad_norm": 0.03502194955945015,
208
+ "grad_norm_var": 4.411311488799798e-06,
209
+ "learning_rate": 0.009045084971874737,
210
+ "loss": 2.412,
211
+ "step": 24
212
+ },
213
+ {
214
+ "crossentropy": 2.2515363693237305,
215
+ "epoch": 3.125,
216
+ "grad_norm": 0.03379856050014496,
217
+ "grad_norm_var": 4.1744788750110175e-06,
218
+ "learning_rate": 0.008909157412340149,
219
+ "loss": 2.2515,
220
+ "step": 25
221
+ },
222
+ {
223
+ "crossentropy": 2.3486626148223877,
224
+ "epoch": 3.25,
225
+ "grad_norm": 0.034475792199373245,
226
+ "grad_norm_var": 2.7198637048790144e-06,
227
+ "learning_rate": 0.008765357330018056,
228
+ "loss": 2.3487,
229
+ "step": 26
230
+ },
231
+ {
232
+ "crossentropy": 2.379669666290283,
233
+ "epoch": 3.375,
234
+ "grad_norm": 0.03319563344120979,
235
+ "grad_norm_var": 2.4738877727155267e-06,
236
+ "learning_rate": 0.008613974319136958,
237
+ "loss": 2.3797,
238
+ "step": 27
239
+ },
240
+ {
241
+ "crossentropy": 2.3640384674072266,
242
+ "epoch": 3.5,
243
+ "grad_norm": 0.03426358476281166,
244
+ "grad_norm_var": 2.2389126654258237e-06,
245
+ "learning_rate": 0.008455313244934324,
246
+ "loss": 2.364,
247
+ "step": 28
248
+ },
249
+ {
250
+ "crossentropy": 2.324143409729004,
251
+ "epoch": 3.625,
252
+ "grad_norm": 0.0358288437128067,
253
+ "grad_norm_var": 2.410602035355508e-06,
254
+ "learning_rate": 0.008289693629698563,
255
+ "loss": 2.3241,
256
+ "step": 29
257
+ },
258
+ {
259
+ "crossentropy": 2.159423828125,
260
+ "epoch": 3.75,
261
+ "grad_norm": 0.031966786831617355,
262
+ "grad_norm_var": 2.612506091834166e-06,
263
+ "learning_rate": 0.008117449009293669,
264
+ "loss": 2.1594,
265
+ "step": 30
266
+ },
267
+ {
268
+ "crossentropy": 2.2669661045074463,
269
+ "epoch": 3.875,
270
+ "grad_norm": 0.0339609794318676,
271
+ "grad_norm_var": 2.61687730451927e-06,
272
+ "learning_rate": 0.007938926261462366,
273
+ "loss": 2.267,
274
+ "step": 31
275
+ },
276
+ {
277
+ "crossentropy": 2.482945680618286,
278
+ "epoch": 4.0,
279
+ "grad_norm": 0.03357086703181267,
280
+ "grad_norm_var": 2.5635888162410063e-06,
281
+ "learning_rate": 0.007754484907260513,
282
+ "loss": 2.4829,
283
+ "step": 32
284
+ },
285
+ {
286
+ "crossentropy": 2.1757168769836426,
287
+ "epoch": 4.125,
288
+ "grad_norm": 0.032582979649305344,
289
+ "grad_norm_var": 1.2068945133852464e-06,
290
+ "learning_rate": 0.007564496387029531,
291
+ "loss": 2.1757,
292
+ "step": 33
293
+ },
294
+ {
295
+ "crossentropy": 2.4300379753112793,
296
+ "epoch": 4.25,
297
+ "grad_norm": 0.03353552147746086,
298
+ "grad_norm_var": 1.1362555840309259e-06,
299
+ "learning_rate": 0.007369343312364994,
300
+ "loss": 2.43,
301
+ "step": 34
302
+ },
303
+ {
304
+ "crossentropy": 2.1233766078948975,
305
+ "epoch": 4.375,
306
+ "grad_norm": 0.034830041229724884,
307
+ "grad_norm_var": 1.1319644129707703e-06,
308
+ "learning_rate": 0.007169418695587791,
309
+ "loss": 2.1234,
310
+ "step": 35
311
+ },
312
+ {
313
+ "crossentropy": 2.2619526386260986,
314
+ "epoch": 4.5,
315
+ "grad_norm": 0.033126723021268845,
316
+ "grad_norm_var": 1.1071727437842251e-06,
317
+ "learning_rate": 0.006965125158269619,
318
+ "loss": 2.262,
319
+ "step": 36
320
+ },
321
+ {
322
+ "crossentropy": 2.3065433502197266,
323
+ "epoch": 4.625,
324
+ "grad_norm": 0.0348593033850193,
325
+ "grad_norm_var": 1.0216560744425243e-06,
326
+ "learning_rate": 0.0067568741204067145,
327
+ "loss": 2.3065,
328
+ "step": 37
329
+ },
330
+ {
331
+ "crossentropy": 2.2902674674987793,
332
+ "epoch": 4.75,
333
+ "grad_norm": 0.0360955074429512,
334
+ "grad_norm_var": 1.2434575549111667e-06,
335
+ "learning_rate": 0.006545084971874737,
336
+ "loss": 2.2903,
337
+ "step": 38
338
+ },
339
+ {
340
+ "crossentropy": 2.2587778568267822,
341
+ "epoch": 4.875,
342
+ "grad_norm": 0.03467360511422157,
343
+ "grad_norm_var": 1.2055615432713293e-06,
344
+ "learning_rate": 0.006330184227833375,
345
+ "loss": 2.2588,
346
+ "step": 39
347
+ },
348
+ {
349
+ "crossentropy": 2.3978285789489746,
350
+ "epoch": 5.0,
351
+ "grad_norm": 0.035668086260557175,
352
+ "grad_norm_var": 1.3685657271088221e-06,
353
+ "learning_rate": 0.006112604669781572,
354
+ "loss": 2.3978,
355
+ "step": 40
356
+ },
357
+ {
358
+ "crossentropy": 2.3564870357513428,
359
+ "epoch": 5.125,
360
+ "grad_norm": 0.0373300276696682,
361
+ "grad_norm_var": 2.0190065310690096e-06,
362
+ "learning_rate": 0.005892784473993183,
363
+ "loss": 2.3565,
364
+ "step": 41
365
+ },
366
+ {
367
+ "crossentropy": 2.202275514602661,
368
+ "epoch": 5.25,
369
+ "grad_norm": 0.03768543526530266,
370
+ "grad_norm_var": 2.6563097811015932e-06,
371
+ "learning_rate": 0.0056711663290882775,
372
+ "loss": 2.2023,
373
+ "step": 42
374
+ },
375
+ {
376
+ "crossentropy": 2.192146062850952,
377
+ "epoch": 5.375,
378
+ "grad_norm": 0.03134535253047943,
379
+ "grad_norm_var": 3.228640330968311e-06,
380
+ "learning_rate": 0.005448196544517168,
381
+ "loss": 2.1921,
382
+ "step": 43
383
+ },
384
+ {
385
+ "crossentropy": 2.2519092559814453,
386
+ "epoch": 5.5,
387
+ "grad_norm": 0.032612044364213943,
388
+ "grad_norm_var": 3.044945465432008e-06,
389
+ "learning_rate": 0.005224324151752576,
390
+ "loss": 2.2519,
391
+ "step": 44
392
+ },
393
+ {
394
+ "crossentropy": 2.3610141277313232,
395
+ "epoch": 5.625,
396
+ "grad_norm": 0.03532750904560089,
397
+ "grad_norm_var": 3.073519780112897e-06,
398
+ "learning_rate": 0.005,
399
+ "loss": 2.361,
400
+ "step": 45
401
+ },
402
+ {
403
+ "crossentropy": 2.2574493885040283,
404
+ "epoch": 5.75,
405
+ "grad_norm": 0.03538508713245392,
406
+ "grad_norm_var": 3.047191916038562e-06,
407
+ "learning_rate": 0.004775675848247427,
408
+ "loss": 2.2574,
409
+ "step": 46
410
+ },
411
+ {
412
+ "crossentropy": 2.170254707336426,
413
+ "epoch": 5.875,
414
+ "grad_norm": 0.03475534915924072,
415
+ "grad_norm_var": 2.9654755954788835e-06,
416
+ "learning_rate": 0.004551803455482833,
417
+ "loss": 2.1703,
418
+ "step": 47
419
+ },
420
+ {
421
+ "crossentropy": 2.2004218101501465,
422
+ "epoch": 6.0,
423
+ "grad_norm": 0.0375908762216568,
424
+ "grad_norm_var": 3.107626395358089e-06,
425
+ "learning_rate": 0.004328833670911724,
426
+ "loss": 2.2004,
427
+ "step": 48
428
+ },
429
+ {
430
+ "crossentropy": 2.162071943283081,
431
+ "epoch": 6.125,
432
+ "grad_norm": 0.031826432794332504,
433
+ "grad_norm_var": 4.072774386597716e-06,
434
+ "learning_rate": 0.004107215526006817,
435
+ "loss": 2.1621,
436
+ "step": 49
437
+ },
438
+ {
439
+ "crossentropy": 2.144589424133301,
440
+ "epoch": 6.25,
441
+ "grad_norm": 0.03268062323331833,
442
+ "grad_norm_var": 4.202360731680779e-06,
443
+ "learning_rate": 0.003887395330218428,
444
+ "loss": 2.1446,
445
+ "step": 50
446
+ },
447
+ {
448
+ "crossentropy": 2.202075481414795,
449
+ "epoch": 6.375,
450
+ "grad_norm": 0.03384561836719513,
451
+ "grad_norm_var": 4.294842472950198e-06,
452
+ "learning_rate": 0.003669815772166625,
453
+ "loss": 2.2021,
454
+ "step": 51
455
+ },
456
+ {
457
+ "crossentropy": 2.3169946670532227,
458
+ "epoch": 6.5,
459
+ "grad_norm": 0.03661832585930824,
460
+ "grad_norm_var": 4.387942230220839e-06,
461
+ "learning_rate": 0.003454915028125263,
462
+ "loss": 2.317,
463
+ "step": 52
464
+ },
465
+ {
466
+ "crossentropy": 2.191814422607422,
467
+ "epoch": 6.625,
468
+ "grad_norm": 0.03522748872637749,
469
+ "grad_norm_var": 4.380226970383122e-06,
470
+ "learning_rate": 0.003243125879593286,
471
+ "loss": 2.1918,
472
+ "step": 53
473
+ },
474
+ {
475
+ "crossentropy": 2.196298360824585,
476
+ "epoch": 6.75,
477
+ "grad_norm": 0.03524046018719673,
478
+ "grad_norm_var": 4.3576876356847075e-06,
479
+ "learning_rate": 0.0030348748417303823,
480
+ "loss": 2.1963,
481
+ "step": 54
482
+ },
483
+ {
484
+ "crossentropy": 2.2684872150421143,
485
+ "epoch": 6.875,
486
+ "grad_norm": 0.03640512377023697,
487
+ "grad_norm_var": 4.452811959714381e-06,
488
+ "learning_rate": 0.00283058130441221,
489
+ "loss": 2.2685,
490
+ "step": 55
491
+ },
492
+ {
493
+ "crossentropy": 2.3180689811706543,
494
+ "epoch": 7.0,
495
+ "grad_norm": 0.03479791432619095,
496
+ "grad_norm_var": 4.097831569832335e-06,
497
+ "learning_rate": 0.002630656687635007,
498
+ "loss": 2.3181,
499
+ "step": 56
500
+ },
501
+ {
502
+ "crossentropy": 2.201280117034912,
503
+ "epoch": 7.125,
504
+ "grad_norm": 0.03417252376675606,
505
+ "grad_norm_var": 2.7744502044233416e-06,
506
+ "learning_rate": 0.00243550361297047,
507
+ "loss": 2.2013,
508
+ "step": 57
509
+ },
510
+ {
511
+ "crossentropy": 2.3339715003967285,
512
+ "epoch": 7.25,
513
+ "grad_norm": 0.033313509076833725,
514
+ "grad_norm_var": 2.5884423837709467e-06,
515
+ "learning_rate": 0.002245515092739488,
516
+ "loss": 2.334,
517
+ "step": 58
518
+ },
519
+ {
520
+ "crossentropy": 2.1603586673736572,
521
+ "epoch": 7.375,
522
+ "grad_norm": 0.03365428000688553,
523
+ "grad_norm_var": 2.684439790351147e-06,
524
+ "learning_rate": 0.0020610737385376348,
525
+ "loss": 2.1604,
526
+ "step": 59
527
+ },
528
+ {
529
+ "crossentropy": 2.2999627590179443,
530
+ "epoch": 7.5,
531
+ "grad_norm": 0.034641556441783905,
532
+ "grad_norm_var": 2.6678187146791006e-06,
533
+ "learning_rate": 0.0018825509907063327,
534
+ "loss": 2.3,
535
+ "step": 60
536
+ },
537
+ {
538
+ "crossentropy": 2.1747264862060547,
539
+ "epoch": 7.625,
540
+ "grad_norm": 0.03396380692720413,
541
+ "grad_norm_var": 2.7140570016346197e-06,
542
+ "learning_rate": 0.001710306370301437,
543
+ "loss": 2.1747,
544
+ "step": 61
545
+ },
546
+ {
547
+ "crossentropy": 2.120774269104004,
548
+ "epoch": 7.75,
549
+ "grad_norm": 0.03624221310019493,
550
+ "grad_norm_var": 2.3210148057607623e-06,
551
+ "learning_rate": 0.0015446867550656768,
552
+ "loss": 2.1208,
553
+ "step": 62
554
+ },
555
+ {
556
+ "crossentropy": 2.2385036945343018,
557
+ "epoch": 7.875,
558
+ "grad_norm": 0.034913428127765656,
559
+ "grad_norm_var": 1.732991609125894e-06,
560
+ "learning_rate": 0.0013860256808630427,
561
+ "loss": 2.2385,
562
+ "step": 63
563
+ },
564
+ {
565
+ "crossentropy": 2.1571521759033203,
566
+ "epoch": 8.0,
567
+ "grad_norm": 0.03277244418859482,
568
+ "grad_norm_var": 1.4490052253732697e-06,
569
+ "learning_rate": 0.0012346426699819458,
570
+ "loss": 2.1572,
571
+ "step": 64
572
+ },
573
+ {
574
+ "crossentropy": 2.2847859859466553,
575
+ "epoch": 8.125,
576
+ "grad_norm": 0.033221788704395294,
577
+ "grad_norm_var": 1.512194472239048e-06,
578
+ "learning_rate": 0.001090842587659851,
579
+ "loss": 2.2848,
580
+ "step": 65
581
+ },
582
+ {
583
+ "crossentropy": 2.253898859024048,
584
+ "epoch": 8.25,
585
+ "grad_norm": 0.03283696994185448,
586
+ "grad_norm_var": 1.361639072858939e-06,
587
+ "learning_rate": 0.0009549150281252633,
588
+ "loss": 2.2539,
589
+ "step": 66
590
+ },
591
+ {
592
+ "crossentropy": 2.1402039527893066,
593
+ "epoch": 8.375,
594
+ "grad_norm": 0.03309963271021843,
595
+ "grad_norm_var": 1.3845661239702636e-06,
596
+ "learning_rate": 0.0008271337313934868,
597
+ "loss": 2.1402,
598
+ "step": 67
599
+ },
600
+ {
601
+ "crossentropy": 2.1381993293762207,
602
+ "epoch": 8.5,
603
+ "grad_norm": 0.03431132435798645,
604
+ "grad_norm_var": 1.3068839073976725e-06,
605
+ "learning_rate": 0.0007077560319906695,
606
+ "loss": 2.1382,
607
+ "step": 68
608
+ },
609
+ {
610
+ "crossentropy": 2.1610183715820312,
611
+ "epoch": 8.625,
612
+ "grad_norm": 0.03382508084177971,
613
+ "grad_norm_var": 9.367598844583584e-07,
614
+ "learning_rate": 0.00059702234071631,
615
+ "loss": 2.161,
616
+ "step": 69
617
+ },
618
+ {
619
+ "crossentropy": 2.2194838523864746,
620
+ "epoch": 8.75,
621
+ "grad_norm": 0.03652471676468849,
622
+ "grad_norm_var": 1.3163803696068672e-06,
623
+ "learning_rate": 0.0004951556604879049,
624
+ "loss": 2.2195,
625
+ "step": 70
626
+ },
627
+ {
628
+ "crossentropy": 2.179126262664795,
629
+ "epoch": 8.875,
630
+ "grad_norm": 0.03386425971984863,
631
+ "grad_norm_var": 1.2798076699575718e-06,
632
+ "learning_rate": 0.0004023611372427471,
633
+ "loss": 2.1791,
634
+ "step": 71
635
+ },
636
+ {
637
+ "crossentropy": 2.2535359859466553,
638
+ "epoch": 9.0,
639
+ "grad_norm": 0.032655857503414154,
640
+ "grad_norm_var": 1.3903296123838708e-06,
641
+ "learning_rate": 0.00031882564680131396,
642
+ "loss": 2.2535,
643
+ "step": 72
644
+ },
645
+ {
646
+ "crossentropy": 2.083571434020996,
647
+ "epoch": 9.125,
648
+ "grad_norm": 0.03222977742552757,
649
+ "grad_norm_var": 1.6381791075122573e-06,
650
+ "learning_rate": 0.00024471741852423234,
651
+ "loss": 2.0836,
652
+ "step": 73
653
+ },
654
+ {
655
+ "crossentropy": 2.1447954177856445,
656
+ "epoch": 9.25,
657
+ "grad_norm": 0.03370295464992523,
658
+ "grad_norm_var": 1.585818962655111e-06,
659
+ "learning_rate": 0.0001801856965207338,
660
+ "loss": 2.1448,
661
+ "step": 74
662
+ },
663
+ {
664
+ "crossentropy": 2.3297011852264404,
665
+ "epoch": 9.375,
666
+ "grad_norm": 0.03342713788151741,
667
+ "grad_norm_var": 1.5866984901570397e-06,
668
+ "learning_rate": 0.0001253604390908819,
669
+ "loss": 2.3297,
670
+ "step": 75
671
+ },
672
+ {
673
+ "crossentropy": 2.184936761856079,
674
+ "epoch": 9.5,
675
+ "grad_norm": 0.03379293903708458,
676
+ "grad_norm_var": 1.128480817529492e-06,
677
+ "learning_rate": 8.035205700685166e-05,
678
+ "loss": 2.1849,
679
+ "step": 76
680
+ },
681
+ {
682
+ "crossentropy": 2.244959592819214,
683
+ "epoch": 9.625,
684
+ "grad_norm": 0.03535815700888634,
685
+ "grad_norm_var": 1.2224064569112125e-06,
686
+ "learning_rate": 4.52511911603265e-05,
687
+ "loss": 2.245,
688
+ "step": 77
689
+ },
690
+ {
691
+ "crossentropy": 2.145529270172119,
692
+ "epoch": 9.75,
693
+ "grad_norm": 0.03338780626654625,
694
+ "grad_norm_var": 1.180987359381859e-06,
695
+ "learning_rate": 2.012853002380466e-05,
696
+ "loss": 2.1455,
697
+ "step": 78
698
+ },
699
+ {
700
+ "crossentropy": 2.282687187194824,
701
+ "epoch": 9.875,
702
+ "grad_norm": 0.03313542157411575,
703
+ "grad_norm_var": 1.1489689212500093e-06,
704
+ "learning_rate": 5.034667293427053e-06,
705
+ "loss": 2.2827,
706
+ "step": 79
707
+ },
708
+ {
709
+ "crossentropy": 2.1934750080108643,
710
+ "epoch": 10.0,
711
+ "grad_norm": 0.03277941048145294,
712
+ "grad_norm_var": 1.18509241838338e-06,
713
+ "learning_rate": 0.0,
714
+ "loss": 2.1935,
715
+ "step": 80
716
+ }
717
+ ],
718
+ "logging_steps": 1,
719
+ "max_steps": 80,
720
+ "num_input_tokens_seen": 0,
721
+ "num_train_epochs": 10,
722
+ "save_steps": 2000,
723
+ "stateful_callbacks": {
724
+ "TrainerControl": {
725
+ "args": {
726
+ "should_epoch_stop": false,
727
+ "should_evaluate": false,
728
+ "should_log": false,
729
+ "should_save": true,
730
+ "should_training_stop": true
731
+ },
732
+ "attributes": {}
733
+ }
734
+ },
735
+ "total_flos": 1.384645494177792e+16,
736
+ "train_batch_size": 16,
737
+ "trial_name": null,
738
+ "trial_params": null
739
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36525e45296e89abb811d54f8d48c25f84509486e0812ee034d7029aea3952d0
3
+ size 5304