SystemAdmin123 commited on
Commit
bbe6688
·
verified ·
1 Parent(s): 4460e89

Training in progress, step 40, checkpoint

Browse files
last-checkpoint/added_tokens.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "<|endoftext|>": 151643,
3
+ "<|im_end|>": 151645,
4
+ "<|im_start|>": 151644
5
+ }
last-checkpoint/config.json CHANGED
@@ -1,31 +1,28 @@
1
  {
2
- "_name_or_path": "fxmarty/small-llama-testing",
3
  "architectures": [
4
- "LlamaForCausalLM"
5
  ],
6
- "attention_bias": false,
7
  "attention_dropout": 0.0,
8
- "bos_token_id": 0,
9
- "eos_token_id": 2,
10
- "head_dim": 64,
11
  "hidden_act": "silu",
12
- "hidden_size": 256,
13
  "initializer_range": 0.02,
14
- "intermediate_size": 128,
15
- "max_position_embeddings": 2048,
16
- "mlp_bias": false,
17
- "model_type": "llama",
18
  "num_attention_heads": 4,
19
  "num_hidden_layers": 2,
20
- "num_key_value_heads": 4,
21
- "pad_token_id": -1,
22
- "pretraining_tp": 1,
23
  "rms_norm_eps": 1e-06,
24
  "rope_scaling": null,
25
- "rope_theta": 10000.0,
26
- "tie_word_embeddings": false,
 
27
  "torch_dtype": "bfloat16",
28
- "transformers_version": "4.46.0",
29
  "use_cache": false,
30
- "vocab_size": 32000
 
31
  }
 
1
  {
2
+ "_name_or_path": "peft-internal-testing/tiny-dummy-qwen2",
3
  "architectures": [
4
+ "Qwen2ForCausalLM"
5
  ],
 
6
  "attention_dropout": 0.0,
7
+ "eos_token_id": 151643,
 
 
8
  "hidden_act": "silu",
9
+ "hidden_size": 8,
10
  "initializer_range": 0.02,
11
+ "intermediate_size": 32,
12
+ "max_position_embeddings": 32768,
13
+ "max_window_layers": 21,
14
+ "model_type": "qwen2",
15
  "num_attention_heads": 4,
16
  "num_hidden_layers": 2,
17
+ "num_key_value_heads": 2,
 
 
18
  "rms_norm_eps": 1e-06,
19
  "rope_scaling": null,
20
+ "rope_theta": 1000000.0,
21
+ "sliding_window": null,
22
+ "tie_word_embeddings": true,
23
  "torch_dtype": "bfloat16",
24
+ "transformers_version": "4.48.1",
25
  "use_cache": false,
26
+ "use_sliding_window": false,
27
+ "vocab_size": 151646
28
  }
last-checkpoint/generation_config.json CHANGED
@@ -1,8 +1,7 @@
1
  {
2
  "_from_model_config": true,
3
- "bos_token_id": 0,
4
  "do_sample": true,
5
- "eos_token_id": 1,
6
- "pad_token_id": 0,
7
- "transformers_version": "4.46.0"
8
  }
 
1
  {
2
  "_from_model_config": true,
3
+ "bos_token_id": 151643,
4
  "do_sample": true,
5
+ "eos_token_id": 151643,
6
+ "transformers_version": "4.48.1"
 
7
  }
last-checkpoint/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3981e030b93d87183bf27dee61297bc699920b1d1063e2a14a6d844ffe232f8b
3
- size 34214640
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe161f5ecbc782395222bf2f386584a1d7eea980d6f5af667b405b40dc54ba49
3
+ size 2433024
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f9bbf306e4b6a97f05c3a91de8c636a98b058cb119c3bdee85b0cb98f796a16d
3
- size 34779282
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5eaaef8fa6904a05d7b00660e5c89634cc0fc7514af15647f3b2197f47ba53c2
3
+ size 2498406
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:491f63818c64ff6648de457327747921b2babcb32039fca8bf9c3307b28a7558
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0df10be429c2412198f2c4c684866a8c921cbd0d9ee4c865077476da07bda410
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ffddba77a8223104775eb3a1c056a7d75edc6bf860f477efa9250d5724119c2a
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b14d8363b71c1e824421bf14a513f0f951f1c6c9b9494dcedc75b3fa1fecea91
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd946489a63144cac84a6b6a74fc737d9b6756458719a76e2f70310c4ca7f8f9
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0d27a8e9aed69fdd3b729f8cbf6300af5e0e26e9226d0e2307b0cf80aff9030
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7b9d2ba99876c3fefd1687e4c0947b9787ec544cab12a64bdb6814ccefbfbfd3
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23de9e35ad029c7a59ff383920fe892474d8f295c2e2f82ec3e6f109f3f96960
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc2121f36b82fd0af2305e5cdccd3b5d0ed7e9605006dca6684c3e6d513aeddb
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50724f43561b5c511c4e37bb1b8ec7e620ff22332c4f39576452e56a9e02d18d
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:95bd7e70c78cf607370cf86fe76ce57495142b48bf65473af517bedfd74dc36d
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f8d108db37b78249e3b0beb22f6120e06fd369dd9fbaa19526994e040db5f9d
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:78dee00f79cb684b52a789dfffc13b50359ff9b6a646306b391a800ef6536778
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e62597393912992a3de2b36aa1663274b75b96f6525f5504e64377c5049c51e
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b8b731c550646d4b266437a563c6210a81eba271f5c46c3dc0af7053d7c4bd3
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b1f0a5c61bcbceb46a2a335e00a59c951081a94126a8ae4cc9a9102e012eb65
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:20e46998e3e088e3aec913e9faf45e0989d546f2ea882278b706c60253407ebc
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea54818dbc15bd81d4941b71d5e2806c555d1a733f768c00c5f1a670e0a08fb1
3
  size 1064
last-checkpoint/special_tokens_map.json CHANGED
@@ -1,30 +1,20 @@
1
  {
2
- "bos_token": {
3
- "content": "<s>",
4
- "lstrip": false,
5
- "normalized": true,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
  "eos_token": {
10
- "content": "</s>",
11
  "lstrip": false,
12
- "normalized": true,
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
  "pad_token": {
17
- "content": "</s>",
18
  "lstrip": false,
19
  "normalized": false,
20
  "rstrip": false,
21
  "single_word": false
22
- },
23
- "unk_token": {
24
- "content": "<unk>",
25
- "lstrip": false,
26
- "normalized": true,
27
- "rstrip": false,
28
- "single_word": false
29
  }
30
  }
 
1
  {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
 
 
 
6
  "eos_token": {
7
+ "content": "<|endoftext|>",
8
  "lstrip": false,
9
+ "normalized": false,
10
  "rstrip": false,
11
  "single_word": false
12
  },
13
  "pad_token": {
14
+ "content": "<|endoftext|>",
15
  "lstrip": false,
16
  "normalized": false,
17
  "rstrip": false,
18
  "single_word": false
 
 
 
 
 
 
 
19
  }
20
  }
last-checkpoint/tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/tokenizer_config.json CHANGED
@@ -1,26 +1,24 @@
1
  {
2
- "add_bos_token": true,
3
- "add_eos_token": false,
4
- "add_prefix_space": null,
5
  "added_tokens_decoder": {
6
- "0": {
7
- "content": "<unk>",
8
  "lstrip": false,
9
- "normalized": true,
10
  "rstrip": false,
11
  "single_word": false,
12
  "special": true
13
  },
14
- "1": {
15
- "content": "<s>",
16
  "lstrip": false,
17
- "normalized": true,
18
  "rstrip": false,
19
  "single_word": false,
20
  "special": true
21
  },
22
- "2": {
23
- "content": "</s>",
24
  "lstrip": false,
25
  "normalized": false,
26
  "rstrip": false,
@@ -28,16 +26,20 @@
28
  "special": true
29
  }
30
  },
31
- "bos_token": "<s>",
32
- "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '### Instruction: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ '### Response: ' + message['content'] + eos_token}}{% endif %}{% endfor %}",
 
 
 
 
33
  "clean_up_tokenization_spaces": false,
34
- "eos_token": "</s>",
35
- "legacy": true,
36
- "model_max_length": 2048,
37
- "pad_token": "</s>",
38
- "sp_model_kwargs": {},
39
- "tokenizer_class": "LlamaTokenizer",
40
- "unk_token": "<unk>",
41
- "use_default_system_prompt": false,
42
  "use_fast": true
43
  }
 
1
  {
2
+ "add_prefix_space": false,
 
 
3
  "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
  "lstrip": false,
7
+ "normalized": false,
8
  "rstrip": false,
9
  "single_word": false,
10
  "special": true
11
  },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
  "lstrip": false,
15
+ "normalized": false,
16
  "rstrip": false,
17
  "single_word": false,
18
  "special": true
19
  },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
  "lstrip": false,
23
  "normalized": false,
24
  "rstrip": false,
 
26
  "special": true
27
  }
28
  },
29
+ "additional_special_tokens": [
30
+ "<|im_start|>",
31
+ "<|im_end|>"
32
+ ],
33
+ "bos_token": null,
34
+ "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
35
  "clean_up_tokenization_spaces": false,
36
+ "eos_token": "<|endoftext|>",
37
+ "errors": "replace",
38
+ "extra_special_tokens": {},
39
+ "model_max_length": 32768,
40
+ "pad_token": "<|endoftext|>",
41
+ "split_special_tokens": false,
42
+ "tokenizer_class": "Qwen2Tokenizer",
43
+ "unk_token": null,
44
  "use_fast": true
45
  }
last-checkpoint/trainer_state.json CHANGED
@@ -1,1001 +1,70 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 10.0,
5
  "eval_steps": 20,
6
- "global_step": 890,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.011235955056179775,
13
- "eval_loss": 10.422812461853027,
14
- "eval_runtime": 1.3009,
15
- "eval_samples_per_second": 1154.612,
16
- "eval_steps_per_second": 7.687,
17
  "step": 1
18
  },
19
  {
20
- "epoch": 0.11235955056179775,
21
- "grad_norm": 1.5390625,
22
- "learning_rate": 7.692307692307693e-05,
23
- "loss": 10.4033,
24
  "step": 10
25
  },
26
  {
27
- "epoch": 0.2247191011235955,
28
- "grad_norm": 1.46875,
29
- "learning_rate": 0.00015384615384615385,
30
- "loss": 10.127,
31
  "step": 20
32
  },
33
  {
34
- "epoch": 0.2247191011235955,
35
- "eval_loss": 9.863175392150879,
36
- "eval_runtime": 1.4324,
37
- "eval_samples_per_second": 1048.618,
38
- "eval_steps_per_second": 6.981,
39
  "step": 20
40
  },
41
  {
42
- "epoch": 0.33707865168539325,
43
- "grad_norm": 1.5234375,
44
- "learning_rate": 0.00019998942319271077,
45
- "loss": 9.616,
46
  "step": 30
47
  },
48
  {
49
- "epoch": 0.449438202247191,
50
- "grad_norm": 1.3046875,
51
- "learning_rate": 0.00019987045980408906,
52
- "loss": 9.0393,
53
  "step": 40
54
  },
55
  {
56
- "epoch": 0.449438202247191,
57
- "eval_loss": 8.740312576293945,
58
- "eval_runtime": 1.4689,
59
- "eval_samples_per_second": 1022.538,
60
- "eval_steps_per_second": 6.808,
61
  "step": 40
62
- },
63
- {
64
- "epoch": 0.5617977528089888,
65
- "grad_norm": 1.9296875,
66
- "learning_rate": 0.00019961946980917456,
67
- "loss": 8.5271,
68
- "step": 50
69
- },
70
- {
71
- "epoch": 0.6741573033707865,
72
- "grad_norm": 1.15625,
73
- "learning_rate": 0.00019923678501135848,
74
- "loss": 8.1127,
75
- "step": 60
76
- },
77
- {
78
- "epoch": 0.6741573033707865,
79
- "eval_loss": 7.918918132781982,
80
- "eval_runtime": 1.4635,
81
- "eval_samples_per_second": 1026.272,
82
- "eval_steps_per_second": 6.833,
83
- "step": 60
84
- },
85
- {
86
- "epoch": 0.7865168539325843,
87
- "grad_norm": 1.015625,
88
- "learning_rate": 0.00019872291131173742,
89
- "loss": 7.7733,
90
- "step": 70
91
- },
92
- {
93
- "epoch": 0.898876404494382,
94
- "grad_norm": 0.9765625,
95
- "learning_rate": 0.00019807852804032305,
96
- "loss": 7.5513,
97
- "step": 80
98
- },
99
- {
100
- "epoch": 0.898876404494382,
101
- "eval_loss": 7.457934856414795,
102
- "eval_runtime": 1.5378,
103
- "eval_samples_per_second": 976.69,
104
- "eval_steps_per_second": 6.503,
105
- "step": 80
106
- },
107
- {
108
- "epoch": 1.0112359550561798,
109
- "grad_norm": 0.625,
110
- "learning_rate": 0.00019730448705798239,
111
- "loss": 7.4017,
112
- "step": 90
113
- },
114
- {
115
- "epoch": 1.1235955056179776,
116
- "grad_norm": 0.515625,
117
- "learning_rate": 0.00019640181163029595,
118
- "loss": 7.2769,
119
- "step": 100
120
- },
121
- {
122
- "epoch": 1.1235955056179776,
123
- "eval_loss": 7.276979446411133,
124
- "eval_runtime": 1.6975,
125
- "eval_samples_per_second": 884.842,
126
- "eval_steps_per_second": 5.891,
127
- "step": 100
128
- },
129
- {
130
- "epoch": 1.2359550561797752,
131
- "grad_norm": 0.470703125,
132
- "learning_rate": 0.0001953716950748227,
133
- "loss": 7.2175,
134
- "step": 110
135
- },
136
- {
137
- "epoch": 1.348314606741573,
138
- "grad_norm": 0.953125,
139
- "learning_rate": 0.00019421549918355966,
140
- "loss": 7.1384,
141
- "step": 120
142
- },
143
- {
144
- "epoch": 1.348314606741573,
145
- "eval_loss": 7.176700115203857,
146
- "eval_runtime": 1.4059,
147
- "eval_samples_per_second": 1068.345,
148
- "eval_steps_per_second": 7.113,
149
- "step": 120
150
- },
151
- {
152
- "epoch": 1.4606741573033708,
153
- "grad_norm": 0.52734375,
154
- "learning_rate": 0.00019293475242268223,
155
- "loss": 7.1637,
156
- "step": 130
157
- },
158
- {
159
- "epoch": 1.5730337078651684,
160
- "grad_norm": 0.49609375,
161
- "learning_rate": 0.00019153114791194473,
162
- "loss": 7.0576,
163
- "step": 140
164
- },
165
- {
166
- "epoch": 1.5730337078651684,
167
- "eval_loss": 7.057478904724121,
168
- "eval_runtime": 1.5081,
169
- "eval_samples_per_second": 995.943,
170
- "eval_steps_per_second": 6.631,
171
- "step": 140
172
- },
173
- {
174
- "epoch": 1.6853932584269664,
175
- "grad_norm": 0.7265625,
176
- "learning_rate": 0.00019000654118641211,
177
- "loss": 7.0339,
178
- "step": 150
179
- },
180
- {
181
- "epoch": 1.797752808988764,
182
- "grad_norm": 0.494140625,
183
- "learning_rate": 0.00018836294774348278,
184
- "loss": 6.9564,
185
- "step": 160
186
- },
187
- {
188
- "epoch": 1.797752808988764,
189
- "eval_loss": 6.937857627868652,
190
- "eval_runtime": 1.4808,
191
- "eval_samples_per_second": 1014.296,
192
- "eval_steps_per_second": 6.753,
193
- "step": 160
194
- },
195
- {
196
- "epoch": 1.9101123595505618,
197
- "grad_norm": 1.1171875,
198
- "learning_rate": 0.00018660254037844388,
199
- "loss": 6.8863,
200
- "step": 170
201
- },
202
- {
203
- "epoch": 2.0224719101123596,
204
- "grad_norm": 0.78125,
205
- "learning_rate": 0.0001847276463120828,
206
- "loss": 6.8785,
207
- "step": 180
208
- },
209
- {
210
- "epoch": 2.0224719101123596,
211
- "eval_loss": 6.820836067199707,
212
- "eval_runtime": 1.5264,
213
- "eval_samples_per_second": 984.014,
214
- "eval_steps_per_second": 6.551,
215
- "step": 180
216
- },
217
- {
218
- "epoch": 2.134831460674157,
219
- "grad_norm": 1.2265625,
220
- "learning_rate": 0.00018274074411415105,
221
- "loss": 6.7019,
222
- "step": 190
223
- },
224
- {
225
- "epoch": 2.247191011235955,
226
- "grad_norm": 0.78515625,
227
- "learning_rate": 0.00018064446042674828,
228
- "loss": 6.7027,
229
- "step": 200
230
- },
231
- {
232
- "epoch": 2.247191011235955,
233
- "eval_loss": 6.721198558807373,
234
- "eval_runtime": 1.4329,
235
- "eval_samples_per_second": 1048.203,
236
- "eval_steps_per_second": 6.979,
237
- "step": 200
238
- },
239
- {
240
- "epoch": 2.359550561797753,
241
- "grad_norm": 1.0078125,
242
- "learning_rate": 0.00017844156649195759,
243
- "loss": 6.6148,
244
- "step": 210
245
- },
246
- {
247
- "epoch": 2.4719101123595504,
248
- "grad_norm": 0.7109375,
249
- "learning_rate": 0.00017613497448832312,
250
- "loss": 6.5913,
251
- "step": 220
252
- },
253
- {
254
- "epoch": 2.4719101123595504,
255
- "eval_loss": 6.636184215545654,
256
- "eval_runtime": 1.5174,
257
- "eval_samples_per_second": 989.862,
258
- "eval_steps_per_second": 6.59,
259
- "step": 220
260
- },
261
- {
262
- "epoch": 2.5842696629213484,
263
- "grad_norm": 0.734375,
264
- "learning_rate": 0.0001737277336810124,
265
- "loss": 6.5713,
266
- "step": 230
267
- },
268
- {
269
- "epoch": 2.696629213483146,
270
- "grad_norm": 1.1875,
271
- "learning_rate": 0.0001712230263907531,
272
- "loss": 6.498,
273
- "step": 240
274
- },
275
- {
276
- "epoch": 2.696629213483146,
277
- "eval_loss": 6.557174205780029,
278
- "eval_runtime": 1.526,
279
- "eval_samples_per_second": 984.272,
280
- "eval_steps_per_second": 6.553,
281
- "step": 240
282
- },
283
- {
284
- "epoch": 2.808988764044944,
285
- "grad_norm": 0.76953125,
286
- "learning_rate": 0.0001686241637868734,
287
- "loss": 6.4981,
288
- "step": 250
289
- },
290
- {
291
- "epoch": 2.9213483146067416,
292
- "grad_norm": 1.34375,
293
- "learning_rate": 0.00016593458151000688,
294
- "loss": 6.4453,
295
- "step": 260
296
- },
297
- {
298
- "epoch": 2.9213483146067416,
299
- "eval_loss": 6.472111225128174,
300
- "eval_runtime": 1.4329,
301
- "eval_samples_per_second": 1048.191,
302
- "eval_steps_per_second": 6.979,
303
- "step": 260
304
- },
305
- {
306
- "epoch": 3.033707865168539,
307
- "grad_norm": 0.96484375,
308
- "learning_rate": 0.00016315783513024977,
309
- "loss": 6.4341,
310
- "step": 270
311
- },
312
- {
313
- "epoch": 3.146067415730337,
314
- "grad_norm": 1.34375,
315
- "learning_rate": 0.00016029759544677297,
316
- "loss": 6.2635,
317
- "step": 280
318
- },
319
- {
320
- "epoch": 3.146067415730337,
321
- "eval_loss": 6.41255521774292,
322
- "eval_runtime": 1.7049,
323
- "eval_samples_per_second": 880.983,
324
- "eval_steps_per_second": 5.865,
325
- "step": 280
326
- },
327
- {
328
- "epoch": 3.258426966292135,
329
- "grad_norm": 0.96484375,
330
- "learning_rate": 0.0001573576436351046,
331
- "loss": 6.3071,
332
- "step": 290
333
- },
334
- {
335
- "epoch": 3.370786516853933,
336
- "grad_norm": 0.83984375,
337
- "learning_rate": 0.000154341866248497,
338
- "loss": 6.236,
339
- "step": 300
340
- },
341
- {
342
- "epoch": 3.370786516853933,
343
- "eval_loss": 6.365837574005127,
344
- "eval_runtime": 1.6113,
345
- "eval_samples_per_second": 932.168,
346
- "eval_steps_per_second": 6.206,
347
- "step": 300
348
- },
349
- {
350
- "epoch": 3.4831460674157304,
351
- "grad_norm": 1.2265625,
352
- "learning_rate": 0.00015125425007998653,
353
- "loss": 6.2083,
354
- "step": 310
355
- },
356
- {
357
- "epoch": 3.595505617977528,
358
- "grad_norm": 0.7890625,
359
- "learning_rate": 0.00014809887689193877,
360
- "loss": 6.2733,
361
- "step": 320
362
- },
363
- {
364
- "epoch": 3.595505617977528,
365
- "eval_loss": 6.316156387329102,
366
- "eval_runtime": 1.4293,
367
- "eval_samples_per_second": 1050.857,
368
- "eval_steps_per_second": 6.996,
369
- "step": 320
370
- },
371
- {
372
- "epoch": 3.7078651685393256,
373
- "grad_norm": 1.46875,
374
- "learning_rate": 0.00014487991802004623,
375
- "loss": 6.1737,
376
- "step": 330
377
- },
378
- {
379
- "epoch": 3.8202247191011236,
380
- "grad_norm": 1.15625,
381
- "learning_rate": 0.00014160162885891193,
382
- "loss": 6.2472,
383
- "step": 340
384
- },
385
- {
386
- "epoch": 3.8202247191011236,
387
- "eval_loss": 6.286980628967285,
388
- "eval_runtime": 1.566,
389
- "eval_samples_per_second": 959.153,
390
- "eval_steps_per_second": 6.386,
391
- "step": 340
392
- },
393
- {
394
- "epoch": 3.932584269662921,
395
- "grad_norm": 1.03125,
396
- "learning_rate": 0.000138268343236509,
397
- "loss": 6.1259,
398
- "step": 350
399
- },
400
- {
401
- "epoch": 4.044943820224719,
402
- "grad_norm": 0.92578125,
403
- "learning_rate": 0.0001348844676849531,
404
- "loss": 6.1738,
405
- "step": 360
406
- },
407
- {
408
- "epoch": 4.044943820224719,
409
- "eval_loss": 6.240147113800049,
410
- "eval_runtime": 1.7,
411
- "eval_samples_per_second": 883.529,
412
- "eval_steps_per_second": 5.882,
413
- "step": 360
414
- },
415
- {
416
- "epoch": 4.157303370786517,
417
- "grad_norm": 0.96484375,
418
- "learning_rate": 0.00013145447561516138,
419
- "loss": 6.058,
420
- "step": 370
421
- },
422
- {
423
- "epoch": 4.269662921348314,
424
- "grad_norm": 1.2421875,
425
- "learning_rate": 0.00012798290140309923,
426
- "loss": 6.0509,
427
- "step": 380
428
- },
429
- {
430
- "epoch": 4.269662921348314,
431
- "eval_loss": 6.218361854553223,
432
- "eval_runtime": 1.4692,
433
- "eval_samples_per_second": 1022.354,
434
- "eval_steps_per_second": 6.807,
435
- "step": 380
436
- },
437
- {
438
- "epoch": 4.382022471910112,
439
- "grad_norm": 0.69921875,
440
- "learning_rate": 0.0001244743343954324,
441
- "loss": 6.0345,
442
- "step": 390
443
- },
444
- {
445
- "epoch": 4.49438202247191,
446
- "grad_norm": 1.5078125,
447
- "learning_rate": 0.0001209334128425092,
448
- "loss": 6.0158,
449
- "step": 400
450
- },
451
- {
452
- "epoch": 4.49438202247191,
453
- "eval_loss": 6.195931434631348,
454
- "eval_runtime": 1.5795,
455
- "eval_samples_per_second": 950.928,
456
- "eval_steps_per_second": 6.331,
457
- "step": 400
458
- },
459
- {
460
- "epoch": 4.606741573033708,
461
- "grad_norm": 0.86328125,
462
- "learning_rate": 0.00011736481776669306,
463
- "loss": 6.0611,
464
- "step": 410
465
- },
466
- {
467
- "epoch": 4.719101123595506,
468
- "grad_norm": 0.9609375,
469
- "learning_rate": 0.00011377326677415108,
470
- "loss": 6.0043,
471
- "step": 420
472
- },
473
- {
474
- "epoch": 4.719101123595506,
475
- "eval_loss": 6.1770148277282715,
476
- "eval_runtime": 1.4629,
477
- "eval_samples_per_second": 1026.746,
478
- "eval_steps_per_second": 6.836,
479
- "step": 420
480
- },
481
- {
482
- "epoch": 4.831460674157303,
483
- "grad_norm": 1.171875,
484
- "learning_rate": 0.00011016350781828019,
485
- "loss": 6.0156,
486
- "step": 430
487
- },
488
- {
489
- "epoch": 4.943820224719101,
490
- "grad_norm": 0.95703125,
491
- "learning_rate": 0.00010654031292301432,
492
- "loss": 6.0249,
493
- "step": 440
494
- },
495
- {
496
- "epoch": 4.943820224719101,
497
- "eval_loss": 6.157017230987549,
498
- "eval_runtime": 1.5184,
499
- "eval_samples_per_second": 989.202,
500
- "eval_steps_per_second": 6.586,
501
- "step": 440
502
- },
503
- {
504
- "epoch": 5.056179775280899,
505
- "grad_norm": 1.296875,
506
- "learning_rate": 0.00010290847187431113,
507
- "loss": 6.0049,
508
- "step": 450
509
- },
510
- {
511
- "epoch": 5.168539325842697,
512
- "grad_norm": 1.0703125,
513
- "learning_rate": 9.927278588815786e-05,
514
- "loss": 5.9625,
515
- "step": 460
516
- },
517
- {
518
- "epoch": 5.168539325842697,
519
- "eval_loss": 6.147063732147217,
520
- "eval_runtime": 1.5949,
521
- "eval_samples_per_second": 941.776,
522
- "eval_steps_per_second": 6.27,
523
- "step": 460
524
- },
525
- {
526
- "epoch": 5.280898876404494,
527
- "grad_norm": 1.5859375,
528
- "learning_rate": 9.563806126346642e-05,
529
- "loss": 5.8988,
530
- "step": 470
531
- },
532
- {
533
- "epoch": 5.393258426966292,
534
- "grad_norm": 0.74609375,
535
- "learning_rate": 9.200910302824963e-05,
536
- "loss": 6.0231,
537
- "step": 480
538
- },
539
- {
540
- "epoch": 5.393258426966292,
541
- "eval_loss": 6.130283355712891,
542
- "eval_runtime": 1.482,
543
- "eval_samples_per_second": 1013.508,
544
- "eval_steps_per_second": 6.748,
545
- "step": 480
546
- },
547
- {
548
- "epoch": 5.50561797752809,
549
- "grad_norm": 0.78125,
550
- "learning_rate": 8.839070858747697e-05,
551
- "loss": 5.9,
552
- "step": 490
553
- },
554
- {
555
- "epoch": 5.617977528089888,
556
- "grad_norm": 0.87890625,
557
- "learning_rate": 8.478766138100834e-05,
558
- "loss": 5.9395,
559
- "step": 500
560
- },
561
- {
562
- "epoch": 5.617977528089888,
563
- "eval_loss": 6.124143123626709,
564
- "eval_runtime": 1.4263,
565
- "eval_samples_per_second": 1053.058,
566
- "eval_steps_per_second": 7.011,
567
- "step": 500
568
- },
569
- {
570
- "epoch": 5.730337078651686,
571
- "grad_norm": 0.75,
572
- "learning_rate": 8.120472455998882e-05,
573
- "loss": 5.9263,
574
- "step": 510
575
- },
576
- {
577
- "epoch": 5.842696629213483,
578
- "grad_norm": 1.8046875,
579
- "learning_rate": 7.764663469006526e-05,
580
- "loss": 5.8278,
581
- "step": 520
582
- },
583
- {
584
- "epoch": 5.842696629213483,
585
- "eval_loss": 6.109380722045898,
586
- "eval_runtime": 1.57,
587
- "eval_samples_per_second": 956.718,
588
- "eval_steps_per_second": 6.37,
589
- "step": 520
590
- },
591
- {
592
- "epoch": 5.955056179775281,
593
- "grad_norm": 0.8671875,
594
- "learning_rate": 7.411809548974792e-05,
595
- "loss": 5.9788,
596
- "step": 530
597
- },
598
- {
599
- "epoch": 6.067415730337078,
600
- "grad_norm": 1.2109375,
601
- "learning_rate": 7.062377161219556e-05,
602
- "loss": 5.8774,
603
- "step": 540
604
- },
605
- {
606
- "epoch": 6.067415730337078,
607
- "eval_loss": 6.107788562774658,
608
- "eval_runtime": 1.6988,
609
- "eval_samples_per_second": 884.172,
610
- "eval_steps_per_second": 5.887,
611
- "step": 540
612
- },
613
- {
614
- "epoch": 6.179775280898877,
615
- "grad_norm": 0.6640625,
616
- "learning_rate": 6.71682824786439e-05,
617
- "loss": 5.9356,
618
- "step": 550
619
- },
620
- {
621
- "epoch": 6.292134831460674,
622
- "grad_norm": 0.57421875,
623
- "learning_rate": 6.375619617162985e-05,
624
- "loss": 5.8393,
625
- "step": 560
626
- },
627
- {
628
- "epoch": 6.292134831460674,
629
- "eval_loss": 6.102546691894531,
630
- "eval_runtime": 1.5621,
631
- "eval_samples_per_second": 961.551,
632
- "eval_steps_per_second": 6.402,
633
- "step": 560
634
- },
635
- {
636
- "epoch": 6.404494382022472,
637
- "grad_norm": 0.84375,
638
- "learning_rate": 6.039202339608432e-05,
639
- "loss": 5.9184,
640
- "step": 570
641
- },
642
- {
643
- "epoch": 6.51685393258427,
644
- "grad_norm": 0.66796875,
645
- "learning_rate": 5.708021151627712e-05,
646
- "loss": 5.8534,
647
- "step": 580
648
- },
649
- {
650
- "epoch": 6.51685393258427,
651
- "eval_loss": 6.098301410675049,
652
- "eval_runtime": 1.5809,
653
- "eval_samples_per_second": 950.072,
654
- "eval_steps_per_second": 6.325,
655
- "step": 580
656
- },
657
- {
658
- "epoch": 6.629213483146067,
659
- "grad_norm": 1.4609375,
660
- "learning_rate": 5.382513867649663e-05,
661
- "loss": 5.8461,
662
- "step": 590
663
- },
664
- {
665
- "epoch": 6.741573033707866,
666
- "grad_norm": 0.65234375,
667
- "learning_rate": 5.063110801323697e-05,
668
- "loss": 5.9313,
669
- "step": 600
670
- },
671
- {
672
- "epoch": 6.741573033707866,
673
- "eval_loss": 6.101324558258057,
674
- "eval_runtime": 1.5383,
675
- "eval_samples_per_second": 976.413,
676
- "eval_steps_per_second": 6.501,
677
- "step": 600
678
- },
679
- {
680
- "epoch": 6.853932584269663,
681
- "grad_norm": 0.58203125,
682
- "learning_rate": 4.7502341966544e-05,
683
- "loss": 5.8228,
684
- "step": 610
685
- },
686
- {
687
- "epoch": 6.966292134831461,
688
- "grad_norm": 0.78125,
689
- "learning_rate": 4.444297669803981e-05,
690
- "loss": 5.8947,
691
- "step": 620
692
- },
693
- {
694
- "epoch": 6.966292134831461,
695
- "eval_loss": 6.098939895629883,
696
- "eval_runtime": 1.5482,
697
- "eval_samples_per_second": 970.169,
698
- "eval_steps_per_second": 6.459,
699
- "step": 620
700
- },
701
- {
702
- "epoch": 7.078651685393258,
703
- "grad_norm": 0.83203125,
704
- "learning_rate": 4.145705662300595e-05,
705
- "loss": 5.8575,
706
- "step": 630
707
- },
708
- {
709
- "epoch": 7.191011235955056,
710
- "grad_norm": 0.88671875,
711
- "learning_rate": 3.854852906375326e-05,
712
- "loss": 5.8936,
713
- "step": 640
714
- },
715
- {
716
- "epoch": 7.191011235955056,
717
- "eval_loss": 6.097050189971924,
718
- "eval_runtime": 1.5395,
719
- "eval_samples_per_second": 975.65,
720
- "eval_steps_per_second": 6.496,
721
- "step": 640
722
- },
723
- {
724
- "epoch": 7.303370786516854,
725
- "grad_norm": 0.58203125,
726
- "learning_rate": 3.5721239031346066e-05,
727
- "loss": 5.8294,
728
- "step": 650
729
- },
730
- {
731
- "epoch": 7.415730337078652,
732
- "grad_norm": 1.1796875,
733
- "learning_rate": 3.297892414258043e-05,
734
- "loss": 5.8275,
735
- "step": 660
736
- },
737
- {
738
- "epoch": 7.415730337078652,
739
- "eval_loss": 6.095025539398193,
740
- "eval_runtime": 1.562,
741
- "eval_samples_per_second": 961.579,
742
- "eval_steps_per_second": 6.402,
743
- "step": 660
744
- },
745
- {
746
- "epoch": 7.52808988764045,
747
- "grad_norm": 0.71484375,
748
- "learning_rate": 3.032520967893453e-05,
749
- "loss": 5.88,
750
- "step": 670
751
- },
752
- {
753
- "epoch": 7.640449438202247,
754
- "grad_norm": 0.8125,
755
- "learning_rate": 2.776360379402445e-05,
756
- "loss": 5.822,
757
- "step": 680
758
- },
759
- {
760
- "epoch": 7.640449438202247,
761
- "eval_loss": 6.089855670928955,
762
- "eval_runtime": 1.5643,
763
- "eval_samples_per_second": 960.159,
764
- "eval_steps_per_second": 6.393,
765
- "step": 680
766
- },
767
- {
768
- "epoch": 7.752808988764045,
769
- "grad_norm": 0.7734375,
770
- "learning_rate": 2.529749287590042e-05,
771
- "loss": 5.8741,
772
- "step": 690
773
- },
774
- {
775
- "epoch": 7.865168539325842,
776
- "grad_norm": 0.62890625,
777
- "learning_rate": 2.2930137070314194e-05,
778
- "loss": 5.8637,
779
- "step": 700
780
- },
781
- {
782
- "epoch": 7.865168539325842,
783
- "eval_loss": 6.0882697105407715,
784
- "eval_runtime": 1.4971,
785
- "eval_samples_per_second": 1003.241,
786
- "eval_steps_per_second": 6.679,
787
- "step": 700
788
- },
789
- {
790
- "epoch": 7.97752808988764,
791
- "grad_norm": 1.25,
792
- "learning_rate": 2.0664665970876496e-05,
793
- "loss": 5.8311,
794
- "step": 710
795
- },
796
- {
797
- "epoch": 8.089887640449438,
798
- "grad_norm": 0.6484375,
799
- "learning_rate": 1.8504074481801238e-05,
800
- "loss": 5.8951,
801
- "step": 720
802
- },
803
- {
804
- "epoch": 8.089887640449438,
805
- "eval_loss": 6.095778465270996,
806
- "eval_runtime": 1.6558,
807
- "eval_samples_per_second": 907.088,
808
- "eval_steps_per_second": 6.039,
809
- "step": 720
810
- },
811
- {
812
- "epoch": 8.202247191011235,
813
- "grad_norm": 1.015625,
814
- "learning_rate": 1.6451218858706374e-05,
815
- "loss": 5.8359,
816
- "step": 730
817
- },
818
- {
819
- "epoch": 8.314606741573034,
820
- "grad_norm": 0.640625,
821
- "learning_rate": 1.4508812932705363e-05,
822
- "loss": 5.8697,
823
- "step": 740
824
- },
825
- {
826
- "epoch": 8.314606741573034,
827
- "eval_loss": 6.090635776519775,
828
- "eval_runtime": 1.4815,
829
- "eval_samples_per_second": 1013.856,
830
- "eval_steps_per_second": 6.75,
831
- "step": 740
832
- },
833
- {
834
- "epoch": 8.426966292134832,
835
- "grad_norm": 1.1328125,
836
- "learning_rate": 1.2679424522780426e-05,
837
- "loss": 5.8225,
838
- "step": 750
839
- },
840
- {
841
- "epoch": 8.539325842696629,
842
- "grad_norm": 0.71875,
843
- "learning_rate": 1.0965472041181102e-05,
844
- "loss": 5.9076,
845
- "step": 760
846
- },
847
- {
848
- "epoch": 8.539325842696629,
849
- "eval_loss": 6.088898181915283,
850
- "eval_runtime": 1.5533,
851
- "eval_samples_per_second": 966.97,
852
- "eval_steps_per_second": 6.438,
853
- "step": 760
854
- },
855
- {
856
- "epoch": 8.651685393258427,
857
- "grad_norm": 0.58984375,
858
- "learning_rate": 9.369221296335006e-06,
859
- "loss": 5.8285,
860
- "step": 770
861
- },
862
- {
863
- "epoch": 8.764044943820224,
864
- "grad_norm": 1.0703125,
865
- "learning_rate": 7.892782497497642e-06,
866
- "loss": 5.8149,
867
- "step": 780
868
- },
869
- {
870
- "epoch": 8.764044943820224,
871
- "eval_loss": 6.089446544647217,
872
- "eval_runtime": 1.5112,
873
- "eval_samples_per_second": 993.94,
874
- "eval_steps_per_second": 6.617,
875
- "step": 780
876
- },
877
- {
878
- "epoch": 8.876404494382022,
879
- "grad_norm": 0.625,
880
- "learning_rate": 6.538107465101162e-06,
881
- "loss": 5.8661,
882
- "step": 790
883
- },
884
- {
885
- "epoch": 8.98876404494382,
886
- "grad_norm": 1.125,
887
- "learning_rate": 5.306987050489442e-06,
888
- "loss": 5.7888,
889
- "step": 800
890
- },
891
- {
892
- "epoch": 8.98876404494382,
893
- "eval_loss": 6.0915751457214355,
894
- "eval_runtime": 1.5317,
895
- "eval_samples_per_second": 980.616,
896
- "eval_steps_per_second": 6.529,
897
- "step": 800
898
- },
899
- {
900
- "epoch": 9.101123595505618,
901
- "grad_norm": 0.63671875,
902
- "learning_rate": 4.20104876845111e-06,
903
- "loss": 5.9183,
904
- "step": 810
905
- },
906
- {
907
- "epoch": 9.213483146067416,
908
- "grad_norm": 1.421875,
909
- "learning_rate": 3.2217546456799086e-06,
910
- "loss": 5.8096,
911
- "step": 820
912
- },
913
- {
914
- "epoch": 9.213483146067416,
915
- "eval_loss": 6.09375,
916
- "eval_runtime": 1.5233,
917
- "eval_samples_per_second": 986.02,
918
- "eval_steps_per_second": 6.565,
919
- "step": 820
920
- },
921
- {
922
- "epoch": 9.325842696629213,
923
- "grad_norm": 0.7109375,
924
- "learning_rate": 2.3703992880066638e-06,
925
- "loss": 5.9093,
926
- "step": 830
927
- },
928
- {
929
- "epoch": 9.438202247191011,
930
- "grad_norm": 0.578125,
931
- "learning_rate": 1.648108168958229e-06,
932
- "loss": 5.8319,
933
- "step": 840
934
- },
935
- {
936
- "epoch": 9.438202247191011,
937
- "eval_loss": 6.085657119750977,
938
- "eval_runtime": 1.441,
939
- "eval_samples_per_second": 1042.336,
940
- "eval_steps_per_second": 6.94,
941
- "step": 840
942
- },
943
- {
944
- "epoch": 9.55056179775281,
945
- "grad_norm": 1.0234375,
946
- "learning_rate": 1.055836141905553e-06,
947
- "loss": 5.8358,
948
- "step": 850
949
- },
950
- {
951
- "epoch": 9.662921348314606,
952
- "grad_norm": 0.58203125,
953
- "learning_rate": 5.943661777680354e-07,
954
- "loss": 5.8508,
955
- "step": 860
956
- },
957
- {
958
- "epoch": 9.662921348314606,
959
- "eval_loss": 6.0900702476501465,
960
- "eval_runtime": 1.4106,
961
- "eval_samples_per_second": 1064.779,
962
- "eval_steps_per_second": 7.089,
963
- "step": 860
964
- },
965
- {
966
- "epoch": 9.775280898876405,
967
- "grad_norm": 1.328125,
968
- "learning_rate": 2.643083299427751e-07,
969
- "loss": 5.8201,
970
- "step": 870
971
- },
972
- {
973
- "epoch": 9.887640449438202,
974
- "grad_norm": 0.6875,
975
- "learning_rate": 6.609892782699633e-08,
976
- "loss": 5.8517,
977
- "step": 880
978
- },
979
- {
980
- "epoch": 9.887640449438202,
981
- "eval_loss": 6.0848469734191895,
982
- "eval_runtime": 1.5852,
983
- "eval_samples_per_second": 947.512,
984
- "eval_steps_per_second": 6.308,
985
- "step": 880
986
- },
987
- {
988
- "epoch": 10.0,
989
- "grad_norm": 0.8203125,
990
- "learning_rate": 0.0,
991
- "loss": 5.7949,
992
- "step": 890
993
  }
994
  ],
995
  "logging_steps": 10,
996
- "max_steps": 890,
997
  "num_input_tokens_seen": 0,
998
- "num_train_epochs": 10,
999
  "save_steps": 40,
1000
  "stateful_callbacks": {
1001
  "TrainerControl": {
@@ -1004,13 +73,13 @@
1004
  "should_evaluate": false,
1005
  "should_log": false,
1006
  "should_save": true,
1007
- "should_training_stop": true
1008
  },
1009
  "attributes": {}
1010
  }
1011
  },
1012
- "total_flos": 1.5024257713045504e+16,
1013
- "train_batch_size": 19,
1014
  "trial_name": null,
1015
  "trial_params": null
1016
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.047337278106508875,
5
  "eval_steps": 20,
6
+ "global_step": 40,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.001183431952662722,
13
+ "eval_loss": 11.929322242736816,
14
+ "eval_runtime": 3.7308,
15
+ "eval_samples_per_second": 402.598,
16
+ "eval_steps_per_second": 25.196,
17
  "step": 1
18
  },
19
  {
20
+ "epoch": 0.011834319526627219,
21
+ "grad_norm": 0.19140625,
22
+ "learning_rate": 1.6000000000000003e-05,
23
+ "loss": 11.9299,
24
  "step": 10
25
  },
26
  {
27
+ "epoch": 0.023668639053254437,
28
+ "grad_norm": 0.12158203125,
29
+ "learning_rate": 3.2000000000000005e-05,
30
+ "loss": 11.9289,
31
  "step": 20
32
  },
33
  {
34
+ "epoch": 0.023668639053254437,
35
+ "eval_loss": 11.92916488647461,
36
+ "eval_runtime": 3.6696,
37
+ "eval_samples_per_second": 409.305,
38
+ "eval_steps_per_second": 25.616,
39
  "step": 20
40
  },
41
  {
42
+ "epoch": 0.03550295857988166,
43
+ "grad_norm": 0.251953125,
44
+ "learning_rate": 4.8e-05,
45
+ "loss": 11.9288,
46
  "step": 30
47
  },
48
  {
49
+ "epoch": 0.047337278106508875,
50
+ "grad_norm": 0.1435546875,
51
+ "learning_rate": 6.400000000000001e-05,
52
+ "loss": 11.9293,
53
  "step": 40
54
  },
55
  {
56
+ "epoch": 0.047337278106508875,
57
+ "eval_loss": 11.928914070129395,
58
+ "eval_runtime": 3.5621,
59
+ "eval_samples_per_second": 421.656,
60
+ "eval_steps_per_second": 26.389,
61
  "step": 40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  }
63
  ],
64
  "logging_steps": 10,
65
+ "max_steps": 2500,
66
  "num_input_tokens_seen": 0,
67
+ "num_train_epochs": 3,
68
  "save_steps": 40,
69
  "stateful_callbacks": {
70
  "TrainerControl": {
 
73
  "should_evaluate": false,
74
  "should_log": false,
75
  "should_save": true,
76
+ "should_training_stop": false
77
  },
78
  "attributes": {}
79
  }
80
  },
81
+ "total_flos": 15714680832.0,
82
+ "train_batch_size": 2,
83
  "trial_name": null,
84
  "trial_params": null
85
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88983683f78772e6fd62c9a97937ef6f408b0ee554cb1a96936c237405b3f78a
3
- size 6520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4ab1aa49f52e7f57216c42b69ed41e061894107885c3e00824c758931ff09d8
3
+ size 6648
last-checkpoint/vocab.json ADDED
The diff for this file is too large to render. See raw diff