simonycl commited on
Commit
200bca6
·
verified ·
1 Parent(s): 8deb362

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPTNeoForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.1,
7
+ "attention_layers": [
8
+ "global",
9
+ "global",
10
+ "global",
11
+ "global",
12
+ "global",
13
+ "global",
14
+ "global",
15
+ "global",
16
+ "global",
17
+ "global",
18
+ "global",
19
+ "global",
20
+ "global",
21
+ "global",
22
+ "global",
23
+ "global"
24
+ ],
25
+ "attention_types": [
26
+ [
27
+ [
28
+ "global"
29
+ ],
30
+ 16
31
+ ]
32
+ ],
33
+ "bos_token_id": 50256,
34
+ "classifier_dropout": 0.1,
35
+ "embed_dropout": 0.1,
36
+ "eos_token_id": 50256,
37
+ "hidden_size": 1024,
38
+ "initializer_range": 0.02,
39
+ "intermediate_size": null,
40
+ "layer_norm_epsilon": 1e-05,
41
+ "max_position_embeddings": 4096,
42
+ "model_type": "gpt_neo",
43
+ "num_heads": 16,
44
+ "num_layers": 16,
45
+ "resid_dropout": 0.1,
46
+ "torch_dtype": "bfloat16",
47
+ "transformers_version": "4.45.1",
48
+ "use_cache": true,
49
+ "vocab_size": 50257,
50
+ "window_size": 4096
51
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.45.1"
6
+ }
latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step26040
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbfe78ac81c86bb4077133cd4d368066e6c84dbe21af4f30c9b7b3fa69ada41e
3
+ size 617249488
rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f2338dc132fa9996cc0309f715d9140f9a5c2d292a430782a88e5aad36e1f9f
3
+ size 15024
rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:401d5daae2d1f61c0e1a341c34cd3944037998dd967fe2db5ecd8e521dd41353
3
+ size 15024
rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df0fd694cc901cbbd9ef57e85409afd692fb87bb1ca2a5ae34cfeacaa7be8286
3
+ size 15024
rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7f10a26b8fe2c553a7a20d6d27ec3fbd1d984b5c833e0a62d90a8e8658d10c1
3
+ size 15024
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:272cfba81198575e76e6b9965d7f6f8c7480ede87e02d759cade961d7ff227a2
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<|endoftext|>",
17
+ "unk_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "50256": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ }
13
+ },
14
+ "bos_token": "<|endoftext|>",
15
+ "clean_up_tokenization_spaces": false,
16
+ "eos_token": "<|endoftext|>",
17
+ "errors": "replace",
18
+ "model_max_length": 4096,
19
+ "pad_token": "<|endoftext|>",
20
+ "tokenizer_class": "GPT2Tokenizer",
21
+ "unk_token": "<|endoftext|>"
22
+ }
trainer_state.json ADDED
@@ -0,0 +1,2685 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.17754687368869781,
3
+ "best_model_checkpoint": "results/checkpoint-25000",
4
+ "epoch": 9.998720081914758,
5
+ "eval_steps": 500,
6
+ "global_step": 26040,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.038397542557276336,
13
+ "grad_norm": 1.0079567432403564,
14
+ "learning_rate": 9.999643338380885e-06,
15
+ "loss": 5.5723,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.07679508511455267,
20
+ "grad_norm": 0.6461474299430847,
21
+ "learning_rate": 9.998558958654982e-06,
22
+ "loss": 2.2782,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.115192627671829,
27
+ "grad_norm": 0.4909125566482544,
28
+ "learning_rate": 9.996746982275233e-06,
29
+ "loss": 1.8047,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.15359017022910534,
34
+ "grad_norm": 0.47547289729118347,
35
+ "learning_rate": 9.994207672995245e-06,
36
+ "loss": 1.5821,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.19198771278638166,
41
+ "grad_norm": 0.41358837485313416,
42
+ "learning_rate": 9.99094140044013e-06,
43
+ "loss": 1.4754,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.19198771278638166,
48
+ "eval_valid_loss": 1.4288749694824219,
49
+ "eval_valid_runtime": 4.7117,
50
+ "eval_valid_samples_per_second": 212.238,
51
+ "eval_valid_steps_per_second": 6.792,
52
+ "step": 500
53
+ },
54
+ {
55
+ "epoch": 0.19198771278638166,
56
+ "eval_valid_target_loss": 1.4590624570846558,
57
+ "eval_valid_target_runtime": 4.684,
58
+ "eval_valid_target_samples_per_second": 213.493,
59
+ "eval_valid_target_steps_per_second": 6.832,
60
+ "step": 500
61
+ },
62
+ {
63
+ "epoch": 0.230385255343658,
64
+ "grad_norm": 0.43229448795318604,
65
+ "learning_rate": 9.986948640052719e-06,
66
+ "loss": 1.4087,
67
+ "step": 600
68
+ },
69
+ {
70
+ "epoch": 0.26878279790093434,
71
+ "grad_norm": 0.528977632522583,
72
+ "learning_rate": 9.982229973024328e-06,
73
+ "loss": 1.3245,
74
+ "step": 700
75
+ },
76
+ {
77
+ "epoch": 0.3071803404582107,
78
+ "grad_norm": 0.5489594340324402,
79
+ "learning_rate": 9.976786086210186e-06,
80
+ "loss": 1.0455,
81
+ "step": 800
82
+ },
83
+ {
84
+ "epoch": 0.34557788301548703,
85
+ "grad_norm": 0.5119125843048096,
86
+ "learning_rate": 9.970617772029439e-06,
87
+ "loss": 0.7605,
88
+ "step": 900
89
+ },
90
+ {
91
+ "epoch": 0.3839754255727633,
92
+ "grad_norm": 0.5092576146125793,
93
+ "learning_rate": 9.963725928349814e-06,
94
+ "loss": 0.6005,
95
+ "step": 1000
96
+ },
97
+ {
98
+ "epoch": 0.3839754255727633,
99
+ "eval_valid_loss": 0.49165624380111694,
100
+ "eval_valid_runtime": 4.674,
101
+ "eval_valid_samples_per_second": 213.951,
102
+ "eval_valid_steps_per_second": 6.846,
103
+ "step": 1000
104
+ },
105
+ {
106
+ "epoch": 0.3839754255727633,
107
+ "eval_valid_target_loss": 0.5142187476158142,
108
+ "eval_valid_target_runtime": 4.6758,
109
+ "eval_valid_target_samples_per_second": 213.869,
110
+ "eval_valid_target_steps_per_second": 6.844,
111
+ "step": 1000
112
+ },
113
+ {
114
+ "epoch": 0.42237296813003966,
115
+ "grad_norm": 0.43798330426216125,
116
+ "learning_rate": 9.956111558356915e-06,
117
+ "loss": 0.4887,
118
+ "step": 1100
119
+ },
120
+ {
121
+ "epoch": 0.460770510687316,
122
+ "grad_norm": 0.3737218379974365,
123
+ "learning_rate": 9.947775770408207e-06,
124
+ "loss": 0.4307,
125
+ "step": 1200
126
+ },
127
+ {
128
+ "epoch": 0.49916805324459235,
129
+ "grad_norm": 0.4303857386112213,
130
+ "learning_rate": 9.938719777871674e-06,
131
+ "loss": 0.4027,
132
+ "step": 1300
133
+ },
134
+ {
135
+ "epoch": 0.5375655958018687,
136
+ "grad_norm": 0.3709202706813812,
137
+ "learning_rate": 9.92894489894921e-06,
138
+ "loss": 0.3799,
139
+ "step": 1400
140
+ },
141
+ {
142
+ "epoch": 0.575963138359145,
143
+ "grad_norm": 0.3918135464191437,
144
+ "learning_rate": 9.918452556484728e-06,
145
+ "loss": 0.3633,
146
+ "step": 1500
147
+ },
148
+ {
149
+ "epoch": 0.575963138359145,
150
+ "eval_valid_loss": 0.33228906989097595,
151
+ "eval_valid_runtime": 4.7244,
152
+ "eval_valid_samples_per_second": 211.669,
153
+ "eval_valid_steps_per_second": 6.773,
154
+ "step": 1500
155
+ },
156
+ {
157
+ "epoch": 0.575963138359145,
158
+ "eval_valid_target_loss": 0.3428671956062317,
159
+ "eval_valid_target_runtime": 4.6595,
160
+ "eval_valid_target_samples_per_second": 214.617,
161
+ "eval_valid_target_steps_per_second": 6.868,
162
+ "step": 1500
163
+ },
164
+ {
165
+ "epoch": 0.6143606809164214,
166
+ "grad_norm": 0.3580816686153412,
167
+ "learning_rate": 9.907244277757053e-06,
168
+ "loss": 0.3565,
169
+ "step": 1600
170
+ },
171
+ {
172
+ "epoch": 0.6527582234736977,
173
+ "grad_norm": 0.34893009066581726,
174
+ "learning_rate": 9.895321694257617e-06,
175
+ "loss": 0.3443,
176
+ "step": 1700
177
+ },
178
+ {
179
+ "epoch": 0.6911557660309741,
180
+ "grad_norm": 0.3050221800804138,
181
+ "learning_rate": 9.882686541452967e-06,
182
+ "loss": 0.3339,
183
+ "step": 1800
184
+ },
185
+ {
186
+ "epoch": 0.7295533085882504,
187
+ "grad_norm": 0.3123306632041931,
188
+ "learning_rate": 9.869340658532151e-06,
189
+ "loss": 0.3278,
190
+ "step": 1900
191
+ },
192
+ {
193
+ "epoch": 0.7679508511455266,
194
+ "grad_norm": 0.31590646505355835,
195
+ "learning_rate": 9.85528598813901e-06,
196
+ "loss": 0.32,
197
+ "step": 2000
198
+ },
199
+ {
200
+ "epoch": 0.7679508511455266,
201
+ "eval_valid_loss": 0.29783594608306885,
202
+ "eval_valid_runtime": 4.6776,
203
+ "eval_valid_samples_per_second": 213.785,
204
+ "eval_valid_steps_per_second": 6.841,
205
+ "step": 2000
206
+ },
207
+ {
208
+ "epoch": 0.7679508511455266,
209
+ "eval_valid_target_loss": 0.3129218816757202,
210
+ "eval_valid_target_runtime": 4.6598,
211
+ "eval_valid_target_samples_per_second": 214.601,
212
+ "eval_valid_target_steps_per_second": 6.867,
213
+ "step": 2000
214
+ },
215
+ {
216
+ "epoch": 0.806348393702803,
217
+ "grad_norm": 0.2684693932533264,
218
+ "learning_rate": 9.840524576089392e-06,
219
+ "loss": 0.3194,
220
+ "step": 2100
221
+ },
222
+ {
223
+ "epoch": 0.8447459362600793,
224
+ "grad_norm": 0.30888476967811584,
225
+ "learning_rate": 9.82505857107337e-06,
226
+ "loss": 0.3108,
227
+ "step": 2200
228
+ },
229
+ {
230
+ "epoch": 0.8831434788173557,
231
+ "grad_norm": 0.2777215242385864,
232
+ "learning_rate": 9.808890224342476e-06,
233
+ "loss": 0.3105,
234
+ "step": 2300
235
+ },
236
+ {
237
+ "epoch": 0.921541021374632,
238
+ "grad_norm": 0.30601397156715393,
239
+ "learning_rate": 9.792021889381995e-06,
240
+ "loss": 0.3055,
241
+ "step": 2400
242
+ },
243
+ {
244
+ "epoch": 0.9599385639319084,
245
+ "grad_norm": 0.2972748875617981,
246
+ "learning_rate": 9.774456021568404e-06,
247
+ "loss": 0.3008,
248
+ "step": 2500
249
+ },
250
+ {
251
+ "epoch": 0.9599385639319084,
252
+ "eval_valid_loss": 0.27842968702316284,
253
+ "eval_valid_runtime": 4.68,
254
+ "eval_valid_samples_per_second": 213.675,
255
+ "eval_valid_steps_per_second": 6.838,
256
+ "step": 2500
257
+ },
258
+ {
259
+ "epoch": 0.9599385639319084,
260
+ "eval_valid_target_loss": 0.2943359315395355,
261
+ "eval_valid_target_runtime": 4.6764,
262
+ "eval_valid_target_samples_per_second": 213.838,
263
+ "eval_valid_target_steps_per_second": 6.843,
264
+ "step": 2500
265
+ },
266
+ {
267
+ "epoch": 0.9983361064891847,
268
+ "grad_norm": 0.2891447842121124,
269
+ "learning_rate": 9.756195177811953e-06,
270
+ "loss": 0.2969,
271
+ "step": 2600
272
+ },
273
+ {
274
+ "epoch": 1.036733649046461,
275
+ "grad_norm": 0.30049994587898254,
276
+ "learning_rate": 9.737242016184486e-06,
277
+ "loss": 0.2913,
278
+ "step": 2700
279
+ },
280
+ {
281
+ "epoch": 1.0751311916037374,
282
+ "grad_norm": 0.25728341937065125,
283
+ "learning_rate": 9.717599295532518e-06,
284
+ "loss": 0.2911,
285
+ "step": 2800
286
+ },
287
+ {
288
+ "epoch": 1.1135287341610136,
289
+ "grad_norm": 0.31619054079055786,
290
+ "learning_rate": 9.697269875075667e-06,
291
+ "loss": 0.2879,
292
+ "step": 2900
293
+ },
294
+ {
295
+ "epoch": 1.15192627671829,
296
+ "grad_norm": 0.3005208671092987,
297
+ "learning_rate": 9.676256713990448e-06,
298
+ "loss": 0.2839,
299
+ "step": 3000
300
+ },
301
+ {
302
+ "epoch": 1.15192627671829,
303
+ "eval_valid_loss": 0.2648593783378601,
304
+ "eval_valid_runtime": 4.6888,
305
+ "eval_valid_samples_per_second": 213.274,
306
+ "eval_valid_steps_per_second": 6.825,
307
+ "step": 3000
308
+ },
309
+ {
310
+ "epoch": 1.15192627671829,
311
+ "eval_valid_target_loss": 0.2817968726158142,
312
+ "eval_valid_target_runtime": 4.6695,
313
+ "eval_valid_target_samples_per_second": 214.155,
314
+ "eval_valid_target_steps_per_second": 6.853,
315
+ "step": 3000
316
+ },
317
+ {
318
+ "epoch": 1.1903238192755663,
319
+ "grad_norm": 0.24846772849559784,
320
+ "learning_rate": 9.654562870979545e-06,
321
+ "loss": 0.2803,
322
+ "step": 3100
323
+ },
324
+ {
325
+ "epoch": 1.2287213618328428,
326
+ "grad_norm": 0.23501113057136536,
327
+ "learning_rate": 9.632191503826574e-06,
328
+ "loss": 0.278,
329
+ "step": 3200
330
+ },
331
+ {
332
+ "epoch": 1.267118904390119,
333
+ "grad_norm": 0.27793240547180176,
334
+ "learning_rate": 9.609145868936434e-06,
335
+ "loss": 0.2776,
336
+ "step": 3300
337
+ },
338
+ {
339
+ "epoch": 1.3055164469473954,
340
+ "grad_norm": 0.2599338889122009,
341
+ "learning_rate": 9.5854293208613e-06,
342
+ "loss": 0.275,
343
+ "step": 3400
344
+ },
345
+ {
346
+ "epoch": 1.3439139895046717,
347
+ "grad_norm": 0.2431340515613556,
348
+ "learning_rate": 9.561045311812335e-06,
349
+ "loss": 0.2722,
350
+ "step": 3500
351
+ },
352
+ {
353
+ "epoch": 1.3439139895046717,
354
+ "eval_valid_loss": 0.2545468807220459,
355
+ "eval_valid_runtime": 4.7131,
356
+ "eval_valid_samples_per_second": 212.177,
357
+ "eval_valid_steps_per_second": 6.79,
358
+ "step": 3500
359
+ },
360
+ {
361
+ "epoch": 1.3439139895046717,
362
+ "eval_valid_target_loss": 0.27326563000679016,
363
+ "eval_valid_target_runtime": 4.6635,
364
+ "eval_valid_target_samples_per_second": 214.433,
365
+ "eval_valid_target_steps_per_second": 6.862,
366
+ "step": 3500
367
+ },
368
+ {
369
+ "epoch": 1.382311532061948,
370
+ "grad_norm": 0.28658226132392883,
371
+ "learning_rate": 9.535997391157174e-06,
372
+ "loss": 0.2693,
373
+ "step": 3600
374
+ },
375
+ {
376
+ "epoch": 1.4207090746192244,
377
+ "grad_norm": 0.26118528842926025,
378
+ "learning_rate": 9.510289204903273e-06,
379
+ "loss": 0.2667,
380
+ "step": 3700
381
+ },
382
+ {
383
+ "epoch": 1.4591066171765008,
384
+ "grad_norm": 0.2761940062046051,
385
+ "learning_rate": 9.483924495167204e-06,
386
+ "loss": 0.2654,
387
+ "step": 3800
388
+ },
389
+ {
390
+ "epoch": 1.497504159733777,
391
+ "grad_norm": 0.2712952792644501,
392
+ "learning_rate": 9.456907099629933e-06,
393
+ "loss": 0.2642,
394
+ "step": 3900
395
+ },
396
+ {
397
+ "epoch": 1.5359017022910533,
398
+ "grad_norm": 0.23100949823856354,
399
+ "learning_rate": 9.429240950978212e-06,
400
+ "loss": 0.2622,
401
+ "step": 4000
402
+ },
403
+ {
404
+ "epoch": 1.5359017022910533,
405
+ "eval_valid_loss": 0.24485936760902405,
406
+ "eval_valid_runtime": 4.6751,
407
+ "eval_valid_samples_per_second": 213.9,
408
+ "eval_valid_steps_per_second": 6.845,
409
+ "step": 4000
410
+ },
411
+ {
412
+ "epoch": 1.5359017022910533,
413
+ "eval_valid_target_loss": 0.2641640603542328,
414
+ "eval_valid_target_runtime": 4.6656,
415
+ "eval_valid_target_samples_per_second": 214.335,
416
+ "eval_valid_target_steps_per_second": 6.859,
417
+ "step": 4000
418
+ },
419
+ {
420
+ "epoch": 1.5742992448483297,
421
+ "grad_norm": 0.2676081359386444,
422
+ "learning_rate": 9.400930076332126e-06,
423
+ "loss": 0.2602,
424
+ "step": 4100
425
+ },
426
+ {
427
+ "epoch": 1.6126967874056062,
428
+ "grad_norm": 0.24242335557937622,
429
+ "learning_rate": 9.371978596658904e-06,
430
+ "loss": 0.2573,
431
+ "step": 4200
432
+ },
433
+ {
434
+ "epoch": 1.6510943299628824,
435
+ "grad_norm": 0.27868130803108215,
436
+ "learning_rate": 9.342390726173065e-06,
437
+ "loss": 0.2574,
438
+ "step": 4300
439
+ },
440
+ {
441
+ "epoch": 1.6894918725201586,
442
+ "grad_norm": 0.2644180655479431,
443
+ "learning_rate": 9.31217077172299e-06,
444
+ "loss": 0.255,
445
+ "step": 4400
446
+ },
447
+ {
448
+ "epoch": 1.727889415077435,
449
+ "grad_norm": 0.2352069914340973,
450
+ "learning_rate": 9.281323132164013e-06,
451
+ "loss": 0.2538,
452
+ "step": 4500
453
+ },
454
+ {
455
+ "epoch": 1.727889415077435,
456
+ "eval_valid_loss": 0.2354765683412552,
457
+ "eval_valid_runtime": 4.7068,
458
+ "eval_valid_samples_per_second": 212.461,
459
+ "eval_valid_steps_per_second": 6.799,
460
+ "step": 4500
461
+ },
462
+ {
463
+ "epoch": 1.727889415077435,
464
+ "eval_valid_target_loss": 0.25502344965934753,
465
+ "eval_valid_target_runtime": 4.6612,
466
+ "eval_valid_target_samples_per_second": 214.536,
467
+ "eval_valid_target_steps_per_second": 6.865,
468
+ "step": 4500
469
+ },
470
+ {
471
+ "epoch": 1.7662869576347113,
472
+ "grad_norm": 0.28041261434555054,
473
+ "learning_rate": 9.249852297718116e-06,
474
+ "loss": 0.2507,
475
+ "step": 4600
476
+ },
477
+ {
478
+ "epoch": 1.8046845001919878,
479
+ "grad_norm": 0.2735440135002136,
480
+ "learning_rate": 9.217762849320334e-06,
481
+ "loss": 0.2496,
482
+ "step": 4700
483
+ },
484
+ {
485
+ "epoch": 1.843082042749264,
486
+ "grad_norm": 0.26316097378730774,
487
+ "learning_rate": 9.185059457951933e-06,
488
+ "loss": 0.2479,
489
+ "step": 4800
490
+ },
491
+ {
492
+ "epoch": 1.8814795853065402,
493
+ "grad_norm": 0.23891638219356537,
494
+ "learning_rate": 9.151746883960512e-06,
495
+ "loss": 0.2457,
496
+ "step": 4900
497
+ },
498
+ {
499
+ "epoch": 1.9198771278638167,
500
+ "grad_norm": 0.22432874143123627,
501
+ "learning_rate": 9.117829976367072e-06,
502
+ "loss": 0.2446,
503
+ "step": 5000
504
+ },
505
+ {
506
+ "epoch": 1.9198771278638167,
507
+ "eval_valid_loss": 0.2283046841621399,
508
+ "eval_valid_runtime": 4.6829,
509
+ "eval_valid_samples_per_second": 213.544,
510
+ "eval_valid_steps_per_second": 6.833,
511
+ "step": 5000
512
+ },
513
+ {
514
+ "epoch": 1.9198771278638167,
515
+ "eval_valid_target_loss": 0.24809375405311584,
516
+ "eval_valid_target_runtime": 4.6694,
517
+ "eval_valid_target_samples_per_second": 214.162,
518
+ "eval_valid_target_steps_per_second": 6.853,
519
+ "step": 5000
520
+ },
521
+ {
522
+ "epoch": 1.9582746704210932,
523
+ "grad_norm": 0.27488961815834045,
524
+ "learning_rate": 9.08331367216019e-06,
525
+ "loss": 0.2434,
526
+ "step": 5100
527
+ },
528
+ {
529
+ "epoch": 1.9966722129783694,
530
+ "grad_norm": 0.2284267097711563,
531
+ "learning_rate": 9.048202995577383e-06,
532
+ "loss": 0.24,
533
+ "step": 5200
534
+ },
535
+ {
536
+ "epoch": 2.0350697555356456,
537
+ "grad_norm": 0.2710357904434204,
538
+ "learning_rate": 9.012503057373769e-06,
539
+ "loss": 0.2399,
540
+ "step": 5300
541
+ },
542
+ {
543
+ "epoch": 2.073467298092922,
544
+ "grad_norm": 0.24398750066757202,
545
+ "learning_rate": 8.976219054078147e-06,
546
+ "loss": 0.2391,
547
+ "step": 5400
548
+ },
549
+ {
550
+ "epoch": 2.1118648406501985,
551
+ "grad_norm": 0.24732039868831635,
552
+ "learning_rate": 8.939356267236582e-06,
553
+ "loss": 0.2374,
554
+ "step": 5500
555
+ },
556
+ {
557
+ "epoch": 2.1118648406501985,
558
+ "eval_valid_loss": 0.22253906726837158,
559
+ "eval_valid_runtime": 4.6969,
560
+ "eval_valid_samples_per_second": 212.904,
561
+ "eval_valid_steps_per_second": 6.813,
562
+ "step": 5500
563
+ },
564
+ {
565
+ "epoch": 2.1118648406501985,
566
+ "eval_valid_target_loss": 0.24240624904632568,
567
+ "eval_valid_target_runtime": 4.6761,
568
+ "eval_valid_target_samples_per_second": 213.853,
569
+ "eval_valid_target_steps_per_second": 6.843,
570
+ "step": 5500
571
+ },
572
+ {
573
+ "epoch": 2.1502623832074748,
574
+ "grad_norm": 0.23949123919010162,
575
+ "learning_rate": 8.901920062643607e-06,
576
+ "loss": 0.2368,
577
+ "step": 5600
578
+ },
579
+ {
580
+ "epoch": 2.188659925764751,
581
+ "grad_norm": 0.26010605692863464,
582
+ "learning_rate": 8.863915889561188e-06,
583
+ "loss": 0.2351,
584
+ "step": 5700
585
+ },
586
+ {
587
+ "epoch": 2.2270574683220272,
588
+ "grad_norm": 0.2524034380912781,
589
+ "learning_rate": 8.825349279925506e-06,
590
+ "loss": 0.2333,
591
+ "step": 5800
592
+ },
593
+ {
594
+ "epoch": 2.265455010879304,
595
+ "grad_norm": 0.24745632708072662,
596
+ "learning_rate": 8.78622584754173e-06,
597
+ "loss": 0.2323,
598
+ "step": 5900
599
+ },
600
+ {
601
+ "epoch": 2.30385255343658,
602
+ "grad_norm": 0.2586907148361206,
603
+ "learning_rate": 8.746551287266863e-06,
604
+ "loss": 0.2312,
605
+ "step": 6000
606
+ },
607
+ {
608
+ "epoch": 2.30385255343658,
609
+ "eval_valid_loss": 0.216859370470047,
610
+ "eval_valid_runtime": 4.6709,
611
+ "eval_valid_samples_per_second": 214.092,
612
+ "eval_valid_steps_per_second": 6.851,
613
+ "step": 6000
614
+ },
615
+ {
616
+ "epoch": 2.30385255343658,
617
+ "eval_valid_target_loss": 0.23771093785762787,
618
+ "eval_valid_target_runtime": 4.6848,
619
+ "eval_valid_target_samples_per_second": 213.455,
620
+ "eval_valid_target_steps_per_second": 6.831,
621
+ "step": 6000
622
+ },
623
+ {
624
+ "epoch": 2.3422500959938564,
625
+ "grad_norm": 0.24499697983264923,
626
+ "learning_rate": 8.706331374180792e-06,
627
+ "loss": 0.2301,
628
+ "step": 6100
629
+ },
630
+ {
631
+ "epoch": 2.3806476385511326,
632
+ "grad_norm": 0.24237163364887238,
633
+ "learning_rate": 8.665571962745655e-06,
634
+ "loss": 0.2304,
635
+ "step": 6200
636
+ },
637
+ {
638
+ "epoch": 2.419045181108409,
639
+ "grad_norm": 0.27395910024642944,
640
+ "learning_rate": 8.624278985953665e-06,
641
+ "loss": 0.2287,
642
+ "step": 6300
643
+ },
644
+ {
645
+ "epoch": 2.4574427236656855,
646
+ "grad_norm": 0.2500033378601074,
647
+ "learning_rate": 8.582458454463493e-06,
648
+ "loss": 0.2279,
649
+ "step": 6400
650
+ },
651
+ {
652
+ "epoch": 2.4958402662229617,
653
+ "grad_norm": 0.2605977952480316,
654
+ "learning_rate": 8.540116455725346e-06,
655
+ "loss": 0.2277,
656
+ "step": 6500
657
+ },
658
+ {
659
+ "epoch": 2.4958402662229617,
660
+ "eval_valid_loss": 0.21196874976158142,
661
+ "eval_valid_runtime": 4.6941,
662
+ "eval_valid_samples_per_second": 213.035,
663
+ "eval_valid_steps_per_second": 6.817,
664
+ "step": 6500
665
+ },
666
+ {
667
+ "epoch": 2.4958402662229617,
668
+ "eval_valid_target_loss": 0.23328906297683716,
669
+ "eval_valid_target_runtime": 4.6792,
670
+ "eval_valid_target_samples_per_second": 213.712,
671
+ "eval_valid_target_steps_per_second": 6.839,
672
+ "step": 6500
673
+ },
674
+ {
675
+ "epoch": 2.534237808780238,
676
+ "grad_norm": 0.2220095992088318,
677
+ "learning_rate": 8.497259153094875e-06,
678
+ "loss": 0.2254,
679
+ "step": 6600
680
+ },
681
+ {
682
+ "epoch": 2.5726353513375146,
683
+ "grad_norm": 0.24707047641277313,
684
+ "learning_rate": 8.453892784936022e-06,
685
+ "loss": 0.2239,
686
+ "step": 6700
687
+ },
688
+ {
689
+ "epoch": 2.611032893894791,
690
+ "grad_norm": 0.23103290796279907,
691
+ "learning_rate": 8.41002366371297e-06,
692
+ "loss": 0.224,
693
+ "step": 6800
694
+ },
695
+ {
696
+ "epoch": 2.649430436452067,
697
+ "grad_norm": 0.2249547839164734,
698
+ "learning_rate": 8.36565817507127e-06,
699
+ "loss": 0.2227,
700
+ "step": 6900
701
+ },
702
+ {
703
+ "epoch": 2.6878279790093433,
704
+ "grad_norm": 0.24457262456417084,
705
+ "learning_rate": 8.32080277690836e-06,
706
+ "loss": 0.2209,
707
+ "step": 7000
708
+ },
709
+ {
710
+ "epoch": 2.6878279790093433,
711
+ "eval_valid_loss": 0.20793749392032623,
712
+ "eval_valid_runtime": 4.6727,
713
+ "eval_valid_samples_per_second": 214.01,
714
+ "eval_valid_steps_per_second": 6.848,
715
+ "step": 7000
716
+ },
717
+ {
718
+ "epoch": 2.6878279790093433,
719
+ "eval_valid_target_loss": 0.22950781881809235,
720
+ "eval_valid_target_runtime": 4.6848,
721
+ "eval_valid_target_samples_per_second": 213.456,
722
+ "eval_valid_target_steps_per_second": 6.831,
723
+ "step": 7000
724
+ },
725
+ {
726
+ "epoch": 2.7262255215666196,
727
+ "grad_norm": 0.23176012933254242,
728
+ "learning_rate": 8.275463998433537e-06,
729
+ "loss": 0.2206,
730
+ "step": 7100
731
+ },
732
+ {
733
+ "epoch": 2.764623064123896,
734
+ "grad_norm": 0.21723733842372894,
735
+ "learning_rate": 8.229648439217552e-06,
736
+ "loss": 0.2203,
737
+ "step": 7200
738
+ },
739
+ {
740
+ "epoch": 2.8030206066811725,
741
+ "grad_norm": 0.2428179383277893,
742
+ "learning_rate": 8.183362768231971e-06,
743
+ "loss": 0.2192,
744
+ "step": 7300
745
+ },
746
+ {
747
+ "epoch": 2.8414181492384487,
748
+ "grad_norm": 0.2162482738494873,
749
+ "learning_rate": 8.136613722878437e-06,
750
+ "loss": 0.2183,
751
+ "step": 7400
752
+ },
753
+ {
754
+ "epoch": 2.879815691795725,
755
+ "grad_norm": 0.22231200337409973,
756
+ "learning_rate": 8.08940810800796e-06,
757
+ "loss": 0.2177,
758
+ "step": 7500
759
+ },
760
+ {
761
+ "epoch": 2.879815691795725,
762
+ "eval_valid_loss": 0.20469531416893005,
763
+ "eval_valid_runtime": 4.6819,
764
+ "eval_valid_samples_per_second": 213.587,
765
+ "eval_valid_steps_per_second": 6.835,
766
+ "step": 7500
767
+ },
768
+ {
769
+ "epoch": 2.879815691795725,
770
+ "eval_valid_target_loss": 0.2264062464237213,
771
+ "eval_valid_target_runtime": 4.6647,
772
+ "eval_valid_target_samples_per_second": 214.376,
773
+ "eval_valid_target_steps_per_second": 6.86,
774
+ "step": 7500
775
+ },
776
+ {
777
+ "epoch": 2.9182132343530016,
778
+ "grad_norm": 0.2663327157497406,
779
+ "learning_rate": 8.041752794930389e-06,
780
+ "loss": 0.2172,
781
+ "step": 7600
782
+ },
783
+ {
784
+ "epoch": 2.956610776910278,
785
+ "grad_norm": 0.2545444369316101,
786
+ "learning_rate": 7.993654720414227e-06,
787
+ "loss": 0.216,
788
+ "step": 7700
789
+ },
790
+ {
791
+ "epoch": 2.995008319467554,
792
+ "grad_norm": 0.2252371460199356,
793
+ "learning_rate": 7.9451208856769e-06,
794
+ "loss": 0.2154,
795
+ "step": 7800
796
+ },
797
+ {
798
+ "epoch": 3.0334058620248303,
799
+ "grad_norm": 0.2507840394973755,
800
+ "learning_rate": 7.896158355365643e-06,
801
+ "loss": 0.2151,
802
+ "step": 7900
803
+ },
804
+ {
805
+ "epoch": 3.0718034045821065,
806
+ "grad_norm": 0.22570189833641052,
807
+ "learning_rate": 7.846774256529178e-06,
808
+ "loss": 0.2131,
809
+ "step": 8000
810
+ },
811
+ {
812
+ "epoch": 3.0718034045821065,
813
+ "eval_valid_loss": 0.2014453113079071,
814
+ "eval_valid_runtime": 4.6924,
815
+ "eval_valid_samples_per_second": 213.111,
816
+ "eval_valid_steps_per_second": 6.82,
817
+ "step": 8000
818
+ },
819
+ {
820
+ "epoch": 3.0718034045821065,
821
+ "eval_valid_target_loss": 0.22346094250679016,
822
+ "eval_valid_target_runtime": 4.6655,
823
+ "eval_valid_target_samples_per_second": 214.339,
824
+ "eval_valid_target_steps_per_second": 6.859,
825
+ "step": 8000
826
+ },
827
+ {
828
+ "epoch": 3.110200947139383,
829
+ "grad_norm": 0.24750301241874695,
830
+ "learning_rate": 7.796975777580276e-06,
831
+ "loss": 0.2133,
832
+ "step": 8100
833
+ },
834
+ {
835
+ "epoch": 3.1485984896966595,
836
+ "grad_norm": 0.2118765264749527,
837
+ "learning_rate": 7.746770167249413e-06,
838
+ "loss": 0.2124,
839
+ "step": 8200
840
+ },
841
+ {
842
+ "epoch": 3.1869960322539357,
843
+ "grad_norm": 0.22295965254306793,
844
+ "learning_rate": 7.696164733529628e-06,
845
+ "loss": 0.2123,
846
+ "step": 8300
847
+ },
848
+ {
849
+ "epoch": 3.225393574811212,
850
+ "grad_norm": 0.2226712554693222,
851
+ "learning_rate": 7.645166842612766e-06,
852
+ "loss": 0.2115,
853
+ "step": 8400
854
+ },
855
+ {
856
+ "epoch": 3.2637911173684886,
857
+ "grad_norm": 0.22712872922420502,
858
+ "learning_rate": 7.593783917817248e-06,
859
+ "loss": 0.211,
860
+ "step": 8500
861
+ },
862
+ {
863
+ "epoch": 3.2637911173684886,
864
+ "eval_valid_loss": 0.19893750548362732,
865
+ "eval_valid_runtime": 4.6876,
866
+ "eval_valid_samples_per_second": 213.327,
867
+ "eval_valid_steps_per_second": 6.826,
868
+ "step": 8500
869
+ },
870
+ {
871
+ "epoch": 3.2637911173684886,
872
+ "eval_valid_target_loss": 0.22138281166553497,
873
+ "eval_valid_target_runtime": 4.6684,
874
+ "eval_valid_target_samples_per_second": 214.206,
875
+ "eval_valid_target_steps_per_second": 6.855,
876
+ "step": 8500
877
+ },
878
+ {
879
+ "epoch": 3.302188659925765,
880
+ "grad_norm": 0.20663662254810333,
881
+ "learning_rate": 7.5420234385075155e-06,
882
+ "loss": 0.211,
883
+ "step": 8600
884
+ },
885
+ {
886
+ "epoch": 3.340586202483041,
887
+ "grad_norm": 0.24639233946800232,
888
+ "learning_rate": 7.489892939005333e-06,
889
+ "loss": 0.2099,
890
+ "step": 8700
891
+ },
892
+ {
893
+ "epoch": 3.3789837450403173,
894
+ "grad_norm": 0.21435491740703583,
895
+ "learning_rate": 7.437400007493079e-06,
896
+ "loss": 0.209,
897
+ "step": 8800
898
+ },
899
+ {
900
+ "epoch": 3.4173812875975935,
901
+ "grad_norm": 0.21131959557533264,
902
+ "learning_rate": 7.384552284909195e-06,
903
+ "loss": 0.2081,
904
+ "step": 8900
905
+ },
906
+ {
907
+ "epoch": 3.45577883015487,
908
+ "grad_norm": 0.2295517921447754,
909
+ "learning_rate": 7.3313574638359734e-06,
910
+ "loss": 0.2084,
911
+ "step": 9000
912
+ },
913
+ {
914
+ "epoch": 3.45577883015487,
915
+ "eval_valid_loss": 0.19658593833446503,
916
+ "eval_valid_runtime": 4.6935,
917
+ "eval_valid_samples_per_second": 213.059,
918
+ "eval_valid_steps_per_second": 6.818,
919
+ "step": 9000
920
+ },
921
+ {
922
+ "epoch": 3.45577883015487,
923
+ "eval_valid_target_loss": 0.2188750058412552,
924
+ "eval_valid_target_runtime": 4.6686,
925
+ "eval_valid_target_samples_per_second": 214.199,
926
+ "eval_valid_target_steps_per_second": 6.854,
927
+ "step": 9000
928
+ },
929
+ {
930
+ "epoch": 3.4941763727121464,
931
+ "grad_norm": 0.2244088351726532,
932
+ "learning_rate": 7.277823287379801e-06,
933
+ "loss": 0.2084,
934
+ "step": 9100
935
+ },
936
+ {
937
+ "epoch": 3.5325739152694227,
938
+ "grad_norm": 0.2267696112394333,
939
+ "learning_rate": 7.2239575480440774e-06,
940
+ "loss": 0.2085,
941
+ "step": 9200
942
+ },
943
+ {
944
+ "epoch": 3.5709714578266993,
945
+ "grad_norm": 0.20846766233444214,
946
+ "learning_rate": 7.169768086594913e-06,
947
+ "loss": 0.2063,
948
+ "step": 9300
949
+ },
950
+ {
951
+ "epoch": 3.6093690003839756,
952
+ "grad_norm": 0.23632733523845673,
953
+ "learning_rate": 7.115262790919827e-06,
954
+ "loss": 0.2068,
955
+ "step": 9400
956
+ },
957
+ {
958
+ "epoch": 3.647766542941252,
959
+ "grad_norm": 0.20877471566200256,
960
+ "learning_rate": 7.060449594879573e-06,
961
+ "loss": 0.2059,
962
+ "step": 9500
963
+ },
964
+ {
965
+ "epoch": 3.647766542941252,
966
+ "eval_valid_loss": 0.19441406428813934,
967
+ "eval_valid_runtime": 4.6671,
968
+ "eval_valid_samples_per_second": 214.264,
969
+ "eval_valid_steps_per_second": 6.856,
970
+ "step": 9500
971
+ },
972
+ {
973
+ "epoch": 3.647766542941252,
974
+ "eval_valid_target_loss": 0.21704687178134918,
975
+ "eval_valid_target_runtime": 4.6648,
976
+ "eval_valid_target_samples_per_second": 214.371,
977
+ "eval_valid_target_steps_per_second": 6.86,
978
+ "step": 9500
979
+ },
980
+ {
981
+ "epoch": 3.686164085498528,
982
+ "grad_norm": 0.20587915182113647,
983
+ "learning_rate": 7.0053364771532805e-06,
984
+ "loss": 0.2058,
985
+ "step": 9600
986
+ },
987
+ {
988
+ "epoch": 3.7245616280558043,
989
+ "grad_norm": 0.208708256483078,
990
+ "learning_rate": 6.949931460077058e-06,
991
+ "loss": 0.2052,
992
+ "step": 9700
993
+ },
994
+ {
995
+ "epoch": 3.7629591706130805,
996
+ "grad_norm": 0.21517980098724365,
997
+ "learning_rate": 6.894242608476263e-06,
998
+ "loss": 0.2049,
999
+ "step": 9800
1000
+ },
1001
+ {
1002
+ "epoch": 3.801356713170357,
1003
+ "grad_norm": 0.22570070624351501,
1004
+ "learning_rate": 6.8382780284915685e-06,
1005
+ "loss": 0.2047,
1006
+ "step": 9900
1007
+ },
1008
+ {
1009
+ "epoch": 3.8397542557276334,
1010
+ "grad_norm": 0.22346258163452148,
1011
+ "learning_rate": 6.782045866399023e-06,
1012
+ "loss": 0.2037,
1013
+ "step": 10000
1014
+ },
1015
+ {
1016
+ "epoch": 3.8397542557276334,
1017
+ "eval_valid_loss": 0.1928359419107437,
1018
+ "eval_valid_runtime": 4.6748,
1019
+ "eval_valid_samples_per_second": 213.912,
1020
+ "eval_valid_steps_per_second": 6.845,
1021
+ "step": 10000
1022
+ },
1023
+ {
1024
+ "epoch": 3.8397542557276334,
1025
+ "eval_valid_target_loss": 0.21531249582767487,
1026
+ "eval_valid_target_runtime": 4.6773,
1027
+ "eval_valid_target_samples_per_second": 213.8,
1028
+ "eval_valid_target_steps_per_second": 6.842,
1029
+ "step": 10000
1030
+ },
1031
+ {
1032
+ "epoch": 3.8781517982849096,
1033
+ "grad_norm": 0.2544507086277008,
1034
+ "learning_rate": 6.725554307424274e-06,
1035
+ "loss": 0.2036,
1036
+ "step": 10100
1037
+ },
1038
+ {
1039
+ "epoch": 3.9165493408421863,
1040
+ "grad_norm": 0.27723318338394165,
1041
+ "learning_rate": 6.668811574551106e-06,
1042
+ "loss": 0.2039,
1043
+ "step": 10200
1044
+ },
1045
+ {
1046
+ "epoch": 3.9549468833994625,
1047
+ "grad_norm": 0.22496485710144043,
1048
+ "learning_rate": 6.6118259273245065e-06,
1049
+ "loss": 0.2032,
1050
+ "step": 10300
1051
+ },
1052
+ {
1053
+ "epoch": 3.9933444259567388,
1054
+ "grad_norm": 0.22093619406223297,
1055
+ "learning_rate": 6.55460566064838e-06,
1056
+ "loss": 0.2027,
1057
+ "step": 10400
1058
+ },
1059
+ {
1060
+ "epoch": 4.031741968514015,
1061
+ "grad_norm": 0.2137976437807083,
1062
+ "learning_rate": 6.497159103578143e-06,
1063
+ "loss": 0.2016,
1064
+ "step": 10500
1065
+ },
1066
+ {
1067
+ "epoch": 4.031741968514015,
1068
+ "eval_valid_loss": 0.19111718237400055,
1069
+ "eval_valid_runtime": 4.6833,
1070
+ "eval_valid_samples_per_second": 213.523,
1071
+ "eval_valid_steps_per_second": 6.833,
1072
+ "step": 10500
1073
+ },
1074
+ {
1075
+ "epoch": 4.031741968514015,
1076
+ "eval_valid_target_loss": 0.2142656296491623,
1077
+ "eval_valid_target_runtime": 4.6587,
1078
+ "eval_valid_target_samples_per_second": 214.65,
1079
+ "eval_valid_target_steps_per_second": 6.869,
1080
+ "step": 10500
1081
+ },
1082
+ {
1083
+ "epoch": 4.070139511071291,
1084
+ "grad_norm": 0.20360158383846283,
1085
+ "learning_rate": 6.439494618108332e-06,
1086
+ "loss": 0.2013,
1087
+ "step": 10600
1088
+ },
1089
+ {
1090
+ "epoch": 4.1085370536285675,
1091
+ "grad_norm": 0.21878282725811005,
1092
+ "learning_rate": 6.38162059795542e-06,
1093
+ "loss": 0.2006,
1094
+ "step": 10700
1095
+ },
1096
+ {
1097
+ "epoch": 4.146934596185844,
1098
+ "grad_norm": 0.2319776862859726,
1099
+ "learning_rate": 6.323545467336017e-06,
1100
+ "loss": 0.2012,
1101
+ "step": 10800
1102
+ },
1103
+ {
1104
+ "epoch": 4.185332138743121,
1105
+ "grad_norm": 0.20898312330245972,
1106
+ "learning_rate": 6.26527767974063e-06,
1107
+ "loss": 0.2005,
1108
+ "step": 10900
1109
+ },
1110
+ {
1111
+ "epoch": 4.223729681300397,
1112
+ "grad_norm": 0.21366915106773376,
1113
+ "learning_rate": 6.206825716703166e-06,
1114
+ "loss": 0.2,
1115
+ "step": 11000
1116
+ },
1117
+ {
1118
+ "epoch": 4.223729681300397,
1119
+ "eval_valid_loss": 0.18977344036102295,
1120
+ "eval_valid_runtime": 4.7328,
1121
+ "eval_valid_samples_per_second": 211.293,
1122
+ "eval_valid_steps_per_second": 6.761,
1123
+ "step": 11000
1124
+ },
1125
+ {
1126
+ "epoch": 4.223729681300397,
1127
+ "eval_valid_target_loss": 0.21274219453334808,
1128
+ "eval_valid_target_runtime": 4.6506,
1129
+ "eval_valid_target_samples_per_second": 215.026,
1130
+ "eval_valid_target_steps_per_second": 6.881,
1131
+ "step": 11000
1132
+ },
1133
+ {
1134
+ "epoch": 4.262127223857673,
1135
+ "grad_norm": 0.20968745648860931,
1136
+ "learning_rate": 6.1481980865663405e-06,
1137
+ "loss": 0.1993,
1138
+ "step": 11100
1139
+ },
1140
+ {
1141
+ "epoch": 4.3005247664149495,
1142
+ "grad_norm": 0.20683012902736664,
1143
+ "learning_rate": 6.089403323243203e-06,
1144
+ "loss": 0.1992,
1145
+ "step": 11200
1146
+ },
1147
+ {
1148
+ "epoch": 4.338922308972226,
1149
+ "grad_norm": 0.20785097777843475,
1150
+ "learning_rate": 6.030449984974916e-06,
1151
+ "loss": 0.199,
1152
+ "step": 11300
1153
+ },
1154
+ {
1155
+ "epoch": 4.377319851529502,
1156
+ "grad_norm": 0.20532238483428955,
1157
+ "learning_rate": 5.971346653085025e-06,
1158
+ "loss": 0.199,
1159
+ "step": 11400
1160
+ },
1161
+ {
1162
+ "epoch": 4.415717394086778,
1163
+ "grad_norm": 0.21589842438697815,
1164
+ "learning_rate": 5.912101930730329e-06,
1165
+ "loss": 0.1992,
1166
+ "step": 11500
1167
+ },
1168
+ {
1169
+ "epoch": 4.415717394086778,
1170
+ "eval_valid_loss": 0.18833594024181366,
1171
+ "eval_valid_runtime": 4.6904,
1172
+ "eval_valid_samples_per_second": 213.203,
1173
+ "eval_valid_steps_per_second": 6.823,
1174
+ "step": 11500
1175
+ },
1176
+ {
1177
+ "epoch": 4.415717394086778,
1178
+ "eval_valid_target_loss": 0.211976557970047,
1179
+ "eval_valid_target_runtime": 4.658,
1180
+ "eval_valid_target_samples_per_second": 214.686,
1181
+ "eval_valid_target_steps_per_second": 6.87,
1182
+ "step": 11500
1183
+ },
1184
+ {
1185
+ "epoch": 4.4541149366440544,
1186
+ "grad_norm": 0.2021540254354477,
1187
+ "learning_rate": 5.852724441648614e-06,
1188
+ "loss": 0.1987,
1189
+ "step": 11600
1190
+ },
1191
+ {
1192
+ "epoch": 4.492512479201331,
1193
+ "grad_norm": 0.24406403303146362,
1194
+ "learning_rate": 5.7932228289033506e-06,
1195
+ "loss": 0.1984,
1196
+ "step": 11700
1197
+ },
1198
+ {
1199
+ "epoch": 4.530910021758608,
1200
+ "grad_norm": 0.20519228279590607,
1201
+ "learning_rate": 5.7336057536256216e-06,
1202
+ "loss": 0.1984,
1203
+ "step": 11800
1204
+ },
1205
+ {
1206
+ "epoch": 4.569307564315884,
1207
+ "grad_norm": 0.21227143704891205,
1208
+ "learning_rate": 5.67388189375337e-06,
1209
+ "loss": 0.1976,
1210
+ "step": 11900
1211
+ },
1212
+ {
1213
+ "epoch": 4.60770510687316,
1214
+ "grad_norm": 0.2325662076473236,
1215
+ "learning_rate": 5.614059942768254e-06,
1216
+ "loss": 0.1977,
1217
+ "step": 12000
1218
+ },
1219
+ {
1220
+ "epoch": 4.60770510687316,
1221
+ "eval_valid_loss": 0.18742187321186066,
1222
+ "eval_valid_runtime": 4.6831,
1223
+ "eval_valid_samples_per_second": 213.535,
1224
+ "eval_valid_steps_per_second": 6.833,
1225
+ "step": 12000
1226
+ },
1227
+ {
1228
+ "epoch": 4.60770510687316,
1229
+ "eval_valid_target_loss": 0.21108593046665192,
1230
+ "eval_valid_target_runtime": 4.6502,
1231
+ "eval_valid_target_samples_per_second": 215.046,
1232
+ "eval_valid_target_steps_per_second": 6.881,
1233
+ "step": 12000
1234
+ },
1235
+ {
1236
+ "epoch": 4.6461026494304365,
1237
+ "grad_norm": 0.2245544046163559,
1238
+ "learning_rate": 5.554148608430192e-06,
1239
+ "loss": 0.1965,
1240
+ "step": 12100
1241
+ },
1242
+ {
1243
+ "epoch": 4.684500191987713,
1244
+ "grad_norm": 0.22662824392318726,
1245
+ "learning_rate": 5.4941566115098614e-06,
1246
+ "loss": 0.1971,
1247
+ "step": 12200
1248
+ },
1249
+ {
1250
+ "epoch": 4.722897734544989,
1251
+ "grad_norm": 0.19245535135269165,
1252
+ "learning_rate": 5.4340926845192874e-06,
1253
+ "loss": 0.1974,
1254
+ "step": 12300
1255
+ },
1256
+ {
1257
+ "epoch": 4.761295277102265,
1258
+ "grad_norm": 0.18942756950855255,
1259
+ "learning_rate": 5.373965570440729e-06,
1260
+ "loss": 0.1966,
1261
+ "step": 12400
1262
+ },
1263
+ {
1264
+ "epoch": 4.799692819659541,
1265
+ "grad_norm": 0.1962059736251831,
1266
+ "learning_rate": 5.3137840214540395e-06,
1267
+ "loss": 0.1958,
1268
+ "step": 12500
1269
+ },
1270
+ {
1271
+ "epoch": 4.799692819659541,
1272
+ "eval_valid_loss": 0.18663281202316284,
1273
+ "eval_valid_runtime": 4.6972,
1274
+ "eval_valid_samples_per_second": 212.895,
1275
+ "eval_valid_steps_per_second": 6.813,
1276
+ "step": 12500
1277
+ },
1278
+ {
1279
+ "epoch": 4.799692819659541,
1280
+ "eval_valid_target_loss": 0.21009375154972076,
1281
+ "eval_valid_target_runtime": 4.6708,
1282
+ "eval_valid_target_samples_per_second": 214.096,
1283
+ "eval_valid_target_steps_per_second": 6.851,
1284
+ "step": 12500
1285
+ },
1286
+ {
1287
+ "epoch": 4.838090362216818,
1288
+ "grad_norm": 0.2151457667350769,
1289
+ "learning_rate": 5.2535567976626846e-06,
1290
+ "loss": 0.1963,
1291
+ "step": 12600
1292
+ },
1293
+ {
1294
+ "epoch": 4.876487904774095,
1295
+ "grad_norm": 0.18380814790725708,
1296
+ "learning_rate": 5.1932926658186166e-06,
1297
+ "loss": 0.1959,
1298
+ "step": 12700
1299
+ },
1300
+ {
1301
+ "epoch": 4.914885447331371,
1302
+ "grad_norm": 0.19516663253307343,
1303
+ "learning_rate": 5.133000398046168e-06,
1304
+ "loss": 0.1953,
1305
+ "step": 12800
1306
+ },
1307
+ {
1308
+ "epoch": 4.953282989888647,
1309
+ "grad_norm": 0.24182352423667908,
1310
+ "learning_rate": 5.072688770565177e-06,
1311
+ "loss": 0.1953,
1312
+ "step": 12900
1313
+ },
1314
+ {
1315
+ "epoch": 4.9916805324459235,
1316
+ "grad_norm": 0.23720215260982513,
1317
+ "learning_rate": 5.012366562413501e-06,
1318
+ "loss": 0.1955,
1319
+ "step": 13000
1320
+ },
1321
+ {
1322
+ "epoch": 4.9916805324459235,
1323
+ "eval_valid_loss": 0.18524999916553497,
1324
+ "eval_valid_runtime": 4.6908,
1325
+ "eval_valid_samples_per_second": 213.184,
1326
+ "eval_valid_steps_per_second": 6.822,
1327
+ "step": 13000
1328
+ },
1329
+ {
1330
+ "epoch": 4.9916805324459235,
1331
+ "eval_valid_target_loss": 0.20893749594688416,
1332
+ "eval_valid_target_runtime": 4.6667,
1333
+ "eval_valid_target_samples_per_second": 214.285,
1334
+ "eval_valid_target_steps_per_second": 6.857,
1335
+ "step": 13000
1336
+ },
1337
+ {
1338
+ "epoch": 5.0300780750032,
1339
+ "grad_norm": 0.20271484553813934,
1340
+ "learning_rate": 4.952042554169138e-06,
1341
+ "loss": 0.1948,
1342
+ "step": 13100
1343
+ },
1344
+ {
1345
+ "epoch": 5.068475617560476,
1346
+ "grad_norm": 0.2053770273923874,
1347
+ "learning_rate": 4.891725526672107e-06,
1348
+ "loss": 0.1947,
1349
+ "step": 13200
1350
+ },
1351
+ {
1352
+ "epoch": 5.106873160117752,
1353
+ "grad_norm": 0.20811979472637177,
1354
+ "learning_rate": 4.8314242597463e-06,
1355
+ "loss": 0.1939,
1356
+ "step": 13300
1357
+ },
1358
+ {
1359
+ "epoch": 5.145270702675028,
1360
+ "grad_norm": 0.19889037311077118,
1361
+ "learning_rate": 4.771147530921483e-06,
1362
+ "loss": 0.1943,
1363
+ "step": 13400
1364
+ },
1365
+ {
1366
+ "epoch": 5.1836682452323055,
1367
+ "grad_norm": 0.2038932591676712,
1368
+ "learning_rate": 4.710904114155621e-06,
1369
+ "loss": 0.1938,
1370
+ "step": 13500
1371
+ },
1372
+ {
1373
+ "epoch": 5.1836682452323055,
1374
+ "eval_valid_loss": 0.1847265660762787,
1375
+ "eval_valid_runtime": 4.698,
1376
+ "eval_valid_samples_per_second": 212.854,
1377
+ "eval_valid_steps_per_second": 6.811,
1378
+ "step": 13500
1379
+ },
1380
+ {
1381
+ "epoch": 5.1836682452323055,
1382
+ "eval_valid_target_loss": 0.20839843153953552,
1383
+ "eval_valid_target_runtime": 4.6593,
1384
+ "eval_valid_target_samples_per_second": 214.626,
1385
+ "eval_valid_target_steps_per_second": 6.868,
1386
+ "step": 13500
1387
+ },
1388
+ {
1389
+ "epoch": 5.222065787789582,
1390
+ "grad_norm": 0.19585560262203217,
1391
+ "learning_rate": 4.650702778557736e-06,
1392
+ "loss": 0.1932,
1393
+ "step": 13600
1394
+ },
1395
+ {
1396
+ "epoch": 5.260463330346858,
1397
+ "grad_norm": 0.23953603208065033,
1398
+ "learning_rate": 4.59055228711146e-06,
1399
+ "loss": 0.1933,
1400
+ "step": 13700
1401
+ },
1402
+ {
1403
+ "epoch": 5.298860872904134,
1404
+ "grad_norm": 0.21477288007736206,
1405
+ "learning_rate": 4.530461395399485e-06,
1406
+ "loss": 0.1929,
1407
+ "step": 13800
1408
+ },
1409
+ {
1410
+ "epoch": 5.33725841546141,
1411
+ "grad_norm": 0.22662727534770966,
1412
+ "learning_rate": 4.470438850329089e-06,
1413
+ "loss": 0.1935,
1414
+ "step": 13900
1415
+ },
1416
+ {
1417
+ "epoch": 5.375655958018687,
1418
+ "grad_norm": 0.18912354111671448,
1419
+ "learning_rate": 4.410493388858925e-06,
1420
+ "loss": 0.1931,
1421
+ "step": 14000
1422
+ },
1423
+ {
1424
+ "epoch": 5.375655958018687,
1425
+ "eval_valid_loss": 0.18379686772823334,
1426
+ "eval_valid_runtime": 4.6729,
1427
+ "eval_valid_samples_per_second": 214.001,
1428
+ "eval_valid_steps_per_second": 6.848,
1429
+ "step": 14000
1430
+ },
1431
+ {
1432
+ "epoch": 5.375655958018687,
1433
+ "eval_valid_target_loss": 0.20746874809265137,
1434
+ "eval_valid_target_runtime": 4.6581,
1435
+ "eval_valid_target_samples_per_second": 214.682,
1436
+ "eval_valid_target_steps_per_second": 6.87,
1437
+ "step": 14000
1438
+ },
1439
+ {
1440
+ "epoch": 5.414053500575963,
1441
+ "grad_norm": 0.21155835688114166,
1442
+ "learning_rate": 4.350633736727259e-06,
1443
+ "loss": 0.193,
1444
+ "step": 14100
1445
+ },
1446
+ {
1447
+ "epoch": 5.452451043133239,
1448
+ "grad_norm": 0.2160138338804245,
1449
+ "learning_rate": 4.29086860718184e-06,
1450
+ "loss": 0.1931,
1451
+ "step": 14200
1452
+ },
1453
+ {
1454
+ "epoch": 5.490848585690516,
1455
+ "grad_norm": 0.19270409643650055,
1456
+ "learning_rate": 4.231206699711587e-06,
1457
+ "loss": 0.1925,
1458
+ "step": 14300
1459
+ },
1460
+ {
1461
+ "epoch": 5.5292461282477925,
1462
+ "grad_norm": 0.18501386046409607,
1463
+ "learning_rate": 4.171656698780281e-06,
1464
+ "loss": 0.1925,
1465
+ "step": 14400
1466
+ },
1467
+ {
1468
+ "epoch": 5.567643670805069,
1469
+ "grad_norm": 0.20564299821853638,
1470
+ "learning_rate": 4.112227272562447e-06,
1471
+ "loss": 0.1918,
1472
+ "step": 14500
1473
+ },
1474
+ {
1475
+ "epoch": 5.567643670805069,
1476
+ "eval_valid_loss": 0.18317969143390656,
1477
+ "eval_valid_runtime": 4.679,
1478
+ "eval_valid_samples_per_second": 213.72,
1479
+ "eval_valid_steps_per_second": 6.839,
1480
+ "step": 14500
1481
+ },
1482
+ {
1483
+ "epoch": 5.567643670805069,
1484
+ "eval_valid_target_loss": 0.20700781047344208,
1485
+ "eval_valid_target_runtime": 4.674,
1486
+ "eval_valid_target_samples_per_second": 213.95,
1487
+ "eval_valid_target_steps_per_second": 6.846,
1488
+ "step": 14500
1489
+ },
1490
+ {
1491
+ "epoch": 5.606041213362345,
1492
+ "grad_norm": 0.21509169042110443,
1493
+ "learning_rate": 4.052927071681593e-06,
1494
+ "loss": 0.1919,
1495
+ "step": 14600
1496
+ },
1497
+ {
1498
+ "epoch": 5.644438755919621,
1499
+ "grad_norm": 0.18730491399765015,
1500
+ "learning_rate": 3.99376472795103e-06,
1501
+ "loss": 0.1921,
1502
+ "step": 14700
1503
+ },
1504
+ {
1505
+ "epoch": 5.682836298476897,
1506
+ "grad_norm": 0.21269969642162323,
1507
+ "learning_rate": 3.934748853117398e-06,
1508
+ "loss": 0.1918,
1509
+ "step": 14800
1510
+ },
1511
+ {
1512
+ "epoch": 5.721233841034174,
1513
+ "grad_norm": 0.18910899758338928,
1514
+ "learning_rate": 3.8758880376071415e-06,
1515
+ "loss": 0.1914,
1516
+ "step": 14900
1517
+ },
1518
+ {
1519
+ "epoch": 5.75963138359145,
1520
+ "grad_norm": 0.22251802682876587,
1521
+ "learning_rate": 3.8171908492760665e-06,
1522
+ "loss": 0.1916,
1523
+ "step": 15000
1524
+ },
1525
+ {
1526
+ "epoch": 5.75963138359145,
1527
+ "eval_valid_loss": 0.18259374797344208,
1528
+ "eval_valid_runtime": 4.67,
1529
+ "eval_valid_samples_per_second": 214.134,
1530
+ "eval_valid_steps_per_second": 6.852,
1531
+ "step": 15000
1532
+ },
1533
+ {
1534
+ "epoch": 5.75963138359145,
1535
+ "eval_valid_target_loss": 0.20646093785762787,
1536
+ "eval_valid_target_runtime": 4.67,
1537
+ "eval_valid_target_samples_per_second": 214.131,
1538
+ "eval_valid_target_steps_per_second": 6.852,
1539
+ "step": 15000
1540
+ },
1541
+ {
1542
+ "epoch": 5.798028926148726,
1543
+ "grad_norm": 0.17328619956970215,
1544
+ "learning_rate": 3.758665832162203e-06,
1545
+ "loss": 0.1911,
1546
+ "step": 15100
1547
+ },
1548
+ {
1549
+ "epoch": 5.836426468706003,
1550
+ "grad_norm": 0.20850612223148346,
1551
+ "learning_rate": 3.7003215052421116e-06,
1552
+ "loss": 0.1915,
1553
+ "step": 15200
1554
+ },
1555
+ {
1556
+ "epoch": 5.8748240112632795,
1557
+ "grad_norm": 0.1912785917520523,
1558
+ "learning_rate": 3.642166361190859e-06,
1559
+ "loss": 0.1908,
1560
+ "step": 15300
1561
+ },
1562
+ {
1563
+ "epoch": 5.913221553820556,
1564
+ "grad_norm": 0.2138790339231491,
1565
+ "learning_rate": 3.584208865145812e-06,
1566
+ "loss": 0.1907,
1567
+ "step": 15400
1568
+ },
1569
+ {
1570
+ "epoch": 5.951619096377832,
1571
+ "grad_norm": 0.19723013043403625,
1572
+ "learning_rate": 3.5264574534744373e-06,
1573
+ "loss": 0.1913,
1574
+ "step": 15500
1575
+ },
1576
+ {
1577
+ "epoch": 5.951619096377832,
1578
+ "eval_valid_loss": 0.1817968785762787,
1579
+ "eval_valid_runtime": 4.6726,
1580
+ "eval_valid_samples_per_second": 214.016,
1581
+ "eval_valid_steps_per_second": 6.849,
1582
+ "step": 15500
1583
+ },
1584
+ {
1585
+ "epoch": 5.951619096377832,
1586
+ "eval_valid_target_loss": 0.20574218034744263,
1587
+ "eval_valid_target_runtime": 4.6817,
1588
+ "eval_valid_target_samples_per_second": 213.599,
1589
+ "eval_valid_target_steps_per_second": 6.835,
1590
+ "step": 15500
1591
+ },
1592
+ {
1593
+ "epoch": 5.990016638935108,
1594
+ "grad_norm": 0.19212548434734344,
1595
+ "learning_rate": 3.4689205325462997e-06,
1596
+ "loss": 0.1907,
1597
+ "step": 15600
1598
+ },
1599
+ {
1600
+ "epoch": 6.028414181492384,
1601
+ "grad_norm": 0.19529464840888977,
1602
+ "learning_rate": 3.4116064775094126e-06,
1603
+ "loss": 0.1901,
1604
+ "step": 15700
1605
+ },
1606
+ {
1607
+ "epoch": 6.066811724049661,
1608
+ "grad_norm": 0.2088070809841156,
1609
+ "learning_rate": 3.354523631071147e-06,
1610
+ "loss": 0.1902,
1611
+ "step": 15800
1612
+ },
1613
+ {
1614
+ "epoch": 6.105209266606937,
1615
+ "grad_norm": 0.19294045865535736,
1616
+ "learning_rate": 3.2976803022838514e-06,
1617
+ "loss": 0.1903,
1618
+ "step": 15900
1619
+ },
1620
+ {
1621
+ "epoch": 6.143606809164213,
1622
+ "grad_norm": 0.20844899117946625,
1623
+ "learning_rate": 3.2410847653353805e-06,
1624
+ "loss": 0.1897,
1625
+ "step": 16000
1626
+ },
1627
+ {
1628
+ "epoch": 6.143606809164213,
1629
+ "eval_valid_loss": 0.1809999942779541,
1630
+ "eval_valid_runtime": 4.6789,
1631
+ "eval_valid_samples_per_second": 213.724,
1632
+ "eval_valid_steps_per_second": 6.839,
1633
+ "step": 16000
1634
+ },
1635
+ {
1636
+ "epoch": 6.143606809164213,
1637
+ "eval_valid_target_loss": 0.20546874403953552,
1638
+ "eval_valid_target_runtime": 4.6614,
1639
+ "eval_valid_target_samples_per_second": 214.53,
1640
+ "eval_valid_target_steps_per_second": 6.865,
1641
+ "step": 16000
1642
+ },
1643
+ {
1644
+ "epoch": 6.18200435172149,
1645
+ "grad_norm": 0.19932307302951813,
1646
+ "learning_rate": 3.184745258344688e-06,
1647
+ "loss": 0.1894,
1648
+ "step": 16100
1649
+ },
1650
+ {
1651
+ "epoch": 6.220401894278766,
1652
+ "grad_norm": 0.19776058197021484,
1653
+ "learning_rate": 3.128669982162681e-06,
1654
+ "loss": 0.1899,
1655
+ "step": 16200
1656
+ },
1657
+ {
1658
+ "epoch": 6.258799436836043,
1659
+ "grad_norm": 0.20467509329319,
1660
+ "learning_rate": 3.07286709917849e-06,
1661
+ "loss": 0.1898,
1662
+ "step": 16300
1663
+ },
1664
+ {
1665
+ "epoch": 6.297196979393319,
1666
+ "grad_norm": 0.19593088328838348,
1667
+ "learning_rate": 3.017344732131342e-06,
1668
+ "loss": 0.1895,
1669
+ "step": 16400
1670
+ },
1671
+ {
1672
+ "epoch": 6.335594521950595,
1673
+ "grad_norm": 0.20078891515731812,
1674
+ "learning_rate": 2.9621109629282064e-06,
1675
+ "loss": 0.1897,
1676
+ "step": 16500
1677
+ },
1678
+ {
1679
+ "epoch": 6.335594521950595,
1680
+ "eval_valid_loss": 0.1807578057050705,
1681
+ "eval_valid_runtime": 4.7017,
1682
+ "eval_valid_samples_per_second": 212.687,
1683
+ "eval_valid_steps_per_second": 6.806,
1684
+ "step": 16500
1685
+ },
1686
+ {
1687
+ "epoch": 6.335594521950595,
1688
+ "eval_valid_target_loss": 0.2052578181028366,
1689
+ "eval_valid_target_runtime": 4.6726,
1690
+ "eval_valid_target_samples_per_second": 214.013,
1691
+ "eval_valid_target_steps_per_second": 6.848,
1692
+ "step": 16500
1693
+ },
1694
+ {
1695
+ "epoch": 6.373992064507871,
1696
+ "grad_norm": 0.17822235822677612,
1697
+ "learning_rate": 2.9071738314673758e-06,
1698
+ "loss": 0.1889,
1699
+ "step": 16600
1700
+ },
1701
+ {
1702
+ "epoch": 6.412389607065148,
1703
+ "grad_norm": 0.21160703897476196,
1704
+ "learning_rate": 2.8525413344681797e-06,
1705
+ "loss": 0.1889,
1706
+ "step": 16700
1707
+ },
1708
+ {
1709
+ "epoch": 6.450787149622424,
1710
+ "grad_norm": 0.19472962617874146,
1711
+ "learning_rate": 2.798221424306953e-06,
1712
+ "loss": 0.1894,
1713
+ "step": 16800
1714
+ },
1715
+ {
1716
+ "epoch": 6.4891846921797,
1717
+ "grad_norm": 0.17923222482204437,
1718
+ "learning_rate": 2.744222007859506e-06,
1719
+ "loss": 0.1891,
1720
+ "step": 16900
1721
+ },
1722
+ {
1723
+ "epoch": 6.527582234736977,
1724
+ "grad_norm": 0.18077126145362854,
1725
+ "learning_rate": 2.690550945350157e-06,
1726
+ "loss": 0.1886,
1727
+ "step": 17000
1728
+ },
1729
+ {
1730
+ "epoch": 6.527582234736977,
1731
+ "eval_valid_loss": 0.18031249940395355,
1732
+ "eval_valid_runtime": 4.6828,
1733
+ "eval_valid_samples_per_second": 213.548,
1734
+ "eval_valid_steps_per_second": 6.834,
1735
+ "step": 17000
1736
+ },
1737
+ {
1738
+ "epoch": 6.527582234736977,
1739
+ "eval_valid_target_loss": 0.20450781285762787,
1740
+ "eval_valid_target_runtime": 4.6685,
1741
+ "eval_valid_target_samples_per_second": 214.203,
1742
+ "eval_valid_target_steps_per_second": 6.854,
1743
+ "step": 17000
1744
+ },
1745
+ {
1746
+ "epoch": 6.565979777294253,
1747
+ "grad_norm": 0.19065329432487488,
1748
+ "learning_rate": 2.637216049207615e-06,
1749
+ "loss": 0.188,
1750
+ "step": 17100
1751
+ },
1752
+ {
1753
+ "epoch": 6.60437731985153,
1754
+ "grad_norm": 0.20368430018424988,
1755
+ "learning_rate": 2.5842250829277724e-06,
1756
+ "loss": 0.189,
1757
+ "step": 17200
1758
+ },
1759
+ {
1760
+ "epoch": 6.642774862408806,
1761
+ "grad_norm": 0.21131780743598938,
1762
+ "learning_rate": 2.5315857599436575e-06,
1763
+ "loss": 0.1887,
1764
+ "step": 17300
1765
+ },
1766
+ {
1767
+ "epoch": 6.681172404966082,
1768
+ "grad_norm": 0.2033446729183197,
1769
+ "learning_rate": 2.4793057425026467e-06,
1770
+ "loss": 0.1887,
1771
+ "step": 17400
1772
+ },
1773
+ {
1774
+ "epoch": 6.719569947523358,
1775
+ "grad_norm": 0.19689294695854187,
1776
+ "learning_rate": 2.427392640551137e-06,
1777
+ "loss": 0.1887,
1778
+ "step": 17500
1779
+ },
1780
+ {
1781
+ "epoch": 6.719569947523358,
1782
+ "eval_valid_loss": 0.17996874451637268,
1783
+ "eval_valid_runtime": 4.7043,
1784
+ "eval_valid_samples_per_second": 212.57,
1785
+ "eval_valid_steps_per_second": 6.802,
1786
+ "step": 17500
1787
+ },
1788
+ {
1789
+ "epoch": 6.719569947523358,
1790
+ "eval_valid_target_loss": 0.20432811975479126,
1791
+ "eval_valid_target_runtime": 4.6638,
1792
+ "eval_valid_target_samples_per_second": 214.416,
1793
+ "eval_valid_target_steps_per_second": 6.861,
1794
+ "step": 17500
1795
+ },
1796
+ {
1797
+ "epoch": 6.757967490080635,
1798
+ "grad_norm": 0.1994999349117279,
1799
+ "learning_rate": 2.3758540106268406e-06,
1800
+ "loss": 0.1881,
1801
+ "step": 17600
1802
+ },
1803
+ {
1804
+ "epoch": 6.796365032637911,
1805
+ "grad_norm": 0.19650602340698242,
1806
+ "learning_rate": 2.32469735475884e-06,
1807
+ "loss": 0.1881,
1808
+ "step": 17700
1809
+ },
1810
+ {
1811
+ "epoch": 6.834762575195187,
1812
+ "grad_norm": 0.21248474717140198,
1813
+ "learning_rate": 2.273930119375586e-06,
1814
+ "loss": 0.1882,
1815
+ "step": 17800
1816
+ },
1817
+ {
1818
+ "epoch": 6.873160117752464,
1819
+ "grad_norm": 0.19042810797691345,
1820
+ "learning_rate": 2.2235596942209776e-06,
1821
+ "loss": 0.188,
1822
+ "step": 17900
1823
+ },
1824
+ {
1825
+ "epoch": 6.91155766030974,
1826
+ "grad_norm": 0.23096908628940582,
1827
+ "learning_rate": 2.173593411278714e-06,
1828
+ "loss": 0.1886,
1829
+ "step": 18000
1830
+ },
1831
+ {
1832
+ "epoch": 6.91155766030974,
1833
+ "eval_valid_loss": 0.17952343821525574,
1834
+ "eval_valid_runtime": 4.6878,
1835
+ "eval_valid_samples_per_second": 213.321,
1836
+ "eval_valid_steps_per_second": 6.826,
1837
+ "step": 18000
1838
+ },
1839
+ {
1840
+ "epoch": 6.91155766030974,
1841
+ "eval_valid_target_loss": 0.20391406118869781,
1842
+ "eval_valid_target_runtime": 4.6595,
1843
+ "eval_valid_target_samples_per_second": 214.617,
1844
+ "eval_valid_target_steps_per_second": 6.868,
1845
+ "step": 18000
1846
+ },
1847
+ {
1848
+ "epoch": 6.949955202867017,
1849
+ "grad_norm": 0.21275204420089722,
1850
+ "learning_rate": 2.124038543705034e-06,
1851
+ "loss": 0.1878,
1852
+ "step": 18100
1853
+ },
1854
+ {
1855
+ "epoch": 6.988352745424293,
1856
+ "grad_norm": 0.20453621447086334,
1857
+ "learning_rate": 2.0749023047700285e-06,
1858
+ "loss": 0.188,
1859
+ "step": 18200
1860
+ },
1861
+ {
1862
+ "epoch": 7.026750287981569,
1863
+ "grad_norm": 0.20724526047706604,
1864
+ "learning_rate": 2.026191846807663e-06,
1865
+ "loss": 0.1883,
1866
+ "step": 18300
1867
+ },
1868
+ {
1869
+ "epoch": 7.065147830538845,
1870
+ "grad_norm": 0.1886543333530426,
1871
+ "learning_rate": 1.9779142601746825e-06,
1872
+ "loss": 0.1874,
1873
+ "step": 18400
1874
+ },
1875
+ {
1876
+ "epoch": 7.1035453730961216,
1877
+ "grad_norm": 0.20411571860313416,
1878
+ "learning_rate": 1.9300765722185265e-06,
1879
+ "loss": 0.187,
1880
+ "step": 18500
1881
+ },
1882
+ {
1883
+ "epoch": 7.1035453730961216,
1884
+ "eval_valid_loss": 0.17924219369888306,
1885
+ "eval_valid_runtime": 4.6825,
1886
+ "eval_valid_samples_per_second": 213.561,
1887
+ "eval_valid_steps_per_second": 6.834,
1888
+ "step": 18500
1889
+ },
1890
+ {
1891
+ "epoch": 7.1035453730961216,
1892
+ "eval_valid_target_loss": 0.20393750071525574,
1893
+ "eval_valid_target_runtime": 4.6736,
1894
+ "eval_valid_target_samples_per_second": 213.97,
1895
+ "eval_valid_target_steps_per_second": 6.847,
1896
+ "step": 18500
1897
+ },
1898
+ {
1899
+ "epoch": 7.141942915653398,
1900
+ "grad_norm": 0.18996645510196686,
1901
+ "learning_rate": 1.8826857462544129e-06,
1902
+ "loss": 0.1871,
1903
+ "step": 18600
1904
+ },
1905
+ {
1906
+ "epoch": 7.180340458210675,
1907
+ "grad_norm": 0.21018381416797638,
1908
+ "learning_rate": 1.8357486805517615e-06,
1909
+ "loss": 0.1874,
1910
+ "step": 18700
1911
+ },
1912
+ {
1913
+ "epoch": 7.218738000767951,
1914
+ "grad_norm": 0.19617675244808197,
1915
+ "learning_rate": 1.7892722073300627e-06,
1916
+ "loss": 0.1869,
1917
+ "step": 18800
1918
+ },
1919
+ {
1920
+ "epoch": 7.257135543325227,
1921
+ "grad_norm": 0.2340448796749115,
1922
+ "learning_rate": 1.743263091764379e-06,
1923
+ "loss": 0.187,
1924
+ "step": 18900
1925
+ },
1926
+ {
1927
+ "epoch": 7.295533085882504,
1928
+ "grad_norm": 0.22970305383205414,
1929
+ "learning_rate": 1.6977280310005845e-06,
1930
+ "loss": 0.1873,
1931
+ "step": 19000
1932
+ },
1933
+ {
1934
+ "epoch": 7.295533085882504,
1935
+ "eval_valid_loss": 0.1788671910762787,
1936
+ "eval_valid_runtime": 4.6706,
1937
+ "eval_valid_samples_per_second": 214.105,
1938
+ "eval_valid_steps_per_second": 6.851,
1939
+ "step": 19000
1940
+ },
1941
+ {
1942
+ "epoch": 7.295533085882504,
1943
+ "eval_valid_target_loss": 0.20334374904632568,
1944
+ "eval_valid_target_runtime": 4.6842,
1945
+ "eval_valid_target_samples_per_second": 213.484,
1946
+ "eval_valid_target_steps_per_second": 6.831,
1947
+ "step": 19000
1948
+ },
1949
+ {
1950
+ "epoch": 7.33393062843978,
1951
+ "grad_norm": 0.20527499914169312,
1952
+ "learning_rate": 1.6526736531805354e-06,
1953
+ "loss": 0.1873,
1954
+ "step": 19100
1955
+ },
1956
+ {
1957
+ "epoch": 7.372328170997056,
1958
+ "grad_norm": 0.1835908442735672,
1959
+ "learning_rate": 1.6081065164772624e-06,
1960
+ "loss": 0.187,
1961
+ "step": 19200
1962
+ },
1963
+ {
1964
+ "epoch": 7.410725713554332,
1965
+ "grad_norm": 0.18936371803283691,
1966
+ "learning_rate": 1.564033108140348e-06,
1967
+ "loss": 0.1865,
1968
+ "step": 19300
1969
+ },
1970
+ {
1971
+ "epoch": 7.4491232561116085,
1972
+ "grad_norm": 0.19136998057365417,
1973
+ "learning_rate": 1.520459843551646e-06,
1974
+ "loss": 0.1872,
1975
+ "step": 19400
1976
+ },
1977
+ {
1978
+ "epoch": 7.487520798668886,
1979
+ "grad_norm": 0.19691316783428192,
1980
+ "learning_rate": 1.4773930652914426e-06,
1981
+ "loss": 0.187,
1982
+ "step": 19500
1983
+ },
1984
+ {
1985
+ "epoch": 7.487520798668886,
1986
+ "eval_valid_loss": 0.17878125607967377,
1987
+ "eval_valid_runtime": 4.6602,
1988
+ "eval_valid_samples_per_second": 214.581,
1989
+ "eval_valid_steps_per_second": 6.867,
1990
+ "step": 19500
1991
+ },
1992
+ {
1993
+ "epoch": 7.487520798668886,
1994
+ "eval_valid_target_loss": 0.20325781404972076,
1995
+ "eval_valid_target_runtime": 4.6796,
1996
+ "eval_valid_target_samples_per_second": 213.695,
1997
+ "eval_valid_target_steps_per_second": 6.838,
1998
+ "step": 19500
1999
+ },
2000
+ {
2001
+ "epoch": 7.525918341226162,
2002
+ "grad_norm": 0.18792080879211426,
2003
+ "learning_rate": 1.434839042215227e-06,
2004
+ "loss": 0.1868,
2005
+ "step": 19600
2006
+ },
2007
+ {
2008
+ "epoch": 7.564315883783438,
2009
+ "grad_norm": 0.1945939064025879,
2010
+ "learning_rate": 1.3928039685411793e-06,
2011
+ "loss": 0.1869,
2012
+ "step": 19700
2013
+ },
2014
+ {
2015
+ "epoch": 7.602713426340714,
2016
+ "grad_norm": 0.17974095046520233,
2017
+ "learning_rate": 1.3512939629485456e-06,
2018
+ "loss": 0.187,
2019
+ "step": 19800
2020
+ },
2021
+ {
2022
+ "epoch": 7.641110968897991,
2023
+ "grad_norm": 0.22416825592517853,
2024
+ "learning_rate": 1.3103150676869864e-06,
2025
+ "loss": 0.1871,
2026
+ "step": 19900
2027
+ },
2028
+ {
2029
+ "epoch": 7.679508511455267,
2030
+ "grad_norm": 0.19613422453403473,
2031
+ "learning_rate": 1.2698732476970627e-06,
2032
+ "loss": 0.1869,
2033
+ "step": 20000
2034
+ },
2035
+ {
2036
+ "epoch": 7.679508511455267,
2037
+ "eval_valid_loss": 0.1783437430858612,
2038
+ "eval_valid_runtime": 4.6716,
2039
+ "eval_valid_samples_per_second": 214.058,
2040
+ "eval_valid_steps_per_second": 6.85,
2041
+ "step": 20000
2042
+ },
2043
+ {
2044
+ "epoch": 7.679508511455267,
2045
+ "eval_valid_target_loss": 0.2031562477350235,
2046
+ "eval_valid_target_runtime": 4.6803,
2047
+ "eval_valid_target_samples_per_second": 213.661,
2048
+ "eval_valid_target_steps_per_second": 6.837,
2049
+ "step": 20000
2050
+ },
2051
+ {
2052
+ "epoch": 7.717906054012543,
2053
+ "grad_norm": 0.20145875215530396,
2054
+ "learning_rate": 1.229974389741964e-06,
2055
+ "loss": 0.187,
2056
+ "step": 20100
2057
+ },
2058
+ {
2059
+ "epoch": 7.756303596569819,
2060
+ "grad_norm": 0.18396620452404022,
2061
+ "learning_rate": 1.1906243015506375e-06,
2062
+ "loss": 0.1867,
2063
+ "step": 20200
2064
+ },
2065
+ {
2066
+ "epoch": 7.7947011391270955,
2067
+ "grad_norm": 0.18105918169021606,
2068
+ "learning_rate": 1.1518287109723958e-06,
2069
+ "loss": 0.1862,
2070
+ "step": 20300
2071
+ },
2072
+ {
2073
+ "epoch": 7.833098681684373,
2074
+ "grad_norm": 0.20986780524253845,
2075
+ "learning_rate": 1.1135932651431651e-06,
2076
+ "loss": 0.1863,
2077
+ "step": 20400
2078
+ },
2079
+ {
2080
+ "epoch": 7.871496224241649,
2081
+ "grad_norm": 0.21804456412792206,
2082
+ "learning_rate": 1.075923529663489e-06,
2083
+ "loss": 0.1869,
2084
+ "step": 20500
2085
+ },
2086
+ {
2087
+ "epoch": 7.871496224241649,
2088
+ "eval_valid_loss": 0.17836718261241913,
2089
+ "eval_valid_runtime": 4.6832,
2090
+ "eval_valid_samples_per_second": 213.531,
2091
+ "eval_valid_steps_per_second": 6.833,
2092
+ "step": 20500
2093
+ },
2094
+ {
2095
+ "epoch": 7.871496224241649,
2096
+ "eval_valid_target_loss": 0.20322656631469727,
2097
+ "eval_valid_target_runtime": 4.6763,
2098
+ "eval_valid_target_samples_per_second": 213.843,
2099
+ "eval_valid_target_steps_per_second": 6.843,
2100
+ "step": 20500
2101
+ },
2102
+ {
2103
+ "epoch": 7.909893766798925,
2104
+ "grad_norm": 0.22019818425178528,
2105
+ "learning_rate": 1.0388249877883827e-06,
2106
+ "loss": 0.1858,
2107
+ "step": 20600
2108
+ },
2109
+ {
2110
+ "epoch": 7.948291309356201,
2111
+ "grad_norm": 0.1965310275554657,
2112
+ "learning_rate": 1.0023030396291916e-06,
2113
+ "loss": 0.1866,
2114
+ "step": 20700
2115
+ },
2116
+ {
2117
+ "epoch": 7.9866888519134775,
2118
+ "grad_norm": 0.18218408524990082,
2119
+ "learning_rate": 9.66363001367534e-07,
2120
+ "loss": 0.1869,
2121
+ "step": 20800
2122
+ },
2123
+ {
2124
+ "epoch": 8.025086394470755,
2125
+ "grad_norm": 0.1850380003452301,
2126
+ "learning_rate": 9.310101044814835e-07,
2127
+ "loss": 0.1861,
2128
+ "step": 20900
2129
+ },
2130
+ {
2131
+ "epoch": 8.06348393702803,
2132
+ "grad_norm": 0.18823818862438202,
2133
+ "learning_rate": 8.962494949840577e-07,
2134
+ "loss": 0.186,
2135
+ "step": 21000
2136
+ },
2137
+ {
2138
+ "epoch": 8.06348393702803,
2139
+ "eval_valid_loss": 0.17808593809604645,
2140
+ "eval_valid_runtime": 4.6916,
2141
+ "eval_valid_samples_per_second": 213.147,
2142
+ "eval_valid_steps_per_second": 6.821,
2143
+ "step": 21000
2144
+ },
2145
+ {
2146
+ "epoch": 8.06348393702803,
2147
+ "eval_valid_target_loss": 0.20311719179153442,
2148
+ "eval_valid_target_runtime": 4.6653,
2149
+ "eval_valid_target_samples_per_second": 214.347,
2150
+ "eval_valid_target_steps_per_second": 6.859,
2151
+ "step": 21000
2152
+ },
2153
+ {
2154
+ "epoch": 8.101881479585307,
2155
+ "grad_norm": 0.20501789450645447,
2156
+ "learning_rate": 8.620862326741658e-07,
2157
+ "loss": 0.1862,
2158
+ "step": 21100
2159
+ },
2160
+ {
2161
+ "epoch": 8.140279022142582,
2162
+ "grad_norm": 0.19500133395195007,
2163
+ "learning_rate": 8.285252904000906e-07,
2164
+ "loss": 0.1862,
2165
+ "step": 21200
2166
+ },
2167
+ {
2168
+ "epoch": 8.17867656469986,
2169
+ "grad_norm": 0.18742544949054718,
2170
+ "learning_rate": 7.955715533356367e-07,
2171
+ "loss": 0.1863,
2172
+ "step": 21300
2173
+ },
2174
+ {
2175
+ "epoch": 8.217074107257135,
2176
+ "grad_norm": 0.20386624336242676,
2177
+ "learning_rate": 7.632298182690473e-07,
2178
+ "loss": 0.186,
2179
+ "step": 21400
2180
+ },
2181
+ {
2182
+ "epoch": 8.255471649814412,
2183
+ "grad_norm": 0.17727358639240265,
2184
+ "learning_rate": 7.315047929047608e-07,
2185
+ "loss": 0.1861,
2186
+ "step": 21500
2187
+ },
2188
+ {
2189
+ "epoch": 8.255471649814412,
2190
+ "eval_valid_loss": 0.17788280546665192,
2191
+ "eval_valid_runtime": 4.679,
2192
+ "eval_valid_samples_per_second": 213.72,
2193
+ "eval_valid_steps_per_second": 6.839,
2194
+ "step": 21500
2195
+ },
2196
+ {
2197
+ "epoch": 8.255471649814412,
2198
+ "eval_valid_target_loss": 0.2026640623807907,
2199
+ "eval_valid_target_runtime": 4.6709,
2200
+ "eval_valid_target_samples_per_second": 214.093,
2201
+ "eval_valid_target_steps_per_second": 6.851,
2202
+ "step": 21500
2203
+ },
2204
+ {
2205
+ "epoch": 8.293869192371687,
2206
+ "grad_norm": 0.19971401989459991,
2207
+ "learning_rate": 7.004010951781648e-07,
2208
+ "loss": 0.1858,
2209
+ "step": 21600
2210
+ },
2211
+ {
2212
+ "epoch": 8.332266734928965,
2213
+ "grad_norm": 0.17827193439006805,
2214
+ "learning_rate": 6.699232525833987e-07,
2215
+ "loss": 0.1868,
2216
+ "step": 21700
2217
+ },
2218
+ {
2219
+ "epoch": 8.370664277486242,
2220
+ "grad_norm": 0.18275295197963715,
2221
+ "learning_rate": 6.400757015143266e-07,
2222
+ "loss": 0.1858,
2223
+ "step": 21800
2224
+ },
2225
+ {
2226
+ "epoch": 8.409061820043517,
2227
+ "grad_norm": 0.19496768712997437,
2228
+ "learning_rate": 6.108627866187661e-07,
2229
+ "loss": 0.1854,
2230
+ "step": 21900
2231
+ },
2232
+ {
2233
+ "epoch": 8.447459362600794,
2234
+ "grad_norm": 0.19046269357204437,
2235
+ "learning_rate": 5.822887601660832e-07,
2236
+ "loss": 0.1862,
2237
+ "step": 22000
2238
+ },
2239
+ {
2240
+ "epoch": 8.447459362600794,
2241
+ "eval_valid_loss": 0.17781250178813934,
2242
+ "eval_valid_runtime": 4.6746,
2243
+ "eval_valid_samples_per_second": 213.921,
2244
+ "eval_valid_steps_per_second": 6.845,
2245
+ "step": 22000
2246
+ },
2247
+ {
2248
+ "epoch": 8.447459362600794,
2249
+ "eval_valid_target_loss": 0.2026640623807907,
2250
+ "eval_valid_target_runtime": 4.6755,
2251
+ "eval_valid_target_samples_per_second": 213.88,
2252
+ "eval_valid_target_steps_per_second": 6.844,
2253
+ "step": 22000
2254
+ },
2255
+ {
2256
+ "epoch": 8.48585690515807,
2257
+ "grad_norm": 0.20896296203136444,
2258
+ "learning_rate": 5.543577814282219e-07,
2259
+ "loss": 0.1856,
2260
+ "step": 22100
2261
+ },
2262
+ {
2263
+ "epoch": 8.524254447715347,
2264
+ "grad_norm": 0.19562530517578125,
2265
+ "learning_rate": 5.270739160742738e-07,
2266
+ "loss": 0.1857,
2267
+ "step": 22200
2268
+ },
2269
+ {
2270
+ "epoch": 8.562651990272622,
2271
+ "grad_norm": 0.1972120851278305,
2272
+ "learning_rate": 5.004411355786792e-07,
2273
+ "loss": 0.1863,
2274
+ "step": 22300
2275
+ },
2276
+ {
2277
+ "epoch": 8.601049532829899,
2278
+ "grad_norm": 0.19712330400943756,
2279
+ "learning_rate": 4.7446331664312786e-07,
2280
+ "loss": 0.1855,
2281
+ "step": 22400
2282
+ },
2283
+ {
2284
+ "epoch": 8.639447075387174,
2285
+ "grad_norm": 0.20409992337226868,
2286
+ "learning_rate": 4.4914424063226937e-07,
2287
+ "loss": 0.1857,
2288
+ "step": 22500
2289
+ },
2290
+ {
2291
+ "epoch": 8.639447075387174,
2292
+ "eval_valid_loss": 0.17765624821186066,
2293
+ "eval_valid_runtime": 4.6769,
2294
+ "eval_valid_samples_per_second": 213.818,
2295
+ "eval_valid_steps_per_second": 6.842,
2296
+ "step": 22500
2297
+ },
2298
+ {
2299
+ "epoch": 8.639447075387174,
2300
+ "eval_valid_target_loss": 0.2025781273841858,
2301
+ "eval_valid_target_runtime": 4.6696,
2302
+ "eval_valid_target_samples_per_second": 214.151,
2303
+ "eval_valid_target_steps_per_second": 6.853,
2304
+ "step": 22500
2305
+ },
2306
+ {
2307
+ "epoch": 8.677844617944451,
2308
+ "grad_norm": 0.21083636581897736,
2309
+ "learning_rate": 4.2448759302328336e-07,
2310
+ "loss": 0.1861,
2311
+ "step": 22600
2312
+ },
2313
+ {
2314
+ "epoch": 8.716242160501729,
2315
+ "grad_norm": 0.18778979778289795,
2316
+ "learning_rate": 4.0049696286942496e-07,
2317
+ "loss": 0.1862,
2318
+ "step": 22700
2319
+ },
2320
+ {
2321
+ "epoch": 8.754639703059004,
2322
+ "grad_norm": 0.18586015701293945,
2323
+ "learning_rate": 3.7717584227759117e-07,
2324
+ "loss": 0.1857,
2325
+ "step": 22800
2326
+ },
2327
+ {
2328
+ "epoch": 8.793037245616281,
2329
+ "grad_norm": 0.1977422684431076,
2330
+ "learning_rate": 3.54527625900013e-07,
2331
+ "loss": 0.1856,
2332
+ "step": 22900
2333
+ },
2334
+ {
2335
+ "epoch": 8.831434788173556,
2336
+ "grad_norm": 0.18881608545780182,
2337
+ "learning_rate": 3.3255561044011564e-07,
2338
+ "loss": 0.1857,
2339
+ "step": 23000
2340
+ },
2341
+ {
2342
+ "epoch": 8.831434788173556,
2343
+ "eval_valid_loss": 0.17771874368190765,
2344
+ "eval_valid_runtime": 4.6727,
2345
+ "eval_valid_samples_per_second": 214.01,
2346
+ "eval_valid_steps_per_second": 6.848,
2347
+ "step": 23000
2348
+ },
2349
+ {
2350
+ "epoch": 8.831434788173556,
2351
+ "eval_valid_target_loss": 0.20250000059604645,
2352
+ "eval_valid_target_runtime": 4.6666,
2353
+ "eval_valid_target_samples_per_second": 214.287,
2354
+ "eval_valid_target_steps_per_second": 6.857,
2355
+ "step": 23000
2356
+ },
2357
+ {
2358
+ "epoch": 8.869832330730834,
2359
+ "grad_norm": 0.2037239372730255,
2360
+ "learning_rate": 3.112629941726547e-07,
2361
+ "loss": 0.1856,
2362
+ "step": 23100
2363
+ },
2364
+ {
2365
+ "epoch": 8.908229873288109,
2366
+ "grad_norm": 0.18967826664447784,
2367
+ "learning_rate": 2.9065287647816744e-07,
2368
+ "loss": 0.1855,
2369
+ "step": 23200
2370
+ },
2371
+ {
2372
+ "epoch": 8.946627415845386,
2373
+ "grad_norm": 0.17752571403980255,
2374
+ "learning_rate": 2.707282573918213e-07,
2375
+ "loss": 0.1858,
2376
+ "step": 23300
2377
+ },
2378
+ {
2379
+ "epoch": 8.985024958402661,
2380
+ "grad_norm": 0.18709731101989746,
2381
+ "learning_rate": 2.514920371667301e-07,
2382
+ "loss": 0.1854,
2383
+ "step": 23400
2384
+ },
2385
+ {
2386
+ "epoch": 9.023422500959938,
2387
+ "grad_norm": 0.21643956005573273,
2388
+ "learning_rate": 2.3294701585178213e-07,
2389
+ "loss": 0.1858,
2390
+ "step": 23500
2391
+ },
2392
+ {
2393
+ "epoch": 9.023422500959938,
2394
+ "eval_valid_loss": 0.17762500047683716,
2395
+ "eval_valid_runtime": 4.6791,
2396
+ "eval_valid_samples_per_second": 213.717,
2397
+ "eval_valid_steps_per_second": 6.839,
2398
+ "step": 23500
2399
+ },
2400
+ {
2401
+ "epoch": 9.023422500959938,
2402
+ "eval_valid_target_loss": 0.20255468785762787,
2403
+ "eval_valid_target_runtime": 4.7103,
2404
+ "eval_valid_target_samples_per_second": 212.301,
2405
+ "eval_valid_target_steps_per_second": 6.794,
2406
+ "step": 23500
2407
+ },
2408
+ {
2409
+ "epoch": 9.061820043517216,
2410
+ "grad_norm": 0.18775244057178497,
2411
+ "learning_rate": 2.1509589288407183e-07,
2412
+ "loss": 0.1855,
2413
+ "step": 23600
2414
+ },
2415
+ {
2416
+ "epoch": 9.100217586074491,
2417
+ "grad_norm": 0.17277489602565765,
2418
+ "learning_rate": 1.9794126669595403e-07,
2419
+ "loss": 0.1859,
2420
+ "step": 23700
2421
+ },
2422
+ {
2423
+ "epoch": 9.138615128631768,
2424
+ "grad_norm": 0.18996348977088928,
2425
+ "learning_rate": 1.8148563433682264e-07,
2426
+ "loss": 0.1852,
2427
+ "step": 23800
2428
+ },
2429
+ {
2430
+ "epoch": 9.177012671189043,
2431
+ "grad_norm": 0.1894453912973404,
2432
+ "learning_rate": 1.6573139110963087e-07,
2433
+ "loss": 0.1854,
2434
+ "step": 23900
2435
+ },
2436
+ {
2437
+ "epoch": 9.21541021374632,
2438
+ "grad_norm": 0.2011975795030594,
2439
+ "learning_rate": 1.5068083022223346e-07,
2440
+ "loss": 0.1855,
2441
+ "step": 24000
2442
+ },
2443
+ {
2444
+ "epoch": 9.21541021374632,
2445
+ "eval_valid_loss": 0.17754687368869781,
2446
+ "eval_valid_runtime": 4.6668,
2447
+ "eval_valid_samples_per_second": 214.279,
2448
+ "eval_valid_steps_per_second": 6.857,
2449
+ "step": 24000
2450
+ },
2451
+ {
2452
+ "epoch": 9.21541021374632,
2453
+ "eval_valid_target_loss": 0.20250000059604645,
2454
+ "eval_valid_target_runtime": 4.6766,
2455
+ "eval_valid_target_samples_per_second": 213.829,
2456
+ "eval_valid_target_steps_per_second": 6.843,
2457
+ "step": 24000
2458
+ },
2459
+ {
2460
+ "epoch": 9.253807756303596,
2461
+ "grad_norm": 0.2087700515985489,
2462
+ "learning_rate": 1.3633614245357807e-07,
2463
+ "loss": 0.1858,
2464
+ "step": 24100
2465
+ },
2466
+ {
2467
+ "epoch": 9.292205298860873,
2468
+ "grad_norm": 0.18402153253555298,
2469
+ "learning_rate": 1.2269941583481548e-07,
2470
+ "loss": 0.1859,
2471
+ "step": 24200
2472
+ },
2473
+ {
2474
+ "epoch": 9.330602841418148,
2475
+ "grad_norm": 0.17724697291851044,
2476
+ "learning_rate": 1.0977263534536597e-07,
2477
+ "loss": 0.1856,
2478
+ "step": 24300
2479
+ },
2480
+ {
2481
+ "epoch": 9.369000383975425,
2482
+ "grad_norm": 0.1847800761461258,
2483
+ "learning_rate": 9.755768262397936e-08,
2484
+ "loss": 0.1858,
2485
+ "step": 24400
2486
+ },
2487
+ {
2488
+ "epoch": 9.407397926532703,
2489
+ "grad_norm": 0.1905263364315033,
2490
+ "learning_rate": 8.605633569484184e-08,
2491
+ "loss": 0.1856,
2492
+ "step": 24500
2493
+ },
2494
+ {
2495
+ "epoch": 9.407397926532703,
2496
+ "eval_valid_loss": 0.1775546818971634,
2497
+ "eval_valid_runtime": 4.6591,
2498
+ "eval_valid_samples_per_second": 214.636,
2499
+ "eval_valid_steps_per_second": 6.868,
2500
+ "step": 24500
2501
+ },
2502
+ {
2503
+ "epoch": 9.407397926532703,
2504
+ "eval_valid_target_loss": 0.2024531215429306,
2505
+ "eval_valid_target_runtime": 4.6763,
2506
+ "eval_valid_target_samples_per_second": 213.844,
2507
+ "eval_valid_target_steps_per_second": 6.843,
2508
+ "step": 24500
2509
+ },
2510
+ {
2511
+ "epoch": 9.445795469089978,
2512
+ "grad_norm": 0.17600856721401215,
2513
+ "learning_rate": 7.52702687087653e-08,
2514
+ "loss": 0.1855,
2515
+ "step": 24600
2516
+ },
2517
+ {
2518
+ "epoch": 9.484193011647255,
2519
+ "grad_norm": 0.19071801006793976,
2520
+ "learning_rate": 6.520105169949609e-08,
2521
+ "loss": 0.1856,
2522
+ "step": 24700
2523
+ },
2524
+ {
2525
+ "epoch": 9.52259055420453,
2526
+ "grad_norm": 0.20268982648849487,
2527
+ "learning_rate": 5.5850150355178936e-08,
2528
+ "loss": 0.1855,
2529
+ "step": 24800
2530
+ },
2531
+ {
2532
+ "epoch": 9.560988096761807,
2533
+ "grad_norm": 0.18069659173488617,
2534
+ "learning_rate": 4.721892580500709e-08,
2535
+ "loss": 0.1852,
2536
+ "step": 24900
2537
+ },
2538
+ {
2539
+ "epoch": 9.599385639319083,
2540
+ "grad_norm": 0.19809788465499878,
2541
+ "learning_rate": 3.9308634421098e-08,
2542
+ "loss": 0.1853,
2543
+ "step": 25000
2544
+ },
2545
+ {
2546
+ "epoch": 9.599385639319083,
2547
+ "eval_valid_loss": 0.17754687368869781,
2548
+ "eval_valid_runtime": 4.6689,
2549
+ "eval_valid_samples_per_second": 214.182,
2550
+ "eval_valid_steps_per_second": 6.854,
2551
+ "step": 25000
2552
+ },
2553
+ {
2554
+ "epoch": 9.599385639319083,
2555
+ "eval_valid_target_loss": 0.20237499475479126,
2556
+ "eval_valid_target_runtime": 4.688,
2557
+ "eval_valid_target_samples_per_second": 213.313,
2558
+ "eval_valid_target_steps_per_second": 6.826,
2559
+ "step": 25000
2560
+ },
2561
+ {
2562
+ "epoch": 9.63778318187636,
2563
+ "grad_norm": 0.1990041732788086,
2564
+ "learning_rate": 3.2120427635613517e-08,
2565
+ "loss": 0.1852,
2566
+ "step": 25100
2567
+ },
2568
+ {
2569
+ "epoch": 9.676180724433635,
2570
+ "grad_norm": 0.20578785240650177,
2571
+ "learning_rate": 2.565535177315226e-08,
2572
+ "loss": 0.185,
2573
+ "step": 25200
2574
+ },
2575
+ {
2576
+ "epoch": 9.714578266990912,
2577
+ "grad_norm": 0.19831426441669464,
2578
+ "learning_rate": 1.991434789845037e-08,
2579
+ "loss": 0.1858,
2580
+ "step": 25300
2581
+ },
2582
+ {
2583
+ "epoch": 9.75297580954819,
2584
+ "grad_norm": 0.18692290782928467,
2585
+ "learning_rate": 1.489825167939607e-08,
2586
+ "loss": 0.1848,
2587
+ "step": 25400
2588
+ },
2589
+ {
2590
+ "epoch": 9.791373352105465,
2591
+ "grad_norm": 0.20175856351852417,
2592
+ "learning_rate": 1.0607793265389742e-08,
2593
+ "loss": 0.1854,
2594
+ "step": 25500
2595
+ },
2596
+ {
2597
+ "epoch": 9.791373352105465,
2598
+ "eval_valid_loss": 0.17751562595367432,
2599
+ "eval_valid_runtime": 4.667,
2600
+ "eval_valid_samples_per_second": 214.272,
2601
+ "eval_valid_steps_per_second": 6.857,
2602
+ "step": 25500
2603
+ },
2604
+ {
2605
+ "epoch": 9.791373352105465,
2606
+ "eval_valid_target_loss": 0.20240625739097595,
2607
+ "eval_valid_target_runtime": 4.6781,
2608
+ "eval_valid_target_samples_per_second": 213.763,
2609
+ "eval_valid_target_steps_per_second": 6.84,
2610
+ "step": 25500
2611
+ },
2612
+ {
2613
+ "epoch": 9.829770894662742,
2614
+ "grad_norm": 0.20650416612625122,
2615
+ "learning_rate": 7.0435971810606244e-09,
2616
+ "loss": 0.1859,
2617
+ "step": 25600
2618
+ },
2619
+ {
2620
+ "epoch": 9.868168437220017,
2621
+ "grad_norm": 0.1880464404821396,
2622
+ "learning_rate": 4.206182235363399e-09,
2623
+ "loss": 0.1857,
2624
+ "step": 25700
2625
+ },
2626
+ {
2627
+ "epoch": 9.906565979777294,
2628
+ "grad_norm": 0.19517436623573303,
2629
+ "learning_rate": 2.095961446056949e-09,
2630
+ "loss": 0.1851,
2631
+ "step": 25800
2632
+ },
2633
+ {
2634
+ "epoch": 9.94496352233457,
2635
+ "grad_norm": 0.21848323941230774,
2636
+ "learning_rate": 7.132419795868872e-10,
2637
+ "loss": 0.1858,
2638
+ "step": 25900
2639
+ },
2640
+ {
2641
+ "epoch": 9.983361064891847,
2642
+ "grad_norm": 0.20499403774738312,
2643
+ "learning_rate": 5.82251063713235e-11,
2644
+ "loss": 0.1851,
2645
+ "step": 26000
2646
+ },
2647
+ {
2648
+ "epoch": 9.983361064891847,
2649
+ "eval_valid_loss": 0.1775234341621399,
2650
+ "eval_valid_runtime": 4.6706,
2651
+ "eval_valid_samples_per_second": 214.106,
2652
+ "eval_valid_steps_per_second": 6.851,
2653
+ "step": 26000
2654
+ },
2655
+ {
2656
+ "epoch": 9.983361064891847,
2657
+ "eval_valid_target_loss": 0.20242968201637268,
2658
+ "eval_valid_target_runtime": 4.679,
2659
+ "eval_valid_target_samples_per_second": 213.721,
2660
+ "eval_valid_target_steps_per_second": 6.839,
2661
+ "step": 26000
2662
+ }
2663
+ ],
2664
+ "logging_steps": 100,
2665
+ "max_steps": 26040,
2666
+ "num_input_tokens_seen": 0,
2667
+ "num_train_epochs": 10,
2668
+ "save_steps": 5000,
2669
+ "stateful_callbacks": {
2670
+ "TrainerControl": {
2671
+ "args": {
2672
+ "should_epoch_stop": false,
2673
+ "should_evaluate": false,
2674
+ "should_log": false,
2675
+ "should_save": true,
2676
+ "should_training_stop": true
2677
+ },
2678
+ "attributes": {}
2679
+ }
2680
+ },
2681
+ "total_flos": 2.475781022436819e+19,
2682
+ "train_batch_size": 16,
2683
+ "trial_name": null,
2684
+ "trial_params": null
2685
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e1dfeeeadad4bc43c97dcedacc619cfac42a0c0627e5ea03b98f5461236498c
3
+ size 6200
vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
zero_to_fp32.py ADDED
@@ -0,0 +1,604 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts consolidated fp32 weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example: python zero_to_fp32.py . pytorch_model.bin
14
+
15
+ import argparse
16
+ import torch
17
+ import glob
18
+ import math
19
+ import os
20
+ import re
21
+ from collections import OrderedDict
22
+ from dataclasses import dataclass
23
+
24
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
25
+ # DeepSpeed data structures it has to be available in the current python environment.
26
+ from deepspeed.utils import logger
27
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
28
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
29
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
30
+
31
+
32
@dataclass
class zero_model_state:
    """Contents extracted from one ``*_model_states.pt`` file by ``parse_model_states``."""
    # buffer name -> tensor (converted with ``.float()`` when parsed)
    buffers: dict
    # iterated in this file as a sequence of ``name -> shape`` mappings
    param_shapes: list
    # list of ``[key, value]`` pairs taken from the checkpoint's "shared_params" entry
    shared_params: list
    # NOTE(review): annotated ``int`` but filled from ``state_dict.get(DS_VERSION, None)``,
    # so it may be None or a non-int value - confirm before relying on the type.
    ds_version: int
    # both frozen_* fields come from ``state_dict.get(..., None)`` and may therefore be None
    frozen_param_shapes: dict
    frozen_param_fragments: dict
40
+
41
+
42
# Set to a truthy value to print extra diagnostics while parsing and merging checkpoints.
debug = 0

# Passed as ``map_location`` to ``torch.load`` so checkpoint tensors are loaded on the CPU.
device = torch.device('cpu')
46
+
47
+
48
def atoi(text):
    """Return ``int(text)`` when ``text`` consists only of digits, otherwise ``text`` unchanged."""
    if text.isdigit():
        return int(text)
    return text
50
+
51
+
52
def natural_keys(text):
    '''
    Sort key producing "human" ordering: ``alist.sort(key=natural_keys)``.

    ``text`` is split into alternating non-digit / digit runs and every digit run is
    converted to an int, so numeric parts compare by value instead of lexically.
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    pieces = re.split(r'(\d+)', text)
    # same conversion as ``atoi``: digit-only pieces become ints, the rest stay strings
    return [int(piece) if piece.isdigit() else piece for piece in pieces]
59
+
60
+
61
def get_model_state_file(checkpoint_dir, zero_stage):
    """
    Return the path of the rank-0 model states file inside ``checkpoint_dir``.

    Args:
        - ``checkpoint_dir``: directory expected to contain the model states file
        - ``zero_stage``: ZeRO stage of the checkpoint; stages ``<= 2`` and stage ``3`` use different file names

    Raises:
        - ``FileNotFoundError``: ``checkpoint_dir`` is not a directory, or the expected file is missing
        - ``ValueError``: ``zero_stage`` is neither ``<= 2`` nor ``3``
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # without this branch ``file`` would be unbound below and surface as an UnboundLocalError
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
75
+
76
+
77
def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """
    Return the files in ``checkpoint_dir`` matching ``glob_pattern``, in natural (human) sort order.

    Raises ``FileNotFoundError`` when nothing matches.
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    matches = glob.glob(os.path.join(checkpoint_dir, glob_pattern))
    matches.sort(key=natural_keys)

    if not matches:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    return matches
85
+
86
+
87
def get_optim_files(checkpoint_dir):
    """Return the naturally sorted ``*_optim_states.pt`` files found in ``checkpoint_dir``."""
    optim_pattern = "*_optim_states.pt"
    return get_checkpoint_files(checkpoint_dir, optim_pattern)
89
+
90
+
91
def get_model_state_files(checkpoint_dir):
    """Return the naturally sorted ``*_model_states.pt`` files found in ``checkpoint_dir``."""
    model_pattern = "*_model_states.pt"
    return get_checkpoint_files(checkpoint_dir, model_pattern)
93
+
94
+
95
def parse_model_states(files):
    """
    Load every model states file and extract the pieces needed for weight reconstruction.

    Args:
        - ``files``: paths of ``*_model_states.pt`` files

    Returns:
        list of ``zero_model_state``, one per entry of ``files``, in the same order

    Raises:
        - ``ValueError``: a file does not contain the ``BUFFER_NAMES`` entry
    """
    zero_model_states = []
    for file in files:
        # NOTE(security): torch.load unpickles the file; only run this on checkpoints you trust.
        state_dict = torch.load(file, map_location=device)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # frozen parameters are optional in the checkpoint
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if debug and frozen_param_shapes is not None:
            print(f"Found frozen_param_shapes: {frozen_param_shapes}")

        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        ds_version = state_dict.get(DS_VERSION, None)

        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)

    return zero_model_states
139
+
140
+
141
def parse_optim_states(files, ds_checkpoint_dir):
    """
    Load the optimizer state files and collect the fp32 groups stored in each of them.

    Args:
        - ``files``: paths of the ``*_optim_states.pt`` files
        - ``ds_checkpoint_dir``: checkpoint folder, used only in the error message

    Returns:
        ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` has one entry per file

    Raises:
        - ``ValueError``: the first file is not a zero checkpoint, the number of files does not match the
          recorded partition count, or the zero stage is unknown
    """
    total_files = len(files)
    state_dicts = []
    for path in files:
        loaded = torch.load(path, map_location=device)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        loaded["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(loaded)

    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim_state:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim_state[ZERO_STAGE]
    world_size = first_optim_state[PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if type(world_size) is list:
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][SINGLE_PARTITION_OF_FP32_GROUPS] for sd in state_dicts]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor
        fp32_flat_groups = [torch.cat(sd[OPTIMIZER_STATE_DICT][FP32_FLAT_GROUPS], 0) for sd in state_dicts]
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    return zero_stage, world_size, fp32_flat_groups
192
+
193
+
194
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: forwarded unchanged to the stage-specific reconstruction helper

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
                                                                  ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    # pick the reconstruction routine matching the detected stage
    if zero_stage <= 2:
        reconstruct = _get_fp32_state_dict_from_zero2_checkpoint
    elif zero_stage == 3:
        reconstruct = _get_fp32_state_dict_from_zero3_checkpoint
    else:
        return None

    return reconstruct(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
219
+
220
+
221
def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """
    Copy the frozen parameters recorded in the first model state into ``state_dict`` (in place).

    Does nothing when the first model state has no frozen parameter shapes. Only
    ``zero_model_states[0]`` is read. Returns ``None``.
    """
    rank0_state = zero_model_states[0]
    if rank0_state.frozen_param_shapes is None or len(rank0_state.frozen_param_shapes) == 0:
        return

    shapes = rank0_state.frozen_param_shapes
    fragments = rank0_state.frozen_param_fragments

    if debug:
        num_elem = sum(shape.numel() for shape in shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    wanted_params = len(shapes)
    wanted_numel = sum(shape.numel() for shape in shapes.values())
    avail_numel = sum([fragment.numel() for fragment in fragments.values()])
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in shapes.items():
        unpartitioned_numel = shape.numel()
        total_params += 1
        total_numel += unpartitioned_numel

        # the stored fragment is used as-is for the frozen parameter
        state_dict[name] = fragments[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
251
+
252
+
253
+ def _has_callable(obj, fn):
254
+ attr = getattr(obj, fn, None)
255
+ return callable(attr)
256
+
257
+
258
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Reconstruct the trainable parameters of a ZeRO stage <= 2 checkpoint into ``state_dict`` (in place).

    Args:
        - ``state_dict``: mapping that receives one ``name -> tensor`` entry per trainable parameter
        - ``world_size``: number of ranks; used for the debug dump and for the alignment check
        - ``fp32_flat_groups``: indexed as ``fp32_flat_groups[rank][group]``, each item a flat tensor
        - ``zero_model_states``: only ``zero_model_states[0].param_shapes`` is read; it is iterated
          in parallel with the param groups and each item maps ``name -> shape``

    Raises:
        - ``ValueError``: when, for some param group, the number of consumed elements and the number
          of available elements differ after both are rounded up to a multiple of ``2 * world_size``
    """
    param_shapes = zero_model_states[0].param_shapes

    def _shape_numel(shape):
        # a shape is either an object exposing ``numel()`` or a plain sequence of ints
        return shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)

    # Reconstruction protocol:
    #
    # 1. for every param group, concatenate that group's flat partition from every rank, in rank order
    # 2. walk the group's ``name -> shape`` mapping in order and carve consecutive slices out of the
    #    concatenated vector, each slice viewed with the parameter's shape
    # 3. verify that the consumed length matches the available length up to alignment padding

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = []
    for i in range(num_param_groups):
        merged_partitions = [sd[i] for sd in fp32_flat_groups]
        full_single_fp32_vector = torch.cat(merged_partitions, 0)
        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
    avail_numel = sum(vector.numel() for vector in merged_single_partition_of_fp32_groups)

    if debug:
        wanted_params = sum(len(shapes) for shapes in param_shapes)
        # same numel fallback as the main loop below, so debug mode accepts every shape the loop accepts
        wanted_numel = sum(sum(_shape_numel(shape) for shape in shapes.values()) for shapes in param_shapes)
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    def zero2_align(x):
        # round up to the next multiple of align_to using exact integer arithmetic
        return align_to * -(-x // align_to)

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():

            unpartitioned_numel = _shape_numel(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            # slice of the merged vector, reshaped to the parameter's shape
            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
329
+
330
+
331
def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """
    Assemble the full state_dict for a ZeRO stage <= 2 checkpoint.

    Entries are added in this order: buffers of ``zero_model_states[0]``, frozen parameters (skipped
    when ``exclude_frozen_parameters`` is truthy), trainable parameters, and finally the shared
    parameter aliases.

    Returns:
        - an ``OrderedDict`` mapping names to tensors
    """
    rank0_state = zero_model_states[0]
    state_dict = OrderedDict()

    # buffers
    buffers = rank0_state.buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters: pair[0] becomes an alias of pair[1] when pair[1] was reconstructed
    for pair in rank0_state.shared_params:
        alias, source = pair[0], pair[1]
        if source in state_dict:
            state_dict[alias] = state_dict[source]

    return state_dict
352
+
353
+
354
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """
    Compute how a parameter of ``unpartitioned_numel`` elements splits across ``world_size`` ranks.

    Returns:
        - ``(partitioned_numel, padding_numel)``: the per-rank element count (rounded up) and the
          number of elements needed to reach the next multiple of ``world_size``
    """
    leftover = unpartitioned_numel % world_size
    if leftover:
        padding_numel = world_size - leftover
    else:
        padding_numel = 0
    per_rank_numel = math.ceil(unpartitioned_numel / world_size)
    return per_rank_numel, padding_numel
359
+
360
+
361
def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """
    Reconstruct the frozen parameters of a ZeRO stage 3 checkpoint into ``state_dict`` (in place).

    For each name in ``zero_model_states[0].frozen_param_shapes`` the fragments held by every entry
    of ``zero_model_states`` are concatenated, trimmed to the unpartitioned element count and viewed
    with the recorded shape. Returns immediately when there are no frozen parameter shapes.
    """
    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    if frozen_param_shapes is None or len(frozen_param_shapes) == 0:
        return

    if debug:
        for rank in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[rank].frozen_param_fragments.values())
            print(f'rank {rank}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    # available count is estimated from the first entry's fragments times the number of ranks
    rank0_fragment_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()])
    avail_numel = rank0_fragment_numel * world_size
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        unpartitioned_numel = shape.numel()
        total_params += 1
        total_numel += unpartitioned_numel

        # concatenate this parameter's fragment from every model state, then drop anything past
        # the unpartitioned length
        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        merged = torch.cat(param_frags, 0)
        state_dict[name] = merged.narrow(0, 0, unpartitioned_numel).view(shape)

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
395
+
396
+
397
def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Reconstruct the trainable parameters of a ZeRO stage 3 checkpoint into ``state_dict`` (in place).

    Args:
        - ``state_dict``: mapping that receives one ``name -> tensor`` entry per trainable parameter
        - ``world_size``: number of ranks whose partitions are zipped together
        - ``fp32_flat_groups``: indexed by rank, each item a flat tensor
        - ``zero_model_states``: only ``zero_model_states[0].param_shapes`` (a list of
          ``name -> shape`` dicts) is read

    Raises:
        - ``ValueError``: when the consumed element count does not equal
          ``fp32_flat_groups[0].numel() * world_size``
    """
    avail_numel = fp32_flat_groups[0].numel() * world_size
    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # merge list of dicts, preserving order
    param_shapes = {}
    for group_shapes in zero_model_states[0].param_shapes:
        for key, value in group_shapes.items():
            param_shapes[key] = value

    if debug:
        for rank in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{rank}].shape={fp32_flat_groups[rank].shape}")

        wanted_params = len(param_shapes)
        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
        # not asserting if there is a mismatch due to possible padding
        print(f"Trainable params: Have {avail_numel} numels to process.")
        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    for name, shape in param_shapes.items():

        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        # take the same [offset, offset + partitioned_numel) window from every rank, join the
        # windows in rank order and drop anything past the unpartitioned length
        rank_slices = tuple(fp32_flat_groups[rank].narrow(0, offset, partitioned_numel) for rank in range(world_size))
        state_dict[name] = torch.cat(rank_slices, 0).narrow(0, 0, unpartitioned_numel).view(shape)
        offset += partitioned_numel

    # offset was tracked per rank; scale it to compare against the total across ranks
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
449
+
450
+
451
def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """
    Assemble the full state_dict for a ZeRO stage 3 checkpoint.

    Entries are added in this order: buffers of ``zero_model_states[0]``, frozen parameters (skipped
    when ``exclude_frozen_parameters`` is truthy), trainable parameters, and finally the shared
    parameter aliases.

    Returns:
        - an ``OrderedDict`` mapping names to tensors
    """
    rank0_state = zero_model_states[0]
    state_dict = OrderedDict()

    # buffers
    buffers = rank0_state.buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters: pair[0] becomes an alias of pair[1] when pair[1] was reconstructed
    for pair in rank0_state.shared_params:
        alias, source = pair[0], pair[1]
        if source in state_dict:
            state_dict[alias] = state_dict[source]

    return state_dict
472
+
473
+
474
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
    """
    Turn a ZeRO 2 or 3 checkpoint into one consolidated fp32 state_dict. The result can be passed to
    ``load_state_dict()`` and then used for training without DeepSpeed, or shared with others, for
    example via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. When omitted, the tag is read from the file named ``latest`` inside ``checkpoint_dir``, e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters

    Returns:
        - pytorch ``state_dict``

    Raises:
        - ``ValueError``: ``tag`` was not given and ``checkpoint_dir`` has no ``latest`` file
        - ``FileNotFoundError``: the folder ``checkpoint_dir/tag`` does not exist

    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    After this example the ``model`` can no longer be used in the deepspeed context of the same
    application, i.e. the deepspeed engine has to be re-initialized, because
    ``model.load_state_dict(state_dict)`` removes all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    """
    if tag is None:
        # fall back to the tag recorded in the 'latest' marker file
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if not os.path.isfile(latest_path):
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
        with open(latest_path, 'r') as fd:
            tag = fd.read().strip()

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)

    if os.path.isdir(ds_checkpoint_dir):
        return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)

    raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
522
+
523
+
524
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
    """
    Consolidate a ZeRO 2 or 3 checkpoint into a single fp32 ``state_dict`` and write it to a file
    that can be read back with ``torch.load(file)`` + ``load_state_dict()`` and used for training
    without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. When omitted, the tag is read from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters
    """
    consolidated = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)

    print(f"Saving fp32 state dict to {output_file}")
    torch.save(consolidated, output_file)
539
+
540
+
541
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Consolidate a ZeRO 2 or 3 checkpoint into a single fp32 ``state_dict``
    2. Move the provided model to cpu
    3. Load the ``state_dict`` into the model (with ``strict=False``)

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. When omitted, the tag is read from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure plenty of CPU memory is available before calling this function. If there is not
    enough, use the ``zero_to_fp32.py`` utility to do the conversion. You will find it conveniently
    placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note that once this has run, the ``model`` can no longer be used in the deepspeed context of
    the same application, i.e. the deepspeed engine has to be re-initialized, because
    ``model.load_state_dict(state_dict)`` removes all the deepspeed magic from it.

    """
    logger.info(f"Extracting fp32 weights")
    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info(f"Overwriting model with fp32 weights")
    cpu_model = model.cpu()
    # strict=False: key mismatches between the model and the state_dict do not raise
    cpu_model.load_state_dict(fp32_weights, strict=False)

    return cpu_model
578
+
579
+
580
if __name__ == "__main__":

    # command-line entry point: parse the arguments, set the module-level debug flag, convert
    cli = argparse.ArgumentParser()

    # the two required positional paths, in the order they appear on the command line
    positional_args = (
        ("checkpoint_dir", "path to the desired checkpoint folder, e.g., path/checkpoint-12"),
        ("output_file",
         "path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)"),
    )
    for arg_name, arg_help in positional_args:
        cli.add_argument(arg_name, type=str, help=arg_help)

    cli.add_argument("-t",
                     "--tag",
                     type=str,
                     default=None,
                     help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")

    # boolean switches
    cli.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
    cli.add_argument("-d", "--debug", action='store_true', help="enable debug")

    args = cli.parse_args()

    # the helper functions read this module-level name to decide whether to print extra details
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
                                               args.output_file,
                                               tag=args.tag,
                                               exclude_frozen_parameters=args.exclude_frozen_parameters)