alexue4 committed on
Commit
e06c4d5
1 Parent(s): d29ade6

End of training

Browse files
README.md ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ base_model: cointegrated/rut5-small
4
+ tags:
5
+ - generated_from_trainer
6
+ model-index:
7
+ - name: text-normalization-ru-new
8
+ results: []
9
+ ---
10
+
11
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
12
+ should probably proofread and complete it, then remove this comment. -->
13
+
14
+ # text-normalization-ru-new
15
+
16
+ This model is a fine-tuned version of [cointegrated/rut5-small](https://huggingface.co/cointegrated/rut5-small) on an unspecified dataset.
17
+ It achieves the following results on the evaluation set:
18
+ - Loss: 0.0442
19
+ - Mean Distance: 0
20
+ - Max Distance: 25
21
+
22
+ ## Model description
23
+
24
+ More information needed
25
+
26
+ ## Intended uses & limitations
27
+
28
+ More information needed
29
+
30
+ ## Training and evaluation data
31
+
32
+ More information needed
33
+
34
+ ## Training procedure
35
+
36
+ ### Training hyperparameters
37
+
38
+ The following hyperparameters were used during training:
39
+ - learning_rate: 0.001
40
+ - train_batch_size: 15
41
+ - eval_batch_size: 15
42
+ - seed: 42
43
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
+ - lr_scheduler_type: linear
45
+ - lr_scheduler_warmup_ratio: 0.1
46
+ - num_epochs: 30
47
+
48
+ ### Training results
49
+
50
+ | Training Loss | Epoch | Step | Validation Loss | Max Distance | Mean Distance |
51
+ |:-------------:|:-----:|:------:|:---------------:|:------------:|:-------------:|
52
+ | 0.199 | 1.0 | 11526 | 0.8173 | 167 | 17 |
53
+ | 0.1286 | 2.0 | 23052 | 0.5453 | 158 | 14 |
54
+ | 0.0891 | 3.0 | 34578 | 0.3629 | 122 | 10 |
55
+ | 0.0711 | 4.0 | 46104 | 0.4011 | 114 | 12 |
56
+ | 0.0566 | 5.0 | 57630 | 0.2997 | 100 | 7 |
57
+ | 0.0402 | 6.0 | 69156 | 0.1552 | 75 | 4 |
58
+ | 0.0348 | 7.0 | 80682 | 0.1513 | 79 | 3 |
59
+ | 0.0302 | 8.0 | 92208 | 0.1452 | 76 | 3 |
60
+ | 0.0223 | 9.0 | 103734 | 0.0866 | 76 | 1 |
61
+ | 0.0202 | 10.0 | 115260 | 0.1091 | 71 | 2 |
62
+ | 0.0175 | 11.0 | 126786 | 0.0655 | 66 | 1 |
63
+ | 0.014 | 12.0 | 138312 | 0.0474 | 44 | 0 |
64
+ | 0.0122 | 13.0 | 149838 | 0.0515 | 42 | 0 |
65
+ | 0.0117 | 14.0 | 161364 | 0.0479 | 30 | 0 |
66
+ | 0.0093 | 15.0 | 172890 | 0.0565 | 56 | 0 |
67
+ | 0.0085 | 16.0 | 184416 | 0.0472 | 34 | 0 |
68
+ | 0.0075 | 17.0 | 195942 | 0.0420 | 28 | 0 |
69
+ | 0.0059 | 18.0 | 207468 | 0.0415 | 32 | 0 |
70
+ | 0.0054 | 19.0 | 218994 | 0.0406 | 28 | 0 |
71
+ | 0.0046 | 20.0 | 230520 | 0.0393 | 24 | 0 |
72
+ | 0.004 | 21.0 | 242046 | 0.0417 | 24 | 0 |
73
+ | 0.0034 | 22.0 | 253572 | 0.0403 | 18 | 0 |
74
+ | 0.0029 | 23.0 | 265098 | 0.0422 | 21 | 0 |
75
+ | 0.0024 | 24.0 | 276624 | 0.0410 | 21 | 0 |
76
+ | 0.002 | 25.0 | 288150 | 0.0435 | 15 | 0 |
77
+ | 0.0016 | 26.0 | 299676 | 0.0452 | 15 | 0 |
78
+ | 0.0013 | 27.0 | 311202 | 0.0414 | 14 | 0 |
79
+ | 0.0012 | 28.0 | 322728 | 0.0439 | 14 | 0 |
80
+ | 0.001 | 29.0 | 334254 | 0.0444 | 15 | 0 |
81
+ | 0.0026 | 30.0 | 345780 | 0.0427 | 19 | 0 |
82
+ | 0.0077 | 24.0 | 368808 | 0.0495 | 27 | 0 |
83
+ | 0.0083 | 25.0 | 384175 | 0.0446 | 37 | 0 |
84
+ | 0.0078 | 26.0 | 399542 | 0.0481 | 47 | 0 |
85
+ | 0.006 | 27.0 | 414909 | 0.0424 | 37 | 0 |
86
+ | 0.0056 | 28.0 | 430276 | 0.0439 | 22 | 0 |
87
+ | 0.0054 | 29.0 | 445643 | 0.0481 | 23 | 0 |
88
+ | 0.004 | 30.0 | 461010 | 0.0442 | 25 | 0 |
89
+
90
+
91
+ ### Framework versions
92
+
93
+ - Transformers 4.32.1
94
+ - Pytorch 2.0.1+cu117
95
+ - Datasets 2.14.4
96
+ - Tokenizers 0.13.3
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "cointegrated/rut5-small",
3
+ "architectures": [
4
+ "T5ForConditionalGeneration"
5
+ ],
6
+ "classifier_dropout": 0.0,
7
+ "d_ff": 1024,
8
+ "d_kv": 64,
9
+ "d_model": 512,
10
+ "decoder_start_token_id": 0,
11
+ "dense_act_fn": "gelu_new",
12
+ "dropout_rate": 0.1,
13
+ "eos_token_id": 1,
14
+ "feed_forward_proj": "gated-gelu",
15
+ "initializer_factor": 1.0,
16
+ "is_encoder_decoder": true,
17
+ "is_gated_act": true,
18
+ "layer_norm_epsilon": 1e-06,
19
+ "model_type": "t5",
20
+ "num_decoder_layers": 8,
21
+ "num_heads": 6,
22
+ "num_layers": 8,
23
+ "pad_token_id": 0,
24
+ "relative_attention_max_distance": 128,
25
+ "relative_attention_num_buckets": 32,
26
+ "tie_word_embeddings": false,
27
+ "tokenizer_class": "T5Tokenizer",
28
+ "torch_dtype": "float32",
29
+ "transformers_version": "4.32.1",
30
+ "use_cache": true,
31
+ "vocab_size": 20100
32
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "decoder_start_token_id": 0,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.32.1"
7
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1af9c44d78cc87dce6c9af177a92980e3657aa89417f862a6decf4575d013140
3
+ size 258643461
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "</s>",
3
+ "pad_token": "<pad>",
4
+ "unk_token": "<unk>"
5
+ }
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6afde64def093a9d493d1f4254768c2e842ed45bcc9c184233f245cb29d2a31
3
+ size 639963
tokenizer_config.json ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<extra_id_0>",
4
+ "<extra_id_1>",
5
+ "<extra_id_2>",
6
+ "<extra_id_3>",
7
+ "<extra_id_4>",
8
+ "<extra_id_5>",
9
+ "<extra_id_6>",
10
+ "<extra_id_7>",
11
+ "<extra_id_8>",
12
+ "<extra_id_9>",
13
+ "<extra_id_10>",
14
+ "<extra_id_11>",
15
+ "<extra_id_12>",
16
+ "<extra_id_13>",
17
+ "<extra_id_14>",
18
+ "<extra_id_15>",
19
+ "<extra_id_16>",
20
+ "<extra_id_17>",
21
+ "<extra_id_18>",
22
+ "<extra_id_19>",
23
+ "<extra_id_20>",
24
+ "<extra_id_21>",
25
+ "<extra_id_22>",
26
+ "<extra_id_23>",
27
+ "<extra_id_24>",
28
+ "<extra_id_25>",
29
+ "<extra_id_26>",
30
+ "<extra_id_27>",
31
+ "<extra_id_28>",
32
+ "<extra_id_29>",
33
+ "<extra_id_30>",
34
+ "<extra_id_31>",
35
+ "<extra_id_32>",
36
+ "<extra_id_33>",
37
+ "<extra_id_34>",
38
+ "<extra_id_35>",
39
+ "<extra_id_36>",
40
+ "<extra_id_37>",
41
+ "<extra_id_38>",
42
+ "<extra_id_39>",
43
+ "<extra_id_40>",
44
+ "<extra_id_41>",
45
+ "<extra_id_42>",
46
+ "<extra_id_43>",
47
+ "<extra_id_44>",
48
+ "<extra_id_45>",
49
+ "<extra_id_46>",
50
+ "<extra_id_47>",
51
+ "<extra_id_48>",
52
+ "<extra_id_49>",
53
+ "<extra_id_50>",
54
+ "<extra_id_51>",
55
+ "<extra_id_52>",
56
+ "<extra_id_53>",
57
+ "<extra_id_54>",
58
+ "<extra_id_55>",
59
+ "<extra_id_56>",
60
+ "<extra_id_57>",
61
+ "<extra_id_58>",
62
+ "<extra_id_59>",
63
+ "<extra_id_60>",
64
+ "<extra_id_61>",
65
+ "<extra_id_62>",
66
+ "<extra_id_63>",
67
+ "<extra_id_64>",
68
+ "<extra_id_65>",
69
+ "<extra_id_66>",
70
+ "<extra_id_67>",
71
+ "<extra_id_68>",
72
+ "<extra_id_69>",
73
+ "<extra_id_70>",
74
+ "<extra_id_71>",
75
+ "<extra_id_72>",
76
+ "<extra_id_73>",
77
+ "<extra_id_74>",
78
+ "<extra_id_75>",
79
+ "<extra_id_76>",
80
+ "<extra_id_77>",
81
+ "<extra_id_78>",
82
+ "<extra_id_79>",
83
+ "<extra_id_80>",
84
+ "<extra_id_81>",
85
+ "<extra_id_82>",
86
+ "<extra_id_83>",
87
+ "<extra_id_84>",
88
+ "<extra_id_85>",
89
+ "<extra_id_86>",
90
+ "<extra_id_87>",
91
+ "<extra_id_88>",
92
+ "<extra_id_89>",
93
+ "<extra_id_90>",
94
+ "<extra_id_91>",
95
+ "<extra_id_92>",
96
+ "<extra_id_93>",
97
+ "<extra_id_94>",
98
+ "<extra_id_95>",
99
+ "<extra_id_96>",
100
+ "<extra_id_97>",
101
+ "<extra_id_98>",
102
+ "<extra_id_99>"
103
+ ],
104
+ "clean_up_tokenization_spaces": true,
105
+ "eos_token": "</s>",
106
+ "extra_ids": 100,
107
+ "legacy": true,
108
+ "model_max_length": 1000000000000000019884624838656,
109
+ "pad_token": "<pad>",
110
+ "sp_model_kwargs": {},
111
+ "tokenizer_class": "T5Tokenizer",
112
+ "unk_token": "<unk>"
113
+ }
trainer_state.json ADDED
@@ -0,0 +1,2000 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 30.0,
5
+ "eval_steps": 500,
6
+ "global_step": 461010,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 2.8920122621319915e-08,
14
+ "loss": 14.3504,
15
+ "step": 1
16
+ },
17
+ {
18
+ "epoch": 0.15,
19
+ "learning_rate": 5.000289201226213e-05,
20
+ "loss": 2.1001,
21
+ "step": 1729
22
+ },
23
+ {
24
+ "epoch": 0.3,
25
+ "learning_rate": 0.00010000578402452426,
26
+ "loss": 0.3756,
27
+ "step": 3458
28
+ },
29
+ {
30
+ "epoch": 0.45,
31
+ "learning_rate": 0.0001500086760367864,
32
+ "loss": 0.2527,
33
+ "step": 5187
34
+ },
35
+ {
36
+ "epoch": 0.6,
37
+ "learning_rate": 0.00020001156804904852,
38
+ "loss": 0.2076,
39
+ "step": 6916
40
+ },
41
+ {
42
+ "epoch": 0.75,
43
+ "learning_rate": 0.00025001446006131067,
44
+ "loss": 0.1864,
45
+ "step": 8645
46
+ },
47
+ {
48
+ "epoch": 0.9,
49
+ "learning_rate": 0.0003000173520735728,
50
+ "loss": 0.199,
51
+ "step": 10374
52
+ },
53
+ {
54
+ "epoch": 1.0,
55
+ "eval_loss": 0.8173184990882874,
56
+ "eval_max_distance": 167,
57
+ "eval_mean_distance": 17,
58
+ "eval_runtime": 64.0638,
59
+ "eval_samples_per_second": 15.609,
60
+ "eval_steps_per_second": 0.78,
61
+ "step": 11526
62
+ },
63
+ {
64
+ "epoch": 1.05,
65
+ "learning_rate": 0.0003500202440858349,
66
+ "loss": 0.2481,
67
+ "step": 12103
68
+ },
69
+ {
70
+ "epoch": 1.2,
71
+ "learning_rate": 0.00040002313609809704,
72
+ "loss": 0.1244,
73
+ "step": 13832
74
+ },
75
+ {
76
+ "epoch": 1.35,
77
+ "learning_rate": 0.0004500260281103592,
78
+ "loss": 0.1055,
79
+ "step": 15561
80
+ },
81
+ {
82
+ "epoch": 1.5,
83
+ "learning_rate": 0.0005000289201226213,
84
+ "loss": 0.102,
85
+ "step": 17290
86
+ },
87
+ {
88
+ "epoch": 1.65,
89
+ "learning_rate": 0.0005500318121348835,
90
+ "loss": 0.102,
91
+ "step": 19019
92
+ },
93
+ {
94
+ "epoch": 1.8,
95
+ "learning_rate": 0.0006000347041471456,
96
+ "loss": 0.1083,
97
+ "step": 20748
98
+ },
99
+ {
100
+ "epoch": 1.95,
101
+ "learning_rate": 0.0006500375961594078,
102
+ "loss": 0.1286,
103
+ "step": 22477
104
+ },
105
+ {
106
+ "epoch": 2.0,
107
+ "eval_loss": 0.5452634692192078,
108
+ "eval_max_distance": 158,
109
+ "eval_mean_distance": 14,
110
+ "eval_runtime": 30.4268,
111
+ "eval_samples_per_second": 32.866,
112
+ "eval_steps_per_second": 1.643,
113
+ "step": 23052
114
+ },
115
+ {
116
+ "epoch": 2.1,
117
+ "learning_rate": 0.0007000404881716698,
118
+ "loss": 0.1449,
119
+ "step": 24206
120
+ },
121
+ {
122
+ "epoch": 2.25,
123
+ "learning_rate": 0.000750043380183932,
124
+ "loss": 0.0747,
125
+ "step": 25935
126
+ },
127
+ {
128
+ "epoch": 2.4,
129
+ "learning_rate": 0.0008000462721961941,
130
+ "loss": 0.0744,
131
+ "step": 27664
132
+ },
133
+ {
134
+ "epoch": 2.55,
135
+ "learning_rate": 0.0008500491642084563,
136
+ "loss": 0.0742,
137
+ "step": 29393
138
+ },
139
+ {
140
+ "epoch": 2.7,
141
+ "learning_rate": 0.0009000520562207184,
142
+ "loss": 0.0792,
143
+ "step": 31122
144
+ },
145
+ {
146
+ "epoch": 2.85,
147
+ "learning_rate": 0.0009500549482329805,
148
+ "loss": 0.0891,
149
+ "step": 32851
150
+ },
151
+ {
152
+ "epoch": 3.0,
153
+ "eval_loss": 0.3629104495048523,
154
+ "eval_max_distance": 122,
155
+ "eval_mean_distance": 10,
156
+ "eval_runtime": 35.1149,
157
+ "eval_samples_per_second": 28.478,
158
+ "eval_steps_per_second": 1.424,
159
+ "step": 34578
160
+ },
161
+ {
162
+ "epoch": 3.0,
163
+ "learning_rate": 0.0009999935733060843,
164
+ "loss": 0.1079,
165
+ "step": 34580
166
+ },
167
+ {
168
+ "epoch": 3.15,
169
+ "learning_rate": 0.000994437696415833,
170
+ "loss": 0.0962,
171
+ "step": 36309
172
+ },
173
+ {
174
+ "epoch": 3.3,
175
+ "learning_rate": 0.0009888818195255813,
176
+ "loss": 0.059,
177
+ "step": 38038
178
+ },
179
+ {
180
+ "epoch": 3.45,
181
+ "learning_rate": 0.0009833259426353302,
182
+ "loss": 0.0576,
183
+ "step": 39767
184
+ },
185
+ {
186
+ "epoch": 3.6,
187
+ "learning_rate": 0.0009777700657450789,
188
+ "loss": 0.058,
189
+ "step": 41496
190
+ },
191
+ {
192
+ "epoch": 3.75,
193
+ "learning_rate": 0.0009722141888548275,
194
+ "loss": 0.0611,
195
+ "step": 43225
196
+ },
197
+ {
198
+ "epoch": 3.9,
199
+ "learning_rate": 0.0009666583119645761,
200
+ "loss": 0.0711,
201
+ "step": 44954
202
+ },
203
+ {
204
+ "epoch": 4.0,
205
+ "eval_loss": 0.4011004865169525,
206
+ "eval_max_distance": 114,
207
+ "eval_mean_distance": 12,
208
+ "eval_runtime": 24.9859,
209
+ "eval_samples_per_second": 40.022,
210
+ "eval_steps_per_second": 2.001,
211
+ "step": 46104
212
+ },
213
+ {
214
+ "epoch": 4.05,
215
+ "learning_rate": 0.0009611024350743247,
216
+ "loss": 0.0991,
217
+ "step": 46683
218
+ },
219
+ {
220
+ "epoch": 4.2,
221
+ "learning_rate": 0.0009555465581840734,
222
+ "loss": 0.0481,
223
+ "step": 48412
224
+ },
225
+ {
226
+ "epoch": 4.35,
227
+ "learning_rate": 0.0009499906812938221,
228
+ "loss": 0.044,
229
+ "step": 50141
230
+ },
231
+ {
232
+ "epoch": 4.5,
233
+ "learning_rate": 0.0009444348044035706,
234
+ "loss": 0.0435,
235
+ "step": 51870
236
+ },
237
+ {
238
+ "epoch": 4.65,
239
+ "learning_rate": 0.0009388789275133194,
240
+ "loss": 0.0454,
241
+ "step": 53599
242
+ },
243
+ {
244
+ "epoch": 4.8,
245
+ "learning_rate": 0.000933323050623068,
246
+ "loss": 0.0483,
247
+ "step": 55328
248
+ },
249
+ {
250
+ "epoch": 4.95,
251
+ "learning_rate": 0.0009277671737328166,
252
+ "loss": 0.0566,
253
+ "step": 57057
254
+ },
255
+ {
256
+ "epoch": 5.0,
257
+ "eval_loss": 0.29974234104156494,
258
+ "eval_max_distance": 100,
259
+ "eval_mean_distance": 7,
260
+ "eval_runtime": 25.609,
261
+ "eval_samples_per_second": 39.049,
262
+ "eval_steps_per_second": 1.952,
263
+ "step": 57630
264
+ },
265
+ {
266
+ "epoch": 5.1,
267
+ "learning_rate": 0.0009222112968425653,
268
+ "loss": 0.0724,
269
+ "step": 58786
270
+ },
271
+ {
272
+ "epoch": 5.25,
273
+ "learning_rate": 0.000916655419952314,
274
+ "loss": 0.0364,
275
+ "step": 60515
276
+ },
277
+ {
278
+ "epoch": 5.4,
279
+ "learning_rate": 0.0009110995430620625,
280
+ "loss": 0.035,
281
+ "step": 62244
282
+ },
283
+ {
284
+ "epoch": 5.55,
285
+ "learning_rate": 0.0009055436661718113,
286
+ "loss": 0.0353,
287
+ "step": 63973
288
+ },
289
+ {
290
+ "epoch": 5.7,
291
+ "learning_rate": 0.0008999877892815599,
292
+ "loss": 0.0364,
293
+ "step": 65702
294
+ },
295
+ {
296
+ "epoch": 5.85,
297
+ "learning_rate": 0.0008944319123913086,
298
+ "loss": 0.0402,
299
+ "step": 67431
300
+ },
301
+ {
302
+ "epoch": 6.0,
303
+ "eval_loss": 0.15522713959217072,
304
+ "eval_max_distance": 75,
305
+ "eval_mean_distance": 4,
306
+ "eval_runtime": 24.8618,
307
+ "eval_samples_per_second": 40.222,
308
+ "eval_steps_per_second": 2.011,
309
+ "step": 69156
310
+ },
311
+ {
312
+ "epoch": 6.0,
313
+ "learning_rate": 0.0008888760355010572,
314
+ "loss": 0.0506,
315
+ "step": 69160
316
+ },
317
+ {
318
+ "epoch": 6.15,
319
+ "learning_rate": 0.0008833201586108059,
320
+ "loss": 0.0514,
321
+ "step": 70889
322
+ },
323
+ {
324
+ "epoch": 6.3,
325
+ "learning_rate": 0.0008777642817205546,
326
+ "loss": 0.0298,
327
+ "step": 72618
328
+ },
329
+ {
330
+ "epoch": 6.45,
331
+ "learning_rate": 0.0008722084048303031,
332
+ "loss": 0.0287,
333
+ "step": 74347
334
+ },
335
+ {
336
+ "epoch": 6.6,
337
+ "learning_rate": 0.0008666525279400518,
338
+ "loss": 0.0289,
339
+ "step": 76076
340
+ },
341
+ {
342
+ "epoch": 6.75,
343
+ "learning_rate": 0.0008610966510498005,
344
+ "loss": 0.0303,
345
+ "step": 77805
346
+ },
347
+ {
348
+ "epoch": 6.9,
349
+ "learning_rate": 0.0008555407741595491,
350
+ "loss": 0.0348,
351
+ "step": 79534
352
+ },
353
+ {
354
+ "epoch": 7.0,
355
+ "eval_loss": 0.15130603313446045,
356
+ "eval_max_distance": 79,
357
+ "eval_mean_distance": 3,
358
+ "eval_runtime": 25.5455,
359
+ "eval_samples_per_second": 39.146,
360
+ "eval_steps_per_second": 1.957,
361
+ "step": 80682
362
+ },
363
+ {
364
+ "epoch": 7.05,
365
+ "learning_rate": 0.0008499848972692977,
366
+ "loss": 0.0527,
367
+ "step": 81263
368
+ },
369
+ {
370
+ "epoch": 7.2,
371
+ "learning_rate": 0.0008444290203790465,
372
+ "loss": 0.0293,
373
+ "step": 82992
374
+ },
375
+ {
376
+ "epoch": 7.35,
377
+ "learning_rate": 0.000838873143488795,
378
+ "loss": 0.0247,
379
+ "step": 84721
380
+ },
381
+ {
382
+ "epoch": 7.5,
383
+ "learning_rate": 0.0008333172665985437,
384
+ "loss": 0.024,
385
+ "step": 86450
386
+ },
387
+ {
388
+ "epoch": 7.65,
389
+ "learning_rate": 0.0008277613897082924,
390
+ "loss": 0.0243,
391
+ "step": 88179
392
+ },
393
+ {
394
+ "epoch": 7.8,
395
+ "learning_rate": 0.000822205512818041,
396
+ "loss": 0.0263,
397
+ "step": 89908
398
+ },
399
+ {
400
+ "epoch": 7.95,
401
+ "learning_rate": 0.0008166496359277897,
402
+ "loss": 0.0302,
403
+ "step": 91637
404
+ },
405
+ {
406
+ "epoch": 8.0,
407
+ "eval_loss": 0.14522188901901245,
408
+ "eval_max_distance": 76,
409
+ "eval_mean_distance": 3,
410
+ "eval_runtime": 25.6271,
411
+ "eval_samples_per_second": 39.021,
412
+ "eval_steps_per_second": 1.951,
413
+ "step": 92208
414
+ },
415
+ {
416
+ "epoch": 8.1,
417
+ "learning_rate": 0.0008110937590375384,
418
+ "loss": 0.0445,
419
+ "step": 93366
420
+ },
421
+ {
422
+ "epoch": 8.25,
423
+ "learning_rate": 0.0008055378821472869,
424
+ "loss": 0.0229,
425
+ "step": 95095
426
+ },
427
+ {
428
+ "epoch": 8.4,
429
+ "learning_rate": 0.0007999820052570357,
430
+ "loss": 0.0207,
431
+ "step": 96824
432
+ },
433
+ {
434
+ "epoch": 8.55,
435
+ "learning_rate": 0.0007944261283667843,
436
+ "loss": 0.0203,
437
+ "step": 98553
438
+ },
439
+ {
440
+ "epoch": 8.7,
441
+ "learning_rate": 0.000788870251476533,
442
+ "loss": 0.021,
443
+ "step": 100282
444
+ },
445
+ {
446
+ "epoch": 8.85,
447
+ "learning_rate": 0.0007833143745862816,
448
+ "loss": 0.0223,
449
+ "step": 102011
450
+ },
451
+ {
452
+ "epoch": 9.0,
453
+ "eval_loss": 0.08658243715763092,
454
+ "eval_max_distance": 76,
455
+ "eval_mean_distance": 1,
456
+ "eval_runtime": 23.498,
457
+ "eval_samples_per_second": 42.557,
458
+ "eval_steps_per_second": 2.128,
459
+ "step": 103734
460
+ },
461
+ {
462
+ "epoch": 9.0,
463
+ "learning_rate": 0.0007777584976960303,
464
+ "loss": 0.0284,
465
+ "step": 103740
466
+ },
467
+ {
468
+ "epoch": 9.15,
469
+ "learning_rate": 0.0007722026208057789,
470
+ "loss": 0.034,
471
+ "step": 105469
472
+ },
473
+ {
474
+ "epoch": 9.3,
475
+ "learning_rate": 0.0007666467439155275,
476
+ "loss": 0.0193,
477
+ "step": 107198
478
+ },
479
+ {
480
+ "epoch": 9.45,
481
+ "learning_rate": 0.0007610908670252762,
482
+ "loss": 0.0175,
483
+ "step": 108927
484
+ },
485
+ {
486
+ "epoch": 9.6,
487
+ "learning_rate": 0.0007555349901350248,
488
+ "loss": 0.0172,
489
+ "step": 110656
490
+ },
491
+ {
492
+ "epoch": 9.75,
493
+ "learning_rate": 0.0007499791132447735,
494
+ "loss": 0.018,
495
+ "step": 112385
496
+ },
497
+ {
498
+ "epoch": 9.9,
499
+ "learning_rate": 0.0007444232363545221,
500
+ "loss": 0.0202,
501
+ "step": 114114
502
+ },
503
+ {
504
+ "epoch": 10.0,
505
+ "eval_loss": 0.10908353328704834,
506
+ "eval_max_distance": 71,
507
+ "eval_mean_distance": 2,
508
+ "eval_runtime": 25.0909,
509
+ "eval_samples_per_second": 39.855,
510
+ "eval_steps_per_second": 1.993,
511
+ "step": 115260
512
+ },
513
+ {
514
+ "epoch": 10.05,
515
+ "learning_rate": 0.0007388673594642709,
516
+ "loss": 0.0342,
517
+ "step": 115843
518
+ },
519
+ {
520
+ "epoch": 10.2,
521
+ "learning_rate": 0.0007333114825740194,
522
+ "loss": 0.0202,
523
+ "step": 117572
524
+ },
525
+ {
526
+ "epoch": 10.35,
527
+ "learning_rate": 0.0007277556056837681,
528
+ "loss": 0.0159,
529
+ "step": 119301
530
+ },
531
+ {
532
+ "epoch": 10.5,
533
+ "learning_rate": 0.0007221997287935168,
534
+ "loss": 0.0149,
535
+ "step": 121030
536
+ },
537
+ {
538
+ "epoch": 10.65,
539
+ "learning_rate": 0.0007166438519032654,
540
+ "loss": 0.015,
541
+ "step": 122759
542
+ },
543
+ {
544
+ "epoch": 10.8,
545
+ "learning_rate": 0.000711087975013014,
546
+ "loss": 0.0155,
547
+ "step": 124488
548
+ },
549
+ {
550
+ "epoch": 10.95,
551
+ "learning_rate": 0.0007055320981227628,
552
+ "loss": 0.0175,
553
+ "step": 126217
554
+ },
555
+ {
556
+ "epoch": 11.0,
557
+ "eval_loss": 0.06553788483142853,
558
+ "eval_max_distance": 66,
559
+ "eval_mean_distance": 1,
560
+ "eval_runtime": 22.988,
561
+ "eval_samples_per_second": 43.501,
562
+ "eval_steps_per_second": 2.175,
563
+ "step": 126786
564
+ },
565
+ {
566
+ "epoch": 11.1,
567
+ "learning_rate": 0.0006999762212325114,
568
+ "loss": 0.0305,
569
+ "step": 127946
570
+ },
571
+ {
572
+ "epoch": 11.25,
573
+ "learning_rate": 0.00069442034434226,
574
+ "loss": 0.0157,
575
+ "step": 129675
576
+ },
577
+ {
578
+ "epoch": 11.4,
579
+ "learning_rate": 0.0006888644674520087,
580
+ "loss": 0.0134,
581
+ "step": 131404
582
+ },
583
+ {
584
+ "epoch": 11.55,
585
+ "learning_rate": 0.0006833085905617574,
586
+ "loss": 0.0124,
587
+ "step": 133133
588
+ },
589
+ {
590
+ "epoch": 11.7,
591
+ "learning_rate": 0.0006777527136715059,
592
+ "loss": 0.0131,
593
+ "step": 134862
594
+ },
595
+ {
596
+ "epoch": 11.85,
597
+ "learning_rate": 0.0006721968367812547,
598
+ "loss": 0.014,
599
+ "step": 136591
600
+ },
601
+ {
602
+ "epoch": 12.0,
603
+ "eval_loss": 0.04735955968499184,
604
+ "eval_max_distance": 44,
605
+ "eval_mean_distance": 0,
606
+ "eval_runtime": 22.5236,
607
+ "eval_samples_per_second": 44.398,
608
+ "eval_steps_per_second": 2.22,
609
+ "step": 138312
610
+ },
611
+ {
612
+ "epoch": 12.0,
613
+ "learning_rate": 0.0006666409598910033,
614
+ "loss": 0.0182,
615
+ "step": 138320
616
+ },
617
+ {
618
+ "epoch": 12.15,
619
+ "learning_rate": 0.000661085083000752,
620
+ "loss": 0.025,
621
+ "step": 140049
622
+ },
623
+ {
624
+ "epoch": 12.3,
625
+ "learning_rate": 0.0006555292061105006,
626
+ "loss": 0.0128,
627
+ "step": 141778
628
+ },
629
+ {
630
+ "epoch": 12.45,
631
+ "learning_rate": 0.0006499733292202492,
632
+ "loss": 0.0114,
633
+ "step": 143507
634
+ },
635
+ {
636
+ "epoch": 12.6,
637
+ "learning_rate": 0.0006444174523299979,
638
+ "loss": 0.011,
639
+ "step": 145236
640
+ },
641
+ {
642
+ "epoch": 12.75,
643
+ "learning_rate": 0.0006388615754397465,
644
+ "loss": 0.0117,
645
+ "step": 146965
646
+ },
647
+ {
648
+ "epoch": 12.9,
649
+ "learning_rate": 0.0006333056985494952,
650
+ "loss": 0.0122,
651
+ "step": 148694
652
+ },
653
+ {
654
+ "epoch": 13.0,
655
+ "eval_loss": 0.05152251571416855,
656
+ "eval_max_distance": 42,
657
+ "eval_mean_distance": 0,
658
+ "eval_runtime": 23.898,
659
+ "eval_samples_per_second": 41.845,
660
+ "eval_steps_per_second": 2.092,
661
+ "step": 149838
662
+ },
663
+ {
664
+ "epoch": 13.05,
665
+ "learning_rate": 0.0006277498216592438,
666
+ "loss": 0.0241,
667
+ "step": 150423
668
+ },
669
+ {
670
+ "epoch": 13.2,
671
+ "learning_rate": 0.0006221939447689925,
672
+ "loss": 0.0148,
673
+ "step": 152152
674
+ },
675
+ {
676
+ "epoch": 13.35,
677
+ "learning_rate": 0.0006166380678787411,
678
+ "loss": 0.0106,
679
+ "step": 153881
680
+ },
681
+ {
682
+ "epoch": 13.5,
683
+ "learning_rate": 0.0006110821909884899,
684
+ "loss": 0.0096,
685
+ "step": 155610
686
+ },
687
+ {
688
+ "epoch": 13.65,
689
+ "learning_rate": 0.0006055263140982384,
690
+ "loss": 0.0098,
691
+ "step": 157339
692
+ },
693
+ {
694
+ "epoch": 13.8,
695
+ "learning_rate": 0.0005999704372079872,
696
+ "loss": 0.0104,
697
+ "step": 159068
698
+ },
699
+ {
700
+ "epoch": 13.95,
701
+ "learning_rate": 0.0005944145603177358,
702
+ "loss": 0.0117,
703
+ "step": 160797
704
+ },
705
+ {
706
+ "epoch": 14.0,
707
+ "eval_loss": 0.047906968742609024,
708
+ "eval_max_distance": 30,
709
+ "eval_mean_distance": 0,
710
+ "eval_runtime": 23.3531,
711
+ "eval_samples_per_second": 42.821,
712
+ "eval_steps_per_second": 2.141,
713
+ "step": 161364
714
+ },
715
+ {
716
+ "epoch": 14.1,
717
+ "learning_rate": 0.0005888586834274844,
718
+ "loss": 0.0224,
719
+ "step": 162526
720
+ },
721
+ {
722
+ "epoch": 14.25,
723
+ "learning_rate": 0.0005833028065372331,
724
+ "loss": 0.0111,
725
+ "step": 164255
726
+ },
727
+ {
728
+ "epoch": 14.4,
729
+ "learning_rate": 0.0005777469296469818,
730
+ "loss": 0.009,
731
+ "step": 165984
732
+ },
733
+ {
734
+ "epoch": 14.55,
735
+ "learning_rate": 0.0005721910527567303,
736
+ "loss": 0.0086,
737
+ "step": 167713
738
+ },
739
+ {
740
+ "epoch": 14.7,
741
+ "learning_rate": 0.000566635175866479,
742
+ "loss": 0.0088,
743
+ "step": 169442
744
+ },
745
+ {
746
+ "epoch": 14.85,
747
+ "learning_rate": 0.0005610792989762277,
748
+ "loss": 0.0093,
749
+ "step": 171171
750
+ },
751
+ {
752
+ "epoch": 15.0,
753
+ "eval_loss": 0.05651288107037544,
754
+ "eval_max_distance": 56,
755
+ "eval_mean_distance": 0,
756
+ "eval_runtime": 23.6545,
757
+ "eval_samples_per_second": 42.275,
758
+ "eval_steps_per_second": 2.114,
759
+ "step": 172890
760
+ },
761
+ {
762
+ "epoch": 15.0,
763
+ "learning_rate": 0.0005555234220859762,
764
+ "loss": 0.0124,
765
+ "step": 172900
766
+ },
767
+ {
768
+ "epoch": 15.15,
769
+ "learning_rate": 0.000549967545195725,
770
+ "loss": 0.0181,
771
+ "step": 174629
772
+ },
773
+ {
774
+ "epoch": 15.3,
775
+ "learning_rate": 0.0005444116683054736,
776
+ "loss": 0.0091,
777
+ "step": 176358
778
+ },
779
+ {
780
+ "epoch": 15.45,
781
+ "learning_rate": 0.0005388557914152222,
782
+ "loss": 0.0075,
783
+ "step": 178087
784
+ },
785
+ {
786
+ "epoch": 15.6,
787
+ "learning_rate": 0.0005332999145249709,
788
+ "loss": 0.0075,
789
+ "step": 179816
790
+ },
791
+ {
792
+ "epoch": 15.75,
793
+ "learning_rate": 0.0005277440376347196,
794
+ "loss": 0.0075,
795
+ "step": 181545
796
+ },
797
+ {
798
+ "epoch": 15.9,
799
+ "learning_rate": 0.0005221881607444683,
800
+ "loss": 0.0085,
801
+ "step": 183274
802
+ },
803
+ {
804
+ "epoch": 16.0,
805
+ "eval_loss": 0.047154366970062256,
806
+ "eval_max_distance": 34,
807
+ "eval_mean_distance": 0,
808
+ "eval_runtime": 23.554,
809
+ "eval_samples_per_second": 42.456,
810
+ "eval_steps_per_second": 2.123,
811
+ "step": 184416
812
+ },
813
+ {
814
+ "epoch": 16.05,
815
+ "learning_rate": 0.0005166322838542169,
816
+ "loss": 0.0169,
817
+ "step": 185003
818
+ },
819
+ {
820
+ "epoch": 16.2,
821
+ "learning_rate": 0.0005110764069639655,
822
+ "loss": 0.0104,
823
+ "step": 186732
824
+ },
825
+ {
826
+ "epoch": 16.35,
827
+ "learning_rate": 0.0005055205300737143,
828
+ "loss": 0.0072,
829
+ "step": 188461
830
+ },
831
+ {
832
+ "epoch": 16.5,
833
+ "learning_rate": 0.0004999646531834628,
834
+ "loss": 0.0068,
835
+ "step": 190190
836
+ },
837
+ {
838
+ "epoch": 16.65,
839
+ "learning_rate": 0.0004944087762932115,
840
+ "loss": 0.0064,
841
+ "step": 191919
842
+ },
843
+ {
844
+ "epoch": 16.8,
845
+ "learning_rate": 0.0004888528994029601,
846
+ "loss": 0.0068,
847
+ "step": 193648
848
+ },
849
+ {
850
+ "epoch": 16.95,
851
+ "learning_rate": 0.0004832970225127088,
852
+ "loss": 0.0075,
853
+ "step": 195377
854
+ },
855
+ {
856
+ "epoch": 17.0,
857
+ "eval_loss": 0.04200902581214905,
858
+ "eval_max_distance": 28,
859
+ "eval_mean_distance": 0,
860
+ "eval_runtime": 22.1607,
861
+ "eval_samples_per_second": 45.125,
862
+ "eval_steps_per_second": 2.256,
863
+ "step": 195942
864
+ },
865
+ {
866
+ "epoch": 17.1,
867
+ "learning_rate": 0.00047774114562245746,
868
+ "loss": 0.0162,
869
+ "step": 197106
870
+ },
871
+ {
872
+ "epoch": 17.25,
873
+ "learning_rate": 0.00047218526873220605,
874
+ "loss": 0.0077,
875
+ "step": 198835
876
+ },
877
+ {
878
+ "epoch": 17.4,
879
+ "learning_rate": 0.00046662939184195475,
880
+ "loss": 0.0063,
881
+ "step": 200564
882
+ },
883
+ {
884
+ "epoch": 17.55,
885
+ "learning_rate": 0.0004610735149517034,
886
+ "loss": 0.0058,
887
+ "step": 202293
888
+ },
889
+ {
890
+ "epoch": 17.7,
891
+ "learning_rate": 0.0004555176380614521,
892
+ "loss": 0.0057,
893
+ "step": 204022
894
+ },
895
+ {
896
+ "epoch": 17.85,
897
+ "learning_rate": 0.0004499617611712007,
898
+ "loss": 0.0059,
899
+ "step": 205751
900
+ },
901
+ {
902
+ "epoch": 18.0,
903
+ "eval_loss": 0.04149915650486946,
904
+ "eval_max_distance": 32,
905
+ "eval_mean_distance": 0,
906
+ "eval_runtime": 22.9895,
907
+ "eval_samples_per_second": 43.498,
908
+ "eval_steps_per_second": 2.175,
909
+ "step": 207468
910
+ },
911
+ {
912
+ "epoch": 18.0,
913
+ "learning_rate": 0.00044440588428094934,
914
+ "loss": 0.0082,
915
+ "step": 207480
916
+ },
917
+ {
918
+ "epoch": 18.15,
919
+ "learning_rate": 0.00043885000739069804,
920
+ "loss": 0.0133,
921
+ "step": 209209
922
+ },
923
+ {
924
+ "epoch": 18.3,
925
+ "learning_rate": 0.00043329413050044663,
926
+ "loss": 0.0063,
927
+ "step": 210938
928
+ },
929
+ {
930
+ "epoch": 18.45,
931
+ "learning_rate": 0.0004277382536101953,
932
+ "loss": 0.0051,
933
+ "step": 212667
934
+ },
935
+ {
936
+ "epoch": 18.6,
937
+ "learning_rate": 0.000422182376719944,
938
+ "loss": 0.0051,
939
+ "step": 214396
940
+ },
941
+ {
942
+ "epoch": 18.75,
943
+ "learning_rate": 0.00041662649982969263,
944
+ "loss": 0.0051,
945
+ "step": 216125
946
+ },
947
+ {
948
+ "epoch": 18.9,
949
+ "learning_rate": 0.0004110706229394413,
950
+ "loss": 0.0054,
951
+ "step": 217854
952
+ },
953
+ {
954
+ "epoch": 19.0,
955
+ "eval_loss": 0.0405677855014801,
956
+ "eval_max_distance": 28,
957
+ "eval_mean_distance": 0,
958
+ "eval_runtime": 21.777,
959
+ "eval_samples_per_second": 45.92,
960
+ "eval_steps_per_second": 2.296,
961
+ "step": 218994
962
+ },
963
+ {
964
+ "epoch": 19.05,
965
+ "learning_rate": 0.0004055147460491899,
966
+ "loss": 0.0117,
967
+ "step": 219583
968
+ },
969
+ {
970
+ "epoch": 19.2,
971
+ "learning_rate": 0.00039995886915893857,
972
+ "loss": 0.0075,
973
+ "step": 221312
974
+ },
975
+ {
976
+ "epoch": 19.35,
977
+ "learning_rate": 0.00039440299226868727,
978
+ "loss": 0.0051,
979
+ "step": 223041
980
+ },
981
+ {
982
+ "epoch": 19.5,
983
+ "learning_rate": 0.00038884711537843586,
984
+ "loss": 0.0046,
985
+ "step": 224770
986
+ },
987
+ {
988
+ "epoch": 19.65,
989
+ "learning_rate": 0.0003832912384881845,
990
+ "loss": 0.0043,
991
+ "step": 226499
992
+ },
993
+ {
994
+ "epoch": 19.8,
995
+ "learning_rate": 0.0003777353615979332,
996
+ "loss": 0.0044,
997
+ "step": 228228
998
+ },
999
+ {
1000
+ "epoch": 19.95,
1001
+ "learning_rate": 0.0003721794847076818,
1002
+ "loss": 0.0046,
1003
+ "step": 229957
1004
+ },
1005
+ {
1006
+ "epoch": 20.0,
1007
+ "eval_loss": 0.03926468640565872,
1008
+ "eval_max_distance": 24,
1009
+ "eval_mean_distance": 0,
1010
+ "eval_runtime": 23.1294,
1011
+ "eval_samples_per_second": 43.235,
1012
+ "eval_steps_per_second": 2.162,
1013
+ "step": 230520
1014
+ },
1015
+ {
1016
+ "epoch": 20.1,
1017
+ "learning_rate": 0.0003666236078174305,
1018
+ "loss": 0.0118,
1019
+ "step": 231686
1020
+ },
1021
+ {
1022
+ "epoch": 20.25,
1023
+ "learning_rate": 0.00036106773092717915,
1024
+ "loss": 0.0056,
1025
+ "step": 233415
1026
+ },
1027
+ {
1028
+ "epoch": 20.4,
1029
+ "learning_rate": 0.0003555118540369278,
1030
+ "loss": 0.0043,
1031
+ "step": 235144
1032
+ },
1033
+ {
1034
+ "epoch": 20.55,
1035
+ "learning_rate": 0.00034995597714667645,
1036
+ "loss": 0.0039,
1037
+ "step": 236873
1038
+ },
1039
+ {
1040
+ "epoch": 20.7,
1041
+ "learning_rate": 0.0003444001002564251,
1042
+ "loss": 0.0037,
1043
+ "step": 238602
1044
+ },
1045
+ {
1046
+ "epoch": 20.85,
1047
+ "learning_rate": 0.00033884422336617374,
1048
+ "loss": 0.004,
1049
+ "step": 240331
1050
+ },
1051
+ {
1052
+ "epoch": 21.0,
1053
+ "eval_loss": 0.04168349876999855,
1054
+ "eval_max_distance": 24,
1055
+ "eval_mean_distance": 0,
1056
+ "eval_runtime": 22.786,
1057
+ "eval_samples_per_second": 43.887,
1058
+ "eval_steps_per_second": 2.194,
1059
+ "step": 242046
1060
+ },
1061
+ {
1062
+ "epoch": 21.0,
1063
+ "learning_rate": 0.0003332883464759224,
1064
+ "loss": 0.0053,
1065
+ "step": 242060
1066
+ },
1067
+ {
1068
+ "epoch": 21.15,
1069
+ "learning_rate": 0.00032773246958567103,
1070
+ "loss": 0.0097,
1071
+ "step": 243789
1072
+ },
1073
+ {
1074
+ "epoch": 21.3,
1075
+ "learning_rate": 0.00032217659269541973,
1076
+ "loss": 0.0044,
1077
+ "step": 245518
1078
+ },
1079
+ {
1080
+ "epoch": 21.45,
1081
+ "learning_rate": 0.0003166207158051684,
1082
+ "loss": 0.0036,
1083
+ "step": 247247
1084
+ },
1085
+ {
1086
+ "epoch": 21.6,
1087
+ "learning_rate": 0.000311064838914917,
1088
+ "loss": 0.0031,
1089
+ "step": 248976
1090
+ },
1091
+ {
1092
+ "epoch": 21.75,
1093
+ "learning_rate": 0.0003055089620246657,
1094
+ "loss": 0.0031,
1095
+ "step": 250705
1096
+ },
1097
+ {
1098
+ "epoch": 21.9,
1099
+ "learning_rate": 0.0002999530851344143,
1100
+ "loss": 0.0034,
1101
+ "step": 252434
1102
+ },
1103
+ {
1104
+ "epoch": 22.0,
1105
+ "eval_loss": 0.040287140756845474,
1106
+ "eval_max_distance": 18,
1107
+ "eval_mean_distance": 0,
1108
+ "eval_runtime": 21.6664,
1109
+ "eval_samples_per_second": 46.155,
1110
+ "eval_steps_per_second": 2.308,
1111
+ "step": 253572
1112
+ },
1113
+ {
1114
+ "epoch": 22.05,
1115
+ "learning_rate": 0.0002943972082441629,
1116
+ "loss": 0.0082,
1117
+ "step": 254163
1118
+ },
1119
+ {
1120
+ "epoch": 22.2,
1121
+ "learning_rate": 0.0002888413313539116,
1122
+ "loss": 0.0053,
1123
+ "step": 255892
1124
+ },
1125
+ {
1126
+ "epoch": 22.35,
1127
+ "learning_rate": 0.00028328545446366026,
1128
+ "loss": 0.0034,
1129
+ "step": 257621
1130
+ },
1131
+ {
1132
+ "epoch": 22.5,
1133
+ "learning_rate": 0.00027772957757340896,
1134
+ "loss": 0.0031,
1135
+ "step": 259350
1136
+ },
1137
+ {
1138
+ "epoch": 22.65,
1139
+ "learning_rate": 0.00027217370068315756,
1140
+ "loss": 0.0029,
1141
+ "step": 261079
1142
+ },
1143
+ {
1144
+ "epoch": 22.8,
1145
+ "learning_rate": 0.0002666178237929062,
1146
+ "loss": 0.0027,
1147
+ "step": 262808
1148
+ },
1149
+ {
1150
+ "epoch": 22.95,
1151
+ "learning_rate": 0.0002610619469026549,
1152
+ "loss": 0.0029,
1153
+ "step": 264537
1154
+ },
1155
+ {
1156
+ "epoch": 23.0,
1157
+ "eval_loss": 0.04222797229886055,
1158
+ "eval_max_distance": 21,
1159
+ "eval_mean_distance": 0,
1160
+ "eval_runtime": 22.874,
1161
+ "eval_samples_per_second": 43.718,
1162
+ "eval_steps_per_second": 2.186,
1163
+ "step": 265098
1164
+ },
1165
+ {
1166
+ "epoch": 23.1,
1167
+ "learning_rate": 0.0002555060700124035,
1168
+ "loss": 0.0081,
1169
+ "step": 266266
1170
+ },
1171
+ {
1172
+ "epoch": 23.25,
1173
+ "learning_rate": 0.00024995019312215214,
1174
+ "loss": 0.0038,
1175
+ "step": 267995
1176
+ },
1177
+ {
1178
+ "epoch": 23.4,
1179
+ "learning_rate": 0.00024439431623190085,
1180
+ "loss": 0.0028,
1181
+ "step": 269724
1182
+ },
1183
+ {
1184
+ "epoch": 23.55,
1185
+ "learning_rate": 0.00023883843934164947,
1186
+ "loss": 0.0025,
1187
+ "step": 271453
1188
+ },
1189
+ {
1190
+ "epoch": 23.7,
1191
+ "learning_rate": 0.00023328256245139814,
1192
+ "loss": 0.0026,
1193
+ "step": 273182
1194
+ },
1195
+ {
1196
+ "epoch": 23.85,
1197
+ "learning_rate": 0.0002277266855611468,
1198
+ "loss": 0.0024,
1199
+ "step": 274911
1200
+ },
1201
+ {
1202
+ "epoch": 24.0,
1203
+ "eval_loss": 0.04101773351430893,
1204
+ "eval_max_distance": 21,
1205
+ "eval_mean_distance": 0,
1206
+ "eval_runtime": 22.8008,
1207
+ "eval_samples_per_second": 43.858,
1208
+ "eval_steps_per_second": 2.193,
1209
+ "step": 276624
1210
+ },
1211
+ {
1212
+ "epoch": 24.0,
1213
+ "learning_rate": 0.00022217080867089543,
1214
+ "loss": 0.0033,
1215
+ "step": 276640
1216
+ },
1217
+ {
1218
+ "epoch": 24.15,
1219
+ "learning_rate": 0.00021661493178064408,
1220
+ "loss": 0.0069,
1221
+ "step": 278369
1222
+ },
1223
+ {
1224
+ "epoch": 24.3,
1225
+ "learning_rate": 0.00021105905489039273,
1226
+ "loss": 0.0029,
1227
+ "step": 280098
1228
+ },
1229
+ {
1230
+ "epoch": 24.45,
1231
+ "learning_rate": 0.0002055031780001414,
1232
+ "loss": 0.0024,
1233
+ "step": 281827
1234
+ },
1235
+ {
1236
+ "epoch": 24.6,
1237
+ "learning_rate": 0.00019994730110989005,
1238
+ "loss": 0.0021,
1239
+ "step": 283556
1240
+ },
1241
+ {
1242
+ "epoch": 24.75,
1243
+ "learning_rate": 0.0001943914242196387,
1244
+ "loss": 0.002,
1245
+ "step": 285285
1246
+ },
1247
+ {
1248
+ "epoch": 24.9,
1249
+ "learning_rate": 0.00018883554732938734,
1250
+ "loss": 0.002,
1251
+ "step": 287014
1252
+ },
1253
+ {
1254
+ "epoch": 25.0,
1255
+ "eval_loss": 0.043479613959789276,
1256
+ "eval_max_distance": 15,
1257
+ "eval_mean_distance": 0,
1258
+ "eval_runtime": 21.481,
1259
+ "eval_samples_per_second": 46.553,
1260
+ "eval_steps_per_second": 2.328,
1261
+ "step": 288150
1262
+ },
1263
+ {
1264
+ "epoch": 25.05,
1265
+ "learning_rate": 0.000183279670439136,
1266
+ "loss": 0.0055,
1267
+ "step": 288743
1268
+ },
1269
+ {
1270
+ "epoch": 25.2,
1271
+ "learning_rate": 0.00017772379354888466,
1272
+ "loss": 0.0038,
1273
+ "step": 290472
1274
+ },
1275
+ {
1276
+ "epoch": 25.35,
1277
+ "learning_rate": 0.00017216791665863328,
1278
+ "loss": 0.0023,
1279
+ "step": 292201
1280
+ },
1281
+ {
1282
+ "epoch": 25.5,
1283
+ "learning_rate": 0.00016661203976838196,
1284
+ "loss": 0.002,
1285
+ "step": 293930
1286
+ },
1287
+ {
1288
+ "epoch": 25.65,
1289
+ "learning_rate": 0.0001610561628781306,
1290
+ "loss": 0.0017,
1291
+ "step": 295659
1292
+ },
1293
+ {
1294
+ "epoch": 25.8,
1295
+ "learning_rate": 0.00015550028598787928,
1296
+ "loss": 0.0018,
1297
+ "step": 297388
1298
+ },
1299
+ {
1300
+ "epoch": 25.95,
1301
+ "learning_rate": 0.0001499444090976279,
1302
+ "loss": 0.0016,
1303
+ "step": 299117
1304
+ },
1305
+ {
1306
+ "epoch": 26.0,
1307
+ "eval_loss": 0.04517431557178497,
1308
+ "eval_max_distance": 15,
1309
+ "eval_mean_distance": 0,
1310
+ "eval_runtime": 22.7192,
1311
+ "eval_samples_per_second": 44.016,
1312
+ "eval_steps_per_second": 2.201,
1313
+ "step": 299676
1314
+ },
1315
+ {
1316
+ "epoch": 26.1,
1317
+ "learning_rate": 0.00014438853220737654,
1318
+ "loss": 0.0054,
1319
+ "step": 300846
1320
+ },
1321
+ {
1322
+ "epoch": 26.25,
1323
+ "learning_rate": 0.00013883265531712522,
1324
+ "loss": 0.0026,
1325
+ "step": 302575
1326
+ },
1327
+ {
1328
+ "epoch": 26.4,
1329
+ "learning_rate": 0.00013327677842687387,
1330
+ "loss": 0.0018,
1331
+ "step": 304304
1332
+ },
1333
+ {
1334
+ "epoch": 26.55,
1335
+ "learning_rate": 0.0001277209015366225,
1336
+ "loss": 0.0016,
1337
+ "step": 306033
1338
+ },
1339
+ {
1340
+ "epoch": 26.7,
1341
+ "learning_rate": 0.00012216502464637116,
1342
+ "loss": 0.0015,
1343
+ "step": 307762
1344
+ },
1345
+ {
1346
+ "epoch": 26.85,
1347
+ "learning_rate": 0.00011660914775611982,
1348
+ "loss": 0.0013,
1349
+ "step": 309491
1350
+ },
1351
+ {
1352
+ "epoch": 27.0,
1353
+ "eval_loss": 0.04144546017050743,
1354
+ "eval_max_distance": 14,
1355
+ "eval_mean_distance": 0,
1356
+ "eval_runtime": 22.686,
1357
+ "eval_samples_per_second": 44.08,
1358
+ "eval_steps_per_second": 2.204,
1359
+ "step": 311202
1360
+ },
1361
+ {
1362
+ "epoch": 27.0,
1363
+ "learning_rate": 0.00011105327086586847,
1364
+ "loss": 0.0021,
1365
+ "step": 311220
1366
+ },
1367
+ {
1368
+ "epoch": 27.15,
1369
+ "learning_rate": 0.00010549739397561713,
1370
+ "loss": 0.0046,
1371
+ "step": 312949
1372
+ },
1373
+ {
1374
+ "epoch": 27.3,
1375
+ "learning_rate": 9.994151708536577e-05,
1376
+ "loss": 0.002,
1377
+ "step": 314678
1378
+ },
1379
+ {
1380
+ "epoch": 27.45,
1381
+ "learning_rate": 9.438564019511443e-05,
1382
+ "loss": 0.0016,
1383
+ "step": 316407
1384
+ },
1385
+ {
1386
+ "epoch": 27.6,
1387
+ "learning_rate": 8.882976330486308e-05,
1388
+ "loss": 0.0013,
1389
+ "step": 318136
1390
+ },
1391
+ {
1392
+ "epoch": 27.75,
1393
+ "learning_rate": 8.327388641461173e-05,
1394
+ "loss": 0.0013,
1395
+ "step": 319865
1396
+ },
1397
+ {
1398
+ "epoch": 27.9,
1399
+ "learning_rate": 7.771800952436038e-05,
1400
+ "loss": 0.0012,
1401
+ "step": 321594
1402
+ },
1403
+ {
1404
+ "epoch": 28.0,
1405
+ "eval_loss": 0.04387975484132767,
1406
+ "eval_max_distance": 14,
1407
+ "eval_mean_distance": 0,
1408
+ "eval_runtime": 21.586,
1409
+ "eval_samples_per_second": 46.326,
1410
+ "eval_steps_per_second": 2.316,
1411
+ "step": 322728
1412
+ },
1413
+ {
1414
+ "epoch": 28.05,
1415
+ "learning_rate": 7.216213263410904e-05,
1416
+ "loss": 0.0037,
1417
+ "step": 323323
1418
+ },
1419
+ {
1420
+ "epoch": 28.2,
1421
+ "learning_rate": 6.660625574385768e-05,
1422
+ "loss": 0.0027,
1423
+ "step": 325052
1424
+ },
1425
+ {
1426
+ "epoch": 28.35,
1427
+ "learning_rate": 6.105037885360634e-05,
1428
+ "loss": 0.0016,
1429
+ "step": 326781
1430
+ },
1431
+ {
1432
+ "epoch": 28.5,
1433
+ "learning_rate": 5.5494501963355e-05,
1434
+ "loss": 0.0013,
1435
+ "step": 328510
1436
+ },
1437
+ {
1438
+ "epoch": 28.65,
1439
+ "learning_rate": 4.9938625073103644e-05,
1440
+ "loss": 0.0011,
1441
+ "step": 330239
1442
+ },
1443
+ {
1444
+ "epoch": 28.8,
1445
+ "learning_rate": 4.43827481828523e-05,
1446
+ "loss": 0.001,
1447
+ "step": 331968
1448
+ },
1449
+ {
1450
+ "epoch": 28.95,
1451
+ "learning_rate": 3.882687129260095e-05,
1452
+ "loss": 0.001,
1453
+ "step": 333697
1454
+ },
1455
+ {
1456
+ "epoch": 29.0,
1457
+ "eval_loss": 0.044395141303539276,
1458
+ "eval_max_distance": 15,
1459
+ "eval_mean_distance": 0,
1460
+ "eval_runtime": 22.6289,
1461
+ "eval_samples_per_second": 44.191,
1462
+ "eval_steps_per_second": 2.21,
1463
+ "step": 334254
1464
+ },
1465
+ {
1466
+ "epoch": 29.1,
1467
+ "learning_rate": 3.32709944023496e-05,
1468
+ "loss": 0.0039,
1469
+ "step": 335426
1470
+ },
1471
+ {
1472
+ "epoch": 29.25,
1473
+ "learning_rate": 2.771511751209825e-05,
1474
+ "loss": 0.0017,
1475
+ "step": 337155
1476
+ },
1477
+ {
1478
+ "epoch": 29.4,
1479
+ "learning_rate": 2.2159240621846906e-05,
1480
+ "loss": 0.0012,
1481
+ "step": 338884
1482
+ },
1483
+ {
1484
+ "epoch": 29.55,
1485
+ "learning_rate": 1.6603363731595556e-05,
1486
+ "loss": 0.0011,
1487
+ "step": 340613
1488
+ },
1489
+ {
1490
+ "epoch": 29.7,
1491
+ "learning_rate": 1.1047486841344207e-05,
1492
+ "loss": 0.001,
1493
+ "step": 342342
1494
+ },
1495
+ {
1496
+ "epoch": 29.85,
1497
+ "learning_rate": 0.0004477394104151002,
1498
+ "loss": 0.0026,
1499
+ "step": 344071
1500
+ },
1501
+ {
1502
+ "epoch": 30.0,
1503
+ "eval_loss": 0.042660146951675415,
1504
+ "eval_max_distance": 19,
1505
+ "eval_mean_distance": 0,
1506
+ "eval_runtime": 48.9711,
1507
+ "eval_samples_per_second": 20.42,
1508
+ "eval_steps_per_second": 1.021,
1509
+ "step": 345780
1510
+ },
1511
+ {
1512
+ "epoch": 30.0,
1513
+ "learning_rate": 0.00044440588428094934,
1514
+ "loss": 0.0045,
1515
+ "step": 345800
1516
+ },
1517
+ {
1518
+ "epoch": 30.15,
1519
+ "learning_rate": 0.00044107235814679855,
1520
+ "loss": 0.0068,
1521
+ "step": 347529
1522
+ },
1523
+ {
1524
+ "epoch": 30.3,
1525
+ "learning_rate": 0.00043773883201264776,
1526
+ "loss": 0.0036,
1527
+ "step": 349258
1528
+ },
1529
+ {
1530
+ "epoch": 30.45,
1531
+ "learning_rate": 0.0004344053058784969,
1532
+ "loss": 0.0032,
1533
+ "step": 350987
1534
+ },
1535
+ {
1536
+ "epoch": 30.6,
1537
+ "learning_rate": 0.0004310717797443461,
1538
+ "loss": 0.0033,
1539
+ "step": 352716
1540
+ },
1541
+ {
1542
+ "epoch": 30.75,
1543
+ "learning_rate": 0.0004277382536101953,
1544
+ "loss": 0.0034,
1545
+ "step": 354445
1546
+ },
1547
+ {
1548
+ "epoch": 30.9,
1549
+ "learning_rate": 0.00042440472747604454,
1550
+ "loss": 0.004,
1551
+ "step": 356174
1552
+ },
1553
+ {
1554
+ "epoch": 23.29,
1555
+ "learning_rate": 0.000593547500777279,
1556
+ "loss": 0.0057,
1557
+ "step": 357903
1558
+ },
1559
+ {
1560
+ "epoch": 23.4,
1561
+ "learning_rate": 0.0005910471934809801,
1562
+ "loss": 0.0053,
1563
+ "step": 359632
1564
+ },
1565
+ {
1566
+ "epoch": 23.52,
1567
+ "learning_rate": 0.0005885468861846814,
1568
+ "loss": 0.0055,
1569
+ "step": 361361
1570
+ },
1571
+ {
1572
+ "epoch": 23.63,
1573
+ "learning_rate": 0.0005860465788883828,
1574
+ "loss": 0.0056,
1575
+ "step": 363090
1576
+ },
1577
+ {
1578
+ "epoch": 23.74,
1579
+ "learning_rate": 0.0005835462715920841,
1580
+ "loss": 0.006,
1581
+ "step": 364819
1582
+ },
1583
+ {
1584
+ "epoch": 23.85,
1585
+ "learning_rate": 0.0005810459642957853,
1586
+ "loss": 0.0064,
1587
+ "step": 366548
1588
+ },
1589
+ {
1590
+ "epoch": 23.97,
1591
+ "learning_rate": 0.0005785456569994867,
1592
+ "loss": 0.0077,
1593
+ "step": 368277
1594
+ },
1595
+ {
1596
+ "epoch": 24.0,
1597
+ "eval_loss": 0.049542564898729324,
1598
+ "eval_max_distance": 27,
1599
+ "eval_mean_distance": 0,
1600
+ "eval_runtime": 72.4399,
1601
+ "eval_samples_per_second": 13.805,
1602
+ "eval_steps_per_second": 0.925,
1603
+ "step": 368808
1604
+ },
1605
+ {
1606
+ "epoch": 24.08,
1607
+ "learning_rate": 0.0005760453497031879,
1608
+ "loss": 0.0158,
1609
+ "step": 370006
1610
+ },
1611
+ {
1612
+ "epoch": 24.19,
1613
+ "learning_rate": 0.0005735450424068892,
1614
+ "loss": 0.0082,
1615
+ "step": 371735
1616
+ },
1617
+ {
1618
+ "epoch": 24.3,
1619
+ "learning_rate": 0.0005710447351105905,
1620
+ "loss": 0.0057,
1621
+ "step": 373464
1622
+ },
1623
+ {
1624
+ "epoch": 24.42,
1625
+ "learning_rate": 0.0005685444278142918,
1626
+ "loss": 0.0059,
1627
+ "step": 375193
1628
+ },
1629
+ {
1630
+ "epoch": 24.53,
1631
+ "learning_rate": 0.0005660441205179932,
1632
+ "loss": 0.0055,
1633
+ "step": 376922
1634
+ },
1635
+ {
1636
+ "epoch": 24.64,
1637
+ "learning_rate": 0.0005635438132216944,
1638
+ "loss": 0.0062,
1639
+ "step": 378651
1640
+ },
1641
+ {
1642
+ "epoch": 24.75,
1643
+ "learning_rate": 0.0005610435059253957,
1644
+ "loss": 0.0059,
1645
+ "step": 380380
1646
+ },
1647
+ {
1648
+ "epoch": 24.87,
1649
+ "learning_rate": 0.000558543198629097,
1650
+ "loss": 0.0062,
1651
+ "step": 382109
1652
+ },
1653
+ {
1654
+ "epoch": 24.98,
1655
+ "learning_rate": 0.0005560428913327982,
1656
+ "loss": 0.0083,
1657
+ "step": 383838
1658
+ },
1659
+ {
1660
+ "epoch": 25.0,
1661
+ "eval_loss": 0.044624801725149155,
1662
+ "eval_max_distance": 37,
1663
+ "eval_mean_distance": 0,
1664
+ "eval_runtime": 35.859,
1665
+ "eval_samples_per_second": 27.887,
1666
+ "eval_steps_per_second": 1.868,
1667
+ "step": 384175
1668
+ },
1669
+ {
1670
+ "epoch": 25.09,
1671
+ "learning_rate": 0.0005535425840364996,
1672
+ "loss": 0.0154,
1673
+ "step": 385567
1674
+ },
1675
+ {
1676
+ "epoch": 25.2,
1677
+ "learning_rate": 0.0005510422767402009,
1678
+ "loss": 0.0075,
1679
+ "step": 387296
1680
+ },
1681
+ {
1682
+ "epoch": 25.32,
1683
+ "learning_rate": 0.0005485419694439023,
1684
+ "loss": 0.006,
1685
+ "step": 389025
1686
+ },
1687
+ {
1688
+ "epoch": 25.43,
1689
+ "learning_rate": 0.0005460416621476034,
1690
+ "loss": 0.0057,
1691
+ "step": 390754
1692
+ },
1693
+ {
1694
+ "epoch": 25.54,
1695
+ "learning_rate": 0.0005435413548513047,
1696
+ "loss": 0.0057,
1697
+ "step": 392483
1698
+ },
1699
+ {
1700
+ "epoch": 25.65,
1701
+ "learning_rate": 0.0005410410475550061,
1702
+ "loss": 0.0056,
1703
+ "step": 394212
1704
+ },
1705
+ {
1706
+ "epoch": 25.77,
1707
+ "learning_rate": 0.0005385407402587073,
1708
+ "loss": 0.0059,
1709
+ "step": 395941
1710
+ },
1711
+ {
1712
+ "epoch": 25.88,
1713
+ "learning_rate": 0.0005360404329624087,
1714
+ "loss": 0.0067,
1715
+ "step": 397670
1716
+ },
1717
+ {
1718
+ "epoch": 25.99,
1719
+ "learning_rate": 0.0005335401256661099,
1720
+ "loss": 0.0078,
1721
+ "step": 399399
1722
+ },
1723
+ {
1724
+ "epoch": 26.0,
1725
+ "eval_loss": 0.048115409910678864,
1726
+ "eval_max_distance": 47,
1727
+ "eval_mean_distance": 0,
1728
+ "eval_runtime": 35.8872,
1729
+ "eval_samples_per_second": 27.865,
1730
+ "eval_steps_per_second": 1.867,
1731
+ "step": 399542
1732
+ },
1733
+ {
1734
+ "epoch": 26.1,
1735
+ "learning_rate": 0.0005310398183698112,
1736
+ "loss": 0.0151,
1737
+ "step": 401128
1738
+ },
1739
+ {
1740
+ "epoch": 26.22,
1741
+ "learning_rate": 0.0005285395110735125,
1742
+ "loss": 0.0068,
1743
+ "step": 402857
1744
+ },
1745
+ {
1746
+ "epoch": 26.33,
1747
+ "learning_rate": 0.0005260392037772138,
1748
+ "loss": 0.0055,
1749
+ "step": 404586
1750
+ },
1751
+ {
1752
+ "epoch": 26.44,
1753
+ "learning_rate": 0.0005235388964809152,
1754
+ "loss": 0.0052,
1755
+ "step": 406315
1756
+ },
1757
+ {
1758
+ "epoch": 26.55,
1759
+ "learning_rate": 0.0005210385891846165,
1760
+ "loss": 0.0054,
1761
+ "step": 408044
1762
+ },
1763
+ {
1764
+ "epoch": 26.67,
1765
+ "learning_rate": 0.0005185382818883176,
1766
+ "loss": 0.0053,
1767
+ "step": 409773
1768
+ },
1769
+ {
1770
+ "epoch": 26.78,
1771
+ "learning_rate": 0.000516037974592019,
1772
+ "loss": 0.0052,
1773
+ "step": 411502
1774
+ },
1775
+ {
1776
+ "epoch": 26.89,
1777
+ "learning_rate": 0.0005135376672957203,
1778
+ "loss": 0.006,
1779
+ "step": 413231
1780
+ },
1781
+ {
1782
+ "epoch": 27.0,
1783
+ "eval_loss": 0.042408570647239685,
1784
+ "eval_max_distance": 37,
1785
+ "eval_mean_distance": 0,
1786
+ "eval_runtime": 14.3824,
1787
+ "eval_samples_per_second": 69.529,
1788
+ "eval_steps_per_second": 4.658,
1789
+ "step": 414909
1790
+ },
1791
+ {
1792
+ "epoch": 27.0,
1793
+ "learning_rate": 0.0005110373599994215,
1794
+ "loss": 0.0095,
1795
+ "step": 414960
1796
+ },
1797
+ {
1798
+ "epoch": 27.12,
1799
+ "learning_rate": 0.0005085370527031229,
1800
+ "loss": 0.0115,
1801
+ "step": 416689
1802
+ },
1803
+ {
1804
+ "epoch": 27.23,
1805
+ "learning_rate": 0.0005060367454068242,
1806
+ "loss": 0.006,
1807
+ "step": 418418
1808
+ },
1809
+ {
1810
+ "epoch": 27.34,
1811
+ "learning_rate": 0.0005035364381105254,
1812
+ "loss": 0.0051,
1813
+ "step": 420147
1814
+ },
1815
+ {
1816
+ "epoch": 27.45,
1817
+ "learning_rate": 0.0005010361308142267,
1818
+ "loss": 0.0047,
1819
+ "step": 421876
1820
+ },
1821
+ {
1822
+ "epoch": 27.57,
1823
+ "learning_rate": 0.000498535823517928,
1824
+ "loss": 0.0047,
1825
+ "step": 423605
1826
+ },
1827
+ {
1828
+ "epoch": 27.68,
1829
+ "learning_rate": 0.0004960355162216294,
1830
+ "loss": 0.0054,
1831
+ "step": 425334
1832
+ },
1833
+ {
1834
+ "epoch": 27.79,
1835
+ "learning_rate": 0.0004935352089253306,
1836
+ "loss": 0.0052,
1837
+ "step": 427063
1838
+ },
1839
+ {
1840
+ "epoch": 27.9,
1841
+ "learning_rate": 0.000491034901629032,
1842
+ "loss": 0.0056,
1843
+ "step": 428792
1844
+ },
1845
+ {
1846
+ "epoch": 28.0,
1847
+ "eval_loss": 0.04393070191144943,
1848
+ "eval_max_distance": 22,
1849
+ "eval_mean_distance": 0,
1850
+ "eval_runtime": 26.5483,
1851
+ "eval_samples_per_second": 37.667,
1852
+ "eval_steps_per_second": 2.524,
1853
+ "step": 430276
1854
+ },
1855
+ {
1856
+ "epoch": 28.02,
1857
+ "learning_rate": 0.0004885345943327332,
1858
+ "loss": 0.0105,
1859
+ "step": 430521
1860
+ },
1861
+ {
1862
+ "epoch": 28.13,
1863
+ "learning_rate": 0.0004860342870364345,
1864
+ "loss": 0.0093,
1865
+ "step": 432250
1866
+ },
1867
+ {
1868
+ "epoch": 28.24,
1869
+ "learning_rate": 0.0004835339797401358,
1870
+ "loss": 0.0055,
1871
+ "step": 433979
1872
+ },
1873
+ {
1874
+ "epoch": 28.35,
1875
+ "learning_rate": 0.0004810336724438371,
1876
+ "loss": 0.0049,
1877
+ "step": 435708
1878
+ },
1879
+ {
1880
+ "epoch": 28.47,
1881
+ "learning_rate": 0.00047853336514753835,
1882
+ "loss": 0.0044,
1883
+ "step": 437437
1884
+ },
1885
+ {
1886
+ "epoch": 28.58,
1887
+ "learning_rate": 0.0004760330578512397,
1888
+ "loss": 0.0044,
1889
+ "step": 439166
1890
+ },
1891
+ {
1892
+ "epoch": 28.69,
1893
+ "learning_rate": 0.000473532750554941,
1894
+ "loss": 0.0047,
1895
+ "step": 440895
1896
+ },
1897
+ {
1898
+ "epoch": 28.8,
1899
+ "learning_rate": 0.0004710324432586423,
1900
+ "loss": 0.0047,
1901
+ "step": 442624
1902
+ },
1903
+ {
1904
+ "epoch": 28.92,
1905
+ "learning_rate": 0.00046853213596234356,
1906
+ "loss": 0.0054,
1907
+ "step": 444353
1908
+ },
1909
+ {
1910
+ "epoch": 29.0,
1911
+ "eval_loss": 0.04814203828573227,
1912
+ "eval_max_distance": 23,
1913
+ "eval_mean_distance": 0,
1914
+ "eval_runtime": 35.122,
1915
+ "eval_samples_per_second": 28.472,
1916
+ "eval_steps_per_second": 1.908,
1917
+ "step": 445643
1918
+ },
1919
+ {
1920
+ "epoch": 29.03,
1921
+ "learning_rate": 0.00046603182866604485,
1922
+ "loss": 0.0103,
1923
+ "step": 446082
1924
+ },
1925
+ {
1926
+ "epoch": 29.14,
1927
+ "learning_rate": 0.00046353152136974614,
1928
+ "loss": 0.0076,
1929
+ "step": 447811
1930
+ },
1931
+ {
1932
+ "epoch": 29.25,
1933
+ "learning_rate": 0.00046103121407344743,
1934
+ "loss": 0.005,
1935
+ "step": 449540
1936
+ },
1937
+ {
1938
+ "epoch": 29.37,
1939
+ "learning_rate": 0.00045853090677714877,
1940
+ "loss": 0.0043,
1941
+ "step": 451269
1942
+ },
1943
+ {
1944
+ "epoch": 29.48,
1945
+ "learning_rate": 0.00045603059948085,
1946
+ "loss": 0.0043,
1947
+ "step": 452998
1948
+ },
1949
+ {
1950
+ "epoch": 29.59,
1951
+ "learning_rate": 0.00045353029218455135,
1952
+ "loss": 0.004,
1953
+ "step": 454727
1954
+ },
1955
+ {
1956
+ "epoch": 29.7,
1957
+ "learning_rate": 0.0004510299848882526,
1958
+ "loss": 0.0042,
1959
+ "step": 456456
1960
+ },
1961
+ {
1962
+ "epoch": 29.82,
1963
+ "learning_rate": 6.808721912515757e-06,
1964
+ "loss": 0.0039,
1965
+ "step": 458185
1966
+ },
1967
+ {
1968
+ "epoch": 29.93,
1969
+ "learning_rate": 2.641543085351246e-06,
1970
+ "loss": 0.004,
1971
+ "step": 459914
1972
+ },
1973
+ {
1974
+ "epoch": 30.0,
1975
+ "eval_loss": 0.04417673125863075,
1976
+ "eval_max_distance": 25,
1977
+ "eval_mean_distance": 0,
1978
+ "eval_runtime": 68.6071,
1979
+ "eval_samples_per_second": 14.576,
1980
+ "eval_steps_per_second": 0.977,
1981
+ "step": 461010
1982
+ },
1983
+ {
1984
+ "epoch": 30.0,
1985
+ "step": 461010,
1986
+ "total_flos": 1.1619438964958822e+17,
1987
+ "train_loss": 3.931375028864634e-05,
1988
+ "train_runtime": 411.3459,
1989
+ "train_samples_per_second": 16810.814,
1990
+ "train_steps_per_second": 1120.736
1991
+ }
1992
+ ],
1993
+ "logging_steps": 1729,
1994
+ "max_steps": 461010,
1995
+ "num_train_epochs": 30,
1996
+ "save_steps": 3458,
1997
+ "total_flos": 1.1619438964958822e+17,
1998
+ "trial_name": null,
1999
+ "trial_params": null
2000
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d22aeb992bd9320e28438aa9818e67e4f3f8675fde0b30b307f79b248f69c20
3
+ size 4091