alexue4 committed on
Commit
4148ae4
1 Parent(s): a917f64

End of training

Browse files
README.md CHANGED
@@ -15,9 +15,9 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  This model is a fine-tuned version of [alexue4/text-normalization-ru-new](https://huggingface.co/alexue4/text-normalization-ru-new) on the None dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.0003
19
  - Mean Distance: 0
20
- - Max Distance: 0
21
 
22
  ## Model description
23
 
@@ -37,48 +37,38 @@ More information needed
37
 
38
  The following hyperparameters were used during training:
39
  - learning_rate: 0.0001
40
- - train_batch_size: 30
41
- - eval_batch_size: 30
42
  - seed: 42
43
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
  - lr_scheduler_type: linear
45
  - lr_scheduler_warmup_ratio: 0.1
46
- - num_epochs: 25
47
 
48
  ### Training results
49
 
50
- | Training Loss | Epoch | Step | Validation Loss | Mean Distance | Max Distance |
51
- |:-------------:|:-----:|:----:|:---------------:|:-------------:|:------------:|
52
- | 0.0013 | 1.0 | 69 | 0.0028 | 0 | 2 |
53
- | 0.0006 | 2.0 | 138 | 0.0026 | 0 | 3 |
54
- | 0.0025 | 3.0 | 207 | 0.0039 | 0 | 3 |
55
- | 0.0004 | 4.0 | 276 | 0.0037 | 0 | 3 |
56
- | 0.0005 | 5.0 | 345 | 0.0091 | 0 | 3 |
57
- | 0.0009 | 6.0 | 414 | 0.0006 | 0 | 0 |
58
- | 0.0016 | 7.0 | 483 | 0.0003 | 0 | 0 |
59
- | 0.0012 | 8.0 | 552 | 0.0111 | 0 | 5 |
60
- | 0.0008 | 9.0 | 621 | 0.0004 | 0 | 0 |
61
- | 0.0018 | 10.0 | 690 | 0.0003 | 0 | 0 |
62
- | 0.0028 | 11.0 | 759 | 0.0003 | 0 | 0 |
63
- | 0.0008 | 12.0 | 828 | 0.0003 | 0 | 0 |
64
- | 0.001 | 13.0 | 897 | 0.0004 | 0 | 2 |
65
- | 0.0026 | 14.0 | 966 | 0.0005 | 0 | 2 |
66
- | 0.0015 | 15.0 | 1035 | 0.0007 | 0 | 3 |
67
- | 0.0009 | 16.0 | 1104 | 0.0007 | 0 | 3 |
68
- | 0.0014 | 17.0 | 1173 | 0.0003 | 0 | 0 |
69
- | 0.001 | 18.0 | 1242 | 0.0004 | 0 | 0 |
70
- | 0.0007 | 19.0 | 1311 | 0.0013 | 0 | 3 |
71
- | 0.0013 | 20.0 | 1380 | 0.0013 | 0 | 3 |
72
- | 0.0007 | 21.0 | 1449 | 0.0003 | 0 | 0 |
73
- | 0.0016 | 22.0 | 1518 | 0.0003 | 0 | 0 |
74
- | 0.0013 | 23.0 | 1587 | 0.0003 | 0 | 0 |
75
- | 0.0004 | 24.0 | 1656 | 0.0003 | 0 | 0 |
76
- | 0.001 | 25.0 | 1725 | 0.0003 | 0 | 0 |
77
 
78
 
79
  ### Framework versions
80
 
81
- - Transformers 4.32.1
82
- - Pytorch 2.0.1+cu117
83
- - Datasets 2.14.4
84
- - Tokenizers 0.13.3
 
15
 
16
  This model is a fine-tuned version of [alexue4/text-normalization-ru-new](https://huggingface.co/alexue4/text-normalization-ru-new) on the None dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: 0.0008
19
  - Mean Distance: 0
20
+ - Max Distance: 3
21
 
22
  ## Model description
23
 
 
37
 
38
  The following hyperparameters were used during training:
39
  - learning_rate: 0.0001
40
+ - train_batch_size: 150
41
+ - eval_batch_size: 150
42
  - seed: 42
43
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
  - lr_scheduler_type: linear
45
  - lr_scheduler_warmup_ratio: 0.1
46
+ - num_epochs: 15
47
 
48
  ### Training results
49
 
50
+ | Training Loss | Epoch | Step | Validation Loss | Mean Distance | Max Distance |
51
+ |:-------------:|:-----:|:-----:|:---------------:|:-------------:|:------------:|
52
+ | 0.0004 | 1.0 | 3059 | 0.0022 | 0 | 12 |
53
+ | 0.0003 | 2.0 | 6118 | 0.0011 | 0 | 5 |
54
+ | 0.0003 | 3.0 | 9177 | 0.0010 | 0 | 6 |
55
+ | 0.0003 | 4.0 | 12236 | 0.0012 | 0 | 3 |
56
+ | 0.0003 | 5.0 | 15295 | 0.0008 | 0 | 3 |
57
+ | 0.0002 | 6.0 | 18354 | 0.0009 | 0 | 3 |
58
+ | 0.0002 | 7.0 | 21413 | 0.0008 | 0 | 3 |
59
+ | 0.0002 | 8.0 | 24472 | 0.0008 | 0 | 3 |
60
+ | 0.0002 | 9.0 | 27531 | 0.0007 | 0 | 3 |
61
+ | 0.0002 | 10.0 | 30590 | 0.0008 | 0 | 3 |
62
+ | 0.0002 | 11.0 | 33649 | 0.0008 | 0 | 3 |
63
+ | 0.0002 | 12.0 | 36708 | 0.0008 | 0 | 3 |
64
+ | 0.0002 | 13.0 | 39767 | 0.0008 | 0 | 3 |
65
+ | 0.0002 | 14.0 | 42826 | 0.0008 | 0 | 3 |
66
+ | 0.0002 | 15.0 | 45885 | 0.0008 | 0 | 3 |
 
 
 
 
 
 
 
 
 
 
67
 
68
 
69
  ### Framework versions
70
 
71
+ - Transformers 4.37.2
72
+ - Pytorch 2.1.0+cu121
73
+ - Datasets 2.16.1
74
+ - Tokenizers 0.15.1
added_tokens.json ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<extra_id_0>": 20199,
3
+ "<extra_id_10>": 20189,
4
+ "<extra_id_11>": 20188,
5
+ "<extra_id_12>": 20187,
6
+ "<extra_id_13>": 20186,
7
+ "<extra_id_14>": 20185,
8
+ "<extra_id_15>": 20184,
9
+ "<extra_id_16>": 20183,
10
+ "<extra_id_17>": 20182,
11
+ "<extra_id_18>": 20181,
12
+ "<extra_id_19>": 20180,
13
+ "<extra_id_1>": 20198,
14
+ "<extra_id_20>": 20179,
15
+ "<extra_id_21>": 20178,
16
+ "<extra_id_22>": 20177,
17
+ "<extra_id_23>": 20176,
18
+ "<extra_id_24>": 20175,
19
+ "<extra_id_25>": 20174,
20
+ "<extra_id_26>": 20173,
21
+ "<extra_id_27>": 20172,
22
+ "<extra_id_28>": 20171,
23
+ "<extra_id_29>": 20170,
24
+ "<extra_id_2>": 20197,
25
+ "<extra_id_30>": 20169,
26
+ "<extra_id_31>": 20168,
27
+ "<extra_id_32>": 20167,
28
+ "<extra_id_33>": 20166,
29
+ "<extra_id_34>": 20165,
30
+ "<extra_id_35>": 20164,
31
+ "<extra_id_36>": 20163,
32
+ "<extra_id_37>": 20162,
33
+ "<extra_id_38>": 20161,
34
+ "<extra_id_39>": 20160,
35
+ "<extra_id_3>": 20196,
36
+ "<extra_id_40>": 20159,
37
+ "<extra_id_41>": 20158,
38
+ "<extra_id_42>": 20157,
39
+ "<extra_id_43>": 20156,
40
+ "<extra_id_44>": 20155,
41
+ "<extra_id_45>": 20154,
42
+ "<extra_id_46>": 20153,
43
+ "<extra_id_47>": 20152,
44
+ "<extra_id_48>": 20151,
45
+ "<extra_id_49>": 20150,
46
+ "<extra_id_4>": 20195,
47
+ "<extra_id_50>": 20149,
48
+ "<extra_id_51>": 20148,
49
+ "<extra_id_52>": 20147,
50
+ "<extra_id_53>": 20146,
51
+ "<extra_id_54>": 20145,
52
+ "<extra_id_55>": 20144,
53
+ "<extra_id_56>": 20143,
54
+ "<extra_id_57>": 20142,
55
+ "<extra_id_58>": 20141,
56
+ "<extra_id_59>": 20140,
57
+ "<extra_id_5>": 20194,
58
+ "<extra_id_60>": 20139,
59
+ "<extra_id_61>": 20138,
60
+ "<extra_id_62>": 20137,
61
+ "<extra_id_63>": 20136,
62
+ "<extra_id_64>": 20135,
63
+ "<extra_id_65>": 20134,
64
+ "<extra_id_66>": 20133,
65
+ "<extra_id_67>": 20132,
66
+ "<extra_id_68>": 20131,
67
+ "<extra_id_69>": 20130,
68
+ "<extra_id_6>": 20193,
69
+ "<extra_id_70>": 20129,
70
+ "<extra_id_71>": 20128,
71
+ "<extra_id_72>": 20127,
72
+ "<extra_id_73>": 20126,
73
+ "<extra_id_74>": 20125,
74
+ "<extra_id_75>": 20124,
75
+ "<extra_id_76>": 20123,
76
+ "<extra_id_77>": 20122,
77
+ "<extra_id_78>": 20121,
78
+ "<extra_id_79>": 20120,
79
+ "<extra_id_7>": 20192,
80
+ "<extra_id_80>": 20119,
81
+ "<extra_id_81>": 20118,
82
+ "<extra_id_82>": 20117,
83
+ "<extra_id_83>": 20116,
84
+ "<extra_id_84>": 20115,
85
+ "<extra_id_85>": 20114,
86
+ "<extra_id_86>": 20113,
87
+ "<extra_id_87>": 20112,
88
+ "<extra_id_88>": 20111,
89
+ "<extra_id_89>": 20110,
90
+ "<extra_id_8>": 20191,
91
+ "<extra_id_90>": 20109,
92
+ "<extra_id_91>": 20108,
93
+ "<extra_id_92>": 20107,
94
+ "<extra_id_93>": 20106,
95
+ "<extra_id_94>": 20105,
96
+ "<extra_id_95>": 20104,
97
+ "<extra_id_96>": 20103,
98
+ "<extra_id_97>": 20102,
99
+ "<extra_id_98>": 20101,
100
+ "<extra_id_99>": 20100,
101
+ "<extra_id_9>": 20190
102
+ }
config.json CHANGED
@@ -26,7 +26,7 @@
26
  "tie_word_embeddings": false,
27
  "tokenizer_class": "T5Tokenizer",
28
  "torch_dtype": "float32",
29
- "transformers_version": "4.32.1",
30
  "use_cache": true,
31
  "vocab_size": 20100
32
  }
 
26
  "tie_word_embeddings": false,
27
  "tokenizer_class": "T5Tokenizer",
28
  "torch_dtype": "float32",
29
+ "transformers_version": "4.37.2",
30
  "use_cache": true,
31
  "vocab_size": 20100
32
  }
generation_config.json CHANGED
@@ -3,5 +3,5 @@
3
  "decoder_start_token_id": 0,
4
  "eos_token_id": 1,
5
  "pad_token_id": 0,
6
- "transformers_version": "4.32.1"
7
  }
 
3
  "decoder_start_token_id": 0,
4
  "eos_token_id": 1,
5
  "pad_token_id": 0,
6
+ "transformers_version": "4.37.2"
7
  }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6110aaf653c8f40296e4b2ecfada2f35edce407a76dc3ea7e879357e5b60b1fb
3
+ size 258600360
runs/Feb09_09-32-14_53af0da22b49/events.out.tfevents.1707471138.53af0da22b49.7344.6 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2710126dbff402c0f7c727617f35948c443e13334be9e4d553f6fd3f73aa4b19
3
+ size 47147
special_tokens_map.json CHANGED
@@ -101,7 +101,25 @@
101
  "<extra_id_98>",
102
  "<extra_id_99>"
103
  ],
104
- "eos_token": "</s>",
105
- "pad_token": "<pad>",
106
- "unk_token": "<unk>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  }
 
101
  "<extra_id_98>",
102
  "<extra_id_99>"
103
  ],
104
+ "eos_token": {
105
+ "content": "</s>",
106
+ "lstrip": false,
107
+ "normalized": false,
108
+ "rstrip": false,
109
+ "single_word": false
110
+ },
111
+ "pad_token": {
112
+ "content": "<pad>",
113
+ "lstrip": false,
114
+ "normalized": false,
115
+ "rstrip": false,
116
+ "single_word": false
117
+ },
118
+ "unk_token": {
119
+ "content": "<unk>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false
124
+ }
125
  }
tokenizer_config.json CHANGED
@@ -1,4 +1,830 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "additional_special_tokens": [
3
  "<extra_id_0>",
4
  "<extra_id_1>",
 
1
  {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "</s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<unk>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "20100": {
28
+ "content": "<extra_id_99>",
29
+ "lstrip": true,
30
+ "normalized": false,
31
+ "rstrip": true,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "20101": {
36
+ "content": "<extra_id_98>",
37
+ "lstrip": true,
38
+ "normalized": false,
39
+ "rstrip": true,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "20102": {
44
+ "content": "<extra_id_97>",
45
+ "lstrip": true,
46
+ "normalized": false,
47
+ "rstrip": true,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "20103": {
52
+ "content": "<extra_id_96>",
53
+ "lstrip": true,
54
+ "normalized": false,
55
+ "rstrip": true,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "20104": {
60
+ "content": "<extra_id_95>",
61
+ "lstrip": true,
62
+ "normalized": false,
63
+ "rstrip": true,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "20105": {
68
+ "content": "<extra_id_94>",
69
+ "lstrip": true,
70
+ "normalized": false,
71
+ "rstrip": true,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "20106": {
76
+ "content": "<extra_id_93>",
77
+ "lstrip": true,
78
+ "normalized": false,
79
+ "rstrip": true,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "20107": {
84
+ "content": "<extra_id_92>",
85
+ "lstrip": true,
86
+ "normalized": false,
87
+ "rstrip": true,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "20108": {
92
+ "content": "<extra_id_91>",
93
+ "lstrip": true,
94
+ "normalized": false,
95
+ "rstrip": true,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "20109": {
100
+ "content": "<extra_id_90>",
101
+ "lstrip": true,
102
+ "normalized": false,
103
+ "rstrip": true,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "20110": {
108
+ "content": "<extra_id_89>",
109
+ "lstrip": true,
110
+ "normalized": false,
111
+ "rstrip": true,
112
+ "single_word": false,
113
+ "special": true
114
+ },
115
+ "20111": {
116
+ "content": "<extra_id_88>",
117
+ "lstrip": true,
118
+ "normalized": false,
119
+ "rstrip": true,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "20112": {
124
+ "content": "<extra_id_87>",
125
+ "lstrip": true,
126
+ "normalized": false,
127
+ "rstrip": true,
128
+ "single_word": false,
129
+ "special": true
130
+ },
131
+ "20113": {
132
+ "content": "<extra_id_86>",
133
+ "lstrip": true,
134
+ "normalized": false,
135
+ "rstrip": true,
136
+ "single_word": false,
137
+ "special": true
138
+ },
139
+ "20114": {
140
+ "content": "<extra_id_85>",
141
+ "lstrip": true,
142
+ "normalized": false,
143
+ "rstrip": true,
144
+ "single_word": false,
145
+ "special": true
146
+ },
147
+ "20115": {
148
+ "content": "<extra_id_84>",
149
+ "lstrip": true,
150
+ "normalized": false,
151
+ "rstrip": true,
152
+ "single_word": false,
153
+ "special": true
154
+ },
155
+ "20116": {
156
+ "content": "<extra_id_83>",
157
+ "lstrip": true,
158
+ "normalized": false,
159
+ "rstrip": true,
160
+ "single_word": false,
161
+ "special": true
162
+ },
163
+ "20117": {
164
+ "content": "<extra_id_82>",
165
+ "lstrip": true,
166
+ "normalized": false,
167
+ "rstrip": true,
168
+ "single_word": false,
169
+ "special": true
170
+ },
171
+ "20118": {
172
+ "content": "<extra_id_81>",
173
+ "lstrip": true,
174
+ "normalized": false,
175
+ "rstrip": true,
176
+ "single_word": false,
177
+ "special": true
178
+ },
179
+ "20119": {
180
+ "content": "<extra_id_80>",
181
+ "lstrip": true,
182
+ "normalized": false,
183
+ "rstrip": true,
184
+ "single_word": false,
185
+ "special": true
186
+ },
187
+ "20120": {
188
+ "content": "<extra_id_79>",
189
+ "lstrip": true,
190
+ "normalized": false,
191
+ "rstrip": true,
192
+ "single_word": false,
193
+ "special": true
194
+ },
195
+ "20121": {
196
+ "content": "<extra_id_78>",
197
+ "lstrip": true,
198
+ "normalized": false,
199
+ "rstrip": true,
200
+ "single_word": false,
201
+ "special": true
202
+ },
203
+ "20122": {
204
+ "content": "<extra_id_77>",
205
+ "lstrip": true,
206
+ "normalized": false,
207
+ "rstrip": true,
208
+ "single_word": false,
209
+ "special": true
210
+ },
211
+ "20123": {
212
+ "content": "<extra_id_76>",
213
+ "lstrip": true,
214
+ "normalized": false,
215
+ "rstrip": true,
216
+ "single_word": false,
217
+ "special": true
218
+ },
219
+ "20124": {
220
+ "content": "<extra_id_75>",
221
+ "lstrip": true,
222
+ "normalized": false,
223
+ "rstrip": true,
224
+ "single_word": false,
225
+ "special": true
226
+ },
227
+ "20125": {
228
+ "content": "<extra_id_74>",
229
+ "lstrip": true,
230
+ "normalized": false,
231
+ "rstrip": true,
232
+ "single_word": false,
233
+ "special": true
234
+ },
235
+ "20126": {
236
+ "content": "<extra_id_73>",
237
+ "lstrip": true,
238
+ "normalized": false,
239
+ "rstrip": true,
240
+ "single_word": false,
241
+ "special": true
242
+ },
243
+ "20127": {
244
+ "content": "<extra_id_72>",
245
+ "lstrip": true,
246
+ "normalized": false,
247
+ "rstrip": true,
248
+ "single_word": false,
249
+ "special": true
250
+ },
251
+ "20128": {
252
+ "content": "<extra_id_71>",
253
+ "lstrip": true,
254
+ "normalized": false,
255
+ "rstrip": true,
256
+ "single_word": false,
257
+ "special": true
258
+ },
259
+ "20129": {
260
+ "content": "<extra_id_70>",
261
+ "lstrip": true,
262
+ "normalized": false,
263
+ "rstrip": true,
264
+ "single_word": false,
265
+ "special": true
266
+ },
267
+ "20130": {
268
+ "content": "<extra_id_69>",
269
+ "lstrip": true,
270
+ "normalized": false,
271
+ "rstrip": true,
272
+ "single_word": false,
273
+ "special": true
274
+ },
275
+ "20131": {
276
+ "content": "<extra_id_68>",
277
+ "lstrip": true,
278
+ "normalized": false,
279
+ "rstrip": true,
280
+ "single_word": false,
281
+ "special": true
282
+ },
283
+ "20132": {
284
+ "content": "<extra_id_67>",
285
+ "lstrip": true,
286
+ "normalized": false,
287
+ "rstrip": true,
288
+ "single_word": false,
289
+ "special": true
290
+ },
291
+ "20133": {
292
+ "content": "<extra_id_66>",
293
+ "lstrip": true,
294
+ "normalized": false,
295
+ "rstrip": true,
296
+ "single_word": false,
297
+ "special": true
298
+ },
299
+ "20134": {
300
+ "content": "<extra_id_65>",
301
+ "lstrip": true,
302
+ "normalized": false,
303
+ "rstrip": true,
304
+ "single_word": false,
305
+ "special": true
306
+ },
307
+ "20135": {
308
+ "content": "<extra_id_64>",
309
+ "lstrip": true,
310
+ "normalized": false,
311
+ "rstrip": true,
312
+ "single_word": false,
313
+ "special": true
314
+ },
315
+ "20136": {
316
+ "content": "<extra_id_63>",
317
+ "lstrip": true,
318
+ "normalized": false,
319
+ "rstrip": true,
320
+ "single_word": false,
321
+ "special": true
322
+ },
323
+ "20137": {
324
+ "content": "<extra_id_62>",
325
+ "lstrip": true,
326
+ "normalized": false,
327
+ "rstrip": true,
328
+ "single_word": false,
329
+ "special": true
330
+ },
331
+ "20138": {
332
+ "content": "<extra_id_61>",
333
+ "lstrip": true,
334
+ "normalized": false,
335
+ "rstrip": true,
336
+ "single_word": false,
337
+ "special": true
338
+ },
339
+ "20139": {
340
+ "content": "<extra_id_60>",
341
+ "lstrip": true,
342
+ "normalized": false,
343
+ "rstrip": true,
344
+ "single_word": false,
345
+ "special": true
346
+ },
347
+ "20140": {
348
+ "content": "<extra_id_59>",
349
+ "lstrip": true,
350
+ "normalized": false,
351
+ "rstrip": true,
352
+ "single_word": false,
353
+ "special": true
354
+ },
355
+ "20141": {
356
+ "content": "<extra_id_58>",
357
+ "lstrip": true,
358
+ "normalized": false,
359
+ "rstrip": true,
360
+ "single_word": false,
361
+ "special": true
362
+ },
363
+ "20142": {
364
+ "content": "<extra_id_57>",
365
+ "lstrip": true,
366
+ "normalized": false,
367
+ "rstrip": true,
368
+ "single_word": false,
369
+ "special": true
370
+ },
371
+ "20143": {
372
+ "content": "<extra_id_56>",
373
+ "lstrip": true,
374
+ "normalized": false,
375
+ "rstrip": true,
376
+ "single_word": false,
377
+ "special": true
378
+ },
379
+ "20144": {
380
+ "content": "<extra_id_55>",
381
+ "lstrip": true,
382
+ "normalized": false,
383
+ "rstrip": true,
384
+ "single_word": false,
385
+ "special": true
386
+ },
387
+ "20145": {
388
+ "content": "<extra_id_54>",
389
+ "lstrip": true,
390
+ "normalized": false,
391
+ "rstrip": true,
392
+ "single_word": false,
393
+ "special": true
394
+ },
395
+ "20146": {
396
+ "content": "<extra_id_53>",
397
+ "lstrip": true,
398
+ "normalized": false,
399
+ "rstrip": true,
400
+ "single_word": false,
401
+ "special": true
402
+ },
403
+ "20147": {
404
+ "content": "<extra_id_52>",
405
+ "lstrip": true,
406
+ "normalized": false,
407
+ "rstrip": true,
408
+ "single_word": false,
409
+ "special": true
410
+ },
411
+ "20148": {
412
+ "content": "<extra_id_51>",
413
+ "lstrip": true,
414
+ "normalized": false,
415
+ "rstrip": true,
416
+ "single_word": false,
417
+ "special": true
418
+ },
419
+ "20149": {
420
+ "content": "<extra_id_50>",
421
+ "lstrip": true,
422
+ "normalized": false,
423
+ "rstrip": true,
424
+ "single_word": false,
425
+ "special": true
426
+ },
427
+ "20150": {
428
+ "content": "<extra_id_49>",
429
+ "lstrip": true,
430
+ "normalized": false,
431
+ "rstrip": true,
432
+ "single_word": false,
433
+ "special": true
434
+ },
435
+ "20151": {
436
+ "content": "<extra_id_48>",
437
+ "lstrip": true,
438
+ "normalized": false,
439
+ "rstrip": true,
440
+ "single_word": false,
441
+ "special": true
442
+ },
443
+ "20152": {
444
+ "content": "<extra_id_47>",
445
+ "lstrip": true,
446
+ "normalized": false,
447
+ "rstrip": true,
448
+ "single_word": false,
449
+ "special": true
450
+ },
451
+ "20153": {
452
+ "content": "<extra_id_46>",
453
+ "lstrip": true,
454
+ "normalized": false,
455
+ "rstrip": true,
456
+ "single_word": false,
457
+ "special": true
458
+ },
459
+ "20154": {
460
+ "content": "<extra_id_45>",
461
+ "lstrip": true,
462
+ "normalized": false,
463
+ "rstrip": true,
464
+ "single_word": false,
465
+ "special": true
466
+ },
467
+ "20155": {
468
+ "content": "<extra_id_44>",
469
+ "lstrip": true,
470
+ "normalized": false,
471
+ "rstrip": true,
472
+ "single_word": false,
473
+ "special": true
474
+ },
475
+ "20156": {
476
+ "content": "<extra_id_43>",
477
+ "lstrip": true,
478
+ "normalized": false,
479
+ "rstrip": true,
480
+ "single_word": false,
481
+ "special": true
482
+ },
483
+ "20157": {
484
+ "content": "<extra_id_42>",
485
+ "lstrip": true,
486
+ "normalized": false,
487
+ "rstrip": true,
488
+ "single_word": false,
489
+ "special": true
490
+ },
491
+ "20158": {
492
+ "content": "<extra_id_41>",
493
+ "lstrip": true,
494
+ "normalized": false,
495
+ "rstrip": true,
496
+ "single_word": false,
497
+ "special": true
498
+ },
499
+ "20159": {
500
+ "content": "<extra_id_40>",
501
+ "lstrip": true,
502
+ "normalized": false,
503
+ "rstrip": true,
504
+ "single_word": false,
505
+ "special": true
506
+ },
507
+ "20160": {
508
+ "content": "<extra_id_39>",
509
+ "lstrip": true,
510
+ "normalized": false,
511
+ "rstrip": true,
512
+ "single_word": false,
513
+ "special": true
514
+ },
515
+ "20161": {
516
+ "content": "<extra_id_38>",
517
+ "lstrip": true,
518
+ "normalized": false,
519
+ "rstrip": true,
520
+ "single_word": false,
521
+ "special": true
522
+ },
523
+ "20162": {
524
+ "content": "<extra_id_37>",
525
+ "lstrip": true,
526
+ "normalized": false,
527
+ "rstrip": true,
528
+ "single_word": false,
529
+ "special": true
530
+ },
531
+ "20163": {
532
+ "content": "<extra_id_36>",
533
+ "lstrip": true,
534
+ "normalized": false,
535
+ "rstrip": true,
536
+ "single_word": false,
537
+ "special": true
538
+ },
539
+ "20164": {
540
+ "content": "<extra_id_35>",
541
+ "lstrip": true,
542
+ "normalized": false,
543
+ "rstrip": true,
544
+ "single_word": false,
545
+ "special": true
546
+ },
547
+ "20165": {
548
+ "content": "<extra_id_34>",
549
+ "lstrip": true,
550
+ "normalized": false,
551
+ "rstrip": true,
552
+ "single_word": false,
553
+ "special": true
554
+ },
555
+ "20166": {
556
+ "content": "<extra_id_33>",
557
+ "lstrip": true,
558
+ "normalized": false,
559
+ "rstrip": true,
560
+ "single_word": false,
561
+ "special": true
562
+ },
563
+ "20167": {
564
+ "content": "<extra_id_32>",
565
+ "lstrip": true,
566
+ "normalized": false,
567
+ "rstrip": true,
568
+ "single_word": false,
569
+ "special": true
570
+ },
571
+ "20168": {
572
+ "content": "<extra_id_31>",
573
+ "lstrip": true,
574
+ "normalized": false,
575
+ "rstrip": true,
576
+ "single_word": false,
577
+ "special": true
578
+ },
579
+ "20169": {
580
+ "content": "<extra_id_30>",
581
+ "lstrip": true,
582
+ "normalized": false,
583
+ "rstrip": true,
584
+ "single_word": false,
585
+ "special": true
586
+ },
587
+ "20170": {
588
+ "content": "<extra_id_29>",
589
+ "lstrip": true,
590
+ "normalized": false,
591
+ "rstrip": true,
592
+ "single_word": false,
593
+ "special": true
594
+ },
595
+ "20171": {
596
+ "content": "<extra_id_28>",
597
+ "lstrip": true,
598
+ "normalized": false,
599
+ "rstrip": true,
600
+ "single_word": false,
601
+ "special": true
602
+ },
603
+ "20172": {
604
+ "content": "<extra_id_27>",
605
+ "lstrip": true,
606
+ "normalized": false,
607
+ "rstrip": true,
608
+ "single_word": false,
609
+ "special": true
610
+ },
611
+ "20173": {
612
+ "content": "<extra_id_26>",
613
+ "lstrip": true,
614
+ "normalized": false,
615
+ "rstrip": true,
616
+ "single_word": false,
617
+ "special": true
618
+ },
619
+ "20174": {
620
+ "content": "<extra_id_25>",
621
+ "lstrip": true,
622
+ "normalized": false,
623
+ "rstrip": true,
624
+ "single_word": false,
625
+ "special": true
626
+ },
627
+ "20175": {
628
+ "content": "<extra_id_24>",
629
+ "lstrip": true,
630
+ "normalized": false,
631
+ "rstrip": true,
632
+ "single_word": false,
633
+ "special": true
634
+ },
635
+ "20176": {
636
+ "content": "<extra_id_23>",
637
+ "lstrip": true,
638
+ "normalized": false,
639
+ "rstrip": true,
640
+ "single_word": false,
641
+ "special": true
642
+ },
643
+ "20177": {
644
+ "content": "<extra_id_22>",
645
+ "lstrip": true,
646
+ "normalized": false,
647
+ "rstrip": true,
648
+ "single_word": false,
649
+ "special": true
650
+ },
651
+ "20178": {
652
+ "content": "<extra_id_21>",
653
+ "lstrip": true,
654
+ "normalized": false,
655
+ "rstrip": true,
656
+ "single_word": false,
657
+ "special": true
658
+ },
659
+ "20179": {
660
+ "content": "<extra_id_20>",
661
+ "lstrip": true,
662
+ "normalized": false,
663
+ "rstrip": true,
664
+ "single_word": false,
665
+ "special": true
666
+ },
667
+ "20180": {
668
+ "content": "<extra_id_19>",
669
+ "lstrip": true,
670
+ "normalized": false,
671
+ "rstrip": true,
672
+ "single_word": false,
673
+ "special": true
674
+ },
675
+ "20181": {
676
+ "content": "<extra_id_18>",
677
+ "lstrip": true,
678
+ "normalized": false,
679
+ "rstrip": true,
680
+ "single_word": false,
681
+ "special": true
682
+ },
683
+ "20182": {
684
+ "content": "<extra_id_17>",
685
+ "lstrip": true,
686
+ "normalized": false,
687
+ "rstrip": true,
688
+ "single_word": false,
689
+ "special": true
690
+ },
691
+ "20183": {
692
+ "content": "<extra_id_16>",
693
+ "lstrip": true,
694
+ "normalized": false,
695
+ "rstrip": true,
696
+ "single_word": false,
697
+ "special": true
698
+ },
699
+ "20184": {
700
+ "content": "<extra_id_15>",
701
+ "lstrip": true,
702
+ "normalized": false,
703
+ "rstrip": true,
704
+ "single_word": false,
705
+ "special": true
706
+ },
707
+ "20185": {
708
+ "content": "<extra_id_14>",
709
+ "lstrip": true,
710
+ "normalized": false,
711
+ "rstrip": true,
712
+ "single_word": false,
713
+ "special": true
714
+ },
715
+ "20186": {
716
+ "content": "<extra_id_13>",
717
+ "lstrip": true,
718
+ "normalized": false,
719
+ "rstrip": true,
720
+ "single_word": false,
721
+ "special": true
722
+ },
723
+ "20187": {
724
+ "content": "<extra_id_12>",
725
+ "lstrip": true,
726
+ "normalized": false,
727
+ "rstrip": true,
728
+ "single_word": false,
729
+ "special": true
730
+ },
731
+ "20188": {
732
+ "content": "<extra_id_11>",
733
+ "lstrip": true,
734
+ "normalized": false,
735
+ "rstrip": true,
736
+ "single_word": false,
737
+ "special": true
738
+ },
739
+ "20189": {
740
+ "content": "<extra_id_10>",
741
+ "lstrip": true,
742
+ "normalized": false,
743
+ "rstrip": true,
744
+ "single_word": false,
745
+ "special": true
746
+ },
747
+ "20190": {
748
+ "content": "<extra_id_9>",
749
+ "lstrip": true,
750
+ "normalized": false,
751
+ "rstrip": true,
752
+ "single_word": false,
753
+ "special": true
754
+ },
755
+ "20191": {
756
+ "content": "<extra_id_8>",
757
+ "lstrip": true,
758
+ "normalized": false,
759
+ "rstrip": true,
760
+ "single_word": false,
761
+ "special": true
762
+ },
763
+ "20192": {
764
+ "content": "<extra_id_7>",
765
+ "lstrip": true,
766
+ "normalized": false,
767
+ "rstrip": true,
768
+ "single_word": false,
769
+ "special": true
770
+ },
771
+ "20193": {
772
+ "content": "<extra_id_6>",
773
+ "lstrip": true,
774
+ "normalized": false,
775
+ "rstrip": true,
776
+ "single_word": false,
777
+ "special": true
778
+ },
779
+ "20194": {
780
+ "content": "<extra_id_5>",
781
+ "lstrip": true,
782
+ "normalized": false,
783
+ "rstrip": true,
784
+ "single_word": false,
785
+ "special": true
786
+ },
787
+ "20195": {
788
+ "content": "<extra_id_4>",
789
+ "lstrip": true,
790
+ "normalized": false,
791
+ "rstrip": true,
792
+ "single_word": false,
793
+ "special": true
794
+ },
795
+ "20196": {
796
+ "content": "<extra_id_3>",
797
+ "lstrip": true,
798
+ "normalized": false,
799
+ "rstrip": true,
800
+ "single_word": false,
801
+ "special": true
802
+ },
803
+ "20197": {
804
+ "content": "<extra_id_2>",
805
+ "lstrip": true,
806
+ "normalized": false,
807
+ "rstrip": true,
808
+ "single_word": false,
809
+ "special": true
810
+ },
811
+ "20198": {
812
+ "content": "<extra_id_1>",
813
+ "lstrip": true,
814
+ "normalized": false,
815
+ "rstrip": true,
816
+ "single_word": false,
817
+ "special": true
818
+ },
819
+ "20199": {
820
+ "content": "<extra_id_0>",
821
+ "lstrip": true,
822
+ "normalized": false,
823
+ "rstrip": true,
824
+ "single_word": false,
825
+ "special": true
826
+ }
827
+ },
828
  "additional_special_tokens": [
829
  "<extra_id_0>",
830
  "<extra_id_1>",
trainer_state.json CHANGED
@@ -1,1430 +1,1380 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 25.0,
5
  "eval_steps": 500,
6
- "global_step": 1725,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.01,
13
- "learning_rate": 5.780346820809248e-07,
14
- "loss": 0.005,
15
  "step": 1
16
  },
17
  {
18
- "epoch": 0.13,
19
- "learning_rate": 5.202312138728324e-06,
20
- "loss": 0.0008,
21
- "step": 9
22
  },
23
  {
24
- "epoch": 0.26,
25
- "learning_rate": 1.0404624277456647e-05,
26
- "loss": 0.0019,
27
- "step": 18
28
  },
29
  {
30
- "epoch": 0.39,
31
- "learning_rate": 1.5606936416184973e-05,
32
- "loss": 0.0019,
33
- "step": 27
34
  },
35
  {
36
- "epoch": 0.52,
37
- "learning_rate": 2.0809248554913295e-05,
38
- "loss": 0.0016,
39
- "step": 36
40
  },
41
  {
42
- "epoch": 0.65,
43
- "learning_rate": 2.6011560693641617e-05,
44
- "loss": 0.0027,
45
- "step": 45
46
  },
47
  {
48
- "epoch": 0.78,
49
- "learning_rate": 3.1213872832369946e-05,
50
- "loss": 0.0032,
51
- "step": 54
52
  },
53
  {
54
- "epoch": 0.91,
55
- "learning_rate": 3.6416184971098265e-05,
56
- "loss": 0.0013,
57
- "step": 63
58
  },
59
  {
60
- "epoch": 1.0,
61
- "eval_loss": 0.002811400219798088,
62
- "eval_max_distance": 2,
63
- "eval_mean_distance": 0,
64
- "eval_runtime": 1.9415,
65
- "eval_samples_per_second": 42.235,
66
- "eval_steps_per_second": 1.545,
67
- "step": 69
68
  },
69
  {
70
- "epoch": 1.04,
71
- "learning_rate": 4.161849710982659e-05,
72
- "loss": 0.0033,
73
- "step": 72
74
  },
75
  {
76
- "epoch": 1.17,
77
- "learning_rate": 4.6820809248554915e-05,
78
- "loss": 0.0034,
79
- "step": 81
80
  },
81
  {
82
- "epoch": 1.3,
83
- "learning_rate": 5.2023121387283234e-05,
84
- "loss": 0.0015,
85
- "step": 90
86
  },
87
  {
88
- "epoch": 1.43,
89
- "learning_rate": 5.722543352601156e-05,
90
- "loss": 0.0042,
91
- "step": 99
92
  },
93
  {
94
- "epoch": 1.57,
95
- "learning_rate": 6.242774566473989e-05,
96
- "loss": 0.0025,
97
- "step": 108
98
  },
99
  {
100
- "epoch": 1.7,
101
- "learning_rate": 6.763005780346822e-05,
102
- "loss": 0.0012,
103
- "step": 117
 
 
 
 
104
  },
105
  {
106
- "epoch": 1.83,
107
- "learning_rate": 7.283236994219653e-05,
108
- "loss": 0.002,
109
- "step": 126
110
  },
111
  {
112
- "epoch": 1.96,
113
- "learning_rate": 7.803468208092485e-05,
114
- "loss": 0.0006,
115
- "step": 135
116
  },
117
  {
118
- "epoch": 2.0,
119
- "eval_loss": 0.0026117784436792135,
120
- "eval_max_distance": 3,
121
- "eval_mean_distance": 0,
122
- "eval_runtime": 1.9167,
123
- "eval_samples_per_second": 42.781,
124
- "eval_steps_per_second": 1.565,
125
- "step": 138
126
  },
127
  {
128
- "epoch": 2.09,
129
- "learning_rate": 8.323699421965318e-05,
130
- "loss": 0.001,
131
- "step": 144
132
  },
133
  {
134
- "epoch": 2.22,
135
- "learning_rate": 8.84393063583815e-05,
136
- "loss": 0.0016,
137
- "step": 153
138
  },
139
  {
140
- "epoch": 2.35,
141
- "learning_rate": 9.364161849710983e-05,
142
- "loss": 0.0011,
143
- "step": 162
144
  },
145
  {
146
- "epoch": 2.48,
147
- "learning_rate": 9.884393063583816e-05,
148
- "loss": 0.001,
149
- "step": 171
150
  },
151
  {
152
- "epoch": 2.61,
153
- "learning_rate": 9.954896907216495e-05,
154
- "loss": 0.0026,
155
- "step": 180
156
  },
157
  {
158
- "epoch": 2.74,
159
- "learning_rate": 9.896907216494846e-05,
160
- "loss": 0.0008,
161
- "step": 189
162
  },
163
  {
164
- "epoch": 2.87,
165
- "learning_rate": 9.838917525773196e-05,
166
- "loss": 0.0022,
167
- "step": 198
168
  },
169
  {
170
- "epoch": 3.0,
171
- "learning_rate": 9.780927835051546e-05,
172
- "loss": 0.0025,
173
- "step": 207
174
  },
175
  {
176
- "epoch": 3.0,
177
- "eval_loss": 0.003930480219423771,
178
- "eval_max_distance": 3,
179
- "eval_mean_distance": 0,
180
- "eval_runtime": 1.9341,
181
- "eval_samples_per_second": 42.396,
182
- "eval_steps_per_second": 1.551,
183
- "step": 207
184
  },
185
  {
186
- "epoch": 3.13,
187
- "learning_rate": 9.722938144329897e-05,
188
- "loss": 0.0019,
189
- "step": 216
190
  },
191
  {
192
- "epoch": 3.26,
193
- "learning_rate": 9.664948453608248e-05,
194
- "loss": 0.0024,
195
- "step": 225
 
 
 
 
196
  },
197
  {
198
- "epoch": 3.39,
199
- "learning_rate": 9.606958762886598e-05,
200
- "loss": 0.0007,
201
- "step": 234
202
  },
203
  {
204
- "epoch": 3.52,
205
- "learning_rate": 9.54896907216495e-05,
206
- "loss": 0.0026,
207
- "step": 243
208
  },
209
  {
210
- "epoch": 3.65,
211
- "learning_rate": 9.490979381443299e-05,
212
- "loss": 0.0009,
213
- "step": 252
214
  },
215
  {
216
- "epoch": 3.78,
217
- "learning_rate": 9.43298969072165e-05,
218
- "loss": 0.0012,
219
- "step": 261
220
  },
221
  {
222
- "epoch": 3.91,
223
- "learning_rate": 9.375e-05,
224
  "loss": 0.0004,
225
- "step": 270
226
- },
227
- {
228
- "epoch": 4.0,
229
- "eval_loss": 0.0036886015441268682,
230
- "eval_max_distance": 3,
231
- "eval_mean_distance": 0,
232
- "eval_runtime": 1.9271,
233
- "eval_samples_per_second": 42.551,
234
- "eval_steps_per_second": 1.557,
235
- "step": 276
236
  },
237
  {
238
- "epoch": 4.04,
239
- "learning_rate": 9.317010309278351e-05,
240
- "loss": 0.0018,
241
- "step": 279
242
  },
243
  {
244
- "epoch": 4.17,
245
- "learning_rate": 9.259020618556701e-05,
246
- "loss": 0.0017,
247
- "step": 288
248
  },
249
  {
250
- "epoch": 4.3,
251
- "learning_rate": 9.201030927835051e-05,
252
- "loss": 0.0014,
253
- "step": 297
254
  },
255
  {
256
- "epoch": 4.43,
257
- "learning_rate": 9.143041237113402e-05,
258
- "loss": 0.0005,
259
- "step": 306
260
  },
261
  {
262
- "epoch": 4.57,
263
- "learning_rate": 9.085051546391753e-05,
264
- "loss": 0.001,
265
- "step": 315
266
  },
267
  {
268
- "epoch": 4.7,
269
- "learning_rate": 9.027061855670103e-05,
270
- "loss": 0.002,
271
- "step": 324
272
  },
273
  {
274
- "epoch": 4.83,
275
- "learning_rate": 8.969072164948454e-05,
276
- "loss": 0.0016,
277
- "step": 333
278
  },
279
  {
280
- "epoch": 4.96,
281
- "learning_rate": 8.911082474226806e-05,
282
- "loss": 0.0005,
283
- "step": 342
284
  },
285
  {
286
- "epoch": 5.0,
287
- "eval_loss": 0.009117466397583485,
288
- "eval_max_distance": 3,
289
  "eval_mean_distance": 0,
290
- "eval_runtime": 1.9694,
291
- "eval_samples_per_second": 41.638,
292
- "eval_steps_per_second": 1.523,
293
- "step": 345
294
  },
295
  {
296
- "epoch": 5.09,
297
- "learning_rate": 8.853092783505154e-05,
298
- "loss": 0.0005,
299
- "step": 351
300
  },
301
  {
302
- "epoch": 5.22,
303
- "learning_rate": 8.795103092783505e-05,
304
- "loss": 0.0051,
305
- "step": 360
306
  },
307
  {
308
- "epoch": 5.35,
309
- "learning_rate": 8.737113402061856e-05,
310
- "loss": 0.0004,
311
- "step": 369
312
  },
313
  {
314
- "epoch": 5.48,
315
- "learning_rate": 8.679123711340206e-05,
316
- "loss": 0.0012,
317
- "step": 378
318
  },
319
  {
320
- "epoch": 5.61,
321
- "learning_rate": 8.621134020618558e-05,
322
- "loss": 0.001,
323
- "step": 387
324
  },
325
  {
326
- "epoch": 5.74,
327
- "learning_rate": 8.563144329896907e-05,
328
- "loss": 0.0015,
329
- "step": 396
330
  },
331
  {
332
- "epoch": 5.87,
333
- "learning_rate": 8.505154639175259e-05,
334
- "loss": 0.0016,
335
- "step": 405
336
  },
337
  {
338
- "epoch": 6.0,
339
- "learning_rate": 8.447164948453608e-05,
340
- "loss": 0.0009,
341
- "step": 414
342
  },
343
  {
344
- "epoch": 6.0,
345
- "eval_loss": 0.0005720060435123742,
346
- "eval_max_distance": 0,
347
- "eval_mean_distance": 0,
348
- "eval_runtime": 1.9399,
349
- "eval_samples_per_second": 42.271,
350
- "eval_steps_per_second": 1.546,
351
- "step": 414
352
  },
353
  {
354
- "epoch": 6.13,
355
- "learning_rate": 8.38917525773196e-05,
356
- "loss": 0.0005,
357
- "step": 423
358
  },
359
  {
360
- "epoch": 6.26,
361
- "learning_rate": 8.331185567010311e-05,
362
- "loss": 0.0006,
363
- "step": 432
364
  },
365
  {
366
- "epoch": 6.39,
367
- "learning_rate": 8.273195876288659e-05,
368
- "loss": 0.0014,
369
- "step": 441
370
  },
371
  {
372
- "epoch": 6.52,
373
- "learning_rate": 8.21520618556701e-05,
374
- "loss": 0.0019,
375
- "step": 450
376
  },
377
  {
378
- "epoch": 6.65,
379
- "learning_rate": 8.157216494845362e-05,
380
- "loss": 0.001,
381
- "step": 459
382
  },
383
  {
384
- "epoch": 6.78,
385
- "learning_rate": 8.099226804123711e-05,
386
- "loss": 0.0008,
387
- "step": 468
 
 
 
 
388
  },
389
  {
390
- "epoch": 6.91,
391
- "learning_rate": 8.041237113402063e-05,
392
- "loss": 0.0016,
393
- "step": 477
394
  },
395
  {
396
- "epoch": 7.0,
397
- "eval_loss": 0.00027213190332986414,
398
- "eval_max_distance": 0,
399
- "eval_mean_distance": 0,
400
- "eval_runtime": 1.9188,
401
- "eval_samples_per_second": 42.735,
402
- "eval_steps_per_second": 1.563,
403
- "step": 483
404
  },
405
  {
406
- "epoch": 7.04,
407
- "learning_rate": 7.983247422680414e-05,
408
- "loss": 0.0006,
409
- "step": 486
410
  },
411
  {
412
- "epoch": 7.17,
413
- "learning_rate": 7.925257731958762e-05,
414
- "loss": 0.007,
415
- "step": 495
416
  },
417
  {
418
- "epoch": 7.3,
419
- "learning_rate": 7.867268041237113e-05,
420
- "loss": 0.0004,
421
- "step": 504
422
  },
423
  {
424
- "epoch": 7.43,
425
- "learning_rate": 7.809278350515465e-05,
426
- "loss": 0.0016,
427
- "step": 513
428
  },
429
  {
430
- "epoch": 7.57,
431
- "learning_rate": 7.751288659793814e-05,
432
- "loss": 0.0006,
433
- "step": 522
434
  },
435
  {
436
- "epoch": 7.7,
437
- "learning_rate": 7.693298969072166e-05,
438
- "loss": 0.0011,
439
- "step": 531
440
  },
441
  {
442
- "epoch": 7.83,
443
- "learning_rate": 7.635309278350515e-05,
444
- "loss": 0.0014,
445
- "step": 540
446
  },
447
  {
448
- "epoch": 7.96,
449
- "learning_rate": 7.577319587628867e-05,
450
- "loss": 0.0012,
451
- "step": 549
452
  },
453
  {
454
- "epoch": 8.0,
455
- "eval_loss": 0.011139851063489914,
456
- "eval_max_distance": 5,
457
- "eval_mean_distance": 0,
458
- "eval_runtime": 1.9435,
459
- "eval_samples_per_second": 42.193,
460
- "eval_steps_per_second": 1.544,
461
- "step": 552
462
  },
463
  {
464
- "epoch": 8.09,
465
- "learning_rate": 7.519329896907217e-05,
466
- "loss": 0.0008,
467
- "step": 558
468
  },
469
  {
470
- "epoch": 8.22,
471
- "learning_rate": 7.461340206185568e-05,
472
- "loss": 0.0011,
473
- "step": 567
474
  },
475
  {
476
- "epoch": 8.35,
477
- "learning_rate": 7.403350515463919e-05,
478
- "loss": 0.0025,
479
- "step": 576
 
 
 
 
480
  },
481
  {
482
- "epoch": 8.48,
483
- "learning_rate": 7.345360824742269e-05,
484
- "loss": 0.003,
485
- "step": 585
486
  },
487
  {
488
- "epoch": 8.61,
489
- "learning_rate": 7.287371134020619e-05,
490
- "loss": 0.004,
491
- "step": 594
492
  },
493
  {
494
- "epoch": 8.74,
495
- "learning_rate": 7.22938144329897e-05,
496
- "loss": 0.002,
497
- "step": 603
498
  },
499
  {
500
- "epoch": 8.87,
501
- "learning_rate": 7.17139175257732e-05,
502
- "loss": 0.0007,
503
- "step": 612
504
  },
505
  {
506
- "epoch": 9.0,
507
- "learning_rate": 7.113402061855671e-05,
508
- "loss": 0.0008,
509
- "step": 621
510
  },
511
  {
512
- "epoch": 9.0,
513
- "eval_loss": 0.0003953798732254654,
514
- "eval_max_distance": 0,
515
- "eval_mean_distance": 0,
516
- "eval_runtime": 1.995,
517
- "eval_samples_per_second": 41.102,
518
- "eval_steps_per_second": 1.504,
519
- "step": 621
520
  },
521
  {
522
- "epoch": 9.13,
523
- "learning_rate": 7.055412371134022e-05,
524
- "loss": 0.0005,
525
- "step": 630
526
  },
527
  {
528
- "epoch": 9.26,
529
- "learning_rate": 6.99742268041237e-05,
530
- "loss": 0.0004,
531
- "step": 639
532
  },
533
  {
534
- "epoch": 9.39,
535
- "learning_rate": 6.939432989690722e-05,
536
- "loss": 0.0013,
537
- "step": 648
538
  },
539
  {
540
- "epoch": 9.52,
541
- "learning_rate": 6.881443298969073e-05,
542
  "loss": 0.0002,
543
- "step": 657
544
  },
545
  {
546
- "epoch": 9.65,
547
- "learning_rate": 6.823453608247423e-05,
548
- "loss": 0.0011,
549
- "step": 666
550
  },
551
  {
552
- "epoch": 9.78,
553
- "learning_rate": 6.765463917525774e-05,
554
- "loss": 0.0018,
555
- "step": 675
556
  },
557
  {
558
- "epoch": 9.91,
559
- "learning_rate": 6.707474226804124e-05,
560
- "loss": 0.0018,
561
- "step": 684
562
  },
563
  {
564
- "epoch": 10.0,
565
- "eval_loss": 0.00027754431357607245,
566
- "eval_max_distance": 0,
567
  "eval_mean_distance": 0,
568
- "eval_runtime": 1.9222,
569
- "eval_samples_per_second": 42.659,
570
- "eval_steps_per_second": 1.561,
571
- "step": 690
572
- },
573
- {
574
- "epoch": 10.04,
575
- "learning_rate": 6.649484536082475e-05,
576
- "loss": 0.0011,
577
- "step": 693
578
  },
579
  {
580
- "epoch": 10.17,
581
- "learning_rate": 6.591494845360825e-05,
582
- "loss": 0.0006,
583
- "step": 702
584
  },
585
  {
586
- "epoch": 10.3,
587
- "learning_rate": 6.533505154639176e-05,
588
- "loss": 0.0011,
589
- "step": 711
590
  },
591
  {
592
- "epoch": 10.43,
593
- "learning_rate": 6.475515463917527e-05,
594
- "loss": 0.0013,
595
- "step": 720
596
  },
597
  {
598
- "epoch": 10.57,
599
- "learning_rate": 6.417525773195877e-05,
600
- "loss": 0.0006,
601
- "step": 729
602
  },
603
  {
604
- "epoch": 10.7,
605
- "learning_rate": 6.359536082474227e-05,
606
- "loss": 0.0018,
607
- "step": 738
608
  },
609
  {
610
- "epoch": 10.83,
611
- "learning_rate": 6.301546391752578e-05,
612
- "loss": 0.0016,
613
- "step": 747
614
  },
615
  {
616
- "epoch": 10.96,
617
- "learning_rate": 6.243556701030928e-05,
618
- "loss": 0.0028,
619
- "step": 756
620
  },
621
  {
622
- "epoch": 11.0,
623
- "eval_loss": 0.00033258015173487365,
624
- "eval_max_distance": 0,
625
- "eval_mean_distance": 0,
626
- "eval_runtime": 1.9385,
627
- "eval_samples_per_second": 42.301,
628
- "eval_steps_per_second": 1.548,
629
- "step": 759
630
  },
631
  {
632
- "epoch": 11.09,
633
- "learning_rate": 6.185567010309279e-05,
634
- "loss": 0.0009,
635
- "step": 765
636
  },
637
  {
638
- "epoch": 11.22,
639
- "learning_rate": 6.12757731958763e-05,
640
- "loss": 0.0005,
641
- "step": 774
642
  },
643
  {
644
- "epoch": 11.35,
645
- "learning_rate": 6.069587628865979e-05,
646
- "loss": 0.0011,
647
- "step": 783
648
  },
649
  {
650
- "epoch": 11.48,
651
- "learning_rate": 6.01159793814433e-05,
652
- "loss": 0.0007,
653
- "step": 792
654
  },
655
  {
656
- "epoch": 11.61,
657
- "learning_rate": 5.953608247422681e-05,
658
- "loss": 0.0012,
659
- "step": 801
660
  },
661
  {
662
- "epoch": 11.74,
663
- "learning_rate": 5.8956185567010315e-05,
664
- "loss": 0.0021,
665
- "step": 810
666
  },
667
  {
668
- "epoch": 11.87,
669
- "learning_rate": 5.837628865979382e-05,
670
- "loss": 0.0006,
671
- "step": 819
 
 
 
 
672
  },
673
  {
674
- "epoch": 12.0,
675
- "learning_rate": 5.779639175257732e-05,
676
- "loss": 0.0008,
677
- "step": 828
678
  },
679
  {
680
- "epoch": 12.0,
681
- "eval_loss": 0.0002690624096430838,
682
- "eval_max_distance": 0,
683
- "eval_mean_distance": 0,
684
- "eval_runtime": 1.9448,
685
- "eval_samples_per_second": 42.163,
686
- "eval_steps_per_second": 1.543,
687
- "step": 828
688
  },
689
  {
690
- "epoch": 12.13,
691
- "learning_rate": 5.721649484536082e-05,
692
- "loss": 0.001,
693
- "step": 837
694
  },
695
  {
696
- "epoch": 12.26,
697
- "learning_rate": 5.663659793814433e-05,
698
- "loss": 0.0012,
699
- "step": 846
700
  },
701
  {
702
- "epoch": 12.39,
703
- "learning_rate": 5.605670103092784e-05,
704
- "loss": 0.0005,
705
- "step": 855
706
  },
707
  {
708
- "epoch": 12.52,
709
- "learning_rate": 5.5476804123711345e-05,
710
- "loss": 0.0011,
711
- "step": 864
712
  },
713
  {
714
- "epoch": 12.65,
715
- "learning_rate": 5.489690721649485e-05,
716
- "loss": 0.0026,
717
- "step": 873
718
  },
719
  {
720
- "epoch": 12.78,
721
- "learning_rate": 5.431701030927835e-05,
722
- "loss": 0.0009,
723
- "step": 882
724
  },
725
  {
726
- "epoch": 12.91,
727
- "learning_rate": 5.3737113402061854e-05,
728
- "loss": 0.001,
729
- "step": 891
730
  },
731
  {
732
- "epoch": 13.0,
733
- "eval_loss": 0.0004277784610167146,
734
- "eval_max_distance": 2,
735
- "eval_mean_distance": 0,
736
- "eval_runtime": 1.9315,
737
- "eval_samples_per_second": 42.454,
738
- "eval_steps_per_second": 1.553,
739
- "step": 897
740
  },
741
  {
742
- "epoch": 13.04,
743
- "learning_rate": 5.3157216494845366e-05,
744
- "loss": 0.0007,
745
- "step": 900
746
  },
747
  {
748
- "epoch": 13.17,
749
- "learning_rate": 5.257731958762887e-05,
750
- "loss": 0.0008,
751
- "step": 909
752
  },
753
  {
754
- "epoch": 13.3,
755
- "learning_rate": 5.1997422680412376e-05,
756
- "loss": 0.0005,
757
- "step": 918
758
  },
759
  {
760
- "epoch": 13.43,
761
- "learning_rate": 5.1417525773195874e-05,
762
- "loss": 0.0004,
763
- "step": 927
 
 
 
 
764
  },
765
  {
766
- "epoch": 13.57,
767
- "learning_rate": 5.083762886597938e-05,
768
- "loss": 0.0006,
769
- "step": 936
770
  },
771
  {
772
- "epoch": 13.7,
773
- "learning_rate": 5.025773195876289e-05,
774
- "loss": 0.0025,
775
- "step": 945
776
  },
777
  {
778
- "epoch": 13.83,
779
- "learning_rate": 4.9677835051546396e-05,
780
- "loss": 0.0009,
781
- "step": 954
782
  },
783
  {
784
- "epoch": 13.96,
785
- "learning_rate": 4.9097938144329895e-05,
786
- "loss": 0.0026,
787
- "step": 963
788
  },
789
  {
790
- "epoch": 14.0,
791
- "eval_loss": 0.0005385838449001312,
792
- "eval_max_distance": 2,
793
- "eval_mean_distance": 0,
794
- "eval_runtime": 1.993,
795
- "eval_samples_per_second": 41.144,
796
- "eval_steps_per_second": 1.505,
797
- "step": 966
798
  },
799
  {
800
- "epoch": 14.09,
801
- "learning_rate": 4.8518041237113407e-05,
802
- "loss": 0.0016,
803
- "step": 972
804
  },
805
  {
806
- "epoch": 14.22,
807
- "learning_rate": 4.793814432989691e-05,
808
- "loss": 0.0014,
809
- "step": 981
810
  },
811
  {
812
- "epoch": 14.35,
813
- "learning_rate": 4.735824742268041e-05,
814
- "loss": 0.0007,
815
- "step": 990
816
  },
817
  {
818
- "epoch": 14.48,
819
- "learning_rate": 4.677835051546392e-05,
820
- "loss": 0.0031,
821
- "step": 999
822
  },
823
  {
824
- "epoch": 14.61,
825
- "learning_rate": 4.619845360824743e-05,
826
- "loss": 0.0008,
827
- "step": 1008
828
  },
829
  {
830
- "epoch": 14.74,
831
- "learning_rate": 4.561855670103093e-05,
832
- "loss": 0.0028,
833
- "step": 1017
834
  },
835
  {
836
- "epoch": 14.87,
837
- "learning_rate": 4.503865979381444e-05,
838
- "loss": 0.0004,
839
- "step": 1026
840
  },
841
  {
842
- "epoch": 15.0,
843
- "learning_rate": 4.4458762886597936e-05,
844
- "loss": 0.0015,
845
- "step": 1035
846
  },
847
  {
848
- "epoch": 15.0,
849
- "eval_loss": 0.0007138837827369571,
850
  "eval_max_distance": 3,
851
  "eval_mean_distance": 0,
852
- "eval_runtime": 1.9688,
853
- "eval_samples_per_second": 41.651,
854
- "eval_steps_per_second": 1.524,
855
- "step": 1035
856
  },
857
  {
858
- "epoch": 15.13,
859
- "learning_rate": 4.387886597938145e-05,
860
- "loss": 0.0005,
861
- "step": 1044
862
  },
863
  {
864
- "epoch": 15.26,
865
- "learning_rate": 4.329896907216495e-05,
866
- "loss": 0.0011,
867
- "step": 1053
868
  },
869
  {
870
- "epoch": 15.39,
871
- "learning_rate": 4.271907216494845e-05,
872
- "loss": 0.0001,
873
- "step": 1062
874
  },
875
  {
876
- "epoch": 15.52,
877
- "learning_rate": 4.213917525773196e-05,
878
- "loss": 0.0009,
879
- "step": 1071
880
  },
881
  {
882
- "epoch": 15.65,
883
- "learning_rate": 4.155927835051547e-05,
884
- "loss": 0.0017,
885
- "step": 1080
886
  },
887
  {
888
- "epoch": 15.78,
889
- "learning_rate": 4.097938144329897e-05,
890
- "loss": 0.0013,
891
- "step": 1089
892
  },
893
  {
894
- "epoch": 15.91,
895
- "learning_rate": 4.039948453608248e-05,
896
- "loss": 0.0009,
897
- "step": 1098
898
  },
899
  {
900
- "epoch": 16.0,
901
- "eval_loss": 0.0006717974320054054,
902
- "eval_max_distance": 3,
903
- "eval_mean_distance": 0,
904
- "eval_runtime": 1.9244,
905
- "eval_samples_per_second": 42.612,
906
- "eval_steps_per_second": 1.559,
907
- "step": 1104
908
  },
909
  {
910
- "epoch": 16.04,
911
- "learning_rate": 3.9819587628865976e-05,
912
- "loss": 0.0015,
913
- "step": 1107
914
  },
915
  {
916
- "epoch": 16.17,
917
- "learning_rate": 3.923969072164949e-05,
918
  "loss": 0.0003,
919
- "step": 1116
920
  },
921
  {
922
- "epoch": 16.3,
923
- "learning_rate": 3.865979381443299e-05,
924
- "loss": 0.0006,
925
- "step": 1125
926
  },
927
  {
928
- "epoch": 16.43,
929
- "learning_rate": 3.807989690721649e-05,
930
- "loss": 0.0013,
931
- "step": 1134
932
  },
933
  {
934
- "epoch": 16.57,
935
- "learning_rate": 3.7500000000000003e-05,
936
- "loss": 0.0009,
937
- "step": 1143
938
  },
939
  {
940
- "epoch": 16.7,
941
- "learning_rate": 3.692010309278351e-05,
942
- "loss": 0.0004,
943
- "step": 1152
944
  },
945
  {
946
- "epoch": 16.83,
947
- "learning_rate": 3.6340206185567014e-05,
948
- "loss": 0.0008,
949
- "step": 1161
 
 
 
 
950
  },
951
  {
952
- "epoch": 16.96,
953
- "learning_rate": 3.576030927835052e-05,
954
- "loss": 0.0014,
955
- "step": 1170
956
  },
957
  {
958
- "epoch": 17.0,
959
- "eval_loss": 0.00033988503855653107,
960
- "eval_max_distance": 0,
961
- "eval_mean_distance": 0,
962
- "eval_runtime": 1.9136,
963
- "eval_samples_per_second": 42.851,
964
- "eval_steps_per_second": 1.568,
965
- "step": 1173
966
  },
967
  {
968
- "epoch": 17.09,
969
- "learning_rate": 3.5180412371134024e-05,
970
- "loss": 0.0016,
971
- "step": 1179
972
  },
973
  {
974
- "epoch": 17.22,
975
- "learning_rate": 3.460051546391753e-05,
976
- "loss": 0.0004,
977
- "step": 1188
978
  },
979
  {
980
- "epoch": 17.35,
981
- "learning_rate": 3.4020618556701034e-05,
982
- "loss": 0.002,
983
- "step": 1197
984
  },
985
  {
986
- "epoch": 17.48,
987
- "learning_rate": 3.344072164948453e-05,
988
- "loss": 0.0012,
989
- "step": 1206
990
  },
991
  {
992
- "epoch": 17.61,
993
- "learning_rate": 3.2860824742268044e-05,
994
  "loss": 0.0002,
995
- "step": 1215
996
  },
997
  {
998
- "epoch": 17.74,
999
- "learning_rate": 3.228092783505155e-05,
1000
- "loss": 0.0006,
1001
- "step": 1224
1002
  },
1003
  {
1004
- "epoch": 17.87,
1005
- "learning_rate": 3.1701030927835054e-05,
1006
- "loss": 0.0044,
1007
- "step": 1233
1008
  },
1009
  {
1010
- "epoch": 18.0,
1011
- "learning_rate": 3.112113402061856e-05,
1012
- "loss": 0.001,
1013
- "step": 1242
1014
  },
1015
  {
1016
- "epoch": 18.0,
1017
- "eval_loss": 0.00037691937177442014,
1018
- "eval_max_distance": 0,
1019
- "eval_mean_distance": 0,
1020
- "eval_runtime": 1.9496,
1021
- "eval_samples_per_second": 42.059,
1022
- "eval_steps_per_second": 1.539,
1023
- "step": 1242
1024
  },
1025
  {
1026
- "epoch": 18.13,
1027
- "learning_rate": 3.0541237113402065e-05,
1028
- "loss": 0.0005,
1029
- "step": 1251
1030
  },
1031
  {
1032
- "epoch": 18.26,
1033
- "learning_rate": 2.9961340206185566e-05,
1034
- "loss": 0.0009,
1035
- "step": 1260
1036
  },
1037
  {
1038
- "epoch": 18.39,
1039
- "learning_rate": 2.9381443298969075e-05,
1040
- "loss": 0.0008,
1041
- "step": 1269
 
 
 
 
1042
  },
1043
  {
1044
- "epoch": 18.52,
1045
- "learning_rate": 2.8801546391752577e-05,
1046
- "loss": 0.0007,
1047
- "step": 1278
1048
  },
1049
  {
1050
- "epoch": 18.65,
1051
- "learning_rate": 2.8221649484536085e-05,
1052
- "loss": 0.0016,
1053
- "step": 1287
1054
  },
1055
  {
1056
- "epoch": 18.78,
1057
- "learning_rate": 2.764175257731959e-05,
1058
- "loss": 0.0012,
1059
- "step": 1296
1060
  },
1061
  {
1062
- "epoch": 18.91,
1063
- "learning_rate": 2.7061855670103092e-05,
1064
- "loss": 0.0007,
1065
- "step": 1305
1066
  },
1067
  {
1068
- "epoch": 19.0,
1069
- "eval_loss": 0.001327142701484263,
1070
- "eval_max_distance": 3,
1071
- "eval_mean_distance": 0,
1072
- "eval_runtime": 1.9144,
1073
- "eval_samples_per_second": 42.834,
1074
- "eval_steps_per_second": 1.567,
1075
- "step": 1311
1076
  },
1077
  {
1078
- "epoch": 19.04,
1079
- "learning_rate": 2.64819587628866e-05,
1080
- "loss": 0.0005,
1081
- "step": 1314
1082
  },
1083
  {
1084
- "epoch": 19.17,
1085
- "learning_rate": 2.5902061855670106e-05,
1086
- "loss": 0.0011,
1087
- "step": 1323
1088
  },
1089
  {
1090
- "epoch": 19.3,
1091
- "learning_rate": 2.5322164948453607e-05,
1092
- "loss": 0.0006,
1093
- "step": 1332
1094
  },
1095
  {
1096
- "epoch": 19.43,
1097
- "learning_rate": 2.4742268041237116e-05,
1098
- "loss": 0.0004,
1099
- "step": 1341
1100
  },
1101
  {
1102
- "epoch": 19.57,
1103
- "learning_rate": 2.416237113402062e-05,
1104
- "loss": 0.002,
1105
- "step": 1350
1106
  },
1107
  {
1108
- "epoch": 19.7,
1109
- "learning_rate": 2.3582474226804126e-05,
1110
- "loss": 0.0003,
1111
- "step": 1359
1112
  },
1113
  {
1114
- "epoch": 19.83,
1115
- "learning_rate": 2.3002577319587628e-05,
1116
- "loss": 0.0011,
1117
- "step": 1368
1118
  },
1119
  {
1120
- "epoch": 19.96,
1121
- "learning_rate": 2.2422680412371136e-05,
1122
- "loss": 0.0013,
1123
- "step": 1377
1124
  },
1125
  {
1126
- "epoch": 20.0,
1127
- "eval_loss": 0.0012958323350176215,
1128
  "eval_max_distance": 3,
1129
  "eval_mean_distance": 0,
1130
- "eval_runtime": 1.9255,
1131
- "eval_samples_per_second": 42.587,
1132
- "eval_steps_per_second": 1.558,
1133
- "step": 1380
1134
  },
1135
  {
1136
- "epoch": 20.09,
1137
- "learning_rate": 2.184278350515464e-05,
1138
- "loss": 0.0017,
1139
- "step": 1386
1140
  },
1141
  {
1142
- "epoch": 20.22,
1143
- "learning_rate": 2.1262886597938146e-05,
1144
- "loss": 0.0011,
1145
- "step": 1395
1146
  },
1147
  {
1148
- "epoch": 20.35,
1149
- "learning_rate": 2.0682989690721648e-05,
1150
- "loss": 0.0014,
1151
- "step": 1404
1152
  },
1153
  {
1154
- "epoch": 20.48,
1155
- "learning_rate": 2.0103092783505157e-05,
1156
- "loss": 0.0006,
1157
- "step": 1413
1158
  },
1159
  {
1160
- "epoch": 20.61,
1161
- "learning_rate": 1.952319587628866e-05,
1162
- "loss": 0.0003,
1163
- "step": 1422
1164
  },
1165
  {
1166
- "epoch": 20.74,
1167
- "learning_rate": 1.8943298969072167e-05,
1168
  "loss": 0.0002,
1169
- "step": 1431
1170
  },
1171
  {
1172
- "epoch": 20.87,
1173
- "learning_rate": 1.8363402061855672e-05,
1174
- "loss": 0.0024,
1175
- "step": 1440
1176
  },
1177
  {
1178
- "epoch": 21.0,
1179
- "learning_rate": 1.7783505154639177e-05,
1180
- "loss": 0.0007,
1181
- "step": 1449
1182
  },
1183
  {
1184
- "epoch": 21.0,
1185
- "eval_loss": 0.0002567500632721931,
1186
- "eval_max_distance": 0,
1187
- "eval_mean_distance": 0,
1188
- "eval_runtime": 1.9342,
1189
- "eval_samples_per_second": 42.395,
1190
- "eval_steps_per_second": 1.551,
1191
- "step": 1449
1192
  },
1193
  {
1194
- "epoch": 21.13,
1195
- "learning_rate": 1.7203608247422682e-05,
1196
- "loss": 0.0006,
1197
- "step": 1458
1198
  },
1199
  {
1200
- "epoch": 21.26,
1201
- "learning_rate": 1.6623711340206187e-05,
1202
- "loss": 0.002,
1203
- "step": 1467
1204
  },
1205
  {
1206
- "epoch": 21.39,
1207
- "learning_rate": 1.6043814432989692e-05,
1208
- "loss": 0.0007,
1209
- "step": 1476
1210
  },
1211
  {
1212
- "epoch": 21.52,
1213
- "learning_rate": 1.5463917525773197e-05,
1214
- "loss": 0.0007,
1215
- "step": 1485
1216
  },
1217
  {
1218
- "epoch": 21.65,
1219
- "learning_rate": 1.4884020618556702e-05,
1220
- "loss": 0.0016,
1221
- "step": 1494
 
 
 
 
1222
  },
1223
  {
1224
- "epoch": 21.78,
1225
- "learning_rate": 1.4304123711340206e-05,
1226
- "loss": 0.0008,
1227
- "step": 1503
1228
  },
1229
  {
1230
- "epoch": 21.91,
1231
- "learning_rate": 1.3724226804123713e-05,
1232
- "loss": 0.0016,
1233
- "step": 1512
1234
  },
1235
  {
1236
- "epoch": 22.0,
1237
- "eval_loss": 0.0002821955131366849,
1238
- "eval_max_distance": 0,
1239
- "eval_mean_distance": 0,
1240
- "eval_runtime": 1.9113,
1241
- "eval_samples_per_second": 42.902,
1242
- "eval_steps_per_second": 1.57,
1243
- "step": 1518
1244
  },
1245
  {
1246
- "epoch": 22.04,
1247
- "learning_rate": 1.3144329896907218e-05,
1248
- "loss": 0.0018,
1249
- "step": 1521
1250
  },
1251
  {
1252
- "epoch": 22.17,
1253
- "learning_rate": 1.2564432989690723e-05,
1254
- "loss": 0.0027,
1255
- "step": 1530
1256
  },
1257
  {
1258
- "epoch": 22.3,
1259
- "learning_rate": 1.1984536082474228e-05,
1260
  "loss": 0.0002,
1261
- "step": 1539
1262
  },
1263
  {
1264
- "epoch": 22.43,
1265
- "learning_rate": 1.1404639175257733e-05,
1266
- "loss": 0.001,
1267
- "step": 1548
1268
  },
1269
  {
1270
- "epoch": 22.57,
1271
- "learning_rate": 1.0824742268041238e-05,
1272
- "loss": 0.0007,
1273
- "step": 1557
1274
  },
1275
  {
1276
- "epoch": 22.7,
1277
- "learning_rate": 1.0244845360824743e-05,
1278
- "loss": 0.0009,
1279
- "step": 1566
1280
  },
1281
  {
1282
- "epoch": 22.83,
1283
- "learning_rate": 9.664948453608248e-06,
1284
- "loss": 0.0012,
1285
- "step": 1575
1286
  },
1287
  {
1288
- "epoch": 22.96,
1289
- "learning_rate": 9.085051546391753e-06,
1290
- "loss": 0.0013,
1291
- "step": 1584
1292
  },
1293
  {
1294
- "epoch": 23.0,
1295
- "eval_loss": 0.00030223012436181307,
1296
- "eval_max_distance": 0,
1297
- "eval_mean_distance": 0,
1298
- "eval_runtime": 1.9675,
1299
- "eval_samples_per_second": 41.677,
1300
- "eval_steps_per_second": 1.525,
1301
- "step": 1587
1302
  },
1303
  {
1304
- "epoch": 23.09,
1305
- "learning_rate": 8.505154639175259e-06,
1306
- "loss": 0.0025,
1307
- "step": 1593
1308
  },
1309
  {
1310
- "epoch": 23.22,
1311
- "learning_rate": 7.925257731958764e-06,
1312
- "loss": 0.001,
1313
- "step": 1602
1314
  },
1315
  {
1316
- "epoch": 23.35,
1317
- "learning_rate": 7.345360824742269e-06,
1318
- "loss": 0.0004,
1319
- "step": 1611
 
 
 
 
1320
  },
1321
  {
1322
- "epoch": 23.48,
1323
- "learning_rate": 6.765463917525773e-06,
1324
- "loss": 0.0006,
1325
- "step": 1620
1326
  },
1327
  {
1328
- "epoch": 23.61,
1329
- "learning_rate": 6.185567010309279e-06,
1330
- "loss": 0.001,
1331
- "step": 1629
1332
  },
1333
  {
1334
- "epoch": 23.74,
1335
- "learning_rate": 5.605670103092784e-06,
1336
- "loss": 0.0012,
1337
- "step": 1638
1338
  },
1339
  {
1340
- "epoch": 23.87,
1341
- "learning_rate": 5.025773195876289e-06,
1342
- "loss": 0.0013,
1343
- "step": 1647
1344
  },
1345
  {
1346
- "epoch": 24.0,
1347
- "learning_rate": 4.445876288659794e-06,
1348
- "loss": 0.0004,
1349
- "step": 1656
1350
  },
1351
  {
1352
- "epoch": 24.0,
1353
- "eval_loss": 0.0002631743554957211,
1354
- "eval_max_distance": 0,
1355
- "eval_mean_distance": 0,
1356
- "eval_runtime": 1.9278,
1357
- "eval_samples_per_second": 42.536,
1358
- "eval_steps_per_second": 1.556,
1359
- "step": 1656
1360
  },
1361
  {
1362
- "epoch": 24.13,
1363
- "learning_rate": 3.865979381443299e-06,
1364
- "loss": 0.0018,
1365
- "step": 1665
1366
  },
1367
  {
1368
- "epoch": 24.26,
1369
- "learning_rate": 3.2860824742268044e-06,
1370
- "loss": 0.0006,
1371
- "step": 1674
1372
  },
1373
  {
1374
- "epoch": 24.39,
1375
- "learning_rate": 2.7061855670103095e-06,
1376
- "loss": 0.0012,
1377
- "step": 1683
1378
  },
1379
  {
1380
- "epoch": 24.52,
1381
- "learning_rate": 2.1262886597938146e-06,
1382
- "loss": 0.0009,
1383
- "step": 1692
1384
  },
1385
  {
1386
- "epoch": 24.65,
1387
- "learning_rate": 1.5463917525773197e-06,
1388
- "loss": 0.0007,
1389
- "step": 1701
1390
  },
1391
  {
1392
- "epoch": 24.78,
1393
- "learning_rate": 9.664948453608248e-07,
1394
- "loss": 0.0009,
1395
- "step": 1710
1396
  },
1397
  {
1398
- "epoch": 24.91,
1399
- "learning_rate": 3.8659793814432993e-07,
1400
- "loss": 0.001,
1401
- "step": 1719
1402
  },
1403
  {
1404
- "epoch": 25.0,
1405
- "eval_loss": 0.0002593309909570962,
1406
- "eval_max_distance": 0,
1407
  "eval_mean_distance": 0,
1408
- "eval_runtime": 2.0449,
1409
- "eval_samples_per_second": 40.1,
1410
- "eval_steps_per_second": 1.467,
1411
- "step": 1725
1412
- },
1413
- {
1414
- "epoch": 25.0,
1415
- "step": 1725,
1416
- "total_flos": 459342194208768.0,
1417
- "train_loss": 0.0013090899151048043,
1418
- "train_runtime": 199.817,
1419
- "train_samples_per_second": 256.61,
1420
- "train_steps_per_second": 8.633
1421
  }
1422
  ],
1423
- "logging_steps": 9,
1424
- "max_steps": 1725,
1425
- "num_train_epochs": 25,
1426
- "save_steps": 18,
1427
- "total_flos": 459342194208768.0,
 
 
1428
  "trial_name": null,
1429
  "trial_params": null
1430
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 15.0,
5
  "eval_steps": 500,
6
+ "global_step": 45885,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0,
13
+ "learning_rate": 2.1791239921551537e-08,
14
+ "loss": 0.0002,
15
  "step": 1
16
  },
17
  {
18
+ "epoch": 0.08,
19
+ "learning_rate": 5.011985181956853e-06,
20
+ "loss": 0.0017,
21
+ "step": 230
22
  },
23
  {
24
+ "epoch": 0.15,
25
+ "learning_rate": 1.0023970363913707e-05,
26
+ "loss": 0.0012,
27
+ "step": 460
28
  },
29
  {
30
+ "epoch": 0.23,
31
+ "learning_rate": 1.503595554587056e-05,
32
+ "loss": 0.0008,
33
+ "step": 690
34
  },
35
  {
36
+ "epoch": 0.3,
37
+ "learning_rate": 2.0047940727827413e-05,
38
+ "loss": 0.001,
39
+ "step": 920
40
  },
41
  {
42
+ "epoch": 0.38,
43
+ "learning_rate": 2.5059925909784264e-05,
44
+ "loss": 0.0008,
45
+ "step": 1150
46
  },
47
  {
48
+ "epoch": 0.45,
49
+ "learning_rate": 3.007191109174112e-05,
50
+ "loss": 0.0007,
51
+ "step": 1380
52
  },
53
  {
54
+ "epoch": 0.53,
55
+ "learning_rate": 3.5083896273697975e-05,
56
+ "loss": 0.0007,
57
+ "step": 1610
58
  },
59
  {
60
+ "epoch": 0.6,
61
+ "learning_rate": 4.0095881455654826e-05,
62
+ "loss": 0.0006,
63
+ "step": 1840
 
 
 
 
64
  },
65
  {
66
+ "epoch": 0.68,
67
+ "learning_rate": 4.5107866637611684e-05,
68
+ "loss": 0.0006,
69
+ "step": 2070
70
  },
71
  {
72
+ "epoch": 0.75,
73
+ "learning_rate": 5.011985181956853e-05,
74
+ "loss": 0.0005,
75
+ "step": 2300
76
  },
77
  {
78
+ "epoch": 0.83,
79
+ "learning_rate": 5.5131837001525385e-05,
80
+ "loss": 0.0004,
81
+ "step": 2530
82
  },
83
  {
84
+ "epoch": 0.9,
85
+ "learning_rate": 6.014382218348224e-05,
86
+ "loss": 0.0003,
87
+ "step": 2760
88
  },
89
  {
90
+ "epoch": 0.98,
91
+ "learning_rate": 6.51558073654391e-05,
92
+ "loss": 0.0004,
93
+ "step": 2990
94
  },
95
  {
96
+ "epoch": 1.0,
97
+ "eval_loss": 0.002156034577637911,
98
+ "eval_max_distance": 12,
99
+ "eval_mean_distance": 0,
100
+ "eval_runtime": 31.367,
101
+ "eval_samples_per_second": 18.778,
102
+ "eval_steps_per_second": 0.128,
103
+ "step": 3059
104
  },
105
  {
106
+ "epoch": 1.05,
107
+ "learning_rate": 7.016779254739595e-05,
108
+ "loss": 0.0004,
109
+ "step": 3220
110
  },
111
  {
112
+ "epoch": 1.13,
113
+ "learning_rate": 7.517977772935281e-05,
114
+ "loss": 0.0003,
115
+ "step": 3450
116
  },
117
  {
118
+ "epoch": 1.2,
119
+ "learning_rate": 8.019176291130965e-05,
120
+ "loss": 0.0004,
121
+ "step": 3680
 
 
 
 
122
  },
123
  {
124
+ "epoch": 1.28,
125
+ "learning_rate": 8.520374809326651e-05,
126
+ "loss": 0.0003,
127
+ "step": 3910
128
  },
129
  {
130
+ "epoch": 1.35,
131
+ "learning_rate": 9.021573327522337e-05,
132
+ "loss": 0.0004,
133
+ "step": 4140
134
  },
135
  {
136
+ "epoch": 1.43,
137
+ "learning_rate": 9.522771845718021e-05,
138
+ "loss": 0.0003,
139
+ "step": 4370
140
  },
141
  {
142
+ "epoch": 1.5,
143
+ "learning_rate": 9.997336303758234e-05,
144
+ "loss": 0.0003,
145
+ "step": 4600
146
  },
147
  {
148
+ "epoch": 1.58,
149
+ "learning_rate": 9.941640836884929e-05,
150
+ "loss": 0.0003,
151
+ "step": 4830
152
  },
153
  {
154
+ "epoch": 1.65,
155
+ "learning_rate": 9.885945370011624e-05,
156
+ "loss": 0.0004,
157
+ "step": 5060
158
  },
159
  {
160
+ "epoch": 1.73,
161
+ "learning_rate": 9.830249903138319e-05,
162
+ "loss": 0.0003,
163
+ "step": 5290
164
  },
165
  {
166
+ "epoch": 1.8,
167
+ "learning_rate": 9.774554436265014e-05,
168
+ "loss": 0.0004,
169
+ "step": 5520
170
  },
171
  {
172
+ "epoch": 1.88,
173
+ "learning_rate": 9.718858969391709e-05,
174
+ "loss": 0.0003,
175
+ "step": 5750
 
 
 
 
176
  },
177
  {
178
+ "epoch": 1.95,
179
+ "learning_rate": 9.663163502518404e-05,
180
+ "loss": 0.0003,
181
+ "step": 5980
182
  },
183
  {
184
+ "epoch": 2.0,
185
+ "eval_loss": 0.0011327904649078846,
186
+ "eval_max_distance": 5,
187
+ "eval_mean_distance": 0,
188
+ "eval_runtime": 30.6053,
189
+ "eval_samples_per_second": 19.245,
190
+ "eval_steps_per_second": 0.131,
191
+ "step": 6118
192
  },
193
  {
194
+ "epoch": 2.03,
195
+ "learning_rate": 9.6074680356451e-05,
196
+ "loss": 0.0003,
197
+ "step": 6210
198
  },
199
  {
200
+ "epoch": 2.11,
201
+ "learning_rate": 9.551772568771794e-05,
202
+ "loss": 0.0003,
203
+ "step": 6440
204
  },
205
  {
206
+ "epoch": 2.18,
207
+ "learning_rate": 9.496077101898489e-05,
208
+ "loss": 0.0003,
209
+ "step": 6670
210
  },
211
  {
212
+ "epoch": 2.26,
213
+ "learning_rate": 9.440381635025185e-05,
214
+ "loss": 0.0003,
215
+ "step": 6900
216
  },
217
  {
218
+ "epoch": 2.33,
219
+ "learning_rate": 9.384686168151879e-05,
220
  "loss": 0.0004,
221
+ "step": 7130
 
 
 
 
 
 
 
 
 
 
222
  },
223
  {
224
+ "epoch": 2.41,
225
+ "learning_rate": 9.328990701278575e-05,
226
+ "loss": 0.0003,
227
+ "step": 7360
228
  },
229
  {
230
+ "epoch": 2.48,
231
+ "learning_rate": 9.273295234405269e-05,
232
+ "loss": 0.0003,
233
+ "step": 7590
234
  },
235
  {
236
+ "epoch": 2.56,
237
+ "learning_rate": 9.217599767531964e-05,
238
+ "loss": 0.0003,
239
+ "step": 7820
240
  },
241
  {
242
+ "epoch": 2.63,
243
+ "learning_rate": 9.16190430065866e-05,
244
+ "loss": 0.0003,
245
+ "step": 8050
246
  },
247
  {
248
+ "epoch": 2.71,
249
+ "learning_rate": 9.106208833785354e-05,
250
+ "loss": 0.0003,
251
+ "step": 8280
252
  },
253
  {
254
+ "epoch": 2.78,
255
+ "learning_rate": 9.05051336691205e-05,
256
+ "loss": 0.0003,
257
+ "step": 8510
258
  },
259
  {
260
+ "epoch": 2.86,
261
+ "learning_rate": 8.994817900038746e-05,
262
+ "loss": 0.0003,
263
+ "step": 8740
264
  },
265
  {
266
+ "epoch": 2.93,
267
+ "learning_rate": 8.939122433165439e-05,
268
+ "loss": 0.0003,
269
+ "step": 8970
270
  },
271
  {
272
+ "epoch": 3.0,
273
+ "eval_loss": 0.0010147562716156244,
274
+ "eval_max_distance": 6,
275
  "eval_mean_distance": 0,
276
+ "eval_runtime": 30.7374,
277
+ "eval_samples_per_second": 19.162,
278
+ "eval_steps_per_second": 0.13,
279
+ "step": 9177
280
  },
281
  {
282
+ "epoch": 3.01,
283
+ "learning_rate": 8.883426966292136e-05,
284
+ "loss": 0.0003,
285
+ "step": 9200
286
  },
287
  {
288
+ "epoch": 3.08,
289
+ "learning_rate": 8.82773149941883e-05,
290
+ "loss": 0.0002,
291
+ "step": 9430
292
  },
293
  {
294
+ "epoch": 3.16,
295
+ "learning_rate": 8.772036032545526e-05,
296
+ "loss": 0.0003,
297
+ "step": 9660
298
  },
299
  {
300
+ "epoch": 3.23,
301
+ "learning_rate": 8.716340565672221e-05,
302
+ "loss": 0.0002,
303
+ "step": 9890
304
  },
305
  {
306
+ "epoch": 3.31,
307
+ "learning_rate": 8.660645098798916e-05,
308
+ "loss": 0.0003,
309
+ "step": 10120
310
  },
311
  {
312
+ "epoch": 3.38,
313
+ "learning_rate": 8.604949631925611e-05,
314
+ "loss": 0.0003,
315
+ "step": 10350
316
  },
317
  {
318
+ "epoch": 3.46,
319
+ "learning_rate": 8.549254165052306e-05,
320
+ "loss": 0.0003,
321
+ "step": 10580
322
  },
323
  {
324
+ "epoch": 3.53,
325
+ "learning_rate": 8.493558698179001e-05,
326
+ "loss": 0.0003,
327
+ "step": 10810
328
  },
329
  {
330
+ "epoch": 3.61,
331
+ "learning_rate": 8.437863231305696e-05,
332
+ "loss": 0.0003,
333
+ "step": 11040
 
 
 
 
334
  },
335
  {
336
+ "epoch": 3.68,
337
+ "learning_rate": 8.382167764432391e-05,
338
+ "loss": 0.0003,
339
+ "step": 11270
340
  },
341
  {
342
+ "epoch": 3.76,
343
+ "learning_rate": 8.326472297559086e-05,
344
+ "loss": 0.0003,
345
+ "step": 11500
346
  },
347
  {
348
+ "epoch": 3.83,
349
+ "learning_rate": 8.270776830685781e-05,
350
+ "loss": 0.0003,
351
+ "step": 11730
352
  },
353
  {
354
+ "epoch": 3.91,
355
+ "learning_rate": 8.215081363812476e-05,
356
+ "loss": 0.0003,
357
+ "step": 11960
358
  },
359
  {
360
+ "epoch": 3.98,
361
+ "learning_rate": 8.159385896939171e-05,
362
+ "loss": 0.0003,
363
+ "step": 12190
364
  },
365
  {
366
+ "epoch": 4.0,
367
+ "eval_loss": 0.0011835404438897967,
368
+ "eval_max_distance": 3,
369
+ "eval_mean_distance": 0,
370
+ "eval_runtime": 30.6791,
371
+ "eval_samples_per_second": 19.199,
372
+ "eval_steps_per_second": 0.13,
373
+ "step": 12236
374
  },
375
  {
376
+ "epoch": 4.06,
377
+ "learning_rate": 8.103690430065866e-05,
378
+ "loss": 0.0003,
379
+ "step": 12420
380
  },
381
  {
382
+ "epoch": 4.14,
383
+ "learning_rate": 8.047994963192561e-05,
384
+ "loss": 0.0003,
385
+ "step": 12650
 
 
 
 
386
  },
387
  {
388
+ "epoch": 4.21,
389
+ "learning_rate": 7.992299496319256e-05,
390
+ "loss": 0.0003,
391
+ "step": 12880
392
  },
393
  {
394
+ "epoch": 4.29,
395
+ "learning_rate": 7.936604029445953e-05,
396
+ "loss": 0.0002,
397
+ "step": 13110
398
  },
399
  {
400
+ "epoch": 4.36,
401
+ "learning_rate": 7.880908562572646e-05,
402
+ "loss": 0.0003,
403
+ "step": 13340
404
  },
405
  {
406
+ "epoch": 4.44,
407
+ "learning_rate": 7.825213095699341e-05,
408
+ "loss": 0.0003,
409
+ "step": 13570
410
  },
411
  {
412
+ "epoch": 4.51,
413
+ "learning_rate": 7.769517628826036e-05,
414
+ "loss": 0.0003,
415
+ "step": 13800
416
  },
417
  {
418
+ "epoch": 4.59,
419
+ "learning_rate": 7.713822161952731e-05,
420
+ "loss": 0.0003,
421
+ "step": 14030
422
  },
423
  {
424
+ "epoch": 4.66,
425
+ "learning_rate": 7.658126695079428e-05,
426
+ "loss": 0.0002,
427
+ "step": 14260
428
  },
429
  {
430
+ "epoch": 4.74,
431
+ "learning_rate": 7.602431228206122e-05,
432
+ "loss": 0.0003,
433
+ "step": 14490
434
  },
435
  {
436
+ "epoch": 4.81,
437
+ "learning_rate": 7.546735761332817e-05,
438
+ "loss": 0.0003,
439
+ "step": 14720
 
 
 
 
440
  },
441
  {
442
+ "epoch": 4.89,
443
+ "learning_rate": 7.491040294459513e-05,
444
+ "loss": 0.0002,
445
+ "step": 14950
446
  },
447
  {
448
+ "epoch": 4.96,
449
+ "learning_rate": 7.435344827586207e-05,
450
+ "loss": 0.0003,
451
+ "step": 15180
452
  },
453
  {
454
+ "epoch": 5.0,
455
+ "eval_loss": 0.000761075527407229,
456
+ "eval_max_distance": 3,
457
+ "eval_mean_distance": 0,
458
+ "eval_runtime": 30.6946,
459
+ "eval_samples_per_second": 19.189,
460
+ "eval_steps_per_second": 0.13,
461
+ "step": 15295
462
  },
463
  {
464
+ "epoch": 5.04,
465
+ "learning_rate": 7.379649360712903e-05,
466
+ "loss": 0.0002,
467
+ "step": 15410
468
  },
469
  {
470
+ "epoch": 5.11,
471
+ "learning_rate": 7.323953893839598e-05,
472
+ "loss": 0.0002,
473
+ "step": 15640
474
  },
475
  {
476
+ "epoch": 5.19,
477
+ "learning_rate": 7.268258426966292e-05,
478
+ "loss": 0.0003,
479
+ "step": 15870
480
  },
481
  {
482
+ "epoch": 5.26,
483
+ "learning_rate": 7.212562960092988e-05,
484
+ "loss": 0.0002,
485
+ "step": 16100
486
  },
487
  {
488
+ "epoch": 5.34,
489
+ "learning_rate": 7.156867493219682e-05,
490
+ "loss": 0.0003,
491
+ "step": 16330
492
  },
493
  {
494
+ "epoch": 5.41,
495
+ "learning_rate": 7.101172026346378e-05,
496
+ "loss": 0.0002,
497
+ "step": 16560
 
 
 
 
498
  },
499
  {
500
+ "epoch": 5.49,
501
+ "learning_rate": 7.045476559473073e-05,
502
+ "loss": 0.0002,
503
+ "step": 16790
504
  },
505
  {
506
+ "epoch": 5.56,
507
+ "learning_rate": 6.989781092599767e-05,
508
+ "loss": 0.0003,
509
+ "step": 17020
510
  },
511
  {
512
+ "epoch": 5.64,
513
+ "learning_rate": 6.934085625726463e-05,
514
+ "loss": 0.0003,
515
+ "step": 17250
516
  },
517
  {
518
+ "epoch": 5.71,
519
+ "learning_rate": 6.878390158853158e-05,
520
  "loss": 0.0002,
521
+ "step": 17480
522
  },
523
  {
524
+ "epoch": 5.79,
525
+ "learning_rate": 6.822694691979853e-05,
526
+ "loss": 0.0003,
527
+ "step": 17710
528
  },
529
  {
530
+ "epoch": 5.86,
531
+ "learning_rate": 6.766999225106548e-05,
532
+ "loss": 0.0003,
533
+ "step": 17940
534
  },
535
  {
536
+ "epoch": 5.94,
537
+ "learning_rate": 6.711303758233242e-05,
538
+ "loss": 0.0002,
539
+ "step": 18170
540
  },
541
  {
542
+ "epoch": 6.0,
543
+ "eval_loss": 0.0009165782830677927,
544
+ "eval_max_distance": 3,
545
  "eval_mean_distance": 0,
546
+ "eval_runtime": 30.7001,
547
+ "eval_samples_per_second": 19.186,
548
+ "eval_steps_per_second": 0.13,
549
+ "step": 18354
 
 
 
 
 
 
550
  },
551
  {
552
+ "epoch": 6.02,
553
+ "learning_rate": 6.655608291359938e-05,
554
+ "loss": 0.0003,
555
+ "step": 18400
556
  },
557
  {
558
+ "epoch": 6.09,
559
+ "learning_rate": 6.599912824486634e-05,
560
+ "loss": 0.0002,
561
+ "step": 18630
562
  },
563
  {
564
+ "epoch": 6.17,
565
+ "learning_rate": 6.544217357613329e-05,
566
+ "loss": 0.0002,
567
+ "step": 18860
568
  },
569
  {
570
+ "epoch": 6.24,
571
+ "learning_rate": 6.488521890740024e-05,
572
+ "loss": 0.0003,
573
+ "step": 19090
574
  },
575
  {
576
+ "epoch": 6.32,
577
+ "learning_rate": 6.432826423866719e-05,
578
+ "loss": 0.0002,
579
+ "step": 19320
580
  },
581
  {
582
+ "epoch": 6.39,
583
+ "learning_rate": 6.377130956993414e-05,
584
+ "loss": 0.0002,
585
+ "step": 19550
586
  },
587
  {
588
+ "epoch": 6.47,
589
+ "learning_rate": 6.321435490120109e-05,
590
+ "loss": 0.0003,
591
+ "step": 19780
592
  },
593
  {
594
+ "epoch": 6.54,
595
+ "learning_rate": 6.265740023246804e-05,
596
+ "loss": 0.0002,
597
+ "step": 20010
 
 
 
 
598
  },
599
  {
600
+ "epoch": 6.62,
601
+ "learning_rate": 6.210044556373499e-05,
602
+ "loss": 0.0002,
603
+ "step": 20240
604
  },
605
  {
606
+ "epoch": 6.69,
607
+ "learning_rate": 6.154349089500194e-05,
608
+ "loss": 0.0003,
609
+ "step": 20470
610
  },
611
  {
612
+ "epoch": 6.77,
613
+ "learning_rate": 6.098653622626889e-05,
614
+ "loss": 0.0002,
615
+ "step": 20700
616
  },
617
  {
618
+ "epoch": 6.84,
619
+ "learning_rate": 6.042958155753584e-05,
620
+ "loss": 0.0002,
621
+ "step": 20930
622
  },
623
  {
624
+ "epoch": 6.92,
625
+ "learning_rate": 5.9872626888802796e-05,
626
+ "loss": 0.0003,
627
+ "step": 21160
628
  },
629
  {
630
+ "epoch": 6.99,
631
+ "learning_rate": 5.931567222006974e-05,
632
+ "loss": 0.0002,
633
+ "step": 21390
634
  },
635
  {
636
+ "epoch": 7.0,
637
+ "eval_loss": 0.0007532148738391697,
638
+ "eval_max_distance": 3,
639
+ "eval_mean_distance": 0,
640
+ "eval_runtime": 31.1332,
641
+ "eval_samples_per_second": 18.919,
642
+ "eval_steps_per_second": 0.128,
643
+ "step": 21413
644
  },
645
  {
646
+ "epoch": 7.07,
647
+ "learning_rate": 5.87587175513367e-05,
648
+ "loss": 0.0003,
649
+ "step": 21620
650
  },
651
  {
652
+ "epoch": 7.14,
653
+ "learning_rate": 5.820176288260365e-05,
654
+ "loss": 0.0002,
655
+ "step": 21850
 
 
 
 
656
  },
657
  {
658
+ "epoch": 7.22,
659
+ "learning_rate": 5.764480821387059e-05,
660
+ "loss": 0.0002,
661
+ "step": 22080
662
  },
663
  {
664
+ "epoch": 7.29,
665
+ "learning_rate": 5.708785354513755e-05,
666
+ "loss": 0.0002,
667
+ "step": 22310
668
  },
669
  {
670
+ "epoch": 7.37,
671
+ "learning_rate": 5.653089887640449e-05,
672
+ "loss": 0.0003,
673
+ "step": 22540
674
  },
675
  {
676
+ "epoch": 7.44,
677
+ "learning_rate": 5.597394420767145e-05,
678
+ "loss": 0.0003,
679
+ "step": 22770
680
  },
681
  {
682
+ "epoch": 7.52,
683
+ "learning_rate": 5.54169895389384e-05,
684
+ "loss": 0.0002,
685
+ "step": 23000
686
  },
687
  {
688
+ "epoch": 7.59,
689
+ "learning_rate": 5.486003487020534e-05,
690
+ "loss": 0.0002,
691
+ "step": 23230
692
  },
693
  {
694
+ "epoch": 7.67,
695
+ "learning_rate": 5.43030802014723e-05,
696
+ "loss": 0.0003,
697
+ "step": 23460
698
  },
699
  {
700
+ "epoch": 7.74,
701
+ "learning_rate": 5.374612553273926e-05,
702
+ "loss": 0.0002,
703
+ "step": 23690
 
 
 
 
704
  },
705
  {
706
+ "epoch": 7.82,
707
+ "learning_rate": 5.31891708640062e-05,
708
+ "loss": 0.0002,
709
+ "step": 23920
710
  },
711
  {
712
+ "epoch": 7.89,
713
+ "learning_rate": 5.263221619527316e-05,
714
+ "loss": 0.0002,
715
+ "step": 24150
716
  },
717
  {
718
+ "epoch": 7.97,
719
+ "learning_rate": 5.20752615265401e-05,
720
+ "loss": 0.0002,
721
+ "step": 24380
722
  },
723
  {
724
+ "epoch": 8.0,
725
+ "eval_loss": 0.0007926285616122186,
726
+ "eval_max_distance": 3,
727
+ "eval_mean_distance": 0,
728
+ "eval_runtime": 30.7407,
729
+ "eval_samples_per_second": 19.16,
730
+ "eval_steps_per_second": 0.13,
731
+ "step": 24472
732
  },
733
  {
734
+ "epoch": 8.05,
735
+ "learning_rate": 5.151830685780705e-05,
736
+ "loss": 0.0002,
737
+ "step": 24610
738
  },
739
  {
740
+ "epoch": 8.12,
741
+ "learning_rate": 5.096135218907401e-05,
742
+ "loss": 0.0002,
743
+ "step": 24840
744
  },
745
  {
746
+ "epoch": 8.2,
747
+ "learning_rate": 5.040439752034095e-05,
748
+ "loss": 0.0002,
749
+ "step": 25070
750
  },
751
  {
752
+ "epoch": 8.27,
753
+ "learning_rate": 4.984744285160791e-05,
754
+ "loss": 0.0002,
755
+ "step": 25300
756
  },
757
  {
758
+ "epoch": 8.35,
759
+ "learning_rate": 4.929048818287485e-05,
760
+ "loss": 0.0002,
761
+ "step": 25530
 
 
 
 
762
  },
763
  {
764
+ "epoch": 8.42,
765
+ "learning_rate": 4.873353351414181e-05,
766
+ "loss": 0.0002,
767
+ "step": 25760
768
  },
769
  {
770
+ "epoch": 8.5,
771
+ "learning_rate": 4.817657884540876e-05,
772
+ "loss": 0.0002,
773
+ "step": 25990
774
  },
775
  {
776
+ "epoch": 8.57,
777
+ "learning_rate": 4.761962417667571e-05,
778
+ "loss": 0.0002,
779
+ "step": 26220
780
  },
781
  {
782
+ "epoch": 8.65,
783
+ "learning_rate": 4.706266950794266e-05,
784
+ "loss": 0.0002,
785
+ "step": 26450
786
  },
787
  {
788
+ "epoch": 8.72,
789
+ "learning_rate": 4.650571483920961e-05,
790
+ "loss": 0.0002,
791
+ "step": 26680
792
  },
793
  {
794
+ "epoch": 8.8,
795
+ "learning_rate": 4.594876017047656e-05,
796
+ "loss": 0.0002,
797
+ "step": 26910
798
  },
799
  {
800
+ "epoch": 8.87,
801
+ "learning_rate": 4.539180550174351e-05,
802
+ "loss": 0.0003,
803
+ "step": 27140
804
  },
805
  {
806
+ "epoch": 8.95,
807
+ "learning_rate": 4.483485083301046e-05,
808
+ "loss": 0.0002,
809
+ "step": 27370
810
  },
811
  {
812
+ "epoch": 9.0,
813
+ "eval_loss": 0.0007057118928059936,
814
  "eval_max_distance": 3,
815
  "eval_mean_distance": 0,
816
+ "eval_runtime": 30.795,
817
+ "eval_samples_per_second": 19.127,
818
+ "eval_steps_per_second": 0.13,
819
+ "step": 27531
820
  },
821
  {
822
+ "epoch": 9.02,
823
+ "learning_rate": 4.427789616427741e-05,
824
+ "loss": 0.0002,
825
+ "step": 27600
826
  },
827
  {
828
+ "epoch": 9.1,
829
+ "learning_rate": 4.372094149554436e-05,
830
+ "loss": 0.0002,
831
+ "step": 27830
832
  },
833
  {
834
+ "epoch": 9.17,
835
+ "learning_rate": 4.3163986826811313e-05,
836
+ "loss": 0.0002,
837
+ "step": 28060
838
  },
839
  {
840
+ "epoch": 9.25,
841
+ "learning_rate": 4.2607032158078264e-05,
842
+ "loss": 0.0002,
843
+ "step": 28290
844
  },
845
  {
846
+ "epoch": 9.32,
847
+ "learning_rate": 4.205007748934522e-05,
848
+ "loss": 0.0002,
849
+ "step": 28520
850
  },
851
  {
852
+ "epoch": 9.4,
853
+ "learning_rate": 4.149312282061217e-05,
854
+ "loss": 0.0002,
855
+ "step": 28750
856
  },
857
  {
858
+ "epoch": 9.47,
859
+ "learning_rate": 4.0936168151879115e-05,
860
+ "loss": 0.0002,
861
+ "step": 28980
862
  },
863
  {
864
+ "epoch": 9.55,
865
+ "learning_rate": 4.0379213483146065e-05,
866
+ "loss": 0.0002,
867
+ "step": 29210
 
 
 
 
868
  },
869
  {
870
+ "epoch": 9.62,
871
+ "learning_rate": 3.982225881441302e-05,
872
+ "loss": 0.0002,
873
+ "step": 29440
874
  },
875
  {
876
+ "epoch": 9.7,
877
+ "learning_rate": 3.926530414567997e-05,
878
  "loss": 0.0003,
879
+ "step": 29670
880
  },
881
  {
882
+ "epoch": 9.77,
883
+ "learning_rate": 3.870834947694692e-05,
884
+ "loss": 0.0002,
885
+ "step": 29900
886
  },
887
  {
888
+ "epoch": 9.85,
889
+ "learning_rate": 3.8151394808213873e-05,
890
+ "loss": 0.0002,
891
+ "step": 30130
892
  },
893
  {
894
+ "epoch": 9.92,
895
+ "learning_rate": 3.7594440139480824e-05,
896
+ "loss": 0.0002,
897
+ "step": 30360
898
  },
899
  {
900
+ "epoch": 10.0,
901
+ "learning_rate": 3.7037485470747774e-05,
902
+ "loss": 0.0002,
903
+ "step": 30590
904
  },
905
  {
906
+ "epoch": 10.0,
907
+ "eval_loss": 0.000787499884609133,
908
+ "eval_max_distance": 3,
909
+ "eval_mean_distance": 0,
910
+ "eval_runtime": 30.7843,
911
+ "eval_samples_per_second": 19.133,
912
+ "eval_steps_per_second": 0.13,
913
+ "step": 30590
914
  },
915
  {
916
+ "epoch": 10.08,
917
+ "learning_rate": 3.6480530802014724e-05,
918
+ "loss": 0.0002,
919
+ "step": 30820
920
  },
921
  {
922
+ "epoch": 10.15,
923
+ "learning_rate": 3.5923576133281675e-05,
924
+ "loss": 0.0002,
925
+ "step": 31050
 
 
 
 
926
  },
927
  {
928
+ "epoch": 10.23,
929
+ "learning_rate": 3.5366621464548625e-05,
930
+ "loss": 0.0002,
931
+ "step": 31280
932
  },
933
  {
934
+ "epoch": 10.3,
935
+ "learning_rate": 3.4809666795815576e-05,
936
+ "loss": 0.0002,
937
+ "step": 31510
938
  },
939
  {
940
+ "epoch": 10.38,
941
+ "learning_rate": 3.4252712127082526e-05,
942
+ "loss": 0.0002,
943
+ "step": 31740
944
  },
945
  {
946
+ "epoch": 10.45,
947
+ "learning_rate": 3.369575745834948e-05,
948
+ "loss": 0.0002,
949
+ "step": 31970
950
  },
951
  {
952
+ "epoch": 10.53,
953
+ "learning_rate": 3.313880278961643e-05,
954
  "loss": 0.0002,
955
+ "step": 32200
956
  },
957
  {
958
+ "epoch": 10.6,
959
+ "learning_rate": 3.258184812088338e-05,
960
+ "loss": 0.0002,
961
+ "step": 32430
962
  },
963
  {
964
+ "epoch": 10.68,
965
+ "learning_rate": 3.202489345215033e-05,
966
+ "loss": 0.0002,
967
+ "step": 32660
968
  },
969
  {
970
+ "epoch": 10.75,
971
+ "learning_rate": 3.1467938783417284e-05,
972
+ "loss": 0.0002,
973
+ "step": 32890
974
  },
975
  {
976
+ "epoch": 10.83,
977
+ "learning_rate": 3.0910984114684235e-05,
978
+ "loss": 0.0002,
979
+ "step": 33120
 
 
 
 
980
  },
981
  {
982
+ "epoch": 10.9,
983
+ "learning_rate": 3.0354029445951182e-05,
984
+ "loss": 0.0002,
985
+ "step": 33350
986
  },
987
  {
988
+ "epoch": 10.98,
989
+ "learning_rate": 2.9797074777218132e-05,
990
+ "loss": 0.0002,
991
+ "step": 33580
992
  },
993
  {
994
+ "epoch": 11.0,
995
+ "eval_loss": 0.000802784226834774,
996
+ "eval_max_distance": 3,
997
+ "eval_mean_distance": 0,
998
+ "eval_runtime": 30.8756,
999
+ "eval_samples_per_second": 19.077,
1000
+ "eval_steps_per_second": 0.13,
1001
+ "step": 33649
1002
  },
1003
  {
1004
+ "epoch": 11.05,
1005
+ "learning_rate": 2.9240120108485086e-05,
1006
+ "loss": 0.0002,
1007
+ "step": 33810
1008
  },
1009
  {
1010
+ "epoch": 11.13,
1011
+ "learning_rate": 2.8683165439752036e-05,
1012
+ "loss": 0.0002,
1013
+ "step": 34040
1014
  },
1015
  {
1016
+ "epoch": 11.2,
1017
+ "learning_rate": 2.8126210771018983e-05,
1018
+ "loss": 0.0002,
1019
+ "step": 34270
1020
  },
1021
  {
1022
+ "epoch": 11.28,
1023
+ "learning_rate": 2.756925610228594e-05,
1024
+ "loss": 0.0002,
1025
+ "step": 34500
1026
  },
1027
  {
1028
+ "epoch": 11.35,
1029
+ "learning_rate": 2.7012301433552887e-05,
1030
+ "loss": 0.0002,
1031
+ "step": 34730
 
 
 
 
1032
  },
1033
  {
1034
+ "epoch": 11.43,
1035
+ "learning_rate": 2.6455346764819838e-05,
1036
+ "loss": 0.0002,
1037
+ "step": 34960
1038
  },
1039
  {
1040
+ "epoch": 11.5,
1041
+ "learning_rate": 2.5898392096086788e-05,
1042
+ "loss": 0.0002,
1043
+ "step": 35190
1044
  },
1045
  {
1046
+ "epoch": 11.58,
1047
+ "learning_rate": 2.5341437427353742e-05,
1048
+ "loss": 0.0002,
1049
+ "step": 35420
1050
  },
1051
  {
1052
+ "epoch": 11.65,
1053
+ "learning_rate": 2.4784482758620692e-05,
1054
+ "loss": 0.0002,
1055
+ "step": 35650
1056
  },
1057
  {
1058
+ "epoch": 11.73,
1059
+ "learning_rate": 2.4227528089887643e-05,
1060
+ "loss": 0.0002,
1061
+ "step": 35880
1062
  },
1063
  {
1064
+ "epoch": 11.8,
1065
+ "learning_rate": 2.3670573421154593e-05,
1066
+ "loss": 0.0002,
1067
+ "step": 36110
1068
  },
1069
  {
1070
+ "epoch": 11.88,
1071
+ "learning_rate": 2.3113618752421543e-05,
1072
+ "loss": 0.0002,
1073
+ "step": 36340
1074
  },
1075
  {
1076
+ "epoch": 11.95,
1077
+ "learning_rate": 2.2556664083688494e-05,
1078
+ "loss": 0.0002,
1079
+ "step": 36570
1080
  },
1081
  {
1082
+ "epoch": 12.0,
1083
+ "eval_loss": 0.0007881763740442693,
1084
  "eval_max_distance": 3,
1085
  "eval_mean_distance": 0,
1086
+ "eval_runtime": 30.7194,
1087
+ "eval_samples_per_second": 19.174,
1088
+ "eval_steps_per_second": 0.13,
1089
+ "step": 36708
1090
  },
1091
  {
1092
+ "epoch": 12.03,
1093
+ "learning_rate": 2.1999709414955444e-05,
1094
+ "loss": 0.0002,
1095
+ "step": 36800
1096
  },
1097
  {
1098
+ "epoch": 12.11,
1099
+ "learning_rate": 2.1442754746222398e-05,
1100
+ "loss": 0.0002,
1101
+ "step": 37030
1102
  },
1103
  {
1104
+ "epoch": 12.18,
1105
+ "learning_rate": 2.0885800077489345e-05,
1106
+ "loss": 0.0002,
1107
+ "step": 37260
1108
  },
1109
  {
1110
+ "epoch": 12.26,
1111
+ "learning_rate": 2.03288454087563e-05,
1112
+ "loss": 0.0002,
1113
+ "step": 37490
1114
  },
1115
  {
1116
+ "epoch": 12.33,
1117
+ "learning_rate": 1.9771890740023245e-05,
1118
+ "loss": 0.0002,
1119
+ "step": 37720
1120
  },
1121
  {
1122
+ "epoch": 12.41,
1123
+ "learning_rate": 1.92149360712902e-05,
1124
  "loss": 0.0002,
1125
+ "step": 37950
1126
  },
1127
  {
1128
+ "epoch": 12.48,
1129
+ "learning_rate": 1.865798140255715e-05,
1130
+ "loss": 0.0002,
1131
+ "step": 38180
1132
  },
1133
  {
1134
+ "epoch": 12.56,
1135
+ "learning_rate": 1.81010267338241e-05,
1136
+ "loss": 0.0002,
1137
+ "step": 38410
1138
  },
1139
  {
1140
+ "epoch": 12.63,
1141
+ "learning_rate": 1.754407206509105e-05,
1142
+ "loss": 0.0002,
1143
+ "step": 38640
 
 
 
 
1144
  },
1145
  {
1146
+ "epoch": 12.71,
1147
+ "learning_rate": 1.6987117396358e-05,
1148
+ "loss": 0.0002,
1149
+ "step": 38870
1150
  },
1151
  {
1152
+ "epoch": 12.78,
1153
+ "learning_rate": 1.6430162727624954e-05,
1154
+ "loss": 0.0002,
1155
+ "step": 39100
1156
  },
1157
  {
1158
+ "epoch": 12.86,
1159
+ "learning_rate": 1.5873208058891905e-05,
1160
+ "loss": 0.0002,
1161
+ "step": 39330
1162
  },
1163
  {
1164
+ "epoch": 12.93,
1165
+ "learning_rate": 1.5316253390158855e-05,
1166
+ "loss": 0.0002,
1167
+ "step": 39560
1168
  },
1169
  {
1170
+ "epoch": 13.0,
1171
+ "eval_loss": 0.000838827807456255,
1172
+ "eval_max_distance": 3,
1173
+ "eval_mean_distance": 0,
1174
+ "eval_runtime": 30.7487,
1175
+ "eval_samples_per_second": 19.155,
1176
+ "eval_steps_per_second": 0.13,
1177
+ "step": 39767
1178
  },
1179
  {
1180
+ "epoch": 13.01,
1181
+ "learning_rate": 1.4759298721425804e-05,
1182
+ "loss": 0.0002,
1183
+ "step": 39790
1184
  },
1185
  {
1186
+ "epoch": 13.08,
1187
+ "learning_rate": 1.4202344052692756e-05,
1188
+ "loss": 0.0002,
1189
+ "step": 40020
1190
  },
1191
  {
1192
+ "epoch": 13.16,
1193
+ "learning_rate": 1.3645389383959706e-05,
1194
+ "loss": 0.0002,
1195
+ "step": 40250
 
 
 
 
1196
  },
1197
  {
1198
+ "epoch": 13.23,
1199
+ "learning_rate": 1.3088434715226658e-05,
1200
+ "loss": 0.0002,
1201
+ "step": 40480
1202
  },
1203
  {
1204
+ "epoch": 13.31,
1205
+ "learning_rate": 1.2531480046493607e-05,
1206
+ "loss": 0.0002,
1207
+ "step": 40710
1208
  },
1209
  {
1210
+ "epoch": 13.38,
1211
+ "learning_rate": 1.1974525377760557e-05,
1212
  "loss": 0.0002,
1213
+ "step": 40940
1214
  },
1215
  {
1216
+ "epoch": 13.46,
1217
+ "learning_rate": 1.141757070902751e-05,
1218
+ "loss": 0.0002,
1219
+ "step": 41170
1220
  },
1221
  {
1222
+ "epoch": 13.53,
1223
+ "learning_rate": 1.0860616040294461e-05,
1224
+ "loss": 0.0002,
1225
+ "step": 41400
1226
  },
1227
  {
1228
+ "epoch": 13.61,
1229
+ "learning_rate": 1.0303661371561412e-05,
1230
+ "loss": 0.0002,
1231
+ "step": 41630
1232
  },
1233
  {
1234
+ "epoch": 13.68,
1235
+ "learning_rate": 9.746706702828362e-06,
1236
+ "loss": 0.0002,
1237
+ "step": 41860
1238
  },
1239
  {
1240
+ "epoch": 13.76,
1241
+ "learning_rate": 9.189752034095312e-06,
1242
+ "loss": 0.0002,
1243
+ "step": 42090
1244
  },
1245
  {
1246
+ "epoch": 13.83,
1247
+ "learning_rate": 8.632797365362263e-06,
1248
+ "loss": 0.0002,
1249
+ "step": 42320
 
 
 
 
1250
  },
1251
  {
1252
+ "epoch": 13.91,
1253
+ "learning_rate": 8.075842696629215e-06,
1254
+ "loss": 0.0002,
1255
+ "step": 42550
1256
  },
1257
  {
1258
+ "epoch": 13.98,
1259
+ "learning_rate": 7.518888027896165e-06,
1260
+ "loss": 0.0002,
1261
+ "step": 42780
1262
  },
1263
  {
1264
+ "epoch": 14.0,
1265
+ "eval_loss": 0.0008361997315660119,
1266
+ "eval_max_distance": 3,
1267
+ "eval_mean_distance": 0,
1268
+ "eval_runtime": 30.9876,
1269
+ "eval_samples_per_second": 19.008,
1270
+ "eval_steps_per_second": 0.129,
1271
+ "step": 42826
1272
  },
1273
  {
1274
+ "epoch": 14.06,
1275
+ "learning_rate": 6.9619333591631155e-06,
1276
+ "loss": 0.0002,
1277
+ "step": 43010
1278
  },
1279
  {
1280
+ "epoch": 14.14,
1281
+ "learning_rate": 6.404978690430067e-06,
1282
+ "loss": 0.0002,
1283
+ "step": 43240
1284
  },
1285
  {
1286
+ "epoch": 14.21,
1287
+ "learning_rate": 5.848024021697017e-06,
1288
+ "loss": 0.0002,
1289
+ "step": 43470
1290
  },
1291
  {
1292
+ "epoch": 14.29,
1293
+ "learning_rate": 5.291069352963967e-06,
1294
+ "loss": 0.0002,
1295
+ "step": 43700
1296
  },
1297
  {
1298
+ "epoch": 14.36,
1299
+ "learning_rate": 4.7341146842309186e-06,
1300
+ "loss": 0.0002,
1301
+ "step": 43930
1302
  },
1303
  {
1304
+ "epoch": 14.44,
1305
+ "learning_rate": 4.177160015497869e-06,
1306
+ "loss": 0.0002,
1307
+ "step": 44160
 
 
 
 
1308
  },
1309
  {
1310
+ "epoch": 14.51,
1311
+ "learning_rate": 3.6202053467648197e-06,
1312
+ "loss": 0.0002,
1313
+ "step": 44390
1314
  },
1315
  {
1316
+ "epoch": 14.59,
1317
+ "learning_rate": 3.063250678031771e-06,
1318
+ "loss": 0.0002,
1319
+ "step": 44620
1320
  },
1321
  {
1322
+ "epoch": 14.66,
1323
+ "learning_rate": 2.5062960092987217e-06,
1324
+ "loss": 0.0002,
1325
+ "step": 44850
1326
  },
1327
  {
1328
+ "epoch": 14.74,
1329
+ "learning_rate": 1.949341340565672e-06,
1330
+ "loss": 0.0002,
1331
+ "step": 45080
1332
  },
1333
  {
1334
+ "epoch": 14.81,
1335
+ "learning_rate": 1.392386671832623e-06,
1336
+ "loss": 0.0002,
1337
+ "step": 45310
1338
  },
1339
  {
1340
+ "epoch": 14.89,
1341
+ "learning_rate": 8.354320030995738e-07,
1342
+ "loss": 0.0002,
1343
+ "step": 45540
1344
  },
1345
  {
1346
+ "epoch": 14.96,
1347
+ "learning_rate": 2.784773343665246e-07,
1348
+ "loss": 0.0002,
1349
+ "step": 45770
1350
  },
1351
  {
1352
+ "epoch": 15.0,
1353
+ "eval_loss": 0.0008401814266107976,
1354
+ "eval_max_distance": 3,
1355
  "eval_mean_distance": 0,
1356
+ "eval_runtime": 30.8673,
1357
+ "eval_samples_per_second": 19.082,
1358
+ "eval_steps_per_second": 0.13,
1359
+ "step": 45885
1360
+ },
1361
+ {
1362
+ "epoch": 15.0,
1363
+ "step": 45885,
1364
+ "total_flos": 7.988818341631795e+16,
1365
+ "train_loss": 0.0002746593983470409,
1366
+ "train_runtime": 6476.6732,
1367
+ "train_samples_per_second": 531.266,
1368
+ "train_steps_per_second": 7.085
1369
  }
1370
  ],
1371
+ "logging_steps": 230,
1372
+ "max_steps": 45885,
1373
+ "num_input_tokens_seen": 0,
1374
+ "num_train_epochs": 15,
1375
+ "save_steps": 4589,
1376
+ "total_flos": 7.988818341631795e+16,
1377
+ "train_batch_size": 75,
1378
  "trial_name": null,
1379
  "trial_params": null
1380
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c1715be697a9be6fe0784bf769267cffcc821d071738c8fdd99dec6f06f39db0
3
- size 4091
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1eb73bbd8d63a4a4a349732132a9ced8e0528dece7669b9d0a3ba1f248d5ec37
3
+ size 4792