mmichall commited on
Commit
a17f1b7
1 Parent(s): bd4a71e

Upload 13 files

Browse files
config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "gpt2",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2ParaphrasingLM"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 768,
16
+ "n_head": 12,
17
+ "n_inner": null,
18
+ "n_layer": 12,
19
+ "n_positions": 1024,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "task_specific_params": {
30
+ "text-generation": {
31
+ "do_sample": true,
32
+ "max_length": 50
33
+ }
34
+ },
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.33.2",
37
+ "use_cache": true,
38
+ "vocab_size": 50257
39
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.33.2"
6
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96fb7cf2c92d47a6083f32b6629c44bf68c7ba068b2fda834e2d53e4bd9d93de
3
+ size 995605509
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e65e156dbb38a78bc17d121aeb0d08609ef356190299cd83c1fd3b644cbc4ee
3
+ size 497807197
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25cae035dc0dd096af606f71b6c7ae2de5a58267a66ea9e3cc543f1ce0cfbee2
3
+ size 14575
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94e145e476f451fda47cdd72825a298a7721eb33bd14c0f954e3edae795b5418
3
+ size 627
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|endoftext|>",
5
+ "unk_token": "<|endoftext|>"
6
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<|endoftext|>",
4
+ "clean_up_tokenization_spaces": true,
5
+ "eos_token": "<|endoftext|>",
6
+ "model_max_length": 128,
7
+ "tokenizer_class": "GPT2Tokenizer",
8
+ "unk_token": "<|endoftext|>"
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,2399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 29.746727859935408,
5
+ "eval_steps": 5000,
6
+ "global_step": 175000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.08,
13
+ "learning_rate": 4.987500000000001e-06,
14
+ "loss": 1.4907,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.17,
19
+ "learning_rate": 4.975000000000001e-06,
20
+ "loss": 1.3231,
21
+ "step": 1000
22
+ },
23
+ {
24
+ "epoch": 0.25,
25
+ "learning_rate": 4.9625e-06,
26
+ "loss": 1.3028,
27
+ "step": 1500
28
+ },
29
+ {
30
+ "epoch": 0.34,
31
+ "learning_rate": 4.95e-06,
32
+ "loss": 1.2531,
33
+ "step": 2000
34
+ },
35
+ {
36
+ "epoch": 0.42,
37
+ "learning_rate": 4.937500000000001e-06,
38
+ "loss": 1.219,
39
+ "step": 2500
40
+ },
41
+ {
42
+ "epoch": 0.51,
43
+ "learning_rate": 4.925e-06,
44
+ "loss": 1.177,
45
+ "step": 3000
46
+ },
47
+ {
48
+ "epoch": 0.59,
49
+ "learning_rate": 4.912500000000001e-06,
50
+ "loss": 1.1509,
51
+ "step": 3500
52
+ },
53
+ {
54
+ "epoch": 0.68,
55
+ "learning_rate": 4.9000000000000005e-06,
56
+ "loss": 1.1144,
57
+ "step": 4000
58
+ },
59
+ {
60
+ "epoch": 0.76,
61
+ "learning_rate": 4.8875e-06,
62
+ "loss": 1.0829,
63
+ "step": 4500
64
+ },
65
+ {
66
+ "epoch": 0.85,
67
+ "learning_rate": 4.875e-06,
68
+ "loss": 1.0648,
69
+ "step": 5000
70
+ },
71
+ {
72
+ "epoch": 0.85,
73
+ "eval_loss": 1.3415521383285522,
74
+ "eval_runtime": 31.2796,
75
+ "eval_samples_per_second": 375.261,
76
+ "eval_steps_per_second": 46.932,
77
+ "step": 5000
78
+ },
79
+ {
80
+ "epoch": 0.93,
81
+ "learning_rate": 4.8625000000000005e-06,
82
+ "loss": 1.0567,
83
+ "step": 5500
84
+ },
85
+ {
86
+ "epoch": 1.02,
87
+ "learning_rate": 4.85e-06,
88
+ "loss": 1.0334,
89
+ "step": 6000
90
+ },
91
+ {
92
+ "epoch": 1.1,
93
+ "learning_rate": 4.837500000000001e-06,
94
+ "loss": 0.9995,
95
+ "step": 6500
96
+ },
97
+ {
98
+ "epoch": 1.19,
99
+ "learning_rate": 4.825e-06,
100
+ "loss": 0.9909,
101
+ "step": 7000
102
+ },
103
+ {
104
+ "epoch": 1.27,
105
+ "learning_rate": 4.8125e-06,
106
+ "loss": 0.9687,
107
+ "step": 7500
108
+ },
109
+ {
110
+ "epoch": 1.36,
111
+ "learning_rate": 4.800000000000001e-06,
112
+ "loss": 0.9444,
113
+ "step": 8000
114
+ },
115
+ {
116
+ "epoch": 1.44,
117
+ "learning_rate": 4.7875e-06,
118
+ "loss": 0.9547,
119
+ "step": 8500
120
+ },
121
+ {
122
+ "epoch": 1.53,
123
+ "learning_rate": 4.775e-06,
124
+ "loss": 0.9217,
125
+ "step": 9000
126
+ },
127
+ {
128
+ "epoch": 1.61,
129
+ "learning_rate": 4.7625000000000006e-06,
130
+ "loss": 0.9121,
131
+ "step": 9500
132
+ },
133
+ {
134
+ "epoch": 1.7,
135
+ "learning_rate": 4.75e-06,
136
+ "loss": 0.9045,
137
+ "step": 10000
138
+ },
139
+ {
140
+ "epoch": 1.7,
141
+ "eval_loss": 1.1334267854690552,
142
+ "eval_runtime": 33.2532,
143
+ "eval_samples_per_second": 352.989,
144
+ "eval_steps_per_second": 44.146,
145
+ "step": 10000
146
+ },
147
+ {
148
+ "epoch": 1.78,
149
+ "learning_rate": 4.737500000000001e-06,
150
+ "loss": 0.8987,
151
+ "step": 10500
152
+ },
153
+ {
154
+ "epoch": 1.87,
155
+ "learning_rate": 4.7250000000000005e-06,
156
+ "loss": 0.8923,
157
+ "step": 11000
158
+ },
159
+ {
160
+ "epoch": 1.95,
161
+ "learning_rate": 4.7125e-06,
162
+ "loss": 0.8711,
163
+ "step": 11500
164
+ },
165
+ {
166
+ "epoch": 2.04,
167
+ "learning_rate": 4.7e-06,
168
+ "loss": 0.8655,
169
+ "step": 12000
170
+ },
171
+ {
172
+ "epoch": 2.12,
173
+ "learning_rate": 4.6875000000000004e-06,
174
+ "loss": 0.8458,
175
+ "step": 12500
176
+ },
177
+ {
178
+ "epoch": 2.21,
179
+ "learning_rate": 4.675000000000001e-06,
180
+ "loss": 0.8454,
181
+ "step": 13000
182
+ },
183
+ {
184
+ "epoch": 2.29,
185
+ "learning_rate": 4.662500000000001e-06,
186
+ "loss": 0.822,
187
+ "step": 13500
188
+ },
189
+ {
190
+ "epoch": 2.38,
191
+ "learning_rate": 4.65e-06,
192
+ "loss": 0.8252,
193
+ "step": 14000
194
+ },
195
+ {
196
+ "epoch": 2.46,
197
+ "learning_rate": 4.6375e-06,
198
+ "loss": 0.8105,
199
+ "step": 14500
200
+ },
201
+ {
202
+ "epoch": 2.55,
203
+ "learning_rate": 4.625000000000001e-06,
204
+ "loss": 0.8092,
205
+ "step": 15000
206
+ },
207
+ {
208
+ "epoch": 2.55,
209
+ "eval_loss": 1.0175799131393433,
210
+ "eval_runtime": 30.8472,
211
+ "eval_samples_per_second": 380.521,
212
+ "eval_steps_per_second": 47.589,
213
+ "step": 15000
214
+ },
215
+ {
216
+ "epoch": 2.63,
217
+ "learning_rate": 4.6125e-06,
218
+ "loss": 0.8106,
219
+ "step": 15500
220
+ },
221
+ {
222
+ "epoch": 2.72,
223
+ "learning_rate": 4.600000000000001e-06,
224
+ "loss": 0.8025,
225
+ "step": 16000
226
+ },
227
+ {
228
+ "epoch": 2.8,
229
+ "learning_rate": 4.5875000000000005e-06,
230
+ "loss": 0.7931,
231
+ "step": 16500
232
+ },
233
+ {
234
+ "epoch": 2.89,
235
+ "learning_rate": 4.575e-06,
236
+ "loss": 0.8081,
237
+ "step": 17000
238
+ },
239
+ {
240
+ "epoch": 2.97,
241
+ "learning_rate": 4.5625e-06,
242
+ "loss": 0.7895,
243
+ "step": 17500
244
+ },
245
+ {
246
+ "epoch": 3.06,
247
+ "learning_rate": 4.5500000000000005e-06,
248
+ "loss": 0.7828,
249
+ "step": 18000
250
+ },
251
+ {
252
+ "epoch": 3.14,
253
+ "learning_rate": 4.5375e-06,
254
+ "loss": 0.7617,
255
+ "step": 18500
256
+ },
257
+ {
258
+ "epoch": 3.23,
259
+ "learning_rate": 4.525000000000001e-06,
260
+ "loss": 0.756,
261
+ "step": 19000
262
+ },
263
+ {
264
+ "epoch": 3.31,
265
+ "learning_rate": 4.5125e-06,
266
+ "loss": 0.7465,
267
+ "step": 19500
268
+ },
269
+ {
270
+ "epoch": 3.4,
271
+ "learning_rate": 4.5e-06,
272
+ "loss": 0.7432,
273
+ "step": 20000
274
+ },
275
+ {
276
+ "epoch": 3.4,
277
+ "eval_loss": 0.9422996044158936,
278
+ "eval_runtime": 28.729,
279
+ "eval_samples_per_second": 408.577,
280
+ "eval_steps_per_second": 51.098,
281
+ "step": 20000
282
+ },
283
+ {
284
+ "epoch": 3.48,
285
+ "learning_rate": 4.4875e-06,
286
+ "loss": 0.7522,
287
+ "step": 20500
288
+ },
289
+ {
290
+ "epoch": 3.57,
291
+ "learning_rate": 4.475e-06,
292
+ "loss": 0.7471,
293
+ "step": 21000
294
+ },
295
+ {
296
+ "epoch": 3.65,
297
+ "learning_rate": 4.4625e-06,
298
+ "loss": 0.7368,
299
+ "step": 21500
300
+ },
301
+ {
302
+ "epoch": 3.74,
303
+ "learning_rate": 4.450000000000001e-06,
304
+ "loss": 0.7467,
305
+ "step": 22000
306
+ },
307
+ {
308
+ "epoch": 3.82,
309
+ "learning_rate": 4.4375e-06,
310
+ "loss": 0.733,
311
+ "step": 22500
312
+ },
313
+ {
314
+ "epoch": 3.91,
315
+ "learning_rate": 4.425e-06,
316
+ "loss": 0.7281,
317
+ "step": 23000
318
+ },
319
+ {
320
+ "epoch": 3.99,
321
+ "learning_rate": 4.4125000000000005e-06,
322
+ "loss": 0.7269,
323
+ "step": 23500
324
+ },
325
+ {
326
+ "epoch": 4.08,
327
+ "learning_rate": 4.4e-06,
328
+ "loss": 0.7234,
329
+ "step": 24000
330
+ },
331
+ {
332
+ "epoch": 4.16,
333
+ "learning_rate": 4.3875e-06,
334
+ "loss": 0.7101,
335
+ "step": 24500
336
+ },
337
+ {
338
+ "epoch": 4.25,
339
+ "learning_rate": 4.3750000000000005e-06,
340
+ "loss": 0.7029,
341
+ "step": 25000
342
+ },
343
+ {
344
+ "epoch": 4.25,
345
+ "eval_loss": 0.8877292275428772,
346
+ "eval_runtime": 28.5804,
347
+ "eval_samples_per_second": 410.7,
348
+ "eval_steps_per_second": 51.364,
349
+ "step": 25000
350
+ },
351
+ {
352
+ "epoch": 4.33,
353
+ "learning_rate": 4.362500000000001e-06,
354
+ "loss": 0.6897,
355
+ "step": 25500
356
+ },
357
+ {
358
+ "epoch": 4.42,
359
+ "learning_rate": 4.350000000000001e-06,
360
+ "loss": 0.7004,
361
+ "step": 26000
362
+ },
363
+ {
364
+ "epoch": 4.5,
365
+ "learning_rate": 4.3375e-06,
366
+ "loss": 0.6925,
367
+ "step": 26500
368
+ },
369
+ {
370
+ "epoch": 4.59,
371
+ "learning_rate": 4.325e-06,
372
+ "loss": 0.6967,
373
+ "step": 27000
374
+ },
375
+ {
376
+ "epoch": 4.67,
377
+ "learning_rate": 4.312500000000001e-06,
378
+ "loss": 0.6872,
379
+ "step": 27500
380
+ },
381
+ {
382
+ "epoch": 4.76,
383
+ "learning_rate": 4.3e-06,
384
+ "loss": 0.6927,
385
+ "step": 28000
386
+ },
387
+ {
388
+ "epoch": 4.84,
389
+ "learning_rate": 4.287500000000001e-06,
390
+ "loss": 0.6745,
391
+ "step": 28500
392
+ },
393
+ {
394
+ "epoch": 4.93,
395
+ "learning_rate": 4.2750000000000006e-06,
396
+ "loss": 0.686,
397
+ "step": 29000
398
+ },
399
+ {
400
+ "epoch": 5.01,
401
+ "learning_rate": 4.2625e-06,
402
+ "loss": 0.6753,
403
+ "step": 29500
404
+ },
405
+ {
406
+ "epoch": 5.1,
407
+ "learning_rate": 4.25e-06,
408
+ "loss": 0.6609,
409
+ "step": 30000
410
+ },
411
+ {
412
+ "epoch": 5.1,
413
+ "eval_loss": 0.8455274105072021,
414
+ "eval_runtime": 31.1298,
415
+ "eval_samples_per_second": 377.066,
416
+ "eval_steps_per_second": 47.157,
417
+ "step": 30000
418
+ },
419
+ {
420
+ "epoch": 5.18,
421
+ "learning_rate": 4.2375000000000005e-06,
422
+ "loss": 0.668,
423
+ "step": 30500
424
+ },
425
+ {
426
+ "epoch": 5.27,
427
+ "learning_rate": 4.225e-06,
428
+ "loss": 0.6669,
429
+ "step": 31000
430
+ },
431
+ {
432
+ "epoch": 5.35,
433
+ "learning_rate": 4.212500000000001e-06,
434
+ "loss": 0.6514,
435
+ "step": 31500
436
+ },
437
+ {
438
+ "epoch": 5.44,
439
+ "learning_rate": 4.2000000000000004e-06,
440
+ "loss": 0.662,
441
+ "step": 32000
442
+ },
443
+ {
444
+ "epoch": 5.52,
445
+ "learning_rate": 4.1875e-06,
446
+ "loss": 0.6617,
447
+ "step": 32500
448
+ },
449
+ {
450
+ "epoch": 5.61,
451
+ "learning_rate": 4.175e-06,
452
+ "loss": 0.6561,
453
+ "step": 33000
454
+ },
455
+ {
456
+ "epoch": 5.69,
457
+ "learning_rate": 4.1625e-06,
458
+ "loss": 0.6551,
459
+ "step": 33500
460
+ },
461
+ {
462
+ "epoch": 5.78,
463
+ "learning_rate": 4.15e-06,
464
+ "loss": 0.6429,
465
+ "step": 34000
466
+ },
467
+ {
468
+ "epoch": 5.86,
469
+ "learning_rate": 4.137500000000001e-06,
470
+ "loss": 0.6332,
471
+ "step": 34500
472
+ },
473
+ {
474
+ "epoch": 5.95,
475
+ "learning_rate": 4.125e-06,
476
+ "loss": 0.6479,
477
+ "step": 35000
478
+ },
479
+ {
480
+ "epoch": 5.95,
481
+ "eval_loss": 0.81136155128479,
482
+ "eval_runtime": 28.2662,
483
+ "eval_samples_per_second": 415.267,
484
+ "eval_steps_per_second": 51.935,
485
+ "step": 35000
486
+ },
487
+ {
488
+ "epoch": 6.03,
489
+ "learning_rate": 4.1125e-06,
490
+ "loss": 0.6444,
491
+ "step": 35500
492
+ },
493
+ {
494
+ "epoch": 6.12,
495
+ "learning_rate": 4.1e-06,
496
+ "loss": 0.6345,
497
+ "step": 36000
498
+ },
499
+ {
500
+ "epoch": 6.2,
501
+ "learning_rate": 4.0875e-06,
502
+ "loss": 0.6274,
503
+ "step": 36500
504
+ },
505
+ {
506
+ "epoch": 6.29,
507
+ "learning_rate": 4.075e-06,
508
+ "loss": 0.6288,
509
+ "step": 37000
510
+ },
511
+ {
512
+ "epoch": 6.37,
513
+ "learning_rate": 4.0625000000000005e-06,
514
+ "loss": 0.6392,
515
+ "step": 37500
516
+ },
517
+ {
518
+ "epoch": 6.46,
519
+ "learning_rate": 4.05e-06,
520
+ "loss": 0.6252,
521
+ "step": 38000
522
+ },
523
+ {
524
+ "epoch": 6.54,
525
+ "learning_rate": 4.037500000000001e-06,
526
+ "loss": 0.6223,
527
+ "step": 38500
528
+ },
529
+ {
530
+ "epoch": 6.63,
531
+ "learning_rate": 4.0250000000000004e-06,
532
+ "loss": 0.6155,
533
+ "step": 39000
534
+ },
535
+ {
536
+ "epoch": 6.71,
537
+ "learning_rate": 4.0125e-06,
538
+ "loss": 0.6287,
539
+ "step": 39500
540
+ },
541
+ {
542
+ "epoch": 6.8,
543
+ "learning_rate": 4.000000000000001e-06,
544
+ "loss": 0.624,
545
+ "step": 40000
546
+ },
547
+ {
548
+ "epoch": 6.8,
549
+ "eval_loss": 0.7838146090507507,
550
+ "eval_runtime": 30.7199,
551
+ "eval_samples_per_second": 382.097,
552
+ "eval_steps_per_second": 47.787,
553
+ "step": 40000
554
+ },
555
+ {
556
+ "epoch": 6.88,
557
+ "learning_rate": 3.9875e-06,
558
+ "loss": 0.612,
559
+ "step": 40500
560
+ },
561
+ {
562
+ "epoch": 6.97,
563
+ "learning_rate": 3.975000000000001e-06,
564
+ "loss": 0.6172,
565
+ "step": 41000
566
+ },
567
+ {
568
+ "epoch": 7.05,
569
+ "learning_rate": 3.962500000000001e-06,
570
+ "loss": 0.6094,
571
+ "step": 41500
572
+ },
573
+ {
574
+ "epoch": 7.14,
575
+ "learning_rate": 3.95e-06,
576
+ "loss": 0.603,
577
+ "step": 42000
578
+ },
579
+ {
580
+ "epoch": 7.22,
581
+ "learning_rate": 3.9375e-06,
582
+ "loss": 0.6002,
583
+ "step": 42500
584
+ },
585
+ {
586
+ "epoch": 7.31,
587
+ "learning_rate": 3.9250000000000005e-06,
588
+ "loss": 0.6095,
589
+ "step": 43000
590
+ },
591
+ {
592
+ "epoch": 7.39,
593
+ "learning_rate": 3.9125e-06,
594
+ "loss": 0.5925,
595
+ "step": 43500
596
+ },
597
+ {
598
+ "epoch": 7.48,
599
+ "learning_rate": 3.900000000000001e-06,
600
+ "loss": 0.593,
601
+ "step": 44000
602
+ },
603
+ {
604
+ "epoch": 7.56,
605
+ "learning_rate": 3.8875000000000005e-06,
606
+ "loss": 0.5963,
607
+ "step": 44500
608
+ },
609
+ {
610
+ "epoch": 7.65,
611
+ "learning_rate": 3.875e-06,
612
+ "loss": 0.6045,
613
+ "step": 45000
614
+ },
615
+ {
616
+ "epoch": 7.65,
617
+ "eval_loss": 0.7607721090316772,
618
+ "eval_runtime": 30.9239,
619
+ "eval_samples_per_second": 379.577,
620
+ "eval_steps_per_second": 47.471,
621
+ "step": 45000
622
+ },
623
+ {
624
+ "epoch": 7.73,
625
+ "learning_rate": 3.8625e-06,
626
+ "loss": 0.5967,
627
+ "step": 45500
628
+ },
629
+ {
630
+ "epoch": 7.82,
631
+ "learning_rate": 3.85e-06,
632
+ "loss": 0.5958,
633
+ "step": 46000
634
+ },
635
+ {
636
+ "epoch": 7.9,
637
+ "learning_rate": 3.8375e-06,
638
+ "loss": 0.5856,
639
+ "step": 46500
640
+ },
641
+ {
642
+ "epoch": 7.99,
643
+ "learning_rate": 3.825000000000001e-06,
644
+ "loss": 0.5957,
645
+ "step": 47000
646
+ },
647
+ {
648
+ "epoch": 8.07,
649
+ "learning_rate": 3.8125e-06,
650
+ "loss": 0.5742,
651
+ "step": 47500
652
+ },
653
+ {
654
+ "epoch": 8.16,
655
+ "learning_rate": 3.8000000000000005e-06,
656
+ "loss": 0.5745,
657
+ "step": 48000
658
+ },
659
+ {
660
+ "epoch": 8.24,
661
+ "learning_rate": 3.7875e-06,
662
+ "loss": 0.5847,
663
+ "step": 48500
664
+ },
665
+ {
666
+ "epoch": 8.33,
667
+ "learning_rate": 3.7750000000000003e-06,
668
+ "loss": 0.5826,
669
+ "step": 49000
670
+ },
671
+ {
672
+ "epoch": 8.41,
673
+ "learning_rate": 3.7625e-06,
674
+ "loss": 0.5765,
675
+ "step": 49500
676
+ },
677
+ {
678
+ "epoch": 8.5,
679
+ "learning_rate": 3.7500000000000005e-06,
680
+ "loss": 0.571,
681
+ "step": 50000
682
+ },
683
+ {
684
+ "epoch": 8.5,
685
+ "eval_loss": 0.7427138686180115,
686
+ "eval_runtime": 32.0182,
687
+ "eval_samples_per_second": 366.604,
688
+ "eval_steps_per_second": 45.849,
689
+ "step": 50000
690
+ },
691
+ {
692
+ "epoch": 8.58,
693
+ "learning_rate": 3.7375000000000006e-06,
694
+ "loss": 0.5777,
695
+ "step": 50500
696
+ },
697
+ {
698
+ "epoch": 8.67,
699
+ "learning_rate": 3.7250000000000003e-06,
700
+ "loss": 0.573,
701
+ "step": 51000
702
+ },
703
+ {
704
+ "epoch": 8.75,
705
+ "learning_rate": 3.7125000000000005e-06,
706
+ "loss": 0.5813,
707
+ "step": 51500
708
+ },
709
+ {
710
+ "epoch": 8.84,
711
+ "learning_rate": 3.7e-06,
712
+ "loss": 0.5715,
713
+ "step": 52000
714
+ },
715
+ {
716
+ "epoch": 8.92,
717
+ "learning_rate": 3.6875000000000007e-06,
718
+ "loss": 0.5636,
719
+ "step": 52500
720
+ },
721
+ {
722
+ "epoch": 9.01,
723
+ "learning_rate": 3.6750000000000004e-06,
724
+ "loss": 0.5749,
725
+ "step": 53000
726
+ },
727
+ {
728
+ "epoch": 9.09,
729
+ "learning_rate": 3.6625000000000005e-06,
730
+ "loss": 0.573,
731
+ "step": 53500
732
+ },
733
+ {
734
+ "epoch": 9.18,
735
+ "learning_rate": 3.65e-06,
736
+ "loss": 0.5606,
737
+ "step": 54000
738
+ },
739
+ {
740
+ "epoch": 9.26,
741
+ "learning_rate": 3.6375000000000003e-06,
742
+ "loss": 0.5553,
743
+ "step": 54500
744
+ },
745
+ {
746
+ "epoch": 9.35,
747
+ "learning_rate": 3.625e-06,
748
+ "loss": 0.5637,
749
+ "step": 55000
750
+ },
751
+ {
752
+ "epoch": 9.35,
753
+ "eval_loss": 0.7249044179916382,
754
+ "eval_runtime": 32.2708,
755
+ "eval_samples_per_second": 363.734,
756
+ "eval_steps_per_second": 45.49,
757
+ "step": 55000
758
+ },
759
+ {
760
+ "epoch": 9.43,
761
+ "learning_rate": 3.6125000000000006e-06,
762
+ "loss": 0.5553,
763
+ "step": 55500
764
+ },
765
+ {
766
+ "epoch": 9.52,
767
+ "learning_rate": 3.6000000000000003e-06,
768
+ "loss": 0.5648,
769
+ "step": 56000
770
+ },
771
+ {
772
+ "epoch": 9.6,
773
+ "learning_rate": 3.5875000000000004e-06,
774
+ "loss": 0.5512,
775
+ "step": 56500
776
+ },
777
+ {
778
+ "epoch": 9.69,
779
+ "learning_rate": 3.575e-06,
780
+ "loss": 0.5534,
781
+ "step": 57000
782
+ },
783
+ {
784
+ "epoch": 9.77,
785
+ "learning_rate": 3.5625e-06,
786
+ "loss": 0.5544,
787
+ "step": 57500
788
+ },
789
+ {
790
+ "epoch": 9.86,
791
+ "learning_rate": 3.5500000000000003e-06,
792
+ "loss": 0.5508,
793
+ "step": 58000
794
+ },
795
+ {
796
+ "epoch": 9.94,
797
+ "learning_rate": 3.5375000000000004e-06,
798
+ "loss": 0.5458,
799
+ "step": 58500
800
+ },
801
+ {
802
+ "epoch": 10.03,
803
+ "learning_rate": 3.525e-06,
804
+ "loss": 0.557,
805
+ "step": 59000
806
+ },
807
+ {
808
+ "epoch": 10.11,
809
+ "learning_rate": 3.5125000000000003e-06,
810
+ "loss": 0.5406,
811
+ "step": 59500
812
+ },
813
+ {
814
+ "epoch": 10.2,
815
+ "learning_rate": 3.5e-06,
816
+ "loss": 0.5488,
817
+ "step": 60000
818
+ },
819
+ {
820
+ "epoch": 10.2,
821
+ "eval_loss": 0.7100504636764526,
822
+ "eval_runtime": 32.186,
823
+ "eval_samples_per_second": 364.692,
824
+ "eval_steps_per_second": 45.61,
825
+ "step": 60000
826
+ },
827
+ {
828
+ "epoch": 10.28,
829
+ "learning_rate": 3.4875000000000005e-06,
830
+ "loss": 0.5462,
831
+ "step": 60500
832
+ },
833
+ {
834
+ "epoch": 10.37,
835
+ "learning_rate": 3.475e-06,
836
+ "loss": 0.5364,
837
+ "step": 61000
838
+ },
839
+ {
840
+ "epoch": 10.45,
841
+ "learning_rate": 3.4625000000000003e-06,
842
+ "loss": 0.5452,
843
+ "step": 61500
844
+ },
845
+ {
846
+ "epoch": 10.54,
847
+ "learning_rate": 3.45e-06,
848
+ "loss": 0.5449,
849
+ "step": 62000
850
+ },
851
+ {
852
+ "epoch": 10.62,
853
+ "learning_rate": 3.4375e-06,
854
+ "loss": 0.5353,
855
+ "step": 62500
856
+ },
857
+ {
858
+ "epoch": 10.71,
859
+ "learning_rate": 3.4250000000000007e-06,
860
+ "loss": 0.5359,
861
+ "step": 63000
862
+ },
863
+ {
864
+ "epoch": 10.79,
865
+ "learning_rate": 3.4125000000000004e-06,
866
+ "loss": 0.5356,
867
+ "step": 63500
868
+ },
869
+ {
870
+ "epoch": 10.88,
871
+ "learning_rate": 3.4000000000000005e-06,
872
+ "loss": 0.5385,
873
+ "step": 64000
874
+ },
875
+ {
876
+ "epoch": 10.96,
877
+ "learning_rate": 3.3875e-06,
878
+ "loss": 0.53,
879
+ "step": 64500
880
+ },
881
+ {
882
+ "epoch": 11.05,
883
+ "learning_rate": 3.3750000000000003e-06,
884
+ "loss": 0.525,
885
+ "step": 65000
886
+ },
887
+ {
888
+ "epoch": 11.05,
889
+ "eval_loss": 0.6971380114555359,
890
+ "eval_runtime": 29.293,
891
+ "eval_samples_per_second": 400.71,
892
+ "eval_steps_per_second": 50.114,
893
+ "step": 65000
894
+ },
895
+ {
896
+ "epoch": 11.13,
897
+ "learning_rate": 3.3625000000000004e-06,
898
+ "loss": 0.519,
899
+ "step": 65500
900
+ },
901
+ {
902
+ "epoch": 11.22,
903
+ "learning_rate": 3.3500000000000005e-06,
904
+ "loss": 0.5309,
905
+ "step": 66000
906
+ },
907
+ {
908
+ "epoch": 11.3,
909
+ "learning_rate": 3.3375000000000002e-06,
910
+ "loss": 0.5314,
911
+ "step": 66500
912
+ },
913
+ {
914
+ "epoch": 11.39,
915
+ "learning_rate": 3.3250000000000004e-06,
916
+ "loss": 0.5255,
917
+ "step": 67000
918
+ },
919
+ {
920
+ "epoch": 11.47,
921
+ "learning_rate": 3.3125e-06,
922
+ "loss": 0.5297,
923
+ "step": 67500
924
+ },
925
+ {
926
+ "epoch": 11.56,
927
+ "learning_rate": 3.3000000000000006e-06,
928
+ "loss": 0.5238,
929
+ "step": 68000
930
+ },
931
+ {
932
+ "epoch": 11.64,
933
+ "learning_rate": 3.2875000000000003e-06,
934
+ "loss": 0.5162,
935
+ "step": 68500
936
+ },
937
+ {
938
+ "epoch": 11.73,
939
+ "learning_rate": 3.2750000000000004e-06,
940
+ "loss": 0.5187,
941
+ "step": 69000
942
+ },
943
+ {
944
+ "epoch": 11.81,
945
+ "learning_rate": 3.2625e-06,
946
+ "loss": 0.5249,
947
+ "step": 69500
948
+ },
949
+ {
950
+ "epoch": 11.9,
951
+ "learning_rate": 3.2500000000000002e-06,
952
+ "loss": 0.5283,
953
+ "step": 70000
954
+ },
955
+ {
956
+ "epoch": 11.9,
957
+ "eval_loss": 0.685904324054718,
958
+ "eval_runtime": 29.2682,
959
+ "eval_samples_per_second": 401.05,
960
+ "eval_steps_per_second": 50.157,
961
+ "step": 70000
962
+ },
963
+ {
964
+ "epoch": 11.98,
965
+ "learning_rate": 3.2375e-06,
966
+ "loss": 0.5261,
967
+ "step": 70500
968
+ },
969
+ {
970
+ "epoch": 12.07,
971
+ "learning_rate": 3.2250000000000005e-06,
972
+ "loss": 0.5099,
973
+ "step": 71000
974
+ },
975
+ {
976
+ "epoch": 12.15,
977
+ "learning_rate": 3.2125e-06,
978
+ "loss": 0.5187,
979
+ "step": 71500
980
+ },
981
+ {
982
+ "epoch": 12.24,
983
+ "learning_rate": 3.2000000000000003e-06,
984
+ "loss": 0.5144,
985
+ "step": 72000
986
+ },
987
+ {
988
+ "epoch": 12.32,
989
+ "learning_rate": 3.1875e-06,
990
+ "loss": 0.515,
991
+ "step": 72500
992
+ },
993
+ {
994
+ "epoch": 12.41,
995
+ "learning_rate": 3.175e-06,
996
+ "loss": 0.5158,
997
+ "step": 73000
998
+ },
999
+ {
1000
+ "epoch": 12.49,
1001
+ "learning_rate": 3.1625000000000002e-06,
1002
+ "loss": 0.5078,
1003
+ "step": 73500
1004
+ },
1005
+ {
1006
+ "epoch": 12.58,
1007
+ "learning_rate": 3.1500000000000003e-06,
1008
+ "loss": 0.5068,
1009
+ "step": 74000
1010
+ },
1011
+ {
1012
+ "epoch": 12.66,
1013
+ "learning_rate": 3.1375e-06,
1014
+ "loss": 0.5114,
1015
+ "step": 74500
1016
+ },
1017
+ {
1018
+ "epoch": 12.75,
1019
+ "learning_rate": 3.125e-06,
1020
+ "loss": 0.522,
1021
+ "step": 75000
1022
+ },
1023
+ {
1024
+ "epoch": 12.75,
1025
+ "eval_loss": 0.6754906177520752,
1026
+ "eval_runtime": 29.2581,
1027
+ "eval_samples_per_second": 401.188,
1028
+ "eval_steps_per_second": 50.174,
1029
+ "step": 75000
1030
+ },
1031
+ {
1032
+ "epoch": 12.83,
1033
+ "learning_rate": 3.1125000000000007e-06,
1034
+ "loss": 0.5094,
1035
+ "step": 75500
1036
+ },
1037
+ {
1038
+ "epoch": 12.92,
1039
+ "learning_rate": 3.1000000000000004e-06,
1040
+ "loss": 0.5108,
1041
+ "step": 76000
1042
+ },
1043
+ {
1044
+ "epoch": 13.0,
1045
+ "learning_rate": 3.0875000000000005e-06,
1046
+ "loss": 0.5065,
1047
+ "step": 76500
1048
+ },
1049
+ {
1050
+ "epoch": 13.09,
1051
+ "learning_rate": 3.075e-06,
1052
+ "loss": 0.5033,
1053
+ "step": 77000
1054
+ },
1055
+ {
1056
+ "epoch": 13.17,
1057
+ "learning_rate": 3.0625000000000003e-06,
1058
+ "loss": 0.5032,
1059
+ "step": 77500
1060
+ },
1061
+ {
1062
+ "epoch": 13.26,
1063
+ "learning_rate": 3.05e-06,
1064
+ "loss": 0.5011,
1065
+ "step": 78000
1066
+ },
1067
+ {
1068
+ "epoch": 13.34,
1069
+ "learning_rate": 3.0375000000000006e-06,
1070
+ "loss": 0.5007,
1071
+ "step": 78500
1072
+ },
1073
+ {
1074
+ "epoch": 13.43,
1075
+ "learning_rate": 3.0250000000000003e-06,
1076
+ "loss": 0.4989,
1077
+ "step": 79000
1078
+ },
1079
+ {
1080
+ "epoch": 13.51,
1081
+ "learning_rate": 3.0125000000000004e-06,
1082
+ "loss": 0.4992,
1083
+ "step": 79500
1084
+ },
1085
+ {
1086
+ "epoch": 13.6,
1087
+ "learning_rate": 3e-06,
1088
+ "loss": 0.4996,
1089
+ "step": 80000
1090
+ },
1091
+ {
1092
+ "epoch": 13.6,
1093
+ "eval_loss": 0.6659587025642395,
1094
+ "eval_runtime": 29.4004,
1095
+ "eval_samples_per_second": 399.247,
1096
+ "eval_steps_per_second": 49.931,
1097
+ "step": 80000
1098
+ },
1099
+ {
1100
+ "epoch": 13.68,
1101
+ "learning_rate": 2.9875e-06,
1102
+ "loss": 0.5022,
1103
+ "step": 80500
1104
+ },
1105
+ {
1106
+ "epoch": 13.77,
1107
+ "learning_rate": 2.9750000000000003e-06,
1108
+ "loss": 0.4983,
1109
+ "step": 81000
1110
+ },
1111
+ {
1112
+ "epoch": 13.85,
1113
+ "learning_rate": 2.9625000000000004e-06,
1114
+ "loss": 0.502,
1115
+ "step": 81500
1116
+ },
1117
+ {
1118
+ "epoch": 13.94,
1119
+ "learning_rate": 2.95e-06,
1120
+ "loss": 0.495,
1121
+ "step": 82000
1122
+ },
1123
+ {
1124
+ "epoch": 14.02,
1125
+ "learning_rate": 2.9375000000000003e-06,
1126
+ "loss": 0.4967,
1127
+ "step": 82500
1128
+ },
1129
+ {
1130
+ "epoch": 14.11,
1131
+ "learning_rate": 2.925e-06,
1132
+ "loss": 0.4947,
1133
+ "step": 83000
1134
+ },
1135
+ {
1136
+ "epoch": 14.19,
1137
+ "learning_rate": 2.9125000000000005e-06,
1138
+ "loss": 0.4841,
1139
+ "step": 83500
1140
+ },
1141
+ {
1142
+ "epoch": 14.28,
1143
+ "learning_rate": 2.9e-06,
1144
+ "loss": 0.4922,
1145
+ "step": 84000
1146
+ },
1147
+ {
1148
+ "epoch": 14.36,
1149
+ "learning_rate": 2.8875000000000003e-06,
1150
+ "loss": 0.4925,
1151
+ "step": 84500
1152
+ },
1153
+ {
1154
+ "epoch": 14.45,
1155
+ "learning_rate": 2.875e-06,
1156
+ "loss": 0.4868,
1157
+ "step": 85000
1158
+ },
1159
+ {
1160
+ "epoch": 14.45,
1161
+ "eval_loss": 0.6585991978645325,
1162
+ "eval_runtime": 29.5661,
1163
+ "eval_samples_per_second": 397.009,
1164
+ "eval_steps_per_second": 49.652,
1165
+ "step": 85000
1166
+ },
1167
+ {
1168
+ "epoch": 14.53,
1169
+ "learning_rate": 2.8625e-06,
1170
+ "loss": 0.4943,
1171
+ "step": 85500
1172
+ },
1173
+ {
1174
+ "epoch": 14.62,
1175
+ "learning_rate": 2.85e-06,
1176
+ "loss": 0.4887,
1177
+ "step": 86000
1178
+ },
1179
+ {
1180
+ "epoch": 14.7,
1181
+ "learning_rate": 2.8375000000000004e-06,
1182
+ "loss": 0.4832,
1183
+ "step": 86500
1184
+ },
1185
+ {
1186
+ "epoch": 14.79,
1187
+ "learning_rate": 2.825e-06,
1188
+ "loss": 0.4922,
1189
+ "step": 87000
1190
+ },
1191
+ {
1192
+ "epoch": 14.87,
1193
+ "learning_rate": 2.8125e-06,
1194
+ "loss": 0.483,
1195
+ "step": 87500
1196
+ },
1197
+ {
1198
+ "epoch": 14.96,
1199
+ "learning_rate": 2.8000000000000003e-06,
1200
+ "loss": 0.4924,
1201
+ "step": 88000
1202
+ },
1203
+ {
1204
+ "epoch": 15.04,
1205
+ "learning_rate": 2.7875000000000004e-06,
1206
+ "loss": 0.4836,
1207
+ "step": 88500
1208
+ },
1209
+ {
1210
+ "epoch": 15.13,
1211
+ "learning_rate": 2.7750000000000005e-06,
1212
+ "loss": 0.4736,
1213
+ "step": 89000
1214
+ },
1215
+ {
1216
+ "epoch": 15.21,
1217
+ "learning_rate": 2.7625000000000002e-06,
1218
+ "loss": 0.4799,
1219
+ "step": 89500
1220
+ },
1221
+ {
1222
+ "epoch": 15.3,
1223
+ "learning_rate": 2.7500000000000004e-06,
1224
+ "loss": 0.4773,
1225
+ "step": 90000
1226
+ },
1227
+ {
1228
+ "epoch": 15.3,
1229
+ "eval_loss": 0.6510518789291382,
1230
+ "eval_runtime": 28.0473,
1231
+ "eval_samples_per_second": 418.507,
1232
+ "eval_steps_per_second": 52.34,
1233
+ "step": 90000
1234
+ },
1235
+ {
1236
+ "epoch": 15.38,
1237
+ "learning_rate": 2.7375e-06,
1238
+ "loss": 0.4791,
1239
+ "step": 90500
1240
+ },
1241
+ {
1242
+ "epoch": 15.47,
1243
+ "learning_rate": 2.7250000000000006e-06,
1244
+ "loss": 0.4788,
1245
+ "step": 91000
1246
+ },
1247
+ {
1248
+ "epoch": 15.55,
1249
+ "learning_rate": 2.7125000000000003e-06,
1250
+ "loss": 0.4759,
1251
+ "step": 91500
1252
+ },
1253
+ {
1254
+ "epoch": 15.64,
1255
+ "learning_rate": 2.7000000000000004e-06,
1256
+ "loss": 0.4784,
1257
+ "step": 92000
1258
+ },
1259
+ {
1260
+ "epoch": 15.72,
1261
+ "learning_rate": 2.6875e-06,
1262
+ "loss": 0.4762,
1263
+ "step": 92500
1264
+ },
1265
+ {
1266
+ "epoch": 15.81,
1267
+ "learning_rate": 2.6750000000000002e-06,
1268
+ "loss": 0.4827,
1269
+ "step": 93000
1270
+ },
1271
+ {
1272
+ "epoch": 15.89,
1273
+ "learning_rate": 2.6625e-06,
1274
+ "loss": 0.4844,
1275
+ "step": 93500
1276
+ },
1277
+ {
1278
+ "epoch": 15.98,
1279
+ "learning_rate": 2.6500000000000005e-06,
1280
+ "loss": 0.4787,
1281
+ "step": 94000
1282
+ },
1283
+ {
1284
+ "epoch": 16.06,
1285
+ "learning_rate": 2.6375e-06,
1286
+ "loss": 0.4759,
1287
+ "step": 94500
1288
+ },
1289
+ {
1290
+ "epoch": 16.15,
1291
+ "learning_rate": 2.6250000000000003e-06,
1292
+ "loss": 0.4724,
1293
+ "step": 95000
1294
+ },
1295
+ {
1296
+ "epoch": 16.15,
1297
+ "eval_loss": 0.6447970271110535,
1298
+ "eval_runtime": 27.4195,
1299
+ "eval_samples_per_second": 428.09,
1300
+ "eval_steps_per_second": 53.539,
1301
+ "step": 95000
1302
+ },
1303
+ {
1304
+ "epoch": 16.23,
1305
+ "learning_rate": 2.6125e-06,
1306
+ "loss": 0.4748,
1307
+ "step": 95500
1308
+ },
1309
+ {
1310
+ "epoch": 16.32,
1311
+ "learning_rate": 2.6e-06,
1312
+ "loss": 0.4711,
1313
+ "step": 96000
1314
+ },
1315
+ {
1316
+ "epoch": 16.4,
1317
+ "learning_rate": 2.5875000000000002e-06,
1318
+ "loss": 0.4744,
1319
+ "step": 96500
1320
+ },
1321
+ {
1322
+ "epoch": 16.49,
1323
+ "learning_rate": 2.5750000000000003e-06,
1324
+ "loss": 0.4751,
1325
+ "step": 97000
1326
+ },
1327
+ {
1328
+ "epoch": 16.57,
1329
+ "learning_rate": 2.5625e-06,
1330
+ "loss": 0.4716,
1331
+ "step": 97500
1332
+ },
1333
+ {
1334
+ "epoch": 16.66,
1335
+ "learning_rate": 2.55e-06,
1336
+ "loss": 0.4646,
1337
+ "step": 98000
1338
+ },
1339
+ {
1340
+ "epoch": 16.74,
1341
+ "learning_rate": 2.5375e-06,
1342
+ "loss": 0.4629,
1343
+ "step": 98500
1344
+ },
1345
+ {
1346
+ "epoch": 16.83,
1347
+ "learning_rate": 2.5250000000000004e-06,
1348
+ "loss": 0.4711,
1349
+ "step": 99000
1350
+ },
1351
+ {
1352
+ "epoch": 16.91,
1353
+ "learning_rate": 2.5125e-06,
1354
+ "loss": 0.4708,
1355
+ "step": 99500
1356
+ },
1357
+ {
1358
+ "epoch": 17.0,
1359
+ "learning_rate": 2.5e-06,
1360
+ "loss": 0.4682,
1361
+ "step": 100000
1362
+ },
1363
+ {
1364
+ "epoch": 17.0,
1365
+ "eval_loss": 0.6382132768630981,
1366
+ "eval_runtime": 27.4181,
1367
+ "eval_samples_per_second": 428.111,
1368
+ "eval_steps_per_second": 53.541,
1369
+ "step": 100000
1370
+ },
1371
+ {
1372
+ "epoch": 17.08,
1373
+ "learning_rate": 2.4875000000000003e-06,
1374
+ "loss": 0.4575,
1375
+ "step": 100500
1376
+ },
1377
+ {
1378
+ "epoch": 17.17,
1379
+ "learning_rate": 2.475e-06,
1380
+ "loss": 0.4609,
1381
+ "step": 101000
1382
+ },
1383
+ {
1384
+ "epoch": 17.25,
1385
+ "learning_rate": 2.4625e-06,
1386
+ "loss": 0.4673,
1387
+ "step": 101500
1388
+ },
1389
+ {
1390
+ "epoch": 17.34,
1391
+ "learning_rate": 2.4500000000000003e-06,
1392
+ "loss": 0.4653,
1393
+ "step": 102000
1394
+ },
1395
+ {
1396
+ "epoch": 17.42,
1397
+ "learning_rate": 2.4375e-06,
1398
+ "loss": 0.4595,
1399
+ "step": 102500
1400
+ },
1401
+ {
1402
+ "epoch": 17.51,
1403
+ "learning_rate": 2.425e-06,
1404
+ "loss": 0.4578,
1405
+ "step": 103000
1406
+ },
1407
+ {
1408
+ "epoch": 17.59,
1409
+ "learning_rate": 2.4125e-06,
1410
+ "loss": 0.4682,
1411
+ "step": 103500
1412
+ },
1413
+ {
1414
+ "epoch": 17.68,
1415
+ "learning_rate": 2.4000000000000003e-06,
1416
+ "loss": 0.4601,
1417
+ "step": 104000
1418
+ },
1419
+ {
1420
+ "epoch": 17.76,
1421
+ "learning_rate": 2.3875e-06,
1422
+ "loss": 0.4585,
1423
+ "step": 104500
1424
+ },
1425
+ {
1426
+ "epoch": 17.85,
1427
+ "learning_rate": 2.375e-06,
1428
+ "loss": 0.4648,
1429
+ "step": 105000
1430
+ },
1431
+ {
1432
+ "epoch": 17.85,
1433
+ "eval_loss": 0.6338370442390442,
1434
+ "eval_runtime": 27.5324,
1435
+ "eval_samples_per_second": 426.334,
1436
+ "eval_steps_per_second": 53.319,
1437
+ "step": 105000
1438
+ },
1439
+ {
1440
+ "epoch": 17.93,
1441
+ "learning_rate": 2.3625000000000003e-06,
1442
+ "loss": 0.4642,
1443
+ "step": 105500
1444
+ },
1445
+ {
1446
+ "epoch": 18.02,
1447
+ "learning_rate": 2.35e-06,
1448
+ "loss": 0.4648,
1449
+ "step": 106000
1450
+ },
1451
+ {
1452
+ "epoch": 18.1,
1453
+ "learning_rate": 2.3375000000000005e-06,
1454
+ "loss": 0.4599,
1455
+ "step": 106500
1456
+ },
1457
+ {
1458
+ "epoch": 18.19,
1459
+ "learning_rate": 2.325e-06,
1460
+ "loss": 0.4481,
1461
+ "step": 107000
1462
+ },
1463
+ {
1464
+ "epoch": 18.27,
1465
+ "learning_rate": 2.3125000000000003e-06,
1466
+ "loss": 0.4601,
1467
+ "step": 107500
1468
+ },
1469
+ {
1470
+ "epoch": 18.36,
1471
+ "learning_rate": 2.3000000000000004e-06,
1472
+ "loss": 0.4582,
1473
+ "step": 108000
1474
+ },
1475
+ {
1476
+ "epoch": 18.44,
1477
+ "learning_rate": 2.2875e-06,
1478
+ "loss": 0.4589,
1479
+ "step": 108500
1480
+ },
1481
+ {
1482
+ "epoch": 18.53,
1483
+ "learning_rate": 2.2750000000000002e-06,
1484
+ "loss": 0.4505,
1485
+ "step": 109000
1486
+ },
1487
+ {
1488
+ "epoch": 18.61,
1489
+ "learning_rate": 2.2625000000000004e-06,
1490
+ "loss": 0.4584,
1491
+ "step": 109500
1492
+ },
1493
+ {
1494
+ "epoch": 18.7,
1495
+ "learning_rate": 2.25e-06,
1496
+ "loss": 0.4551,
1497
+ "step": 110000
1498
+ },
1499
+ {
1500
+ "epoch": 18.7,
1501
+ "eval_loss": 0.6278859972953796,
1502
+ "eval_runtime": 28.5577,
1503
+ "eval_samples_per_second": 411.027,
1504
+ "eval_steps_per_second": 51.405,
1505
+ "step": 110000
1506
+ },
1507
+ {
1508
+ "epoch": 18.78,
1509
+ "learning_rate": 2.2375e-06,
1510
+ "loss": 0.4512,
1511
+ "step": 110500
1512
+ },
1513
+ {
1514
+ "epoch": 18.87,
1515
+ "learning_rate": 2.2250000000000003e-06,
1516
+ "loss": 0.4549,
1517
+ "step": 111000
1518
+ },
1519
+ {
1520
+ "epoch": 18.95,
1521
+ "learning_rate": 2.2125e-06,
1522
+ "loss": 0.4607,
1523
+ "step": 111500
1524
+ },
1525
+ {
1526
+ "epoch": 19.04,
1527
+ "learning_rate": 2.2e-06,
1528
+ "loss": 0.4493,
1529
+ "step": 112000
1530
+ },
1531
+ {
1532
+ "epoch": 19.12,
1533
+ "learning_rate": 2.1875000000000002e-06,
1534
+ "loss": 0.4481,
1535
+ "step": 112500
1536
+ },
1537
+ {
1538
+ "epoch": 19.21,
1539
+ "learning_rate": 2.1750000000000004e-06,
1540
+ "loss": 0.4475,
1541
+ "step": 113000
1542
+ },
1543
+ {
1544
+ "epoch": 19.29,
1545
+ "learning_rate": 2.1625e-06,
1546
+ "loss": 0.4487,
1547
+ "step": 113500
1548
+ },
1549
+ {
1550
+ "epoch": 19.38,
1551
+ "learning_rate": 2.15e-06,
1552
+ "loss": 0.4471,
1553
+ "step": 114000
1554
+ },
1555
+ {
1556
+ "epoch": 19.46,
1557
+ "learning_rate": 2.1375000000000003e-06,
1558
+ "loss": 0.4501,
1559
+ "step": 114500
1560
+ },
1561
+ {
1562
+ "epoch": 19.55,
1563
+ "learning_rate": 2.125e-06,
1564
+ "loss": 0.4412,
1565
+ "step": 115000
1566
+ },
1567
+ {
1568
+ "epoch": 19.55,
1569
+ "eval_loss": 0.6246311068534851,
1570
+ "eval_runtime": 32.3732,
1571
+ "eval_samples_per_second": 362.584,
1572
+ "eval_steps_per_second": 45.346,
1573
+ "step": 115000
1574
+ },
1575
+ {
1576
+ "epoch": 19.63,
1577
+ "learning_rate": 2.1125e-06,
1578
+ "loss": 0.4557,
1579
+ "step": 115500
1580
+ },
1581
+ {
1582
+ "epoch": 19.72,
1583
+ "learning_rate": 2.1000000000000002e-06,
1584
+ "loss": 0.4509,
1585
+ "step": 116000
1586
+ },
1587
+ {
1588
+ "epoch": 19.8,
1589
+ "learning_rate": 2.0875e-06,
1590
+ "loss": 0.4484,
1591
+ "step": 116500
1592
+ },
1593
+ {
1594
+ "epoch": 19.89,
1595
+ "learning_rate": 2.075e-06,
1596
+ "loss": 0.4464,
1597
+ "step": 117000
1598
+ },
1599
+ {
1600
+ "epoch": 19.97,
1601
+ "learning_rate": 2.0625e-06,
1602
+ "loss": 0.4442,
1603
+ "step": 117500
1604
+ },
1605
+ {
1606
+ "epoch": 20.06,
1607
+ "learning_rate": 2.05e-06,
1608
+ "loss": 0.4479,
1609
+ "step": 118000
1610
+ },
1611
+ {
1612
+ "epoch": 20.14,
1613
+ "learning_rate": 2.0375e-06,
1614
+ "loss": 0.4376,
1615
+ "step": 118500
1616
+ },
1617
+ {
1618
+ "epoch": 20.23,
1619
+ "learning_rate": 2.025e-06,
1620
+ "loss": 0.4441,
1621
+ "step": 119000
1622
+ },
1623
+ {
1624
+ "epoch": 20.31,
1625
+ "learning_rate": 2.0125000000000002e-06,
1626
+ "loss": 0.4429,
1627
+ "step": 119500
1628
+ },
1629
+ {
1630
+ "epoch": 20.4,
1631
+ "learning_rate": 2.0000000000000003e-06,
1632
+ "loss": 0.447,
1633
+ "step": 120000
1634
+ },
1635
+ {
1636
+ "epoch": 20.4,
1637
+ "eval_loss": 0.620963990688324,
1638
+ "eval_runtime": 29.0345,
1639
+ "eval_samples_per_second": 404.278,
1640
+ "eval_steps_per_second": 50.561,
1641
+ "step": 120000
1642
+ },
1643
+ {
1644
+ "epoch": 20.48,
1645
+ "learning_rate": 1.9875000000000005e-06,
1646
+ "loss": 0.4466,
1647
+ "step": 120500
1648
+ },
1649
+ {
1650
+ "epoch": 20.57,
1651
+ "learning_rate": 1.975e-06,
1652
+ "loss": 0.4487,
1653
+ "step": 121000
1654
+ },
1655
+ {
1656
+ "epoch": 20.65,
1657
+ "learning_rate": 1.9625000000000003e-06,
1658
+ "loss": 0.4406,
1659
+ "step": 121500
1660
+ },
1661
+ {
1662
+ "epoch": 20.74,
1663
+ "learning_rate": 1.9500000000000004e-06,
1664
+ "loss": 0.4423,
1665
+ "step": 122000
1666
+ },
1667
+ {
1668
+ "epoch": 20.82,
1669
+ "learning_rate": 1.9375e-06,
1670
+ "loss": 0.4454,
1671
+ "step": 122500
1672
+ },
1673
+ {
1674
+ "epoch": 20.91,
1675
+ "learning_rate": 1.925e-06,
1676
+ "loss": 0.4396,
1677
+ "step": 123000
1678
+ },
1679
+ {
1680
+ "epoch": 20.99,
1681
+ "learning_rate": 1.9125000000000003e-06,
1682
+ "loss": 0.4387,
1683
+ "step": 123500
1684
+ },
1685
+ {
1686
+ "epoch": 21.08,
1687
+ "learning_rate": 1.9000000000000002e-06,
1688
+ "loss": 0.4339,
1689
+ "step": 124000
1690
+ },
1691
+ {
1692
+ "epoch": 21.16,
1693
+ "learning_rate": 1.8875000000000001e-06,
1694
+ "loss": 0.4407,
1695
+ "step": 124500
1696
+ },
1697
+ {
1698
+ "epoch": 21.25,
1699
+ "learning_rate": 1.8750000000000003e-06,
1700
+ "loss": 0.4431,
1701
+ "step": 125000
1702
+ },
1703
+ {
1704
+ "epoch": 21.25,
1705
+ "eval_loss": 0.6155585646629333,
1706
+ "eval_runtime": 29.6557,
1707
+ "eval_samples_per_second": 395.809,
1708
+ "eval_steps_per_second": 49.501,
1709
+ "step": 125000
1710
+ },
1711
+ {
1712
+ "epoch": 21.33,
1713
+ "learning_rate": 1.8625000000000002e-06,
1714
+ "loss": 0.4341,
1715
+ "step": 125500
1716
+ },
1717
+ {
1718
+ "epoch": 21.42,
1719
+ "learning_rate": 1.85e-06,
1720
+ "loss": 0.4358,
1721
+ "step": 126000
1722
+ },
1723
+ {
1724
+ "epoch": 21.5,
1725
+ "learning_rate": 1.8375000000000002e-06,
1726
+ "loss": 0.4443,
1727
+ "step": 126500
1728
+ },
1729
+ {
1730
+ "epoch": 21.59,
1731
+ "learning_rate": 1.825e-06,
1732
+ "loss": 0.4307,
1733
+ "step": 127000
1734
+ },
1735
+ {
1736
+ "epoch": 21.67,
1737
+ "learning_rate": 1.8125e-06,
1738
+ "loss": 0.4422,
1739
+ "step": 127500
1740
+ },
1741
+ {
1742
+ "epoch": 21.76,
1743
+ "learning_rate": 1.8000000000000001e-06,
1744
+ "loss": 0.4384,
1745
+ "step": 128000
1746
+ },
1747
+ {
1748
+ "epoch": 21.84,
1749
+ "learning_rate": 1.7875e-06,
1750
+ "loss": 0.4372,
1751
+ "step": 128500
1752
+ },
1753
+ {
1754
+ "epoch": 21.93,
1755
+ "learning_rate": 1.7750000000000002e-06,
1756
+ "loss": 0.4328,
1757
+ "step": 129000
1758
+ },
1759
+ {
1760
+ "epoch": 22.01,
1761
+ "learning_rate": 1.7625e-06,
1762
+ "loss": 0.4404,
1763
+ "step": 129500
1764
+ },
1765
+ {
1766
+ "epoch": 22.1,
1767
+ "learning_rate": 1.75e-06,
1768
+ "loss": 0.4328,
1769
+ "step": 130000
1770
+ },
1771
+ {
1772
+ "epoch": 22.1,
1773
+ "eval_loss": 0.6130816340446472,
1774
+ "eval_runtime": 29.5785,
1775
+ "eval_samples_per_second": 396.843,
1776
+ "eval_steps_per_second": 49.631,
1777
+ "step": 130000
1778
+ },
1779
+ {
1780
+ "epoch": 22.18,
1781
+ "learning_rate": 1.7375e-06,
1782
+ "loss": 0.427,
1783
+ "step": 130500
1784
+ },
1785
+ {
1786
+ "epoch": 22.27,
1787
+ "learning_rate": 1.725e-06,
1788
+ "loss": 0.4246,
1789
+ "step": 131000
1790
+ },
1791
+ {
1792
+ "epoch": 22.35,
1793
+ "learning_rate": 1.7125000000000003e-06,
1794
+ "loss": 0.4369,
1795
+ "step": 131500
1796
+ },
1797
+ {
1798
+ "epoch": 22.44,
1799
+ "learning_rate": 1.7000000000000002e-06,
1800
+ "loss": 0.4315,
1801
+ "step": 132000
1802
+ },
1803
+ {
1804
+ "epoch": 22.52,
1805
+ "learning_rate": 1.6875000000000001e-06,
1806
+ "loss": 0.4356,
1807
+ "step": 132500
1808
+ },
1809
+ {
1810
+ "epoch": 22.61,
1811
+ "learning_rate": 1.6750000000000003e-06,
1812
+ "loss": 0.4282,
1813
+ "step": 133000
1814
+ },
1815
+ {
1816
+ "epoch": 22.69,
1817
+ "learning_rate": 1.6625000000000002e-06,
1818
+ "loss": 0.4295,
1819
+ "step": 133500
1820
+ },
1821
+ {
1822
+ "epoch": 22.78,
1823
+ "learning_rate": 1.6500000000000003e-06,
1824
+ "loss": 0.4303,
1825
+ "step": 134000
1826
+ },
1827
+ {
1828
+ "epoch": 22.86,
1829
+ "learning_rate": 1.6375000000000002e-06,
1830
+ "loss": 0.4346,
1831
+ "step": 134500
1832
+ },
1833
+ {
1834
+ "epoch": 22.95,
1835
+ "learning_rate": 1.6250000000000001e-06,
1836
+ "loss": 0.4352,
1837
+ "step": 135000
1838
+ },
1839
+ {
1840
+ "epoch": 22.95,
1841
+ "eval_loss": 0.6105329394340515,
1842
+ "eval_runtime": 31.3111,
1843
+ "eval_samples_per_second": 374.883,
1844
+ "eval_steps_per_second": 46.884,
1845
+ "step": 135000
1846
+ },
1847
+ {
1848
+ "epoch": 23.03,
1849
+ "learning_rate": 1.6125000000000002e-06,
1850
+ "loss": 0.4328,
1851
+ "step": 135500
1852
+ },
1853
+ {
1854
+ "epoch": 23.12,
1855
+ "learning_rate": 1.6000000000000001e-06,
1856
+ "loss": 0.4291,
1857
+ "step": 136000
1858
+ },
1859
+ {
1860
+ "epoch": 23.2,
1861
+ "learning_rate": 1.5875e-06,
1862
+ "loss": 0.4238,
1863
+ "step": 136500
1864
+ },
1865
+ {
1866
+ "epoch": 23.29,
1867
+ "learning_rate": 1.5750000000000002e-06,
1868
+ "loss": 0.4239,
1869
+ "step": 137000
1870
+ },
1871
+ {
1872
+ "epoch": 23.37,
1873
+ "learning_rate": 1.5625e-06,
1874
+ "loss": 0.4267,
1875
+ "step": 137500
1876
+ },
1877
+ {
1878
+ "epoch": 23.46,
1879
+ "learning_rate": 1.5500000000000002e-06,
1880
+ "loss": 0.4306,
1881
+ "step": 138000
1882
+ },
1883
+ {
1884
+ "epoch": 23.54,
1885
+ "learning_rate": 1.5375e-06,
1886
+ "loss": 0.4327,
1887
+ "step": 138500
1888
+ },
1889
+ {
1890
+ "epoch": 23.63,
1891
+ "learning_rate": 1.525e-06,
1892
+ "loss": 0.429,
1893
+ "step": 139000
1894
+ },
1895
+ {
1896
+ "epoch": 23.71,
1897
+ "learning_rate": 1.5125000000000001e-06,
1898
+ "loss": 0.4295,
1899
+ "step": 139500
1900
+ },
1901
+ {
1902
+ "epoch": 23.8,
1903
+ "learning_rate": 1.5e-06,
1904
+ "loss": 0.4268,
1905
+ "step": 140000
1906
+ },
1907
+ {
1908
+ "epoch": 23.8,
1909
+ "eval_loss": 0.6070874929428101,
1910
+ "eval_runtime": 28.7528,
1911
+ "eval_samples_per_second": 408.239,
1912
+ "eval_steps_per_second": 51.056,
1913
+ "step": 140000
1914
+ },
1915
+ {
1916
+ "epoch": 23.88,
1917
+ "learning_rate": 1.4875000000000002e-06,
1918
+ "loss": 0.424,
1919
+ "step": 140500
1920
+ },
1921
+ {
1922
+ "epoch": 23.97,
1923
+ "learning_rate": 1.475e-06,
1924
+ "loss": 0.423,
1925
+ "step": 141000
1926
+ },
1927
+ {
1928
+ "epoch": 24.05,
1929
+ "learning_rate": 1.4625e-06,
1930
+ "loss": 0.4194,
1931
+ "step": 141500
1932
+ },
1933
+ {
1934
+ "epoch": 24.14,
1935
+ "learning_rate": 1.45e-06,
1936
+ "loss": 0.4246,
1937
+ "step": 142000
1938
+ },
1939
+ {
1940
+ "epoch": 24.22,
1941
+ "learning_rate": 1.4375e-06,
1942
+ "loss": 0.4268,
1943
+ "step": 142500
1944
+ },
1945
+ {
1946
+ "epoch": 24.31,
1947
+ "learning_rate": 1.425e-06,
1948
+ "loss": 0.4245,
1949
+ "step": 143000
1950
+ },
1951
+ {
1952
+ "epoch": 24.39,
1953
+ "learning_rate": 1.4125e-06,
1954
+ "loss": 0.4183,
1955
+ "step": 143500
1956
+ },
1957
+ {
1958
+ "epoch": 24.48,
1959
+ "learning_rate": 1.4000000000000001e-06,
1960
+ "loss": 0.4234,
1961
+ "step": 144000
1962
+ },
1963
+ {
1964
+ "epoch": 24.56,
1965
+ "learning_rate": 1.3875000000000003e-06,
1966
+ "loss": 0.4267,
1967
+ "step": 144500
1968
+ },
1969
+ {
1970
+ "epoch": 24.65,
1971
+ "learning_rate": 1.3750000000000002e-06,
1972
+ "loss": 0.4212,
1973
+ "step": 145000
1974
+ },
1975
+ {
1976
+ "epoch": 24.65,
1977
+ "eval_loss": 0.6056092381477356,
1978
+ "eval_runtime": 29.9328,
1979
+ "eval_samples_per_second": 392.145,
1980
+ "eval_steps_per_second": 49.043,
1981
+ "step": 145000
1982
+ },
1983
+ {
1984
+ "epoch": 24.73,
1985
+ "learning_rate": 1.3625000000000003e-06,
1986
+ "loss": 0.4223,
1987
+ "step": 145500
1988
+ },
1989
+ {
1990
+ "epoch": 24.82,
1991
+ "learning_rate": 1.3500000000000002e-06,
1992
+ "loss": 0.4325,
1993
+ "step": 146000
1994
+ },
1995
+ {
1996
+ "epoch": 24.9,
1997
+ "learning_rate": 1.3375000000000001e-06,
1998
+ "loss": 0.4199,
1999
+ "step": 146500
2000
+ },
2001
+ {
2002
+ "epoch": 24.99,
2003
+ "learning_rate": 1.3250000000000002e-06,
2004
+ "loss": 0.4301,
2005
+ "step": 147000
2006
+ },
2007
+ {
2008
+ "epoch": 25.07,
2009
+ "learning_rate": 1.3125000000000001e-06,
2010
+ "loss": 0.413,
2011
+ "step": 147500
2012
+ },
2013
+ {
2014
+ "epoch": 25.16,
2015
+ "learning_rate": 1.3e-06,
2016
+ "loss": 0.4213,
2017
+ "step": 148000
2018
+ },
2019
+ {
2020
+ "epoch": 25.24,
2021
+ "learning_rate": 1.2875000000000002e-06,
2022
+ "loss": 0.4211,
2023
+ "step": 148500
2024
+ },
2025
+ {
2026
+ "epoch": 25.33,
2027
+ "learning_rate": 1.275e-06,
2028
+ "loss": 0.4288,
2029
+ "step": 149000
2030
+ },
2031
+ {
2032
+ "epoch": 25.41,
2033
+ "learning_rate": 1.2625000000000002e-06,
2034
+ "loss": 0.4256,
2035
+ "step": 149500
2036
+ },
2037
+ {
2038
+ "epoch": 25.5,
2039
+ "learning_rate": 1.25e-06,
2040
+ "loss": 0.4261,
2041
+ "step": 150000
2042
+ },
2043
+ {
2044
+ "epoch": 25.5,
2045
+ "eval_loss": 0.6023589372634888,
2046
+ "eval_runtime": 42.7696,
2047
+ "eval_samples_per_second": 274.447,
2048
+ "eval_steps_per_second": 34.323,
2049
+ "step": 150000
2050
+ },
2051
+ {
2052
+ "epoch": 25.58,
2053
+ "learning_rate": 1.2375e-06,
2054
+ "loss": 0.4193,
2055
+ "step": 150500
2056
+ },
2057
+ {
2058
+ "epoch": 25.67,
2059
+ "learning_rate": 1.2250000000000001e-06,
2060
+ "loss": 0.4186,
2061
+ "step": 151000
2062
+ },
2063
+ {
2064
+ "epoch": 25.75,
2065
+ "learning_rate": 1.2125e-06,
2066
+ "loss": 0.4154,
2067
+ "step": 151500
2068
+ },
2069
+ {
2070
+ "epoch": 25.84,
2071
+ "learning_rate": 1.2000000000000002e-06,
2072
+ "loss": 0.4238,
2073
+ "step": 152000
2074
+ },
2075
+ {
2076
+ "epoch": 25.92,
2077
+ "learning_rate": 1.1875e-06,
2078
+ "loss": 0.4165,
2079
+ "step": 152500
2080
+ },
2081
+ {
2082
+ "epoch": 26.01,
2083
+ "learning_rate": 1.175e-06,
2084
+ "loss": 0.4165,
2085
+ "step": 153000
2086
+ },
2087
+ {
2088
+ "epoch": 26.09,
2089
+ "learning_rate": 1.1625e-06,
2090
+ "loss": 0.4169,
2091
+ "step": 153500
2092
+ },
2093
+ {
2094
+ "epoch": 26.18,
2095
+ "learning_rate": 1.1500000000000002e-06,
2096
+ "loss": 0.4116,
2097
+ "step": 154000
2098
+ },
2099
+ {
2100
+ "epoch": 26.26,
2101
+ "learning_rate": 1.1375000000000001e-06,
2102
+ "loss": 0.4138,
2103
+ "step": 154500
2104
+ },
2105
+ {
2106
+ "epoch": 26.35,
2107
+ "learning_rate": 1.125e-06,
2108
+ "loss": 0.4192,
2109
+ "step": 155000
2110
+ },
2111
+ {
2112
+ "epoch": 26.35,
2113
+ "eval_loss": 0.6006730794906616,
2114
+ "eval_runtime": 27.6694,
2115
+ "eval_samples_per_second": 424.223,
2116
+ "eval_steps_per_second": 53.055,
2117
+ "step": 155000
2118
+ },
2119
+ {
2120
+ "epoch": 26.43,
2121
+ "learning_rate": 1.1125000000000001e-06,
2122
+ "loss": 0.4216,
2123
+ "step": 155500
2124
+ },
2125
+ {
2126
+ "epoch": 26.52,
2127
+ "learning_rate": 1.1e-06,
2128
+ "loss": 0.4186,
2129
+ "step": 156000
2130
+ },
2131
+ {
2132
+ "epoch": 26.6,
2133
+ "learning_rate": 1.0875000000000002e-06,
2134
+ "loss": 0.4148,
2135
+ "step": 156500
2136
+ },
2137
+ {
2138
+ "epoch": 26.69,
2139
+ "learning_rate": 1.075e-06,
2140
+ "loss": 0.4186,
2141
+ "step": 157000
2142
+ },
2143
+ {
2144
+ "epoch": 26.77,
2145
+ "learning_rate": 1.0625e-06,
2146
+ "loss": 0.4202,
2147
+ "step": 157500
2148
+ },
2149
+ {
2150
+ "epoch": 26.86,
2151
+ "learning_rate": 1.0500000000000001e-06,
2152
+ "loss": 0.4139,
2153
+ "step": 158000
2154
+ },
2155
+ {
2156
+ "epoch": 26.94,
2157
+ "learning_rate": 1.0375e-06,
2158
+ "loss": 0.4196,
2159
+ "step": 158500
2160
+ },
2161
+ {
2162
+ "epoch": 27.03,
2163
+ "learning_rate": 1.025e-06,
2164
+ "loss": 0.4191,
2165
+ "step": 159000
2166
+ },
2167
+ {
2168
+ "epoch": 27.11,
2169
+ "learning_rate": 1.0125e-06,
2170
+ "loss": 0.4145,
2171
+ "step": 159500
2172
+ },
2173
+ {
2174
+ "epoch": 27.2,
2175
+ "learning_rate": 1.0000000000000002e-06,
2176
+ "loss": 0.4117,
2177
+ "step": 160000
2178
+ },
2179
+ {
2180
+ "epoch": 27.2,
2181
+ "eval_loss": 0.599934995174408,
2182
+ "eval_runtime": 27.3077,
2183
+ "eval_samples_per_second": 429.842,
2184
+ "eval_steps_per_second": 53.758,
2185
+ "step": 160000
2186
+ },
2187
+ {
2188
+ "epoch": 27.28,
2189
+ "learning_rate": 9.875e-07,
2190
+ "loss": 0.4202,
2191
+ "step": 160500
2192
+ },
2193
+ {
2194
+ "epoch": 27.37,
2195
+ "learning_rate": 9.750000000000002e-07,
2196
+ "loss": 0.4166,
2197
+ "step": 161000
2198
+ },
2199
+ {
2200
+ "epoch": 27.45,
2201
+ "learning_rate": 9.625e-07,
2202
+ "loss": 0.4119,
2203
+ "step": 161500
2204
+ },
2205
+ {
2206
+ "epoch": 27.54,
2207
+ "learning_rate": 9.500000000000001e-07,
2208
+ "loss": 0.4109,
2209
+ "step": 162000
2210
+ },
2211
+ {
2212
+ "epoch": 27.62,
2213
+ "learning_rate": 9.375000000000001e-07,
2214
+ "loss": 0.4149,
2215
+ "step": 162500
2216
+ },
2217
+ {
2218
+ "epoch": 27.71,
2219
+ "learning_rate": 9.25e-07,
2220
+ "loss": 0.412,
2221
+ "step": 163000
2222
+ },
2223
+ {
2224
+ "epoch": 27.79,
2225
+ "learning_rate": 9.125e-07,
2226
+ "loss": 0.4145,
2227
+ "step": 163500
2228
+ },
2229
+ {
2230
+ "epoch": 27.88,
2231
+ "learning_rate": 9.000000000000001e-07,
2232
+ "loss": 0.4175,
2233
+ "step": 164000
2234
+ },
2235
+ {
2236
+ "epoch": 27.96,
2237
+ "learning_rate": 8.875000000000001e-07,
2238
+ "loss": 0.4112,
2239
+ "step": 164500
2240
+ },
2241
+ {
2242
+ "epoch": 28.05,
2243
+ "learning_rate": 8.75e-07,
2244
+ "loss": 0.4087,
2245
+ "step": 165000
2246
+ },
2247
+ {
2248
+ "epoch": 28.05,
2249
+ "eval_loss": 0.5984655618667603,
2250
+ "eval_runtime": 27.6329,
2251
+ "eval_samples_per_second": 424.783,
2252
+ "eval_steps_per_second": 53.125,
2253
+ "step": 165000
2254
+ },
2255
+ {
2256
+ "epoch": 28.13,
2257
+ "learning_rate": 8.625e-07,
2258
+ "loss": 0.4147,
2259
+ "step": 165500
2260
+ },
2261
+ {
2262
+ "epoch": 28.22,
2263
+ "learning_rate": 8.500000000000001e-07,
2264
+ "loss": 0.4125,
2265
+ "step": 166000
2266
+ },
2267
+ {
2268
+ "epoch": 28.3,
2269
+ "learning_rate": 8.375000000000001e-07,
2270
+ "loss": 0.4117,
2271
+ "step": 166500
2272
+ },
2273
+ {
2274
+ "epoch": 28.39,
2275
+ "learning_rate": 8.250000000000001e-07,
2276
+ "loss": 0.4186,
2277
+ "step": 167000
2278
+ },
2279
+ {
2280
+ "epoch": 28.47,
2281
+ "learning_rate": 8.125000000000001e-07,
2282
+ "loss": 0.4056,
2283
+ "step": 167500
2284
+ },
2285
+ {
2286
+ "epoch": 28.56,
2287
+ "learning_rate": 8.000000000000001e-07,
2288
+ "loss": 0.4177,
2289
+ "step": 168000
2290
+ },
2291
+ {
2292
+ "epoch": 28.64,
2293
+ "learning_rate": 7.875000000000001e-07,
2294
+ "loss": 0.414,
2295
+ "step": 168500
2296
+ },
2297
+ {
2298
+ "epoch": 28.73,
2299
+ "learning_rate": 7.750000000000001e-07,
2300
+ "loss": 0.4147,
2301
+ "step": 169000
2302
+ },
2303
+ {
2304
+ "epoch": 28.81,
2305
+ "learning_rate": 7.625e-07,
2306
+ "loss": 0.4134,
2307
+ "step": 169500
2308
+ },
2309
+ {
2310
+ "epoch": 28.9,
2311
+ "learning_rate": 7.5e-07,
2312
+ "loss": 0.4219,
2313
+ "step": 170000
2314
+ },
2315
+ {
2316
+ "epoch": 28.9,
2317
+ "eval_loss": 0.5966935157775879,
2318
+ "eval_runtime": 27.6605,
2319
+ "eval_samples_per_second": 424.36,
2320
+ "eval_steps_per_second": 53.072,
2321
+ "step": 170000
2322
+ },
2323
+ {
2324
+ "epoch": 28.98,
2325
+ "learning_rate": 7.375e-07,
2326
+ "loss": 0.4117,
2327
+ "step": 170500
2328
+ },
2329
+ {
2330
+ "epoch": 29.07,
2331
+ "learning_rate": 7.25e-07,
2332
+ "loss": 0.409,
2333
+ "step": 171000
2334
+ },
2335
+ {
2336
+ "epoch": 29.15,
2337
+ "learning_rate": 7.125e-07,
2338
+ "loss": 0.4138,
2339
+ "step": 171500
2340
+ },
2341
+ {
2342
+ "epoch": 29.24,
2343
+ "learning_rate": 7.000000000000001e-07,
2344
+ "loss": 0.4104,
2345
+ "step": 172000
2346
+ },
2347
+ {
2348
+ "epoch": 29.32,
2349
+ "learning_rate": 6.875000000000001e-07,
2350
+ "loss": 0.4015,
2351
+ "step": 172500
2352
+ },
2353
+ {
2354
+ "epoch": 29.41,
2355
+ "learning_rate": 6.750000000000001e-07,
2356
+ "loss": 0.4104,
2357
+ "step": 173000
2358
+ },
2359
+ {
2360
+ "epoch": 29.49,
2361
+ "learning_rate": 6.625000000000001e-07,
2362
+ "loss": 0.4053,
2363
+ "step": 173500
2364
+ },
2365
+ {
2366
+ "epoch": 29.58,
2367
+ "learning_rate": 6.5e-07,
2368
+ "loss": 0.4163,
2369
+ "step": 174000
2370
+ },
2371
+ {
2372
+ "epoch": 29.66,
2373
+ "learning_rate": 6.375e-07,
2374
+ "loss": 0.4124,
2375
+ "step": 174500
2376
+ },
2377
+ {
2378
+ "epoch": 29.75,
2379
+ "learning_rate": 6.25e-07,
2380
+ "loss": 0.411,
2381
+ "step": 175000
2382
+ },
2383
+ {
2384
+ "epoch": 29.75,
2385
+ "eval_loss": 0.5960233211517334,
2386
+ "eval_runtime": 27.9317,
2387
+ "eval_samples_per_second": 420.239,
2388
+ "eval_steps_per_second": 52.557,
2389
+ "step": 175000
2390
+ }
2391
+ ],
2392
+ "logging_steps": 500,
2393
+ "max_steps": 200000,
2394
+ "num_train_epochs": 34,
2395
+ "save_steps": 5000,
2396
+ "total_flos": 9.1304017790976e+16,
2397
+ "trial_name": null,
2398
+ "trial_params": null
2399
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6399202e41fad4c90760d18a2e6b23c2fbe9dcfb300b3b60c13b3a6e88df6965
3
+ size 4219
vocab.json ADDED
The diff for this file is too large to render. See raw diff