Tags: Text Generation · Transformers · PyTorch · llama · Eval Results · text-generation-inference · Inference Endpoints
Declare committed on
Commit 1249c08 · 1 Parent(s): cf2c634

adding remaining files

generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "pad_token_id": 0,
+   "transformers_version": "4.29.0"
+ }
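
This file seeds the defaults that transformers applies in model.generate(). A minimal loading sketch; the repo id below is a placeholder, not this repository's actual id:

from transformers import GenerationConfig

# Hypothetical repo id; substitute the real repo id or a local path.
gen_config = GenerationConfig.from_pretrained("your-org/your-model")
# Values taken from the file above: bos=1, eos=2, pad=0.
print(gen_config.bos_token_id, gen_config.eos_token_id, gen_config.pad_token_id)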
pytorch_model.bin.index.json ADDED
@@ -0,0 +1,330 @@
+ {
+   "metadata": {
+     "total_size": 13476835328
+   },
+   "weight_map": {
+     "lm_head.weight": "pytorch_model-00002-of-00002.bin",
+     "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.0.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.1.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.10.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.11.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.11.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.11.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.12.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.12.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.12.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.13.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.13.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.13.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.14.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.14.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.14.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.15.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.15.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.15.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.16.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.16.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.16.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.17.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.17.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.17.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.18.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.18.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.18.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.19.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.19.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.19.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.19.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.2.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.20.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.20.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.20.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.20.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.21.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.21.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.21.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.21.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.22.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.22.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.22.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.22.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.23.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.23.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.23.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.23.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.24.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.24.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.24.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
+     "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.25.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.25.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.25.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
+     "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.26.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.26.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.26.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
+     "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.27.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.27.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.27.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
+     "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.28.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.28.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.28.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
+     "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.29.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.29.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.29.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.29.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
+     "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.3.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.30.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.30.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.30.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.30.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
+     "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.31.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.31.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.31.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.31.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
+     "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.4.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.5.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.6.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.7.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.8.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.9.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+     "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.norm.weight": "pytorch_model-00002-of-00002.bin"
+   }
+ }
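
This index is what from_pretrained consults to route each parameter name to its shard: the embedding and layers 0-23 live in shard 1, while lm_head, layers 24-31 and the final norm live in shard 2. The 13,476,835,328-byte total is consistent with a ~7B-parameter model stored in fp16. A minimal sketch of inspecting the index by hand, assuming a local copy of the file:

import json
from collections import defaultdict

with open("pytorch_model.bin.index.json") as f:
    index = json.load(f)

# Group parameter names by the shard file that stores them.
shards = defaultdict(list)
for name, shard_file in index["weight_map"].items():
    shards[shard_file].append(name)

print(index["metadata"]["total_size"], "bytes total")
for shard_file, names in sorted(shards.items()):
    print(shard_file, "->", len(names), "tensors")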
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<unk>",
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
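
What was committed here is a Git LFS pointer, not the SentencePiece model itself; the oid is the SHA-256 of the real blob, which git lfs pull materializes in the working tree. A small sketch of verifying the downloaded file against the pointer:

import hashlib

# After `git lfs pull`, the working-tree file should hash to the pointer's oid.
with open("tokenizer.model", "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()
assert digest == "9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347"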
tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "clean_up_tokenization_spaces": false,
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "model_max_length": 1280,
+   "pad_token": null,
+   "padding_side": "right",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
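
Together with special_tokens_map.json above, this file drives how the tokenizer is instantiated; note the "model_max_length": 1280 cap, beyond which inputs are truncated. A minimal loading sketch (the repo id is a placeholder; loading the LlamaTokenizer may require the sentencepiece package):

from transformers import AutoTokenizer

# Hypothetical repo id; substitute the real repo id or a local path.
tok = AutoTokenizer.from_pretrained("your-org/your-model")
ids = tok("hello world").input_ids
# With "add_bos_token": true and "add_eos_token": false, the sequence
# starts with bos_token_id (1) and does not end with eos_token_id (2).
assert ids[0] == tok.bos_token_id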
trainer_state.json ADDED
@@ -0,0 +1,971 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 2.9960768928991763,
+   "global_step": 954,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.03,
+       "learning_rate": 9.997799572243123e-06,
+       "loss": 0.4974,
+       "step": 10
+     },
+     {
+       "epoch": 0.06,
+       "learning_rate": 9.990195641770761e-06,
+       "loss": 0.4841,
+       "step": 20
+     },
+     {
+       "epoch": 0.06,
+       "eval_loss": 0.4602764844894409,
+       "eval_runtime": 62.0223,
+       "eval_samples_per_second": 13.431,
+       "eval_steps_per_second": 0.855,
+       "step": 20
+     },
+     {
+       "epoch": 0.09,
+       "learning_rate": 9.9771693033643e-06,
+       "loss": 0.4748,
+       "step": 30
+     },
+     {
+       "epoch": 0.13,
+       "learning_rate": 9.958734711603195e-06,
+       "loss": 0.4678,
+       "step": 40
+     },
+     {
+       "epoch": 0.13,
+       "eval_loss": 0.45144811272621155,
+       "eval_runtime": 61.8568,
+       "eval_samples_per_second": 13.467,
+       "eval_steps_per_second": 0.857,
+       "step": 40
+     },
+     {
+       "epoch": 0.16,
+       "learning_rate": 9.934911897741493e-06,
+       "loss": 0.4687,
+       "step": 50
+     },
+     {
+       "epoch": 0.19,
+       "learning_rate": 9.905726747941616e-06,
+       "loss": 0.4687,
+       "step": 60
+     },
+     {
+       "epoch": 0.19,
+       "eval_loss": 0.44708773493766785,
+       "eval_runtime": 62.0907,
+       "eval_samples_per_second": 13.416,
+       "eval_steps_per_second": 0.854,
+       "step": 60
+     },
+     {
+       "epoch": 0.22,
+       "learning_rate": 9.871210975146135e-06,
+       "loss": 0.473,
+       "step": 70
+     },
+     {
+       "epoch": 0.25,
+       "learning_rate": 9.831402084618113e-06,
+       "loss": 0.4608,
+       "step": 80
+     },
+     {
+       "epoch": 0.25,
+       "eval_loss": 0.44395506381988525,
+       "eval_runtime": 61.9484,
+       "eval_samples_per_second": 13.447,
+       "eval_steps_per_second": 0.856,
+       "step": 80
+     },
+     {
+       "epoch": 0.28,
+       "learning_rate": 9.786343333187412e-06,
+       "loss": 0.4542,
+       "step": 90
+     },
+     {
+       "epoch": 0.31,
+       "learning_rate": 9.736083682247287e-06,
+       "loss": 0.4593,
+       "step": 100
+     },
+     {
+       "epoch": 0.31,
+       "eval_loss": 0.43974635004997253,
+       "eval_runtime": 61.9084,
+       "eval_samples_per_second": 13.455,
+       "eval_steps_per_second": 0.856,
+       "step": 100
+     },
+     {
+       "epoch": 0.35,
+       "learning_rate": 9.680677744552346e-06,
+       "loss": 0.4681,
+       "step": 110
+     },
+     {
+       "epoch": 0.38,
+       "learning_rate": 9.620185724875652e-06,
+       "loss": 0.4488,
+       "step": 120
+     },
+     {
+       "epoch": 0.38,
+       "eval_loss": 0.435758113861084,
+       "eval_runtime": 61.8043,
+       "eval_samples_per_second": 13.478,
+       "eval_steps_per_second": 0.858,
+       "step": 120
+     },
+     {
+       "epoch": 0.41,
+       "learning_rate": 9.55467335458948e-06,
+       "loss": 0.4478,
+       "step": 130
+     },
+     {
+       "epoch": 0.44,
+       "learning_rate": 9.484211820240797e-06,
+       "loss": 0.4494,
+       "step": 140
+     },
+     {
+       "epoch": 0.44,
+       "eval_loss": 0.434115469455719,
+       "eval_runtime": 61.841,
+       "eval_samples_per_second": 13.47,
+       "eval_steps_per_second": 0.857,
+       "step": 140
+     },
+     {
+       "epoch": 0.47,
+       "learning_rate": 9.408877686199078e-06,
+       "loss": 0.44,
+       "step": 150
+     },
+     {
+       "epoch": 0.5,
+       "learning_rate": 9.328752811460542e-06,
+       "loss": 0.4477,
+       "step": 160
+     },
+     {
+       "epoch": 0.5,
+       "eval_loss": 0.4319334030151367,
+       "eval_runtime": 61.9956,
+       "eval_samples_per_second": 13.436,
+       "eval_steps_per_second": 0.855,
+       "step": 160
+     },
+     {
+       "epoch": 0.53,
+       "learning_rate": 9.243924260699133e-06,
+       "loss": 0.4465,
+       "step": 170
+     },
+     {
+       "epoch": 0.56,
+       "learning_rate": 9.154484209661002e-06,
+       "loss": 0.445,
+       "step": 180
+     },
+     {
+       "epoch": 0.56,
+       "eval_loss": 0.42997926473617554,
+       "eval_runtime": 62.0149,
+       "eval_samples_per_second": 13.432,
+       "eval_steps_per_second": 0.855,
+       "step": 180
+     },
+     {
+       "epoch": 0.6,
+       "learning_rate": 9.060529845005184e-06,
+       "loss": 0.4332,
+       "step": 190
+     },
+     {
+       "epoch": 0.63,
+       "learning_rate": 8.962163258699397e-06,
+       "loss": 0.4366,
+       "step": 200
+     },
+     {
+       "epoch": 0.63,
+       "eval_loss": 0.4274918735027313,
+       "eval_runtime": 61.8797,
+       "eval_samples_per_second": 13.462,
+       "eval_steps_per_second": 0.857,
+       "step": 200
+     },
+     {
+       "epoch": 0.66,
+       "learning_rate": 8.859491337085643e-06,
+       "loss": 0.428,
+       "step": 210
+     },
+     {
+       "epoch": 0.69,
+       "learning_rate": 8.752625644736204e-06,
+       "loss": 0.442,
+       "step": 220
+     },
+     {
+       "epoch": 0.69,
+       "eval_loss": 0.42491650581359863,
+       "eval_runtime": 62.1183,
+       "eval_samples_per_second": 13.41,
+       "eval_steps_per_second": 0.853,
+       "step": 220
+     },
+     {
+       "epoch": 0.72,
+       "learning_rate": 8.641682303226197e-06,
+       "loss": 0.4442,
+       "step": 230
+     },
+     {
+       "epoch": 0.75,
+       "learning_rate": 8.526781864954453e-06,
+       "loss": 0.4424,
+       "step": 240
+     },
+     {
+       "epoch": 0.75,
+       "eval_loss": 0.4228505492210388,
+       "eval_runtime": 62.0338,
+       "eval_samples_per_second": 13.428,
+       "eval_steps_per_second": 0.854,
+       "step": 240
+     },
+     {
+       "epoch": 0.78,
+       "learning_rate": 8.40804918214979e-06,
+       "loss": 0.4301,
+       "step": 250
+     },
+     {
+       "epoch": 0.82,
+       "learning_rate": 8.28561327120505e-06,
+       "loss": 0.4427,
+       "step": 260
+     },
+     {
+       "epoch": 0.82,
+       "eval_loss": 0.41998758912086487,
+       "eval_runtime": 61.9344,
+       "eval_samples_per_second": 13.45,
+       "eval_steps_per_second": 0.856,
+       "step": 260
+     },
+     {
+       "epoch": 0.85,
+       "learning_rate": 8.159607172486301e-06,
+       "loss": 0.4316,
+       "step": 270
+     },
+     {
+       "epoch": 0.88,
+       "learning_rate": 8.030167805769537e-06,
+       "loss": 0.4372,
+       "step": 280
+     },
+     {
+       "epoch": 0.88,
+       "eval_loss": 0.4175536036491394,
+       "eval_runtime": 61.971,
+       "eval_samples_per_second": 13.442,
+       "eval_steps_per_second": 0.855,
+       "step": 280
+     },
+     {
+       "epoch": 0.91,
+       "learning_rate": 7.897435821461964e-06,
+       "loss": 0.4398,
+       "step": 290
+     },
+     {
+       "epoch": 0.94,
+       "learning_rate": 7.761555447769548e-06,
+       "loss": 0.4335,
+       "step": 300
+     },
+     {
+       "epoch": 0.94,
+       "eval_loss": 0.4155929982662201,
+       "eval_runtime": 61.9696,
+       "eval_samples_per_second": 13.442,
+       "eval_steps_per_second": 0.855,
+       "step": 300
+     },
+     {
+       "epoch": 0.97,
+       "learning_rate": 7.622674333976863e-06,
+       "loss": 0.4371,
+       "step": 310
+     },
+     {
+       "epoch": 1.0,
+       "learning_rate": 7.4809433900095705e-06,
+       "loss": 0.4088,
+       "step": 320
+     },
+     {
+       "epoch": 1.0,
+       "eval_loss": 0.41465404629707336,
+       "eval_runtime": 61.9822,
+       "eval_samples_per_second": 13.439,
+       "eval_steps_per_second": 0.855,
+       "step": 320
+     },
+     {
+       "epoch": 1.04,
+       "learning_rate": 7.336516622453833e-06,
+       "loss": 0.3166,
+       "step": 330
+     },
+     {
+       "epoch": 1.07,
+       "learning_rate": 7.1895509672108674e-06,
+       "loss": 0.3145,
+       "step": 340
+     },
+     {
+       "epoch": 1.07,
+       "eval_loss": 0.42051395773887634,
+       "eval_runtime": 62.5118,
+       "eval_samples_per_second": 13.325,
+       "eval_steps_per_second": 0.848,
+       "step": 340
+     },
+     {
+       "epoch": 1.1,
+       "learning_rate": 7.040206118968466e-06,
+       "loss": 0.3136,
+       "step": 350
+     },
+     {
+       "epoch": 1.13,
+       "learning_rate": 6.88864435767478e-06,
+       "loss": 0.3151,
+       "step": 360
+     },
+     {
+       "epoch": 1.13,
+       "eval_loss": 0.4205115735530853,
+       "eval_runtime": 62.5792,
+       "eval_samples_per_second": 13.311,
+       "eval_steps_per_second": 0.847,
+       "step": 360
+     },
+     {
+       "epoch": 1.16,
+       "learning_rate": 6.735030372202942e-06,
+       "loss": 0.3137,
+       "step": 370
+     },
+     {
+       "epoch": 1.19,
+       "learning_rate": 6.579531081398105e-06,
+       "loss": 0.3019,
+       "step": 380
+     },
+     {
+       "epoch": 1.19,
+       "eval_loss": 0.4216003119945526,
+       "eval_runtime": 62.3646,
+       "eval_samples_per_second": 13.357,
+       "eval_steps_per_second": 0.85,
+       "step": 380
+     },
+     {
+       "epoch": 1.22,
+       "learning_rate": 6.4223154527013755e-06,
+       "loss": 0.3044,
+       "step": 390
+     },
+     {
+       "epoch": 1.26,
+       "learning_rate": 6.263554318547713e-06,
+       "loss": 0.3044,
+       "step": 400
+     },
+     {
+       "epoch": 1.26,
+       "eval_loss": 0.4185173809528351,
+       "eval_runtime": 61.9058,
+       "eval_samples_per_second": 13.456,
+       "eval_steps_per_second": 0.856,
+       "step": 400
+     },
+     {
+       "epoch": 1.29,
+       "learning_rate": 6.1034201907373045e-06,
+       "loss": 0.305,
+       "step": 410
+     },
+     {
+       "epoch": 1.32,
+       "learning_rate": 5.942087072982131e-06,
+       "loss": 0.3034,
+       "step": 420
+     },
+     {
+       "epoch": 1.32,
+       "eval_loss": 0.41815003752708435,
+       "eval_runtime": 62.0238,
+       "eval_samples_per_second": 13.43,
+       "eval_steps_per_second": 0.855,
+       "step": 420
+     },
+     {
+       "epoch": 1.35,
+       "learning_rate": 5.779730271831384e-06,
+       "loss": 0.3115,
+       "step": 430
+     },
+     {
+       "epoch": 1.38,
+       "learning_rate": 5.616526206181215e-06,
+       "loss": 0.3026,
+       "step": 440
+     },
+     {
+       "epoch": 1.38,
+       "eval_loss": 0.41711267828941345,
+       "eval_runtime": 62.5134,
+       "eval_samples_per_second": 13.325,
+       "eval_steps_per_second": 0.848,
+       "step": 440
+     },
+     {
+       "epoch": 1.41,
+       "learning_rate": 5.4526522155758015e-06,
+       "loss": 0.3077,
+       "step": 450
+     },
+     {
+       "epoch": 1.45,
+       "learning_rate": 5.288286367508009e-06,
+       "loss": 0.3062,
+       "step": 460
+     },
+     {
+       "epoch": 1.45,
+       "eval_loss": 0.41751572489738464,
+       "eval_runtime": 62.561,
+       "eval_samples_per_second": 13.315,
+       "eval_steps_per_second": 0.847,
+       "step": 460
+     },
+     {
+       "epoch": 1.48,
+       "learning_rate": 5.123607263929075e-06,
+       "loss": 0.3076,
+       "step": 470
+     },
+     {
+       "epoch": 1.51,
+       "learning_rate": 4.958793847177518e-06,
+       "loss": 0.315,
+       "step": 480
+     },
+     {
+       "epoch": 1.51,
+       "eval_loss": 0.41455498337745667,
+       "eval_runtime": 62.0669,
+       "eval_samples_per_second": 13.421,
+       "eval_steps_per_second": 0.854,
+       "step": 480
+     },
+     {
+       "epoch": 1.54,
+       "learning_rate": 4.7940252055382115e-06,
+       "loss": 0.3024,
+       "step": 490
+     },
+     {
+       "epoch": 1.57,
+       "learning_rate": 4.629480378642832e-06,
+       "loss": 0.3124,
+       "step": 500
+     },
+     {
+       "epoch": 1.57,
+       "eval_loss": 0.41453319787979126,
+       "eval_runtime": 61.9067,
+       "eval_samples_per_second": 13.456,
+       "eval_steps_per_second": 0.856,
+       "step": 500
+     },
+     {
+       "epoch": 1.6,
+       "learning_rate": 4.46533816292321e-06,
+       "loss": 0.31,
+       "step": 510
+     },
+     {
+       "epoch": 1.63,
+       "learning_rate": 4.301776917328918e-06,
+       "loss": 0.3096,
+       "step": 520
+     },
+     {
+       "epoch": 1.63,
+       "eval_loss": 0.41273748874664307,
+       "eval_runtime": 62.1417,
+       "eval_samples_per_second": 13.405,
+       "eval_steps_per_second": 0.853,
+       "step": 520
+     },
+     {
+       "epoch": 1.67,
+       "learning_rate": 4.138974369520252e-06,
+       "loss": 0.3044,
+       "step": 530
+     },
+     {
+       "epoch": 1.7,
+       "learning_rate": 3.977107422747163e-06,
+       "loss": 0.3178,
+       "step": 540
+     },
+     {
+       "epoch": 1.7,
+       "eval_loss": 0.4111482501029968,
+       "eval_runtime": 62.5639,
+       "eval_samples_per_second": 13.314,
+       "eval_steps_per_second": 0.847,
+       "step": 540
+     },
+     {
+       "epoch": 1.73,
+       "learning_rate": 3.816351963624017e-06,
+       "loss": 0.3102,
+       "step": 550
+     },
+     {
+       "epoch": 1.76,
+       "learning_rate": 3.6568826710090353e-06,
+       "loss": 0.3044,
+       "step": 560
+     },
+     {
+       "epoch": 1.76,
+       "eval_loss": 0.4110707640647888,
+       "eval_runtime": 62.5689,
+       "eval_samples_per_second": 13.313,
+       "eval_steps_per_second": 0.847,
+       "step": 560
+     },
+     {
+       "epoch": 1.79,
+       "learning_rate": 3.4988728261960957e-06,
+       "loss": 0.306,
+       "step": 570
+     },
+     {
+       "epoch": 1.82,
+       "learning_rate": 3.3424941246251574e-06,
+       "loss": 0.3078,
+       "step": 580
+     },
+     {
+       "epoch": 1.82,
+       "eval_loss": 0.4091060757637024,
+       "eval_runtime": 62.0676,
+       "eval_samples_per_second": 13.421,
+       "eval_steps_per_second": 0.854,
+       "step": 580
+     },
+     {
+       "epoch": 1.85,
+       "learning_rate": 3.1879164893158713e-06,
+       "loss": 0.2977,
+       "step": 590
+     },
+     {
+       "epoch": 1.89,
+       "learning_rate": 3.035307886227156e-06,
+       "loss": 0.2967,
+       "step": 600
+     },
+     {
+       "epoch": 1.89,
+       "eval_loss": 0.4094270169734955,
+       "eval_runtime": 62.0655,
+       "eval_samples_per_second": 13.421,
+       "eval_steps_per_second": 0.854,
+       "step": 600
+     },
+     {
+       "epoch": 1.92,
+       "learning_rate": 2.8848341417433036e-06,
+       "loss": 0.3069,
+       "step": 610
+     },
+     {
+       "epoch": 1.95,
+       "learning_rate": 2.736658762485005e-06,
+       "loss": 0.3068,
+       "step": 620
+     },
+     {
+       "epoch": 1.95,
+       "eval_loss": 0.4080323278903961,
+       "eval_runtime": 62.0098,
+       "eval_samples_per_second": 13.433,
+       "eval_steps_per_second": 0.855,
+       "step": 620
+     },
+     {
+       "epoch": 1.98,
+       "learning_rate": 2.590942757641035e-06,
+       "loss": 0.3037,
+       "step": 630
+     },
+     {
+       "epoch": 2.01,
+       "learning_rate": 2.447844464013703e-06,
+       "loss": 0.276,
+       "step": 640
+     },
+     {
+       "epoch": 2.01,
+       "eval_loss": 0.4152510464191437,
+       "eval_runtime": 61.9609,
+       "eval_samples_per_second": 13.444,
+       "eval_steps_per_second": 0.855,
+       "step": 640
+     },
+     {
+       "epoch": 2.04,
+       "learning_rate": 2.3075193739681182e-06,
+       "loss": 0.2327,
+       "step": 650
+     },
+     {
+       "epoch": 2.07,
+       "learning_rate": 2.170119966472293e-06,
+       "loss": 0.2288,
+       "step": 660
+     },
+     {
+       "epoch": 2.07,
+       "eval_loss": 0.43204566836357117,
+       "eval_runtime": 62.0792,
+       "eval_samples_per_second": 13.418,
+       "eval_steps_per_second": 0.854,
+       "step": 660
+     },
+     {
+       "epoch": 2.1,
+       "learning_rate": 2.0357955414116075e-06,
+       "loss": 0.2267,
+       "step": 670
+     },
+     {
+       "epoch": 2.14,
+       "learning_rate": 1.9046920573577239e-06,
+       "loss": 0.2244,
+       "step": 680
+     },
+     {
+       "epoch": 2.14,
+       "eval_loss": 0.4292474389076233,
+       "eval_runtime": 62.0223,
+       "eval_samples_per_second": 13.431,
+       "eval_steps_per_second": 0.855,
+       "step": 680
+     },
+     {
+       "epoch": 2.17,
+       "learning_rate": 1.7769519729682105e-06,
+       "loss": 0.2327,
+       "step": 690
+     },
+     {
+       "epoch": 2.2,
+       "learning_rate": 1.6527140921892066e-06,
+       "loss": 0.2336,
+       "step": 700
+     },
+     {
+       "epoch": 2.2,
+       "eval_loss": 0.427610844373703,
+       "eval_runtime": 62.1107,
+       "eval_samples_per_second": 13.412,
+       "eval_steps_per_second": 0.853,
+       "step": 700
+     },
+     {
+       "epoch": 2.23,
+       "learning_rate": 1.532113413429357e-06,
+       "loss": 0.2386,
+       "step": 710
+     },
+     {
+       "epoch": 2.26,
+       "learning_rate": 1.4152809828688708e-06,
+       "loss": 0.2266,
+       "step": 720
+     },
+     {
+       "epoch": 2.26,
+       "eval_loss": 0.4290391206741333,
+       "eval_runtime": 62.2449,
+       "eval_samples_per_second": 13.383,
+       "eval_steps_per_second": 0.851,
+       "step": 720
+     },
+     {
+       "epoch": 2.29,
+       "learning_rate": 1.3023437520631426e-06,
+       "loss": 0.2328,
+       "step": 730
+     },
+     {
+       "epoch": 2.32,
+       "learning_rate": 1.1934244399956206e-06,
+       "loss": 0.2312,
+       "step": 740
+     },
+     {
+       "epoch": 2.32,
+       "eval_loss": 0.42950907349586487,
+       "eval_runtime": 62.2915,
+       "eval_samples_per_second": 13.373,
+       "eval_steps_per_second": 0.851,
+       "step": 740
+     },
+     {
+       "epoch": 2.36,
+       "learning_rate": 1.0886413997298595e-06,
+       "loss": 0.2338,
+       "step": 750
+     },
+     {
+       "epoch": 2.39,
+       "learning_rate": 9.881084898056197e-07,
+       "loss": 0.2277,
+       "step": 760
+     },
+     {
+       "epoch": 2.39,
+       "eval_loss": 0.4284292161464691,
+       "eval_runtime": 62.4097,
+       "eval_samples_per_second": 13.347,
+       "eval_steps_per_second": 0.849,
+       "step": 760
+     },
+     {
+       "epoch": 2.42,
+       "learning_rate": 8.919349505187813e-07,
+       "loss": 0.2333,
+       "step": 770
+     },
+     {
+       "epoch": 2.45,
+       "learning_rate": 8.002252852194992e-07,
+       "loss": 0.2332,
+       "step": 780
+     },
+     {
+       "epoch": 2.45,
+       "eval_loss": 0.42790091037750244,
+       "eval_runtime": 62.4651,
+       "eval_samples_per_second": 13.335,
+       "eval_steps_per_second": 0.848,
+       "step": 780
+     },
+     {
+       "epoch": 2.48,
+       "learning_rate": 7.130791467575676e-07,
+       "loss": 0.2257,
+       "step": 790
+     },
+     {
+       "epoch": 2.51,
+       "learning_rate": 6.305912291984229e-07,
+       "loss": 0.2289,
+       "step": 800
+     },
+     {
+       "epoch": 2.51,
+       "eval_loss": 0.42792582511901855,
+       "eval_runtime": 62.5296,
+       "eval_samples_per_second": 13.322,
+       "eval_steps_per_second": 0.848,
+       "step": 800
+     },
+     {
+       "epoch": 2.54,
+       "learning_rate": 5.528511649273932e-07,
+       "loss": 0.2303,
+       "step": 810
+     },
+     {
+       "epoch": 2.58,
+       "learning_rate": 4.799434272540576e-07,
+       "loss": 0.2279,
+       "step": 820
+     },
+     {
+       "epoch": 2.58,
+       "eval_loss": 0.4278266131877899,
+       "eval_runtime": 62.5218,
+       "eval_samples_per_second": 13.323,
+       "eval_steps_per_second": 0.848,
+       "step": 820
+     },
+     {
+       "epoch": 2.61,
+       "learning_rate": 4.1194723862250317e-07,
+       "loss": 0.2267,
+       "step": 830
+     },
+     {
+       "epoch": 2.64,
+       "learning_rate": 3.4893648452724636e-07,
+       "loss": 0.2312,
+       "step": 840
+     },
+     {
+       "epoch": 2.64,
+       "eval_loss": 0.4273243546485901,
+       "eval_runtime": 62.5591,
+       "eval_samples_per_second": 13.315,
+       "eval_steps_per_second": 0.847,
+       "step": 840
+     },
+     {
+       "epoch": 2.67,
+       "learning_rate": 2.9097963322834597e-07,
+       "loss": 0.2306,
+       "step": 850
+     },
+     {
+       "epoch": 2.7,
+       "learning_rate": 2.3813966135294574e-07,
+       "loss": 0.2334,
+       "step": 860
+     },
+     {
+       "epoch": 2.7,
+       "eval_loss": 0.42646506428718567,
+       "eval_runtime": 62.6378,
+       "eval_samples_per_second": 13.299,
+       "eval_steps_per_second": 0.846,
+       "step": 860
+     },
+     {
+       "epoch": 2.73,
+       "learning_rate": 1.9047398546410633e-07,
+       "loss": 0.2306,
+       "step": 870
+     },
+     {
+       "epoch": 2.76,
+       "learning_rate": 1.4803439967125022e-07,
+       "loss": 0.2278,
+       "step": 880
+     },
+     {
+       "epoch": 2.76,
+       "eval_loss": 0.42754805088043213,
+       "eval_runtime": 62.1476,
+       "eval_samples_per_second": 13.404,
+       "eval_steps_per_second": 0.853,
+       "step": 880
+     },
+     {
+       "epoch": 2.8,
+       "learning_rate": 1.1086701935005606e-07,
+       "loss": 0.2296,
+       "step": 890
+     },
+     {
+       "epoch": 2.83,
+       "learning_rate": 7.901223103291833e-08,
+       "loss": 0.2295,
+       "step": 900
+     },
+     {
+       "epoch": 2.83,
+       "eval_loss": 0.4276488721370697,
+       "eval_runtime": 61.9957,
+       "eval_samples_per_second": 13.436,
+       "eval_steps_per_second": 0.855,
+       "step": 900
+     },
+     {
+       "epoch": 2.86,
+       "learning_rate": 5.250464852444792e-08,
+       "loss": 0.2334,
+       "step": 910
+     },
+     {
+       "epoch": 2.89,
+       "learning_rate": 3.137307528968292e-08,
+       "loss": 0.2292,
+       "step": 920
+     },
+     {
+       "epoch": 2.89,
+       "eval_loss": 0.4273829162120819,
+       "eval_runtime": 62.0452,
+       "eval_samples_per_second": 13.426,
+       "eval_steps_per_second": 0.854,
+       "step": 920
+     },
+     {
+       "epoch": 2.92,
+       "learning_rate": 1.5640473155894566e-08,
+       "loss": 0.2284,
+       "step": 930
+     },
+     {
+       "epoch": 2.95,
+       "learning_rate": 5.323937361977338e-09,
+       "loss": 0.2291,
+       "step": 940
+     },
+     {
+       "epoch": 2.95,
+       "eval_loss": 0.42734310030937195,
+       "eval_runtime": 61.9825,
+       "eval_samples_per_second": 13.439,
+       "eval_steps_per_second": 0.855,
+       "step": 940
+     },
+     {
+       "epoch": 2.98,
+       "learning_rate": 4.346779825575853e-10,
+       "loss": 0.2288,
+       "step": 950
+     },
+     {
+       "epoch": 3.0,
+       "step": 954,
+       "total_flos": 6.195687991759864e+18,
+       "train_loss": 0.1527079766776327,
+       "train_runtime": 29559.5606,
+       "train_samples_per_second": 4.138,
+       "train_steps_per_second": 0.032
+     }
+   ],
+   "max_steps": 954,
+   "num_train_epochs": 3,
+   "total_flos": 6.195687991759864e+18,
+   "trial_name": null,
+   "trial_params": null
+ }
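
trainer_state.json records the full log history for the 954-step, 3-epoch run. A short sketch of pulling the loss curves out of it, assuming a local copy of the file:

import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]
print("last train point:", train[-1])
print("best eval point:", min(evals, key=lambda p: p[1]))

Read this way, the eval loss bottoms out at about 0.408 around step 620 and drifts back up toward 0.427 over epoch 3 while the train loss keeps falling, the usual sign of mild overfitting in the final epoch.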