DylanJHJ committed on
Commit
45062a0
·
1 Parent(s): c168127

add new models

Browse files
.gitattributes CHANGED
@@ -33,8 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- checkpoint-*/*bim filter=lfs diff=lfs merge=lfs -text
37
- checkpoint-5000/training_args.bin filter=lfs diff=lfs merge=lfs -text
38
  checkpoint-10000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
39
  checkpoint-10000/training_args.bin filter=lfs diff=lfs merge=lfs -text
40
- checkpoint-5000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
36
  checkpoint-10000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
37
  checkpoint-10000/training_args.bin filter=lfs diff=lfs merge=lfs -text
38
+ pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
39
+ **/**/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
{checkpoint-10000 → archived/checkpoint-10000}/config.json RENAMED
File without changes
{checkpoint-10000 → archived/checkpoint-10000}/generation_config.json RENAMED
File without changes
{checkpoint-10000 → archived/checkpoint-10000}/optimizer.pt RENAMED
File without changes
{checkpoint-10000 → archived/checkpoint-10000}/pytorch_model.bin RENAMED
File without changes
{checkpoint-10000 → archived/checkpoint-10000}/rng_state.pth RENAMED
File without changes
{checkpoint-10000 → archived/checkpoint-10000}/scheduler.pt RENAMED
File without changes
{checkpoint-10000 → archived/checkpoint-10000}/trainer_state.json RENAMED
File without changes
{checkpoint-10000 → archived/checkpoint-10000}/training_args.bin RENAMED
File without changes
checkpoint-5000/trainer_state.json DELETED
@@ -1,156 +0,0 @@
1
- {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 0.6308352258390109,
5
- "global_step": 5000,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.06,
12
- "learning_rate": 6.25e-05,
13
- "loss": 5.1725,
14
- "step": 500
15
- },
16
- {
17
- "epoch": 0.06,
18
- "eval_loss": 0.6939732432365417,
19
- "eval_runtime": 5.8238,
20
- "eval_samples_per_second": 17.171,
21
- "eval_steps_per_second": 8.585,
22
- "step": 500
23
- },
24
- {
25
- "epoch": 0.13,
26
- "learning_rate": 9.782608695652174e-05,
27
- "loss": 0.5104,
28
- "step": 1000
29
- },
30
- {
31
- "epoch": 0.13,
32
- "eval_loss": 0.641968846321106,
33
- "eval_runtime": 6.0125,
34
- "eval_samples_per_second": 16.632,
35
- "eval_steps_per_second": 8.316,
36
- "step": 1000
37
- },
38
- {
39
- "epoch": 0.19,
40
- "learning_rate": 9.239130434782609e-05,
41
- "loss": 0.485,
42
- "step": 1500
43
- },
44
- {
45
- "epoch": 0.19,
46
- "eval_loss": 0.6206984519958496,
47
- "eval_runtime": 5.6411,
48
- "eval_samples_per_second": 17.727,
49
- "eval_steps_per_second": 8.864,
50
- "step": 1500
51
- },
52
- {
53
- "epoch": 0.25,
54
- "learning_rate": 8.695652173913044e-05,
55
- "loss": 0.4644,
56
- "step": 2000
57
- },
58
- {
59
- "epoch": 0.25,
60
- "eval_loss": 0.6094934344291687,
61
- "eval_runtime": 4.7677,
62
- "eval_samples_per_second": 20.974,
63
- "eval_steps_per_second": 10.487,
64
- "step": 2000
65
- },
66
- {
67
- "epoch": 0.32,
68
- "learning_rate": 8.152173913043478e-05,
69
- "loss": 0.4563,
70
- "step": 2500
71
- },
72
- {
73
- "epoch": 0.32,
74
- "eval_loss": 0.6136000156402588,
75
- "eval_runtime": 5.1919,
76
- "eval_samples_per_second": 19.261,
77
- "eval_steps_per_second": 9.63,
78
- "step": 2500
79
- },
80
- {
81
- "epoch": 0.38,
82
- "learning_rate": 7.608695652173914e-05,
83
- "loss": 0.4426,
84
- "step": 3000
85
- },
86
- {
87
- "epoch": 0.38,
88
- "eval_loss": 0.6097093224525452,
89
- "eval_runtime": 5.562,
90
- "eval_samples_per_second": 17.979,
91
- "eval_steps_per_second": 8.99,
92
- "step": 3000
93
- },
94
- {
95
- "epoch": 0.44,
96
- "learning_rate": 7.065217391304349e-05,
97
- "loss": 0.4401,
98
- "step": 3500
99
- },
100
- {
101
- "epoch": 0.44,
102
- "eval_loss": 0.5967560410499573,
103
- "eval_runtime": 5.8426,
104
- "eval_samples_per_second": 17.116,
105
- "eval_steps_per_second": 8.558,
106
- "step": 3500
107
- },
108
- {
109
- "epoch": 0.5,
110
- "learning_rate": 6.521739130434783e-05,
111
- "loss": 0.4258,
112
- "step": 4000
113
- },
114
- {
115
- "epoch": 0.5,
116
- "eval_loss": 0.6082923412322998,
117
- "eval_runtime": 5.3584,
118
- "eval_samples_per_second": 18.662,
119
- "eval_steps_per_second": 9.331,
120
- "step": 4000
121
- },
122
- {
123
- "epoch": 0.57,
124
- "learning_rate": 5.9782608695652175e-05,
125
- "loss": 0.424,
126
- "step": 4500
127
- },
128
- {
129
- "epoch": 0.57,
130
- "eval_loss": 0.5975988507270813,
131
- "eval_runtime": 4.7572,
132
- "eval_samples_per_second": 21.021,
133
- "eval_steps_per_second": 10.51,
134
- "step": 4500
135
- },
136
- {
137
- "epoch": 0.63,
138
- "learning_rate": 5.4347826086956524e-05,
139
- "loss": 0.4375,
140
- "step": 5000
141
- },
142
- {
143
- "epoch": 0.63,
144
- "eval_loss": 0.5958980321884155,
145
- "eval_runtime": 5.8907,
146
- "eval_samples_per_second": 16.976,
147
- "eval_steps_per_second": 8.488,
148
- "step": 5000
149
- }
150
- ],
151
- "max_steps": 10000,
152
- "num_train_epochs": 2,
153
- "total_flos": 1.921873153314816e+16,
154
- "trial_name": null,
155
- "trial_params": null
156
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-5000/config.json → config.json RENAMED
File without changes
checkpoint-5000/generation_config.json → generation_config.json RENAMED
File without changes
checkpoint-5000/optimizer.pt → optimizer.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:daf86974bcdabe8d4b82019dd893671ec7dc44eebe5d722a1ca82b4ca3e2208a
3
  size 2371333
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50d4d0984f6d3afa328cdb63469c5a8ff9da0e29f70ebc2b46f803ea273608cd
3
  size 2371333
checkpoint-5000/pytorch_model.bin → pytorch_model.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a03e000db887a4c2ae1e3911ba55550f93b8468c29500cd6a9bca46d21b96adc
3
  size 990408885
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70f77c75f6db835162d64802678671bae5af6ec92907394913c274e72964591a
3
  size 990408885
checkpoint-5000/rng_state.pth → rng_state.pth RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab9b4ed4f719b5c091aaee0f8a3b0a2f1977980e6eb7bbd803bf6300cea01e57
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96805bdb6c1ed875dd7da931970e8b852a1c747ef4c73dba10e101dc01ad5c13
3
  size 14575
checkpoint-5000/scheduler.pt → scheduler.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a5db814da74fa52897e2706d0a6e369da3f074aa77a52775256fa20b47664cee
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f5e24b8bf255cbefe9d307944a9741807d095b40cc5429a7befe9515b366b0f
3
  size 627
trainer_state.json ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.5195263290501386,
5
+ "global_step": 10000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.13,
12
+ "learning_rate": 9.5e-05,
13
+ "loss": 0.8023,
14
+ "step": 500
15
+ },
16
+ {
17
+ "epoch": 0.13,
18
+ "eval_loss": 0.6604204177856445,
19
+ "eval_runtime": 5.6621,
20
+ "eval_samples_per_second": 52.984,
21
+ "eval_steps_per_second": 26.492,
22
+ "step": 500
23
+ },
24
+ {
25
+ "epoch": 0.25,
26
+ "learning_rate": 9e-05,
27
+ "loss": 0.7113,
28
+ "step": 1000
29
+ },
30
+ {
31
+ "epoch": 0.25,
32
+ "eval_loss": 0.6501660943031311,
33
+ "eval_runtime": 5.6949,
34
+ "eval_samples_per_second": 52.679,
35
+ "eval_steps_per_second": 26.339,
36
+ "step": 1000
37
+ },
38
+ {
39
+ "epoch": 0.38,
40
+ "learning_rate": 8.5e-05,
41
+ "loss": 0.7123,
42
+ "step": 1500
43
+ },
44
+ {
45
+ "epoch": 0.38,
46
+ "eval_loss": 0.6351694464683533,
47
+ "eval_runtime": 7.0443,
48
+ "eval_samples_per_second": 42.588,
49
+ "eval_steps_per_second": 21.294,
50
+ "step": 1500
51
+ },
52
+ {
53
+ "epoch": 0.5,
54
+ "learning_rate": 8e-05,
55
+ "loss": 0.7002,
56
+ "step": 2000
57
+ },
58
+ {
59
+ "epoch": 0.5,
60
+ "eval_loss": 0.6252465844154358,
61
+ "eval_runtime": 5.6908,
62
+ "eval_samples_per_second": 52.716,
63
+ "eval_steps_per_second": 26.358,
64
+ "step": 2000
65
+ },
66
+ {
67
+ "epoch": 0.63,
68
+ "learning_rate": 7.500000000000001e-05,
69
+ "loss": 0.6974,
70
+ "step": 2500
71
+ },
72
+ {
73
+ "epoch": 0.63,
74
+ "eval_loss": 0.6316436529159546,
75
+ "eval_runtime": 5.5384,
76
+ "eval_samples_per_second": 54.167,
77
+ "eval_steps_per_second": 27.083,
78
+ "step": 2500
79
+ },
80
+ {
81
+ "epoch": 0.76,
82
+ "learning_rate": 7e-05,
83
+ "loss": 0.668,
84
+ "step": 3000
85
+ },
86
+ {
87
+ "epoch": 0.76,
88
+ "eval_loss": 0.6197097301483154,
89
+ "eval_runtime": 5.9147,
90
+ "eval_samples_per_second": 50.721,
91
+ "eval_steps_per_second": 25.361,
92
+ "step": 3000
93
+ },
94
+ {
95
+ "epoch": 0.88,
96
+ "learning_rate": 6.500000000000001e-05,
97
+ "loss": 0.6721,
98
+ "step": 3500
99
+ },
100
+ {
101
+ "epoch": 0.88,
102
+ "eval_loss": 0.6143491268157959,
103
+ "eval_runtime": 6.0601,
104
+ "eval_samples_per_second": 49.504,
105
+ "eval_steps_per_second": 24.752,
106
+ "step": 3500
107
+ },
108
+ {
109
+ "epoch": 1.01,
110
+ "learning_rate": 6e-05,
111
+ "loss": 0.6603,
112
+ "step": 4000
113
+ },
114
+ {
115
+ "epoch": 1.01,
116
+ "eval_loss": 0.6048210859298706,
117
+ "eval_runtime": 5.3107,
118
+ "eval_samples_per_second": 56.49,
119
+ "eval_steps_per_second": 28.245,
120
+ "step": 4000
121
+ },
122
+ {
123
+ "epoch": 1.13,
124
+ "learning_rate": 5.500000000000001e-05,
125
+ "loss": 0.598,
126
+ "step": 4500
127
+ },
128
+ {
129
+ "epoch": 1.13,
130
+ "eval_loss": 0.6044912934303284,
131
+ "eval_runtime": 5.4715,
132
+ "eval_samples_per_second": 54.829,
133
+ "eval_steps_per_second": 27.415,
134
+ "step": 4500
135
+ },
136
+ {
137
+ "epoch": 1.26,
138
+ "learning_rate": 5e-05,
139
+ "loss": 0.5961,
140
+ "step": 5000
141
+ },
142
+ {
143
+ "epoch": 1.26,
144
+ "eval_loss": 0.5996153950691223,
145
+ "eval_runtime": 5.2909,
146
+ "eval_samples_per_second": 56.701,
147
+ "eval_steps_per_second": 28.35,
148
+ "step": 5000
149
+ },
150
+ {
151
+ "epoch": 1.39,
152
+ "learning_rate": 4.5e-05,
153
+ "loss": 0.5712,
154
+ "step": 5500
155
+ },
156
+ {
157
+ "epoch": 1.39,
158
+ "eval_loss": 0.6034131646156311,
159
+ "eval_runtime": 5.0673,
160
+ "eval_samples_per_second": 59.203,
161
+ "eval_steps_per_second": 29.602,
162
+ "step": 5500
163
+ },
164
+ {
165
+ "epoch": 1.51,
166
+ "learning_rate": 4e-05,
167
+ "loss": 0.5863,
168
+ "step": 6000
169
+ },
170
+ {
171
+ "epoch": 1.51,
172
+ "eval_loss": 0.5980101227760315,
173
+ "eval_runtime": 5.146,
174
+ "eval_samples_per_second": 58.297,
175
+ "eval_steps_per_second": 29.149,
176
+ "step": 6000
177
+ },
178
+ {
179
+ "epoch": 1.64,
180
+ "learning_rate": 3.5e-05,
181
+ "loss": 0.5814,
182
+ "step": 6500
183
+ },
184
+ {
185
+ "epoch": 1.64,
186
+ "eval_loss": 0.5993916988372803,
187
+ "eval_runtime": 5.2338,
188
+ "eval_samples_per_second": 57.32,
189
+ "eval_steps_per_second": 28.66,
190
+ "step": 6500
191
+ },
192
+ {
193
+ "epoch": 1.76,
194
+ "learning_rate": 3e-05,
195
+ "loss": 0.5726,
196
+ "step": 7000
197
+ },
198
+ {
199
+ "epoch": 1.76,
200
+ "eval_loss": 0.5940994024276733,
201
+ "eval_runtime": 5.1782,
202
+ "eval_samples_per_second": 57.935,
203
+ "eval_steps_per_second": 28.968,
204
+ "step": 7000
205
+ },
206
+ {
207
+ "epoch": 1.89,
208
+ "learning_rate": 2.5e-05,
209
+ "loss": 0.5751,
210
+ "step": 7500
211
+ },
212
+ {
213
+ "epoch": 1.89,
214
+ "eval_loss": 0.5939777493476868,
215
+ "eval_runtime": 5.4507,
216
+ "eval_samples_per_second": 55.039,
217
+ "eval_steps_per_second": 27.52,
218
+ "step": 7500
219
+ },
220
+ {
221
+ "epoch": 2.02,
222
+ "learning_rate": 2e-05,
223
+ "loss": 0.5831,
224
+ "step": 8000
225
+ },
226
+ {
227
+ "epoch": 2.02,
228
+ "eval_loss": 0.5925641059875488,
229
+ "eval_runtime": 5.3244,
230
+ "eval_samples_per_second": 56.344,
231
+ "eval_steps_per_second": 28.172,
232
+ "step": 8000
233
+ },
234
+ {
235
+ "epoch": 2.14,
236
+ "learning_rate": 1.5e-05,
237
+ "loss": 0.542,
238
+ "step": 8500
239
+ },
240
+ {
241
+ "epoch": 2.14,
242
+ "eval_loss": 0.597409188747406,
243
+ "eval_runtime": 5.5596,
244
+ "eval_samples_per_second": 53.96,
245
+ "eval_steps_per_second": 26.98,
246
+ "step": 8500
247
+ },
248
+ {
249
+ "epoch": 2.27,
250
+ "learning_rate": 1e-05,
251
+ "loss": 0.5256,
252
+ "step": 9000
253
+ },
254
+ {
255
+ "epoch": 2.27,
256
+ "eval_loss": 0.594516396522522,
257
+ "eval_runtime": 5.486,
258
+ "eval_samples_per_second": 54.684,
259
+ "eval_steps_per_second": 27.342,
260
+ "step": 9000
261
+ },
262
+ {
263
+ "epoch": 2.39,
264
+ "learning_rate": 5e-06,
265
+ "loss": 0.543,
266
+ "step": 9500
267
+ },
268
+ {
269
+ "epoch": 2.39,
270
+ "eval_loss": 0.592924177646637,
271
+ "eval_runtime": 5.3977,
272
+ "eval_samples_per_second": 55.579,
273
+ "eval_steps_per_second": 27.789,
274
+ "step": 9500
275
+ },
276
+ {
277
+ "epoch": 2.52,
278
+ "learning_rate": 0.0,
279
+ "loss": 0.5272,
280
+ "step": 10000
281
+ },
282
+ {
283
+ "epoch": 2.52,
284
+ "eval_loss": 0.593059241771698,
285
+ "eval_runtime": 5.7212,
286
+ "eval_samples_per_second": 52.437,
287
+ "eval_steps_per_second": 26.218,
288
+ "step": 10000
289
+ }
290
+ ],
291
+ "max_steps": 10000,
292
+ "num_train_epochs": 3,
293
+ "total_flos": 1.2121549777083802e+17,
294
+ "trial_name": null,
295
+ "trial_params": null
296
+ }
checkpoint-5000/training_args.bin → training_args.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:92133b734236cd14623f5f32d7dac13646e59db8ee0cae12878e084143f07cb4
3
  size 4219
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8289752bca98a3f14e53feb07fafa691076be700549694b90e50635b318dad97
3
  size 4219