nielsbantilan committed on
Commit
23e5ad7
·
verified ·
1 Parent(s): 63dcd8f

Upload folder using huggingface_hub

Browse files
Files changed (23) hide show
  1. checkpoint-400/global_step400/zero_pp_rank_0_mp_rank_00_model_states.pt +1 -1
  2. checkpoint-400/global_step400/zero_pp_rank_0_mp_rank_00_optim_states.pt +1 -1
  3. checkpoint-400/global_step400/zero_pp_rank_1_mp_rank_00_model_states.pt +1 -1
  4. checkpoint-400/global_step400/zero_pp_rank_1_mp_rank_00_optim_states.pt +1 -1
  5. checkpoint-400/global_step400/zero_pp_rank_2_mp_rank_00_model_states.pt +1 -1
  6. checkpoint-400/global_step400/zero_pp_rank_2_mp_rank_00_optim_states.pt +1 -1
  7. checkpoint-400/global_step400/zero_pp_rank_3_mp_rank_00_model_states.pt +1 -1
  8. checkpoint-400/global_step400/zero_pp_rank_3_mp_rank_00_optim_states.pt +1 -1
  9. checkpoint-400/global_step400/zero_pp_rank_4_mp_rank_00_model_states.pt +1 -1
  10. checkpoint-400/global_step400/zero_pp_rank_4_mp_rank_00_optim_states.pt +1 -1
  11. checkpoint-400/global_step400/zero_pp_rank_5_mp_rank_00_model_states.pt +1 -1
  12. checkpoint-400/global_step400/zero_pp_rank_5_mp_rank_00_optim_states.pt +1 -1
  13. checkpoint-400/global_step400/zero_pp_rank_6_mp_rank_00_model_states.pt +1 -1
  14. checkpoint-400/global_step400/zero_pp_rank_6_mp_rank_00_optim_states.pt +1 -1
  15. checkpoint-400/global_step400/zero_pp_rank_7_mp_rank_00_model_states.pt +1 -1
  16. checkpoint-400/global_step400/zero_pp_rank_7_mp_rank_00_optim_states.pt +1 -1
  17. checkpoint-400/pytorch_model.bin +1 -1
  18. checkpoint-400/trainer_state.json +29 -29
  19. checkpoint-400/training_args.bin +1 -1
  20. flyte_training_config.json +1 -1
  21. pytorch_model.bin +1 -1
  22. trainer_state.json +40 -40
  23. training_args.bin +1 -1
checkpoint-400/global_step400/zero_pp_rank_0_mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c93ee5b8cb3c9fe770a734c41a9651c2bd2146d06ed01d86f475bc092d763245
3
  size 134451731
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3abdb1b27926b5bd4b6322e6914488f04073e8a67e3337225d5d9c3d6edcc762
3
  size 134451731
checkpoint-400/global_step400/zero_pp_rank_0_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4f2e2c33433b4a7873e0a2c58a13adc74837e4636a63cf083630a22f825bc948
3
  size 4163799934
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:410ea099966cadd4fe5764a35a13d9c39d5174f707a63c382935008d47fe35bf
3
  size 4163799934
checkpoint-400/global_step400/zero_pp_rank_1_mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04a2f965bbffccbc96869f8a21e9abf77a2d94b05cbd52053a6b13fefb1b242f
3
  size 134451731
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:903079583239f22acce0c9c67f51a199855fadb7e8a74f7b66482481d30bf039
3
  size 134451731
checkpoint-400/global_step400/zero_pp_rank_1_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16061aa0e674e18de87177bd697b06479c1b6217aa62f20756f0d17550a6b9d1
3
  size 4163799934
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ab25dd5e456d0fb94491249f1b4c3d93c60a0d7460610da0c7deb842904c9e6
3
  size 4163799934
checkpoint-400/global_step400/zero_pp_rank_2_mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1ed6c8f793581ff40564f7448fd7cdd4c3e97f5cfb8ff6c4c8c1836eaa505532
3
  size 134451731
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae702f4e764ca7aee7ebfd5a3c712eea9d9c3e15252018dfc2f55a3ef1f61b66
3
  size 134451731
checkpoint-400/global_step400/zero_pp_rank_2_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1777ef89643d77b7d7615d7c5cd6bf22cec543a918d0d2181691167c3b2bf662
3
  size 4163799934
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84ea9c6c97df48e2d576e32883a804fa7f2e732976629ed3a0f3f7c8c4ec0107
3
  size 4163799934
checkpoint-400/global_step400/zero_pp_rank_3_mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0c52a2b7e7f349e660aa5f856d7fe448a97e68beb4de6c46968e37bc63d5fd37
3
  size 134451731
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43816999f29d7d93caffd194f52451c7994a01cecd14926d5c2a3e275b92a542
3
  size 134451731
checkpoint-400/global_step400/zero_pp_rank_3_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fa2a50627546e6e379f5d592681a31a65f8c1f703301f03074ac3837ba7c78c8
3
  size 4163799934
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e13b1e5e16a0d00ce8f0eaa26f1e0ec238e3cd5f16f690a094a1c2b4949831f8
3
  size 4163799934
checkpoint-400/global_step400/zero_pp_rank_4_mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ad2756edd42edb4c1db8e438125921f09897cd655083806fb1c358e230b5eaa
3
  size 134451731
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8444840a9ea2c94e90ee67c2f0d7b227d455583cfe897ed20297617b5434169
3
  size 134451731
checkpoint-400/global_step400/zero_pp_rank_4_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d3579d9a26f8311633f02b253340b832e0335042ef8cc4ab9bb6de6540f5941e
3
  size 4163799934
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5859482f1943a6eb045eef8cfd6be9ab2ae71d232b28041113d3b85a74d71310
3
  size 4163799934
checkpoint-400/global_step400/zero_pp_rank_5_mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a32922ba3284e5a1bb06e98224a3bc3bb52e072386ea0c2ad3331a6fb19c1bc
3
  size 134451731
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28c8e45955844d80f0d83d3008675a45332f415c5ad4bf41415901eda5bd161b
3
  size 134451731
checkpoint-400/global_step400/zero_pp_rank_5_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b200e1c68d22e4ff09e8def614520ed4cd3ee318392d9795e98b10fb4b1642c8
3
  size 4163799934
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0be357239a340323a404def3760bd0d67baa4c41d47ba1e5d84b50570d011d82
3
  size 4163799934
checkpoint-400/global_step400/zero_pp_rank_6_mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c101d52f6ff8cb5cc6220900dc099fc438ab5ace5de24773ee1b4039c222e025
3
  size 134451731
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fac124f36827160aa31a4f6fe2e0cdf035d0a424b7ac4d3f7af772af45468bd
3
  size 134451731
checkpoint-400/global_step400/zero_pp_rank_6_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9480e0823bd09697f3944726f296063f9236576ab9f5765d93b20c62cf1bfb2f
3
  size 4163799934
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9a549cb525994897a2d2008986f6df822312ae62692c60b0f0a045b03eb1b67
3
  size 4163799934
checkpoint-400/global_step400/zero_pp_rank_7_mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d739b4b9f8c2f56af45739f78a55b08b87619d78ba896d57002909378c95222b
3
  size 134451731
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77b9bd14a96dfcb030a61fd92ad2a5c6452700cf67ba36c6a7a92c9d88b8ddf1
3
  size 134451731
checkpoint-400/global_step400/zero_pp_rank_7_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3441f14607a54bc3f5eff1b191fe2864d101588934f8f9c17fa4b45feec9a210
3
  size 4163799934
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61b950854cd9a80854734464b357749cfe53f8ccf91c26abf6b54ab908c27301
3
  size 4163799934
checkpoint-400/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a6ac41172c7bd6abf75e8e0e73e4f8dbe3202ec6c399179c73462afcffdd1671
3
  size 5686106713
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a871f9eb54a5c28354a5f10190d672a25be5edfbac232e025d5fdbe23cb380c
3
  size 5686106713
checkpoint-400/trainer_state.json CHANGED
@@ -10,43 +10,43 @@
10
  {
11
  "epoch": 6.67,
12
  "learning_rate": 1.5357481488588927e-05,
13
- "loss": 1.9783,
14
  "step": 10
15
  },
16
  {
17
  "epoch": 13.33,
18
  "learning_rate": 2e-05,
19
- "loss": 1.0744,
20
  "step": 20
21
  },
22
  {
23
  "epoch": 20.0,
24
  "learning_rate": 2e-05,
25
- "loss": 0.2335,
26
  "step": 30
27
  },
28
  {
29
  "epoch": 26.67,
30
  "learning_rate": 2e-05,
31
- "loss": 0.0723,
32
  "step": 40
33
  },
34
  {
35
  "epoch": 33.33,
36
  "learning_rate": 2e-05,
37
- "loss": 0.046,
38
  "step": 50
39
  },
40
  {
41
  "epoch": 40.0,
42
  "learning_rate": 2e-05,
43
- "loss": 0.0321,
44
  "step": 60
45
  },
46
  {
47
  "epoch": 46.67,
48
  "learning_rate": 2e-05,
49
- "loss": 0.0261,
50
  "step": 70
51
  },
52
  {
@@ -58,19 +58,19 @@
58
  {
59
  "epoch": 60.0,
60
  "learning_rate": 2e-05,
61
- "loss": 0.0158,
62
  "step": 90
63
  },
64
  {
65
  "epoch": 66.67,
66
  "learning_rate": 2e-05,
67
- "loss": 0.0124,
68
  "step": 100
69
  },
70
  {
71
  "epoch": 73.33,
72
  "learning_rate": 2e-05,
73
- "loss": 0.0106,
74
  "step": 110
75
  },
76
  {
@@ -82,19 +82,19 @@
82
  {
83
  "epoch": 86.67,
84
  "learning_rate": 2e-05,
85
- "loss": 0.0083,
86
  "step": 130
87
  },
88
  {
89
  "epoch": 93.33,
90
  "learning_rate": 2e-05,
91
- "loss": 0.0071,
92
  "step": 140
93
  },
94
  {
95
  "epoch": 100.0,
96
  "learning_rate": 2e-05,
97
- "loss": 0.007,
98
  "step": 150
99
  },
100
  {
@@ -106,31 +106,31 @@
106
  {
107
  "epoch": 113.33,
108
  "learning_rate": 2e-05,
109
- "loss": 0.0061,
110
  "step": 170
111
  },
112
  {
113
  "epoch": 120.0,
114
  "learning_rate": 2e-05,
115
- "loss": 0.0059,
116
  "step": 180
117
  },
118
  {
119
  "epoch": 126.67,
120
  "learning_rate": 2e-05,
121
- "loss": 0.0054,
122
  "step": 190
123
  },
124
  {
125
  "epoch": 133.33,
126
  "learning_rate": 2e-05,
127
- "loss": 0.0052,
128
  "step": 200
129
  },
130
  {
131
  "epoch": 140.0,
132
  "learning_rate": 2e-05,
133
- "loss": 0.0049,
134
  "step": 210
135
  },
136
  {
@@ -142,13 +142,13 @@
142
  {
143
  "epoch": 153.33,
144
  "learning_rate": 2e-05,
145
- "loss": 0.0045,
146
  "step": 230
147
  },
148
  {
149
  "epoch": 160.0,
150
  "learning_rate": 2e-05,
151
- "loss": 0.0047,
152
  "step": 240
153
  },
154
  {
@@ -160,7 +160,7 @@
160
  {
161
  "epoch": 173.33,
162
  "learning_rate": 2e-05,
163
- "loss": 0.0046,
164
  "step": 260
165
  },
166
  {
@@ -178,7 +178,7 @@
178
  {
179
  "epoch": 193.33,
180
  "learning_rate": 2e-05,
181
- "loss": 0.0043,
182
  "step": 290
183
  },
184
  {
@@ -202,19 +202,19 @@
202
  {
203
  "epoch": 220.0,
204
  "learning_rate": 2e-05,
205
- "loss": 0.004,
206
  "step": 330
207
  },
208
  {
209
  "epoch": 226.67,
210
  "learning_rate": 2e-05,
211
- "loss": 0.0041,
212
  "step": 340
213
  },
214
  {
215
  "epoch": 233.33,
216
  "learning_rate": 2e-05,
217
- "loss": 0.004,
218
  "step": 350
219
  },
220
  {
@@ -226,25 +226,25 @@
226
  {
227
  "epoch": 246.67,
228
  "learning_rate": 2e-05,
229
- "loss": 0.0042,
230
  "step": 370
231
  },
232
  {
233
  "epoch": 253.33,
234
  "learning_rate": 2e-05,
235
- "loss": 0.0041,
236
  "step": 380
237
  },
238
  {
239
  "epoch": 260.0,
240
  "learning_rate": 2e-05,
241
- "loss": 0.0038,
242
  "step": 390
243
  },
244
  {
245
  "epoch": 266.67,
246
  "learning_rate": 2e-05,
247
- "loss": 0.0041,
248
  "step": 400
249
  }
250
  ],
 
10
  {
11
  "epoch": 6.67,
12
  "learning_rate": 1.5357481488588927e-05,
13
+ "loss": 1.98,
14
  "step": 10
15
  },
16
  {
17
  "epoch": 13.33,
18
  "learning_rate": 2e-05,
19
+ "loss": 1.0643,
20
  "step": 20
21
  },
22
  {
23
  "epoch": 20.0,
24
  "learning_rate": 2e-05,
25
+ "loss": 0.2385,
26
  "step": 30
27
  },
28
  {
29
  "epoch": 26.67,
30
  "learning_rate": 2e-05,
31
+ "loss": 0.0722,
32
  "step": 40
33
  },
34
  {
35
  "epoch": 33.33,
36
  "learning_rate": 2e-05,
37
+ "loss": 0.0466,
38
  "step": 50
39
  },
40
  {
41
  "epoch": 40.0,
42
  "learning_rate": 2e-05,
43
+ "loss": 0.0329,
44
  "step": 60
45
  },
46
  {
47
  "epoch": 46.67,
48
  "learning_rate": 2e-05,
49
+ "loss": 0.0256,
50
  "step": 70
51
  },
52
  {
 
58
  {
59
  "epoch": 60.0,
60
  "learning_rate": 2e-05,
61
+ "loss": 0.0155,
62
  "step": 90
63
  },
64
  {
65
  "epoch": 66.67,
66
  "learning_rate": 2e-05,
67
+ "loss": 0.0126,
68
  "step": 100
69
  },
70
  {
71
  "epoch": 73.33,
72
  "learning_rate": 2e-05,
73
+ "loss": 0.0104,
74
  "step": 110
75
  },
76
  {
 
82
  {
83
  "epoch": 86.67,
84
  "learning_rate": 2e-05,
85
+ "loss": 0.0085,
86
  "step": 130
87
  },
88
  {
89
  "epoch": 93.33,
90
  "learning_rate": 2e-05,
91
+ "loss": 0.0073,
92
  "step": 140
93
  },
94
  {
95
  "epoch": 100.0,
96
  "learning_rate": 2e-05,
97
+ "loss": 0.0066,
98
  "step": 150
99
  },
100
  {
 
106
  {
107
  "epoch": 113.33,
108
  "learning_rate": 2e-05,
109
+ "loss": 0.0056,
110
  "step": 170
111
  },
112
  {
113
  "epoch": 120.0,
114
  "learning_rate": 2e-05,
115
+ "loss": 0.0058,
116
  "step": 180
117
  },
118
  {
119
  "epoch": 126.67,
120
  "learning_rate": 2e-05,
121
+ "loss": 0.0051,
122
  "step": 190
123
  },
124
  {
125
  "epoch": 133.33,
126
  "learning_rate": 2e-05,
127
+ "loss": 0.0053,
128
  "step": 200
129
  },
130
  {
131
  "epoch": 140.0,
132
  "learning_rate": 2e-05,
133
+ "loss": 0.005,
134
  "step": 210
135
  },
136
  {
 
142
  {
143
  "epoch": 153.33,
144
  "learning_rate": 2e-05,
145
+ "loss": 0.0051,
146
  "step": 230
147
  },
148
  {
149
  "epoch": 160.0,
150
  "learning_rate": 2e-05,
151
+ "loss": 0.0046,
152
  "step": 240
153
  },
154
  {
 
160
  {
161
  "epoch": 173.33,
162
  "learning_rate": 2e-05,
163
+ "loss": 0.0043,
164
  "step": 260
165
  },
166
  {
 
178
  {
179
  "epoch": 193.33,
180
  "learning_rate": 2e-05,
181
+ "loss": 0.0044,
182
  "step": 290
183
  },
184
  {
 
202
  {
203
  "epoch": 220.0,
204
  "learning_rate": 2e-05,
205
+ "loss": 0.0042,
206
  "step": 330
207
  },
208
  {
209
  "epoch": 226.67,
210
  "learning_rate": 2e-05,
211
+ "loss": 0.0042,
212
  "step": 340
213
  },
214
  {
215
  "epoch": 233.33,
216
  "learning_rate": 2e-05,
217
+ "loss": 0.0042,
218
  "step": 350
219
  },
220
  {
 
226
  {
227
  "epoch": 246.67,
228
  "learning_rate": 2e-05,
229
+ "loss": 0.0041,
230
  "step": 370
231
  },
232
  {
233
  "epoch": 253.33,
234
  "learning_rate": 2e-05,
235
+ "loss": 0.0039,
236
  "step": 380
237
  },
238
  {
239
  "epoch": 260.0,
240
  "learning_rate": 2e-05,
241
+ "loss": 0.0039,
242
  "step": 390
243
  },
244
  {
245
  "epoch": 266.67,
246
  "learning_rate": 2e-05,
247
+ "loss": 0.004,
248
  "step": 400
249
  }
250
  ],
checkpoint-400/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:567801fbca456df7279774860aae6a5f038bd64e03f891b6ae2b93a59c8c417b
3
  size 5563
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6085604e5a1ea54a8fda0f0d3c312d4d5e1773f18a0a0cf06a5dcc2b00210491
3
  size 5563
flyte_training_config.json CHANGED
@@ -1 +1 @@
1
- {"base_model": "togethercomputer/RedPajama-INCITE-Base-3B-v1", "data_path": "wikipedia", "data_name": "20220301.simple", "num_epochs": 1, "max_steps": 500, "learning_rate": 2e-05, "weight_decay": 0.02, "warmup_ratio": 0.03, "lr_scheduler_type": "cosine", "batch_size": 16, "micro_batch_size": 1, "val_set_size": 0, "group_by_length": false, "instruction_key": "instruction", "input_key": "input", "output_key": "output", "device_map": "auto", "cache_dir": null, "optim": "adamw_torch", "model_max_length": 512, "debug_mode": false, "debug_train_data_size": 1024, "wandb_project": ""}
 
1
+ {"base_model": "togethercomputer/RedPajama-INCITE-Base-3B-v1", "data_path": "wikipedia", "data_name": "20220301.simple", "num_epochs": 2, "max_steps": 500, "learning_rate": 2e-05, "weight_decay": 0.02, "warmup_ratio": 0.03, "lr_scheduler_type": "cosine", "batch_size": 16, "micro_batch_size": 1, "val_set_size": 0, "group_by_length": false, "instruction_key": "instruction", "input_key": "input", "output_key": "output", "device_map": "auto", "cache_dir": null, "optim": "adamw_torch", "model_max_length": 512, "debug_mode": false, "debug_train_data_size": 1024, "wandb_project": ""}
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0f3895af09e0538e006a1966cba065c0ffa5e3f6694c04007a381d31c326bcf4
3
  size 5686106713
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cac632e2637c55b07b1745e8754257c32050851c6adbaa0ee0ed22a5826b2d84
3
  size 5686106713
trainer_state.json CHANGED
@@ -10,43 +10,43 @@
10
  {
11
  "epoch": 6.67,
12
  "learning_rate": 1.5357481488588927e-05,
13
- "loss": 1.9783,
14
  "step": 10
15
  },
16
  {
17
  "epoch": 13.33,
18
  "learning_rate": 2e-05,
19
- "loss": 1.0744,
20
  "step": 20
21
  },
22
  {
23
  "epoch": 20.0,
24
  "learning_rate": 2e-05,
25
- "loss": 0.2335,
26
  "step": 30
27
  },
28
  {
29
  "epoch": 26.67,
30
  "learning_rate": 2e-05,
31
- "loss": 0.0723,
32
  "step": 40
33
  },
34
  {
35
  "epoch": 33.33,
36
  "learning_rate": 2e-05,
37
- "loss": 0.046,
38
  "step": 50
39
  },
40
  {
41
  "epoch": 40.0,
42
  "learning_rate": 2e-05,
43
- "loss": 0.0321,
44
  "step": 60
45
  },
46
  {
47
  "epoch": 46.67,
48
  "learning_rate": 2e-05,
49
- "loss": 0.0261,
50
  "step": 70
51
  },
52
  {
@@ -58,19 +58,19 @@
58
  {
59
  "epoch": 60.0,
60
  "learning_rate": 2e-05,
61
- "loss": 0.0158,
62
  "step": 90
63
  },
64
  {
65
  "epoch": 66.67,
66
  "learning_rate": 2e-05,
67
- "loss": 0.0124,
68
  "step": 100
69
  },
70
  {
71
  "epoch": 73.33,
72
  "learning_rate": 2e-05,
73
- "loss": 0.0106,
74
  "step": 110
75
  },
76
  {
@@ -82,19 +82,19 @@
82
  {
83
  "epoch": 86.67,
84
  "learning_rate": 2e-05,
85
- "loss": 0.0083,
86
  "step": 130
87
  },
88
  {
89
  "epoch": 93.33,
90
  "learning_rate": 2e-05,
91
- "loss": 0.0071,
92
  "step": 140
93
  },
94
  {
95
  "epoch": 100.0,
96
  "learning_rate": 2e-05,
97
- "loss": 0.007,
98
  "step": 150
99
  },
100
  {
@@ -106,31 +106,31 @@
106
  {
107
  "epoch": 113.33,
108
  "learning_rate": 2e-05,
109
- "loss": 0.0061,
110
  "step": 170
111
  },
112
  {
113
  "epoch": 120.0,
114
  "learning_rate": 2e-05,
115
- "loss": 0.0059,
116
  "step": 180
117
  },
118
  {
119
  "epoch": 126.67,
120
  "learning_rate": 2e-05,
121
- "loss": 0.0054,
122
  "step": 190
123
  },
124
  {
125
  "epoch": 133.33,
126
  "learning_rate": 2e-05,
127
- "loss": 0.0052,
128
  "step": 200
129
  },
130
  {
131
  "epoch": 140.0,
132
  "learning_rate": 2e-05,
133
- "loss": 0.0049,
134
  "step": 210
135
  },
136
  {
@@ -142,13 +142,13 @@
142
  {
143
  "epoch": 153.33,
144
  "learning_rate": 2e-05,
145
- "loss": 0.0045,
146
  "step": 230
147
  },
148
  {
149
  "epoch": 160.0,
150
  "learning_rate": 2e-05,
151
- "loss": 0.0047,
152
  "step": 240
153
  },
154
  {
@@ -160,7 +160,7 @@
160
  {
161
  "epoch": 173.33,
162
  "learning_rate": 2e-05,
163
- "loss": 0.0046,
164
  "step": 260
165
  },
166
  {
@@ -178,7 +178,7 @@
178
  {
179
  "epoch": 193.33,
180
  "learning_rate": 2e-05,
181
- "loss": 0.0043,
182
  "step": 290
183
  },
184
  {
@@ -202,19 +202,19 @@
202
  {
203
  "epoch": 220.0,
204
  "learning_rate": 2e-05,
205
- "loss": 0.004,
206
  "step": 330
207
  },
208
  {
209
  "epoch": 226.67,
210
  "learning_rate": 2e-05,
211
- "loss": 0.0041,
212
  "step": 340
213
  },
214
  {
215
  "epoch": 233.33,
216
  "learning_rate": 2e-05,
217
- "loss": 0.004,
218
  "step": 350
219
  },
220
  {
@@ -226,55 +226,55 @@
226
  {
227
  "epoch": 246.67,
228
  "learning_rate": 2e-05,
229
- "loss": 0.0042,
230
  "step": 370
231
  },
232
  {
233
  "epoch": 253.33,
234
  "learning_rate": 2e-05,
235
- "loss": 0.0041,
236
  "step": 380
237
  },
238
  {
239
  "epoch": 260.0,
240
  "learning_rate": 2e-05,
241
- "loss": 0.0038,
242
  "step": 390
243
  },
244
  {
245
  "epoch": 266.67,
246
  "learning_rate": 2e-05,
247
- "loss": 0.0041,
248
  "step": 400
249
  },
250
  {
251
  "epoch": 273.33,
252
  "learning_rate": 2e-05,
253
- "loss": 0.0037,
254
  "step": 410
255
  },
256
  {
257
  "epoch": 280.0,
258
  "learning_rate": 2e-05,
259
- "loss": 0.0038,
260
  "step": 420
261
  },
262
  {
263
  "epoch": 286.67,
264
  "learning_rate": 2e-05,
265
- "loss": 0.0038,
266
  "step": 430
267
  },
268
  {
269
  "epoch": 293.33,
270
  "learning_rate": 2e-05,
271
- "loss": 0.0037,
272
  "step": 440
273
  },
274
  {
275
  "epoch": 300.0,
276
  "learning_rate": 2e-05,
277
- "loss": 0.0038,
278
  "step": 450
279
  },
280
  {
@@ -286,19 +286,19 @@
286
  {
287
  "epoch": 313.33,
288
  "learning_rate": 2e-05,
289
- "loss": 0.0039,
290
  "step": 470
291
  },
292
  {
293
  "epoch": 320.0,
294
  "learning_rate": 2e-05,
295
- "loss": 0.0037,
296
  "step": 480
297
  },
298
  {
299
  "epoch": 326.67,
300
  "learning_rate": 2e-05,
301
- "loss": 0.0036,
302
  "step": 490
303
  },
304
  {
@@ -311,9 +311,9 @@
311
  "epoch": 333.33,
312
  "step": 500,
313
  "total_flos": 210359990353920.0,
314
- "train_loss": 0.07407628475874663,
315
- "train_runtime": 21320.65,
316
- "train_samples_per_second": 3.002,
317
  "train_steps_per_second": 0.023
318
  }
319
  ],
 
10
  {
11
  "epoch": 6.67,
12
  "learning_rate": 1.5357481488588927e-05,
13
+ "loss": 1.98,
14
  "step": 10
15
  },
16
  {
17
  "epoch": 13.33,
18
  "learning_rate": 2e-05,
19
+ "loss": 1.0643,
20
  "step": 20
21
  },
22
  {
23
  "epoch": 20.0,
24
  "learning_rate": 2e-05,
25
+ "loss": 0.2385,
26
  "step": 30
27
  },
28
  {
29
  "epoch": 26.67,
30
  "learning_rate": 2e-05,
31
+ "loss": 0.0722,
32
  "step": 40
33
  },
34
  {
35
  "epoch": 33.33,
36
  "learning_rate": 2e-05,
37
+ "loss": 0.0466,
38
  "step": 50
39
  },
40
  {
41
  "epoch": 40.0,
42
  "learning_rate": 2e-05,
43
+ "loss": 0.0329,
44
  "step": 60
45
  },
46
  {
47
  "epoch": 46.67,
48
  "learning_rate": 2e-05,
49
+ "loss": 0.0256,
50
  "step": 70
51
  },
52
  {
 
58
  {
59
  "epoch": 60.0,
60
  "learning_rate": 2e-05,
61
+ "loss": 0.0155,
62
  "step": 90
63
  },
64
  {
65
  "epoch": 66.67,
66
  "learning_rate": 2e-05,
67
+ "loss": 0.0126,
68
  "step": 100
69
  },
70
  {
71
  "epoch": 73.33,
72
  "learning_rate": 2e-05,
73
+ "loss": 0.0104,
74
  "step": 110
75
  },
76
  {
 
82
  {
83
  "epoch": 86.67,
84
  "learning_rate": 2e-05,
85
+ "loss": 0.0085,
86
  "step": 130
87
  },
88
  {
89
  "epoch": 93.33,
90
  "learning_rate": 2e-05,
91
+ "loss": 0.0073,
92
  "step": 140
93
  },
94
  {
95
  "epoch": 100.0,
96
  "learning_rate": 2e-05,
97
+ "loss": 0.0066,
98
  "step": 150
99
  },
100
  {
 
106
  {
107
  "epoch": 113.33,
108
  "learning_rate": 2e-05,
109
+ "loss": 0.0056,
110
  "step": 170
111
  },
112
  {
113
  "epoch": 120.0,
114
  "learning_rate": 2e-05,
115
+ "loss": 0.0058,
116
  "step": 180
117
  },
118
  {
119
  "epoch": 126.67,
120
  "learning_rate": 2e-05,
121
+ "loss": 0.0051,
122
  "step": 190
123
  },
124
  {
125
  "epoch": 133.33,
126
  "learning_rate": 2e-05,
127
+ "loss": 0.0053,
128
  "step": 200
129
  },
130
  {
131
  "epoch": 140.0,
132
  "learning_rate": 2e-05,
133
+ "loss": 0.005,
134
  "step": 210
135
  },
136
  {
 
142
  {
143
  "epoch": 153.33,
144
  "learning_rate": 2e-05,
145
+ "loss": 0.0051,
146
  "step": 230
147
  },
148
  {
149
  "epoch": 160.0,
150
  "learning_rate": 2e-05,
151
+ "loss": 0.0046,
152
  "step": 240
153
  },
154
  {
 
160
  {
161
  "epoch": 173.33,
162
  "learning_rate": 2e-05,
163
+ "loss": 0.0043,
164
  "step": 260
165
  },
166
  {
 
178
  {
179
  "epoch": 193.33,
180
  "learning_rate": 2e-05,
181
+ "loss": 0.0044,
182
  "step": 290
183
  },
184
  {
 
202
  {
203
  "epoch": 220.0,
204
  "learning_rate": 2e-05,
205
+ "loss": 0.0042,
206
  "step": 330
207
  },
208
  {
209
  "epoch": 226.67,
210
  "learning_rate": 2e-05,
211
+ "loss": 0.0042,
212
  "step": 340
213
  },
214
  {
215
  "epoch": 233.33,
216
  "learning_rate": 2e-05,
217
+ "loss": 0.0042,
218
  "step": 350
219
  },
220
  {
 
226
  {
227
  "epoch": 246.67,
228
  "learning_rate": 2e-05,
229
+ "loss": 0.0041,
230
  "step": 370
231
  },
232
  {
233
  "epoch": 253.33,
234
  "learning_rate": 2e-05,
235
+ "loss": 0.0039,
236
  "step": 380
237
  },
238
  {
239
  "epoch": 260.0,
240
  "learning_rate": 2e-05,
241
+ "loss": 0.0039,
242
  "step": 390
243
  },
244
  {
245
  "epoch": 266.67,
246
  "learning_rate": 2e-05,
247
+ "loss": 0.004,
248
  "step": 400
249
  },
250
  {
251
  "epoch": 273.33,
252
  "learning_rate": 2e-05,
253
+ "loss": 0.0038,
254
  "step": 410
255
  },
256
  {
257
  "epoch": 280.0,
258
  "learning_rate": 2e-05,
259
+ "loss": 0.0039,
260
  "step": 420
261
  },
262
  {
263
  "epoch": 286.67,
264
  "learning_rate": 2e-05,
265
+ "loss": 0.0036,
266
  "step": 430
267
  },
268
  {
269
  "epoch": 293.33,
270
  "learning_rate": 2e-05,
271
+ "loss": 0.0038,
272
  "step": 440
273
  },
274
  {
275
  "epoch": 300.0,
276
  "learning_rate": 2e-05,
277
+ "loss": 0.0041,
278
  "step": 450
279
  },
280
  {
 
286
  {
287
  "epoch": 313.33,
288
  "learning_rate": 2e-05,
289
+ "loss": 0.0038,
290
  "step": 470
291
  },
292
  {
293
  "epoch": 320.0,
294
  "learning_rate": 2e-05,
295
+ "loss": 0.0038,
296
  "step": 480
297
  },
298
  {
299
  "epoch": 326.67,
300
  "learning_rate": 2e-05,
301
+ "loss": 0.0037,
302
  "step": 490
303
  },
304
  {
 
311
  "epoch": 333.33,
312
  "step": 500,
313
  "total_flos": 210359990353920.0,
314
+ "train_loss": 0.0740217172279954,
315
+ "train_runtime": 21405.8064,
316
+ "train_samples_per_second": 2.99,
317
  "train_steps_per_second": 0.023
318
  }
319
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:567801fbca456df7279774860aae6a5f038bd64e03f891b6ae2b93a59c8c417b
3
  size 5563
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6085604e5a1ea54a8fda0f0d3c312d4d5e1773f18a0a0cf06a5dcc2b00210491
3
  size 5563