Gerson Fabian Buenahora Ormaza committed on
Commit 1a45c77
1 Parent(s): 5d7af2a

Upload 12 files

config.json CHANGED
@@ -1,5 +1,5 @@
  {
- "_name_or_path": "BueormLLC/CleanGPT",
+ "_name_or_path": "BueormLLC/RAGPT-2",
  "activation_function": "gelu_new",
  "architectures": [
  "GPT2LMHeadModel"
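
The only change above is the repository reference: "_name_or_path" now points to BueormLLC/RAGPT-2 instead of BueormLLC/CleanGPT. A minimal loading sketch, assuming the files in this commit are published under that id on the Hugging Face Hub:

from transformers import AutoModelForCausalLM, AutoTokenizer

# Repository id taken from the updated config.json; assumes the repo is public on the Hub.
tokenizer = AutoTokenizer.from_pretrained("BueormLLC/RAGPT-2")
model = AutoModelForCausalLM.from_pretrained("BueormLLC/RAGPT-2")

print(model.config.architectures)  # ["GPT2LMHeadModel"], as listed in config.json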
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f06541cef2542bfe1c5e51c5013188fa980eaf805a2ba2048e82da798ace6522
+ oid sha256:cccf36dbb1232f726dd86b9fd5cab7b9ab38dad643e4bb7b73ae7d6475a11509
  size 497774208
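
model.safetensors, optimizer.pt, rng_state.pth, scheduler.pt and training_args.bin are Git LFS pointer files, so only their sha256 oid changes while the byte size stays the same: the binary payloads were replaced. A sketch of checking a downloaded copy against the new pointer (the local filename is illustrative):

import hashlib

# Hash a locally downloaded copy of the weights (path is illustrative).
sha = hashlib.sha256()
with open("model.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)

# Expected digest comes from the new LFS pointer above.
print(sha.hexdigest() == "cccf36dbb1232f726dd86b9fd5cab7b9ab38dad643e4bb7b73ae7d6475a11509")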
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:7d285b214ff9eb000bdc26aac9b035606764b96d34f9f5236535bb42e0ab3722
+ oid sha256:f50bfe1def334ee87b42a6fb6f30891e8e72704f52892b5f894f307cb58b8964
  size 995642298
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:549a8235ab4fe89a52f68cd8f2066d72d9d3c145ac02436bdb0adf36fca0dec5
+ oid sha256:39031638812cf8937efb39b99f8e04816250b75127fac74c34e34812d023c9de
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0daba17a72d1aa993bd866cf1c63e35f955a56ce5c903da992079fc3684233b8
+ oid sha256:f4848d97de2f99c7cf83c5cf8ad176114ae8a5a8c5ff5c71446432e90c8eb1c6
  size 1064
special_tokens_map.json CHANGED
@@ -13,7 +13,13 @@
  "rstrip": false,
  "single_word": false
  },
- "pad_token": "<|endoftext|>",
+ "pad_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
  "unk_token": {
  "content": "<|endoftext|>",
  "lstrip": false,
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
trainer_state.json CHANGED
@@ -1,159 +1,334 @@
  {
- "best_metric": null,
- "best_model_checkpoint": null,
- "epoch": 8.0,
+ "best_metric": 0.8724454641342163,
+ "best_model_checkpoint": "./results/checkpoint-2700",
+ "epoch": 7.0,
  "eval_steps": 500,
- "global_step": 10392,
+ "global_step": 18900,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
  {
- "epoch": 0.3849114703618168,
- "grad_norm": 1.290488839149475,
- "learning_rate": 5e-05,
- "loss": 7.6662,
+ "epoch": 0.18518518518518517,
+ "grad_norm": 2.5104269981384277,
+ "learning_rate": 4.94212962962963e-05,
+ "loss": 0.9092,
  "step": 500
  },
  {
- "epoch": 0.7698229407236336,
- "grad_norm": 1.1826180219650269,
- "learning_rate": 4.747270521633644e-05,
- "loss": 6.3593,
+ "epoch": 0.37037037037037035,
+ "grad_norm": 1.030969262123108,
+ "learning_rate": 4.8842592592592595e-05,
+ "loss": 0.869,
  "step": 1000
  },
  {
- "epoch": 1.1547344110854503,
- "grad_norm": 1.238223910331726,
- "learning_rate": 4.494541043267287e-05,
- "loss": 6.132,
+ "epoch": 0.5555555555555556,
+ "grad_norm": 0.49540138244628906,
+ "learning_rate": 4.8263888888888895e-05,
+ "loss": 0.8595,
  "step": 1500
  },
  {
- "epoch": 1.539645881447267,
- "grad_norm": 1.1373482942581177,
- "learning_rate": 4.24181156490093e-05,
- "loss": 5.9398,
+ "epoch": 0.7407407407407407,
+ "grad_norm": 0.6758684515953064,
+ "learning_rate": 4.768518518518519e-05,
+ "loss": 0.8446,
  "step": 2000
  },
  {
- "epoch": 1.924557351809084,
- "grad_norm": 1.2479649782180786,
- "learning_rate": 3.9890820865345734e-05,
- "loss": 5.8258,
+ "epoch": 0.9259259259259259,
+ "grad_norm": 1.0114092826843262,
+ "learning_rate": 4.710648148148149e-05,
+ "loss": 0.8403,
  "step": 2500
  },
  {
- "epoch": 2.3094688221709005,
- "grad_norm": 1.5960040092468262,
- "learning_rate": 3.736352608168217e-05,
- "loss": 5.6365,
+ "epoch": 1.0,
+ "eval_loss": 0.8724454641342163,
+ "eval_runtime": 30.4341,
+ "eval_samples_per_second": 39.43,
+ "eval_steps_per_second": 9.857,
+ "step": 2700
+ },
+ {
+ "epoch": 1.1111111111111112,
+ "grad_norm": 0.8358873128890991,
+ "learning_rate": 4.652777777777778e-05,
+ "loss": 0.8109,
  "step": 3000
  },
  {
- "epoch": 2.6943802925327174,
- "grad_norm": 1.454331398010254,
- "learning_rate": 3.4836231298018604e-05,
- "loss": 5.578,
+ "epoch": 1.2962962962962963,
+ "grad_norm": 0.8434183597564697,
+ "learning_rate": 4.594907407407408e-05,
+ "loss": 0.7912,
  "step": 3500
  },
  {
- "epoch": 3.079291762894534,
- "grad_norm": 1.5335205793380737,
- "learning_rate": 3.2308936514355035e-05,
- "loss": 5.5167,
+ "epoch": 1.4814814814814814,
+ "grad_norm": 1.2122215032577515,
+ "learning_rate": 4.5370370370370374e-05,
+ "loss": 0.7891,
  "step": 4000
  },
  {
- "epoch": 3.464203233256351,
- "grad_norm": 1.5028727054595947,
- "learning_rate": 2.9781641730691466e-05,
- "loss": 5.3757,
+ "epoch": 1.6666666666666665,
+ "grad_norm": 0.7747897505760193,
+ "learning_rate": 4.4791666666666673e-05,
+ "loss": 0.8132,
  "step": 4500
  },
  {
- "epoch": 3.849114703618168,
- "grad_norm": 1.7399624586105347,
- "learning_rate": 2.7254346947027904e-05,
- "loss": 5.3516,
+ "epoch": 1.8518518518518519,
+ "grad_norm": 0.9554975032806396,
+ "learning_rate": 4.4212962962962966e-05,
+ "loss": 0.8047,
  "step": 5000
  },
  {
- "epoch": 4.234026173979984,
- "grad_norm": 1.6884772777557373,
- "learning_rate": 2.4727052163364335e-05,
- "loss": 5.286,
+ "epoch": 2.0,
+ "eval_loss": 0.8918996453285217,
+ "eval_runtime": 30.4698,
+ "eval_samples_per_second": 39.383,
+ "eval_steps_per_second": 9.846,
+ "step": 5400
+ },
+ {
+ "epoch": 2.037037037037037,
+ "grad_norm": 0.6646651029586792,
+ "learning_rate": 4.3634259259259266e-05,
+ "loss": 0.8091,
  "step": 5500
  },
  {
- "epoch": 4.618937644341801,
- "grad_norm": 1.8180075883865356,
- "learning_rate": 2.219975737970077e-05,
- "loss": 5.1973,
+ "epoch": 2.2222222222222223,
+ "grad_norm": 4.084255218505859,
+ "learning_rate": 4.305555555555556e-05,
+ "loss": 0.7432,
  "step": 6000
  },
  {
- "epoch": 5.003849114703618,
- "grad_norm": 1.6495822668075562,
- "learning_rate": 1.96724625960372e-05,
- "loss": 5.1844,
+ "epoch": 2.4074074074074074,
+ "grad_norm": 2.1203970909118652,
+ "learning_rate": 4.247685185185186e-05,
+ "loss": 0.7355,
  "step": 6500
  },
  {
- "epoch": 5.388760585065435,
- "grad_norm": 1.929316759109497,
- "learning_rate": 1.7145167812373636e-05,
- "loss": 5.0743,
+ "epoch": 2.5925925925925926,
+ "grad_norm": 4.367093086242676,
+ "learning_rate": 4.1898148148148145e-05,
+ "loss": 0.751,
  "step": 7000
  },
  {
- "epoch": 5.773672055427252,
- "grad_norm": 1.9738694429397583,
- "learning_rate": 1.4617873028710069e-05,
- "loss": 5.0788,
+ "epoch": 2.7777777777777777,
+ "grad_norm": 1.1563506126403809,
+ "learning_rate": 4.1319444444444445e-05,
+ "loss": 0.7632,
  "step": 7500
  },
  {
- "epoch": 6.158583525789068,
- "grad_norm": 1.9357603788375854,
- "learning_rate": 1.2090578245046502e-05,
- "loss": 5.0497,
+ "epoch": 2.962962962962963,
+ "grad_norm": 0.9595785140991211,
+ "learning_rate": 4.074074074074074e-05,
+ "loss": 0.7714,
  "step": 8000
  },
  {
- "epoch": 6.543494996150885,
- "grad_norm": 2.0328495502471924,
- "learning_rate": 9.563283461382936e-06,
- "loss": 4.9795,
+ "epoch": 3.0,
+ "eval_loss": 0.9504669308662415,
+ "eval_runtime": 30.5135,
+ "eval_samples_per_second": 39.327,
+ "eval_steps_per_second": 9.832,
+ "step": 8100
+ },
+ {
+ "epoch": 3.148148148148148,
+ "grad_norm": 0.6189069747924805,
+ "learning_rate": 4.016203703703704e-05,
+ "loss": 0.7277,
  "step": 8500
  },
  {
- "epoch": 6.928406466512702,
- "grad_norm": 2.1695396900177,
- "learning_rate": 7.03598867771937e-06,
- "loss": 4.9729,
+ "epoch": 3.3333333333333335,
+ "grad_norm": 0.6079156994819641,
+ "learning_rate": 3.958333333333333e-05,
+ "loss": 0.7373,
  "step": 9000
  },
  {
- "epoch": 7.313317936874519,
- "grad_norm": 2.4989547729492188,
- "learning_rate": 4.508693894055803e-06,
- "loss": 4.9227,
+ "epoch": 3.5185185185185186,
+ "grad_norm": 0.4996514320373535,
+ "learning_rate": 3.900462962962963e-05,
+ "loss": 0.7206,
  "step": 9500
  },
  {
- "epoch": 7.698229407236336,
- "grad_norm": 2.1488616466522217,
- "learning_rate": 1.981399110392236e-06,
- "loss": 4.9139,
+ "epoch": 3.7037037037037037,
+ "grad_norm": 0.9442146420478821,
+ "learning_rate": 3.8425925925925924e-05,
+ "loss": 0.7215,
  "step": 10000
+ },
+ {
+ "epoch": 3.888888888888889,
+ "grad_norm": 0.46321621537208557,
+ "learning_rate": 3.7847222222222224e-05,
+ "loss": 0.7238,
+ "step": 10500
+ },
+ {
+ "epoch": 4.0,
+ "eval_loss": 1.0164023637771606,
+ "eval_runtime": 30.4613,
+ "eval_samples_per_second": 39.394,
+ "eval_steps_per_second": 9.849,
+ "step": 10800
+ },
+ {
+ "epoch": 4.074074074074074,
+ "grad_norm": 0.808857798576355,
+ "learning_rate": 3.726851851851852e-05,
+ "loss": 0.7143,
+ "step": 11000
+ },
+ {
+ "epoch": 4.2592592592592595,
+ "grad_norm": 0.2266809195280075,
+ "learning_rate": 3.6689814814814816e-05,
+ "loss": 0.7017,
+ "step": 11500
+ },
+ {
+ "epoch": 4.444444444444445,
+ "grad_norm": 0.8129966259002686,
+ "learning_rate": 3.611111111111111e-05,
+ "loss": 0.7165,
+ "step": 12000
+ },
+ {
+ "epoch": 4.62962962962963,
+ "grad_norm": 0.7154943943023682,
+ "learning_rate": 3.553240740740741e-05,
+ "loss": 0.6892,
+ "step": 12500
+ },
+ {
+ "epoch": 4.814814814814815,
+ "grad_norm": 0.823897659778595,
+ "learning_rate": 3.49537037037037e-05,
+ "loss": 0.7026,
+ "step": 13000
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 0.7548332810401917,
+ "learning_rate": 3.4375e-05,
+ "loss": 0.7019,
+ "step": 13500
+ },
+ {
+ "epoch": 5.0,
+ "eval_loss": 1.056677222251892,
+ "eval_runtime": 30.3702,
+ "eval_samples_per_second": 39.512,
+ "eval_steps_per_second": 9.878,
+ "step": 13500
+ },
+ {
+ "epoch": 5.185185185185185,
+ "grad_norm": 0.6250707507133484,
+ "learning_rate": 3.3796296296296295e-05,
+ "loss": 0.7107,
+ "step": 14000
+ },
+ {
+ "epoch": 5.37037037037037,
+ "grad_norm": 0.7014070749282837,
+ "learning_rate": 3.3217592592592595e-05,
+ "loss": 0.696,
+ "step": 14500
+ },
+ {
+ "epoch": 5.555555555555555,
+ "grad_norm": 0.8305183053016663,
+ "learning_rate": 3.263888888888889e-05,
+ "loss": 0.6858,
+ "step": 15000
+ },
+ {
+ "epoch": 5.7407407407407405,
+ "grad_norm": 0.5459818840026855,
+ "learning_rate": 3.206018518518519e-05,
+ "loss": 0.6828,
+ "step": 15500
+ },
+ {
+ "epoch": 5.925925925925926,
+ "grad_norm": 0.40176087617874146,
+ "learning_rate": 3.148148148148148e-05,
+ "loss": 0.6841,
+ "step": 16000
+ },
+ {
+ "epoch": 6.0,
+ "eval_loss": 1.0900229215621948,
+ "eval_runtime": 30.4123,
+ "eval_samples_per_second": 39.458,
+ "eval_steps_per_second": 9.864,
+ "step": 16200
+ },
+ {
+ "epoch": 6.111111111111111,
+ "grad_norm": 0.4161689281463623,
+ "learning_rate": 3.090277777777778e-05,
+ "loss": 0.6615,
+ "step": 16500
+ },
+ {
+ "epoch": 6.296296296296296,
+ "grad_norm": 0.7132428288459778,
+ "learning_rate": 3.0324074074074077e-05,
+ "loss": 0.683,
+ "step": 17000
+ },
+ {
+ "epoch": 6.481481481481482,
+ "grad_norm": 0.6823524236679077,
+ "learning_rate": 2.9745370370370373e-05,
+ "loss": 0.6692,
+ "step": 17500
+ },
+ {
+ "epoch": 6.666666666666667,
+ "grad_norm": 0.3051627278327942,
+ "learning_rate": 2.916666666666667e-05,
+ "loss": 0.6771,
+ "step": 18000
+ },
+ {
+ "epoch": 6.851851851851852,
+ "grad_norm": 0.5912793278694153,
+ "learning_rate": 2.8587962962962966e-05,
+ "loss": 0.6959,
+ "step": 18500
+ },
+ {
+ "epoch": 7.0,
+ "eval_loss": 1.12686288356781,
+ "eval_runtime": 30.4381,
+ "eval_samples_per_second": 39.424,
+ "eval_steps_per_second": 9.856,
+ "step": 18900
  }
  ],
  "logging_steps": 500,
- "max_steps": 10392,
+ "max_steps": 43200,
  "num_input_tokens_seen": 0,
- "num_train_epochs": 8,
- "save_steps": 10000,
+ "num_train_epochs": 16,
+ "save_steps": 500,
  "stateful_callbacks": {
  "TrainerControl": {
  "args": {
@@ -161,13 +336,13 @@
  "should_evaluate": false,
  "should_log": false,
  "should_save": true,
- "should_training_stop": true
+ "should_training_stop": false
  },
  "attributes": {}
  }
  },
- "total_flos": 2.1712322691072e+16,
- "train_batch_size": 8,
+ "total_flos": 1.97536776192e+16,
+ "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
  }
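
The new trainer_state.json describes a different training run: 16 scheduled epochs (43200 max steps), per-epoch evaluation, train_batch_size 4, save_steps 500, and should_training_stop still false at epoch 7. Evaluation loss is lowest after epoch 1 (0.8724, at ./results/checkpoint-2700) and rises in every later epoch while the training loss keeps falling, which is why best_model_checkpoint points at checkpoint-2700. A small sketch for extracting those curves from a local copy of the file (the path is illustrative):

import json

# Load a locally downloaded copy of trainer_state.json (path is illustrative).
with open("trainer_state.json") as f:
    state = json.load(f)

train_curve = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_curve = [(e["epoch"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print("best:", state["best_metric"], "at", state["best_model_checkpoint"])
for epoch, eval_loss in eval_curve:
    print(f"epoch {epoch:.0f}: eval_loss={eval_loss:.4f}")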
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cfe3c1ee4a1e601a606aca8b79d3e2e73720343bd516dac207d73e6f246551ec
+ oid sha256:b099f7fb886c1c9a23e2c84872a02a97862f45cdd77dbcdbacca1ddcaa8b3c5f
  size 5112
vocab.json CHANGED
The diff for this file is too large to render. See raw diff