Ar4l commited on
Commit
a12f512
·
verified ·
1 Parent(s): 1b82aa3

Upload folder using huggingface_hub

Browse files
all_results.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 8.0,
3
+ "eval_accuracy": 0.853210985660553,
4
+ "eval_f1": 0.8608695652173913,
5
+ "eval_loss": 0.7391630411148071,
6
+ "eval_mcc": 0.7061073536146776,
7
+ "eval_runtime": 0.6999,
8
+ "eval_samples": 436,
9
+ "eval_samples_per_second": 622.962,
10
+ "eval_steps_per_second": 78.585,
11
+ "total_flos": 3.178750951973683e+16,
12
+ "train_loss": 0.16765205063521402,
13
+ "train_runtime": 3789.576,
14
+ "train_samples": 67349,
15
+ "train_samples_per_second": 355.443,
16
+ "train_steps_per_second": 44.432
17
+ }
checkpoint-42095/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/home/ubuntu/utah/babylm-24/data/training/models/10M_babylm_ascii/SPM-Unigram_6144/DebertaV2-Base-10M_babylm-A",
3
+ "architectures": [
4
+ "DebertaV2ForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 768,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 1536,
12
+ "label2id": {
13
+ "0": 0,
14
+ "1": 1
15
+ },
16
+ "layer_norm_eps": 1e-07,
17
+ "max_position_embeddings": 512,
18
+ "max_relative_positions": -1,
19
+ "model_type": "deberta-v2",
20
+ "num_attention_heads": 12,
21
+ "num_hidden_layers": 8,
22
+ "pad_token_id": 3,
23
+ "pooler_dropout": 0,
24
+ "pooler_hidden_act": "gelu",
25
+ "pooler_hidden_size": 768,
26
+ "pos_att_type": null,
27
+ "position_biased_input": true,
28
+ "relative_attention": false,
29
+ "torch_dtype": "float32",
30
+ "transformers_version": "4.44.2",
31
+ "type_vocab_size": 0,
32
+ "vocab_size": 6144
33
+ }
checkpoint-42095/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c64146fadd6816f974e6b4696626d1acd3bcd0e2d60660589a0f913a9938d3a
3
+ size 174103504
checkpoint-42095/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5acce98f39141951dd984fbd8c1eb65775f5a1882bab8dc39be5a0802228952
3
+ size 348288250
checkpoint-42095/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f1ca5c120e6945f77fe7c71e4391903916ce310cb210da5888d9bdc3e2a330d
3
+ size 14244
checkpoint-42095/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cefd41126061d476060c6738e97af19422e52e7175beef90f798a3636c54a43a
3
+ size 1064
checkpoint-42095/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
checkpoint-42095/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-42095/tokenizer_config.json ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[UNK]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[PAD]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "[PAR]",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "6": {
52
+ "content": "[TAB]",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ }
59
+ },
60
+ "clean_up_tokenization_spaces": false,
61
+ "cls_token": "[CLS]",
62
+ "mask_token": "[MASK]",
63
+ "model_max_length": 1000000000000000019884624838656,
64
+ "pad_token": "[PAD]",
65
+ "sep_token": "[SEP]",
66
+ "tokenizer_class": "PreTrainedTokenizerFast"
67
+ }
checkpoint-42095/trainer_state.json ADDED
@@ -0,0 +1,685 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8608695652173913,
3
+ "best_model_checkpoint": "/home/ubuntu/utah/babylm-24/src/evaluation/results/finetune/DebertaV2-Base-10M_babylm-A/sst2/checkpoint-42095",
4
+ "epoch": 5.0,
5
+ "eval_steps": 500,
6
+ "global_step": 42095,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.05938947618482005,
13
+ "grad_norm": 2.9371345043182373,
14
+ "learning_rate": 2.991091578572277e-05,
15
+ "loss": 0.5976,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.1187789523696401,
20
+ "grad_norm": 6.96113920211792,
21
+ "learning_rate": 2.982183157144554e-05,
22
+ "loss": 0.4885,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.17816842855446016,
27
+ "grad_norm": 5.86502742767334,
28
+ "learning_rate": 2.973274735716831e-05,
29
+ "loss": 0.4746,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 0.2375579047392802,
34
+ "grad_norm": 7.14243745803833,
35
+ "learning_rate": 2.964366314289108e-05,
36
+ "loss": 0.4426,
37
+ "step": 2000
38
+ },
39
+ {
40
+ "epoch": 0.2969473809241002,
41
+ "grad_norm": 8.168976783752441,
42
+ "learning_rate": 2.955457892861385e-05,
43
+ "loss": 0.4213,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 0.3563368571089203,
48
+ "grad_norm": 12.06148624420166,
49
+ "learning_rate": 2.9465494714336618e-05,
50
+ "loss": 0.3904,
51
+ "step": 3000
52
+ },
53
+ {
54
+ "epoch": 0.41572633329374037,
55
+ "grad_norm": 5.824960708618164,
56
+ "learning_rate": 2.937641050005939e-05,
57
+ "loss": 0.3788,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 0.4751158094785604,
62
+ "grad_norm": 4.579179763793945,
63
+ "learning_rate": 2.9287326285782158e-05,
64
+ "loss": 0.3646,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 0.5345052856633804,
69
+ "grad_norm": 25.88724136352539,
70
+ "learning_rate": 2.919824207150493e-05,
71
+ "loss": 0.3727,
72
+ "step": 4500
73
+ },
74
+ {
75
+ "epoch": 0.5938947618482004,
76
+ "grad_norm": 1.1444391012191772,
77
+ "learning_rate": 2.91091578572277e-05,
78
+ "loss": 0.3607,
79
+ "step": 5000
80
+ },
81
+ {
82
+ "epoch": 0.6532842380330206,
83
+ "grad_norm": 6.4522600173950195,
84
+ "learning_rate": 2.902007364295047e-05,
85
+ "loss": 0.3597,
86
+ "step": 5500
87
+ },
88
+ {
89
+ "epoch": 0.7126737142178406,
90
+ "grad_norm": 13.808451652526855,
91
+ "learning_rate": 2.8930989428673242e-05,
92
+ "loss": 0.3384,
93
+ "step": 6000
94
+ },
95
+ {
96
+ "epoch": 0.7720631904026607,
97
+ "grad_norm": 16.229154586791992,
98
+ "learning_rate": 2.884190521439601e-05,
99
+ "loss": 0.3408,
100
+ "step": 6500
101
+ },
102
+ {
103
+ "epoch": 0.8314526665874807,
104
+ "grad_norm": 26.570392608642578,
105
+ "learning_rate": 2.8752821000118782e-05,
106
+ "loss": 0.3391,
107
+ "step": 7000
108
+ },
109
+ {
110
+ "epoch": 0.8908421427723008,
111
+ "grad_norm": 4.880529880523682,
112
+ "learning_rate": 2.866373678584155e-05,
113
+ "loss": 0.3436,
114
+ "step": 7500
115
+ },
116
+ {
117
+ "epoch": 0.9502316189571208,
118
+ "grad_norm": 18.9448299407959,
119
+ "learning_rate": 2.857465257156432e-05,
120
+ "loss": 0.3151,
121
+ "step": 8000
122
+ },
123
+ {
124
+ "epoch": 1.0,
125
+ "eval_accuracy": 0.8463302850723267,
126
+ "eval_f1": 0.8466819221967964,
127
+ "eval_loss": 0.40749993920326233,
128
+ "eval_mcc": 0.694664779303656,
129
+ "eval_runtime": 0.7142,
130
+ "eval_samples_per_second": 610.484,
131
+ "eval_steps_per_second": 77.011,
132
+ "step": 8419
133
+ },
134
+ {
135
+ "epoch": 1.0096210951419409,
136
+ "grad_norm": 11.80332088470459,
137
+ "learning_rate": 2.848556835728709e-05,
138
+ "loss": 0.3028,
139
+ "step": 8500
140
+ },
141
+ {
142
+ "epoch": 1.0690105713267608,
143
+ "grad_norm": 22.269908905029297,
144
+ "learning_rate": 2.839648414300986e-05,
145
+ "loss": 0.2843,
146
+ "step": 9000
147
+ },
148
+ {
149
+ "epoch": 1.128400047511581,
150
+ "grad_norm": 14.310569763183594,
151
+ "learning_rate": 2.830739992873263e-05,
152
+ "loss": 0.2653,
153
+ "step": 9500
154
+ },
155
+ {
156
+ "epoch": 1.187789523696401,
157
+ "grad_norm": 6.787134170532227,
158
+ "learning_rate": 2.82183157144554e-05,
159
+ "loss": 0.2808,
160
+ "step": 10000
161
+ },
162
+ {
163
+ "epoch": 1.247178999881221,
164
+ "grad_norm": 1.5390983819961548,
165
+ "learning_rate": 2.812923150017817e-05,
166
+ "loss": 0.2651,
167
+ "step": 10500
168
+ },
169
+ {
170
+ "epoch": 1.3065684760660412,
171
+ "grad_norm": 0.39634034037590027,
172
+ "learning_rate": 2.804014728590094e-05,
173
+ "loss": 0.2641,
174
+ "step": 11000
175
+ },
176
+ {
177
+ "epoch": 1.3659579522508611,
178
+ "grad_norm": 0.24770836532115936,
179
+ "learning_rate": 2.795106307162371e-05,
180
+ "loss": 0.2448,
181
+ "step": 11500
182
+ },
183
+ {
184
+ "epoch": 1.425347428435681,
185
+ "grad_norm": 16.2122859954834,
186
+ "learning_rate": 2.786197885734648e-05,
187
+ "loss": 0.2632,
188
+ "step": 12000
189
+ },
190
+ {
191
+ "epoch": 1.4847369046205012,
192
+ "grad_norm": 4.9013285636901855,
193
+ "learning_rate": 2.777289464306925e-05,
194
+ "loss": 0.2547,
195
+ "step": 12500
196
+ },
197
+ {
198
+ "epoch": 1.5441263808053214,
199
+ "grad_norm": 8.146185874938965,
200
+ "learning_rate": 2.7683810428792017e-05,
201
+ "loss": 0.2602,
202
+ "step": 13000
203
+ },
204
+ {
205
+ "epoch": 1.6035158569901413,
206
+ "grad_norm": 13.071233749389648,
207
+ "learning_rate": 2.759472621451479e-05,
208
+ "loss": 0.2492,
209
+ "step": 13500
210
+ },
211
+ {
212
+ "epoch": 1.6629053331749613,
213
+ "grad_norm": 0.20201528072357178,
214
+ "learning_rate": 2.7505642000237557e-05,
215
+ "loss": 0.2559,
216
+ "step": 14000
217
+ },
218
+ {
219
+ "epoch": 1.7222948093597814,
220
+ "grad_norm": 3.819157123565674,
221
+ "learning_rate": 2.741655778596033e-05,
222
+ "loss": 0.237,
223
+ "step": 14500
224
+ },
225
+ {
226
+ "epoch": 1.7816842855446016,
227
+ "grad_norm": 0.21059203147888184,
228
+ "learning_rate": 2.7327473571683097e-05,
229
+ "loss": 0.2485,
230
+ "step": 15000
231
+ },
232
+ {
233
+ "epoch": 1.8410737617294215,
234
+ "grad_norm": 0.5426599383354187,
235
+ "learning_rate": 2.7238389357405867e-05,
236
+ "loss": 0.2451,
237
+ "step": 15500
238
+ },
239
+ {
240
+ "epoch": 1.9004632379142414,
241
+ "grad_norm": 64.44636535644531,
242
+ "learning_rate": 2.714930514312864e-05,
243
+ "loss": 0.2523,
244
+ "step": 16000
245
+ },
246
+ {
247
+ "epoch": 1.9598527140990618,
248
+ "grad_norm": 4.042015552520752,
249
+ "learning_rate": 2.7060220928851407e-05,
250
+ "loss": 0.2707,
251
+ "step": 16500
252
+ },
253
+ {
254
+ "epoch": 2.0,
255
+ "eval_accuracy": 0.8394495248794556,
256
+ "eval_f1": 0.8379629629629629,
257
+ "eval_loss": 0.6586544513702393,
258
+ "eval_mcc": 0.6824127269645459,
259
+ "eval_runtime": 0.6937,
260
+ "eval_samples_per_second": 628.482,
261
+ "eval_steps_per_second": 79.281,
262
+ "step": 16838
263
+ },
264
+ {
265
+ "epoch": 2.0192421902838817,
266
+ "grad_norm": 0.36723724007606506,
267
+ "learning_rate": 2.697113671457418e-05,
268
+ "loss": 0.2145,
269
+ "step": 17000
270
+ },
271
+ {
272
+ "epoch": 2.0786316664687017,
273
+ "grad_norm": 0.06311015039682388,
274
+ "learning_rate": 2.6882052500296947e-05,
275
+ "loss": 0.1803,
276
+ "step": 17500
277
+ },
278
+ {
279
+ "epoch": 2.1380211426535216,
280
+ "grad_norm": 0.21473073959350586,
281
+ "learning_rate": 2.6792968286019718e-05,
282
+ "loss": 0.1914,
283
+ "step": 18000
284
+ },
285
+ {
286
+ "epoch": 2.197410618838342,
287
+ "grad_norm": 12.172515869140625,
288
+ "learning_rate": 2.6703884071742488e-05,
289
+ "loss": 0.1957,
290
+ "step": 18500
291
+ },
292
+ {
293
+ "epoch": 2.256800095023162,
294
+ "grad_norm": 0.08349990844726562,
295
+ "learning_rate": 2.6614799857465258e-05,
296
+ "loss": 0.2067,
297
+ "step": 19000
298
+ },
299
+ {
300
+ "epoch": 2.316189571207982,
301
+ "grad_norm": 6.744356632232666,
302
+ "learning_rate": 2.6525715643188028e-05,
303
+ "loss": 0.1858,
304
+ "step": 19500
305
+ },
306
+ {
307
+ "epoch": 2.375579047392802,
308
+ "grad_norm": 23.043901443481445,
309
+ "learning_rate": 2.6436631428910798e-05,
310
+ "loss": 0.1994,
311
+ "step": 20000
312
+ },
313
+ {
314
+ "epoch": 2.434968523577622,
315
+ "grad_norm": 0.16990970075130463,
316
+ "learning_rate": 2.6347547214633565e-05,
317
+ "loss": 0.2012,
318
+ "step": 20500
319
+ },
320
+ {
321
+ "epoch": 2.494357999762442,
322
+ "grad_norm": 4.681793689727783,
323
+ "learning_rate": 2.6258463000356338e-05,
324
+ "loss": 0.1931,
325
+ "step": 21000
326
+ },
327
+ {
328
+ "epoch": 2.553747475947262,
329
+ "grad_norm": 8.904121398925781,
330
+ "learning_rate": 2.6169378786079108e-05,
331
+ "loss": 0.1899,
332
+ "step": 21500
333
+ },
334
+ {
335
+ "epoch": 2.6131369521320824,
336
+ "grad_norm": 0.045907679945230484,
337
+ "learning_rate": 2.608029457180188e-05,
338
+ "loss": 0.2032,
339
+ "step": 22000
340
+ },
341
+ {
342
+ "epoch": 2.6725264283169023,
343
+ "grad_norm": 0.12679323554039001,
344
+ "learning_rate": 2.599121035752465e-05,
345
+ "loss": 0.2189,
346
+ "step": 22500
347
+ },
348
+ {
349
+ "epoch": 2.7319159045017223,
350
+ "grad_norm": 0.09417314827442169,
351
+ "learning_rate": 2.5902126143247415e-05,
352
+ "loss": 0.193,
353
+ "step": 23000
354
+ },
355
+ {
356
+ "epoch": 2.791305380686542,
357
+ "grad_norm": 0.06781476736068726,
358
+ "learning_rate": 2.581304192897019e-05,
359
+ "loss": 0.2021,
360
+ "step": 23500
361
+ },
362
+ {
363
+ "epoch": 2.850694856871362,
364
+ "grad_norm": 0.15305034816265106,
365
+ "learning_rate": 2.5723957714692955e-05,
366
+ "loss": 0.198,
367
+ "step": 24000
368
+ },
369
+ {
370
+ "epoch": 2.9100843330561825,
371
+ "grad_norm": 0.7531378865242004,
372
+ "learning_rate": 2.563487350041573e-05,
373
+ "loss": 0.1799,
374
+ "step": 24500
375
+ },
376
+ {
377
+ "epoch": 2.9694738092410025,
378
+ "grad_norm": 0.24162191152572632,
379
+ "learning_rate": 2.5545789286138496e-05,
380
+ "loss": 0.1858,
381
+ "step": 25000
382
+ },
383
+ {
384
+ "epoch": 3.0,
385
+ "eval_accuracy": 0.85550457239151,
386
+ "eval_f1": 0.8538283062645011,
387
+ "eval_loss": 0.5957673788070679,
388
+ "eval_mcc": 0.7149506045130871,
389
+ "eval_runtime": 0.7033,
390
+ "eval_samples_per_second": 619.903,
391
+ "eval_steps_per_second": 78.199,
392
+ "step": 25257
393
+ },
394
+ {
395
+ "epoch": 3.0288632854258224,
396
+ "grad_norm": 0.03327510878443718,
397
+ "learning_rate": 2.5456705071861266e-05,
398
+ "loss": 0.1564,
399
+ "step": 25500
400
+ },
401
+ {
402
+ "epoch": 3.0882527616106428,
403
+ "grad_norm": 59.41106033325195,
404
+ "learning_rate": 2.5367620857584036e-05,
405
+ "loss": 0.1308,
406
+ "step": 26000
407
+ },
408
+ {
409
+ "epoch": 3.1476422377954627,
410
+ "grad_norm": 0.029464269056916237,
411
+ "learning_rate": 2.5278536643306806e-05,
412
+ "loss": 0.1276,
413
+ "step": 26500
414
+ },
415
+ {
416
+ "epoch": 3.2070317139802826,
417
+ "grad_norm": 0.1652437150478363,
418
+ "learning_rate": 2.518945242902958e-05,
419
+ "loss": 0.1532,
420
+ "step": 27000
421
+ },
422
+ {
423
+ "epoch": 3.2664211901651026,
424
+ "grad_norm": 0.20975850522518158,
425
+ "learning_rate": 2.5100368214752346e-05,
426
+ "loss": 0.1558,
427
+ "step": 27500
428
+ },
429
+ {
430
+ "epoch": 3.325810666349923,
431
+ "grad_norm": 0.33388465642929077,
432
+ "learning_rate": 2.5011284000475116e-05,
433
+ "loss": 0.1421,
434
+ "step": 28000
435
+ },
436
+ {
437
+ "epoch": 3.385200142534743,
438
+ "grad_norm": 0.06191316992044449,
439
+ "learning_rate": 2.4922199786197886e-05,
440
+ "loss": 0.1356,
441
+ "step": 28500
442
+ },
443
+ {
444
+ "epoch": 3.444589618719563,
445
+ "grad_norm": 0.03824834153056145,
446
+ "learning_rate": 2.4833115571920656e-05,
447
+ "loss": 0.1615,
448
+ "step": 29000
449
+ },
450
+ {
451
+ "epoch": 3.503979094904383,
452
+ "grad_norm": 0.03963463753461838,
453
+ "learning_rate": 2.4744031357643426e-05,
454
+ "loss": 0.1515,
455
+ "step": 29500
456
+ },
457
+ {
458
+ "epoch": 3.563368571089203,
459
+ "grad_norm": 0.26668134331703186,
460
+ "learning_rate": 2.4654947143366197e-05,
461
+ "loss": 0.1578,
462
+ "step": 30000
463
+ },
464
+ {
465
+ "epoch": 3.622758047274023,
466
+ "grad_norm": 0.07644706219434738,
467
+ "learning_rate": 2.4565862929088963e-05,
468
+ "loss": 0.1399,
469
+ "step": 30500
470
+ },
471
+ {
472
+ "epoch": 3.682147523458843,
473
+ "grad_norm": 0.02788461185991764,
474
+ "learning_rate": 2.4476778714811737e-05,
475
+ "loss": 0.156,
476
+ "step": 31000
477
+ },
478
+ {
479
+ "epoch": 3.741536999643663,
480
+ "grad_norm": 0.2001054584980011,
481
+ "learning_rate": 2.4387694500534507e-05,
482
+ "loss": 0.1519,
483
+ "step": 31500
484
+ },
485
+ {
486
+ "epoch": 3.8009264758284833,
487
+ "grad_norm": 9.855899810791016,
488
+ "learning_rate": 2.4298610286257277e-05,
489
+ "loss": 0.1498,
490
+ "step": 32000
491
+ },
492
+ {
493
+ "epoch": 3.8603159520133032,
494
+ "grad_norm": 0.25349605083465576,
495
+ "learning_rate": 2.4209526071980047e-05,
496
+ "loss": 0.1582,
497
+ "step": 32500
498
+ },
499
+ {
500
+ "epoch": 3.919705428198123,
501
+ "grad_norm": 0.10745853930711746,
502
+ "learning_rate": 2.4120441857702814e-05,
503
+ "loss": 0.154,
504
+ "step": 33000
505
+ },
506
+ {
507
+ "epoch": 3.9790949043829436,
508
+ "grad_norm": 0.018555356189608574,
509
+ "learning_rate": 2.4031357643425587e-05,
510
+ "loss": 0.1567,
511
+ "step": 33500
512
+ },
513
+ {
514
+ "epoch": 4.0,
515
+ "eval_accuracy": 0.8463302850723267,
516
+ "eval_f1": 0.8546637744034707,
517
+ "eval_loss": 0.6577614545822144,
518
+ "eval_mcc": 0.6923721957357695,
519
+ "eval_runtime": 0.7006,
520
+ "eval_samples_per_second": 622.298,
521
+ "eval_steps_per_second": 78.501,
522
+ "step": 33676
523
+ },
524
+ {
525
+ "epoch": 4.0384843805677635,
526
+ "grad_norm": 0.04052357375621796,
527
+ "learning_rate": 2.3942273429148354e-05,
528
+ "loss": 0.1321,
529
+ "step": 34000
530
+ },
531
+ {
532
+ "epoch": 4.097873856752583,
533
+ "grad_norm": 28.136058807373047,
534
+ "learning_rate": 2.3853189214871127e-05,
535
+ "loss": 0.12,
536
+ "step": 34500
537
+ },
538
+ {
539
+ "epoch": 4.157263332937403,
540
+ "grad_norm": 12.880512237548828,
541
+ "learning_rate": 2.3764105000593894e-05,
542
+ "loss": 0.1092,
543
+ "step": 35000
544
+ },
545
+ {
546
+ "epoch": 4.216652809122223,
547
+ "grad_norm": 0.02295825444161892,
548
+ "learning_rate": 2.3675020786316664e-05,
549
+ "loss": 0.1015,
550
+ "step": 35500
551
+ },
552
+ {
553
+ "epoch": 4.276042285307043,
554
+ "grad_norm": 16.506240844726562,
555
+ "learning_rate": 2.3585936572039434e-05,
556
+ "loss": 0.1011,
557
+ "step": 36000
558
+ },
559
+ {
560
+ "epoch": 4.335431761491864,
561
+ "grad_norm": 0.05963263288140297,
562
+ "learning_rate": 2.3496852357762204e-05,
563
+ "loss": 0.1203,
564
+ "step": 36500
565
+ },
566
+ {
567
+ "epoch": 4.394821237676684,
568
+ "grad_norm": 6.27707576751709,
569
+ "learning_rate": 2.3407768143484978e-05,
570
+ "loss": 0.108,
571
+ "step": 37000
572
+ },
573
+ {
574
+ "epoch": 4.454210713861504,
575
+ "grad_norm": 0.04750403016805649,
576
+ "learning_rate": 2.3318683929207745e-05,
577
+ "loss": 0.111,
578
+ "step": 37500
579
+ },
580
+ {
581
+ "epoch": 4.513600190046324,
582
+ "grad_norm": 0.11624455451965332,
583
+ "learning_rate": 2.3229599714930515e-05,
584
+ "loss": 0.1253,
585
+ "step": 38000
586
+ },
587
+ {
588
+ "epoch": 4.572989666231144,
589
+ "grad_norm": 0.024209963157773018,
590
+ "learning_rate": 2.3140515500653285e-05,
591
+ "loss": 0.1087,
592
+ "step": 38500
593
+ },
594
+ {
595
+ "epoch": 4.632379142415964,
596
+ "grad_norm": 0.38843753933906555,
597
+ "learning_rate": 2.3051431286376055e-05,
598
+ "loss": 0.134,
599
+ "step": 39000
600
+ },
601
+ {
602
+ "epoch": 4.691768618600784,
603
+ "grad_norm": 0.3449760973453522,
604
+ "learning_rate": 2.2962347072098825e-05,
605
+ "loss": 0.1211,
606
+ "step": 39500
607
+ },
608
+ {
609
+ "epoch": 4.751158094785604,
610
+ "grad_norm": 0.3117709457874298,
611
+ "learning_rate": 2.2873262857821595e-05,
612
+ "loss": 0.1151,
613
+ "step": 40000
614
+ },
615
+ {
616
+ "epoch": 4.810547570970424,
617
+ "grad_norm": 0.09373793005943298,
618
+ "learning_rate": 2.2784178643544362e-05,
619
+ "loss": 0.1247,
620
+ "step": 40500
621
+ },
622
+ {
623
+ "epoch": 4.869937047155244,
624
+ "grad_norm": 25.01434898376465,
625
+ "learning_rate": 2.2695094429267135e-05,
626
+ "loss": 0.131,
627
+ "step": 41000
628
+ },
629
+ {
630
+ "epoch": 4.929326523340064,
631
+ "grad_norm": 0.02744464948773384,
632
+ "learning_rate": 2.2606010214989902e-05,
633
+ "loss": 0.1432,
634
+ "step": 41500
635
+ },
636
+ {
637
+ "epoch": 4.988715999524884,
638
+ "grad_norm": 18.553770065307617,
639
+ "learning_rate": 2.2516926000712676e-05,
640
+ "loss": 0.1145,
641
+ "step": 42000
642
+ },
643
+ {
644
+ "epoch": 5.0,
645
+ "eval_accuracy": 0.853210985660553,
646
+ "eval_f1": 0.8608695652173913,
647
+ "eval_loss": 0.7391630411148071,
648
+ "eval_mcc": 0.7061073536146776,
649
+ "eval_runtime": 0.728,
650
+ "eval_samples_per_second": 598.936,
651
+ "eval_steps_per_second": 75.554,
652
+ "step": 42095
653
+ }
654
+ ],
655
+ "logging_steps": 500,
656
+ "max_steps": 168380,
657
+ "num_input_tokens_seen": 0,
658
+ "num_train_epochs": 20,
659
+ "save_steps": 500,
660
+ "stateful_callbacks": {
661
+ "EarlyStoppingCallback": {
662
+ "args": {
663
+ "early_stopping_patience": 3,
664
+ "early_stopping_threshold": 0.001
665
+ },
666
+ "attributes": {
667
+ "early_stopping_patience_counter": 0
668
+ }
669
+ },
670
+ "TrainerControl": {
671
+ "args": {
672
+ "should_epoch_stop": false,
673
+ "should_evaluate": false,
674
+ "should_log": false,
675
+ "should_save": true,
676
+ "should_training_stop": false
677
+ },
678
+ "attributes": {}
679
+ }
680
+ },
681
+ "total_flos": 1.986719344983552e+16,
682
+ "train_batch_size": 8,
683
+ "trial_name": null,
684
+ "trial_params": null
685
+ }
checkpoint-42095/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d49c4e7e1a564faf612ab0f5c2b33608102b3f95ecb9a2d435a4b3463c4324b3
3
+ size 5368
config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/home/ubuntu/utah/babylm-24/data/training/models/10M_babylm_ascii/SPM-Unigram_6144/DebertaV2-Base-10M_babylm-A",
3
+ "architectures": [
4
+ "DebertaV2ForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 768,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 1536,
12
+ "label2id": {
13
+ "0": 0,
14
+ "1": 1
15
+ },
16
+ "layer_norm_eps": 1e-07,
17
+ "max_position_embeddings": 512,
18
+ "max_relative_positions": -1,
19
+ "model_type": "deberta-v2",
20
+ "num_attention_heads": 12,
21
+ "num_hidden_layers": 8,
22
+ "pad_token_id": 3,
23
+ "pooler_dropout": 0,
24
+ "pooler_hidden_act": "gelu",
25
+ "pooler_hidden_size": 768,
26
+ "pos_att_type": null,
27
+ "position_biased_input": true,
28
+ "relative_attention": false,
29
+ "torch_dtype": "float32",
30
+ "transformers_version": "4.44.2",
31
+ "type_vocab_size": 0,
32
+ "vocab_size": 6144
33
+ }
eval_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 8.0,
3
+ "eval_accuracy": 0.853210985660553,
4
+ "eval_f1": 0.8608695652173913,
5
+ "eval_loss": 0.7391630411148071,
6
+ "eval_mcc": 0.7061073536146776,
7
+ "eval_runtime": 0.6999,
8
+ "eval_samples": 436,
9
+ "eval_samples_per_second": 622.962,
10
+ "eval_steps_per_second": 78.585
11
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c64146fadd6816f974e6b4696626d1acd3bcd0e2d60660589a0f913a9938d3a
3
+ size 174103504
predictions.txt ADDED
@@ -0,0 +1,437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ index prediction
2
+ 0 1
3
+ 1 0
4
+ 2 1
5
+ 3 0
6
+ 4 1
7
+ 5 0
8
+ 6 0
9
+ 7 0
10
+ 8 1
11
+ 9 0
12
+ 10 0
13
+ 11 1
14
+ 12 1
15
+ 13 1
16
+ 14 1
17
+ 15 0
18
+ 16 1
19
+ 17 1
20
+ 18 0
21
+ 19 1
22
+ 20 1
23
+ 21 0
24
+ 22 0
25
+ 23 0
26
+ 24 1
27
+ 25 0
28
+ 26 1
29
+ 27 0
30
+ 28 0
31
+ 29 0
32
+ 30 0
33
+ 31 1
34
+ 32 0
35
+ 33 1
36
+ 34 1
37
+ 35 1
38
+ 36 1
39
+ 37 0
40
+ 38 0
41
+ 39 0
42
+ 40 1
43
+ 41 0
44
+ 42 1
45
+ 43 0
46
+ 44 1
47
+ 45 1
48
+ 46 1
49
+ 47 1
50
+ 48 1
51
+ 49 1
52
+ 50 1
53
+ 51 0
54
+ 52 0
55
+ 53 0
56
+ 54 1
57
+ 55 0
58
+ 56 1
59
+ 57 0
60
+ 58 1
61
+ 59 1
62
+ 60 1
63
+ 61 0
64
+ 62 1
65
+ 63 0
66
+ 64 1
67
+ 65 0
68
+ 66 1
69
+ 67 1
70
+ 68 0
71
+ 69 0
72
+ 70 0
73
+ 71 1
74
+ 72 1
75
+ 73 0
76
+ 74 0
77
+ 75 0
78
+ 76 0
79
+ 77 0
80
+ 78 1
81
+ 79 0
82
+ 80 1
83
+ 81 0
84
+ 82 0
85
+ 83 1
86
+ 84 0
87
+ 85 1
88
+ 86 1
89
+ 87 0
90
+ 88 1
91
+ 89 1
92
+ 90 1
93
+ 91 1
94
+ 92 1
95
+ 93 0
96
+ 94 0
97
+ 95 1
98
+ 96 1
99
+ 97 1
100
+ 98 1
101
+ 99 1
102
+ 100 0
103
+ 101 0
104
+ 102 1
105
+ 103 1
106
+ 104 1
107
+ 105 1
108
+ 106 1
109
+ 107 1
110
+ 108 0
111
+ 109 1
112
+ 110 0
113
+ 111 1
114
+ 112 1
115
+ 113 1
116
+ 114 1
117
+ 115 1
118
+ 116 1
119
+ 117 0
120
+ 118 0
121
+ 119 0
122
+ 120 0
123
+ 121 0
124
+ 122 0
125
+ 123 0
126
+ 124 1
127
+ 125 1
128
+ 126 1
129
+ 127 0
130
+ 128 0
131
+ 129 1
132
+ 130 0
133
+ 131 0
134
+ 132 1
135
+ 133 0
136
+ 134 0
137
+ 135 0
138
+ 136 1
139
+ 137 1
140
+ 138 0
141
+ 139 1
142
+ 140 0
143
+ 141 0
144
+ 142 1
145
+ 143 0
146
+ 144 1
147
+ 145 1
148
+ 146 1
149
+ 147 1
150
+ 148 1
151
+ 149 1
152
+ 150 1
153
+ 151 0
154
+ 152 1
155
+ 153 1
156
+ 154 0
157
+ 155 1
158
+ 156 1
159
+ 157 0
160
+ 158 1
161
+ 159 1
162
+ 160 1
163
+ 161 0
164
+ 162 0
165
+ 163 0
166
+ 164 0
167
+ 165 1
168
+ 166 0
169
+ 167 1
170
+ 168 1
171
+ 169 1
172
+ 170 0
173
+ 171 0
174
+ 172 1
175
+ 173 1
176
+ 174 0
177
+ 175 0
178
+ 176 0
179
+ 177 1
180
+ 178 0
181
+ 179 0
182
+ 180 1
183
+ 181 0
184
+ 182 1
185
+ 183 1
186
+ 184 1
187
+ 185 1
188
+ 186 0
189
+ 187 0
190
+ 188 0
191
+ 189 0
192
+ 190 1
193
+ 191 0
194
+ 192 0
195
+ 193 0
196
+ 194 0
197
+ 195 0
198
+ 196 0
199
+ 197 0
200
+ 198 0
201
+ 199 1
202
+ 200 0
203
+ 201 1
204
+ 202 0
205
+ 203 1
206
+ 204 1
207
+ 205 1
208
+ 206 1
209
+ 207 0
210
+ 208 0
211
+ 209 0
212
+ 210 0
213
+ 211 1
214
+ 212 1
215
+ 213 0
216
+ 214 1
217
+ 215 0
218
+ 216 1
219
+ 217 0
220
+ 218 0
221
+ 219 0
222
+ 220 1
223
+ 221 1
224
+ 222 1
225
+ 223 0
226
+ 224 1
227
+ 225 1
228
+ 226 0
229
+ 227 1
230
+ 228 1
231
+ 229 0
232
+ 230 0
233
+ 231 1
234
+ 232 1
235
+ 233 0
236
+ 234 0
237
+ 235 1
238
+ 236 0
239
+ 237 1
240
+ 238 0
241
+ 239 1
242
+ 240 0
243
+ 241 1
244
+ 242 1
245
+ 243 0
246
+ 244 1
247
+ 245 1
248
+ 246 1
249
+ 247 1
250
+ 248 0
251
+ 249 1
252
+ 250 1
253
+ 251 0
254
+ 252 1
255
+ 253 1
256
+ 254 1
257
+ 255 1
258
+ 256 1
259
+ 257 1
260
+ 258 1
261
+ 259 1
262
+ 260 0
263
+ 261 1
264
+ 262 1
265
+ 263 0
266
+ 264 1
267
+ 265 0
268
+ 266 0
269
+ 267 0
270
+ 268 1
271
+ 269 1
272
+ 270 0
273
+ 271 0
274
+ 272 1
275
+ 273 1
276
+ 274 1
277
+ 275 0
278
+ 276 1
279
+ 277 1
280
+ 278 0
281
+ 279 0
282
+ 280 0
283
+ 281 0
284
+ 282 1
285
+ 283 1
286
+ 284 1
287
+ 285 1
288
+ 286 1
289
+ 287 1
290
+ 288 0
291
+ 289 1
292
+ 290 0
293
+ 291 0
294
+ 292 0
295
+ 293 0
296
+ 294 1
297
+ 295 1
298
+ 296 0
299
+ 297 1
300
+ 298 0
301
+ 299 1
302
+ 300 1
303
+ 301 1
304
+ 302 0
305
+ 303 1
306
+ 304 1
307
+ 305 1
308
+ 306 0
309
+ 307 1
310
+ 308 0
311
+ 309 0
312
+ 310 0
313
+ 311 1
314
+ 312 1
315
+ 313 1
316
+ 314 0
317
+ 315 0
318
+ 316 0
319
+ 317 0
320
+ 318 1
321
+ 319 1
322
+ 320 1
323
+ 321 0
324
+ 322 0
325
+ 323 0
326
+ 324 0
327
+ 325 1
328
+ 326 1
329
+ 327 1
330
+ 328 1
331
+ 329 0
332
+ 330 0
333
+ 331 0
334
+ 332 0
335
+ 333 1
336
+ 334 1
337
+ 335 1
338
+ 336 0
339
+ 337 1
340
+ 338 0
341
+ 339 1
342
+ 340 1
343
+ 341 0
344
+ 342 0
345
+ 343 1
346
+ 344 1
347
+ 345 0
348
+ 346 1
349
+ 347 0
350
+ 348 0
351
+ 349 1
352
+ 350 1
353
+ 351 0
354
+ 352 1
355
+ 353 1
356
+ 354 1
357
+ 355 1
358
+ 356 0
359
+ 357 1
360
+ 358 1
361
+ 359 1
362
+ 360 1
363
+ 361 1
364
+ 362 1
365
+ 363 1
366
+ 364 0
367
+ 365 0
368
+ 366 1
369
+ 367 1
370
+ 368 1
371
+ 369 0
372
+ 370 1
373
+ 371 1
374
+ 372 0
375
+ 373 0
376
+ 374 0
377
+ 375 0
378
+ 376 0
379
+ 377 1
380
+ 378 1
381
+ 379 0
382
+ 380 0
383
+ 381 1
384
+ 382 0
385
+ 383 1
386
+ 384 1
387
+ 385 0
388
+ 386 0
389
+ 387 0
390
+ 388 1
391
+ 389 1
392
+ 390 0
393
+ 391 0
394
+ 392 1
395
+ 393 0
396
+ 394 0
397
+ 395 0
398
+ 396 1
399
+ 397 0
400
+ 398 0
401
+ 399 0
402
+ 400 1
403
+ 401 0
404
+ 402 0
405
+ 403 1
406
+ 404 1
407
+ 405 1
408
+ 406 1
409
+ 407 0
410
+ 408 1
411
+ 409 1
412
+ 410 1
413
+ 411 0
414
+ 412 1
415
+ 413 1
416
+ 414 1
417
+ 415 1
418
+ 416 0
419
+ 417 0
420
+ 418 1
421
+ 419 1
422
+ 420 1
423
+ 421 0
424
+ 422 0
425
+ 423 0
426
+ 424 0
427
+ 425 1
428
+ 426 0
429
+ 427 0
430
+ 428 0
431
+ 429 1
432
+ 430 0
433
+ 431 0
434
+ 432 1
435
+ 433 1
436
+ 434 0
437
+ 435 1
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[UNK]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[PAD]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "[PAR]",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "6": {
52
+ "content": "[TAB]",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ }
59
+ },
60
+ "clean_up_tokenization_spaces": false,
61
+ "cls_token": "[CLS]",
62
+ "mask_token": "[MASK]",
63
+ "model_max_length": 1000000000000000019884624838656,
64
+ "pad_token": "[PAD]",
65
+ "sep_token": "[SEP]",
66
+ "tokenizer_class": "PreTrainedTokenizerFast"
67
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 8.0,
3
+ "total_flos": 3.178750951973683e+16,
4
+ "train_loss": 0.16765205063521402,
5
+ "train_runtime": 3789.576,
6
+ "train_samples": 67349,
7
+ "train_samples_per_second": 355.443,
8
+ "train_steps_per_second": 44.432
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,1077 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8608695652173913,
3
+ "best_model_checkpoint": "/home/ubuntu/utah/babylm-24/src/evaluation/results/finetune/DebertaV2-Base-10M_babylm-A/sst2/checkpoint-42095",
4
+ "epoch": 8.0,
5
+ "eval_steps": 500,
6
+ "global_step": 67352,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.05938947618482005,
13
+ "grad_norm": 2.9371345043182373,
14
+ "learning_rate": 2.991091578572277e-05,
15
+ "loss": 0.5976,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.1187789523696401,
20
+ "grad_norm": 6.96113920211792,
21
+ "learning_rate": 2.982183157144554e-05,
22
+ "loss": 0.4885,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.17816842855446016,
27
+ "grad_norm": 5.86502742767334,
28
+ "learning_rate": 2.973274735716831e-05,
29
+ "loss": 0.4746,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 0.2375579047392802,
34
+ "grad_norm": 7.14243745803833,
35
+ "learning_rate": 2.964366314289108e-05,
36
+ "loss": 0.4426,
37
+ "step": 2000
38
+ },
39
+ {
40
+ "epoch": 0.2969473809241002,
41
+ "grad_norm": 8.168976783752441,
42
+ "learning_rate": 2.955457892861385e-05,
43
+ "loss": 0.4213,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 0.3563368571089203,
48
+ "grad_norm": 12.06148624420166,
49
+ "learning_rate": 2.9465494714336618e-05,
50
+ "loss": 0.3904,
51
+ "step": 3000
52
+ },
53
+ {
54
+ "epoch": 0.41572633329374037,
55
+ "grad_norm": 5.824960708618164,
56
+ "learning_rate": 2.937641050005939e-05,
57
+ "loss": 0.3788,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 0.4751158094785604,
62
+ "grad_norm": 4.579179763793945,
63
+ "learning_rate": 2.9287326285782158e-05,
64
+ "loss": 0.3646,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 0.5345052856633804,
69
+ "grad_norm": 25.88724136352539,
70
+ "learning_rate": 2.919824207150493e-05,
71
+ "loss": 0.3727,
72
+ "step": 4500
73
+ },
74
+ {
75
+ "epoch": 0.5938947618482004,
76
+ "grad_norm": 1.1444391012191772,
77
+ "learning_rate": 2.91091578572277e-05,
78
+ "loss": 0.3607,
79
+ "step": 5000
80
+ },
81
+ {
82
+ "epoch": 0.6532842380330206,
83
+ "grad_norm": 6.4522600173950195,
84
+ "learning_rate": 2.902007364295047e-05,
85
+ "loss": 0.3597,
86
+ "step": 5500
87
+ },
88
+ {
89
+ "epoch": 0.7126737142178406,
90
+ "grad_norm": 13.808451652526855,
91
+ "learning_rate": 2.8930989428673242e-05,
92
+ "loss": 0.3384,
93
+ "step": 6000
94
+ },
95
+ {
96
+ "epoch": 0.7720631904026607,
97
+ "grad_norm": 16.229154586791992,
98
+ "learning_rate": 2.884190521439601e-05,
99
+ "loss": 0.3408,
100
+ "step": 6500
101
+ },
102
+ {
103
+ "epoch": 0.8314526665874807,
104
+ "grad_norm": 26.570392608642578,
105
+ "learning_rate": 2.8752821000118782e-05,
106
+ "loss": 0.3391,
107
+ "step": 7000
108
+ },
109
+ {
110
+ "epoch": 0.8908421427723008,
111
+ "grad_norm": 4.880529880523682,
112
+ "learning_rate": 2.866373678584155e-05,
113
+ "loss": 0.3436,
114
+ "step": 7500
115
+ },
116
+ {
117
+ "epoch": 0.9502316189571208,
118
+ "grad_norm": 18.9448299407959,
119
+ "learning_rate": 2.857465257156432e-05,
120
+ "loss": 0.3151,
121
+ "step": 8000
122
+ },
123
+ {
124
+ "epoch": 1.0,
125
+ "eval_accuracy": 0.8463302850723267,
126
+ "eval_f1": 0.8466819221967964,
127
+ "eval_loss": 0.40749993920326233,
128
+ "eval_mcc": 0.694664779303656,
129
+ "eval_runtime": 0.7142,
130
+ "eval_samples_per_second": 610.484,
131
+ "eval_steps_per_second": 77.011,
132
+ "step": 8419
133
+ },
134
+ {
135
+ "epoch": 1.0096210951419409,
136
+ "grad_norm": 11.80332088470459,
137
+ "learning_rate": 2.848556835728709e-05,
138
+ "loss": 0.3028,
139
+ "step": 8500
140
+ },
141
+ {
142
+ "epoch": 1.0690105713267608,
143
+ "grad_norm": 22.269908905029297,
144
+ "learning_rate": 2.839648414300986e-05,
145
+ "loss": 0.2843,
146
+ "step": 9000
147
+ },
148
+ {
149
+ "epoch": 1.128400047511581,
150
+ "grad_norm": 14.310569763183594,
151
+ "learning_rate": 2.830739992873263e-05,
152
+ "loss": 0.2653,
153
+ "step": 9500
154
+ },
155
+ {
156
+ "epoch": 1.187789523696401,
157
+ "grad_norm": 6.787134170532227,
158
+ "learning_rate": 2.82183157144554e-05,
159
+ "loss": 0.2808,
160
+ "step": 10000
161
+ },
162
+ {
163
+ "epoch": 1.247178999881221,
164
+ "grad_norm": 1.5390983819961548,
165
+ "learning_rate": 2.812923150017817e-05,
166
+ "loss": 0.2651,
167
+ "step": 10500
168
+ },
169
+ {
170
+ "epoch": 1.3065684760660412,
171
+ "grad_norm": 0.39634034037590027,
172
+ "learning_rate": 2.804014728590094e-05,
173
+ "loss": 0.2641,
174
+ "step": 11000
175
+ },
176
+ {
177
+ "epoch": 1.3659579522508611,
178
+ "grad_norm": 0.24770836532115936,
179
+ "learning_rate": 2.795106307162371e-05,
180
+ "loss": 0.2448,
181
+ "step": 11500
182
+ },
183
+ {
184
+ "epoch": 1.425347428435681,
185
+ "grad_norm": 16.2122859954834,
186
+ "learning_rate": 2.786197885734648e-05,
187
+ "loss": 0.2632,
188
+ "step": 12000
189
+ },
190
+ {
191
+ "epoch": 1.4847369046205012,
192
+ "grad_norm": 4.9013285636901855,
193
+ "learning_rate": 2.777289464306925e-05,
194
+ "loss": 0.2547,
195
+ "step": 12500
196
+ },
197
+ {
198
+ "epoch": 1.5441263808053214,
199
+ "grad_norm": 8.146185874938965,
200
+ "learning_rate": 2.7683810428792017e-05,
201
+ "loss": 0.2602,
202
+ "step": 13000
203
+ },
204
+ {
205
+ "epoch": 1.6035158569901413,
206
+ "grad_norm": 13.071233749389648,
207
+ "learning_rate": 2.759472621451479e-05,
208
+ "loss": 0.2492,
209
+ "step": 13500
210
+ },
211
+ {
212
+ "epoch": 1.6629053331749613,
213
+ "grad_norm": 0.20201528072357178,
214
+ "learning_rate": 2.7505642000237557e-05,
215
+ "loss": 0.2559,
216
+ "step": 14000
217
+ },
218
+ {
219
+ "epoch": 1.7222948093597814,
220
+ "grad_norm": 3.819157123565674,
221
+ "learning_rate": 2.741655778596033e-05,
222
+ "loss": 0.237,
223
+ "step": 14500
224
+ },
225
+ {
226
+ "epoch": 1.7816842855446016,
227
+ "grad_norm": 0.21059203147888184,
228
+ "learning_rate": 2.7327473571683097e-05,
229
+ "loss": 0.2485,
230
+ "step": 15000
231
+ },
232
+ {
233
+ "epoch": 1.8410737617294215,
234
+ "grad_norm": 0.5426599383354187,
235
+ "learning_rate": 2.7238389357405867e-05,
236
+ "loss": 0.2451,
237
+ "step": 15500
238
+ },
239
+ {
240
+ "epoch": 1.9004632379142414,
241
+ "grad_norm": 64.44636535644531,
242
+ "learning_rate": 2.714930514312864e-05,
243
+ "loss": 0.2523,
244
+ "step": 16000
245
+ },
246
+ {
247
+ "epoch": 1.9598527140990618,
248
+ "grad_norm": 4.042015552520752,
249
+ "learning_rate": 2.7060220928851407e-05,
250
+ "loss": 0.2707,
251
+ "step": 16500
252
+ },
253
+ {
254
+ "epoch": 2.0,
255
+ "eval_accuracy": 0.8394495248794556,
256
+ "eval_f1": 0.8379629629629629,
257
+ "eval_loss": 0.6586544513702393,
258
+ "eval_mcc": 0.6824127269645459,
259
+ "eval_runtime": 0.6937,
260
+ "eval_samples_per_second": 628.482,
261
+ "eval_steps_per_second": 79.281,
262
+ "step": 16838
263
+ },
264
+ {
265
+ "epoch": 2.0192421902838817,
266
+ "grad_norm": 0.36723724007606506,
267
+ "learning_rate": 2.697113671457418e-05,
268
+ "loss": 0.2145,
269
+ "step": 17000
270
+ },
271
+ {
272
+ "epoch": 2.0786316664687017,
273
+ "grad_norm": 0.06311015039682388,
274
+ "learning_rate": 2.6882052500296947e-05,
275
+ "loss": 0.1803,
276
+ "step": 17500
277
+ },
278
+ {
279
+ "epoch": 2.1380211426535216,
280
+ "grad_norm": 0.21473073959350586,
281
+ "learning_rate": 2.6792968286019718e-05,
282
+ "loss": 0.1914,
283
+ "step": 18000
284
+ },
285
+ {
286
+ "epoch": 2.197410618838342,
287
+ "grad_norm": 12.172515869140625,
288
+ "learning_rate": 2.6703884071742488e-05,
289
+ "loss": 0.1957,
290
+ "step": 18500
291
+ },
292
+ {
293
+ "epoch": 2.256800095023162,
294
+ "grad_norm": 0.08349990844726562,
295
+ "learning_rate": 2.6614799857465258e-05,
296
+ "loss": 0.2067,
297
+ "step": 19000
298
+ },
299
+ {
300
+ "epoch": 2.316189571207982,
301
+ "grad_norm": 6.744356632232666,
302
+ "learning_rate": 2.6525715643188028e-05,
303
+ "loss": 0.1858,
304
+ "step": 19500
305
+ },
306
+ {
307
+ "epoch": 2.375579047392802,
308
+ "grad_norm": 23.043901443481445,
309
+ "learning_rate": 2.6436631428910798e-05,
310
+ "loss": 0.1994,
311
+ "step": 20000
312
+ },
313
+ {
314
+ "epoch": 2.434968523577622,
315
+ "grad_norm": 0.16990970075130463,
316
+ "learning_rate": 2.6347547214633565e-05,
317
+ "loss": 0.2012,
318
+ "step": 20500
319
+ },
320
+ {
321
+ "epoch": 2.494357999762442,
322
+ "grad_norm": 4.681793689727783,
323
+ "learning_rate": 2.6258463000356338e-05,
324
+ "loss": 0.1931,
325
+ "step": 21000
326
+ },
327
+ {
328
+ "epoch": 2.553747475947262,
329
+ "grad_norm": 8.904121398925781,
330
+ "learning_rate": 2.6169378786079108e-05,
331
+ "loss": 0.1899,
332
+ "step": 21500
333
+ },
334
+ {
335
+ "epoch": 2.6131369521320824,
336
+ "grad_norm": 0.045907679945230484,
337
+ "learning_rate": 2.608029457180188e-05,
338
+ "loss": 0.2032,
339
+ "step": 22000
340
+ },
341
+ {
342
+ "epoch": 2.6725264283169023,
343
+ "grad_norm": 0.12679323554039001,
344
+ "learning_rate": 2.599121035752465e-05,
345
+ "loss": 0.2189,
346
+ "step": 22500
347
+ },
348
+ {
349
+ "epoch": 2.7319159045017223,
350
+ "grad_norm": 0.09417314827442169,
351
+ "learning_rate": 2.5902126143247415e-05,
352
+ "loss": 0.193,
353
+ "step": 23000
354
+ },
355
+ {
356
+ "epoch": 2.791305380686542,
357
+ "grad_norm": 0.06781476736068726,
358
+ "learning_rate": 2.581304192897019e-05,
359
+ "loss": 0.2021,
360
+ "step": 23500
361
+ },
362
+ {
363
+ "epoch": 2.850694856871362,
364
+ "grad_norm": 0.15305034816265106,
365
+ "learning_rate": 2.5723957714692955e-05,
366
+ "loss": 0.198,
367
+ "step": 24000
368
+ },
369
+ {
370
+ "epoch": 2.9100843330561825,
371
+ "grad_norm": 0.7531378865242004,
372
+ "learning_rate": 2.563487350041573e-05,
373
+ "loss": 0.1799,
374
+ "step": 24500
375
+ },
376
+ {
377
+ "epoch": 2.9694738092410025,
378
+ "grad_norm": 0.24162191152572632,
379
+ "learning_rate": 2.5545789286138496e-05,
380
+ "loss": 0.1858,
381
+ "step": 25000
382
+ },
383
+ {
384
+ "epoch": 3.0,
385
+ "eval_accuracy": 0.85550457239151,
386
+ "eval_f1": 0.8538283062645011,
387
+ "eval_loss": 0.5957673788070679,
388
+ "eval_mcc": 0.7149506045130871,
389
+ "eval_runtime": 0.7033,
390
+ "eval_samples_per_second": 619.903,
391
+ "eval_steps_per_second": 78.199,
392
+ "step": 25257
393
+ },
394
+ {
395
+ "epoch": 3.0288632854258224,
396
+ "grad_norm": 0.03327510878443718,
397
+ "learning_rate": 2.5456705071861266e-05,
398
+ "loss": 0.1564,
399
+ "step": 25500
400
+ },
401
+ {
402
+ "epoch": 3.0882527616106428,
403
+ "grad_norm": 59.41106033325195,
404
+ "learning_rate": 2.5367620857584036e-05,
405
+ "loss": 0.1308,
406
+ "step": 26000
407
+ },
408
+ {
409
+ "epoch": 3.1476422377954627,
410
+ "grad_norm": 0.029464269056916237,
411
+ "learning_rate": 2.5278536643306806e-05,
412
+ "loss": 0.1276,
413
+ "step": 26500
414
+ },
415
+ {
416
+ "epoch": 3.2070317139802826,
417
+ "grad_norm": 0.1652437150478363,
418
+ "learning_rate": 2.518945242902958e-05,
419
+ "loss": 0.1532,
420
+ "step": 27000
421
+ },
422
+ {
423
+ "epoch": 3.2664211901651026,
424
+ "grad_norm": 0.20975850522518158,
425
+ "learning_rate": 2.5100368214752346e-05,
426
+ "loss": 0.1558,
427
+ "step": 27500
428
+ },
429
+ {
430
+ "epoch": 3.325810666349923,
431
+ "grad_norm": 0.33388465642929077,
432
+ "learning_rate": 2.5011284000475116e-05,
433
+ "loss": 0.1421,
434
+ "step": 28000
435
+ },
436
+ {
437
+ "epoch": 3.385200142534743,
438
+ "grad_norm": 0.06191316992044449,
439
+ "learning_rate": 2.4922199786197886e-05,
440
+ "loss": 0.1356,
441
+ "step": 28500
442
+ },
443
+ {
444
+ "epoch": 3.444589618719563,
445
+ "grad_norm": 0.03824834153056145,
446
+ "learning_rate": 2.4833115571920656e-05,
447
+ "loss": 0.1615,
448
+ "step": 29000
449
+ },
450
+ {
451
+ "epoch": 3.503979094904383,
452
+ "grad_norm": 0.03963463753461838,
453
+ "learning_rate": 2.4744031357643426e-05,
454
+ "loss": 0.1515,
455
+ "step": 29500
456
+ },
457
+ {
458
+ "epoch": 3.563368571089203,
459
+ "grad_norm": 0.26668134331703186,
460
+ "learning_rate": 2.4654947143366197e-05,
461
+ "loss": 0.1578,
462
+ "step": 30000
463
+ },
464
+ {
465
+ "epoch": 3.622758047274023,
466
+ "grad_norm": 0.07644706219434738,
467
+ "learning_rate": 2.4565862929088963e-05,
468
+ "loss": 0.1399,
469
+ "step": 30500
470
+ },
471
+ {
472
+ "epoch": 3.682147523458843,
473
+ "grad_norm": 0.02788461185991764,
474
+ "learning_rate": 2.4476778714811737e-05,
475
+ "loss": 0.156,
476
+ "step": 31000
477
+ },
478
+ {
479
+ "epoch": 3.741536999643663,
480
+ "grad_norm": 0.2001054584980011,
481
+ "learning_rate": 2.4387694500534507e-05,
482
+ "loss": 0.1519,
483
+ "step": 31500
484
+ },
485
+ {
486
+ "epoch": 3.8009264758284833,
487
+ "grad_norm": 9.855899810791016,
488
+ "learning_rate": 2.4298610286257277e-05,
489
+ "loss": 0.1498,
490
+ "step": 32000
491
+ },
492
+ {
493
+ "epoch": 3.8603159520133032,
494
+ "grad_norm": 0.25349605083465576,
495
+ "learning_rate": 2.4209526071980047e-05,
496
+ "loss": 0.1582,
497
+ "step": 32500
498
+ },
499
+ {
500
+ "epoch": 3.919705428198123,
501
+ "grad_norm": 0.10745853930711746,
502
+ "learning_rate": 2.4120441857702814e-05,
503
+ "loss": 0.154,
504
+ "step": 33000
505
+ },
506
+ {
507
+ "epoch": 3.9790949043829436,
508
+ "grad_norm": 0.018555356189608574,
509
+ "learning_rate": 2.4031357643425587e-05,
510
+ "loss": 0.1567,
511
+ "step": 33500
512
+ },
513
+ {
514
+ "epoch": 4.0,
515
+ "eval_accuracy": 0.8463302850723267,
516
+ "eval_f1": 0.8546637744034707,
517
+ "eval_loss": 0.6577614545822144,
518
+ "eval_mcc": 0.6923721957357695,
519
+ "eval_runtime": 0.7006,
520
+ "eval_samples_per_second": 622.298,
521
+ "eval_steps_per_second": 78.501,
522
+ "step": 33676
523
+ },
524
+ {
525
+ "epoch": 4.0384843805677635,
526
+ "grad_norm": 0.04052357375621796,
527
+ "learning_rate": 2.3942273429148354e-05,
528
+ "loss": 0.1321,
529
+ "step": 34000
530
+ },
531
+ {
532
+ "epoch": 4.097873856752583,
533
+ "grad_norm": 28.136058807373047,
534
+ "learning_rate": 2.3853189214871127e-05,
535
+ "loss": 0.12,
536
+ "step": 34500
537
+ },
538
+ {
539
+ "epoch": 4.157263332937403,
540
+ "grad_norm": 12.880512237548828,
541
+ "learning_rate": 2.3764105000593894e-05,
542
+ "loss": 0.1092,
543
+ "step": 35000
544
+ },
545
+ {
546
+ "epoch": 4.216652809122223,
547
+ "grad_norm": 0.02295825444161892,
548
+ "learning_rate": 2.3675020786316664e-05,
549
+ "loss": 0.1015,
550
+ "step": 35500
551
+ },
552
+ {
553
+ "epoch": 4.276042285307043,
554
+ "grad_norm": 16.506240844726562,
555
+ "learning_rate": 2.3585936572039434e-05,
556
+ "loss": 0.1011,
557
+ "step": 36000
558
+ },
559
+ {
560
+ "epoch": 4.335431761491864,
561
+ "grad_norm": 0.05963263288140297,
562
+ "learning_rate": 2.3496852357762204e-05,
563
+ "loss": 0.1203,
564
+ "step": 36500
565
+ },
566
+ {
567
+ "epoch": 4.394821237676684,
568
+ "grad_norm": 6.27707576751709,
569
+ "learning_rate": 2.3407768143484978e-05,
570
+ "loss": 0.108,
571
+ "step": 37000
572
+ },
573
+ {
574
+ "epoch": 4.454210713861504,
575
+ "grad_norm": 0.04750403016805649,
576
+ "learning_rate": 2.3318683929207745e-05,
577
+ "loss": 0.111,
578
+ "step": 37500
579
+ },
580
+ {
581
+ "epoch": 4.513600190046324,
582
+ "grad_norm": 0.11624455451965332,
583
+ "learning_rate": 2.3229599714930515e-05,
584
+ "loss": 0.1253,
585
+ "step": 38000
586
+ },
587
+ {
588
+ "epoch": 4.572989666231144,
589
+ "grad_norm": 0.024209963157773018,
590
+ "learning_rate": 2.3140515500653285e-05,
591
+ "loss": 0.1087,
592
+ "step": 38500
593
+ },
594
+ {
595
+ "epoch": 4.632379142415964,
596
+ "grad_norm": 0.38843753933906555,
597
+ "learning_rate": 2.3051431286376055e-05,
598
+ "loss": 0.134,
599
+ "step": 39000
600
+ },
601
+ {
602
+ "epoch": 4.691768618600784,
603
+ "grad_norm": 0.3449760973453522,
604
+ "learning_rate": 2.2962347072098825e-05,
605
+ "loss": 0.1211,
606
+ "step": 39500
607
+ },
608
+ {
609
+ "epoch": 4.751158094785604,
610
+ "grad_norm": 0.3117709457874298,
611
+ "learning_rate": 2.2873262857821595e-05,
612
+ "loss": 0.1151,
613
+ "step": 40000
614
+ },
615
+ {
616
+ "epoch": 4.810547570970424,
617
+ "grad_norm": 0.09373793005943298,
618
+ "learning_rate": 2.2784178643544362e-05,
619
+ "loss": 0.1247,
620
+ "step": 40500
621
+ },
622
+ {
623
+ "epoch": 4.869937047155244,
624
+ "grad_norm": 25.01434898376465,
625
+ "learning_rate": 2.2695094429267135e-05,
626
+ "loss": 0.131,
627
+ "step": 41000
628
+ },
629
+ {
630
+ "epoch": 4.929326523340064,
631
+ "grad_norm": 0.02744464948773384,
632
+ "learning_rate": 2.2606010214989902e-05,
633
+ "loss": 0.1432,
634
+ "step": 41500
635
+ },
636
+ {
637
+ "epoch": 4.988715999524884,
638
+ "grad_norm": 18.553770065307617,
639
+ "learning_rate": 2.2516926000712676e-05,
640
+ "loss": 0.1145,
641
+ "step": 42000
642
+ },
643
+ {
644
+ "epoch": 5.0,
645
+ "eval_accuracy": 0.853210985660553,
646
+ "eval_f1": 0.8608695652173913,
647
+ "eval_loss": 0.7391630411148071,
648
+ "eval_mcc": 0.7061073536146776,
649
+ "eval_runtime": 0.728,
650
+ "eval_samples_per_second": 598.936,
651
+ "eval_steps_per_second": 75.554,
652
+ "step": 42095
653
+ },
654
+ {
655
+ "epoch": 5.048105475709704,
656
+ "grad_norm": 0.027907686308026314,
657
+ "learning_rate": 2.2427841786435446e-05,
658
+ "loss": 0.0863,
659
+ "step": 42500
660
+ },
661
+ {
662
+ "epoch": 5.107494951894524,
663
+ "grad_norm": 0.025640016421675682,
664
+ "learning_rate": 2.2338757572158212e-05,
665
+ "loss": 0.0777,
666
+ "step": 43000
667
+ },
668
+ {
669
+ "epoch": 5.166884428079344,
670
+ "grad_norm": 0.02034921571612358,
671
+ "learning_rate": 2.2249673357880986e-05,
672
+ "loss": 0.0911,
673
+ "step": 43500
674
+ },
675
+ {
676
+ "epoch": 5.226273904264165,
677
+ "grad_norm": 0.013952106237411499,
678
+ "learning_rate": 2.2160589143603753e-05,
679
+ "loss": 0.1016,
680
+ "step": 44000
681
+ },
682
+ {
683
+ "epoch": 5.285663380448985,
684
+ "grad_norm": 0.0147418063133955,
685
+ "learning_rate": 2.2071504929326526e-05,
686
+ "loss": 0.0962,
687
+ "step": 44500
688
+ },
689
+ {
690
+ "epoch": 5.345052856633805,
691
+ "grad_norm": 7.1807122230529785,
692
+ "learning_rate": 2.1982420715049293e-05,
693
+ "loss": 0.0818,
694
+ "step": 45000
695
+ },
696
+ {
697
+ "epoch": 5.404442332818625,
698
+ "grad_norm": 0.024392470717430115,
699
+ "learning_rate": 2.1893336500772063e-05,
700
+ "loss": 0.0834,
701
+ "step": 45500
702
+ },
703
+ {
704
+ "epoch": 5.463831809003445,
705
+ "grad_norm": 0.009795928373932838,
706
+ "learning_rate": 2.1804252286494833e-05,
707
+ "loss": 0.0979,
708
+ "step": 46000
709
+ },
710
+ {
711
+ "epoch": 5.5232212851882645,
712
+ "grad_norm": 0.013820298947393894,
713
+ "learning_rate": 2.1715168072217603e-05,
714
+ "loss": 0.1039,
715
+ "step": 46500
716
+ },
717
+ {
718
+ "epoch": 5.582610761373084,
719
+ "grad_norm": 0.01806664653122425,
720
+ "learning_rate": 2.1626083857940373e-05,
721
+ "loss": 0.104,
722
+ "step": 47000
723
+ },
724
+ {
725
+ "epoch": 5.642000237557904,
726
+ "grad_norm": 0.020521830767393112,
727
+ "learning_rate": 2.1536999643663143e-05,
728
+ "loss": 0.0938,
729
+ "step": 47500
730
+ },
731
+ {
732
+ "epoch": 5.701389713742724,
733
+ "grad_norm": 0.04232034087181091,
734
+ "learning_rate": 2.1447915429385913e-05,
735
+ "loss": 0.0845,
736
+ "step": 48000
737
+ },
738
+ {
739
+ "epoch": 5.760779189927545,
740
+ "grad_norm": 0.41699323058128357,
741
+ "learning_rate": 2.1358831215108683e-05,
742
+ "loss": 0.1029,
743
+ "step": 48500
744
+ },
745
+ {
746
+ "epoch": 5.820168666112365,
747
+ "grad_norm": 0.017835628241300583,
748
+ "learning_rate": 2.1269747000831454e-05,
749
+ "loss": 0.1023,
750
+ "step": 49000
751
+ },
752
+ {
753
+ "epoch": 5.879558142297185,
754
+ "grad_norm": 0.027343884110450745,
755
+ "learning_rate": 2.1180662786554224e-05,
756
+ "loss": 0.0985,
757
+ "step": 49500
758
+ },
759
+ {
760
+ "epoch": 5.938947618482005,
761
+ "grad_norm": 0.4489924907684326,
762
+ "learning_rate": 2.1091578572276994e-05,
763
+ "loss": 0.099,
764
+ "step": 50000
765
+ },
766
+ {
767
+ "epoch": 5.998337094666825,
768
+ "grad_norm": 4.496362686157227,
769
+ "learning_rate": 2.100249435799976e-05,
770
+ "loss": 0.0995,
771
+ "step": 50500
772
+ },
773
+ {
774
+ "epoch": 6.0,
775
+ "eval_accuracy": 0.8279816508293152,
776
+ "eval_f1": 0.845360824742268,
777
+ "eval_loss": 0.9949702024459839,
778
+ "eval_mcc": 0.6612999229278168,
779
+ "eval_runtime": 0.7516,
780
+ "eval_samples_per_second": 580.102,
781
+ "eval_steps_per_second": 73.178,
782
+ "step": 50514
783
+ },
784
+ {
785
+ "epoch": 6.057726570851645,
786
+ "grad_norm": 0.04877474159002304,
787
+ "learning_rate": 2.0913410143722534e-05,
788
+ "loss": 0.0626,
789
+ "step": 51000
790
+ },
791
+ {
792
+ "epoch": 6.117116047036465,
793
+ "grad_norm": 0.1524512767791748,
794
+ "learning_rate": 2.08243259294453e-05,
795
+ "loss": 0.0703,
796
+ "step": 51500
797
+ },
798
+ {
799
+ "epoch": 6.1765055232212855,
800
+ "grad_norm": 8.834334373474121,
801
+ "learning_rate": 2.0735241715168074e-05,
802
+ "loss": 0.0719,
803
+ "step": 52000
804
+ },
805
+ {
806
+ "epoch": 6.2358949994061055,
807
+ "grad_norm": 0.009824572131037712,
808
+ "learning_rate": 2.0646157500890844e-05,
809
+ "loss": 0.0573,
810
+ "step": 52500
811
+ },
812
+ {
813
+ "epoch": 6.295284475590925,
814
+ "grad_norm": 0.8331696391105652,
815
+ "learning_rate": 2.055707328661361e-05,
816
+ "loss": 0.0629,
817
+ "step": 53000
818
+ },
819
+ {
820
+ "epoch": 6.354673951775745,
821
+ "grad_norm": 0.0664055198431015,
822
+ "learning_rate": 2.0467989072336384e-05,
823
+ "loss": 0.0672,
824
+ "step": 53500
825
+ },
826
+ {
827
+ "epoch": 6.414063427960565,
828
+ "grad_norm": 0.03841827064752579,
829
+ "learning_rate": 2.037890485805915e-05,
830
+ "loss": 0.0793,
831
+ "step": 54000
832
+ },
833
+ {
834
+ "epoch": 6.473452904145385,
835
+ "grad_norm": 0.07754085958003998,
836
+ "learning_rate": 2.0289820643781925e-05,
837
+ "loss": 0.0741,
838
+ "step": 54500
839
+ },
840
+ {
841
+ "epoch": 6.532842380330205,
842
+ "grad_norm": 694.4078979492188,
843
+ "learning_rate": 2.020073642950469e-05,
844
+ "loss": 0.0634,
845
+ "step": 55000
846
+ },
847
+ {
848
+ "epoch": 6.592231856515026,
849
+ "grad_norm": 0.1212846115231514,
850
+ "learning_rate": 2.011165221522746e-05,
851
+ "loss": 0.0841,
852
+ "step": 55500
853
+ },
854
+ {
855
+ "epoch": 6.651621332699846,
856
+ "grad_norm": 0.015646882355213165,
857
+ "learning_rate": 2.002256800095023e-05,
858
+ "loss": 0.0718,
859
+ "step": 56000
860
+ },
861
+ {
862
+ "epoch": 6.711010808884666,
863
+ "grad_norm": 0.04938916116952896,
864
+ "learning_rate": 1.9933483786673e-05,
865
+ "loss": 0.0926,
866
+ "step": 56500
867
+ },
868
+ {
869
+ "epoch": 6.770400285069486,
870
+ "grad_norm": 0.0076505206525325775,
871
+ "learning_rate": 1.9844399572395772e-05,
872
+ "loss": 0.0901,
873
+ "step": 57000
874
+ },
875
+ {
876
+ "epoch": 6.829789761254306,
877
+ "grad_norm": 0.015326344408094883,
878
+ "learning_rate": 1.9755315358118542e-05,
879
+ "loss": 0.0814,
880
+ "step": 57500
881
+ },
882
+ {
883
+ "epoch": 6.889179237439126,
884
+ "grad_norm": 0.08718911558389664,
885
+ "learning_rate": 1.9666231143841312e-05,
886
+ "loss": 0.0811,
887
+ "step": 58000
888
+ },
889
+ {
890
+ "epoch": 6.948568713623946,
891
+ "grad_norm": 0.01760442741215229,
892
+ "learning_rate": 1.9577146929564082e-05,
893
+ "loss": 0.0842,
894
+ "step": 58500
895
+ },
896
+ {
897
+ "epoch": 7.0,
898
+ "eval_accuracy": 0.8440366983413696,
899
+ "eval_f1": 0.8482142857142857,
900
+ "eval_loss": 0.810035765171051,
901
+ "eval_mcc": 0.6879791966290185,
902
+ "eval_runtime": 0.6705,
903
+ "eval_samples_per_second": 650.225,
904
+ "eval_steps_per_second": 82.024,
905
+ "step": 58933
906
+ },
907
+ {
908
+ "epoch": 7.0079581898087655,
909
+ "grad_norm": 0.017652327194809914,
910
+ "learning_rate": 1.9488062715286852e-05,
911
+ "loss": 0.0831,
912
+ "step": 59000
913
+ },
914
+ {
915
+ "epoch": 7.067347665993586,
916
+ "grad_norm": 0.017668193206191063,
917
+ "learning_rate": 1.9398978501009622e-05,
918
+ "loss": 0.0472,
919
+ "step": 59500
920
+ },
921
+ {
922
+ "epoch": 7.126737142178406,
923
+ "grad_norm": 0.06514804065227509,
924
+ "learning_rate": 1.9309894286732392e-05,
925
+ "loss": 0.0625,
926
+ "step": 60000
927
+ },
928
+ {
929
+ "epoch": 7.186126618363226,
930
+ "grad_norm": 0.007870903238654137,
931
+ "learning_rate": 1.922081007245516e-05,
932
+ "loss": 0.0537,
933
+ "step": 60500
934
+ },
935
+ {
936
+ "epoch": 7.245516094548046,
937
+ "grad_norm": 0.0029470089357346296,
938
+ "learning_rate": 1.9131725858177933e-05,
939
+ "loss": 0.0506,
940
+ "step": 61000
941
+ },
942
+ {
943
+ "epoch": 7.304905570732866,
944
+ "grad_norm": 0.004455640912055969,
945
+ "learning_rate": 1.90426416439007e-05,
946
+ "loss": 0.0459,
947
+ "step": 61500
948
+ },
949
+ {
950
+ "epoch": 7.364295046917686,
951
+ "grad_norm": 0.054865576326847076,
952
+ "learning_rate": 1.8953557429623473e-05,
953
+ "loss": 0.055,
954
+ "step": 62000
955
+ },
956
+ {
957
+ "epoch": 7.423684523102506,
958
+ "grad_norm": 0.07598511129617691,
959
+ "learning_rate": 1.886447321534624e-05,
960
+ "loss": 0.0598,
961
+ "step": 62500
962
+ },
963
+ {
964
+ "epoch": 7.483073999287326,
965
+ "grad_norm": 0.12898291647434235,
966
+ "learning_rate": 1.877538900106901e-05,
967
+ "loss": 0.0522,
968
+ "step": 63000
969
+ },
970
+ {
971
+ "epoch": 7.542463475472147,
972
+ "grad_norm": 0.006004431750625372,
973
+ "learning_rate": 1.8686304786791783e-05,
974
+ "loss": 0.0602,
975
+ "step": 63500
976
+ },
977
+ {
978
+ "epoch": 7.601852951656967,
979
+ "grad_norm": 0.02722933515906334,
980
+ "learning_rate": 1.859722057251455e-05,
981
+ "loss": 0.0604,
982
+ "step": 64000
983
+ },
984
+ {
985
+ "epoch": 7.6612424278417866,
986
+ "grad_norm": 0.019477859139442444,
987
+ "learning_rate": 1.8508136358237323e-05,
988
+ "loss": 0.0682,
989
+ "step": 64500
990
+ },
991
+ {
992
+ "epoch": 7.7206319040266065,
993
+ "grad_norm": 0.029639530926942825,
994
+ "learning_rate": 1.841905214396009e-05,
995
+ "loss": 0.0734,
996
+ "step": 65000
997
+ },
998
+ {
999
+ "epoch": 7.780021380211426,
1000
+ "grad_norm": 0.019286124035716057,
1001
+ "learning_rate": 1.832996792968286e-05,
1002
+ "loss": 0.0732,
1003
+ "step": 65500
1004
+ },
1005
+ {
1006
+ "epoch": 7.839410856396246,
1007
+ "grad_norm": 0.012325610034167767,
1008
+ "learning_rate": 1.824088371540563e-05,
1009
+ "loss": 0.0534,
1010
+ "step": 66000
1011
+ },
1012
+ {
1013
+ "epoch": 7.898800332581066,
1014
+ "grad_norm": 0.0682038888335228,
1015
+ "learning_rate": 1.81517995011284e-05,
1016
+ "loss": 0.0886,
1017
+ "step": 66500
1018
+ },
1019
+ {
1020
+ "epoch": 7.958189808765887,
1021
+ "grad_norm": 0.006911112926900387,
1022
+ "learning_rate": 1.806271528685117e-05,
1023
+ "loss": 0.0656,
1024
+ "step": 67000
1025
+ },
1026
+ {
1027
+ "epoch": 8.0,
1028
+ "eval_accuracy": 0.8417431116104126,
1029
+ "eval_f1": 0.8516129032258064,
1030
+ "eval_loss": 0.8965951204299927,
1031
+ "eval_mcc": 0.6835885663714486,
1032
+ "eval_runtime": 0.6751,
1033
+ "eval_samples_per_second": 645.866,
1034
+ "eval_steps_per_second": 81.474,
1035
+ "step": 67352
1036
+ },
1037
+ {
1038
+ "epoch": 8.0,
1039
+ "step": 67352,
1040
+ "total_flos": 3.178750951973683e+16,
1041
+ "train_loss": 0.16765205063521402,
1042
+ "train_runtime": 3789.576,
1043
+ "train_samples_per_second": 355.443,
1044
+ "train_steps_per_second": 44.432
1045
+ }
1046
+ ],
1047
+ "logging_steps": 500,
1048
+ "max_steps": 168380,
1049
+ "num_input_tokens_seen": 0,
1050
+ "num_train_epochs": 20,
1051
+ "save_steps": 500,
1052
+ "stateful_callbacks": {
1053
+ "EarlyStoppingCallback": {
1054
+ "args": {
1055
+ "early_stopping_patience": 3,
1056
+ "early_stopping_threshold": 0.001
1057
+ },
1058
+ "attributes": {
1059
+ "early_stopping_patience_counter": 0
1060
+ }
1061
+ },
1062
+ "TrainerControl": {
1063
+ "args": {
1064
+ "should_epoch_stop": false,
1065
+ "should_evaluate": false,
1066
+ "should_log": false,
1067
+ "should_save": true,
1068
+ "should_training_stop": true
1069
+ },
1070
+ "attributes": {}
1071
+ }
1072
+ },
1073
+ "total_flos": 3.178750951973683e+16,
1074
+ "train_batch_size": 8,
1075
+ "trial_name": null,
1076
+ "trial_params": null
1077
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d49c4e7e1a564faf612ab0f5c2b33608102b3f95ecb9a2d435a4b3463c4324b3
3
+ size 5368