AhmedSSoliman commited on
Commit
a671687
·
verified ·
1 Parent(s): 6d06c2a

Model save

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. README.md +73 -0
  2. config.json +47 -0
  3. model.safetensors +3 -0
  4. training_args.bin +3 -0
  5. trial-0/checkpoint-1506/config.json +47 -0
  6. trial-0/checkpoint-1506/model.safetensors +3 -0
  7. trial-0/checkpoint-1506/optimizer.pt +3 -0
  8. trial-0/checkpoint-1506/rng_state.pth +3 -0
  9. trial-0/checkpoint-1506/scheduler.pt +3 -0
  10. trial-0/checkpoint-1506/trainer_state.json +255 -0
  11. trial-0/checkpoint-1506/training_args.bin +3 -0
  12. trial-1/checkpoint-6022/config.json +47 -0
  13. trial-1/checkpoint-6022/model.safetensors +3 -0
  14. trial-1/checkpoint-6022/optimizer.pt +3 -0
  15. trial-1/checkpoint-6022/rng_state.pth +3 -0
  16. trial-1/checkpoint-6022/scheduler.pt +3 -0
  17. trial-1/checkpoint-6022/trainer_state.json +897 -0
  18. trial-1/checkpoint-6022/training_args.bin +3 -0
  19. trial-2/checkpoint-6022/config.json +47 -0
  20. trial-2/checkpoint-6022/model.safetensors +3 -0
  21. trial-2/checkpoint-6022/optimizer.pt +3 -0
  22. trial-2/checkpoint-6022/rng_state.pth +3 -0
  23. trial-2/checkpoint-6022/scheduler.pt +3 -0
  24. trial-2/checkpoint-6022/trainer_state.json +897 -0
  25. trial-2/checkpoint-6022/training_args.bin +3 -0
  26. trial-3/checkpoint-1506/config.json +47 -0
  27. trial-3/checkpoint-1506/model.safetensors +3 -0
  28. trial-3/checkpoint-1506/optimizer.pt +3 -0
  29. trial-3/checkpoint-1506/rng_state.pth +3 -0
  30. trial-3/checkpoint-1506/scheduler.pt +3 -0
  31. trial-3/checkpoint-1506/trainer_state.json +255 -0
  32. trial-3/checkpoint-1506/training_args.bin +3 -0
  33. trial-4/checkpoint-3011/config.json +47 -0
  34. trial-4/checkpoint-3011/model.safetensors +3 -0
  35. trial-4/checkpoint-3011/optimizer.pt +3 -0
  36. trial-4/checkpoint-3011/rng_state.pth +3 -0
  37. trial-4/checkpoint-3011/scheduler.pt +3 -0
  38. trial-4/checkpoint-3011/trainer_state.json +465 -0
  39. trial-4/checkpoint-3011/training_args.bin +3 -0
  40. trial-5/checkpoint-3012/config.json +47 -0
  41. trial-5/checkpoint-3012/model.safetensors +3 -0
  42. trial-5/checkpoint-3012/optimizer.pt +3 -0
  43. trial-5/checkpoint-3012/rng_state.pth +3 -0
  44. trial-5/checkpoint-3012/scheduler.pt +3 -0
  45. trial-5/checkpoint-3012/trainer_state.json +477 -0
  46. trial-5/checkpoint-3012/training_args.bin +3 -0
  47. trial-6/checkpoint-6022/config.json +47 -0
  48. trial-6/checkpoint-6022/model.safetensors +3 -0
  49. trial-6/checkpoint-6022/optimizer.pt +3 -0
  50. trial-6/checkpoint-6022/rng_state.pth +3 -0
README.md ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: answerdotai/ModernBERT-base
5
+ tags:
6
+ - generated_from_trainer
7
+ metrics:
8
+ - accuracy
9
+ - precision
10
+ - recall
11
+ - f1
12
+ model-index:
13
+ - name: answerdotai-ModernBERT-base-finetuned
14
+ results: []
15
+ ---
16
+
17
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
18
+ should probably proofread and complete it, then remove this comment. -->
19
+
20
+ # answerdotai-ModernBERT-base-finetuned
21
+
22
+ This model is a fine-tuned version of [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base) on an unspecified dataset.
23
+ It achieves the following results on the evaluation set:
24
+ - Loss: 0.0116
25
+ - Accuracy: 0.9976
26
+ - Precision: 0.9977
27
+ - Recall: 0.9976
28
+ - F1: 0.9976
29
+
30
+ ## Model description
31
+
32
+ More information needed
33
+
34
+ ## Intended uses & limitations
35
+
36
+ More information needed
37
+
38
+ ## Training and evaluation data
39
+
40
+ More information needed
41
+
42
+ ## Training procedure
43
+
44
+ ### Training hyperparameters
45
+
46
+ The following hyperparameters were used during training:
47
+ - learning_rate: 4.244005797262286e-05
48
+ - train_batch_size: 32
49
+ - eval_batch_size: 32
50
+ - seed: 42
51
+ - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
52
+ - lr_scheduler_type: linear
53
+ - num_epochs: 7
54
+
55
+ ### Training results
56
+
57
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy | Precision | Recall | F1 |
58
+ |:-------------:|:-----:|:-----:|:---------------:|:--------:|:---------:|:------:|:------:|
59
+ | 0.0175 | 1.0 | 1506 | 0.0195 | 0.9971 | 0.9971 | 0.9971 | 0.9971 |
60
+ | 0.0134 | 2.0 | 3012 | 0.0153 | 0.9970 | 0.9970 | 0.9970 | 0.9970 |
61
+ | 0.0 | 3.0 | 4518 | 0.0228 | 0.9976 | 0.9976 | 0.9976 | 0.9976 |
62
+ | 0.0 | 4.0 | 6024 | 0.0270 | 0.9976 | 0.9976 | 0.9976 | 0.9976 |
63
+ | 0.0 | 5.0 | 7530 | 0.0272 | 0.9976 | 0.9976 | 0.9976 | 0.9976 |
64
+ | 0.0 | 6.0 | 9036 | 0.0279 | 0.9975 | 0.9975 | 0.9975 | 0.9975 |
65
+ | 0.0 | 7.0 | 10542 | 0.0283 | 0.9975 | 0.9975 | 0.9975 | 0.9975 |
66
+
67
+
68
+ ### Framework versions
69
+
70
+ - Transformers 4.48.0.dev0
71
+ - Pytorch 2.5.1+cu124
72
+ - Datasets 3.2.0
73
+ - Tokenizers 0.21.0
config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "answerdotai/ModernBERT-base",
3
+ "architectures": [
4
+ "ModernBertForSequenceClassification"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 50281,
9
+ "classifier_activation": "gelu",
10
+ "classifier_bias": false,
11
+ "classifier_dropout": 0.0,
12
+ "classifier_pooling": "mean",
13
+ "cls_token_id": 50281,
14
+ "decoder_bias": true,
15
+ "deterministic_flash_attn": false,
16
+ "embedding_dropout": 0.0,
17
+ "eos_token_id": 50282,
18
+ "global_attn_every_n_layers": 3,
19
+ "global_rope_theta": 160000.0,
20
+ "gradient_checkpointing": false,
21
+ "hidden_activation": "gelu",
22
+ "hidden_size": 768,
23
+ "initializer_cutoff_factor": 2.0,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 1152,
26
+ "layer_norm_eps": 1e-05,
27
+ "local_attention": 128,
28
+ "local_rope_theta": 10000.0,
29
+ "max_position_embeddings": 8192,
30
+ "mlp_bias": false,
31
+ "mlp_dropout": 0.0,
32
+ "model_type": "modernbert",
33
+ "norm_bias": false,
34
+ "norm_eps": 1e-05,
35
+ "num_attention_heads": 12,
36
+ "num_hidden_layers": 22,
37
+ "pad_token_id": 50283,
38
+ "position_embedding_type": "absolute",
39
+ "problem_type": "single_label_classification",
40
+ "reference_compile": true,
41
+ "sep_token_id": 50282,
42
+ "sparse_pred_ignore_index": -100,
43
+ "sparse_prediction": false,
44
+ "torch_dtype": "float32",
45
+ "transformers_version": "4.48.0.dev0",
46
+ "vocab_size": 50368
47
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd2c8555404b25095196f950baad8216db0404ff16448d62a6d453105d7bd0c7
3
+ size 598439784
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33b0c987e99ad21c3b9517dc831f21fd66bcbcd55d62a62f0a28008a0e8674e2
3
+ size 5432
trial-0/checkpoint-1506/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "answerdotai/ModernBERT-base",
3
+ "architectures": [
4
+ "ModernBertForSequenceClassification"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 50281,
9
+ "classifier_activation": "gelu",
10
+ "classifier_bias": false,
11
+ "classifier_dropout": 0.0,
12
+ "classifier_pooling": "mean",
13
+ "cls_token_id": 50281,
14
+ "decoder_bias": true,
15
+ "deterministic_flash_attn": false,
16
+ "embedding_dropout": 0.0,
17
+ "eos_token_id": 50282,
18
+ "global_attn_every_n_layers": 3,
19
+ "global_rope_theta": 160000.0,
20
+ "gradient_checkpointing": false,
21
+ "hidden_activation": "gelu",
22
+ "hidden_size": 768,
23
+ "initializer_cutoff_factor": 2.0,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 1152,
26
+ "layer_norm_eps": 1e-05,
27
+ "local_attention": 128,
28
+ "local_rope_theta": 10000.0,
29
+ "max_position_embeddings": 8192,
30
+ "mlp_bias": false,
31
+ "mlp_dropout": 0.0,
32
+ "model_type": "modernbert",
33
+ "norm_bias": false,
34
+ "norm_eps": 1e-05,
35
+ "num_attention_heads": 12,
36
+ "num_hidden_layers": 22,
37
+ "pad_token_id": 50283,
38
+ "position_embedding_type": "absolute",
39
+ "problem_type": "single_label_classification",
40
+ "reference_compile": true,
41
+ "sep_token_id": 50282,
42
+ "sparse_pred_ignore_index": -100,
43
+ "sparse_prediction": false,
44
+ "torch_dtype": "float32",
45
+ "transformers_version": "4.48.0.dev0",
46
+ "vocab_size": 50368
47
+ }
trial-0/checkpoint-1506/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68eefa4a9be7b2db68618e1cb44c2cdf2163fb53cc3380fc52767266b121ddd2
3
+ size 598439784
trial-0/checkpoint-1506/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08a1a4cc69805f73befa2723d41c1d97c0a2f799125f15e25de8295d6c23580c
3
+ size 1196967418
trial-0/checkpoint-1506/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:568428d80a25211a390c359ca51b0b20b38ca0607fbc196f106c9841c02d3e59
3
+ size 14244
trial-0/checkpoint-1506/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5bddebb63f2196cebff07c6da8f9e668e8379463981f8be40fb7e151e6c09ff
3
+ size 1064
trial-0/checkpoint-1506/trainer_state.json ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.02135350927710533,
3
+ "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-0/checkpoint-1506",
4
+ "epoch": 1.0,
5
+ "eval_steps": 500,
6
+ "global_step": 1506,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.033200531208499334,
13
+ "grad_norm": 11.822611808776855,
14
+ "learning_rate": 4.4935320035267014e-05,
15
+ "loss": 0.295,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 0.06640106241699867,
20
+ "grad_norm": 0.11557121574878693,
21
+ "learning_rate": 4.463495024893502e-05,
22
+ "loss": 0.0808,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.099601593625498,
27
+ "grad_norm": 0.01743650808930397,
28
+ "learning_rate": 4.433458046260302e-05,
29
+ "loss": 0.052,
30
+ "step": 150
31
+ },
32
+ {
33
+ "epoch": 0.13280212483399734,
34
+ "grad_norm": 4.474731922149658,
35
+ "learning_rate": 4.4034210676271024e-05,
36
+ "loss": 0.0491,
37
+ "step": 200
38
+ },
39
+ {
40
+ "epoch": 0.16600265604249667,
41
+ "grad_norm": 4.205756664276123,
42
+ "learning_rate": 4.373384088993902e-05,
43
+ "loss": 0.0344,
44
+ "step": 250
45
+ },
46
+ {
47
+ "epoch": 0.199203187250996,
48
+ "grad_norm": 4.239188194274902,
49
+ "learning_rate": 4.343347110360703e-05,
50
+ "loss": 0.0295,
51
+ "step": 300
52
+ },
53
+ {
54
+ "epoch": 0.23240371845949534,
55
+ "grad_norm": 0.19662700593471527,
56
+ "learning_rate": 4.3133101317275027e-05,
57
+ "loss": 0.0342,
58
+ "step": 350
59
+ },
60
+ {
61
+ "epoch": 0.2656042496679947,
62
+ "grad_norm": 0.008393031544983387,
63
+ "learning_rate": 4.2832731530943025e-05,
64
+ "loss": 0.0245,
65
+ "step": 400
66
+ },
67
+ {
68
+ "epoch": 0.29880478087649404,
69
+ "grad_norm": 0.06995929777622223,
70
+ "learning_rate": 4.253236174461103e-05,
71
+ "loss": 0.0281,
72
+ "step": 450
73
+ },
74
+ {
75
+ "epoch": 0.33200531208499334,
76
+ "grad_norm": 0.010315222665667534,
77
+ "learning_rate": 4.223199195827902e-05,
78
+ "loss": 0.0188,
79
+ "step": 500
80
+ },
81
+ {
82
+ "epoch": 0.3652058432934927,
83
+ "grad_norm": 3.1021769046783447,
84
+ "learning_rate": 4.193162217194703e-05,
85
+ "loss": 0.018,
86
+ "step": 550
87
+ },
88
+ {
89
+ "epoch": 0.398406374501992,
90
+ "grad_norm": 0.00041495164623484015,
91
+ "learning_rate": 4.1631252385615027e-05,
92
+ "loss": 0.0053,
93
+ "step": 600
94
+ },
95
+ {
96
+ "epoch": 0.4316069057104914,
97
+ "grad_norm": 0.19596342742443085,
98
+ "learning_rate": 4.133088259928303e-05,
99
+ "loss": 0.0178,
100
+ "step": 650
101
+ },
102
+ {
103
+ "epoch": 0.4648074369189907,
104
+ "grad_norm": 0.0566418319940567,
105
+ "learning_rate": 4.103051281295103e-05,
106
+ "loss": 0.0101,
107
+ "step": 700
108
+ },
109
+ {
110
+ "epoch": 0.49800796812749004,
111
+ "grad_norm": 0.005816417746245861,
112
+ "learning_rate": 4.0730143026619036e-05,
113
+ "loss": 0.0166,
114
+ "step": 750
115
+ },
116
+ {
117
+ "epoch": 0.5312084993359893,
118
+ "grad_norm": 2.2474324703216553,
119
+ "learning_rate": 4.0429773240287035e-05,
120
+ "loss": 0.0156,
121
+ "step": 800
122
+ },
123
+ {
124
+ "epoch": 0.5644090305444888,
125
+ "grad_norm": 0.06311876326799393,
126
+ "learning_rate": 4.0129403453955033e-05,
127
+ "loss": 0.0166,
128
+ "step": 850
129
+ },
130
+ {
131
+ "epoch": 0.5976095617529881,
132
+ "grad_norm": 0.012764506973326206,
133
+ "learning_rate": 3.982903366762304e-05,
134
+ "loss": 0.0175,
135
+ "step": 900
136
+ },
137
+ {
138
+ "epoch": 0.6308100929614874,
139
+ "grad_norm": 0.00253055221401155,
140
+ "learning_rate": 3.952866388129104e-05,
141
+ "loss": 0.0047,
142
+ "step": 950
143
+ },
144
+ {
145
+ "epoch": 0.6640106241699867,
146
+ "grad_norm": 0.03604559600353241,
147
+ "learning_rate": 3.922829409495904e-05,
148
+ "loss": 0.016,
149
+ "step": 1000
150
+ },
151
+ {
152
+ "epoch": 0.6972111553784861,
153
+ "grad_norm": 0.006498202681541443,
154
+ "learning_rate": 3.892792430862704e-05,
155
+ "loss": 0.0055,
156
+ "step": 1050
157
+ },
158
+ {
159
+ "epoch": 0.7304116865869854,
160
+ "grad_norm": 0.11296769976615906,
161
+ "learning_rate": 3.862755452229504e-05,
162
+ "loss": 0.0122,
163
+ "step": 1100
164
+ },
165
+ {
166
+ "epoch": 0.7636122177954847,
167
+ "grad_norm": 0.0005851402529515326,
168
+ "learning_rate": 3.8327184735963046e-05,
169
+ "loss": 0.01,
170
+ "step": 1150
171
+ },
172
+ {
173
+ "epoch": 0.796812749003984,
174
+ "grad_norm": 0.018440622836351395,
175
+ "learning_rate": 3.8026814949631044e-05,
176
+ "loss": 0.0064,
177
+ "step": 1200
178
+ },
179
+ {
180
+ "epoch": 0.8300132802124834,
181
+ "grad_norm": 0.0023099363315850496,
182
+ "learning_rate": 3.772644516329905e-05,
183
+ "loss": 0.0011,
184
+ "step": 1250
185
+ },
186
+ {
187
+ "epoch": 0.8632138114209827,
188
+ "grad_norm": 0.07595626264810562,
189
+ "learning_rate": 3.742607537696705e-05,
190
+ "loss": 0.0156,
191
+ "step": 1300
192
+ },
193
+ {
194
+ "epoch": 0.896414342629482,
195
+ "grad_norm": 0.0008996099350042641,
196
+ "learning_rate": 3.7125705590635054e-05,
197
+ "loss": 0.0103,
198
+ "step": 1350
199
+ },
200
+ {
201
+ "epoch": 0.9296148738379814,
202
+ "grad_norm": 3.656134504126385e-05,
203
+ "learning_rate": 3.682533580430305e-05,
204
+ "loss": 0.0027,
205
+ "step": 1400
206
+ },
207
+ {
208
+ "epoch": 0.9628154050464808,
209
+ "grad_norm": 0.2666904032230377,
210
+ "learning_rate": 3.652496601797105e-05,
211
+ "loss": 0.0152,
212
+ "step": 1450
213
+ },
214
+ {
215
+ "epoch": 0.9960159362549801,
216
+ "grad_norm": 0.011590929701924324,
217
+ "learning_rate": 3.622459623163905e-05,
218
+ "loss": 0.0115,
219
+ "step": 1500
220
+ },
221
+ {
222
+ "epoch": 1.0,
223
+ "eval_accuracy": 0.9963024809160306,
224
+ "eval_f1": 0.9962997469825083,
225
+ "eval_loss": 0.02135350927710533,
226
+ "eval_precision": 0.9962971957079396,
227
+ "eval_recall": 0.9963024809160306,
228
+ "eval_runtime": 34.0647,
229
+ "eval_samples_per_second": 246.12,
230
+ "eval_steps_per_second": 7.691,
231
+ "step": 1506
232
+ }
233
+ ],
234
+ "logging_steps": 50,
235
+ "max_steps": 7530,
236
+ "num_input_tokens_seen": 0,
237
+ "num_train_epochs": 5,
238
+ "save_steps": 500,
239
+ "stateful_callbacks": {
240
+ "TrainerControl": {
241
+ "args": {
242
+ "should_epoch_stop": false,
243
+ "should_evaluate": false,
244
+ "should_log": false,
245
+ "should_save": true,
246
+ "should_training_stop": false
247
+ },
248
+ "attributes": {}
249
+ }
250
+ },
251
+ "total_flos": 1.641430544259072e+16,
252
+ "train_batch_size": 32,
253
+ "trial_name": null,
254
+ "trial_params": null
255
+ }
trial-0/checkpoint-1506/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f87e0989b8aabc63686d8b1c4f4f6463501f9b534fd10b5dda472e02e5c6d200
3
+ size 5368
trial-1/checkpoint-6022/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "answerdotai/ModernBERT-base",
3
+ "architectures": [
4
+ "ModernBertForSequenceClassification"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 50281,
9
+ "classifier_activation": "gelu",
10
+ "classifier_bias": false,
11
+ "classifier_dropout": 0.0,
12
+ "classifier_pooling": "mean",
13
+ "cls_token_id": 50281,
14
+ "decoder_bias": true,
15
+ "deterministic_flash_attn": false,
16
+ "embedding_dropout": 0.0,
17
+ "eos_token_id": 50282,
18
+ "global_attn_every_n_layers": 3,
19
+ "global_rope_theta": 160000.0,
20
+ "gradient_checkpointing": false,
21
+ "hidden_activation": "gelu",
22
+ "hidden_size": 768,
23
+ "initializer_cutoff_factor": 2.0,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 1152,
26
+ "layer_norm_eps": 1e-05,
27
+ "local_attention": 128,
28
+ "local_rope_theta": 10000.0,
29
+ "max_position_embeddings": 8192,
30
+ "mlp_bias": false,
31
+ "mlp_dropout": 0.0,
32
+ "model_type": "modernbert",
33
+ "norm_bias": false,
34
+ "norm_eps": 1e-05,
35
+ "num_attention_heads": 12,
36
+ "num_hidden_layers": 22,
37
+ "pad_token_id": 50283,
38
+ "position_embedding_type": "absolute",
39
+ "problem_type": "single_label_classification",
40
+ "reference_compile": true,
41
+ "sep_token_id": 50282,
42
+ "sparse_pred_ignore_index": -100,
43
+ "sparse_prediction": false,
44
+ "torch_dtype": "float32",
45
+ "transformers_version": "4.48.0.dev0",
46
+ "vocab_size": 50368
47
+ }
trial-1/checkpoint-6022/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9376e02caf20a3536db5adaec49e89c8583378974c975bdfa4e4fa72bb7ed87c
3
+ size 598439784
trial-1/checkpoint-6022/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f989a18c3b9f0cb969ade19c78b7d7d4405053c69000081f12d16f8076c4691
3
+ size 1196967418
trial-1/checkpoint-6022/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef
3
+ size 14244
trial-1/checkpoint-6022/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04bd594b0cd8e46cee28cfc34b0ba6a02854df28789c81eb4c180d9356f4de00
3
+ size 1064
trial-1/checkpoint-6022/trainer_state.json ADDED
@@ -0,0 +1,897 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.0445549376308918,
3
+ "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-1/checkpoint-6022",
4
+ "epoch": 2.0,
5
+ "eval_steps": 500,
6
+ "global_step": 6022,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.016605778811026237,
13
+ "grad_norm": 15.757351875305176,
14
+ "learning_rate": 2.4306427769118723e-06,
15
+ "loss": 0.6703,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 0.033211557622052475,
20
+ "grad_norm": 14.056926727294922,
21
+ "learning_rate": 2.425586942863882e-06,
22
+ "loss": 0.4736,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.04981733643307871,
27
+ "grad_norm": 15.678231239318848,
28
+ "learning_rate": 2.4205311088158915e-06,
29
+ "loss": 0.338,
30
+ "step": 150
31
+ },
32
+ {
33
+ "epoch": 0.06642311524410495,
34
+ "grad_norm": 4.84220552444458,
35
+ "learning_rate": 2.4154752747679013e-06,
36
+ "loss": 0.2931,
37
+ "step": 200
38
+ },
39
+ {
40
+ "epoch": 0.08302889405513118,
41
+ "grad_norm": 5.182389736175537,
42
+ "learning_rate": 2.4104194407199107e-06,
43
+ "loss": 0.251,
44
+ "step": 250
45
+ },
46
+ {
47
+ "epoch": 0.09963467286615742,
48
+ "grad_norm": 1.5187151432037354,
49
+ "learning_rate": 2.4053636066719205e-06,
50
+ "loss": 0.2133,
51
+ "step": 300
52
+ },
53
+ {
54
+ "epoch": 0.11624045167718366,
55
+ "grad_norm": 16.253589630126953,
56
+ "learning_rate": 2.40030777262393e-06,
57
+ "loss": 0.1518,
58
+ "step": 350
59
+ },
60
+ {
61
+ "epoch": 0.1328462304882099,
62
+ "grad_norm": 6.757865905761719,
63
+ "learning_rate": 2.3952519385759397e-06,
64
+ "loss": 0.1508,
65
+ "step": 400
66
+ },
67
+ {
68
+ "epoch": 0.14945200929923613,
69
+ "grad_norm": 2.119438886642456,
70
+ "learning_rate": 2.390196104527949e-06,
71
+ "loss": 0.1175,
72
+ "step": 450
73
+ },
74
+ {
75
+ "epoch": 0.16605778811026237,
76
+ "grad_norm": 15.932334899902344,
77
+ "learning_rate": 2.3851402704799585e-06,
78
+ "loss": 0.1401,
79
+ "step": 500
80
+ },
81
+ {
82
+ "epoch": 0.1826635669212886,
83
+ "grad_norm": 22.459735870361328,
84
+ "learning_rate": 2.3800844364319683e-06,
85
+ "loss": 0.1384,
86
+ "step": 550
87
+ },
88
+ {
89
+ "epoch": 0.19926934573231483,
90
+ "grad_norm": 10.65778923034668,
91
+ "learning_rate": 2.3750286023839777e-06,
92
+ "loss": 0.1179,
93
+ "step": 600
94
+ },
95
+ {
96
+ "epoch": 0.2158751245433411,
97
+ "grad_norm": 6.71965217590332,
98
+ "learning_rate": 2.3699727683359876e-06,
99
+ "loss": 0.0782,
100
+ "step": 650
101
+ },
102
+ {
103
+ "epoch": 0.23248090335436733,
104
+ "grad_norm": 3.6098344326019287,
105
+ "learning_rate": 2.364916934287997e-06,
106
+ "loss": 0.138,
107
+ "step": 700
108
+ },
109
+ {
110
+ "epoch": 0.24908668216539356,
111
+ "grad_norm": 2.3249447345733643,
112
+ "learning_rate": 2.3598611002400068e-06,
113
+ "loss": 0.1087,
114
+ "step": 750
115
+ },
116
+ {
117
+ "epoch": 0.2656924609764198,
118
+ "grad_norm": 15.047837257385254,
119
+ "learning_rate": 2.354805266192016e-06,
120
+ "loss": 0.0868,
121
+ "step": 800
122
+ },
123
+ {
124
+ "epoch": 0.282298239787446,
125
+ "grad_norm": 6.7322773933410645,
126
+ "learning_rate": 2.349749432144026e-06,
127
+ "loss": 0.0954,
128
+ "step": 850
129
+ },
130
+ {
131
+ "epoch": 0.29890401859847227,
132
+ "grad_norm": 12.954623222351074,
133
+ "learning_rate": 2.3446935980960354e-06,
134
+ "loss": 0.0689,
135
+ "step": 900
136
+ },
137
+ {
138
+ "epoch": 0.3155097974094985,
139
+ "grad_norm": 1.4312756061553955,
140
+ "learning_rate": 2.3396377640480448e-06,
141
+ "loss": 0.0908,
142
+ "step": 950
143
+ },
144
+ {
145
+ "epoch": 0.33211557622052473,
146
+ "grad_norm": 0.21316280961036682,
147
+ "learning_rate": 2.3345819300000546e-06,
148
+ "loss": 0.0766,
149
+ "step": 1000
150
+ },
151
+ {
152
+ "epoch": 0.348721355031551,
153
+ "grad_norm": 13.642809867858887,
154
+ "learning_rate": 2.329526095952064e-06,
155
+ "loss": 0.0533,
156
+ "step": 1050
157
+ },
158
+ {
159
+ "epoch": 0.3653271338425772,
160
+ "grad_norm": 14.525202751159668,
161
+ "learning_rate": 2.324470261904074e-06,
162
+ "loss": 0.0745,
163
+ "step": 1100
164
+ },
165
+ {
166
+ "epoch": 0.38193291265360346,
167
+ "grad_norm": 0.5210687518119812,
168
+ "learning_rate": 2.319414427856083e-06,
169
+ "loss": 0.0618,
170
+ "step": 1150
171
+ },
172
+ {
173
+ "epoch": 0.39853869146462967,
174
+ "grad_norm": 0.07292640954256058,
175
+ "learning_rate": 2.314358593808093e-06,
176
+ "loss": 0.0307,
177
+ "step": 1200
178
+ },
179
+ {
180
+ "epoch": 0.41514447027565593,
181
+ "grad_norm": 0.08236780017614365,
182
+ "learning_rate": 2.309302759760103e-06,
183
+ "loss": 0.0321,
184
+ "step": 1250
185
+ },
186
+ {
187
+ "epoch": 0.4317502490866822,
188
+ "grad_norm": 28.97471809387207,
189
+ "learning_rate": 2.304246925712112e-06,
190
+ "loss": 0.0748,
191
+ "step": 1300
192
+ },
193
+ {
194
+ "epoch": 0.4483560278977084,
195
+ "grad_norm": 0.4781515896320343,
196
+ "learning_rate": 2.2991910916641216e-06,
197
+ "loss": 0.0733,
198
+ "step": 1350
199
+ },
200
+ {
201
+ "epoch": 0.46496180670873466,
202
+ "grad_norm": 3.214794397354126,
203
+ "learning_rate": 2.2941352576161314e-06,
204
+ "loss": 0.0149,
205
+ "step": 1400
206
+ },
207
+ {
208
+ "epoch": 0.48156758551976087,
209
+ "grad_norm": 0.3289443850517273,
210
+ "learning_rate": 2.289079423568141e-06,
211
+ "loss": 0.0401,
212
+ "step": 1450
213
+ },
214
+ {
215
+ "epoch": 0.4981733643307871,
216
+ "grad_norm": 0.12368986010551453,
217
+ "learning_rate": 2.28402358952015e-06,
218
+ "loss": 0.0334,
219
+ "step": 1500
220
+ },
221
+ {
222
+ "epoch": 0.5147791431418134,
223
+ "grad_norm": 0.08283340185880661,
224
+ "learning_rate": 2.27896775547216e-06,
225
+ "loss": 0.0331,
226
+ "step": 1550
227
+ },
228
+ {
229
+ "epoch": 0.5313849219528396,
230
+ "grad_norm": 2.650063991546631,
231
+ "learning_rate": 2.2739119214241694e-06,
232
+ "loss": 0.0496,
233
+ "step": 1600
234
+ },
235
+ {
236
+ "epoch": 0.5479907007638658,
237
+ "grad_norm": 3.296297311782837,
238
+ "learning_rate": 2.2688560873761792e-06,
239
+ "loss": 0.0365,
240
+ "step": 1650
241
+ },
242
+ {
243
+ "epoch": 0.564596479574892,
244
+ "grad_norm": 0.032304324209690094,
245
+ "learning_rate": 2.263800253328189e-06,
246
+ "loss": 0.005,
247
+ "step": 1700
248
+ },
249
+ {
250
+ "epoch": 0.5812022583859183,
251
+ "grad_norm": 0.003552216337993741,
252
+ "learning_rate": 2.2587444192801985e-06,
253
+ "loss": 0.0183,
254
+ "step": 1750
255
+ },
256
+ {
257
+ "epoch": 0.5978080371969445,
258
+ "grad_norm": 0.0315885953605175,
259
+ "learning_rate": 2.253688585232208e-06,
260
+ "loss": 0.0184,
261
+ "step": 1800
262
+ },
263
+ {
264
+ "epoch": 0.6144138160079707,
265
+ "grad_norm": 0.004702410195022821,
266
+ "learning_rate": 2.2486327511842177e-06,
267
+ "loss": 0.0346,
268
+ "step": 1850
269
+ },
270
+ {
271
+ "epoch": 0.631019594818997,
272
+ "grad_norm": 0.07862639427185059,
273
+ "learning_rate": 2.243576917136227e-06,
274
+ "loss": 0.0296,
275
+ "step": 1900
276
+ },
277
+ {
278
+ "epoch": 0.6476253736300233,
279
+ "grad_norm": 0.3578585982322693,
280
+ "learning_rate": 2.2385210830882364e-06,
281
+ "loss": 0.0266,
282
+ "step": 1950
283
+ },
284
+ {
285
+ "epoch": 0.6642311524410495,
286
+ "grad_norm": 0.045335959643125534,
287
+ "learning_rate": 2.2334652490402463e-06,
288
+ "loss": 0.032,
289
+ "step": 2000
290
+ },
291
+ {
292
+ "epoch": 0.6808369312520757,
293
+ "grad_norm": 1.6869137287139893,
294
+ "learning_rate": 2.2284094149922557e-06,
295
+ "loss": 0.0297,
296
+ "step": 2050
297
+ },
298
+ {
299
+ "epoch": 0.697442710063102,
300
+ "grad_norm": 0.6017621755599976,
301
+ "learning_rate": 2.2233535809442655e-06,
302
+ "loss": 0.0119,
303
+ "step": 2100
304
+ },
305
+ {
306
+ "epoch": 0.7140484888741282,
307
+ "grad_norm": 0.13145552575588226,
308
+ "learning_rate": 2.2182977468962753e-06,
309
+ "loss": 0.0157,
310
+ "step": 2150
311
+ },
312
+ {
313
+ "epoch": 0.7306542676851544,
314
+ "grad_norm": 0.00971242692321539,
315
+ "learning_rate": 2.2132419128482847e-06,
316
+ "loss": 0.0099,
317
+ "step": 2200
318
+ },
319
+ {
320
+ "epoch": 0.7472600464961807,
321
+ "grad_norm": 0.5801131725311279,
322
+ "learning_rate": 2.208186078800294e-06,
323
+ "loss": 0.0235,
324
+ "step": 2250
325
+ },
326
+ {
327
+ "epoch": 0.7638658253072069,
328
+ "grad_norm": 0.008363746106624603,
329
+ "learning_rate": 2.203130244752304e-06,
330
+ "loss": 0.0275,
331
+ "step": 2300
332
+ },
333
+ {
334
+ "epoch": 0.7804716041182331,
335
+ "grad_norm": 0.23013177514076233,
336
+ "learning_rate": 2.1980744107043133e-06,
337
+ "loss": 0.0022,
338
+ "step": 2350
339
+ },
340
+ {
341
+ "epoch": 0.7970773829292593,
342
+ "grad_norm": 0.044313572347164154,
343
+ "learning_rate": 2.1930185766563227e-06,
344
+ "loss": 0.0185,
345
+ "step": 2400
346
+ },
347
+ {
348
+ "epoch": 0.8136831617402857,
349
+ "grad_norm": 0.008519169874489307,
350
+ "learning_rate": 2.1879627426083325e-06,
351
+ "loss": 0.0023,
352
+ "step": 2450
353
+ },
354
+ {
355
+ "epoch": 0.8302889405513119,
356
+ "grad_norm": 0.0008576350519433618,
357
+ "learning_rate": 2.182906908560342e-06,
358
+ "loss": 0.0062,
359
+ "step": 2500
360
+ },
361
+ {
362
+ "epoch": 0.8468947193623381,
363
+ "grad_norm": 0.56068354845047,
364
+ "learning_rate": 2.1778510745123517e-06,
365
+ "loss": 0.0106,
366
+ "step": 2550
367
+ },
368
+ {
369
+ "epoch": 0.8635004981733644,
370
+ "grad_norm": 33.770652770996094,
371
+ "learning_rate": 2.1727952404643615e-06,
372
+ "loss": 0.0298,
373
+ "step": 2600
374
+ },
375
+ {
376
+ "epoch": 0.8801062769843906,
377
+ "grad_norm": 0.0006891911034472287,
378
+ "learning_rate": 2.167739406416371e-06,
379
+ "loss": 0.0046,
380
+ "step": 2650
381
+ },
382
+ {
383
+ "epoch": 0.8967120557954168,
384
+ "grad_norm": 0.000691475928761065,
385
+ "learning_rate": 2.1626835723683803e-06,
386
+ "loss": 0.0014,
387
+ "step": 2700
388
+ },
389
+ {
390
+ "epoch": 0.913317834606443,
391
+ "grad_norm": 0.022216275334358215,
392
+ "learning_rate": 2.15762773832039e-06,
393
+ "loss": 0.0152,
394
+ "step": 2750
395
+ },
396
+ {
397
+ "epoch": 0.9299236134174693,
398
+ "grad_norm": 0.0004267705953679979,
399
+ "learning_rate": 2.1525719042723995e-06,
400
+ "loss": 0.0117,
401
+ "step": 2800
402
+ },
403
+ {
404
+ "epoch": 0.9465293922284955,
405
+ "grad_norm": 0.016712836921215057,
406
+ "learning_rate": 2.147516070224409e-06,
407
+ "loss": 0.0009,
408
+ "step": 2850
409
+ },
410
+ {
411
+ "epoch": 0.9631351710395217,
412
+ "grad_norm": 23.74860382080078,
413
+ "learning_rate": 2.1424602361764187e-06,
414
+ "loss": 0.0233,
415
+ "step": 2900
416
+ },
417
+ {
418
+ "epoch": 0.9797409498505479,
419
+ "grad_norm": 0.0039037028327584267,
420
+ "learning_rate": 2.137404402128428e-06,
421
+ "loss": 0.0193,
422
+ "step": 2950
423
+ },
424
+ {
425
+ "epoch": 0.9963467286615743,
426
+ "grad_norm": 0.0023961260449141264,
427
+ "learning_rate": 2.132348568080438e-06,
428
+ "loss": 0.0068,
429
+ "step": 3000
430
+ },
431
+ {
432
+ "epoch": 1.0,
433
+ "eval_accuracy": 0.9921278625954199,
434
+ "eval_f1": 0.9921278625954199,
435
+ "eval_loss": 0.046909503638744354,
436
+ "eval_precision": 0.9921278625954199,
437
+ "eval_recall": 0.9921278625954199,
438
+ "eval_runtime": 36.762,
439
+ "eval_samples_per_second": 228.061,
440
+ "eval_steps_per_second": 14.254,
441
+ "step": 3011
442
+ },
443
+ {
444
+ "epoch": 1.0129525074726005,
445
+ "grad_norm": 0.0033601378090679646,
446
+ "learning_rate": 2.1272927340324478e-06,
447
+ "loss": 0.0005,
448
+ "step": 3050
449
+ },
450
+ {
451
+ "epoch": 1.0295582862836268,
452
+ "grad_norm": 0.038166940212249756,
453
+ "learning_rate": 2.122236899984457e-06,
454
+ "loss": 0.0002,
455
+ "step": 3100
456
+ },
457
+ {
458
+ "epoch": 1.0461640650946529,
459
+ "grad_norm": 0.0003456630220171064,
460
+ "learning_rate": 2.1171810659364666e-06,
461
+ "loss": 0.0139,
462
+ "step": 3150
463
+ },
464
+ {
465
+ "epoch": 1.0627698439056792,
466
+ "grad_norm": 0.004587268922477961,
467
+ "learning_rate": 2.1121252318884764e-06,
468
+ "loss": 0.0001,
469
+ "step": 3200
470
+ },
471
+ {
472
+ "epoch": 1.0793756227167055,
473
+ "grad_norm": 0.08502045273780823,
474
+ "learning_rate": 2.1070693978404858e-06,
475
+ "loss": 0.0216,
476
+ "step": 3250
477
+ },
478
+ {
479
+ "epoch": 1.0959814015277316,
480
+ "grad_norm": 0.10945820808410645,
481
+ "learning_rate": 2.102013563792495e-06,
482
+ "loss": 0.0256,
483
+ "step": 3300
484
+ },
485
+ {
486
+ "epoch": 1.112587180338758,
487
+ "grad_norm": 0.03236968442797661,
488
+ "learning_rate": 2.096957729744505e-06,
489
+ "loss": 0.005,
490
+ "step": 3350
491
+ },
492
+ {
493
+ "epoch": 1.1291929591497842,
494
+ "grad_norm": 0.007731316145509481,
495
+ "learning_rate": 2.0919018956965144e-06,
496
+ "loss": 0.0101,
497
+ "step": 3400
498
+ },
499
+ {
500
+ "epoch": 1.1457987379608103,
501
+ "grad_norm": 0.00674546230584383,
502
+ "learning_rate": 2.086846061648524e-06,
503
+ "loss": 0.0051,
504
+ "step": 3450
505
+ },
506
+ {
507
+ "epoch": 1.1624045167718366,
508
+ "grad_norm": 0.004380326252430677,
509
+ "learning_rate": 2.081790227600534e-06,
510
+ "loss": 0.0039,
511
+ "step": 3500
512
+ },
513
+ {
514
+ "epoch": 1.1790102955828627,
515
+ "grad_norm": 0.031456008553504944,
516
+ "learning_rate": 2.0767343935525434e-06,
517
+ "loss": 0.0001,
518
+ "step": 3550
519
+ },
520
+ {
521
+ "epoch": 1.195616074393889,
522
+ "grad_norm": 0.017602458596229553,
523
+ "learning_rate": 2.071678559504553e-06,
524
+ "loss": 0.006,
525
+ "step": 3600
526
+ },
527
+ {
528
+ "epoch": 1.2122218532049154,
529
+ "grad_norm": 0.009589639492332935,
530
+ "learning_rate": 2.0666227254565626e-06,
531
+ "loss": 0.001,
532
+ "step": 3650
533
+ },
534
+ {
535
+ "epoch": 1.2288276320159415,
536
+ "grad_norm": 0.003254746785387397,
537
+ "learning_rate": 2.061566891408572e-06,
538
+ "loss": 0.0,
539
+ "step": 3700
540
+ },
541
+ {
542
+ "epoch": 1.2454334108269678,
543
+ "grad_norm": 0.0011986729223281145,
544
+ "learning_rate": 2.056511057360582e-06,
545
+ "loss": 0.0126,
546
+ "step": 3750
547
+ },
548
+ {
549
+ "epoch": 1.2620391896379939,
550
+ "grad_norm": 0.006293583195656538,
551
+ "learning_rate": 2.0514552233125912e-06,
552
+ "loss": 0.0006,
553
+ "step": 3800
554
+ },
555
+ {
556
+ "epoch": 1.2786449684490202,
557
+ "grad_norm": 0.11370380967855453,
558
+ "learning_rate": 2.0463993892646006e-06,
559
+ "loss": 0.0252,
560
+ "step": 3850
561
+ },
562
+ {
563
+ "epoch": 1.2952507472600465,
564
+ "grad_norm": 0.0018469190690666437,
565
+ "learning_rate": 2.0413435552166104e-06,
566
+ "loss": 0.0004,
567
+ "step": 3900
568
+ },
569
+ {
570
+ "epoch": 1.3118565260710726,
571
+ "grad_norm": 0.0002411604655208066,
572
+ "learning_rate": 2.0362877211686202e-06,
573
+ "loss": 0.003,
574
+ "step": 3950
575
+ },
576
+ {
577
+ "epoch": 1.328462304882099,
578
+ "grad_norm": 4.065009852638468e-05,
579
+ "learning_rate": 2.0312318871206296e-06,
580
+ "loss": 0.0165,
581
+ "step": 4000
582
+ },
583
+ {
584
+ "epoch": 1.3450680836931252,
585
+ "grad_norm": 0.005062599666416645,
586
+ "learning_rate": 2.0261760530726395e-06,
587
+ "loss": 0.0028,
588
+ "step": 4050
589
+ },
590
+ {
591
+ "epoch": 1.3616738625041513,
592
+ "grad_norm": 0.017400013282895088,
593
+ "learning_rate": 2.021120219024649e-06,
594
+ "loss": 0.001,
595
+ "step": 4100
596
+ },
597
+ {
598
+ "epoch": 1.3782796413151777,
599
+ "grad_norm": 0.05683843046426773,
600
+ "learning_rate": 2.0160643849766582e-06,
601
+ "loss": 0.0124,
602
+ "step": 4150
603
+ },
604
+ {
605
+ "epoch": 1.394885420126204,
606
+ "grad_norm": 0.0027029893826693296,
607
+ "learning_rate": 2.011008550928668e-06,
608
+ "loss": 0.0003,
609
+ "step": 4200
610
+ },
611
+ {
612
+ "epoch": 1.41149119893723,
613
+ "grad_norm": 0.002034110017120838,
614
+ "learning_rate": 2.0059527168806775e-06,
615
+ "loss": 0.0073,
616
+ "step": 4250
617
+ },
618
+ {
619
+ "epoch": 1.4280969777482564,
620
+ "grad_norm": 0.001398180378600955,
621
+ "learning_rate": 2.000896882832687e-06,
622
+ "loss": 0.0044,
623
+ "step": 4300
624
+ },
625
+ {
626
+ "epoch": 1.4447027565592827,
627
+ "grad_norm": 0.00037716259248554707,
628
+ "learning_rate": 1.9958410487846967e-06,
629
+ "loss": 0.0228,
630
+ "step": 4350
631
+ },
632
+ {
633
+ "epoch": 1.4613085353703088,
634
+ "grad_norm": 0.015627387911081314,
635
+ "learning_rate": 1.9907852147367065e-06,
636
+ "loss": 0.0114,
637
+ "step": 4400
638
+ },
639
+ {
640
+ "epoch": 1.4779143141813351,
641
+ "grad_norm": 0.008964600041508675,
642
+ "learning_rate": 1.985729380688716e-06,
643
+ "loss": 0.0032,
644
+ "step": 4450
645
+ },
646
+ {
647
+ "epoch": 1.4945200929923614,
648
+ "grad_norm": 0.003252738853916526,
649
+ "learning_rate": 1.9806735466407257e-06,
650
+ "loss": 0.0082,
651
+ "step": 4500
652
+ },
653
+ {
654
+ "epoch": 1.5111258718033875,
655
+ "grad_norm": 0.00012037971464451402,
656
+ "learning_rate": 1.975617712592735e-06,
657
+ "loss": 0.0001,
658
+ "step": 4550
659
+ },
660
+ {
661
+ "epoch": 1.5277316506144138,
662
+ "grad_norm": 0.010974590666592121,
663
+ "learning_rate": 1.9705618785447445e-06,
664
+ "loss": 0.0,
665
+ "step": 4600
666
+ },
667
+ {
668
+ "epoch": 1.5443374294254402,
669
+ "grad_norm": 0.08398176729679108,
670
+ "learning_rate": 1.9655060444967543e-06,
671
+ "loss": 0.0002,
672
+ "step": 4650
673
+ },
674
+ {
675
+ "epoch": 1.5609432082364663,
676
+ "grad_norm": 0.03629281371831894,
677
+ "learning_rate": 1.9604502104487637e-06,
678
+ "loss": 0.006,
679
+ "step": 4700
680
+ },
681
+ {
682
+ "epoch": 1.5775489870474926,
683
+ "grad_norm": 0.00034110501292161644,
684
+ "learning_rate": 1.955394376400773e-06,
685
+ "loss": 0.0003,
686
+ "step": 4750
687
+ },
688
+ {
689
+ "epoch": 1.594154765858519,
690
+ "grad_norm": 0.0027959852013736963,
691
+ "learning_rate": 1.950338542352783e-06,
692
+ "loss": 0.0,
693
+ "step": 4800
694
+ },
695
+ {
696
+ "epoch": 1.610760544669545,
697
+ "grad_norm": 0.0001677741383900866,
698
+ "learning_rate": 1.9452827083047927e-06,
699
+ "loss": 0.0023,
700
+ "step": 4850
701
+ },
702
+ {
703
+ "epoch": 1.627366323480571,
704
+ "grad_norm": 0.055583104491233826,
705
+ "learning_rate": 1.940226874256802e-06,
706
+ "loss": 0.0225,
707
+ "step": 4900
708
+ },
709
+ {
710
+ "epoch": 1.6439721022915976,
711
+ "grad_norm": 8.664117194712162e-05,
712
+ "learning_rate": 1.935171040208812e-06,
713
+ "loss": 0.0009,
714
+ "step": 4950
715
+ },
716
+ {
717
+ "epoch": 1.6605778811026237,
718
+ "grad_norm": 0.0017323939828202128,
719
+ "learning_rate": 1.9301152061608213e-06,
720
+ "loss": 0.008,
721
+ "step": 5000
722
+ },
723
+ {
724
+ "epoch": 1.6771836599136498,
725
+ "grad_norm": 0.0034425491467118263,
726
+ "learning_rate": 1.9250593721128307e-06,
727
+ "loss": 0.0,
728
+ "step": 5050
729
+ },
730
+ {
731
+ "epoch": 1.6937894387246761,
732
+ "grad_norm": 6.076216959627345e-05,
733
+ "learning_rate": 1.9200035380648405e-06,
734
+ "loss": 0.0041,
735
+ "step": 5100
736
+ },
737
+ {
738
+ "epoch": 1.7103952175357025,
739
+ "grad_norm": 0.0018082900205627084,
740
+ "learning_rate": 1.91494770401685e-06,
741
+ "loss": 0.0017,
742
+ "step": 5150
743
+ },
744
+ {
745
+ "epoch": 1.7270009963467285,
746
+ "grad_norm": 0.008552160114049911,
747
+ "learning_rate": 1.9098918699688593e-06,
748
+ "loss": 0.0137,
749
+ "step": 5200
750
+ },
751
+ {
752
+ "epoch": 1.7436067751577549,
753
+ "grad_norm": 0.08908296376466751,
754
+ "learning_rate": 1.9048360359208694e-06,
755
+ "loss": 0.0092,
756
+ "step": 5250
757
+ },
758
+ {
759
+ "epoch": 1.7602125539687812,
760
+ "grad_norm": 0.002973488997668028,
761
+ "learning_rate": 1.8997802018728788e-06,
762
+ "loss": 0.0002,
763
+ "step": 5300
764
+ },
765
+ {
766
+ "epoch": 1.7768183327798073,
767
+ "grad_norm": 0.005116044543683529,
768
+ "learning_rate": 1.8947243678248884e-06,
769
+ "loss": 0.0079,
770
+ "step": 5350
771
+ },
772
+ {
773
+ "epoch": 1.7934241115908336,
774
+ "grad_norm": 0.002092874376103282,
775
+ "learning_rate": 1.889668533776898e-06,
776
+ "loss": 0.0,
777
+ "step": 5400
778
+ },
779
+ {
780
+ "epoch": 1.81002989040186,
781
+ "grad_norm": 0.0070649790577590466,
782
+ "learning_rate": 1.8846126997289076e-06,
783
+ "loss": 0.0,
784
+ "step": 5450
785
+ },
786
+ {
787
+ "epoch": 1.826635669212886,
788
+ "grad_norm": 0.001974167302250862,
789
+ "learning_rate": 1.879556865680917e-06,
790
+ "loss": 0.016,
791
+ "step": 5500
792
+ },
793
+ {
794
+ "epoch": 1.8432414480239123,
795
+ "grad_norm": 0.0012006360339000821,
796
+ "learning_rate": 1.8745010316329268e-06,
797
+ "loss": 0.0,
798
+ "step": 5550
799
+ },
800
+ {
801
+ "epoch": 1.8598472268349386,
802
+ "grad_norm": 0.006318301893770695,
803
+ "learning_rate": 1.8694451975849362e-06,
804
+ "loss": 0.0,
805
+ "step": 5600
806
+ },
807
+ {
808
+ "epoch": 1.8764530056459647,
809
+ "grad_norm": 0.0020722977351397276,
810
+ "learning_rate": 1.8643893635369458e-06,
811
+ "loss": 0.0104,
812
+ "step": 5650
813
+ },
814
+ {
815
+ "epoch": 1.893058784456991,
816
+ "grad_norm": 0.0874456912279129,
817
+ "learning_rate": 1.8593335294889556e-06,
818
+ "loss": 0.0023,
819
+ "step": 5700
820
+ },
821
+ {
822
+ "epoch": 1.9096645632680174,
823
+ "grad_norm": 0.00042386740096844733,
824
+ "learning_rate": 1.854277695440965e-06,
825
+ "loss": 0.0105,
826
+ "step": 5750
827
+ },
828
+ {
829
+ "epoch": 1.9262703420790435,
830
+ "grad_norm": 0.05140538513660431,
831
+ "learning_rate": 1.8492218613929746e-06,
832
+ "loss": 0.0008,
833
+ "step": 5800
834
+ },
835
+ {
836
+ "epoch": 1.9428761208900698,
837
+ "grad_norm": 0.00046465068589895964,
838
+ "learning_rate": 1.8441660273449842e-06,
839
+ "loss": 0.0176,
840
+ "step": 5850
841
+ },
842
+ {
843
+ "epoch": 1.959481899701096,
844
+ "grad_norm": 0.001875279936939478,
845
+ "learning_rate": 1.8391101932969938e-06,
846
+ "loss": 0.0002,
847
+ "step": 5900
848
+ },
849
+ {
850
+ "epoch": 1.9760876785121222,
851
+ "grad_norm": 0.0012590339174494147,
852
+ "learning_rate": 1.8340543592490032e-06,
853
+ "loss": 0.001,
854
+ "step": 5950
855
+ },
856
+ {
857
+ "epoch": 1.9926934573231485,
858
+ "grad_norm": 25.133811950683594,
859
+ "learning_rate": 1.828998525201013e-06,
860
+ "loss": 0.0229,
861
+ "step": 6000
862
+ },
863
+ {
864
+ "epoch": 2.0,
865
+ "eval_accuracy": 0.995706106870229,
866
+ "eval_f1": 0.9956269879098661,
867
+ "eval_loss": 0.0445549376308918,
868
+ "eval_precision": 0.9956596696711074,
869
+ "eval_recall": 0.995706106870229,
870
+ "eval_runtime": 38.3077,
871
+ "eval_samples_per_second": 218.859,
872
+ "eval_steps_per_second": 13.679,
873
+ "step": 6022
874
+ }
875
+ ],
876
+ "logging_steps": 50,
877
+ "max_steps": 24088,
878
+ "num_input_tokens_seen": 0,
879
+ "num_train_epochs": 8,
880
+ "save_steps": 500,
881
+ "stateful_callbacks": {
882
+ "TrainerControl": {
883
+ "args": {
884
+ "should_epoch_stop": false,
885
+ "should_evaluate": false,
886
+ "should_log": false,
887
+ "should_save": true,
888
+ "should_training_stop": false
889
+ },
890
+ "attributes": {}
891
+ }
892
+ },
893
+ "total_flos": 3.282861088518144e+16,
894
+ "train_batch_size": 16,
895
+ "trial_name": null,
896
+ "trial_params": null
897
+ }
trial-1/checkpoint-6022/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:161830f01fe4451cf2afb08516c24e569c5b229b44b735c51814ae17b5494e10
3
+ size 5368
trial-2/checkpoint-6022/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "answerdotai/ModernBERT-base",
3
+ "architectures": [
4
+ "ModernBertForSequenceClassification"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 50281,
9
+ "classifier_activation": "gelu",
10
+ "classifier_bias": false,
11
+ "classifier_dropout": 0.0,
12
+ "classifier_pooling": "mean",
13
+ "cls_token_id": 50281,
14
+ "decoder_bias": true,
15
+ "deterministic_flash_attn": false,
16
+ "embedding_dropout": 0.0,
17
+ "eos_token_id": 50282,
18
+ "global_attn_every_n_layers": 3,
19
+ "global_rope_theta": 160000.0,
20
+ "gradient_checkpointing": false,
21
+ "hidden_activation": "gelu",
22
+ "hidden_size": 768,
23
+ "initializer_cutoff_factor": 2.0,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 1152,
26
+ "layer_norm_eps": 1e-05,
27
+ "local_attention": 128,
28
+ "local_rope_theta": 10000.0,
29
+ "max_position_embeddings": 8192,
30
+ "mlp_bias": false,
31
+ "mlp_dropout": 0.0,
32
+ "model_type": "modernbert",
33
+ "norm_bias": false,
34
+ "norm_eps": 1e-05,
35
+ "num_attention_heads": 12,
36
+ "num_hidden_layers": 22,
37
+ "pad_token_id": 50283,
38
+ "position_embedding_type": "absolute",
39
+ "problem_type": "single_label_classification",
40
+ "reference_compile": true,
41
+ "sep_token_id": 50282,
42
+ "sparse_pred_ignore_index": -100,
43
+ "sparse_prediction": false,
44
+ "torch_dtype": "float32",
45
+ "transformers_version": "4.48.0.dev0",
46
+ "vocab_size": 50368
47
+ }
trial-2/checkpoint-6022/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33d8242e8a21a76a0ad8b21949fe7bd68e94de5ce2da543a151336909fcb8e83
3
+ size 598439784
trial-2/checkpoint-6022/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c89405c1def95fb7d1e0ff7deac188ca136134ebd620d1451c9f0d4ed557d77a
3
+ size 1196967418
trial-2/checkpoint-6022/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef
3
+ size 14244
trial-2/checkpoint-6022/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:daebe5b6f96508652ee77aa623e80e4943a4ab7b8acffe2720aa77d58c2624f9
3
+ size 1064
trial-2/checkpoint-6022/trainer_state.json ADDED
@@ -0,0 +1,897 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.031979888677597046,
3
+ "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-2/checkpoint-6022",
4
+ "epoch": 2.0,
5
+ "eval_steps": 500,
6
+ "global_step": 6022,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.016605778811026237,
13
+ "grad_norm": 21.788597106933594,
14
+ "learning_rate": 5.429575351871404e-06,
15
+ "loss": 0.5789,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 0.033211557622052475,
20
+ "grad_norm": 20.038349151611328,
21
+ "learning_rate": 5.416664391316233e-06,
22
+ "loss": 0.37,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.04981733643307871,
27
+ "grad_norm": 23.927526473999023,
28
+ "learning_rate": 5.403753430761063e-06,
29
+ "loss": 0.25,
30
+ "step": 150
31
+ },
32
+ {
33
+ "epoch": 0.06642311524410495,
34
+ "grad_norm": 4.1712799072265625,
35
+ "learning_rate": 5.390842470205893e-06,
36
+ "loss": 0.1921,
37
+ "step": 200
38
+ },
39
+ {
40
+ "epoch": 0.08302889405513118,
41
+ "grad_norm": 6.138601303100586,
42
+ "learning_rate": 5.3779315096507225e-06,
43
+ "loss": 0.1365,
44
+ "step": 250
45
+ },
46
+ {
47
+ "epoch": 0.09963467286615742,
48
+ "grad_norm": 0.9431160092353821,
49
+ "learning_rate": 5.3650205490955514e-06,
50
+ "loss": 0.1473,
51
+ "step": 300
52
+ },
53
+ {
54
+ "epoch": 0.11624045167718366,
55
+ "grad_norm": 25.303245544433594,
56
+ "learning_rate": 5.352109588540381e-06,
57
+ "loss": 0.0875,
58
+ "step": 350
59
+ },
60
+ {
61
+ "epoch": 0.1328462304882099,
62
+ "grad_norm": 14.83379077911377,
63
+ "learning_rate": 5.33919862798521e-06,
64
+ "loss": 0.111,
65
+ "step": 400
66
+ },
67
+ {
68
+ "epoch": 0.14945200929923613,
69
+ "grad_norm": 0.2346535325050354,
70
+ "learning_rate": 5.32628766743004e-06,
71
+ "loss": 0.0722,
72
+ "step": 450
73
+ },
74
+ {
75
+ "epoch": 0.16605778811026237,
76
+ "grad_norm": 19.045169830322266,
77
+ "learning_rate": 5.31337670687487e-06,
78
+ "loss": 0.1236,
79
+ "step": 500
80
+ },
81
+ {
82
+ "epoch": 0.1826635669212886,
83
+ "grad_norm": 10.871609687805176,
84
+ "learning_rate": 5.300465746319699e-06,
85
+ "loss": 0.1018,
86
+ "step": 550
87
+ },
88
+ {
89
+ "epoch": 0.19926934573231483,
90
+ "grad_norm": 8.278830528259277,
91
+ "learning_rate": 5.287554785764528e-06,
92
+ "loss": 0.0608,
93
+ "step": 600
94
+ },
95
+ {
96
+ "epoch": 0.2158751245433411,
97
+ "grad_norm": 3.4486818313598633,
98
+ "learning_rate": 5.274643825209358e-06,
99
+ "loss": 0.0684,
100
+ "step": 650
101
+ },
102
+ {
103
+ "epoch": 0.23248090335436733,
104
+ "grad_norm": 9.789453506469727,
105
+ "learning_rate": 5.261732864654187e-06,
106
+ "loss": 0.0826,
107
+ "step": 700
108
+ },
109
+ {
110
+ "epoch": 0.24908668216539356,
111
+ "grad_norm": 0.013454285450279713,
112
+ "learning_rate": 5.248821904099017e-06,
113
+ "loss": 0.0672,
114
+ "step": 750
115
+ },
116
+ {
117
+ "epoch": 0.2656924609764198,
118
+ "grad_norm": 0.8878294825553894,
119
+ "learning_rate": 5.2359109435438465e-06,
120
+ "loss": 0.0472,
121
+ "step": 800
122
+ },
123
+ {
124
+ "epoch": 0.282298239787446,
125
+ "grad_norm": 15.41006088256836,
126
+ "learning_rate": 5.222999982988676e-06,
127
+ "loss": 0.0616,
128
+ "step": 850
129
+ },
130
+ {
131
+ "epoch": 0.29890401859847227,
132
+ "grad_norm": 0.04324938729405403,
133
+ "learning_rate": 5.210089022433506e-06,
134
+ "loss": 0.0215,
135
+ "step": 900
136
+ },
137
+ {
138
+ "epoch": 0.3155097974094985,
139
+ "grad_norm": 0.011849366128444672,
140
+ "learning_rate": 5.197178061878335e-06,
141
+ "loss": 0.0398,
142
+ "step": 950
143
+ },
144
+ {
145
+ "epoch": 0.33211557622052473,
146
+ "grad_norm": 0.0020897299982607365,
147
+ "learning_rate": 5.184267101323165e-06,
148
+ "loss": 0.0294,
149
+ "step": 1000
150
+ },
151
+ {
152
+ "epoch": 0.348721355031551,
153
+ "grad_norm": 0.00038467388367280364,
154
+ "learning_rate": 5.171356140767994e-06,
155
+ "loss": 0.0328,
156
+ "step": 1050
157
+ },
158
+ {
159
+ "epoch": 0.3653271338425772,
160
+ "grad_norm": 0.0022064056247472763,
161
+ "learning_rate": 5.158445180212823e-06,
162
+ "loss": 0.0216,
163
+ "step": 1100
164
+ },
165
+ {
166
+ "epoch": 0.38193291265360346,
167
+ "grad_norm": 0.012603014707565308,
168
+ "learning_rate": 5.145534219657653e-06,
169
+ "loss": 0.0293,
170
+ "step": 1150
171
+ },
172
+ {
173
+ "epoch": 0.39853869146462967,
174
+ "grad_norm": 0.002970542525872588,
175
+ "learning_rate": 5.132623259102483e-06,
176
+ "loss": 0.0133,
177
+ "step": 1200
178
+ },
179
+ {
180
+ "epoch": 0.41514447027565593,
181
+ "grad_norm": 0.09289965778589249,
182
+ "learning_rate": 5.119712298547312e-06,
183
+ "loss": 0.0189,
184
+ "step": 1250
185
+ },
186
+ {
187
+ "epoch": 0.4317502490866822,
188
+ "grad_norm": 0.030116688460111618,
189
+ "learning_rate": 5.106801337992142e-06,
190
+ "loss": 0.0266,
191
+ "step": 1300
192
+ },
193
+ {
194
+ "epoch": 0.4483560278977084,
195
+ "grad_norm": 23.291847229003906,
196
+ "learning_rate": 5.0938903774369705e-06,
197
+ "loss": 0.0378,
198
+ "step": 1350
199
+ },
200
+ {
201
+ "epoch": 0.46496180670873466,
202
+ "grad_norm": 0.00580954784527421,
203
+ "learning_rate": 5.0809794168818e-06,
204
+ "loss": 0.0002,
205
+ "step": 1400
206
+ },
207
+ {
208
+ "epoch": 0.48156758551976087,
209
+ "grad_norm": 0.0036250711418688297,
210
+ "learning_rate": 5.06806845632663e-06,
211
+ "loss": 0.0297,
212
+ "step": 1450
213
+ },
214
+ {
215
+ "epoch": 0.4981733643307871,
216
+ "grad_norm": 0.0013630707981064916,
217
+ "learning_rate": 5.05515749577146e-06,
218
+ "loss": 0.0114,
219
+ "step": 1500
220
+ },
221
+ {
222
+ "epoch": 0.5147791431418134,
223
+ "grad_norm": 0.025447094812989235,
224
+ "learning_rate": 5.042246535216289e-06,
225
+ "loss": 0.0019,
226
+ "step": 1550
227
+ },
228
+ {
229
+ "epoch": 0.5313849219528396,
230
+ "grad_norm": 18.81841468811035,
231
+ "learning_rate": 5.0293355746611185e-06,
232
+ "loss": 0.0286,
233
+ "step": 1600
234
+ },
235
+ {
236
+ "epoch": 0.5479907007638658,
237
+ "grad_norm": 0.0033424277789890766,
238
+ "learning_rate": 5.016424614105948e-06,
239
+ "loss": 0.0393,
240
+ "step": 1650
241
+ },
242
+ {
243
+ "epoch": 0.564596479574892,
244
+ "grad_norm": 0.039123374968767166,
245
+ "learning_rate": 5.003513653550777e-06,
246
+ "loss": 0.0186,
247
+ "step": 1700
248
+ },
249
+ {
250
+ "epoch": 0.5812022583859183,
251
+ "grad_norm": 0.0005275913863442838,
252
+ "learning_rate": 4.990602692995607e-06,
253
+ "loss": 0.0003,
254
+ "step": 1750
255
+ },
256
+ {
257
+ "epoch": 0.5978080371969445,
258
+ "grad_norm": 0.005070064682513475,
259
+ "learning_rate": 4.977691732440437e-06,
260
+ "loss": 0.01,
261
+ "step": 1800
262
+ },
263
+ {
264
+ "epoch": 0.6144138160079707,
265
+ "grad_norm": 0.003932475112378597,
266
+ "learning_rate": 4.9647807718852664e-06,
267
+ "loss": 0.0222,
268
+ "step": 1850
269
+ },
270
+ {
271
+ "epoch": 0.631019594818997,
272
+ "grad_norm": 0.6544032692909241,
273
+ "learning_rate": 4.951869811330095e-06,
274
+ "loss": 0.0138,
275
+ "step": 1900
276
+ },
277
+ {
278
+ "epoch": 0.6476253736300233,
279
+ "grad_norm": 0.008768323808908463,
280
+ "learning_rate": 4.938958850774925e-06,
281
+ "loss": 0.0056,
282
+ "step": 1950
283
+ },
284
+ {
285
+ "epoch": 0.6642311524410495,
286
+ "grad_norm": 0.0021180976182222366,
287
+ "learning_rate": 4.926047890219754e-06,
288
+ "loss": 0.0049,
289
+ "step": 2000
290
+ },
291
+ {
292
+ "epoch": 0.6808369312520757,
293
+ "grad_norm": 0.002039346843957901,
294
+ "learning_rate": 4.913136929664584e-06,
295
+ "loss": 0.0142,
296
+ "step": 2050
297
+ },
298
+ {
299
+ "epoch": 0.697442710063102,
300
+ "grad_norm": 0.012900142930448055,
301
+ "learning_rate": 4.9002259691094136e-06,
302
+ "loss": 0.0105,
303
+ "step": 2100
304
+ },
305
+ {
306
+ "epoch": 0.7140484888741282,
307
+ "grad_norm": 0.0022153747268021107,
308
+ "learning_rate": 4.887315008554243e-06,
309
+ "loss": 0.0142,
310
+ "step": 2150
311
+ },
312
+ {
313
+ "epoch": 0.7306542676851544,
314
+ "grad_norm": 0.001426122267730534,
315
+ "learning_rate": 4.874404047999072e-06,
316
+ "loss": 0.0068,
317
+ "step": 2200
318
+ },
319
+ {
320
+ "epoch": 0.7472600464961807,
321
+ "grad_norm": 0.0008603449095971882,
322
+ "learning_rate": 4.861493087443902e-06,
323
+ "loss": 0.0119,
324
+ "step": 2250
325
+ },
326
+ {
327
+ "epoch": 0.7638658253072069,
328
+ "grad_norm": 0.0006780526018701494,
329
+ "learning_rate": 4.848582126888731e-06,
330
+ "loss": 0.0108,
331
+ "step": 2300
332
+ },
333
+ {
334
+ "epoch": 0.7804716041182331,
335
+ "grad_norm": 0.014527379535138607,
336
+ "learning_rate": 4.835671166333561e-06,
337
+ "loss": 0.0002,
338
+ "step": 2350
339
+ },
340
+ {
341
+ "epoch": 0.7970773829292593,
342
+ "grad_norm": 0.00022624376288149506,
343
+ "learning_rate": 4.8227602057783904e-06,
344
+ "loss": 0.0092,
345
+ "step": 2400
346
+ },
347
+ {
348
+ "epoch": 0.8136831617402857,
349
+ "grad_norm": 0.0044932495802640915,
350
+ "learning_rate": 4.80984924522322e-06,
351
+ "loss": 0.0001,
352
+ "step": 2450
353
+ },
354
+ {
355
+ "epoch": 0.8302889405513119,
356
+ "grad_norm": 0.0009355309884995222,
357
+ "learning_rate": 4.79693828466805e-06,
358
+ "loss": 0.0002,
359
+ "step": 2500
360
+ },
361
+ {
362
+ "epoch": 0.8468947193623381,
363
+ "grad_norm": 0.12550997734069824,
364
+ "learning_rate": 4.784027324112879e-06,
365
+ "loss": 0.0024,
366
+ "step": 2550
367
+ },
368
+ {
369
+ "epoch": 0.8635004981733644,
370
+ "grad_norm": 0.02399071305990219,
371
+ "learning_rate": 4.771116363557709e-06,
372
+ "loss": 0.0099,
373
+ "step": 2600
374
+ },
375
+ {
376
+ "epoch": 0.8801062769843906,
377
+ "grad_norm": 0.008470265194773674,
378
+ "learning_rate": 4.7582054030025375e-06,
379
+ "loss": 0.0157,
380
+ "step": 2650
381
+ },
382
+ {
383
+ "epoch": 0.8967120557954168,
384
+ "grad_norm": 3.967735028709285e-05,
385
+ "learning_rate": 4.745294442447367e-06,
386
+ "loss": 0.0013,
387
+ "step": 2700
388
+ },
389
+ {
390
+ "epoch": 0.913317834606443,
391
+ "grad_norm": 0.0005532742943614721,
392
+ "learning_rate": 4.732383481892197e-06,
393
+ "loss": 0.0025,
394
+ "step": 2750
395
+ },
396
+ {
397
+ "epoch": 0.9299236134174693,
398
+ "grad_norm": 9.227233022102155e-06,
399
+ "learning_rate": 4.719472521337027e-06,
400
+ "loss": 0.0028,
401
+ "step": 2800
402
+ },
403
+ {
404
+ "epoch": 0.9465293922284955,
405
+ "grad_norm": 0.280258446931839,
406
+ "learning_rate": 4.706561560781856e-06,
407
+ "loss": 0.0004,
408
+ "step": 2850
409
+ },
410
+ {
411
+ "epoch": 0.9631351710395217,
412
+ "grad_norm": 27.427757263183594,
413
+ "learning_rate": 4.6936506002266855e-06,
414
+ "loss": 0.0127,
415
+ "step": 2900
416
+ },
417
+ {
418
+ "epoch": 0.9797409498505479,
419
+ "grad_norm": 176.85423278808594,
420
+ "learning_rate": 4.680739639671514e-06,
421
+ "loss": 0.0298,
422
+ "step": 2950
423
+ },
424
+ {
425
+ "epoch": 0.9963467286615743,
426
+ "grad_norm": 0.00011263355554547161,
427
+ "learning_rate": 4.667828679116344e-06,
428
+ "loss": 0.001,
429
+ "step": 3000
430
+ },
431
+ {
432
+ "epoch": 1.0,
433
+ "eval_accuracy": 0.9963024809160306,
434
+ "eval_f1": 0.9962431632227496,
435
+ "eval_loss": 0.04071500524878502,
436
+ "eval_precision": 0.9962693439313673,
437
+ "eval_recall": 0.9963024809160306,
438
+ "eval_runtime": 38.0003,
439
+ "eval_samples_per_second": 220.63,
440
+ "eval_steps_per_second": 13.789,
441
+ "step": 3011
442
+ },
443
+ {
444
+ "epoch": 1.0129525074726005,
445
+ "grad_norm": 0.05092976614832878,
446
+ "learning_rate": 4.654917718561174e-06,
447
+ "loss": 0.018,
448
+ "step": 3050
449
+ },
450
+ {
451
+ "epoch": 1.0295582862836268,
452
+ "grad_norm": 3.4633874747669324e-05,
453
+ "learning_rate": 4.642006758006004e-06,
454
+ "loss": 0.0,
455
+ "step": 3100
456
+ },
457
+ {
458
+ "epoch": 1.0461640650946529,
459
+ "grad_norm": 8.058391540544108e-05,
460
+ "learning_rate": 4.629095797450833e-06,
461
+ "loss": 0.0,
462
+ "step": 3150
463
+ },
464
+ {
465
+ "epoch": 1.0627698439056792,
466
+ "grad_norm": 0.00043129033292643726,
467
+ "learning_rate": 4.616184836895662e-06,
468
+ "loss": 0.0,
469
+ "step": 3200
470
+ },
471
+ {
472
+ "epoch": 1.0793756227167055,
473
+ "grad_norm": 0.012417804449796677,
474
+ "learning_rate": 4.603273876340492e-06,
475
+ "loss": 0.0204,
476
+ "step": 3250
477
+ },
478
+ {
479
+ "epoch": 1.0959814015277316,
480
+ "grad_norm": 0.07707448303699493,
481
+ "learning_rate": 4.590362915785321e-06,
482
+ "loss": 0.0089,
483
+ "step": 3300
484
+ },
485
+ {
486
+ "epoch": 1.112587180338758,
487
+ "grad_norm": 0.0019856118597090244,
488
+ "learning_rate": 4.577451955230151e-06,
489
+ "loss": 0.0003,
490
+ "step": 3350
491
+ },
492
+ {
493
+ "epoch": 1.1291929591497842,
494
+ "grad_norm": 0.0003844090970233083,
495
+ "learning_rate": 4.564540994674981e-06,
496
+ "loss": 0.0,
497
+ "step": 3400
498
+ },
499
+ {
500
+ "epoch": 1.1457987379608103,
501
+ "grad_norm": 0.004796341527253389,
502
+ "learning_rate": 4.55163003411981e-06,
503
+ "loss": 0.0054,
504
+ "step": 3450
505
+ },
506
+ {
507
+ "epoch": 1.1624045167718366,
508
+ "grad_norm": 0.0021394495852291584,
509
+ "learning_rate": 4.538719073564639e-06,
510
+ "loss": 0.0001,
511
+ "step": 3500
512
+ },
513
+ {
514
+ "epoch": 1.1790102955828627,
515
+ "grad_norm": 0.00016287445032503456,
516
+ "learning_rate": 4.525808113009469e-06,
517
+ "loss": 0.0017,
518
+ "step": 3550
519
+ },
520
+ {
521
+ "epoch": 1.195616074393889,
522
+ "grad_norm": 0.005753168836236,
523
+ "learning_rate": 4.512897152454298e-06,
524
+ "loss": 0.0132,
525
+ "step": 3600
526
+ },
527
+ {
528
+ "epoch": 1.2122218532049154,
529
+ "grad_norm": 0.00012519631127361208,
530
+ "learning_rate": 4.499986191899128e-06,
531
+ "loss": 0.0,
532
+ "step": 3650
533
+ },
534
+ {
535
+ "epoch": 1.2288276320159415,
536
+ "grad_norm": 0.0009526669164188206,
537
+ "learning_rate": 4.487075231343957e-06,
538
+ "loss": 0.0083,
539
+ "step": 3700
540
+ },
541
+ {
542
+ "epoch": 1.2454334108269678,
543
+ "grad_norm": 6.90124070388265e-05,
544
+ "learning_rate": 4.474164270788787e-06,
545
+ "loss": 0.0114,
546
+ "step": 3750
547
+ },
548
+ {
549
+ "epoch": 1.2620391896379939,
550
+ "grad_norm": 0.0029422417283058167,
551
+ "learning_rate": 4.461253310233616e-06,
552
+ "loss": 0.0001,
553
+ "step": 3800
554
+ },
555
+ {
556
+ "epoch": 1.2786449684490202,
557
+ "grad_norm": 1.6564589738845825,
558
+ "learning_rate": 4.448342349678446e-06,
559
+ "loss": 0.0065,
560
+ "step": 3850
561
+ },
562
+ {
563
+ "epoch": 1.2952507472600465,
564
+ "grad_norm": 4.6906425268389285e-05,
565
+ "learning_rate": 4.435431389123275e-06,
566
+ "loss": 0.0,
567
+ "step": 3900
568
+ },
569
+ {
570
+ "epoch": 1.3118565260710726,
571
+ "grad_norm": 1.4456440112553537e-05,
572
+ "learning_rate": 4.4225204285681046e-06,
573
+ "loss": 0.0,
574
+ "step": 3950
575
+ },
576
+ {
577
+ "epoch": 1.328462304882099,
578
+ "grad_norm": 4.6707005822099745e-05,
579
+ "learning_rate": 4.409609468012934e-06,
580
+ "loss": 0.0227,
581
+ "step": 4000
582
+ },
583
+ {
584
+ "epoch": 1.3450680836931252,
585
+ "grad_norm": 4.7155015636235476e-05,
586
+ "learning_rate": 4.396698507457763e-06,
587
+ "loss": 0.0002,
588
+ "step": 4050
589
+ },
590
+ {
591
+ "epoch": 1.3616738625041513,
592
+ "grad_norm": 0.01696430891752243,
593
+ "learning_rate": 4.383787546902593e-06,
594
+ "loss": 0.0188,
595
+ "step": 4100
596
+ },
597
+ {
598
+ "epoch": 1.3782796413151777,
599
+ "grad_norm": 0.0008329456904903054,
600
+ "learning_rate": 4.370876586347423e-06,
601
+ "loss": 0.0178,
602
+ "step": 4150
603
+ },
604
+ {
605
+ "epoch": 1.394885420126204,
606
+ "grad_norm": 9.179511835100129e-05,
607
+ "learning_rate": 4.3579656257922525e-06,
608
+ "loss": 0.0,
609
+ "step": 4200
610
+ },
611
+ {
612
+ "epoch": 1.41149119893723,
613
+ "grad_norm": 2.924172622442711e-05,
614
+ "learning_rate": 4.3450546652370814e-06,
615
+ "loss": 0.0013,
616
+ "step": 4250
617
+ },
618
+ {
619
+ "epoch": 1.4280969777482564,
620
+ "grad_norm": 0.015076125971972942,
621
+ "learning_rate": 4.332143704681911e-06,
622
+ "loss": 0.0104,
623
+ "step": 4300
624
+ },
625
+ {
626
+ "epoch": 1.4447027565592827,
627
+ "grad_norm": 5.385762415244244e-05,
628
+ "learning_rate": 4.31923274412674e-06,
629
+ "loss": 0.014,
630
+ "step": 4350
631
+ },
632
+ {
633
+ "epoch": 1.4613085353703088,
634
+ "grad_norm": 0.0007110639126040041,
635
+ "learning_rate": 4.30632178357157e-06,
636
+ "loss": 0.0126,
637
+ "step": 4400
638
+ },
639
+ {
640
+ "epoch": 1.4779143141813351,
641
+ "grad_norm": 0.00014339391782414168,
642
+ "learning_rate": 4.2934108230164e-06,
643
+ "loss": 0.0003,
644
+ "step": 4450
645
+ },
646
+ {
647
+ "epoch": 1.4945200929923614,
648
+ "grad_norm": 0.0006024091853760183,
649
+ "learning_rate": 4.280499862461229e-06,
650
+ "loss": 0.0118,
651
+ "step": 4500
652
+ },
653
+ {
654
+ "epoch": 1.5111258718033875,
655
+ "grad_norm": 0.0002353072923142463,
656
+ "learning_rate": 4.267588901906058e-06,
657
+ "loss": 0.0086,
658
+ "step": 4550
659
+ },
660
+ {
661
+ "epoch": 1.5277316506144138,
662
+ "grad_norm": 0.0008946498855948448,
663
+ "learning_rate": 4.254677941350888e-06,
664
+ "loss": 0.0,
665
+ "step": 4600
666
+ },
667
+ {
668
+ "epoch": 1.5443374294254402,
669
+ "grad_norm": 7.315174298128113e-05,
670
+ "learning_rate": 4.241766980795717e-06,
671
+ "loss": 0.0003,
672
+ "step": 4650
673
+ },
674
+ {
675
+ "epoch": 1.5609432082364663,
676
+ "grad_norm": 9.232313459506258e-05,
677
+ "learning_rate": 4.228856020240547e-06,
678
+ "loss": 0.0001,
679
+ "step": 4700
680
+ },
681
+ {
682
+ "epoch": 1.5775489870474926,
683
+ "grad_norm": 1.4020029084349517e-05,
684
+ "learning_rate": 4.2159450596853765e-06,
685
+ "loss": 0.0,
686
+ "step": 4750
687
+ },
688
+ {
689
+ "epoch": 1.594154765858519,
690
+ "grad_norm": 4.0607475966680795e-05,
691
+ "learning_rate": 4.203034099130206e-06,
692
+ "loss": 0.0,
693
+ "step": 4800
694
+ },
695
+ {
696
+ "epoch": 1.610760544669545,
697
+ "grad_norm": 4.69290571345482e-05,
698
+ "learning_rate": 4.190123138575036e-06,
699
+ "loss": 0.0177,
700
+ "step": 4850
701
+ },
702
+ {
703
+ "epoch": 1.627366323480571,
704
+ "grad_norm": 0.14096687734127045,
705
+ "learning_rate": 4.177212178019865e-06,
706
+ "loss": 0.0115,
707
+ "step": 4900
708
+ },
709
+ {
710
+ "epoch": 1.6439721022915976,
711
+ "grad_norm": 0.00020342542848084122,
712
+ "learning_rate": 4.164301217464695e-06,
713
+ "loss": 0.0001,
714
+ "step": 4950
715
+ },
716
+ {
717
+ "epoch": 1.6605778811026237,
718
+ "grad_norm": 0.0002786288969218731,
719
+ "learning_rate": 4.151390256909524e-06,
720
+ "loss": 0.0,
721
+ "step": 5000
722
+ },
723
+ {
724
+ "epoch": 1.6771836599136498,
725
+ "grad_norm": 2.8438846129574813e-05,
726
+ "learning_rate": 4.138479296354353e-06,
727
+ "loss": 0.0032,
728
+ "step": 5050
729
+ },
730
+ {
731
+ "epoch": 1.6937894387246761,
732
+ "grad_norm": 5.944320037087891e-06,
733
+ "learning_rate": 4.125568335799183e-06,
734
+ "loss": 0.0001,
735
+ "step": 5100
736
+ },
737
+ {
738
+ "epoch": 1.7103952175357025,
739
+ "grad_norm": 0.005958211608231068,
740
+ "learning_rate": 4.112657375244013e-06,
741
+ "loss": 0.0,
742
+ "step": 5150
743
+ },
744
+ {
745
+ "epoch": 1.7270009963467285,
746
+ "grad_norm": 0.002004456939175725,
747
+ "learning_rate": 4.099746414688842e-06,
748
+ "loss": 0.0106,
749
+ "step": 5200
750
+ },
751
+ {
752
+ "epoch": 1.7436067751577549,
753
+ "grad_norm": 0.0008562383009120822,
754
+ "learning_rate": 4.086835454133672e-06,
755
+ "loss": 0.0081,
756
+ "step": 5250
757
+ },
758
+ {
759
+ "epoch": 1.7602125539687812,
760
+ "grad_norm": 0.03570560738444328,
761
+ "learning_rate": 4.0739244935785005e-06,
762
+ "loss": 0.025,
763
+ "step": 5300
764
+ },
765
+ {
766
+ "epoch": 1.7768183327798073,
767
+ "grad_norm": 0.001486024702899158,
768
+ "learning_rate": 4.06101353302333e-06,
769
+ "loss": 0.0145,
770
+ "step": 5350
771
+ },
772
+ {
773
+ "epoch": 1.7934241115908336,
774
+ "grad_norm": 0.0015331929316744208,
775
+ "learning_rate": 4.04810257246816e-06,
776
+ "loss": 0.0001,
777
+ "step": 5400
778
+ },
779
+ {
780
+ "epoch": 1.81002989040186,
781
+ "grad_norm": 0.004162834957242012,
782
+ "learning_rate": 4.03519161191299e-06,
783
+ "loss": 0.0005,
784
+ "step": 5450
785
+ },
786
+ {
787
+ "epoch": 1.826635669212886,
788
+ "grad_norm": 0.0003064811462536454,
789
+ "learning_rate": 4.022280651357819e-06,
790
+ "loss": 0.0,
791
+ "step": 5500
792
+ },
793
+ {
794
+ "epoch": 1.8432414480239123,
795
+ "grad_norm": 0.000830256671179086,
796
+ "learning_rate": 4.0093696908026485e-06,
797
+ "loss": 0.0034,
798
+ "step": 5550
799
+ },
800
+ {
801
+ "epoch": 1.8598472268349386,
802
+ "grad_norm": 0.001540405093692243,
803
+ "learning_rate": 3.996458730247478e-06,
804
+ "loss": 0.0,
805
+ "step": 5600
806
+ },
807
+ {
808
+ "epoch": 1.8764530056459647,
809
+ "grad_norm": 0.011221639811992645,
810
+ "learning_rate": 3.983547769692307e-06,
811
+ "loss": 0.0116,
812
+ "step": 5650
813
+ },
814
+ {
815
+ "epoch": 1.893058784456991,
816
+ "grad_norm": 0.0031693174969404936,
817
+ "learning_rate": 3.970636809137137e-06,
818
+ "loss": 0.0061,
819
+ "step": 5700
820
+ },
821
+ {
822
+ "epoch": 1.9096645632680174,
823
+ "grad_norm": 7.828649540897459e-05,
824
+ "learning_rate": 3.957725848581967e-06,
825
+ "loss": 0.0,
826
+ "step": 5750
827
+ },
828
+ {
829
+ "epoch": 1.9262703420790435,
830
+ "grad_norm": 0.00892726145684719,
831
+ "learning_rate": 3.9448148880267964e-06,
832
+ "loss": 0.0003,
833
+ "step": 5800
834
+ },
835
+ {
836
+ "epoch": 1.9428761208900698,
837
+ "grad_norm": 0.0033830904867500067,
838
+ "learning_rate": 3.931903927471625e-06,
839
+ "loss": 0.0007,
840
+ "step": 5850
841
+ },
842
+ {
843
+ "epoch": 1.959481899701096,
844
+ "grad_norm": 0.017441514879465103,
845
+ "learning_rate": 3.918992966916455e-06,
846
+ "loss": 0.0109,
847
+ "step": 5900
848
+ },
849
+ {
850
+ "epoch": 1.9760876785121222,
851
+ "grad_norm": 0.006790176033973694,
852
+ "learning_rate": 3.906082006361284e-06,
853
+ "loss": 0.0101,
854
+ "step": 5950
855
+ },
856
+ {
857
+ "epoch": 1.9926934573231485,
858
+ "grad_norm": 0.0004248483164701611,
859
+ "learning_rate": 3.893171045806114e-06,
860
+ "loss": 0.0103,
861
+ "step": 6000
862
+ },
863
+ {
864
+ "epoch": 2.0,
865
+ "eval_accuracy": 0.9959446564885496,
866
+ "eval_f1": 0.9958827988724177,
867
+ "eval_loss": 0.031979888677597046,
868
+ "eval_precision": 0.9958978797187497,
869
+ "eval_recall": 0.9959446564885496,
870
+ "eval_runtime": 37.4063,
871
+ "eval_samples_per_second": 224.134,
872
+ "eval_steps_per_second": 14.008,
873
+ "step": 6022
874
+ }
875
+ ],
876
+ "logging_steps": 50,
877
+ "max_steps": 21077,
878
+ "num_input_tokens_seen": 0,
879
+ "num_train_epochs": 7,
880
+ "save_steps": 500,
881
+ "stateful_callbacks": {
882
+ "TrainerControl": {
883
+ "args": {
884
+ "should_epoch_stop": false,
885
+ "should_evaluate": false,
886
+ "should_log": false,
887
+ "should_save": true,
888
+ "should_training_stop": false
889
+ },
890
+ "attributes": {}
891
+ }
892
+ },
893
+ "total_flos": 3.282861088518144e+16,
894
+ "train_batch_size": 16,
895
+ "trial_name": null,
896
+ "trial_params": null
897
+ }
trial-2/checkpoint-6022/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9657a8731817c986f017540c64090098467c35e79328bfa7cab093c33da6a8e9
3
+ size 5368
trial-3/checkpoint-1506/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "answerdotai/ModernBERT-base",
3
+ "architectures": [
4
+ "ModernBertForSequenceClassification"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 50281,
9
+ "classifier_activation": "gelu",
10
+ "classifier_bias": false,
11
+ "classifier_dropout": 0.0,
12
+ "classifier_pooling": "mean",
13
+ "cls_token_id": 50281,
14
+ "decoder_bias": true,
15
+ "deterministic_flash_attn": false,
16
+ "embedding_dropout": 0.0,
17
+ "eos_token_id": 50282,
18
+ "global_attn_every_n_layers": 3,
19
+ "global_rope_theta": 160000.0,
20
+ "gradient_checkpointing": false,
21
+ "hidden_activation": "gelu",
22
+ "hidden_size": 768,
23
+ "initializer_cutoff_factor": 2.0,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 1152,
26
+ "layer_norm_eps": 1e-05,
27
+ "local_attention": 128,
28
+ "local_rope_theta": 10000.0,
29
+ "max_position_embeddings": 8192,
30
+ "mlp_bias": false,
31
+ "mlp_dropout": 0.0,
32
+ "model_type": "modernbert",
33
+ "norm_bias": false,
34
+ "norm_eps": 1e-05,
35
+ "num_attention_heads": 12,
36
+ "num_hidden_layers": 22,
37
+ "pad_token_id": 50283,
38
+ "position_embedding_type": "absolute",
39
+ "problem_type": "single_label_classification",
40
+ "reference_compile": true,
41
+ "sep_token_id": 50282,
42
+ "sparse_pred_ignore_index": -100,
43
+ "sparse_prediction": false,
44
+ "torch_dtype": "float32",
45
+ "transformers_version": "4.48.0.dev0",
46
+ "vocab_size": 50368
47
+ }
trial-3/checkpoint-1506/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:577af3b8b0a6d7db7f2ff1054a5c4c43704103dd0ed797800f9d9582a3237033
3
+ size 598439784
trial-3/checkpoint-1506/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:309810681fe0458054a9e76c6bfbb6fc2862ae83f89b084906874442e8913f57
3
+ size 1196967418
trial-3/checkpoint-1506/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:568428d80a25211a390c359ca51b0b20b38ca0607fbc196f106c9841c02d3e59
3
+ size 14244
trial-3/checkpoint-1506/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77511df67542c270c7a8ed9a3ae9f0a88d6822756582e31cb89e7ee9b503abfb
3
+ size 1064
trial-3/checkpoint-1506/trainer_state.json ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.03509189188480377,
3
+ "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-3/checkpoint-1506",
4
+ "epoch": 1.0,
5
+ "eval_steps": 500,
6
+ "global_step": 1506,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.033200531208499334,
13
+ "grad_norm": 6.976862907409668,
14
+ "learning_rate": 2.8972663455552343e-06,
15
+ "loss": 0.5378,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 0.06640106241699867,
20
+ "grad_norm": 3.674832344055176,
21
+ "learning_rate": 2.8648439379281615e-06,
22
+ "loss": 0.3375,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.099601593625498,
27
+ "grad_norm": 2.678229570388794,
28
+ "learning_rate": 2.8324215303010886e-06,
29
+ "loss": 0.2213,
30
+ "step": 150
31
+ },
32
+ {
33
+ "epoch": 0.13280212483399734,
34
+ "grad_norm": 6.4370551109313965,
35
+ "learning_rate": 2.7999991226740153e-06,
36
+ "loss": 0.1558,
37
+ "step": 200
38
+ },
39
+ {
40
+ "epoch": 0.16600265604249667,
41
+ "grad_norm": 6.4544525146484375,
42
+ "learning_rate": 2.767576715046943e-06,
43
+ "loss": 0.1457,
44
+ "step": 250
45
+ },
46
+ {
47
+ "epoch": 0.199203187250996,
48
+ "grad_norm": 2.4753177165985107,
49
+ "learning_rate": 2.7351543074198696e-06,
50
+ "loss": 0.1349,
51
+ "step": 300
52
+ },
53
+ {
54
+ "epoch": 0.23240371845949534,
55
+ "grad_norm": 3.116945743560791,
56
+ "learning_rate": 2.7027318997927968e-06,
57
+ "loss": 0.1144,
58
+ "step": 350
59
+ },
60
+ {
61
+ "epoch": 0.2656042496679947,
62
+ "grad_norm": 10.000889778137207,
63
+ "learning_rate": 2.670309492165724e-06,
64
+ "loss": 0.0942,
65
+ "step": 400
66
+ },
67
+ {
68
+ "epoch": 0.29880478087649404,
69
+ "grad_norm": 0.3915446996688843,
70
+ "learning_rate": 2.637887084538651e-06,
71
+ "loss": 0.0841,
72
+ "step": 450
73
+ },
74
+ {
75
+ "epoch": 0.33200531208499334,
76
+ "grad_norm": 0.7093335390090942,
77
+ "learning_rate": 2.605464676911578e-06,
78
+ "loss": 0.0815,
79
+ "step": 500
80
+ },
81
+ {
82
+ "epoch": 0.3652058432934927,
83
+ "grad_norm": 5.660763263702393,
84
+ "learning_rate": 2.5730422692845053e-06,
85
+ "loss": 0.058,
86
+ "step": 550
87
+ },
88
+ {
89
+ "epoch": 0.398406374501992,
90
+ "grad_norm": 9.372917175292969,
91
+ "learning_rate": 2.5406198616574325e-06,
92
+ "loss": 0.0521,
93
+ "step": 600
94
+ },
95
+ {
96
+ "epoch": 0.4316069057104914,
97
+ "grad_norm": 6.086747169494629,
98
+ "learning_rate": 2.5081974540303596e-06,
99
+ "loss": 0.0671,
100
+ "step": 650
101
+ },
102
+ {
103
+ "epoch": 0.4648074369189907,
104
+ "grad_norm": 5.661391735076904,
105
+ "learning_rate": 2.4757750464032863e-06,
106
+ "loss": 0.0354,
107
+ "step": 700
108
+ },
109
+ {
110
+ "epoch": 0.49800796812749004,
111
+ "grad_norm": 1.4707638025283813,
112
+ "learning_rate": 2.443352638776214e-06,
113
+ "loss": 0.0386,
114
+ "step": 750
115
+ },
116
+ {
117
+ "epoch": 0.5312084993359893,
118
+ "grad_norm": 7.550576686859131,
119
+ "learning_rate": 2.4109302311491406e-06,
120
+ "loss": 0.0363,
121
+ "step": 800
122
+ },
123
+ {
124
+ "epoch": 0.5644090305444888,
125
+ "grad_norm": 11.072442054748535,
126
+ "learning_rate": 2.3785078235220678e-06,
127
+ "loss": 0.0254,
128
+ "step": 850
129
+ },
130
+ {
131
+ "epoch": 0.5976095617529881,
132
+ "grad_norm": 0.3040500581264496,
133
+ "learning_rate": 2.346085415894995e-06,
134
+ "loss": 0.018,
135
+ "step": 900
136
+ },
137
+ {
138
+ "epoch": 0.6308100929614874,
139
+ "grad_norm": 11.503410339355469,
140
+ "learning_rate": 2.313663008267922e-06,
141
+ "loss": 0.0302,
142
+ "step": 950
143
+ },
144
+ {
145
+ "epoch": 0.6640106241699867,
146
+ "grad_norm": 0.7599239945411682,
147
+ "learning_rate": 2.281240600640849e-06,
148
+ "loss": 0.0267,
149
+ "step": 1000
150
+ },
151
+ {
152
+ "epoch": 0.6972111553784861,
153
+ "grad_norm": 0.21025581657886505,
154
+ "learning_rate": 2.2488181930137764e-06,
155
+ "loss": 0.0211,
156
+ "step": 1050
157
+ },
158
+ {
159
+ "epoch": 0.7304116865869854,
160
+ "grad_norm": 11.052717208862305,
161
+ "learning_rate": 2.2163957853867035e-06,
162
+ "loss": 0.0112,
163
+ "step": 1100
164
+ },
165
+ {
166
+ "epoch": 0.7636122177954847,
167
+ "grad_norm": 0.0778539627790451,
168
+ "learning_rate": 2.1839733777596302e-06,
169
+ "loss": 0.0212,
170
+ "step": 1150
171
+ },
172
+ {
173
+ "epoch": 0.796812749003984,
174
+ "grad_norm": 0.050592467188835144,
175
+ "learning_rate": 2.151550970132558e-06,
176
+ "loss": 0.0082,
177
+ "step": 1200
178
+ },
179
+ {
180
+ "epoch": 0.8300132802124834,
181
+ "grad_norm": 0.04680703952908516,
182
+ "learning_rate": 2.1191285625054845e-06,
183
+ "loss": 0.008,
184
+ "step": 1250
185
+ },
186
+ {
187
+ "epoch": 0.8632138114209827,
188
+ "grad_norm": 127.69743347167969,
189
+ "learning_rate": 2.0867061548784117e-06,
190
+ "loss": 0.0192,
191
+ "step": 1300
192
+ },
193
+ {
194
+ "epoch": 0.896414342629482,
195
+ "grad_norm": 0.013791153207421303,
196
+ "learning_rate": 2.0542837472513392e-06,
197
+ "loss": 0.0063,
198
+ "step": 1350
199
+ },
200
+ {
201
+ "epoch": 0.9296148738379814,
202
+ "grad_norm": 0.011688283644616604,
203
+ "learning_rate": 2.021861339624266e-06,
204
+ "loss": 0.0068,
205
+ "step": 1400
206
+ },
207
+ {
208
+ "epoch": 0.9628154050464808,
209
+ "grad_norm": 14.885448455810547,
210
+ "learning_rate": 1.989438931997193e-06,
211
+ "loss": 0.004,
212
+ "step": 1450
213
+ },
214
+ {
215
+ "epoch": 0.9960159362549801,
216
+ "grad_norm": 0.38216766715049744,
217
+ "learning_rate": 1.9570165243701202e-06,
218
+ "loss": 0.0069,
219
+ "step": 1500
220
+ },
221
+ {
222
+ "epoch": 1.0,
223
+ "eval_accuracy": 0.992604961832061,
224
+ "eval_f1": 0.9926480803352735,
225
+ "eval_loss": 0.03509189188480377,
226
+ "eval_precision": 0.9927020529431649,
227
+ "eval_recall": 0.992604961832061,
228
+ "eval_runtime": 31.6693,
229
+ "eval_samples_per_second": 264.736,
230
+ "eval_steps_per_second": 8.273,
231
+ "step": 1506
232
+ }
233
+ ],
234
+ "logging_steps": 50,
235
+ "max_steps": 4518,
236
+ "num_input_tokens_seen": 0,
237
+ "num_train_epochs": 3,
238
+ "save_steps": 500,
239
+ "stateful_callbacks": {
240
+ "TrainerControl": {
241
+ "args": {
242
+ "should_epoch_stop": false,
243
+ "should_evaluate": false,
244
+ "should_log": false,
245
+ "should_save": true,
246
+ "should_training_stop": false
247
+ },
248
+ "attributes": {}
249
+ }
250
+ },
251
+ "total_flos": 1.641430544259072e+16,
252
+ "train_batch_size": 32,
253
+ "trial_name": null,
254
+ "trial_params": null
255
+ }
trial-3/checkpoint-1506/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ed06b7fefd178dad53ae3fef61fd304580c1d532a37d5010e58ca8f39e302fa
3
+ size 5368
trial-4/checkpoint-3011/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "answerdotai/ModernBERT-base",
3
+ "architectures": [
4
+ "ModernBertForSequenceClassification"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 50281,
9
+ "classifier_activation": "gelu",
10
+ "classifier_bias": false,
11
+ "classifier_dropout": 0.0,
12
+ "classifier_pooling": "mean",
13
+ "cls_token_id": 50281,
14
+ "decoder_bias": true,
15
+ "deterministic_flash_attn": false,
16
+ "embedding_dropout": 0.0,
17
+ "eos_token_id": 50282,
18
+ "global_attn_every_n_layers": 3,
19
+ "global_rope_theta": 160000.0,
20
+ "gradient_checkpointing": false,
21
+ "hidden_activation": "gelu",
22
+ "hidden_size": 768,
23
+ "initializer_cutoff_factor": 2.0,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 1152,
26
+ "layer_norm_eps": 1e-05,
27
+ "local_attention": 128,
28
+ "local_rope_theta": 10000.0,
29
+ "max_position_embeddings": 8192,
30
+ "mlp_bias": false,
31
+ "mlp_dropout": 0.0,
32
+ "model_type": "modernbert",
33
+ "norm_bias": false,
34
+ "norm_eps": 1e-05,
35
+ "num_attention_heads": 12,
36
+ "num_hidden_layers": 22,
37
+ "pad_token_id": 50283,
38
+ "position_embedding_type": "absolute",
39
+ "problem_type": "single_label_classification",
40
+ "reference_compile": true,
41
+ "sep_token_id": 50282,
42
+ "sparse_pred_ignore_index": -100,
43
+ "sparse_prediction": false,
44
+ "torch_dtype": "float32",
45
+ "transformers_version": "4.48.0.dev0",
46
+ "vocab_size": 50368
47
+ }
trial-4/checkpoint-3011/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6998cd19c83cb7aad4574fdf2f2d1d911f7f01e8d94fcb558dc40e5561e3d188
3
+ size 598439784
trial-4/checkpoint-3011/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0517bca24af0d5ed5988e5100a9e9f6f59df1b0d3e7ca53764baa7878d5d5e3
3
+ size 1196967418
trial-4/checkpoint-3011/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:568428d80a25211a390c359ca51b0b20b38ca0607fbc196f106c9841c02d3e59
3
+ size 14244
trial-4/checkpoint-3011/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c0dbc7f9aff9e32282e3dcfb80127104b5c3d0089b59d9cb1b981e6af6f8c41
3
+ size 1064
trial-4/checkpoint-3011/trainer_state.json ADDED
@@ -0,0 +1,465 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.02325253002345562,
3
+ "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-4/checkpoint-3011",
4
+ "epoch": 1.0,
5
+ "eval_steps": 500,
6
+ "global_step": 3011,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.016605778811026237,
13
+ "grad_norm": 7.4845476150512695,
14
+ "learning_rate": 1.3209406688296726e-05,
15
+ "loss": 0.427,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 0.033211557622052475,
20
+ "grad_norm": 8.739913940429688,
21
+ "learning_rate": 1.3184989137392264e-05,
22
+ "loss": 0.2079,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.04981733643307871,
27
+ "grad_norm": 10.918631553649902,
28
+ "learning_rate": 1.31605715864878e-05,
29
+ "loss": 0.1374,
30
+ "step": 150
31
+ },
32
+ {
33
+ "epoch": 0.06642311524410495,
34
+ "grad_norm": 0.09207049757242203,
35
+ "learning_rate": 1.3136154035583336e-05,
36
+ "loss": 0.0971,
37
+ "step": 200
38
+ },
39
+ {
40
+ "epoch": 0.08302889405513118,
41
+ "grad_norm": 0.1270512193441391,
42
+ "learning_rate": 1.3111736484678873e-05,
43
+ "loss": 0.0431,
44
+ "step": 250
45
+ },
46
+ {
47
+ "epoch": 0.09963467286615742,
48
+ "grad_norm": 0.01078485231846571,
49
+ "learning_rate": 1.3087318933774408e-05,
50
+ "loss": 0.0679,
51
+ "step": 300
52
+ },
53
+ {
54
+ "epoch": 0.11624045167718366,
55
+ "grad_norm": 0.16803160309791565,
56
+ "learning_rate": 1.3062901382869945e-05,
57
+ "loss": 0.0364,
58
+ "step": 350
59
+ },
60
+ {
61
+ "epoch": 0.1328462304882099,
62
+ "grad_norm": 0.2863476872444153,
63
+ "learning_rate": 1.303848383196548e-05,
64
+ "loss": 0.0802,
65
+ "step": 400
66
+ },
67
+ {
68
+ "epoch": 0.14945200929923613,
69
+ "grad_norm": 0.018498318269848824,
70
+ "learning_rate": 1.3014066281061019e-05,
71
+ "loss": 0.0324,
72
+ "step": 450
73
+ },
74
+ {
75
+ "epoch": 0.16605778811026237,
76
+ "grad_norm": 12.099262237548828,
77
+ "learning_rate": 1.2989648730156554e-05,
78
+ "loss": 0.0567,
79
+ "step": 500
80
+ },
81
+ {
82
+ "epoch": 0.1826635669212886,
83
+ "grad_norm": 0.04201498255133629,
84
+ "learning_rate": 1.296523117925209e-05,
85
+ "loss": 0.0265,
86
+ "step": 550
87
+ },
88
+ {
89
+ "epoch": 0.19926934573231483,
90
+ "grad_norm": 13.225788116455078,
91
+ "learning_rate": 1.2940813628347628e-05,
92
+ "loss": 0.027,
93
+ "step": 600
94
+ },
95
+ {
96
+ "epoch": 0.2158751245433411,
97
+ "grad_norm": 2.1863136291503906,
98
+ "learning_rate": 1.2916396077443163e-05,
99
+ "loss": 0.0325,
100
+ "step": 650
101
+ },
102
+ {
103
+ "epoch": 0.23248090335436733,
104
+ "grad_norm": 0.0031948979012668133,
105
+ "learning_rate": 1.28919785265387e-05,
106
+ "loss": 0.0378,
107
+ "step": 700
108
+ },
109
+ {
110
+ "epoch": 0.24908668216539356,
111
+ "grad_norm": 0.0001850352855399251,
112
+ "learning_rate": 1.2867560975634237e-05,
113
+ "loss": 0.0242,
114
+ "step": 750
115
+ },
116
+ {
117
+ "epoch": 0.2656924609764198,
118
+ "grad_norm": 0.0007033672300167382,
119
+ "learning_rate": 1.2843143424729772e-05,
120
+ "loss": 0.0306,
121
+ "step": 800
122
+ },
123
+ {
124
+ "epoch": 0.282298239787446,
125
+ "grad_norm": 13.938993453979492,
126
+ "learning_rate": 1.2818725873825309e-05,
127
+ "loss": 0.0458,
128
+ "step": 850
129
+ },
130
+ {
131
+ "epoch": 0.29890401859847227,
132
+ "grad_norm": 0.02099405601620674,
133
+ "learning_rate": 1.2794308322920844e-05,
134
+ "loss": 0.0306,
135
+ "step": 900
136
+ },
137
+ {
138
+ "epoch": 0.3155097974094985,
139
+ "grad_norm": 0.024268606677651405,
140
+ "learning_rate": 1.2769890772016383e-05,
141
+ "loss": 0.0142,
142
+ "step": 950
143
+ },
144
+ {
145
+ "epoch": 0.33211557622052473,
146
+ "grad_norm": 0.004759958013892174,
147
+ "learning_rate": 1.2745473221111918e-05,
148
+ "loss": 0.0141,
149
+ "step": 1000
150
+ },
151
+ {
152
+ "epoch": 0.348721355031551,
153
+ "grad_norm": 0.0019629066810011864,
154
+ "learning_rate": 1.2721055670207453e-05,
155
+ "loss": 0.0345,
156
+ "step": 1050
157
+ },
158
+ {
159
+ "epoch": 0.3653271338425772,
160
+ "grad_norm": 0.00019358922145329416,
161
+ "learning_rate": 1.2696638119302992e-05,
162
+ "loss": 0.0089,
163
+ "step": 1100
164
+ },
165
+ {
166
+ "epoch": 0.38193291265360346,
167
+ "grad_norm": 0.0028237327933311462,
168
+ "learning_rate": 1.2672220568398527e-05,
169
+ "loss": 0.0239,
170
+ "step": 1150
171
+ },
172
+ {
173
+ "epoch": 0.39853869146462967,
174
+ "grad_norm": 0.00010467255196999758,
175
+ "learning_rate": 1.2647803017494064e-05,
176
+ "loss": 0.0094,
177
+ "step": 1200
178
+ },
179
+ {
180
+ "epoch": 0.41514447027565593,
181
+ "grad_norm": 0.05774892866611481,
182
+ "learning_rate": 1.26233854665896e-05,
183
+ "loss": 0.0246,
184
+ "step": 1250
185
+ },
186
+ {
187
+ "epoch": 0.4317502490866822,
188
+ "grad_norm": 0.024394717067480087,
189
+ "learning_rate": 1.2598967915685136e-05,
190
+ "loss": 0.0328,
191
+ "step": 1300
192
+ },
193
+ {
194
+ "epoch": 0.4483560278977084,
195
+ "grad_norm": 2.231964349746704,
196
+ "learning_rate": 1.2574550364780673e-05,
197
+ "loss": 0.0204,
198
+ "step": 1350
199
+ },
200
+ {
201
+ "epoch": 0.46496180670873466,
202
+ "grad_norm": 0.0014322358183562756,
203
+ "learning_rate": 1.2550132813876208e-05,
204
+ "loss": 0.0001,
205
+ "step": 1400
206
+ },
207
+ {
208
+ "epoch": 0.48156758551976087,
209
+ "grad_norm": 0.001744006876833737,
210
+ "learning_rate": 1.2525715262971747e-05,
211
+ "loss": 0.0392,
212
+ "step": 1450
213
+ },
214
+ {
215
+ "epoch": 0.4981733643307871,
216
+ "grad_norm": 0.027050139382481575,
217
+ "learning_rate": 1.2501297712067282e-05,
218
+ "loss": 0.0151,
219
+ "step": 1500
220
+ },
221
+ {
222
+ "epoch": 0.5147791431418134,
223
+ "grad_norm": 0.0001924823591252789,
224
+ "learning_rate": 1.2476880161162817e-05,
225
+ "loss": 0.0036,
226
+ "step": 1550
227
+ },
228
+ {
229
+ "epoch": 0.5313849219528396,
230
+ "grad_norm": 4.767300128936768,
231
+ "learning_rate": 1.2452462610258356e-05,
232
+ "loss": 0.0148,
233
+ "step": 1600
234
+ },
235
+ {
236
+ "epoch": 0.5479907007638658,
237
+ "grad_norm": 0.0022574588656425476,
238
+ "learning_rate": 1.242804505935389e-05,
239
+ "loss": 0.0384,
240
+ "step": 1650
241
+ },
242
+ {
243
+ "epoch": 0.564596479574892,
244
+ "grad_norm": 0.12995891273021698,
245
+ "learning_rate": 1.2403627508449428e-05,
246
+ "loss": 0.018,
247
+ "step": 1700
248
+ },
249
+ {
250
+ "epoch": 0.5812022583859183,
251
+ "grad_norm": 0.0005374422180466354,
252
+ "learning_rate": 1.2379209957544964e-05,
253
+ "loss": 0.0039,
254
+ "step": 1750
255
+ },
256
+ {
257
+ "epoch": 0.5978080371969445,
258
+ "grad_norm": 0.004592420998960733,
259
+ "learning_rate": 1.23547924066405e-05,
260
+ "loss": 0.0136,
261
+ "step": 1800
262
+ },
263
+ {
264
+ "epoch": 0.6144138160079707,
265
+ "grad_norm": 0.0008812470478005707,
266
+ "learning_rate": 1.2330374855736037e-05,
267
+ "loss": 0.0167,
268
+ "step": 1850
269
+ },
270
+ {
271
+ "epoch": 0.631019594818997,
272
+ "grad_norm": 28.337797164916992,
273
+ "learning_rate": 1.2305957304831572e-05,
274
+ "loss": 0.0098,
275
+ "step": 1900
276
+ },
277
+ {
278
+ "epoch": 0.6476253736300233,
279
+ "grad_norm": 0.0003208396374247968,
280
+ "learning_rate": 1.228153975392711e-05,
281
+ "loss": 0.0083,
282
+ "step": 1950
283
+ },
284
+ {
285
+ "epoch": 0.6642311524410495,
286
+ "grad_norm": 0.004917904268950224,
287
+ "learning_rate": 1.2257122203022646e-05,
288
+ "loss": 0.012,
289
+ "step": 2000
290
+ },
291
+ {
292
+ "epoch": 0.6808369312520757,
293
+ "grad_norm": 0.0006444657919928432,
294
+ "learning_rate": 1.2232704652118182e-05,
295
+ "loss": 0.0006,
296
+ "step": 2050
297
+ },
298
+ {
299
+ "epoch": 0.697442710063102,
300
+ "grad_norm": 0.00020880017837043852,
301
+ "learning_rate": 1.220828710121372e-05,
302
+ "loss": 0.0169,
303
+ "step": 2100
304
+ },
305
+ {
306
+ "epoch": 0.7140484888741282,
307
+ "grad_norm": 0.009818737395107746,
308
+ "learning_rate": 1.2183869550309254e-05,
309
+ "loss": 0.0143,
310
+ "step": 2150
311
+ },
312
+ {
313
+ "epoch": 0.7306542676851544,
314
+ "grad_norm": 0.0009041284793056548,
315
+ "learning_rate": 1.2159451999404791e-05,
316
+ "loss": 0.0026,
317
+ "step": 2200
318
+ },
319
+ {
320
+ "epoch": 0.7472600464961807,
321
+ "grad_norm": 2.3109569549560547,
322
+ "learning_rate": 1.2135034448500328e-05,
323
+ "loss": 0.0062,
324
+ "step": 2250
325
+ },
326
+ {
327
+ "epoch": 0.7638658253072069,
328
+ "grad_norm": 9.242107807949651e-06,
329
+ "learning_rate": 1.2110616897595863e-05,
330
+ "loss": 0.0029,
331
+ "step": 2300
332
+ },
333
+ {
334
+ "epoch": 0.7804716041182331,
335
+ "grad_norm": 0.00020709235104732215,
336
+ "learning_rate": 1.20861993466914e-05,
337
+ "loss": 0.0,
338
+ "step": 2350
339
+ },
340
+ {
341
+ "epoch": 0.7970773829292593,
342
+ "grad_norm": 0.0008476360817439854,
343
+ "learning_rate": 1.2061781795786937e-05,
344
+ "loss": 0.019,
345
+ "step": 2400
346
+ },
347
+ {
348
+ "epoch": 0.8136831617402857,
349
+ "grad_norm": 0.0002165739715564996,
350
+ "learning_rate": 1.2037364244882474e-05,
351
+ "loss": 0.0,
352
+ "step": 2450
353
+ },
354
+ {
355
+ "epoch": 0.8302889405513119,
356
+ "grad_norm": 0.029956847429275513,
357
+ "learning_rate": 1.201294669397801e-05,
358
+ "loss": 0.0012,
359
+ "step": 2500
360
+ },
361
+ {
362
+ "epoch": 0.8468947193623381,
363
+ "grad_norm": 0.0002400112134637311,
364
+ "learning_rate": 1.1988529143073546e-05,
365
+ "loss": 0.0191,
366
+ "step": 2550
367
+ },
368
+ {
369
+ "epoch": 0.8635004981733644,
370
+ "grad_norm": 0.0070993551053106785,
371
+ "learning_rate": 1.1964111592169083e-05,
372
+ "loss": 0.0155,
373
+ "step": 2600
374
+ },
375
+ {
376
+ "epoch": 0.8801062769843906,
377
+ "grad_norm": 5.127764234202914e-05,
378
+ "learning_rate": 1.1939694041264618e-05,
379
+ "loss": 0.0185,
380
+ "step": 2650
381
+ },
382
+ {
383
+ "epoch": 0.8967120557954168,
384
+ "grad_norm": 0.056577421724796295,
385
+ "learning_rate": 1.1915276490360155e-05,
386
+ "loss": 0.0063,
387
+ "step": 2700
388
+ },
389
+ {
390
+ "epoch": 0.913317834606443,
391
+ "grad_norm": 4.399678437039256e-05,
392
+ "learning_rate": 1.1890858939455692e-05,
393
+ "loss": 0.012,
394
+ "step": 2750
395
+ },
396
+ {
397
+ "epoch": 0.9299236134174693,
398
+ "grad_norm": 6.6589759626367595e-06,
399
+ "learning_rate": 1.1866441388551227e-05,
400
+ "loss": 0.0001,
401
+ "step": 2800
402
+ },
403
+ {
404
+ "epoch": 0.9465293922284955,
405
+ "grad_norm": 0.009270718321204185,
406
+ "learning_rate": 1.1842023837646764e-05,
407
+ "loss": 0.0001,
408
+ "step": 2850
409
+ },
410
+ {
411
+ "epoch": 0.9631351710395217,
412
+ "grad_norm": 6.743930339813232,
413
+ "learning_rate": 1.1817606286742301e-05,
414
+ "loss": 0.0019,
415
+ "step": 2900
416
+ },
417
+ {
418
+ "epoch": 0.9797409498505479,
419
+ "grad_norm": 10.679564476013184,
420
+ "learning_rate": 1.1793188735837838e-05,
421
+ "loss": 0.0258,
422
+ "step": 2950
423
+ },
424
+ {
425
+ "epoch": 0.9963467286615743,
426
+ "grad_norm": 0.0007653234642930329,
427
+ "learning_rate": 1.1768771184933373e-05,
428
+ "loss": 0.0018,
429
+ "step": 3000
430
+ },
431
+ {
432
+ "epoch": 1.0,
433
+ "eval_accuracy": 0.997256679389313,
434
+ "eval_f1": 0.9972464717374746,
435
+ "eval_loss": 0.02325253002345562,
436
+ "eval_precision": 0.997240941740882,
437
+ "eval_recall": 0.997256679389313,
438
+ "eval_runtime": 36.6991,
439
+ "eval_samples_per_second": 228.453,
440
+ "eval_steps_per_second": 14.278,
441
+ "step": 3011
442
+ }
443
+ ],
444
+ "logging_steps": 50,
445
+ "max_steps": 27099,
446
+ "num_input_tokens_seen": 0,
447
+ "num_train_epochs": 9,
448
+ "save_steps": 500,
449
+ "stateful_callbacks": {
450
+ "TrainerControl": {
451
+ "args": {
452
+ "should_epoch_stop": false,
453
+ "should_evaluate": false,
454
+ "should_log": false,
455
+ "should_save": true,
456
+ "should_training_stop": false
457
+ },
458
+ "attributes": {}
459
+ }
460
+ },
461
+ "total_flos": 1.641430544259072e+16,
462
+ "train_batch_size": 16,
463
+ "trial_name": null,
464
+ "trial_params": null
465
+ }
trial-4/checkpoint-3011/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89fb66224a4a1dbc68c030610c33a1d3f64ca676b2064b388b8e2a7385785f5d
3
+ size 5368
trial-5/checkpoint-3012/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "answerdotai/ModernBERT-base",
3
+ "architectures": [
4
+ "ModernBertForSequenceClassification"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 50281,
9
+ "classifier_activation": "gelu",
10
+ "classifier_bias": false,
11
+ "classifier_dropout": 0.0,
12
+ "classifier_pooling": "mean",
13
+ "cls_token_id": 50281,
14
+ "decoder_bias": true,
15
+ "deterministic_flash_attn": false,
16
+ "embedding_dropout": 0.0,
17
+ "eos_token_id": 50282,
18
+ "global_attn_every_n_layers": 3,
19
+ "global_rope_theta": 160000.0,
20
+ "gradient_checkpointing": false,
21
+ "hidden_activation": "gelu",
22
+ "hidden_size": 768,
23
+ "initializer_cutoff_factor": 2.0,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 1152,
26
+ "layer_norm_eps": 1e-05,
27
+ "local_attention": 128,
28
+ "local_rope_theta": 10000.0,
29
+ "max_position_embeddings": 8192,
30
+ "mlp_bias": false,
31
+ "mlp_dropout": 0.0,
32
+ "model_type": "modernbert",
33
+ "norm_bias": false,
34
+ "norm_eps": 1e-05,
35
+ "num_attention_heads": 12,
36
+ "num_hidden_layers": 22,
37
+ "pad_token_id": 50283,
38
+ "position_embedding_type": "absolute",
39
+ "problem_type": "single_label_classification",
40
+ "reference_compile": true,
41
+ "sep_token_id": 50282,
42
+ "sparse_pred_ignore_index": -100,
43
+ "sparse_prediction": false,
44
+ "torch_dtype": "float32",
45
+ "transformers_version": "4.48.0.dev0",
46
+ "vocab_size": 50368
47
+ }
trial-5/checkpoint-3012/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49ba330b843aca1a1d0454785b900ed96671619efb6df36ea614d0870f5ef2aa
3
+ size 598439784
trial-5/checkpoint-3012/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60fb304abd0c5b9d4e6de61faca1856b99e71865a5c592f8acaa47567b9139d9
3
+ size 1196967418
trial-5/checkpoint-3012/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef
3
+ size 14244
trial-5/checkpoint-3012/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c69ec29ae0867d661613f53dea74fb003b51f72db6450102f05c6dfa235171f
3
+ size 1064
trial-5/checkpoint-3012/trainer_state.json ADDED
@@ -0,0 +1,477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.0418265163898468,
3
+ "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-5/checkpoint-3012",
4
+ "epoch": 2.0,
5
+ "eval_steps": 500,
6
+ "global_step": 3012,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.033200531208499334,
13
+ "grad_norm": 6.311530113220215,
14
+ "learning_rate": 1.279094112727349e-06,
15
+ "loss": 0.7104,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 0.06640106241699867,
20
+ "grad_norm": 17.497058868408203,
21
+ "learning_rate": 1.2748333062225943e-06,
22
+ "loss": 0.5729,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.099601593625498,
27
+ "grad_norm": 7.590151309967041,
28
+ "learning_rate": 1.2705724997178397e-06,
29
+ "loss": 0.4714,
30
+ "step": 150
31
+ },
32
+ {
33
+ "epoch": 0.13280212483399734,
34
+ "grad_norm": 6.96728515625,
35
+ "learning_rate": 1.2663116932130851e-06,
36
+ "loss": 0.3881,
37
+ "step": 200
38
+ },
39
+ {
40
+ "epoch": 0.16600265604249667,
41
+ "grad_norm": 4.9838714599609375,
42
+ "learning_rate": 1.2620508867083303e-06,
43
+ "loss": 0.3194,
44
+ "step": 250
45
+ },
46
+ {
47
+ "epoch": 0.199203187250996,
48
+ "grad_norm": 6.317371368408203,
49
+ "learning_rate": 1.2577900802035758e-06,
50
+ "loss": 0.2976,
51
+ "step": 300
52
+ },
53
+ {
54
+ "epoch": 0.23240371845949534,
55
+ "grad_norm": 15.331583023071289,
56
+ "learning_rate": 1.2535292736988212e-06,
57
+ "loss": 0.2392,
58
+ "step": 350
59
+ },
60
+ {
61
+ "epoch": 0.2656042496679947,
62
+ "grad_norm": 15.493165016174316,
63
+ "learning_rate": 1.2492684671940664e-06,
64
+ "loss": 0.2337,
65
+ "step": 400
66
+ },
67
+ {
68
+ "epoch": 0.29880478087649404,
69
+ "grad_norm": 3.7081472873687744,
70
+ "learning_rate": 1.2450076606893118e-06,
71
+ "loss": 0.2037,
72
+ "step": 450
73
+ },
74
+ {
75
+ "epoch": 0.33200531208499334,
76
+ "grad_norm": 4.029483318328857,
77
+ "learning_rate": 1.240746854184557e-06,
78
+ "loss": 0.2054,
79
+ "step": 500
80
+ },
81
+ {
82
+ "epoch": 0.3652058432934927,
83
+ "grad_norm": 4.573270797729492,
84
+ "learning_rate": 1.2364860476798024e-06,
85
+ "loss": 0.1555,
86
+ "step": 550
87
+ },
88
+ {
89
+ "epoch": 0.398406374501992,
90
+ "grad_norm": 15.748998641967773,
91
+ "learning_rate": 1.2322252411750478e-06,
92
+ "loss": 0.1486,
93
+ "step": 600
94
+ },
95
+ {
96
+ "epoch": 0.4316069057104914,
97
+ "grad_norm": 12.240307807922363,
98
+ "learning_rate": 1.227964434670293e-06,
99
+ "loss": 0.1552,
100
+ "step": 650
101
+ },
102
+ {
103
+ "epoch": 0.4648074369189907,
104
+ "grad_norm": 17.192546844482422,
105
+ "learning_rate": 1.2237036281655385e-06,
106
+ "loss": 0.1234,
107
+ "step": 700
108
+ },
109
+ {
110
+ "epoch": 0.49800796812749004,
111
+ "grad_norm": 11.04953670501709,
112
+ "learning_rate": 1.2194428216607839e-06,
113
+ "loss": 0.1212,
114
+ "step": 750
115
+ },
116
+ {
117
+ "epoch": 0.5312084993359893,
118
+ "grad_norm": 4.883615016937256,
119
+ "learning_rate": 1.215182015156029e-06,
120
+ "loss": 0.1059,
121
+ "step": 800
122
+ },
123
+ {
124
+ "epoch": 0.5644090305444888,
125
+ "grad_norm": 4.633565425872803,
126
+ "learning_rate": 1.2109212086512745e-06,
127
+ "loss": 0.0788,
128
+ "step": 850
129
+ },
130
+ {
131
+ "epoch": 0.5976095617529881,
132
+ "grad_norm": 2.6228833198547363,
133
+ "learning_rate": 1.20666040214652e-06,
134
+ "loss": 0.087,
135
+ "step": 900
136
+ },
137
+ {
138
+ "epoch": 0.6308100929614874,
139
+ "grad_norm": 6.4782915115356445,
140
+ "learning_rate": 1.2023995956417651e-06,
141
+ "loss": 0.0802,
142
+ "step": 950
143
+ },
144
+ {
145
+ "epoch": 0.6640106241699867,
146
+ "grad_norm": 5.229304313659668,
147
+ "learning_rate": 1.1981387891370103e-06,
148
+ "loss": 0.077,
149
+ "step": 1000
150
+ },
151
+ {
152
+ "epoch": 0.6972111553784861,
153
+ "grad_norm": 6.034313201904297,
154
+ "learning_rate": 1.1938779826322558e-06,
155
+ "loss": 0.0703,
156
+ "step": 1050
157
+ },
158
+ {
159
+ "epoch": 0.7304116865869854,
160
+ "grad_norm": 9.29736614227295,
161
+ "learning_rate": 1.1896171761275012e-06,
162
+ "loss": 0.066,
163
+ "step": 1100
164
+ },
165
+ {
166
+ "epoch": 0.7636122177954847,
167
+ "grad_norm": 0.6172637343406677,
168
+ "learning_rate": 1.1853563696227464e-06,
169
+ "loss": 0.0692,
170
+ "step": 1150
171
+ },
172
+ {
173
+ "epoch": 0.796812749003984,
174
+ "grad_norm": 1.642548680305481,
175
+ "learning_rate": 1.1810955631179918e-06,
176
+ "loss": 0.0437,
177
+ "step": 1200
178
+ },
179
+ {
180
+ "epoch": 0.8300132802124834,
181
+ "grad_norm": 3.888737916946411,
182
+ "learning_rate": 1.176834756613237e-06,
183
+ "loss": 0.0474,
184
+ "step": 1250
185
+ },
186
+ {
187
+ "epoch": 0.8632138114209827,
188
+ "grad_norm": 14.787779808044434,
189
+ "learning_rate": 1.1725739501084824e-06,
190
+ "loss": 0.0501,
191
+ "step": 1300
192
+ },
193
+ {
194
+ "epoch": 0.896414342629482,
195
+ "grad_norm": 0.8571153283119202,
196
+ "learning_rate": 1.1683131436037278e-06,
197
+ "loss": 0.0439,
198
+ "step": 1350
199
+ },
200
+ {
201
+ "epoch": 0.9296148738379814,
202
+ "grad_norm": 0.6915457248687744,
203
+ "learning_rate": 1.164052337098973e-06,
204
+ "loss": 0.0455,
205
+ "step": 1400
206
+ },
207
+ {
208
+ "epoch": 0.9628154050464808,
209
+ "grad_norm": 8.8081636428833,
210
+ "learning_rate": 1.1597915305942185e-06,
211
+ "loss": 0.0347,
212
+ "step": 1450
213
+ },
214
+ {
215
+ "epoch": 0.9960159362549801,
216
+ "grad_norm": 8.551522254943848,
217
+ "learning_rate": 1.1555307240894639e-06,
218
+ "loss": 0.0346,
219
+ "step": 1500
220
+ },
221
+ {
222
+ "epoch": 1.0,
223
+ "eval_accuracy": 0.982824427480916,
224
+ "eval_f1": 0.9838970307302017,
225
+ "eval_loss": 0.05475565418601036,
226
+ "eval_precision": 0.986134299459291,
227
+ "eval_recall": 0.982824427480916,
228
+ "eval_runtime": 31.8933,
229
+ "eval_samples_per_second": 262.877,
230
+ "eval_steps_per_second": 8.215,
231
+ "step": 1506
232
+ },
233
+ {
234
+ "epoch": 1.0292164674634794,
235
+ "grad_norm": 13.078969955444336,
236
+ "learning_rate": 1.151269917584709e-06,
237
+ "loss": 0.0379,
238
+ "step": 1550
239
+ },
240
+ {
241
+ "epoch": 1.0624169986719787,
242
+ "grad_norm": 1.906078815460205,
243
+ "learning_rate": 1.1470091110799545e-06,
244
+ "loss": 0.0338,
245
+ "step": 1600
246
+ },
247
+ {
248
+ "epoch": 1.095617529880478,
249
+ "grad_norm": 0.4020080864429474,
250
+ "learning_rate": 1.1427483045752e-06,
251
+ "loss": 0.0298,
252
+ "step": 1650
253
+ },
254
+ {
255
+ "epoch": 1.1288180610889773,
256
+ "grad_norm": 2.647258758544922,
257
+ "learning_rate": 1.1384874980704451e-06,
258
+ "loss": 0.023,
259
+ "step": 1700
260
+ },
261
+ {
262
+ "epoch": 1.1620185922974768,
263
+ "grad_norm": 2.046747922897339,
264
+ "learning_rate": 1.1342266915656906e-06,
265
+ "loss": 0.0253,
266
+ "step": 1750
267
+ },
268
+ {
269
+ "epoch": 1.1952191235059761,
270
+ "grad_norm": 13.14510726928711,
271
+ "learning_rate": 1.129965885060936e-06,
272
+ "loss": 0.0268,
273
+ "step": 1800
274
+ },
275
+ {
276
+ "epoch": 1.2284196547144755,
277
+ "grad_norm": 0.12764006853103638,
278
+ "learning_rate": 1.1257050785561812e-06,
279
+ "loss": 0.0099,
280
+ "step": 1850
281
+ },
282
+ {
283
+ "epoch": 1.2616201859229748,
284
+ "grad_norm": 1.6261545419692993,
285
+ "learning_rate": 1.1214442720514266e-06,
286
+ "loss": 0.0252,
287
+ "step": 1900
288
+ },
289
+ {
290
+ "epoch": 1.294820717131474,
291
+ "grad_norm": 5.552518844604492,
292
+ "learning_rate": 1.117183465546672e-06,
293
+ "loss": 0.036,
294
+ "step": 1950
295
+ },
296
+ {
297
+ "epoch": 1.3280212483399734,
298
+ "grad_norm": 24.064516067504883,
299
+ "learning_rate": 1.1129226590419172e-06,
300
+ "loss": 0.0169,
301
+ "step": 2000
302
+ },
303
+ {
304
+ "epoch": 1.361221779548473,
305
+ "grad_norm": 0.00925782322883606,
306
+ "learning_rate": 1.1086618525371626e-06,
307
+ "loss": 0.0184,
308
+ "step": 2050
309
+ },
310
+ {
311
+ "epoch": 1.3944223107569722,
312
+ "grad_norm": 16.54283905029297,
313
+ "learning_rate": 1.1044010460324078e-06,
314
+ "loss": 0.0139,
315
+ "step": 2100
316
+ },
317
+ {
318
+ "epoch": 1.4276228419654715,
319
+ "grad_norm": 0.24406713247299194,
320
+ "learning_rate": 1.1001402395276533e-06,
321
+ "loss": 0.0126,
322
+ "step": 2150
323
+ },
324
+ {
325
+ "epoch": 1.4608233731739708,
326
+ "grad_norm": 0.02731563337147236,
327
+ "learning_rate": 1.0958794330228987e-06,
328
+ "loss": 0.0198,
329
+ "step": 2200
330
+ },
331
+ {
332
+ "epoch": 1.4940239043824701,
333
+ "grad_norm": 17.53055191040039,
334
+ "learning_rate": 1.0916186265181439e-06,
335
+ "loss": 0.0303,
336
+ "step": 2250
337
+ },
338
+ {
339
+ "epoch": 1.5272244355909694,
340
+ "grad_norm": 0.07282107323408127,
341
+ "learning_rate": 1.0873578200133893e-06,
342
+ "loss": 0.0016,
343
+ "step": 2300
344
+ },
345
+ {
346
+ "epoch": 1.5604249667994687,
347
+ "grad_norm": 20.794416427612305,
348
+ "learning_rate": 1.0830970135086347e-06,
349
+ "loss": 0.0225,
350
+ "step": 2350
351
+ },
352
+ {
353
+ "epoch": 1.593625498007968,
354
+ "grad_norm": 0.052418053150177,
355
+ "learning_rate": 1.07883620700388e-06,
356
+ "loss": 0.0076,
357
+ "step": 2400
358
+ },
359
+ {
360
+ "epoch": 1.6268260292164674,
361
+ "grad_norm": 0.21063362061977386,
362
+ "learning_rate": 1.0745754004991254e-06,
363
+ "loss": 0.0159,
364
+ "step": 2450
365
+ },
366
+ {
367
+ "epoch": 1.6600265604249667,
368
+ "grad_norm": 10.455537796020508,
369
+ "learning_rate": 1.0703145939943708e-06,
370
+ "loss": 0.0105,
371
+ "step": 2500
372
+ },
373
+ {
374
+ "epoch": 1.6932270916334662,
375
+ "grad_norm": 6.205326557159424,
376
+ "learning_rate": 1.066053787489616e-06,
377
+ "loss": 0.0081,
378
+ "step": 2550
379
+ },
380
+ {
381
+ "epoch": 1.7264276228419655,
382
+ "grad_norm": 6.523694038391113,
383
+ "learning_rate": 1.0617929809848614e-06,
384
+ "loss": 0.0159,
385
+ "step": 2600
386
+ },
387
+ {
388
+ "epoch": 1.7596281540504648,
389
+ "grad_norm": 0.010043232701718807,
390
+ "learning_rate": 1.0575321744801068e-06,
391
+ "loss": 0.0113,
392
+ "step": 2650
393
+ },
394
+ {
395
+ "epoch": 1.792828685258964,
396
+ "grad_norm": 0.00458578672260046,
397
+ "learning_rate": 1.053271367975352e-06,
398
+ "loss": 0.0086,
399
+ "step": 2700
400
+ },
401
+ {
402
+ "epoch": 1.8260292164674636,
403
+ "grad_norm": 0.10986531525850296,
404
+ "learning_rate": 1.0490105614705974e-06,
405
+ "loss": 0.008,
406
+ "step": 2750
407
+ },
408
+ {
409
+ "epoch": 1.859229747675963,
410
+ "grad_norm": 0.12284637242555618,
411
+ "learning_rate": 1.0447497549658429e-06,
412
+ "loss": 0.0052,
413
+ "step": 2800
414
+ },
415
+ {
416
+ "epoch": 1.8924302788844622,
417
+ "grad_norm": 0.14606119692325592,
418
+ "learning_rate": 1.040488948461088e-06,
419
+ "loss": 0.0176,
420
+ "step": 2850
421
+ },
422
+ {
423
+ "epoch": 1.9256308100929616,
424
+ "grad_norm": 0.020491423085331917,
425
+ "learning_rate": 1.0362281419563333e-06,
426
+ "loss": 0.0102,
427
+ "step": 2900
428
+ },
429
+ {
430
+ "epoch": 1.9588313413014609,
431
+ "grad_norm": 0.05764462426304817,
432
+ "learning_rate": 1.0319673354515787e-06,
433
+ "loss": 0.0044,
434
+ "step": 2950
435
+ },
436
+ {
437
+ "epoch": 1.9920318725099602,
438
+ "grad_norm": 0.7329011559486389,
439
+ "learning_rate": 1.027706528946824e-06,
440
+ "loss": 0.0139,
441
+ "step": 3000
442
+ },
443
+ {
444
+ "epoch": 2.0,
445
+ "eval_accuracy": 0.9924856870229007,
446
+ "eval_f1": 0.9924235722235019,
447
+ "eval_loss": 0.0418265163898468,
448
+ "eval_precision": 0.9923830636545329,
449
+ "eval_recall": 0.9924856870229007,
450
+ "eval_runtime": 31.6222,
451
+ "eval_samples_per_second": 265.131,
452
+ "eval_steps_per_second": 8.285,
453
+ "step": 3012
454
+ }
455
+ ],
456
+ "logging_steps": 50,
457
+ "max_steps": 15060,
458
+ "num_input_tokens_seen": 0,
459
+ "num_train_epochs": 10,
460
+ "save_steps": 500,
461
+ "stateful_callbacks": {
462
+ "TrainerControl": {
463
+ "args": {
464
+ "should_epoch_stop": false,
465
+ "should_evaluate": false,
466
+ "should_log": false,
467
+ "should_save": true,
468
+ "should_training_stop": false
469
+ },
470
+ "attributes": {}
471
+ }
472
+ },
473
+ "total_flos": 3.282861088518144e+16,
474
+ "train_batch_size": 32,
475
+ "trial_name": null,
476
+ "trial_params": null
477
+ }
trial-5/checkpoint-3012/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b5a07ff58876babfad1d92462cc9e7062c8f5b0af8d8ba9142ab6f5e8880cf2
3
+ size 5368
trial-6/checkpoint-6022/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "answerdotai/ModernBERT-base",
3
+ "architectures": [
4
+ "ModernBertForSequenceClassification"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 50281,
9
+ "classifier_activation": "gelu",
10
+ "classifier_bias": false,
11
+ "classifier_dropout": 0.0,
12
+ "classifier_pooling": "mean",
13
+ "cls_token_id": 50281,
14
+ "decoder_bias": true,
15
+ "deterministic_flash_attn": false,
16
+ "embedding_dropout": 0.0,
17
+ "eos_token_id": 50282,
18
+ "global_attn_every_n_layers": 3,
19
+ "global_rope_theta": 160000.0,
20
+ "gradient_checkpointing": false,
21
+ "hidden_activation": "gelu",
22
+ "hidden_size": 768,
23
+ "initializer_cutoff_factor": 2.0,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 1152,
26
+ "layer_norm_eps": 1e-05,
27
+ "local_attention": 128,
28
+ "local_rope_theta": 10000.0,
29
+ "max_position_embeddings": 8192,
30
+ "mlp_bias": false,
31
+ "mlp_dropout": 0.0,
32
+ "model_type": "modernbert",
33
+ "norm_bias": false,
34
+ "norm_eps": 1e-05,
35
+ "num_attention_heads": 12,
36
+ "num_hidden_layers": 22,
37
+ "pad_token_id": 50283,
38
+ "position_embedding_type": "absolute",
39
+ "problem_type": "single_label_classification",
40
+ "reference_compile": true,
41
+ "sep_token_id": 50282,
42
+ "sparse_pred_ignore_index": -100,
43
+ "sparse_prediction": false,
44
+ "torch_dtype": "float32",
45
+ "transformers_version": "4.48.0.dev0",
46
+ "vocab_size": 50368
47
+ }
trial-6/checkpoint-6022/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a60e2fc558ad0e5a9a4825234c28006f4c14c02aab969b5ebf7cb43d8f890d9e
3
+ size 598439784
trial-6/checkpoint-6022/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ccfa5cc878422afe6f38c7ea21cef7e9f532ec15d2d9169693197daa8b04fb0
3
+ size 1196967418
trial-6/checkpoint-6022/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef
3
+ size 14244