thak123 committed on
Commit
2caa73c
1 Parent(s): f94c4cd

Upload 34 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,128 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual
3
+ tags:
4
+ - generated_from_trainer
5
+ datasets:
6
+ - all
7
+ metrics:
8
+ - precision
9
+ - recall
10
+ - f1
11
+ model-index:
12
+ - name: cardiffnlp-twitter-xlmr-finetuned-txtnly-all-42
13
+ results: []
14
+ ---
15
+
16
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
+ should probably proofread and complete it, then remove this comment. -->
18
+
19
+ # cardiffnlp-twitter-xlmr-finetuned-txtnly-all-42
20
+
21
+ This model is a fine-tuned version of [cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual](https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual) on the `all` dataset.
22
+ It achieves the following results on the evaluation set (taken from the best checkpoint by validation loss, step 16500 at epoch 1.98):
23
+ - Loss: 0.6972
24
+ - Precision: 0.6687
25
+ - Recall: 0.6729
26
+ - F1: 0.6703
27
+
28
+ ## Model description
29
+
30
+ More information needed
31
+
32
+ ## Intended uses & limitations
33
+
34
+ More information needed
35
+
36
+ ## Training and evaluation data
37
+
38
+ More information needed
39
+
40
+ ## Training procedure
41
+
42
+ ### Training hyperparameters
43
+
44
+ The following hyperparameters were used during training:
45
+ - learning_rate: 5e-05
46
+ - train_batch_size: 16
47
+ - eval_batch_size: 16
48
+ - seed: 42
49
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
+ - lr_scheduler_type: linear
51
+ - num_epochs: 50.0 (configured maximum; the logged run below ends at epoch 3.79)
52
+ - mixed_precision_training: Native AMP
53
+
54
+ ### Training results
55
+
56
+ | Training Loss | Epoch | Step | Validation Loss | Precision | Recall | F1 |
57
+ |:-------------:|:-----:|:-----:|:---------------:|:---------:|:------:|:------:|
58
+ | 0.6122 | 0.06 | 500 | 0.8542 | 0.6559 | 0.4905 | 0.4841 |
59
+ | 0.5497 | 0.12 | 1000 | 0.8037 | 0.7044 | 0.6070 | 0.6209 |
60
+ | 0.5404 | 0.18 | 1500 | 0.9700 | 0.5591 | 0.4176 | 0.3652 |
61
+ | 0.5165 | 0.24 | 2000 | 0.7449 | 0.7349 | 0.5297 | 0.5369 |
62
+ | 0.5136 | 0.3 | 2500 | 0.7885 | 0.6766 | 0.5025 | 0.5001 |
63
+ | 0.5072 | 0.36 | 3000 | 0.8124 | 0.6076 | 0.6132 | 0.5917 |
64
+ | 0.5011 | 0.42 | 3500 | 0.8767 | 0.6427 | 0.5987 | 0.5784 |
65
+ | 0.5021 | 0.48 | 4000 | 0.7958 | 0.6848 | 0.6362 | 0.6503 |
66
+ | 0.4946 | 0.54 | 4500 | 0.8045 | 0.7220 | 0.4968 | 0.4983 |
67
+ | 0.4928 | 0.6 | 5000 | 0.7803 | 0.7582 | 0.5381 | 0.5503 |
68
+ | 0.5008 | 0.66 | 5500 | 0.7507 | 0.4407 | 0.4798 | 0.4594 |
69
+ | 0.4966 | 0.72 | 6000 | 0.8239 | 0.6140 | 0.6767 | 0.6311 |
70
+ | 0.4791 | 0.78 | 6500 | 0.7028 | 0.6568 | 0.5206 | 0.5413 |
71
+ | 0.494 | 0.84 | 7000 | 0.8034 | 0.6660 | 0.5189 | 0.5227 |
72
+ | 0.4861 | 0.9 | 7500 | 0.9003 | 0.5781 | 0.4785 | 0.4541 |
73
+ | 0.4804 | 0.96 | 8000 | 0.7740 | 0.6239 | 0.5775 | 0.5792 |
74
+ | 0.4614 | 1.02 | 8500 | 0.7397 | 0.6848 | 0.6312 | 0.6471 |
75
+ | 0.4315 | 1.08 | 9000 | 0.7889 | 0.6642 | 0.6035 | 0.6149 |
76
+ | 0.4506 | 1.14 | 9500 | 0.8784 | 0.6387 | 0.5017 | 0.4968 |
77
+ | 0.4489 | 1.2 | 10000 | 0.7994 | 0.5340 | 0.4964 | 0.4949 |
78
+ | 0.4466 | 1.26 | 10500 | 0.8110 | 0.5776 | 0.4735 | 0.4464 |
79
+ | 0.4319 | 1.32 | 11000 | 0.8069 | 0.6612 | 0.5399 | 0.5481 |
80
+ | 0.4243 | 1.38 | 11500 | 0.7942 | 0.5948 | 0.5705 | 0.5797 |
81
+ | 0.4398 | 1.44 | 12000 | 0.9738 | 0.5370 | 0.6070 | 0.5247 |
82
+ | 0.4526 | 1.5 | 12500 | 0.7196 | 0.7046 | 0.5478 | 0.5590 |
83
+ | 0.4529 | 1.56 | 13000 | 0.8050 | 0.6419 | 0.5731 | 0.5863 |
84
+ | 0.446 | 1.62 | 13500 | 0.7564 | 0.6521 | 0.5912 | 0.6107 |
85
+ | 0.4315 | 1.68 | 14000 | 0.7515 | 0.6475 | 0.6069 | 0.6212 |
86
+ | 0.4464 | 1.74 | 14500 | 0.8308 | 0.6276 | 0.5513 | 0.5599 |
87
+ | 0.4423 | 1.8 | 15000 | 0.7982 | 0.6176 | 0.5937 | 0.5992 |
88
+ | 0.4551 | 1.86 | 15500 | 0.8223 | 0.6356 | 0.5934 | 0.6020 |
89
+ | 0.4408 | 1.92 | 16000 | 0.7691 | 0.6088 | 0.5147 | 0.5131 |
90
+ | 0.4389 | 1.98 | 16500 | 0.6972 | 0.6687 | 0.6729 | 0.6703 |
91
+ | 0.3886 | 2.04 | 17000 | 0.7798 | 0.6126 | 0.5437 | 0.5543 |
92
+ | 0.3883 | 2.1 | 17500 | 0.8385 | 0.5948 | 0.6225 | 0.5978 |
93
+ | 0.4011 | 2.16 | 18000 | 0.7755 | 0.6551 | 0.5787 | 0.5915 |
94
+ | 0.3992 | 2.22 | 18500 | 0.7886 | 0.5582 | 0.5519 | 0.5472 |
95
+ | 0.393 | 2.28 | 19000 | 0.7660 | 0.5901 | 0.5923 | 0.5889 |
96
+ | 0.3891 | 2.34 | 19500 | 0.7702 | 0.5792 | 0.5331 | 0.5354 |
97
+ | 0.4119 | 2.41 | 20000 | 0.8545 | 0.5406 | 0.5243 | 0.5111 |
98
+ | 0.3981 | 2.47 | 20500 | 0.8641 | 0.5695 | 0.5536 | 0.5364 |
99
+ | 0.4 | 2.53 | 21000 | 0.8045 | 0.5988 | 0.5845 | 0.5822 |
100
+ | 0.4059 | 2.59 | 21500 | 0.8023 | 0.6301 | 0.5549 | 0.5696 |
101
+ | 0.3805 | 2.65 | 22000 | 0.8242 | 0.5633 | 0.5363 | 0.5387 |
102
+ | 0.4126 | 2.71 | 22500 | 0.8866 | 0.5630 | 0.5244 | 0.5253 |
103
+ | 0.3959 | 2.77 | 23000 | 0.9228 | 0.6486 | 0.5570 | 0.5716 |
104
+ | 0.3972 | 2.83 | 23500 | 0.8297 | 0.6415 | 0.6336 | 0.6330 |
105
+ | 0.3779 | 2.89 | 24000 | 0.8683 | 0.6023 | 0.5920 | 0.5897 |
106
+ | 0.3951 | 2.95 | 24500 | 0.8628 | 0.5892 | 0.5116 | 0.5125 |
107
+ | 0.3916 | 3.01 | 25000 | 0.9203 | 0.6305 | 0.5026 | 0.5024 |
108
+ | 0.3524 | 3.07 | 25500 | 0.9825 | 0.6089 | 0.5039 | 0.5011 |
109
+ | 0.3332 | 3.13 | 26000 | 0.8755 | 0.5980 | 0.5712 | 0.5814 |
110
+ | 0.3517 | 3.19 | 26500 | 0.9922 | 0.6701 | 0.5941 | 0.6181 |
111
+ | 0.3534 | 3.25 | 27000 | 0.9573 | 0.5653 | 0.5175 | 0.5243 |
112
+ | 0.3544 | 3.31 | 27500 | 0.9827 | 0.5739 | 0.5531 | 0.5551 |
113
+ | 0.3526 | 3.37 | 28000 | 0.9517 | 0.6019 | 0.4737 | 0.4657 |
114
+ | 0.3448 | 3.43 | 28500 | 0.9559 | 0.5744 | 0.5138 | 0.5232 |
115
+ | 0.3662 | 3.49 | 29000 | 0.8470 | 0.6417 | 0.6176 | 0.6173 |
116
+ | 0.3502 | 3.55 | 29500 | 0.8524 | 0.6606 | 0.5776 | 0.5912 |
117
+ | 0.3733 | 3.61 | 30000 | 0.9210 | 0.5578 | 0.5555 | 0.5466 |
118
+ | 0.3424 | 3.67 | 30500 | 0.9295 | 0.5863 | 0.6100 | 0.5809 |
119
+ | 0.3591 | 3.73 | 31000 | 0.9707 | 0.5828 | 0.4769 | 0.4588 |
120
+ | 0.3634 | 3.79 | 31500 | 0.8524 | 0.6136 | 0.5681 | 0.5752 |
121
+
122
+
123
+ ### Framework versions
124
+
125
+ - Transformers 4.38.2
126
+ - Pytorch 2.2.1+cu121
127
+ - Datasets 2.18.0
128
+ - Tokenizers 0.15.2
all_results.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.79,
3
+ "eval_f1": 0.6702519892656928,
4
+ "eval_loss": 0.6971784234046936,
5
+ "eval_precision": 0.6686766810877821,
6
+ "eval_recall": 0.6729106735558349,
7
+ "eval_runtime": 5.8753,
8
+ "eval_samples_per_second": 149.44,
9
+ "eval_steps_per_second": 9.361,
10
+ "seed": 42,
11
+ "test_f1": 0.6309333302387673,
12
+ "test_loss": 0.6790701746940613,
13
+ "test_precision": 0.6301019352201087,
14
+ "test_recall": 0.6496975678367595,
15
+ "test_runtime": 36.8059,
16
+ "test_samples_per_second": 246.645,
17
+ "test_size": 9078,
18
+ "test_steps_per_second": 15.432,
19
+ "train_loss": 0.42908321610708083,
20
+ "train_runtime": 7590.5884,
21
+ "train_samples_per_second": 876.388,
22
+ "train_size": 133046,
23
+ "train_steps_per_second": 54.778,
24
+ "valid_size": 878
25
+ }
checkpoint-16500/config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual",
3
+ "architectures": [
4
+ "XLMRobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "gradient_checkpointing": false,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "id2label": {
15
+ "0": "negative",
16
+ "1": "neutral",
17
+ "2": "positive"
18
+ },
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 3072,
21
+ "label2id": {
22
+ "negative": 0,
23
+ "neutral": 1,
24
+ "positive": 2
25
+ },
26
+ "layer_norm_eps": 1e-05,
27
+ "max_position_embeddings": 514,
28
+ "model_type": "xlm-roberta",
29
+ "num_attention_heads": 12,
30
+ "num_hidden_layers": 12,
31
+ "output_past": true,
32
+ "pad_token_id": 1,
33
+ "position_embedding_type": "absolute",
34
+ "problem_type": "single_label_classification",
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.38.2",
37
+ "type_vocab_size": 1,
38
+ "use_cache": true,
39
+ "vocab_size": 250002
40
+ }
checkpoint-16500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2551fca71136ff1d6066acb7a921ef8341781d26fe6b9244440344e5e89c13d5
3
+ size 1112208084
checkpoint-16500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d3186b36b2ff77720757ef62ca4fdcc541b8828d6710d0ccd98f4c0af4033a6
3
+ size 2308666
checkpoint-16500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad655fec337ca7ca2b9595b63c533d57f097543972d21db0fa554526526da31e
3
+ size 14244
checkpoint-16500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbc8bae5b8747fc9e9b3acf56b2729b61f1bf2887bb2328ba98c1266147297d8
3
+ size 1064
checkpoint-16500/trainer_state.json ADDED
@@ -0,0 +1,615 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6971784234046936,
3
+ "best_model_checkpoint": "./cardiffnlp-twitter-xlmr-finetuned-txtnly-all-42/checkpoint-16500",
4
+ "epoch": 1.9841269841269842,
5
+ "eval_steps": 500,
6
+ "global_step": 16500,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.06,
13
+ "grad_norm": 3.298647880554199,
14
+ "learning_rate": 4.994023569023569e-05,
15
+ "loss": 0.6122,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.06,
20
+ "eval_f1": 0.4840638597456899,
21
+ "eval_loss": 0.854165256023407,
22
+ "eval_precision": 0.6558887250350466,
23
+ "eval_recall": 0.49045198529069495,
24
+ "eval_runtime": 5.9285,
25
+ "eval_samples_per_second": 148.099,
26
+ "eval_steps_per_second": 9.277,
27
+ "step": 500
28
+ },
29
+ {
30
+ "epoch": 0.12,
31
+ "grad_norm": 5.411099433898926,
32
+ "learning_rate": 4.988011063011063e-05,
33
+ "loss": 0.5497,
34
+ "step": 1000
35
+ },
36
+ {
37
+ "epoch": 0.12,
38
+ "eval_f1": 0.6209225023342669,
39
+ "eval_loss": 0.8037390112876892,
40
+ "eval_precision": 0.704421745545341,
41
+ "eval_recall": 0.6070083321696225,
42
+ "eval_runtime": 6.1691,
43
+ "eval_samples_per_second": 142.322,
44
+ "eval_steps_per_second": 8.915,
45
+ "step": 1000
46
+ },
47
+ {
48
+ "epoch": 0.18,
49
+ "grad_norm": 5.836483001708984,
50
+ "learning_rate": 4.9820105820105825e-05,
51
+ "loss": 0.5404,
52
+ "step": 1500
53
+ },
54
+ {
55
+ "epoch": 0.18,
56
+ "eval_f1": 0.3652071944289921,
57
+ "eval_loss": 0.9700150489807129,
58
+ "eval_precision": 0.5591482310679367,
59
+ "eval_recall": 0.4176288227901131,
60
+ "eval_runtime": 5.8886,
61
+ "eval_samples_per_second": 149.101,
62
+ "eval_steps_per_second": 9.34,
63
+ "step": 1500
64
+ },
65
+ {
66
+ "epoch": 0.24,
67
+ "grad_norm": 13.717193603515625,
68
+ "learning_rate": 4.975998075998076e-05,
69
+ "loss": 0.5165,
70
+ "step": 2000
71
+ },
72
+ {
73
+ "epoch": 0.24,
74
+ "eval_f1": 0.5369027892847279,
75
+ "eval_loss": 0.744874894618988,
76
+ "eval_precision": 0.7349445049700448,
77
+ "eval_recall": 0.529664385793418,
78
+ "eval_runtime": 5.996,
79
+ "eval_samples_per_second": 146.43,
80
+ "eval_steps_per_second": 9.173,
81
+ "step": 2000
82
+ },
83
+ {
84
+ "epoch": 0.3,
85
+ "grad_norm": 2.4534995555877686,
86
+ "learning_rate": 4.969997594997595e-05,
87
+ "loss": 0.5136,
88
+ "step": 2500
89
+ },
90
+ {
91
+ "epoch": 0.3,
92
+ "eval_f1": 0.5001381202499963,
93
+ "eval_loss": 0.7884698510169983,
94
+ "eval_precision": 0.6766332095394413,
95
+ "eval_recall": 0.5025275799469348,
96
+ "eval_runtime": 5.9085,
97
+ "eval_samples_per_second": 148.6,
98
+ "eval_steps_per_second": 9.309,
99
+ "step": 2500
100
+ },
101
+ {
102
+ "epoch": 0.36,
103
+ "grad_norm": 3.195244550704956,
104
+ "learning_rate": 4.963985088985089e-05,
105
+ "loss": 0.5072,
106
+ "step": 3000
107
+ },
108
+ {
109
+ "epoch": 0.36,
110
+ "eval_f1": 0.5917137619940201,
111
+ "eval_loss": 0.8123684525489807,
112
+ "eval_precision": 0.6076358199852175,
113
+ "eval_recall": 0.6132374435600242,
114
+ "eval_runtime": 6.1108,
115
+ "eval_samples_per_second": 143.68,
116
+ "eval_steps_per_second": 9.0,
117
+ "step": 3000
118
+ },
119
+ {
120
+ "epoch": 0.42,
121
+ "grad_norm": 7.579603672027588,
122
+ "learning_rate": 4.957972582972583e-05,
123
+ "loss": 0.5011,
124
+ "step": 3500
125
+ },
126
+ {
127
+ "epoch": 0.42,
128
+ "eval_f1": 0.578405909718061,
129
+ "eval_loss": 0.8767459392547607,
130
+ "eval_precision": 0.642659899090607,
131
+ "eval_recall": 0.5987143322627193,
132
+ "eval_runtime": 6.1563,
133
+ "eval_samples_per_second": 142.618,
134
+ "eval_steps_per_second": 8.934,
135
+ "step": 3500
136
+ },
137
+ {
138
+ "epoch": 0.48,
139
+ "grad_norm": 3.266787052154541,
140
+ "learning_rate": 4.951960076960077e-05,
141
+ "loss": 0.5021,
142
+ "step": 4000
143
+ },
144
+ {
145
+ "epoch": 0.48,
146
+ "eval_f1": 0.6502990015105321,
147
+ "eval_loss": 0.7957776784896851,
148
+ "eval_precision": 0.6847923256926328,
149
+ "eval_recall": 0.636192338127822,
150
+ "eval_runtime": 6.5221,
151
+ "eval_samples_per_second": 134.618,
152
+ "eval_steps_per_second": 8.433,
153
+ "step": 4000
154
+ },
155
+ {
156
+ "epoch": 0.54,
157
+ "grad_norm": 6.044332027435303,
158
+ "learning_rate": 4.945959595959596e-05,
159
+ "loss": 0.4946,
160
+ "step": 4500
161
+ },
162
+ {
163
+ "epoch": 0.54,
164
+ "eval_f1": 0.4982912515017284,
165
+ "eval_loss": 0.8045271039009094,
166
+ "eval_precision": 0.7220405815528763,
167
+ "eval_recall": 0.4968300516687614,
168
+ "eval_runtime": 6.1928,
169
+ "eval_samples_per_second": 141.778,
170
+ "eval_steps_per_second": 8.881,
171
+ "step": 4500
172
+ },
173
+ {
174
+ "epoch": 0.6,
175
+ "grad_norm": 5.152063846588135,
176
+ "learning_rate": 4.93994708994709e-05,
177
+ "loss": 0.4928,
178
+ "step": 5000
179
+ },
180
+ {
181
+ "epoch": 0.6,
182
+ "eval_f1": 0.550273048506264,
183
+ "eval_loss": 0.780342698097229,
184
+ "eval_precision": 0.7581894624319455,
185
+ "eval_recall": 0.5380887213145278,
186
+ "eval_runtime": 6.123,
187
+ "eval_samples_per_second": 143.395,
188
+ "eval_steps_per_second": 8.983,
189
+ "step": 5000
190
+ },
191
+ {
192
+ "epoch": 0.66,
193
+ "grad_norm": 4.54200553894043,
194
+ "learning_rate": 4.933934583934584e-05,
195
+ "loss": 0.5008,
196
+ "step": 5500
197
+ },
198
+ {
199
+ "epoch": 0.66,
200
+ "eval_f1": 0.4594232264185665,
201
+ "eval_loss": 0.7507085204124451,
202
+ "eval_precision": 0.44070483572560937,
203
+ "eval_recall": 0.47984452823162504,
204
+ "eval_runtime": 5.932,
205
+ "eval_samples_per_second": 148.011,
206
+ "eval_steps_per_second": 9.272,
207
+ "step": 5500
208
+ },
209
+ {
210
+ "epoch": 0.72,
211
+ "grad_norm": 4.075632095336914,
212
+ "learning_rate": 4.927922077922078e-05,
213
+ "loss": 0.4966,
214
+ "step": 6000
215
+ },
216
+ {
217
+ "epoch": 0.72,
218
+ "eval_f1": 0.6310991936984806,
219
+ "eval_loss": 0.8238988518714905,
220
+ "eval_precision": 0.6139657275796522,
221
+ "eval_recall": 0.6767434715821813,
222
+ "eval_runtime": 5.8918,
223
+ "eval_samples_per_second": 149.02,
224
+ "eval_steps_per_second": 9.335,
225
+ "step": 6000
226
+ },
227
+ {
228
+ "epoch": 0.78,
229
+ "grad_norm": 4.8725104331970215,
230
+ "learning_rate": 4.921909571909572e-05,
231
+ "loss": 0.4791,
232
+ "step": 6500
233
+ },
234
+ {
235
+ "epoch": 0.78,
236
+ "eval_f1": 0.5412559573187593,
237
+ "eval_loss": 0.7028306722640991,
238
+ "eval_precision": 0.6567775474615866,
239
+ "eval_recall": 0.520631196760229,
240
+ "eval_runtime": 6.3113,
241
+ "eval_samples_per_second": 139.116,
242
+ "eval_steps_per_second": 8.715,
243
+ "step": 6500
244
+ },
245
+ {
246
+ "epoch": 0.84,
247
+ "grad_norm": 1.4915893077850342,
248
+ "learning_rate": 4.915897065897066e-05,
249
+ "loss": 0.494,
250
+ "step": 7000
251
+ },
252
+ {
253
+ "epoch": 0.84,
254
+ "eval_f1": 0.5227267406470947,
255
+ "eval_loss": 0.8033522367477417,
256
+ "eval_precision": 0.6660302960734323,
257
+ "eval_recall": 0.5188623562817111,
258
+ "eval_runtime": 6.1252,
259
+ "eval_samples_per_second": 143.342,
260
+ "eval_steps_per_second": 8.979,
261
+ "step": 7000
262
+ },
263
+ {
264
+ "epoch": 0.9,
265
+ "grad_norm": 2.151014804840088,
266
+ "learning_rate": 4.90988455988456e-05,
267
+ "loss": 0.4861,
268
+ "step": 7500
269
+ },
270
+ {
271
+ "epoch": 0.9,
272
+ "eval_f1": 0.4541201667750796,
273
+ "eval_loss": 0.900325357913971,
274
+ "eval_precision": 0.5780562441152168,
275
+ "eval_recall": 0.4784564539403249,
276
+ "eval_runtime": 6.144,
277
+ "eval_samples_per_second": 142.903,
278
+ "eval_steps_per_second": 8.952,
279
+ "step": 7500
280
+ },
281
+ {
282
+ "epoch": 0.96,
283
+ "grad_norm": 4.770496368408203,
284
+ "learning_rate": 4.903872053872054e-05,
285
+ "loss": 0.4804,
286
+ "step": 8000
287
+ },
288
+ {
289
+ "epoch": 0.96,
290
+ "eval_f1": 0.5791890202588422,
291
+ "eval_loss": 0.773960530757904,
292
+ "eval_precision": 0.6238945275403609,
293
+ "eval_recall": 0.5775003491132523,
294
+ "eval_runtime": 6.556,
295
+ "eval_samples_per_second": 133.923,
296
+ "eval_steps_per_second": 8.389,
297
+ "step": 8000
298
+ },
299
+ {
300
+ "epoch": 1.02,
301
+ "grad_norm": 2.520460367202759,
302
+ "learning_rate": 4.897859547859548e-05,
303
+ "loss": 0.4614,
304
+ "step": 8500
305
+ },
306
+ {
307
+ "epoch": 1.02,
308
+ "eval_f1": 0.6470888284841774,
309
+ "eval_loss": 0.7397181391716003,
310
+ "eval_precision": 0.6848151355984641,
311
+ "eval_recall": 0.6312358609132803,
312
+ "eval_runtime": 6.1813,
313
+ "eval_samples_per_second": 142.042,
314
+ "eval_steps_per_second": 8.898,
315
+ "step": 8500
316
+ },
317
+ {
318
+ "epoch": 1.08,
319
+ "grad_norm": 4.375688552856445,
320
+ "learning_rate": 4.891847041847042e-05,
321
+ "loss": 0.4315,
322
+ "step": 9000
323
+ },
324
+ {
325
+ "epoch": 1.08,
326
+ "eval_f1": 0.614857769662433,
327
+ "eval_loss": 0.788919985294342,
328
+ "eval_precision": 0.6641593406916259,
329
+ "eval_recall": 0.6034743750872783,
330
+ "eval_runtime": 6.1798,
331
+ "eval_samples_per_second": 142.076,
332
+ "eval_steps_per_second": 8.9,
333
+ "step": 9000
334
+ },
335
+ {
336
+ "epoch": 1.14,
337
+ "grad_norm": 4.091088771820068,
338
+ "learning_rate": 4.885834535834536e-05,
339
+ "loss": 0.4506,
340
+ "step": 9500
341
+ },
342
+ {
343
+ "epoch": 1.14,
344
+ "eval_f1": 0.4967964786589283,
345
+ "eval_loss": 0.8783875703811646,
346
+ "eval_precision": 0.6387377173091459,
347
+ "eval_recall": 0.5016645719871526,
348
+ "eval_runtime": 5.9164,
349
+ "eval_samples_per_second": 148.401,
350
+ "eval_steps_per_second": 9.296,
351
+ "step": 9500
352
+ },
353
+ {
354
+ "epoch": 1.2,
355
+ "grad_norm": 3.3903276920318604,
356
+ "learning_rate": 4.87982202982203e-05,
357
+ "loss": 0.4489,
358
+ "step": 10000
359
+ },
360
+ {
361
+ "epoch": 1.2,
362
+ "eval_f1": 0.4949153076705755,
363
+ "eval_loss": 0.7994188070297241,
364
+ "eval_precision": 0.5340329579250159,
365
+ "eval_recall": 0.49638597961178615,
366
+ "eval_runtime": 5.9029,
367
+ "eval_samples_per_second": 148.74,
368
+ "eval_steps_per_second": 9.317,
369
+ "step": 10000
370
+ },
371
+ {
372
+ "epoch": 1.26,
373
+ "grad_norm": 3.929879903793335,
374
+ "learning_rate": 4.8738095238095235e-05,
375
+ "loss": 0.4466,
376
+ "step": 10500
377
+ },
378
+ {
379
+ "epoch": 1.26,
380
+ "eval_f1": 0.44642812881455524,
381
+ "eval_loss": 0.8109920024871826,
382
+ "eval_precision": 0.5776119229607602,
383
+ "eval_recall": 0.47351207931853095,
384
+ "eval_runtime": 5.9766,
385
+ "eval_samples_per_second": 146.907,
386
+ "eval_steps_per_second": 9.203,
387
+ "step": 10500
388
+ },
389
+ {
390
+ "epoch": 1.32,
391
+ "grad_norm": 6.443171501159668,
392
+ "learning_rate": 4.8677970177970176e-05,
393
+ "loss": 0.4319,
394
+ "step": 11000
395
+ },
396
+ {
397
+ "epoch": 1.32,
398
+ "eval_f1": 0.5481427288492505,
399
+ "eval_loss": 0.8068605661392212,
400
+ "eval_precision": 0.6612496177619213,
401
+ "eval_recall": 0.5399497276916632,
402
+ "eval_runtime": 5.9001,
403
+ "eval_samples_per_second": 148.811,
404
+ "eval_steps_per_second": 9.322,
405
+ "step": 11000
406
+ },
407
+ {
408
+ "epoch": 1.38,
409
+ "grad_norm": 7.633645057678223,
410
+ "learning_rate": 4.8617845117845116e-05,
411
+ "loss": 0.4243,
412
+ "step": 11500
413
+ },
414
+ {
415
+ "epoch": 1.38,
416
+ "eval_f1": 0.5797306372413114,
417
+ "eval_loss": 0.7941620349884033,
418
+ "eval_precision": 0.5948358635007136,
419
+ "eval_recall": 0.5704752595075175,
420
+ "eval_runtime": 6.145,
421
+ "eval_samples_per_second": 142.881,
422
+ "eval_steps_per_second": 8.95,
423
+ "step": 11500
424
+ },
425
+ {
426
+ "epoch": 1.44,
427
+ "grad_norm": 3.275371789932251,
428
+ "learning_rate": 4.8557720057720056e-05,
429
+ "loss": 0.4398,
430
+ "step": 12000
431
+ },
432
+ {
433
+ "epoch": 1.44,
434
+ "eval_f1": 0.5247242844808815,
435
+ "eval_loss": 0.9738017916679382,
436
+ "eval_precision": 0.5370369073777802,
437
+ "eval_recall": 0.6070139179816599,
438
+ "eval_runtime": 6.219,
439
+ "eval_samples_per_second": 141.18,
440
+ "eval_steps_per_second": 8.844,
441
+ "step": 12000
442
+ },
443
+ {
444
+ "epoch": 1.5,
445
+ "grad_norm": 2.4162724018096924,
446
+ "learning_rate": 4.8497594997595e-05,
447
+ "loss": 0.4526,
448
+ "step": 12500
449
+ },
450
+ {
451
+ "epoch": 1.5,
452
+ "eval_f1": 0.5589742980399895,
453
+ "eval_loss": 0.7195601463317871,
454
+ "eval_precision": 0.7046240283838195,
455
+ "eval_recall": 0.5477959316668994,
456
+ "eval_runtime": 6.3918,
457
+ "eval_samples_per_second": 137.363,
458
+ "eval_steps_per_second": 8.605,
459
+ "step": 12500
460
+ },
461
+ {
462
+ "epoch": 1.56,
463
+ "grad_norm": 6.926381587982178,
464
+ "learning_rate": 4.8437469937469944e-05,
465
+ "loss": 0.4529,
466
+ "step": 13000
467
+ },
468
+ {
469
+ "epoch": 1.56,
470
+ "eval_f1": 0.5863097712686139,
471
+ "eval_loss": 0.8049713969230652,
472
+ "eval_precision": 0.6419448505612538,
473
+ "eval_recall": 0.5730605595121724,
474
+ "eval_runtime": 6.3636,
475
+ "eval_samples_per_second": 137.971,
476
+ "eval_steps_per_second": 8.643,
477
+ "step": 13000
478
+ },
479
+ {
480
+ "epoch": 1.62,
481
+ "grad_norm": 1.8420650959014893,
482
+ "learning_rate": 4.837746512746513e-05,
483
+ "loss": 0.446,
484
+ "step": 13500
485
+ },
486
+ {
487
+ "epoch": 1.62,
488
+ "eval_f1": 0.6107236144330398,
489
+ "eval_loss": 0.7564206719398499,
490
+ "eval_precision": 0.6520992658162544,
491
+ "eval_recall": 0.5912358609132803,
492
+ "eval_runtime": 6.4128,
493
+ "eval_samples_per_second": 136.914,
494
+ "eval_steps_per_second": 8.577,
495
+ "step": 13500
496
+ },
497
+ {
498
+ "epoch": 1.68,
499
+ "grad_norm": 2.423569679260254,
500
+ "learning_rate": 4.831746031746032e-05,
501
+ "loss": 0.4315,
502
+ "step": 14000
503
+ },
504
+ {
505
+ "epoch": 1.68,
506
+ "eval_f1": 0.621245910301715,
507
+ "eval_loss": 0.751511812210083,
508
+ "eval_precision": 0.6474767054531395,
509
+ "eval_recall": 0.6069198901456967,
510
+ "eval_runtime": 5.9833,
511
+ "eval_samples_per_second": 146.741,
512
+ "eval_steps_per_second": 9.192,
513
+ "step": 14000
514
+ },
515
+ {
516
+ "epoch": 1.74,
517
+ "grad_norm": 6.773381233215332,
518
+ "learning_rate": 4.825733525733526e-05,
519
+ "loss": 0.4464,
520
+ "step": 14500
521
+ },
522
+ {
523
+ "epoch": 1.74,
524
+ "eval_f1": 0.559868694735591,
525
+ "eval_loss": 0.8307517170906067,
526
+ "eval_precision": 0.627583612882644,
527
+ "eval_recall": 0.5512991667830377,
528
+ "eval_runtime": 6.1679,
529
+ "eval_samples_per_second": 142.35,
530
+ "eval_steps_per_second": 8.917,
531
+ "step": 14500
532
+ },
533
+ {
534
+ "epoch": 1.8,
535
+ "grad_norm": 6.220128059387207,
536
+ "learning_rate": 4.8197330447330455e-05,
537
+ "loss": 0.4423,
538
+ "step": 15000
539
+ },
540
+ {
541
+ "epoch": 1.8,
542
+ "eval_f1": 0.5991996711711277,
543
+ "eval_loss": 0.798150360584259,
544
+ "eval_precision": 0.6176196711770697,
545
+ "eval_recall": 0.5936535865568123,
546
+ "eval_runtime": 6.0738,
547
+ "eval_samples_per_second": 144.556,
548
+ "eval_steps_per_second": 9.055,
549
+ "step": 15000
550
+ },
551
+ {
552
+ "epoch": 1.86,
553
+ "grad_norm": 1.1065833568572998,
554
+ "learning_rate": 4.8137205387205395e-05,
555
+ "loss": 0.4551,
556
+ "step": 15500
557
+ },
558
+ {
559
+ "epoch": 1.86,
560
+ "eval_f1": 0.6019748538222912,
561
+ "eval_loss": 0.822293221950531,
562
+ "eval_precision": 0.6355921902599784,
563
+ "eval_recall": 0.5933528836754642,
564
+ "eval_runtime": 6.1197,
565
+ "eval_samples_per_second": 143.472,
566
+ "eval_steps_per_second": 8.987,
567
+ "step": 15500
568
+ },
569
+ {
570
+ "epoch": 1.92,
571
+ "grad_norm": 8.631648063659668,
572
+ "learning_rate": 4.807708032708033e-05,
573
+ "loss": 0.4408,
574
+ "step": 16000
575
+ },
576
+ {
577
+ "epoch": 1.92,
578
+ "eval_f1": 0.5131249172090748,
579
+ "eval_loss": 0.7691208124160767,
580
+ "eval_precision": 0.608759764068229,
581
+ "eval_recall": 0.5147484057161477,
582
+ "eval_runtime": 6.3609,
583
+ "eval_samples_per_second": 138.031,
584
+ "eval_steps_per_second": 8.647,
585
+ "step": 16000
586
+ },
587
+ {
588
+ "epoch": 1.98,
589
+ "grad_norm": 6.755849361419678,
590
+ "learning_rate": 4.801695526695527e-05,
591
+ "loss": 0.4389,
592
+ "step": 16500
593
+ },
594
+ {
595
+ "epoch": 1.98,
596
+ "eval_f1": 0.6702519892656928,
597
+ "eval_loss": 0.6971784234046936,
598
+ "eval_precision": 0.6686766810877821,
599
+ "eval_recall": 0.6729106735558349,
600
+ "eval_runtime": 6.1341,
601
+ "eval_samples_per_second": 143.134,
602
+ "eval_steps_per_second": 8.966,
603
+ "step": 16500
604
+ }
605
+ ],
606
+ "logging_steps": 500,
607
+ "max_steps": 415800,
608
+ "num_input_tokens_seen": 0,
609
+ "num_train_epochs": 50,
610
+ "save_steps": 500,
611
+ "total_flos": 6.945931114601472e+16,
612
+ "train_batch_size": 16,
613
+ "trial_name": null,
614
+ "trial_params": null
615
+ }
checkpoint-16500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06f5c815d879e81b472ce52bc0bc6843f93b2e6b15b32a07dfe75c40e595456a
3
+ size 4920
checkpoint-31000/config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual",
3
+ "architectures": [
4
+ "XLMRobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "gradient_checkpointing": false,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "id2label": {
15
+ "0": "negative",
16
+ "1": "neutral",
17
+ "2": "positive"
18
+ },
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 3072,
21
+ "label2id": {
22
+ "negative": 0,
23
+ "neutral": 1,
24
+ "positive": 2
25
+ },
26
+ "layer_norm_eps": 1e-05,
27
+ "max_position_embeddings": 514,
28
+ "model_type": "xlm-roberta",
29
+ "num_attention_heads": 12,
30
+ "num_hidden_layers": 12,
31
+ "output_past": true,
32
+ "pad_token_id": 1,
33
+ "position_embedding_type": "absolute",
34
+ "problem_type": "single_label_classification",
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.38.2",
37
+ "type_vocab_size": 1,
38
+ "use_cache": true,
39
+ "vocab_size": 250002
40
+ }
checkpoint-31000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:384b65e5479b79cadd94fa66df1804fae4a22f5621ccc0504be940f791e5a83b
3
+ size 1112208084
checkpoint-31000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0013ddab46eb667d57a2c2f9d5e475f8c9fed64c8ea1f06e0fe3f3b3bfb9f576
3
+ size 2308666
checkpoint-31000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42acac852774fa7d87a3629ee864fe78f5fe1f644e34daa22d44b6595770ee18
3
+ size 14244
checkpoint-31000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40440e9428471f311084cafb4a31702072a8153131d834f0b8b395a77a216cba
3
+ size 1064
checkpoint-31000/trainer_state.json ADDED
@@ -0,0 +1,1137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6971784234046936,
3
+ "best_model_checkpoint": "./cardiffnlp-twitter-xlmr-finetuned-txtnly-all-42/checkpoint-16500",
4
+ "epoch": 3.727753727753728,
5
+ "eval_steps": 500,
6
+ "global_step": 31000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.06,
13
+ "grad_norm": 3.298647880554199,
14
+ "learning_rate": 4.994023569023569e-05,
15
+ "loss": 0.6122,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.06,
20
+ "eval_f1": 0.4840638597456899,
21
+ "eval_loss": 0.854165256023407,
22
+ "eval_precision": 0.6558887250350466,
23
+ "eval_recall": 0.49045198529069495,
24
+ "eval_runtime": 5.9285,
25
+ "eval_samples_per_second": 148.099,
26
+ "eval_steps_per_second": 9.277,
27
+ "step": 500
28
+ },
29
+ {
30
+ "epoch": 0.12,
31
+ "grad_norm": 5.411099433898926,
32
+ "learning_rate": 4.988011063011063e-05,
33
+ "loss": 0.5497,
34
+ "step": 1000
35
+ },
36
+ {
37
+ "epoch": 0.12,
38
+ "eval_f1": 0.6209225023342669,
39
+ "eval_loss": 0.8037390112876892,
40
+ "eval_precision": 0.704421745545341,
41
+ "eval_recall": 0.6070083321696225,
42
+ "eval_runtime": 6.1691,
43
+ "eval_samples_per_second": 142.322,
44
+ "eval_steps_per_second": 8.915,
45
+ "step": 1000
46
+ },
47
+ {
48
+ "epoch": 0.18,
49
+ "grad_norm": 5.836483001708984,
50
+ "learning_rate": 4.9820105820105825e-05,
51
+ "loss": 0.5404,
52
+ "step": 1500
53
+ },
54
+ {
55
+ "epoch": 0.18,
56
+ "eval_f1": 0.3652071944289921,
57
+ "eval_loss": 0.9700150489807129,
58
+ "eval_precision": 0.5591482310679367,
59
+ "eval_recall": 0.4176288227901131,
60
+ "eval_runtime": 5.8886,
61
+ "eval_samples_per_second": 149.101,
62
+ "eval_steps_per_second": 9.34,
63
+ "step": 1500
64
+ },
65
+ {
66
+ "epoch": 0.24,
67
+ "grad_norm": 13.717193603515625,
68
+ "learning_rate": 4.975998075998076e-05,
69
+ "loss": 0.5165,
70
+ "step": 2000
71
+ },
72
+ {
73
+ "epoch": 0.24,
74
+ "eval_f1": 0.5369027892847279,
75
+ "eval_loss": 0.744874894618988,
76
+ "eval_precision": 0.7349445049700448,
77
+ "eval_recall": 0.529664385793418,
78
+ "eval_runtime": 5.996,
79
+ "eval_samples_per_second": 146.43,
80
+ "eval_steps_per_second": 9.173,
81
+ "step": 2000
82
+ },
83
+ {
84
+ "epoch": 0.3,
85
+ "grad_norm": 2.4534995555877686,
86
+ "learning_rate": 4.969997594997595e-05,
87
+ "loss": 0.5136,
88
+ "step": 2500
89
+ },
90
+ {
91
+ "epoch": 0.3,
92
+ "eval_f1": 0.5001381202499963,
93
+ "eval_loss": 0.7884698510169983,
94
+ "eval_precision": 0.6766332095394413,
95
+ "eval_recall": 0.5025275799469348,
96
+ "eval_runtime": 5.9085,
97
+ "eval_samples_per_second": 148.6,
98
+ "eval_steps_per_second": 9.309,
99
+ "step": 2500
100
+ },
101
+ {
102
+ "epoch": 0.36,
103
+ "grad_norm": 3.195244550704956,
104
+ "learning_rate": 4.963985088985089e-05,
105
+ "loss": 0.5072,
106
+ "step": 3000
107
+ },
108
+ {
109
+ "epoch": 0.36,
110
+ "eval_f1": 0.5917137619940201,
111
+ "eval_loss": 0.8123684525489807,
112
+ "eval_precision": 0.6076358199852175,
113
+ "eval_recall": 0.6132374435600242,
114
+ "eval_runtime": 6.1108,
115
+ "eval_samples_per_second": 143.68,
116
+ "eval_steps_per_second": 9.0,
117
+ "step": 3000
118
+ },
119
+ {
120
+ "epoch": 0.42,
121
+ "grad_norm": 7.579603672027588,
122
+ "learning_rate": 4.957972582972583e-05,
123
+ "loss": 0.5011,
124
+ "step": 3500
125
+ },
126
+ {
127
+ "epoch": 0.42,
128
+ "eval_f1": 0.578405909718061,
129
+ "eval_loss": 0.8767459392547607,
130
+ "eval_precision": 0.642659899090607,
131
+ "eval_recall": 0.5987143322627193,
132
+ "eval_runtime": 6.1563,
133
+ "eval_samples_per_second": 142.618,
134
+ "eval_steps_per_second": 8.934,
135
+ "step": 3500
136
+ },
137
+ {
138
+ "epoch": 0.48,
139
+ "grad_norm": 3.266787052154541,
140
+ "learning_rate": 4.951960076960077e-05,
141
+ "loss": 0.5021,
142
+ "step": 4000
143
+ },
144
+ {
145
+ "epoch": 0.48,
146
+ "eval_f1": 0.6502990015105321,
147
+ "eval_loss": 0.7957776784896851,
148
+ "eval_precision": 0.6847923256926328,
149
+ "eval_recall": 0.636192338127822,
150
+ "eval_runtime": 6.5221,
151
+ "eval_samples_per_second": 134.618,
152
+ "eval_steps_per_second": 8.433,
153
+ "step": 4000
154
+ },
155
+ {
156
+ "epoch": 0.54,
157
+ "grad_norm": 6.044332027435303,
158
+ "learning_rate": 4.945959595959596e-05,
159
+ "loss": 0.4946,
160
+ "step": 4500
161
+ },
162
+ {
163
+ "epoch": 0.54,
164
+ "eval_f1": 0.4982912515017284,
165
+ "eval_loss": 0.8045271039009094,
166
+ "eval_precision": 0.7220405815528763,
167
+ "eval_recall": 0.4968300516687614,
168
+ "eval_runtime": 6.1928,
169
+ "eval_samples_per_second": 141.778,
170
+ "eval_steps_per_second": 8.881,
171
+ "step": 4500
172
+ },
173
+ {
174
+ "epoch": 0.6,
175
+ "grad_norm": 5.152063846588135,
176
+ "learning_rate": 4.93994708994709e-05,
177
+ "loss": 0.4928,
178
+ "step": 5000
179
+ },
180
+ {
181
+ "epoch": 0.6,
182
+ "eval_f1": 0.550273048506264,
183
+ "eval_loss": 0.780342698097229,
184
+ "eval_precision": 0.7581894624319455,
185
+ "eval_recall": 0.5380887213145278,
186
+ "eval_runtime": 6.123,
187
+ "eval_samples_per_second": 143.395,
188
+ "eval_steps_per_second": 8.983,
189
+ "step": 5000
190
+ },
191
+ {
192
+ "epoch": 0.66,
193
+ "grad_norm": 4.54200553894043,
194
+ "learning_rate": 4.933934583934584e-05,
195
+ "loss": 0.5008,
196
+ "step": 5500
197
+ },
198
+ {
199
+ "epoch": 0.66,
200
+ "eval_f1": 0.4594232264185665,
201
+ "eval_loss": 0.7507085204124451,
202
+ "eval_precision": 0.44070483572560937,
203
+ "eval_recall": 0.47984452823162504,
204
+ "eval_runtime": 5.932,
205
+ "eval_samples_per_second": 148.011,
206
+ "eval_steps_per_second": 9.272,
207
+ "step": 5500
208
+ },
209
+ {
210
+ "epoch": 0.72,
211
+ "grad_norm": 4.075632095336914,
212
+ "learning_rate": 4.927922077922078e-05,
213
+ "loss": 0.4966,
214
+ "step": 6000
215
+ },
216
+ {
217
+ "epoch": 0.72,
218
+ "eval_f1": 0.6310991936984806,
219
+ "eval_loss": 0.8238988518714905,
220
+ "eval_precision": 0.6139657275796522,
221
+ "eval_recall": 0.6767434715821813,
222
+ "eval_runtime": 5.8918,
223
+ "eval_samples_per_second": 149.02,
224
+ "eval_steps_per_second": 9.335,
225
+ "step": 6000
226
+ },
227
+ {
228
+ "epoch": 0.78,
229
+ "grad_norm": 4.8725104331970215,
230
+ "learning_rate": 4.921909571909572e-05,
231
+ "loss": 0.4791,
232
+ "step": 6500
233
+ },
234
+ {
235
+ "epoch": 0.78,
236
+ "eval_f1": 0.5412559573187593,
237
+ "eval_loss": 0.7028306722640991,
238
+ "eval_precision": 0.6567775474615866,
239
+ "eval_recall": 0.520631196760229,
240
+ "eval_runtime": 6.3113,
241
+ "eval_samples_per_second": 139.116,
242
+ "eval_steps_per_second": 8.715,
243
+ "step": 6500
244
+ },
245
+ {
246
+ "epoch": 0.84,
247
+ "grad_norm": 1.4915893077850342,
248
+ "learning_rate": 4.915897065897066e-05,
249
+ "loss": 0.494,
250
+ "step": 7000
251
+ },
252
+ {
253
+ "epoch": 0.84,
254
+ "eval_f1": 0.5227267406470947,
255
+ "eval_loss": 0.8033522367477417,
256
+ "eval_precision": 0.6660302960734323,
257
+ "eval_recall": 0.5188623562817111,
258
+ "eval_runtime": 6.1252,
259
+ "eval_samples_per_second": 143.342,
260
+ "eval_steps_per_second": 8.979,
261
+ "step": 7000
262
+ },
263
+ {
264
+ "epoch": 0.9,
265
+ "grad_norm": 2.151014804840088,
266
+ "learning_rate": 4.90988455988456e-05,
267
+ "loss": 0.4861,
268
+ "step": 7500
269
+ },
270
+ {
271
+ "epoch": 0.9,
272
+ "eval_f1": 0.4541201667750796,
273
+ "eval_loss": 0.900325357913971,
274
+ "eval_precision": 0.5780562441152168,
275
+ "eval_recall": 0.4784564539403249,
276
+ "eval_runtime": 6.144,
277
+ "eval_samples_per_second": 142.903,
278
+ "eval_steps_per_second": 8.952,
279
+ "step": 7500
280
+ },
281
+ {
282
+ "epoch": 0.96,
283
+ "grad_norm": 4.770496368408203,
284
+ "learning_rate": 4.903872053872054e-05,
285
+ "loss": 0.4804,
286
+ "step": 8000
287
+ },
288
+ {
289
+ "epoch": 0.96,
290
+ "eval_f1": 0.5791890202588422,
291
+ "eval_loss": 0.773960530757904,
292
+ "eval_precision": 0.6238945275403609,
293
+ "eval_recall": 0.5775003491132523,
294
+ "eval_runtime": 6.556,
295
+ "eval_samples_per_second": 133.923,
296
+ "eval_steps_per_second": 8.389,
297
+ "step": 8000
298
+ },
299
+ {
300
+ "epoch": 1.02,
301
+ "grad_norm": 2.520460367202759,
302
+ "learning_rate": 4.897859547859548e-05,
303
+ "loss": 0.4614,
304
+ "step": 8500
305
+ },
306
+ {
307
+ "epoch": 1.02,
308
+ "eval_f1": 0.6470888284841774,
309
+ "eval_loss": 0.7397181391716003,
310
+ "eval_precision": 0.6848151355984641,
311
+ "eval_recall": 0.6312358609132803,
312
+ "eval_runtime": 6.1813,
313
+ "eval_samples_per_second": 142.042,
314
+ "eval_steps_per_second": 8.898,
315
+ "step": 8500
316
+ },
317
+ {
318
+ "epoch": 1.08,
319
+ "grad_norm": 4.375688552856445,
320
+ "learning_rate": 4.891847041847042e-05,
321
+ "loss": 0.4315,
322
+ "step": 9000
323
+ },
324
+ {
325
+ "epoch": 1.08,
326
+ "eval_f1": 0.614857769662433,
327
+ "eval_loss": 0.788919985294342,
328
+ "eval_precision": 0.6641593406916259,
329
+ "eval_recall": 0.6034743750872783,
330
+ "eval_runtime": 6.1798,
331
+ "eval_samples_per_second": 142.076,
332
+ "eval_steps_per_second": 8.9,
333
+ "step": 9000
334
+ },
335
+ {
336
+ "epoch": 1.14,
337
+ "grad_norm": 4.091088771820068,
338
+ "learning_rate": 4.885834535834536e-05,
339
+ "loss": 0.4506,
340
+ "step": 9500
341
+ },
342
+ {
343
+ "epoch": 1.14,
344
+ "eval_f1": 0.4967964786589283,
345
+ "eval_loss": 0.8783875703811646,
346
+ "eval_precision": 0.6387377173091459,
347
+ "eval_recall": 0.5016645719871526,
348
+ "eval_runtime": 5.9164,
349
+ "eval_samples_per_second": 148.401,
350
+ "eval_steps_per_second": 9.296,
351
+ "step": 9500
352
+ },
353
+ {
354
+ "epoch": 1.2,
355
+ "grad_norm": 3.3903276920318604,
356
+ "learning_rate": 4.87982202982203e-05,
357
+ "loss": 0.4489,
358
+ "step": 10000
359
+ },
360
+ {
361
+ "epoch": 1.2,
362
+ "eval_f1": 0.4949153076705755,
363
+ "eval_loss": 0.7994188070297241,
364
+ "eval_precision": 0.5340329579250159,
365
+ "eval_recall": 0.49638597961178615,
366
+ "eval_runtime": 5.9029,
367
+ "eval_samples_per_second": 148.74,
368
+ "eval_steps_per_second": 9.317,
369
+ "step": 10000
370
+ },
371
+ {
372
+ "epoch": 1.26,
373
+ "grad_norm": 3.929879903793335,
374
+ "learning_rate": 4.8738095238095235e-05,
375
+ "loss": 0.4466,
376
+ "step": 10500
377
+ },
378
+ {
379
+ "epoch": 1.26,
380
+ "eval_f1": 0.44642812881455524,
381
+ "eval_loss": 0.8109920024871826,
382
+ "eval_precision": 0.5776119229607602,
383
+ "eval_recall": 0.47351207931853095,
384
+ "eval_runtime": 5.9766,
385
+ "eval_samples_per_second": 146.907,
386
+ "eval_steps_per_second": 9.203,
387
+ "step": 10500
388
+ },
389
+ {
390
+ "epoch": 1.32,
391
+ "grad_norm": 6.443171501159668,
392
+ "learning_rate": 4.8677970177970176e-05,
393
+ "loss": 0.4319,
394
+ "step": 11000
395
+ },
396
+ {
397
+ "epoch": 1.32,
398
+ "eval_f1": 0.5481427288492505,
399
+ "eval_loss": 0.8068605661392212,
400
+ "eval_precision": 0.6612496177619213,
401
+ "eval_recall": 0.5399497276916632,
402
+ "eval_runtime": 5.9001,
403
+ "eval_samples_per_second": 148.811,
404
+ "eval_steps_per_second": 9.322,
405
+ "step": 11000
406
+ },
407
+ {
408
+ "epoch": 1.38,
409
+ "grad_norm": 7.633645057678223,
410
+ "learning_rate": 4.8617845117845116e-05,
411
+ "loss": 0.4243,
412
+ "step": 11500
413
+ },
414
+ {
415
+ "epoch": 1.38,
416
+ "eval_f1": 0.5797306372413114,
417
+ "eval_loss": 0.7941620349884033,
418
+ "eval_precision": 0.5948358635007136,
419
+ "eval_recall": 0.5704752595075175,
420
+ "eval_runtime": 6.145,
421
+ "eval_samples_per_second": 142.881,
422
+ "eval_steps_per_second": 8.95,
423
+ "step": 11500
424
+ },
425
+ {
426
+ "epoch": 1.44,
427
+ "grad_norm": 3.275371789932251,
428
+ "learning_rate": 4.8557720057720056e-05,
429
+ "loss": 0.4398,
430
+ "step": 12000
431
+ },
432
+ {
433
+ "epoch": 1.44,
434
+ "eval_f1": 0.5247242844808815,
435
+ "eval_loss": 0.9738017916679382,
436
+ "eval_precision": 0.5370369073777802,
437
+ "eval_recall": 0.6070139179816599,
438
+ "eval_runtime": 6.219,
439
+ "eval_samples_per_second": 141.18,
440
+ "eval_steps_per_second": 8.844,
441
+ "step": 12000
442
+ },
443
+ {
444
+ "epoch": 1.5,
445
+ "grad_norm": 2.4162724018096924,
446
+ "learning_rate": 4.8497594997595e-05,
447
+ "loss": 0.4526,
448
+ "step": 12500
449
+ },
450
+ {
451
+ "epoch": 1.5,
452
+ "eval_f1": 0.5589742980399895,
453
+ "eval_loss": 0.7195601463317871,
454
+ "eval_precision": 0.7046240283838195,
455
+ "eval_recall": 0.5477959316668994,
456
+ "eval_runtime": 6.3918,
457
+ "eval_samples_per_second": 137.363,
458
+ "eval_steps_per_second": 8.605,
459
+ "step": 12500
460
+ },
461
+ {
462
+ "epoch": 1.56,
463
+ "grad_norm": 6.926381587982178,
464
+ "learning_rate": 4.8437469937469944e-05,
465
+ "loss": 0.4529,
466
+ "step": 13000
467
+ },
468
+ {
469
+ "epoch": 1.56,
470
+ "eval_f1": 0.5863097712686139,
471
+ "eval_loss": 0.8049713969230652,
472
+ "eval_precision": 0.6419448505612538,
473
+ "eval_recall": 0.5730605595121724,
474
+ "eval_runtime": 6.3636,
475
+ "eval_samples_per_second": 137.971,
476
+ "eval_steps_per_second": 8.643,
477
+ "step": 13000
478
+ },
479
+ {
480
+ "epoch": 1.62,
481
+ "grad_norm": 1.8420650959014893,
482
+ "learning_rate": 4.837746512746513e-05,
483
+ "loss": 0.446,
484
+ "step": 13500
485
+ },
486
+ {
487
+ "epoch": 1.62,
488
+ "eval_f1": 0.6107236144330398,
489
+ "eval_loss": 0.7564206719398499,
490
+ "eval_precision": 0.6520992658162544,
491
+ "eval_recall": 0.5912358609132803,
492
+ "eval_runtime": 6.4128,
493
+ "eval_samples_per_second": 136.914,
494
+ "eval_steps_per_second": 8.577,
495
+ "step": 13500
496
+ },
497
+ {
498
+ "epoch": 1.68,
499
+ "grad_norm": 2.423569679260254,
500
+ "learning_rate": 4.831746031746032e-05,
501
+ "loss": 0.4315,
502
+ "step": 14000
503
+ },
504
+ {
505
+ "epoch": 1.68,
506
+ "eval_f1": 0.621245910301715,
507
+ "eval_loss": 0.751511812210083,
508
+ "eval_precision": 0.6474767054531395,
509
+ "eval_recall": 0.6069198901456967,
510
+ "eval_runtime": 5.9833,
511
+ "eval_samples_per_second": 146.741,
512
+ "eval_steps_per_second": 9.192,
513
+ "step": 14000
514
+ },
515
+ {
516
+ "epoch": 1.74,
517
+ "grad_norm": 6.773381233215332,
518
+ "learning_rate": 4.825733525733526e-05,
519
+ "loss": 0.4464,
520
+ "step": 14500
521
+ },
522
+ {
523
+ "epoch": 1.74,
524
+ "eval_f1": 0.559868694735591,
525
+ "eval_loss": 0.8307517170906067,
526
+ "eval_precision": 0.627583612882644,
527
+ "eval_recall": 0.5512991667830377,
528
+ "eval_runtime": 6.1679,
529
+ "eval_samples_per_second": 142.35,
530
+ "eval_steps_per_second": 8.917,
531
+ "step": 14500
532
+ },
533
+ {
534
+ "epoch": 1.8,
535
+ "grad_norm": 6.220128059387207,
536
+ "learning_rate": 4.8197330447330455e-05,
537
+ "loss": 0.4423,
538
+ "step": 15000
539
+ },
540
+ {
541
+ "epoch": 1.8,
542
+ "eval_f1": 0.5991996711711277,
543
+ "eval_loss": 0.798150360584259,
544
+ "eval_precision": 0.6176196711770697,
545
+ "eval_recall": 0.5936535865568123,
546
+ "eval_runtime": 6.0738,
547
+ "eval_samples_per_second": 144.556,
548
+ "eval_steps_per_second": 9.055,
549
+ "step": 15000
550
+ },
551
+ {
552
+ "epoch": 1.86,
553
+ "grad_norm": 1.1065833568572998,
554
+ "learning_rate": 4.8137205387205395e-05,
555
+ "loss": 0.4551,
556
+ "step": 15500
557
+ },
558
+ {
559
+ "epoch": 1.86,
560
+ "eval_f1": 0.6019748538222912,
561
+ "eval_loss": 0.822293221950531,
562
+ "eval_precision": 0.6355921902599784,
563
+ "eval_recall": 0.5933528836754642,
564
+ "eval_runtime": 6.1197,
565
+ "eval_samples_per_second": 143.472,
566
+ "eval_steps_per_second": 8.987,
567
+ "step": 15500
568
+ },
569
+ {
570
+ "epoch": 1.92,
571
+ "grad_norm": 8.631648063659668,
572
+ "learning_rate": 4.807708032708033e-05,
573
+ "loss": 0.4408,
574
+ "step": 16000
575
+ },
576
+ {
577
+ "epoch": 1.92,
578
+ "eval_f1": 0.5131249172090748,
579
+ "eval_loss": 0.7691208124160767,
580
+ "eval_precision": 0.608759764068229,
581
+ "eval_recall": 0.5147484057161477,
582
+ "eval_runtime": 6.3609,
583
+ "eval_samples_per_second": 138.031,
584
+ "eval_steps_per_second": 8.647,
585
+ "step": 16000
586
+ },
587
+ {
588
+ "epoch": 1.98,
589
+ "grad_norm": 6.755849361419678,
590
+ "learning_rate": 4.801695526695527e-05,
591
+ "loss": 0.4389,
592
+ "step": 16500
593
+ },
594
+ {
595
+ "epoch": 1.98,
596
+ "eval_f1": 0.6702519892656928,
597
+ "eval_loss": 0.6971784234046936,
598
+ "eval_precision": 0.6686766810877821,
599
+ "eval_recall": 0.6729106735558349,
600
+ "eval_runtime": 6.1341,
601
+ "eval_samples_per_second": 143.134,
602
+ "eval_steps_per_second": 8.966,
603
+ "step": 16500
604
+ },
605
+ {
606
+ "epoch": 2.04,
607
+ "grad_norm": 19.813188552856445,
608
+ "learning_rate": 4.795683020683021e-05,
609
+ "loss": 0.3886,
610
+ "step": 17000
611
+ },
612
+ {
613
+ "epoch": 2.04,
614
+ "eval_f1": 0.5543489692487942,
615
+ "eval_loss": 0.7798230648040771,
616
+ "eval_precision": 0.6125764375980934,
617
+ "eval_recall": 0.543671740445934,
618
+ "eval_runtime": 6.7491,
619
+ "eval_samples_per_second": 130.09,
620
+ "eval_steps_per_second": 8.149,
621
+ "step": 17000
622
+ },
623
+ {
624
+ "epoch": 2.1,
625
+ "grad_norm": 7.927220821380615,
626
+ "learning_rate": 4.789670514670515e-05,
627
+ "loss": 0.3883,
628
+ "step": 17500
629
+ },
630
+ {
631
+ "epoch": 2.1,
632
+ "eval_f1": 0.5978449313058904,
633
+ "eval_loss": 0.8385018110275269,
634
+ "eval_precision": 0.5948463716988197,
635
+ "eval_recall": 0.6225499231950845,
636
+ "eval_runtime": 6.122,
637
+ "eval_samples_per_second": 143.416,
638
+ "eval_steps_per_second": 8.984,
639
+ "step": 17500
640
+ },
641
+ {
642
+ "epoch": 2.16,
643
+ "grad_norm": 6.237366199493408,
644
+ "learning_rate": 4.783658008658009e-05,
645
+ "loss": 0.4011,
646
+ "step": 18000
647
+ },
648
+ {
649
+ "epoch": 2.16,
650
+ "eval_f1": 0.5914931472808443,
651
+ "eval_loss": 0.7754688858985901,
652
+ "eval_precision": 0.655128213311837,
653
+ "eval_recall": 0.578716194200065,
654
+ "eval_runtime": 6.558,
655
+ "eval_samples_per_second": 133.882,
656
+ "eval_steps_per_second": 8.387,
657
+ "step": 18000
658
+ },
659
+ {
660
+ "epoch": 2.22,
661
+ "grad_norm": 3.3301048278808594,
662
+ "learning_rate": 4.777645502645503e-05,
663
+ "loss": 0.3992,
664
+ "step": 18500
665
+ },
666
+ {
667
+ "epoch": 2.22,
668
+ "eval_f1": 0.5472455226037474,
669
+ "eval_loss": 0.788632333278656,
670
+ "eval_precision": 0.558195855728615,
671
+ "eval_recall": 0.5519042964204254,
672
+ "eval_runtime": 6.124,
673
+ "eval_samples_per_second": 143.371,
674
+ "eval_steps_per_second": 8.981,
675
+ "step": 18500
676
+ },
677
+ {
678
+ "epoch": 2.28,
679
+ "grad_norm": 8.471348762512207,
680
+ "learning_rate": 4.771645021645022e-05,
681
+ "loss": 0.393,
682
+ "step": 19000
683
+ },
684
+ {
685
+ "epoch": 2.28,
686
+ "eval_f1": 0.5889012942356766,
687
+ "eval_loss": 0.7660124897956848,
688
+ "eval_precision": 0.5901145289176211,
689
+ "eval_recall": 0.592326956197924,
690
+ "eval_runtime": 5.8572,
691
+ "eval_samples_per_second": 149.9,
692
+ "eval_steps_per_second": 9.39,
693
+ "step": 19000
694
+ },
695
+ {
696
+ "epoch": 2.34,
697
+ "grad_norm": 15.840304374694824,
698
+ "learning_rate": 4.765632515632516e-05,
699
+ "loss": 0.3891,
700
+ "step": 19500
701
+ },
702
+ {
703
+ "epoch": 2.34,
704
+ "eval_f1": 0.5354251462409856,
705
+ "eval_loss": 0.7701670527458191,
706
+ "eval_precision": 0.579215207029406,
707
+ "eval_recall": 0.5330605595121725,
708
+ "eval_runtime": 6.1187,
709
+ "eval_samples_per_second": 143.495,
710
+ "eval_steps_per_second": 8.989,
711
+ "step": 19500
712
+ },
713
+ {
714
+ "epoch": 2.41,
715
+ "grad_norm": 1.6515294313430786,
716
+ "learning_rate": 4.75962000962001e-05,
717
+ "loss": 0.4119,
718
+ "step": 20000
719
+ },
720
+ {
721
+ "epoch": 2.41,
722
+ "eval_f1": 0.5110658029804255,
723
+ "eval_loss": 0.8545361161231995,
724
+ "eval_precision": 0.5405823804957771,
725
+ "eval_recall": 0.5243262114229856,
726
+ "eval_runtime": 6.2418,
727
+ "eval_samples_per_second": 140.665,
728
+ "eval_steps_per_second": 8.812,
729
+ "step": 20000
730
+ },
731
+ {
732
+ "epoch": 2.47,
733
+ "grad_norm": 3.166147470474243,
734
+ "learning_rate": 4.753607503607504e-05,
735
+ "loss": 0.3981,
736
+ "step": 20500
737
+ },
738
+ {
739
+ "epoch": 2.47,
740
+ "eval_f1": 0.53639943040752,
741
+ "eval_loss": 0.864085853099823,
742
+ "eval_precision": 0.5695344700259635,
743
+ "eval_recall": 0.5536247265279522,
744
+ "eval_runtime": 5.9635,
745
+ "eval_samples_per_second": 147.229,
746
+ "eval_steps_per_second": 9.223,
747
+ "step": 20500
748
+ },
749
+ {
750
+ "epoch": 2.53,
751
+ "grad_norm": 4.143538475036621,
752
+ "learning_rate": 4.747594997594998e-05,
753
+ "loss": 0.4,
754
+ "step": 21000
755
+ },
756
+ {
757
+ "epoch": 2.53,
758
+ "eval_f1": 0.582186065915728,
759
+ "eval_loss": 0.8044998049736023,
760
+ "eval_precision": 0.5987904356270873,
761
+ "eval_recall": 0.5844826141600334,
762
+ "eval_runtime": 5.9156,
763
+ "eval_samples_per_second": 148.422,
764
+ "eval_steps_per_second": 9.298,
765
+ "step": 21000
766
+ },
767
+ {
768
+ "epoch": 2.59,
769
+ "grad_norm": 5.849362850189209,
770
+ "learning_rate": 4.741582491582492e-05,
771
+ "loss": 0.4059,
772
+ "step": 21500
773
+ },
774
+ {
775
+ "epoch": 2.59,
776
+ "eval_f1": 0.569600279809319,
777
+ "eval_loss": 0.8023470044136047,
778
+ "eval_precision": 0.6300909361955873,
779
+ "eval_recall": 0.5548880510170833,
780
+ "eval_runtime": 5.9073,
781
+ "eval_samples_per_second": 148.629,
782
+ "eval_steps_per_second": 9.31,
783
+ "step": 21500
784
+ },
785
+ {
786
+ "epoch": 2.65,
787
+ "grad_norm": 2.0296847820281982,
788
+ "learning_rate": 4.735582010582011e-05,
789
+ "loss": 0.3805,
790
+ "step": 22000
791
+ },
792
+ {
793
+ "epoch": 2.65,
794
+ "eval_f1": 0.5387095557628462,
795
+ "eval_loss": 0.8242425322532654,
796
+ "eval_precision": 0.5632921859195318,
797
+ "eval_recall": 0.536337569240795,
798
+ "eval_runtime": 6.1681,
799
+ "eval_samples_per_second": 142.345,
800
+ "eval_steps_per_second": 8.917,
801
+ "step": 22000
802
+ },
803
+ {
804
+ "epoch": 2.71,
805
+ "grad_norm": 5.022754192352295,
806
+ "learning_rate": 4.729569504569505e-05,
807
+ "loss": 0.4126,
808
+ "step": 22500
809
+ },
810
+ {
811
+ "epoch": 2.71,
812
+ "eval_f1": 0.525337187977395,
813
+ "eval_loss": 0.8866151571273804,
814
+ "eval_precision": 0.563019122327633,
815
+ "eval_recall": 0.5244211702276219,
816
+ "eval_runtime": 6.5791,
817
+ "eval_samples_per_second": 133.453,
818
+ "eval_steps_per_second": 8.36,
819
+ "step": 22500
820
+ },
821
+ {
822
+ "epoch": 2.77,
823
+ "grad_norm": 6.320919990539551,
824
+ "learning_rate": 4.7235690235690236e-05,
825
+ "loss": 0.3959,
826
+ "step": 23000
827
+ },
828
+ {
829
+ "epoch": 2.77,
830
+ "eval_f1": 0.5715827904573106,
831
+ "eval_loss": 0.922848641872406,
832
+ "eval_precision": 0.6485667793604627,
833
+ "eval_recall": 0.5569566634082763,
834
+ "eval_runtime": 6.5486,
835
+ "eval_samples_per_second": 134.075,
836
+ "eval_steps_per_second": 8.399,
837
+ "step": 23000
838
+ },
839
+ {
840
+ "epoch": 2.83,
841
+ "grad_norm": 3.2674639225006104,
842
+ "learning_rate": 4.717556517556518e-05,
843
+ "loss": 0.3972,
844
+ "step": 23500
845
+ },
846
+ {
847
+ "epoch": 2.83,
848
+ "eval_f1": 0.6330230633421515,
849
+ "eval_loss": 0.8297170400619507,
850
+ "eval_precision": 0.64149542011954,
851
+ "eval_recall": 0.633559558720849,
852
+ "eval_runtime": 6.1502,
853
+ "eval_samples_per_second": 142.759,
854
+ "eval_steps_per_second": 8.943,
855
+ "step": 23500
856
+ },
857
+ {
858
+ "epoch": 2.89,
859
+ "grad_norm": 5.248292922973633,
860
+ "learning_rate": 4.711544011544012e-05,
861
+ "loss": 0.3779,
862
+ "step": 24000
863
+ },
864
+ {
865
+ "epoch": 2.89,
866
+ "eval_f1": 0.5897470753706388,
867
+ "eval_loss": 0.8682935833930969,
868
+ "eval_precision": 0.6023327508623889,
869
+ "eval_recall": 0.5919508448540706,
870
+ "eval_runtime": 6.3839,
871
+ "eval_samples_per_second": 137.534,
872
+ "eval_steps_per_second": 8.615,
873
+ "step": 24000
874
+ },
875
+ {
876
+ "epoch": 2.95,
877
+ "grad_norm": 4.1834635734558105,
878
+ "learning_rate": 4.705531505531506e-05,
879
+ "loss": 0.3951,
880
+ "step": 24500
881
+ },
882
+ {
883
+ "epoch": 2.95,
884
+ "eval_f1": 0.5124969418380673,
885
+ "eval_loss": 0.8628427982330322,
886
+ "eval_precision": 0.5891878367677518,
887
+ "eval_recall": 0.5116492110040497,
888
+ "eval_runtime": 6.1272,
889
+ "eval_samples_per_second": 143.295,
890
+ "eval_steps_per_second": 8.976,
891
+ "step": 24500
892
+ },
893
+ {
894
+ "epoch": 3.01,
895
+ "grad_norm": 12.86809253692627,
896
+ "learning_rate": 4.699518999519e-05,
897
+ "loss": 0.3916,
898
+ "step": 25000
899
+ },
900
+ {
901
+ "epoch": 3.01,
902
+ "eval_f1": 0.5024144172335627,
903
+ "eval_loss": 0.9203388094902039,
904
+ "eval_precision": 0.6304846593419121,
905
+ "eval_recall": 0.5026001955034213,
906
+ "eval_runtime": 6.0613,
907
+ "eval_samples_per_second": 144.854,
908
+ "eval_steps_per_second": 9.074,
909
+ "step": 25000
910
+ },
911
+ {
912
+ "epoch": 3.07,
913
+ "grad_norm": 3.2101404666900635,
914
+ "learning_rate": 4.693506493506494e-05,
915
+ "loss": 0.3524,
916
+ "step": 25500
917
+ },
918
+ {
919
+ "epoch": 3.07,
920
+ "eval_f1": 0.5010573535401949,
921
+ "eval_loss": 0.9825400710105896,
922
+ "eval_precision": 0.6088672873311428,
923
+ "eval_recall": 0.5039249639249639,
924
+ "eval_runtime": 5.9279,
925
+ "eval_samples_per_second": 148.113,
926
+ "eval_steps_per_second": 9.278,
927
+ "step": 25500
928
+ },
929
+ {
930
+ "epoch": 3.13,
931
+ "grad_norm": 16.025983810424805,
932
+ "learning_rate": 4.687493987493988e-05,
933
+ "loss": 0.3332,
934
+ "step": 26000
935
+ },
936
+ {
937
+ "epoch": 3.13,
938
+ "eval_f1": 0.5814110917677252,
939
+ "eval_loss": 0.8755331635475159,
940
+ "eval_precision": 0.5979503457905185,
941
+ "eval_recall": 0.5711502117953731,
942
+ "eval_runtime": 6.5321,
943
+ "eval_samples_per_second": 134.413,
944
+ "eval_steps_per_second": 8.42,
945
+ "step": 26000
946
+ },
947
+ {
948
+ "epoch": 3.19,
949
+ "grad_norm": 12.575716972351074,
950
+ "learning_rate": 4.681481481481482e-05,
951
+ "loss": 0.3517,
952
+ "step": 26500
953
+ },
954
+ {
955
+ "epoch": 3.19,
956
+ "eval_f1": 0.6181463909269773,
957
+ "eval_loss": 0.9921577572822571,
958
+ "eval_precision": 0.6701390442386371,
959
+ "eval_recall": 0.5940511101801424,
960
+ "eval_runtime": 6.2002,
961
+ "eval_samples_per_second": 141.609,
962
+ "eval_steps_per_second": 8.871,
963
+ "step": 26500
964
+ },
965
+ {
966
+ "epoch": 3.25,
967
+ "grad_norm": 2.219468355178833,
968
+ "learning_rate": 4.675468975468976e-05,
969
+ "loss": 0.3534,
970
+ "step": 27000
971
+ },
972
+ {
973
+ "epoch": 3.25,
974
+ "eval_f1": 0.5242620258087817,
975
+ "eval_loss": 0.9572548866271973,
976
+ "eval_precision": 0.5652503976549385,
977
+ "eval_recall": 0.5174640413350091,
978
+ "eval_runtime": 6.4041,
979
+ "eval_samples_per_second": 137.101,
980
+ "eval_steps_per_second": 8.588,
981
+ "step": 27000
982
+ },
983
+ {
984
+ "epoch": 3.31,
985
+ "grad_norm": 2.1716973781585693,
986
+ "learning_rate": 4.6694684944684945e-05,
987
+ "loss": 0.3544,
988
+ "step": 27500
989
+ },
990
+ {
991
+ "epoch": 3.31,
992
+ "eval_f1": 0.5551290620723939,
993
+ "eval_loss": 0.9826774001121521,
994
+ "eval_precision": 0.5738657811880764,
995
+ "eval_recall": 0.5531322440999861,
996
+ "eval_runtime": 5.8897,
997
+ "eval_samples_per_second": 149.075,
998
+ "eval_steps_per_second": 9.338,
999
+ "step": 27500
1000
+ },
1001
+ {
1002
+ "epoch": 3.37,
1003
+ "grad_norm": 5.642761707305908,
1004
+ "learning_rate": 4.6634559884559885e-05,
1005
+ "loss": 0.3526,
1006
+ "step": 28000
1007
+ },
1008
+ {
1009
+ "epoch": 3.37,
1010
+ "eval_f1": 0.46574966897620484,
1011
+ "eval_loss": 0.9517427682876587,
1012
+ "eval_precision": 0.6019158514451703,
1013
+ "eval_recall": 0.4737364427687008,
1014
+ "eval_runtime": 6.2232,
1015
+ "eval_samples_per_second": 141.086,
1016
+ "eval_steps_per_second": 8.838,
1017
+ "step": 28000
1018
+ },
1019
+ {
1020
+ "epoch": 3.43,
1021
+ "grad_norm": 8.693815231323242,
1022
+ "learning_rate": 4.6574434824434825e-05,
1023
+ "loss": 0.3448,
1024
+ "step": 28500
1025
+ },
1026
+ {
1027
+ "epoch": 3.43,
1028
+ "eval_f1": 0.5231658522131929,
1029
+ "eval_loss": 0.955856204032898,
1030
+ "eval_precision": 0.5743577178625582,
1031
+ "eval_recall": 0.5138062654191686,
1032
+ "eval_runtime": 6.2254,
1033
+ "eval_samples_per_second": 141.036,
1034
+ "eval_steps_per_second": 8.835,
1035
+ "step": 28500
1036
+ },
1037
+ {
1038
+ "epoch": 3.49,
1039
+ "grad_norm": 10.058433532714844,
1040
+ "learning_rate": 4.6514309764309766e-05,
1041
+ "loss": 0.3662,
1042
+ "step": 29000
1043
+ },
1044
+ {
1045
+ "epoch": 3.49,
1046
+ "eval_f1": 0.6173176500366803,
1047
+ "eval_loss": 0.8469758033752441,
1048
+ "eval_precision": 0.6416565078769693,
1049
+ "eval_recall": 0.6176418563515337,
1050
+ "eval_runtime": 6.1339,
1051
+ "eval_samples_per_second": 143.14,
1052
+ "eval_steps_per_second": 8.967,
1053
+ "step": 29000
1054
+ },
1055
+ {
1056
+ "epoch": 3.55,
1057
+ "grad_norm": 9.207432746887207,
1058
+ "learning_rate": 4.645466570466571e-05,
1059
+ "loss": 0.3502,
1060
+ "step": 29500
1061
+ },
1062
+ {
1063
+ "epoch": 3.55,
1064
+ "eval_f1": 0.5911826792863208,
1065
+ "eval_loss": 0.8524171113967896,
1066
+ "eval_precision": 0.6606129937002267,
1067
+ "eval_recall": 0.577619513103384,
1068
+ "eval_runtime": 5.9367,
1069
+ "eval_samples_per_second": 147.893,
1070
+ "eval_steps_per_second": 9.264,
1071
+ "step": 29500
1072
+ },
1073
+ {
1074
+ "epoch": 3.61,
1075
+ "grad_norm": 2.538233757019043,
1076
+ "learning_rate": 4.639454064454065e-05,
1077
+ "loss": 0.3733,
1078
+ "step": 30000
1079
+ },
1080
+ {
1081
+ "epoch": 3.61,
1082
+ "eval_f1": 0.5466184654496565,
1083
+ "eval_loss": 0.9210164546966553,
1084
+ "eval_precision": 0.5577658998711631,
1085
+ "eval_recall": 0.5554857329050877,
1086
+ "eval_runtime": 6.4254,
1087
+ "eval_samples_per_second": 136.645,
1088
+ "eval_steps_per_second": 8.56,
1089
+ "step": 30000
1090
+ },
1091
+ {
1092
+ "epoch": 3.67,
1093
+ "grad_norm": 2.017235279083252,
1094
+ "learning_rate": 4.633441558441559e-05,
1095
+ "loss": 0.3424,
1096
+ "step": 30500
1097
+ },
1098
+ {
1099
+ "epoch": 3.67,
1100
+ "eval_f1": 0.5809192439862544,
1101
+ "eval_loss": 0.9294881820678711,
1102
+ "eval_precision": 0.5863171312403235,
1103
+ "eval_recall": 0.6100302564818694,
1104
+ "eval_runtime": 6.2949,
1105
+ "eval_samples_per_second": 139.477,
1106
+ "eval_steps_per_second": 8.737,
1107
+ "step": 30500
1108
+ },
1109
+ {
1110
+ "epoch": 3.73,
1111
+ "grad_norm": 7.538774490356445,
1112
+ "learning_rate": 4.627429052429053e-05,
1113
+ "loss": 0.3591,
1114
+ "step": 31000
1115
+ },
1116
+ {
1117
+ "epoch": 3.73,
1118
+ "eval_f1": 0.4588251776601326,
1119
+ "eval_loss": 0.970705509185791,
1120
+ "eval_precision": 0.5827537007312288,
1121
+ "eval_recall": 0.4768803239770982,
1122
+ "eval_runtime": 6.0168,
1123
+ "eval_samples_per_second": 145.925,
1124
+ "eval_steps_per_second": 9.141,
1125
+ "step": 31000
1126
+ }
1127
+ ],
1128
+ "logging_steps": 500,
1129
+ "max_steps": 415800,
1130
+ "num_input_tokens_seen": 0,
1131
+ "num_train_epochs": 50,
1132
+ "save_steps": 500,
1133
+ "total_flos": 1.3049636179055616e+17,
1134
+ "train_batch_size": 16,
1135
+ "trial_name": null,
1136
+ "trial_params": null
1137
+ }
checkpoint-31000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06f5c815d879e81b472ce52bc0bc6843f93b2e6b15b32a07dfe75c40e595456a
3
+ size 4920
checkpoint-31500/config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual",
3
+ "architectures": [
4
+ "XLMRobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "gradient_checkpointing": false,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "id2label": {
15
+ "0": "negative",
16
+ "1": "neutral",
17
+ "2": "positive"
18
+ },
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 3072,
21
+ "label2id": {
22
+ "negative": 0,
23
+ "neutral": 1,
24
+ "positive": 2
25
+ },
26
+ "layer_norm_eps": 1e-05,
27
+ "max_position_embeddings": 514,
28
+ "model_type": "xlm-roberta",
29
+ "num_attention_heads": 12,
30
+ "num_hidden_layers": 12,
31
+ "output_past": true,
32
+ "pad_token_id": 1,
33
+ "position_embedding_type": "absolute",
34
+ "problem_type": "single_label_classification",
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.38.2",
37
+ "type_vocab_size": 1,
38
+ "use_cache": true,
39
+ "vocab_size": 250002
40
+ }
checkpoint-31500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a32b2e951e360766c09f089885b1a24baabd4b1270271c1254453d23b00cbe47
3
+ size 1112208084
checkpoint-31500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:718b6005e6950e21342847c3ee4c6afac2660b9edcfc21f7ce2365e04c699cdd
3
+ size 2308666
checkpoint-31500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34ec384dd27ca1285010b435199fe96924a8cacaad0bf01bd47a0b5dbc447610
3
+ size 14244
checkpoint-31500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6409693cc35a237a60aaed423aa394a034d367e998aa6eac4e0ae4401d5b90d2
3
+ size 1064
checkpoint-31500/trainer_state.json ADDED
@@ -0,0 +1,1155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6971784234046936,
3
+ "best_model_checkpoint": "./cardiffnlp-twitter-xlmr-finetuned-txtnly-all-42/checkpoint-16500",
4
+ "epoch": 3.787878787878788,
5
+ "eval_steps": 500,
6
+ "global_step": 31500,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.06,
13
+ "grad_norm": 3.298647880554199,
14
+ "learning_rate": 4.994023569023569e-05,
15
+ "loss": 0.6122,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.06,
20
+ "eval_f1": 0.4840638597456899,
21
+ "eval_loss": 0.854165256023407,
22
+ "eval_precision": 0.6558887250350466,
23
+ "eval_recall": 0.49045198529069495,
24
+ "eval_runtime": 5.9285,
25
+ "eval_samples_per_second": 148.099,
26
+ "eval_steps_per_second": 9.277,
27
+ "step": 500
28
+ },
29
+ {
30
+ "epoch": 0.12,
31
+ "grad_norm": 5.411099433898926,
32
+ "learning_rate": 4.988011063011063e-05,
33
+ "loss": 0.5497,
34
+ "step": 1000
35
+ },
36
+ {
37
+ "epoch": 0.12,
38
+ "eval_f1": 0.6209225023342669,
39
+ "eval_loss": 0.8037390112876892,
40
+ "eval_precision": 0.704421745545341,
41
+ "eval_recall": 0.6070083321696225,
42
+ "eval_runtime": 6.1691,
43
+ "eval_samples_per_second": 142.322,
44
+ "eval_steps_per_second": 8.915,
45
+ "step": 1000
46
+ },
47
+ {
48
+ "epoch": 0.18,
49
+ "grad_norm": 5.836483001708984,
50
+ "learning_rate": 4.9820105820105825e-05,
51
+ "loss": 0.5404,
52
+ "step": 1500
53
+ },
54
+ {
55
+ "epoch": 0.18,
56
+ "eval_f1": 0.3652071944289921,
57
+ "eval_loss": 0.9700150489807129,
58
+ "eval_precision": 0.5591482310679367,
59
+ "eval_recall": 0.4176288227901131,
60
+ "eval_runtime": 5.8886,
61
+ "eval_samples_per_second": 149.101,
62
+ "eval_steps_per_second": 9.34,
63
+ "step": 1500
64
+ },
65
+ {
66
+ "epoch": 0.24,
67
+ "grad_norm": 13.717193603515625,
68
+ "learning_rate": 4.975998075998076e-05,
69
+ "loss": 0.5165,
70
+ "step": 2000
71
+ },
72
+ {
73
+ "epoch": 0.24,
74
+ "eval_f1": 0.5369027892847279,
75
+ "eval_loss": 0.744874894618988,
76
+ "eval_precision": 0.7349445049700448,
77
+ "eval_recall": 0.529664385793418,
78
+ "eval_runtime": 5.996,
79
+ "eval_samples_per_second": 146.43,
80
+ "eval_steps_per_second": 9.173,
81
+ "step": 2000
82
+ },
83
+ {
84
+ "epoch": 0.3,
85
+ "grad_norm": 2.4534995555877686,
86
+ "learning_rate": 4.969997594997595e-05,
87
+ "loss": 0.5136,
88
+ "step": 2500
89
+ },
90
+ {
91
+ "epoch": 0.3,
92
+ "eval_f1": 0.5001381202499963,
93
+ "eval_loss": 0.7884698510169983,
94
+ "eval_precision": 0.6766332095394413,
95
+ "eval_recall": 0.5025275799469348,
96
+ "eval_runtime": 5.9085,
97
+ "eval_samples_per_second": 148.6,
98
+ "eval_steps_per_second": 9.309,
99
+ "step": 2500
100
+ },
101
+ {
102
+ "epoch": 0.36,
103
+ "grad_norm": 3.195244550704956,
104
+ "learning_rate": 4.963985088985089e-05,
105
+ "loss": 0.5072,
106
+ "step": 3000
107
+ },
108
+ {
109
+ "epoch": 0.36,
110
+ "eval_f1": 0.5917137619940201,
111
+ "eval_loss": 0.8123684525489807,
112
+ "eval_precision": 0.6076358199852175,
113
+ "eval_recall": 0.6132374435600242,
114
+ "eval_runtime": 6.1108,
115
+ "eval_samples_per_second": 143.68,
116
+ "eval_steps_per_second": 9.0,
117
+ "step": 3000
118
+ },
119
+ {
120
+ "epoch": 0.42,
121
+ "grad_norm": 7.579603672027588,
122
+ "learning_rate": 4.957972582972583e-05,
123
+ "loss": 0.5011,
124
+ "step": 3500
125
+ },
126
+ {
127
+ "epoch": 0.42,
128
+ "eval_f1": 0.578405909718061,
129
+ "eval_loss": 0.8767459392547607,
130
+ "eval_precision": 0.642659899090607,
131
+ "eval_recall": 0.5987143322627193,
132
+ "eval_runtime": 6.1563,
133
+ "eval_samples_per_second": 142.618,
134
+ "eval_steps_per_second": 8.934,
135
+ "step": 3500
136
+ },
137
+ {
138
+ "epoch": 0.48,
139
+ "grad_norm": 3.266787052154541,
140
+ "learning_rate": 4.951960076960077e-05,
141
+ "loss": 0.5021,
142
+ "step": 4000
143
+ },
144
+ {
145
+ "epoch": 0.48,
146
+ "eval_f1": 0.6502990015105321,
147
+ "eval_loss": 0.7957776784896851,
148
+ "eval_precision": 0.6847923256926328,
149
+ "eval_recall": 0.636192338127822,
150
+ "eval_runtime": 6.5221,
151
+ "eval_samples_per_second": 134.618,
152
+ "eval_steps_per_second": 8.433,
153
+ "step": 4000
154
+ },
155
+ {
156
+ "epoch": 0.54,
157
+ "grad_norm": 6.044332027435303,
158
+ "learning_rate": 4.945959595959596e-05,
159
+ "loss": 0.4946,
160
+ "step": 4500
161
+ },
162
+ {
163
+ "epoch": 0.54,
164
+ "eval_f1": 0.4982912515017284,
165
+ "eval_loss": 0.8045271039009094,
166
+ "eval_precision": 0.7220405815528763,
167
+ "eval_recall": 0.4968300516687614,
168
+ "eval_runtime": 6.1928,
169
+ "eval_samples_per_second": 141.778,
170
+ "eval_steps_per_second": 8.881,
171
+ "step": 4500
172
+ },
173
+ {
174
+ "epoch": 0.6,
175
+ "grad_norm": 5.152063846588135,
176
+ "learning_rate": 4.93994708994709e-05,
177
+ "loss": 0.4928,
178
+ "step": 5000
179
+ },
180
+ {
181
+ "epoch": 0.6,
182
+ "eval_f1": 0.550273048506264,
183
+ "eval_loss": 0.780342698097229,
184
+ "eval_precision": 0.7581894624319455,
185
+ "eval_recall": 0.5380887213145278,
186
+ "eval_runtime": 6.123,
187
+ "eval_samples_per_second": 143.395,
188
+ "eval_steps_per_second": 8.983,
189
+ "step": 5000
190
+ },
191
+ {
192
+ "epoch": 0.66,
193
+ "grad_norm": 4.54200553894043,
194
+ "learning_rate": 4.933934583934584e-05,
195
+ "loss": 0.5008,
196
+ "step": 5500
197
+ },
198
+ {
199
+ "epoch": 0.66,
200
+ "eval_f1": 0.4594232264185665,
201
+ "eval_loss": 0.7507085204124451,
202
+ "eval_precision": 0.44070483572560937,
203
+ "eval_recall": 0.47984452823162504,
204
+ "eval_runtime": 5.932,
205
+ "eval_samples_per_second": 148.011,
206
+ "eval_steps_per_second": 9.272,
207
+ "step": 5500
208
+ },
209
+ {
210
+ "epoch": 0.72,
211
+ "grad_norm": 4.075632095336914,
212
+ "learning_rate": 4.927922077922078e-05,
213
+ "loss": 0.4966,
214
+ "step": 6000
215
+ },
216
+ {
217
+ "epoch": 0.72,
218
+ "eval_f1": 0.6310991936984806,
219
+ "eval_loss": 0.8238988518714905,
220
+ "eval_precision": 0.6139657275796522,
221
+ "eval_recall": 0.6767434715821813,
222
+ "eval_runtime": 5.8918,
223
+ "eval_samples_per_second": 149.02,
224
+ "eval_steps_per_second": 9.335,
225
+ "step": 6000
226
+ },
227
+ {
228
+ "epoch": 0.78,
229
+ "grad_norm": 4.8725104331970215,
230
+ "learning_rate": 4.921909571909572e-05,
231
+ "loss": 0.4791,
232
+ "step": 6500
233
+ },
234
+ {
235
+ "epoch": 0.78,
236
+ "eval_f1": 0.5412559573187593,
237
+ "eval_loss": 0.7028306722640991,
238
+ "eval_precision": 0.6567775474615866,
239
+ "eval_recall": 0.520631196760229,
240
+ "eval_runtime": 6.3113,
241
+ "eval_samples_per_second": 139.116,
242
+ "eval_steps_per_second": 8.715,
243
+ "step": 6500
244
+ },
245
+ {
246
+ "epoch": 0.84,
247
+ "grad_norm": 1.4915893077850342,
248
+ "learning_rate": 4.915897065897066e-05,
249
+ "loss": 0.494,
250
+ "step": 7000
251
+ },
252
+ {
253
+ "epoch": 0.84,
254
+ "eval_f1": 0.5227267406470947,
255
+ "eval_loss": 0.8033522367477417,
256
+ "eval_precision": 0.6660302960734323,
257
+ "eval_recall": 0.5188623562817111,
258
+ "eval_runtime": 6.1252,
259
+ "eval_samples_per_second": 143.342,
260
+ "eval_steps_per_second": 8.979,
261
+ "step": 7000
262
+ },
263
+ {
264
+ "epoch": 0.9,
265
+ "grad_norm": 2.151014804840088,
266
+ "learning_rate": 4.90988455988456e-05,
267
+ "loss": 0.4861,
268
+ "step": 7500
269
+ },
270
+ {
271
+ "epoch": 0.9,
272
+ "eval_f1": 0.4541201667750796,
273
+ "eval_loss": 0.900325357913971,
274
+ "eval_precision": 0.5780562441152168,
275
+ "eval_recall": 0.4784564539403249,
276
+ "eval_runtime": 6.144,
277
+ "eval_samples_per_second": 142.903,
278
+ "eval_steps_per_second": 8.952,
279
+ "step": 7500
280
+ },
281
+ {
282
+ "epoch": 0.96,
283
+ "grad_norm": 4.770496368408203,
284
+ "learning_rate": 4.903872053872054e-05,
285
+ "loss": 0.4804,
286
+ "step": 8000
287
+ },
288
+ {
289
+ "epoch": 0.96,
290
+ "eval_f1": 0.5791890202588422,
291
+ "eval_loss": 0.773960530757904,
292
+ "eval_precision": 0.6238945275403609,
293
+ "eval_recall": 0.5775003491132523,
294
+ "eval_runtime": 6.556,
295
+ "eval_samples_per_second": 133.923,
296
+ "eval_steps_per_second": 8.389,
297
+ "step": 8000
298
+ },
299
+ {
300
+ "epoch": 1.02,
301
+ "grad_norm": 2.520460367202759,
302
+ "learning_rate": 4.897859547859548e-05,
303
+ "loss": 0.4614,
304
+ "step": 8500
305
+ },
306
+ {
307
+ "epoch": 1.02,
308
+ "eval_f1": 0.6470888284841774,
309
+ "eval_loss": 0.7397181391716003,
310
+ "eval_precision": 0.6848151355984641,
311
+ "eval_recall": 0.6312358609132803,
312
+ "eval_runtime": 6.1813,
313
+ "eval_samples_per_second": 142.042,
314
+ "eval_steps_per_second": 8.898,
315
+ "step": 8500
316
+ },
317
+ {
318
+ "epoch": 1.08,
319
+ "grad_norm": 4.375688552856445,
320
+ "learning_rate": 4.891847041847042e-05,
321
+ "loss": 0.4315,
322
+ "step": 9000
323
+ },
324
+ {
325
+ "epoch": 1.08,
326
+ "eval_f1": 0.614857769662433,
327
+ "eval_loss": 0.788919985294342,
328
+ "eval_precision": 0.6641593406916259,
329
+ "eval_recall": 0.6034743750872783,
330
+ "eval_runtime": 6.1798,
331
+ "eval_samples_per_second": 142.076,
332
+ "eval_steps_per_second": 8.9,
333
+ "step": 9000
334
+ },
335
+ {
336
+ "epoch": 1.14,
337
+ "grad_norm": 4.091088771820068,
338
+ "learning_rate": 4.885834535834536e-05,
339
+ "loss": 0.4506,
340
+ "step": 9500
341
+ },
342
+ {
343
+ "epoch": 1.14,
344
+ "eval_f1": 0.4967964786589283,
345
+ "eval_loss": 0.8783875703811646,
346
+ "eval_precision": 0.6387377173091459,
347
+ "eval_recall": 0.5016645719871526,
348
+ "eval_runtime": 5.9164,
349
+ "eval_samples_per_second": 148.401,
350
+ "eval_steps_per_second": 9.296,
351
+ "step": 9500
352
+ },
353
+ {
354
+ "epoch": 1.2,
355
+ "grad_norm": 3.3903276920318604,
356
+ "learning_rate": 4.87982202982203e-05,
357
+ "loss": 0.4489,
358
+ "step": 10000
359
+ },
360
+ {
361
+ "epoch": 1.2,
362
+ "eval_f1": 0.4949153076705755,
363
+ "eval_loss": 0.7994188070297241,
364
+ "eval_precision": 0.5340329579250159,
365
+ "eval_recall": 0.49638597961178615,
366
+ "eval_runtime": 5.9029,
367
+ "eval_samples_per_second": 148.74,
368
+ "eval_steps_per_second": 9.317,
369
+ "step": 10000
370
+ },
371
+ {
372
+ "epoch": 1.26,
373
+ "grad_norm": 3.929879903793335,
374
+ "learning_rate": 4.8738095238095235e-05,
375
+ "loss": 0.4466,
376
+ "step": 10500
377
+ },
378
+ {
379
+ "epoch": 1.26,
380
+ "eval_f1": 0.44642812881455524,
381
+ "eval_loss": 0.8109920024871826,
382
+ "eval_precision": 0.5776119229607602,
383
+ "eval_recall": 0.47351207931853095,
384
+ "eval_runtime": 5.9766,
385
+ "eval_samples_per_second": 146.907,
386
+ "eval_steps_per_second": 9.203,
387
+ "step": 10500
388
+ },
389
+ {
390
+ "epoch": 1.32,
391
+ "grad_norm": 6.443171501159668,
392
+ "learning_rate": 4.8677970177970176e-05,
393
+ "loss": 0.4319,
394
+ "step": 11000
395
+ },
396
+ {
397
+ "epoch": 1.32,
398
+ "eval_f1": 0.5481427288492505,
399
+ "eval_loss": 0.8068605661392212,
400
+ "eval_precision": 0.6612496177619213,
401
+ "eval_recall": 0.5399497276916632,
402
+ "eval_runtime": 5.9001,
403
+ "eval_samples_per_second": 148.811,
404
+ "eval_steps_per_second": 9.322,
405
+ "step": 11000
406
+ },
407
+ {
408
+ "epoch": 1.38,
409
+ "grad_norm": 7.633645057678223,
410
+ "learning_rate": 4.8617845117845116e-05,
411
+ "loss": 0.4243,
412
+ "step": 11500
413
+ },
414
+ {
415
+ "epoch": 1.38,
416
+ "eval_f1": 0.5797306372413114,
417
+ "eval_loss": 0.7941620349884033,
418
+ "eval_precision": 0.5948358635007136,
419
+ "eval_recall": 0.5704752595075175,
420
+ "eval_runtime": 6.145,
421
+ "eval_samples_per_second": 142.881,
422
+ "eval_steps_per_second": 8.95,
423
+ "step": 11500
424
+ },
425
+ {
426
+ "epoch": 1.44,
427
+ "grad_norm": 3.275371789932251,
428
+ "learning_rate": 4.8557720057720056e-05,
429
+ "loss": 0.4398,
430
+ "step": 12000
431
+ },
432
+ {
433
+ "epoch": 1.44,
434
+ "eval_f1": 0.5247242844808815,
435
+ "eval_loss": 0.9738017916679382,
436
+ "eval_precision": 0.5370369073777802,
437
+ "eval_recall": 0.6070139179816599,
438
+ "eval_runtime": 6.219,
439
+ "eval_samples_per_second": 141.18,
440
+ "eval_steps_per_second": 8.844,
441
+ "step": 12000
442
+ },
443
+ {
444
+ "epoch": 1.5,
445
+ "grad_norm": 2.4162724018096924,
446
+ "learning_rate": 4.8497594997595e-05,
447
+ "loss": 0.4526,
448
+ "step": 12500
449
+ },
450
+ {
451
+ "epoch": 1.5,
452
+ "eval_f1": 0.5589742980399895,
453
+ "eval_loss": 0.7195601463317871,
454
+ "eval_precision": 0.7046240283838195,
455
+ "eval_recall": 0.5477959316668994,
456
+ "eval_runtime": 6.3918,
457
+ "eval_samples_per_second": 137.363,
458
+ "eval_steps_per_second": 8.605,
459
+ "step": 12500
460
+ },
461
+ {
462
+ "epoch": 1.56,
463
+ "grad_norm": 6.926381587982178,
464
+ "learning_rate": 4.8437469937469944e-05,
465
+ "loss": 0.4529,
466
+ "step": 13000
467
+ },
468
+ {
469
+ "epoch": 1.56,
470
+ "eval_f1": 0.5863097712686139,
471
+ "eval_loss": 0.8049713969230652,
472
+ "eval_precision": 0.6419448505612538,
473
+ "eval_recall": 0.5730605595121724,
474
+ "eval_runtime": 6.3636,
475
+ "eval_samples_per_second": 137.971,
476
+ "eval_steps_per_second": 8.643,
477
+ "step": 13000
478
+ },
479
+ {
480
+ "epoch": 1.62,
481
+ "grad_norm": 1.8420650959014893,
482
+ "learning_rate": 4.837746512746513e-05,
483
+ "loss": 0.446,
484
+ "step": 13500
485
+ },
486
+ {
487
+ "epoch": 1.62,
488
+ "eval_f1": 0.6107236144330398,
489
+ "eval_loss": 0.7564206719398499,
490
+ "eval_precision": 0.6520992658162544,
491
+ "eval_recall": 0.5912358609132803,
492
+ "eval_runtime": 6.4128,
493
+ "eval_samples_per_second": 136.914,
494
+ "eval_steps_per_second": 8.577,
495
+ "step": 13500
496
+ },
497
+ {
498
+ "epoch": 1.68,
499
+ "grad_norm": 2.423569679260254,
500
+ "learning_rate": 4.831746031746032e-05,
501
+ "loss": 0.4315,
502
+ "step": 14000
503
+ },
504
+ {
505
+ "epoch": 1.68,
506
+ "eval_f1": 0.621245910301715,
507
+ "eval_loss": 0.751511812210083,
508
+ "eval_precision": 0.6474767054531395,
509
+ "eval_recall": 0.6069198901456967,
510
+ "eval_runtime": 5.9833,
511
+ "eval_samples_per_second": 146.741,
512
+ "eval_steps_per_second": 9.192,
513
+ "step": 14000
514
+ },
515
+ {
516
+ "epoch": 1.74,
517
+ "grad_norm": 6.773381233215332,
518
+ "learning_rate": 4.825733525733526e-05,
519
+ "loss": 0.4464,
520
+ "step": 14500
521
+ },
522
+ {
523
+ "epoch": 1.74,
524
+ "eval_f1": 0.559868694735591,
525
+ "eval_loss": 0.8307517170906067,
526
+ "eval_precision": 0.627583612882644,
527
+ "eval_recall": 0.5512991667830377,
528
+ "eval_runtime": 6.1679,
529
+ "eval_samples_per_second": 142.35,
530
+ "eval_steps_per_second": 8.917,
531
+ "step": 14500
532
+ },
533
+ {
534
+ "epoch": 1.8,
535
+ "grad_norm": 6.220128059387207,
536
+ "learning_rate": 4.8197330447330455e-05,
537
+ "loss": 0.4423,
538
+ "step": 15000
539
+ },
540
+ {
541
+ "epoch": 1.8,
542
+ "eval_f1": 0.5991996711711277,
543
+ "eval_loss": 0.798150360584259,
544
+ "eval_precision": 0.6176196711770697,
545
+ "eval_recall": 0.5936535865568123,
546
+ "eval_runtime": 6.0738,
547
+ "eval_samples_per_second": 144.556,
548
+ "eval_steps_per_second": 9.055,
549
+ "step": 15000
550
+ },
551
+ {
552
+ "epoch": 1.86,
553
+ "grad_norm": 1.1065833568572998,
554
+ "learning_rate": 4.8137205387205395e-05,
555
+ "loss": 0.4551,
556
+ "step": 15500
557
+ },
558
+ {
559
+ "epoch": 1.86,
560
+ "eval_f1": 0.6019748538222912,
561
+ "eval_loss": 0.822293221950531,
562
+ "eval_precision": 0.6355921902599784,
563
+ "eval_recall": 0.5933528836754642,
564
+ "eval_runtime": 6.1197,
565
+ "eval_samples_per_second": 143.472,
566
+ "eval_steps_per_second": 8.987,
567
+ "step": 15500
568
+ },
569
+ {
570
+ "epoch": 1.92,
571
+ "grad_norm": 8.631648063659668,
572
+ "learning_rate": 4.807708032708033e-05,
573
+ "loss": 0.4408,
574
+ "step": 16000
575
+ },
576
+ {
577
+ "epoch": 1.92,
578
+ "eval_f1": 0.5131249172090748,
579
+ "eval_loss": 0.7691208124160767,
580
+ "eval_precision": 0.608759764068229,
581
+ "eval_recall": 0.5147484057161477,
582
+ "eval_runtime": 6.3609,
583
+ "eval_samples_per_second": 138.031,
584
+ "eval_steps_per_second": 8.647,
585
+ "step": 16000
586
+ },
587
+ {
588
+ "epoch": 1.98,
589
+ "grad_norm": 6.755849361419678,
590
+ "learning_rate": 4.801695526695527e-05,
591
+ "loss": 0.4389,
592
+ "step": 16500
593
+ },
594
+ {
595
+ "epoch": 1.98,
596
+ "eval_f1": 0.6702519892656928,
597
+ "eval_loss": 0.6971784234046936,
598
+ "eval_precision": 0.6686766810877821,
599
+ "eval_recall": 0.6729106735558349,
600
+ "eval_runtime": 6.1341,
601
+ "eval_samples_per_second": 143.134,
602
+ "eval_steps_per_second": 8.966,
603
+ "step": 16500
604
+ },
605
+ {
606
+ "epoch": 2.04,
607
+ "grad_norm": 19.813188552856445,
608
+ "learning_rate": 4.795683020683021e-05,
609
+ "loss": 0.3886,
610
+ "step": 17000
611
+ },
612
+ {
613
+ "epoch": 2.04,
614
+ "eval_f1": 0.5543489692487942,
615
+ "eval_loss": 0.7798230648040771,
616
+ "eval_precision": 0.6125764375980934,
617
+ "eval_recall": 0.543671740445934,
618
+ "eval_runtime": 6.7491,
619
+ "eval_samples_per_second": 130.09,
620
+ "eval_steps_per_second": 8.149,
621
+ "step": 17000
622
+ },
623
+ {
624
+ "epoch": 2.1,
625
+ "grad_norm": 7.927220821380615,
626
+ "learning_rate": 4.789670514670515e-05,
627
+ "loss": 0.3883,
628
+ "step": 17500
629
+ },
630
+ {
631
+ "epoch": 2.1,
632
+ "eval_f1": 0.5978449313058904,
633
+ "eval_loss": 0.8385018110275269,
634
+ "eval_precision": 0.5948463716988197,
635
+ "eval_recall": 0.6225499231950845,
636
+ "eval_runtime": 6.122,
637
+ "eval_samples_per_second": 143.416,
638
+ "eval_steps_per_second": 8.984,
639
+ "step": 17500
640
+ },
641
+ {
642
+ "epoch": 2.16,
643
+ "grad_norm": 6.237366199493408,
644
+ "learning_rate": 4.783658008658009e-05,
645
+ "loss": 0.4011,
646
+ "step": 18000
647
+ },
648
+ {
649
+ "epoch": 2.16,
650
+ "eval_f1": 0.5914931472808443,
651
+ "eval_loss": 0.7754688858985901,
652
+ "eval_precision": 0.655128213311837,
653
+ "eval_recall": 0.578716194200065,
654
+ "eval_runtime": 6.558,
655
+ "eval_samples_per_second": 133.882,
656
+ "eval_steps_per_second": 8.387,
657
+ "step": 18000
658
+ },
659
+ {
660
+ "epoch": 2.22,
661
+ "grad_norm": 3.3301048278808594,
662
+ "learning_rate": 4.777645502645503e-05,
663
+ "loss": 0.3992,
664
+ "step": 18500
665
+ },
666
+ {
667
+ "epoch": 2.22,
668
+ "eval_f1": 0.5472455226037474,
669
+ "eval_loss": 0.788632333278656,
670
+ "eval_precision": 0.558195855728615,
671
+ "eval_recall": 0.5519042964204254,
672
+ "eval_runtime": 6.124,
673
+ "eval_samples_per_second": 143.371,
674
+ "eval_steps_per_second": 8.981,
675
+ "step": 18500
676
+ },
677
+ {
678
+ "epoch": 2.28,
679
+ "grad_norm": 8.471348762512207,
680
+ "learning_rate": 4.771645021645022e-05,
681
+ "loss": 0.393,
682
+ "step": 19000
683
+ },
684
+ {
685
+ "epoch": 2.28,
686
+ "eval_f1": 0.5889012942356766,
687
+ "eval_loss": 0.7660124897956848,
688
+ "eval_precision": 0.5901145289176211,
689
+ "eval_recall": 0.592326956197924,
690
+ "eval_runtime": 5.8572,
691
+ "eval_samples_per_second": 149.9,
692
+ "eval_steps_per_second": 9.39,
693
+ "step": 19000
694
+ },
695
+ {
696
+ "epoch": 2.34,
697
+ "grad_norm": 15.840304374694824,
698
+ "learning_rate": 4.765632515632516e-05,
699
+ "loss": 0.3891,
700
+ "step": 19500
701
+ },
702
+ {
703
+ "epoch": 2.34,
704
+ "eval_f1": 0.5354251462409856,
705
+ "eval_loss": 0.7701670527458191,
706
+ "eval_precision": 0.579215207029406,
707
+ "eval_recall": 0.5330605595121725,
708
+ "eval_runtime": 6.1187,
709
+ "eval_samples_per_second": 143.495,
710
+ "eval_steps_per_second": 8.989,
711
+ "step": 19500
712
+ },
713
+ {
714
+ "epoch": 2.41,
715
+ "grad_norm": 1.6515294313430786,
716
+ "learning_rate": 4.75962000962001e-05,
717
+ "loss": 0.4119,
718
+ "step": 20000
719
+ },
720
+ {
721
+ "epoch": 2.41,
722
+ "eval_f1": 0.5110658029804255,
723
+ "eval_loss": 0.8545361161231995,
724
+ "eval_precision": 0.5405823804957771,
725
+ "eval_recall": 0.5243262114229856,
726
+ "eval_runtime": 6.2418,
727
+ "eval_samples_per_second": 140.665,
728
+ "eval_steps_per_second": 8.812,
729
+ "step": 20000
730
+ },
731
+ {
732
+ "epoch": 2.47,
733
+ "grad_norm": 3.166147470474243,
734
+ "learning_rate": 4.753607503607504e-05,
735
+ "loss": 0.3981,
736
+ "step": 20500
737
+ },
738
+ {
739
+ "epoch": 2.47,
740
+ "eval_f1": 0.53639943040752,
741
+ "eval_loss": 0.864085853099823,
742
+ "eval_precision": 0.5695344700259635,
743
+ "eval_recall": 0.5536247265279522,
744
+ "eval_runtime": 5.9635,
745
+ "eval_samples_per_second": 147.229,
746
+ "eval_steps_per_second": 9.223,
747
+ "step": 20500
748
+ },
749
+ {
750
+ "epoch": 2.53,
751
+ "grad_norm": 4.143538475036621,
752
+ "learning_rate": 4.747594997594998e-05,
753
+ "loss": 0.4,
754
+ "step": 21000
755
+ },
756
+ {
757
+ "epoch": 2.53,
758
+ "eval_f1": 0.582186065915728,
759
+ "eval_loss": 0.8044998049736023,
760
+ "eval_precision": 0.5987904356270873,
761
+ "eval_recall": 0.5844826141600334,
762
+ "eval_runtime": 5.9156,
763
+ "eval_samples_per_second": 148.422,
764
+ "eval_steps_per_second": 9.298,
765
+ "step": 21000
766
+ },
767
+ {
768
+ "epoch": 2.59,
769
+ "grad_norm": 5.849362850189209,
770
+ "learning_rate": 4.741582491582492e-05,
771
+ "loss": 0.4059,
772
+ "step": 21500
773
+ },
774
+ {
775
+ "epoch": 2.59,
776
+ "eval_f1": 0.569600279809319,
777
+ "eval_loss": 0.8023470044136047,
778
+ "eval_precision": 0.6300909361955873,
779
+ "eval_recall": 0.5548880510170833,
780
+ "eval_runtime": 5.9073,
781
+ "eval_samples_per_second": 148.629,
782
+ "eval_steps_per_second": 9.31,
783
+ "step": 21500
784
+ },
785
+ {
786
+ "epoch": 2.65,
787
+ "grad_norm": 2.0296847820281982,
788
+ "learning_rate": 4.735582010582011e-05,
789
+ "loss": 0.3805,
790
+ "step": 22000
791
+ },
792
+ {
793
+ "epoch": 2.65,
794
+ "eval_f1": 0.5387095557628462,
795
+ "eval_loss": 0.8242425322532654,
796
+ "eval_precision": 0.5632921859195318,
797
+ "eval_recall": 0.536337569240795,
798
+ "eval_runtime": 6.1681,
799
+ "eval_samples_per_second": 142.345,
800
+ "eval_steps_per_second": 8.917,
801
+ "step": 22000
802
+ },
803
+ {
804
+ "epoch": 2.71,
805
+ "grad_norm": 5.022754192352295,
806
+ "learning_rate": 4.729569504569505e-05,
807
+ "loss": 0.4126,
808
+ "step": 22500
809
+ },
810
+ {
811
+ "epoch": 2.71,
812
+ "eval_f1": 0.525337187977395,
813
+ "eval_loss": 0.8866151571273804,
814
+ "eval_precision": 0.563019122327633,
815
+ "eval_recall": 0.5244211702276219,
816
+ "eval_runtime": 6.5791,
817
+ "eval_samples_per_second": 133.453,
818
+ "eval_steps_per_second": 8.36,
819
+ "step": 22500
820
+ },
821
+ {
822
+ "epoch": 2.77,
823
+ "grad_norm": 6.320919990539551,
824
+ "learning_rate": 4.7235690235690236e-05,
825
+ "loss": 0.3959,
826
+ "step": 23000
827
+ },
828
+ {
829
+ "epoch": 2.77,
830
+ "eval_f1": 0.5715827904573106,
831
+ "eval_loss": 0.922848641872406,
832
+ "eval_precision": 0.6485667793604627,
833
+ "eval_recall": 0.5569566634082763,
834
+ "eval_runtime": 6.5486,
835
+ "eval_samples_per_second": 134.075,
836
+ "eval_steps_per_second": 8.399,
837
+ "step": 23000
838
+ },
839
+ {
840
+ "epoch": 2.83,
841
+ "grad_norm": 3.2674639225006104,
842
+ "learning_rate": 4.717556517556518e-05,
843
+ "loss": 0.3972,
844
+ "step": 23500
845
+ },
846
+ {
847
+ "epoch": 2.83,
848
+ "eval_f1": 0.6330230633421515,
849
+ "eval_loss": 0.8297170400619507,
850
+ "eval_precision": 0.64149542011954,
851
+ "eval_recall": 0.633559558720849,
852
+ "eval_runtime": 6.1502,
853
+ "eval_samples_per_second": 142.759,
854
+ "eval_steps_per_second": 8.943,
855
+ "step": 23500
856
+ },
857
+ {
858
+ "epoch": 2.89,
859
+ "grad_norm": 5.248292922973633,
860
+ "learning_rate": 4.711544011544012e-05,
861
+ "loss": 0.3779,
862
+ "step": 24000
863
+ },
864
+ {
865
+ "epoch": 2.89,
866
+ "eval_f1": 0.5897470753706388,
867
+ "eval_loss": 0.8682935833930969,
868
+ "eval_precision": 0.6023327508623889,
869
+ "eval_recall": 0.5919508448540706,
870
+ "eval_runtime": 6.3839,
871
+ "eval_samples_per_second": 137.534,
872
+ "eval_steps_per_second": 8.615,
873
+ "step": 24000
874
+ },
875
+ {
876
+ "epoch": 2.95,
877
+ "grad_norm": 4.1834635734558105,
878
+ "learning_rate": 4.705531505531506e-05,
879
+ "loss": 0.3951,
880
+ "step": 24500
881
+ },
882
+ {
883
+ "epoch": 2.95,
884
+ "eval_f1": 0.5124969418380673,
885
+ "eval_loss": 0.8628427982330322,
886
+ "eval_precision": 0.5891878367677518,
887
+ "eval_recall": 0.5116492110040497,
888
+ "eval_runtime": 6.1272,
889
+ "eval_samples_per_second": 143.295,
890
+ "eval_steps_per_second": 8.976,
891
+ "step": 24500
892
+ },
893
+ {
894
+ "epoch": 3.01,
895
+ "grad_norm": 12.86809253692627,
896
+ "learning_rate": 4.699518999519e-05,
897
+ "loss": 0.3916,
898
+ "step": 25000
899
+ },
900
+ {
901
+ "epoch": 3.01,
902
+ "eval_f1": 0.5024144172335627,
903
+ "eval_loss": 0.9203388094902039,
904
+ "eval_precision": 0.6304846593419121,
905
+ "eval_recall": 0.5026001955034213,
906
+ "eval_runtime": 6.0613,
907
+ "eval_samples_per_second": 144.854,
908
+ "eval_steps_per_second": 9.074,
909
+ "step": 25000
910
+ },
911
+ {
912
+ "epoch": 3.07,
913
+ "grad_norm": 3.2101404666900635,
914
+ "learning_rate": 4.693506493506494e-05,
915
+ "loss": 0.3524,
916
+ "step": 25500
917
+ },
918
+ {
919
+ "epoch": 3.07,
920
+ "eval_f1": 0.5010573535401949,
921
+ "eval_loss": 0.9825400710105896,
922
+ "eval_precision": 0.6088672873311428,
923
+ "eval_recall": 0.5039249639249639,
924
+ "eval_runtime": 5.9279,
925
+ "eval_samples_per_second": 148.113,
926
+ "eval_steps_per_second": 9.278,
927
+ "step": 25500
928
+ },
929
+ {
930
+ "epoch": 3.13,
931
+ "grad_norm": 16.025983810424805,
932
+ "learning_rate": 4.687493987493988e-05,
933
+ "loss": 0.3332,
934
+ "step": 26000
935
+ },
936
+ {
937
+ "epoch": 3.13,
938
+ "eval_f1": 0.5814110917677252,
939
+ "eval_loss": 0.8755331635475159,
940
+ "eval_precision": 0.5979503457905185,
941
+ "eval_recall": 0.5711502117953731,
942
+ "eval_runtime": 6.5321,
943
+ "eval_samples_per_second": 134.413,
944
+ "eval_steps_per_second": 8.42,
945
+ "step": 26000
946
+ },
947
+ {
948
+ "epoch": 3.19,
949
+ "grad_norm": 12.575716972351074,
950
+ "learning_rate": 4.681481481481482e-05,
951
+ "loss": 0.3517,
952
+ "step": 26500
953
+ },
954
+ {
955
+ "epoch": 3.19,
956
+ "eval_f1": 0.6181463909269773,
957
+ "eval_loss": 0.9921577572822571,
958
+ "eval_precision": 0.6701390442386371,
959
+ "eval_recall": 0.5940511101801424,
960
+ "eval_runtime": 6.2002,
961
+ "eval_samples_per_second": 141.609,
962
+ "eval_steps_per_second": 8.871,
963
+ "step": 26500
964
+ },
965
+ {
966
+ "epoch": 3.25,
967
+ "grad_norm": 2.219468355178833,
968
+ "learning_rate": 4.675468975468976e-05,
969
+ "loss": 0.3534,
970
+ "step": 27000
971
+ },
972
+ {
973
+ "epoch": 3.25,
974
+ "eval_f1": 0.5242620258087817,
975
+ "eval_loss": 0.9572548866271973,
976
+ "eval_precision": 0.5652503976549385,
977
+ "eval_recall": 0.5174640413350091,
978
+ "eval_runtime": 6.4041,
979
+ "eval_samples_per_second": 137.101,
980
+ "eval_steps_per_second": 8.588,
981
+ "step": 27000
982
+ },
983
+ {
984
+ "epoch": 3.31,
985
+ "grad_norm": 2.1716973781585693,
986
+ "learning_rate": 4.6694684944684945e-05,
987
+ "loss": 0.3544,
988
+ "step": 27500
989
+ },
990
+ {
991
+ "epoch": 3.31,
992
+ "eval_f1": 0.5551290620723939,
993
+ "eval_loss": 0.9826774001121521,
994
+ "eval_precision": 0.5738657811880764,
995
+ "eval_recall": 0.5531322440999861,
996
+ "eval_runtime": 5.8897,
997
+ "eval_samples_per_second": 149.075,
998
+ "eval_steps_per_second": 9.338,
999
+ "step": 27500
1000
+ },
1001
+ {
1002
+ "epoch": 3.37,
1003
+ "grad_norm": 5.642761707305908,
1004
+ "learning_rate": 4.6634559884559885e-05,
1005
+ "loss": 0.3526,
1006
+ "step": 28000
1007
+ },
1008
+ {
1009
+ "epoch": 3.37,
1010
+ "eval_f1": 0.46574966897620484,
1011
+ "eval_loss": 0.9517427682876587,
1012
+ "eval_precision": 0.6019158514451703,
1013
+ "eval_recall": 0.4737364427687008,
1014
+ "eval_runtime": 6.2232,
1015
+ "eval_samples_per_second": 141.086,
1016
+ "eval_steps_per_second": 8.838,
1017
+ "step": 28000
1018
+ },
1019
+ {
1020
+ "epoch": 3.43,
1021
+ "grad_norm": 8.693815231323242,
1022
+ "learning_rate": 4.6574434824434825e-05,
1023
+ "loss": 0.3448,
1024
+ "step": 28500
1025
+ },
1026
+ {
1027
+ "epoch": 3.43,
1028
+ "eval_f1": 0.5231658522131929,
1029
+ "eval_loss": 0.955856204032898,
1030
+ "eval_precision": 0.5743577178625582,
1031
+ "eval_recall": 0.5138062654191686,
1032
+ "eval_runtime": 6.2254,
1033
+ "eval_samples_per_second": 141.036,
1034
+ "eval_steps_per_second": 8.835,
1035
+ "step": 28500
1036
+ },
1037
+ {
1038
+ "epoch": 3.49,
1039
+ "grad_norm": 10.058433532714844,
1040
+ "learning_rate": 4.6514309764309766e-05,
1041
+ "loss": 0.3662,
1042
+ "step": 29000
1043
+ },
1044
+ {
1045
+ "epoch": 3.49,
1046
+ "eval_f1": 0.6173176500366803,
1047
+ "eval_loss": 0.8469758033752441,
1048
+ "eval_precision": 0.6416565078769693,
1049
+ "eval_recall": 0.6176418563515337,
1050
+ "eval_runtime": 6.1339,
1051
+ "eval_samples_per_second": 143.14,
1052
+ "eval_steps_per_second": 8.967,
1053
+ "step": 29000
1054
+ },
1055
+ {
1056
+ "epoch": 3.55,
1057
+ "grad_norm": 9.207432746887207,
1058
+ "learning_rate": 4.645466570466571e-05,
1059
+ "loss": 0.3502,
1060
+ "step": 29500
1061
+ },
1062
+ {
1063
+ "epoch": 3.55,
1064
+ "eval_f1": 0.5911826792863208,
1065
+ "eval_loss": 0.8524171113967896,
1066
+ "eval_precision": 0.6606129937002267,
1067
+ "eval_recall": 0.577619513103384,
1068
+ "eval_runtime": 5.9367,
1069
+ "eval_samples_per_second": 147.893,
1070
+ "eval_steps_per_second": 9.264,
1071
+ "step": 29500
1072
+ },
1073
+ {
1074
+ "epoch": 3.61,
1075
+ "grad_norm": 2.538233757019043,
1076
+ "learning_rate": 4.639454064454065e-05,
1077
+ "loss": 0.3733,
1078
+ "step": 30000
1079
+ },
1080
+ {
1081
+ "epoch": 3.61,
1082
+ "eval_f1": 0.5466184654496565,
1083
+ "eval_loss": 0.9210164546966553,
1084
+ "eval_precision": 0.5577658998711631,
1085
+ "eval_recall": 0.5554857329050877,
1086
+ "eval_runtime": 6.4254,
1087
+ "eval_samples_per_second": 136.645,
1088
+ "eval_steps_per_second": 8.56,
1089
+ "step": 30000
1090
+ },
1091
+ {
1092
+ "epoch": 3.67,
1093
+ "grad_norm": 2.017235279083252,
1094
+ "learning_rate": 4.633441558441559e-05,
1095
+ "loss": 0.3424,
1096
+ "step": 30500
1097
+ },
1098
+ {
1099
+ "epoch": 3.67,
1100
+ "eval_f1": 0.5809192439862544,
1101
+ "eval_loss": 0.9294881820678711,
1102
+ "eval_precision": 0.5863171312403235,
1103
+ "eval_recall": 0.6100302564818694,
1104
+ "eval_runtime": 6.2949,
1105
+ "eval_samples_per_second": 139.477,
1106
+ "eval_steps_per_second": 8.737,
1107
+ "step": 30500
1108
+ },
1109
+ {
1110
+ "epoch": 3.73,
1111
+ "grad_norm": 7.538774490356445,
1112
+ "learning_rate": 4.627429052429053e-05,
1113
+ "loss": 0.3591,
1114
+ "step": 31000
1115
+ },
1116
+ {
1117
+ "epoch": 3.73,
1118
+ "eval_f1": 0.4588251776601326,
1119
+ "eval_loss": 0.970705509185791,
1120
+ "eval_precision": 0.5827537007312288,
1121
+ "eval_recall": 0.4768803239770982,
1122
+ "eval_runtime": 6.0168,
1123
+ "eval_samples_per_second": 145.925,
1124
+ "eval_steps_per_second": 9.141,
1125
+ "step": 31000
1126
+ },
1127
+ {
1128
+ "epoch": 3.79,
1129
+ "grad_norm": 4.64936637878418,
1130
+ "learning_rate": 4.621416546416546e-05,
1131
+ "loss": 0.3634,
1132
+ "step": 31500
1133
+ },
1134
+ {
1135
+ "epoch": 3.79,
1136
+ "eval_f1": 0.575160103511553,
1137
+ "eval_loss": 0.8524229526519775,
1138
+ "eval_precision": 0.6136046998053873,
1139
+ "eval_recall": 0.5680603267700042,
1140
+ "eval_runtime": 6.5694,
1141
+ "eval_samples_per_second": 133.651,
1142
+ "eval_steps_per_second": 8.372,
1143
+ "step": 31500
1144
+ }
1145
+ ],
1146
+ "logging_steps": 500,
1147
+ "max_steps": 415800,
1148
+ "num_input_tokens_seen": 0,
1149
+ "num_train_epochs": 50,
1150
+ "save_steps": 500,
1151
+ "total_flos": 1.3260126913238016e+17,
1152
+ "train_batch_size": 16,
1153
+ "trial_name": null,
1154
+ "trial_params": null
1155
+ }
checkpoint-31500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06f5c815d879e81b472ce52bc0bc6843f93b2e6b15b32a07dfe75c40e595456a
3
+ size 4920
config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual",
3
+ "architectures": [
4
+ "XLMRobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "gradient_checkpointing": false,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "id2label": {
15
+ "0": "negative",
16
+ "1": "neutral",
17
+ "2": "positive"
18
+ },
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 3072,
21
+ "label2id": {
22
+ "negative": 0,
23
+ "neutral": 1,
24
+ "positive": 2
25
+ },
26
+ "layer_norm_eps": 1e-05,
27
+ "max_position_embeddings": 514,
28
+ "model_type": "xlm-roberta",
29
+ "num_attention_heads": 12,
30
+ "num_hidden_layers": 12,
31
+ "output_past": true,
32
+ "pad_token_id": 1,
33
+ "position_embedding_type": "absolute",
34
+ "problem_type": "single_label_classification",
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.38.2",
37
+ "type_vocab_size": 1,
38
+ "use_cache": true,
39
+ "vocab_size": 250002
40
+ }
eval_results.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.79,
3
+ "eval_f1": 0.6702519892656928,
4
+ "eval_loss": 0.6971784234046936,
5
+ "eval_precision": 0.6686766810877821,
6
+ "eval_recall": 0.6729106735558349,
7
+ "eval_runtime": 5.8753,
8
+ "eval_samples_per_second": 149.44,
9
+ "eval_steps_per_second": 9.361,
10
+ "seed": 42,
11
+ "test_size": 9078,
12
+ "train_size": 133046,
13
+ "valid_size": 878
14
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2551fca71136ff1d6066acb7a921ef8341781d26fe6b9244440344e5e89c13d5
3
+ size 1112208084
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
test_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.79,
3
+ "test_f1": 0.6309333302387673,
4
+ "test_loss": 0.6790701746940613,
5
+ "test_precision": 0.6301019352201087,
6
+ "test_recall": 0.6496975678367595,
7
+ "test_runtime": 36.8059,
8
+ "test_samples_per_second": 246.645,
9
+ "test_steps_per_second": 15.432
10
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6f76fe13d42f80dcee0cb86a1aeb5f14f8909bb8a8782f7a4a4ad76697ef164
3
+ size 17083021
tokenizer_config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "250001": {
36
+ "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "<s>",
45
+ "clean_up_tokenization_spaces": true,
46
+ "cls_token": "<s>",
47
+ "eos_token": "</s>",
48
+ "mask_token": "<mask>",
49
+ "max_length": 128,
50
+ "model_max_length": 1000000000000000019884624838656,
51
+ "pad_to_multiple_of": null,
52
+ "pad_token": "<pad>",
53
+ "pad_token_type_id": 0,
54
+ "padding_side": "right",
55
+ "sep_token": "</s>",
56
+ "stride": 0,
57
+ "tokenizer_class": "XLMRobertaTokenizer",
58
+ "truncation_side": "right",
59
+ "truncation_strategy": "longest_first",
60
+ "unk_token": "<unk>"
61
+ }
train_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.79,
3
+ "train_loss": 0.42908321610708083,
4
+ "train_runtime": 7590.5884,
5
+ "train_samples_per_second": 876.388,
6
+ "train_steps_per_second": 54.778
7
+ }
trainer_state.json ADDED
@@ -0,0 +1,1164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6971784234046936,
3
+ "best_model_checkpoint": "./cardiffnlp-twitter-xlmr-finetuned-txtnly-all-42/checkpoint-16500",
4
+ "epoch": 3.787878787878788,
5
+ "eval_steps": 500,
6
+ "global_step": 31500,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.06,
13
+ "grad_norm": 3.298647880554199,
14
+ "learning_rate": 4.994023569023569e-05,
15
+ "loss": 0.6122,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.06,
20
+ "eval_f1": 0.4840638597456899,
21
+ "eval_loss": 0.854165256023407,
22
+ "eval_precision": 0.6558887250350466,
23
+ "eval_recall": 0.49045198529069495,
24
+ "eval_runtime": 5.9285,
25
+ "eval_samples_per_second": 148.099,
26
+ "eval_steps_per_second": 9.277,
27
+ "step": 500
28
+ },
29
+ {
30
+ "epoch": 0.12,
31
+ "grad_norm": 5.411099433898926,
32
+ "learning_rate": 4.988011063011063e-05,
33
+ "loss": 0.5497,
34
+ "step": 1000
35
+ },
36
+ {
37
+ "epoch": 0.12,
38
+ "eval_f1": 0.6209225023342669,
39
+ "eval_loss": 0.8037390112876892,
40
+ "eval_precision": 0.704421745545341,
41
+ "eval_recall": 0.6070083321696225,
42
+ "eval_runtime": 6.1691,
43
+ "eval_samples_per_second": 142.322,
44
+ "eval_steps_per_second": 8.915,
45
+ "step": 1000
46
+ },
47
+ {
48
+ "epoch": 0.18,
49
+ "grad_norm": 5.836483001708984,
50
+ "learning_rate": 4.9820105820105825e-05,
51
+ "loss": 0.5404,
52
+ "step": 1500
53
+ },
54
+ {
55
+ "epoch": 0.18,
56
+ "eval_f1": 0.3652071944289921,
57
+ "eval_loss": 0.9700150489807129,
58
+ "eval_precision": 0.5591482310679367,
59
+ "eval_recall": 0.4176288227901131,
60
+ "eval_runtime": 5.8886,
61
+ "eval_samples_per_second": 149.101,
62
+ "eval_steps_per_second": 9.34,
63
+ "step": 1500
64
+ },
65
+ {
66
+ "epoch": 0.24,
67
+ "grad_norm": 13.717193603515625,
68
+ "learning_rate": 4.975998075998076e-05,
69
+ "loss": 0.5165,
70
+ "step": 2000
71
+ },
72
+ {
73
+ "epoch": 0.24,
74
+ "eval_f1": 0.5369027892847279,
75
+ "eval_loss": 0.744874894618988,
76
+ "eval_precision": 0.7349445049700448,
77
+ "eval_recall": 0.529664385793418,
78
+ "eval_runtime": 5.996,
79
+ "eval_samples_per_second": 146.43,
80
+ "eval_steps_per_second": 9.173,
81
+ "step": 2000
82
+ },
83
+ {
84
+ "epoch": 0.3,
85
+ "grad_norm": 2.4534995555877686,
86
+ "learning_rate": 4.969997594997595e-05,
87
+ "loss": 0.5136,
88
+ "step": 2500
89
+ },
90
+ {
91
+ "epoch": 0.3,
92
+ "eval_f1": 0.5001381202499963,
93
+ "eval_loss": 0.7884698510169983,
94
+ "eval_precision": 0.6766332095394413,
95
+ "eval_recall": 0.5025275799469348,
96
+ "eval_runtime": 5.9085,
97
+ "eval_samples_per_second": 148.6,
98
+ "eval_steps_per_second": 9.309,
99
+ "step": 2500
100
+ },
101
+ {
102
+ "epoch": 0.36,
103
+ "grad_norm": 3.195244550704956,
104
+ "learning_rate": 4.963985088985089e-05,
105
+ "loss": 0.5072,
106
+ "step": 3000
107
+ },
108
+ {
109
+ "epoch": 0.36,
110
+ "eval_f1": 0.5917137619940201,
111
+ "eval_loss": 0.8123684525489807,
112
+ "eval_precision": 0.6076358199852175,
113
+ "eval_recall": 0.6132374435600242,
114
+ "eval_runtime": 6.1108,
115
+ "eval_samples_per_second": 143.68,
116
+ "eval_steps_per_second": 9.0,
117
+ "step": 3000
118
+ },
119
+ {
120
+ "epoch": 0.42,
121
+ "grad_norm": 7.579603672027588,
122
+ "learning_rate": 4.957972582972583e-05,
123
+ "loss": 0.5011,
124
+ "step": 3500
125
+ },
126
+ {
127
+ "epoch": 0.42,
128
+ "eval_f1": 0.578405909718061,
129
+ "eval_loss": 0.8767459392547607,
130
+ "eval_precision": 0.642659899090607,
131
+ "eval_recall": 0.5987143322627193,
132
+ "eval_runtime": 6.1563,
133
+ "eval_samples_per_second": 142.618,
134
+ "eval_steps_per_second": 8.934,
135
+ "step": 3500
136
+ },
137
+ {
138
+ "epoch": 0.48,
139
+ "grad_norm": 3.266787052154541,
140
+ "learning_rate": 4.951960076960077e-05,
141
+ "loss": 0.5021,
142
+ "step": 4000
143
+ },
144
+ {
145
+ "epoch": 0.48,
146
+ "eval_f1": 0.6502990015105321,
147
+ "eval_loss": 0.7957776784896851,
148
+ "eval_precision": 0.6847923256926328,
149
+ "eval_recall": 0.636192338127822,
150
+ "eval_runtime": 6.5221,
151
+ "eval_samples_per_second": 134.618,
152
+ "eval_steps_per_second": 8.433,
153
+ "step": 4000
154
+ },
155
+ {
156
+ "epoch": 0.54,
157
+ "grad_norm": 6.044332027435303,
158
+ "learning_rate": 4.945959595959596e-05,
159
+ "loss": 0.4946,
160
+ "step": 4500
161
+ },
162
+ {
163
+ "epoch": 0.54,
164
+ "eval_f1": 0.4982912515017284,
165
+ "eval_loss": 0.8045271039009094,
166
+ "eval_precision": 0.7220405815528763,
167
+ "eval_recall": 0.4968300516687614,
168
+ "eval_runtime": 6.1928,
169
+ "eval_samples_per_second": 141.778,
170
+ "eval_steps_per_second": 8.881,
171
+ "step": 4500
172
+ },
173
+ {
174
+ "epoch": 0.6,
175
+ "grad_norm": 5.152063846588135,
176
+ "learning_rate": 4.93994708994709e-05,
177
+ "loss": 0.4928,
178
+ "step": 5000
179
+ },
180
+ {
181
+ "epoch": 0.6,
182
+ "eval_f1": 0.550273048506264,
183
+ "eval_loss": 0.780342698097229,
184
+ "eval_precision": 0.7581894624319455,
185
+ "eval_recall": 0.5380887213145278,
186
+ "eval_runtime": 6.123,
187
+ "eval_samples_per_second": 143.395,
188
+ "eval_steps_per_second": 8.983,
189
+ "step": 5000
190
+ },
191
+ {
192
+ "epoch": 0.66,
193
+ "grad_norm": 4.54200553894043,
194
+ "learning_rate": 4.933934583934584e-05,
195
+ "loss": 0.5008,
196
+ "step": 5500
197
+ },
198
+ {
199
+ "epoch": 0.66,
200
+ "eval_f1": 0.4594232264185665,
201
+ "eval_loss": 0.7507085204124451,
202
+ "eval_precision": 0.44070483572560937,
203
+ "eval_recall": 0.47984452823162504,
204
+ "eval_runtime": 5.932,
205
+ "eval_samples_per_second": 148.011,
206
+ "eval_steps_per_second": 9.272,
207
+ "step": 5500
208
+ },
209
+ {
210
+ "epoch": 0.72,
211
+ "grad_norm": 4.075632095336914,
212
+ "learning_rate": 4.927922077922078e-05,
213
+ "loss": 0.4966,
214
+ "step": 6000
215
+ },
216
+ {
217
+ "epoch": 0.72,
218
+ "eval_f1": 0.6310991936984806,
219
+ "eval_loss": 0.8238988518714905,
220
+ "eval_precision": 0.6139657275796522,
221
+ "eval_recall": 0.6767434715821813,
222
+ "eval_runtime": 5.8918,
223
+ "eval_samples_per_second": 149.02,
224
+ "eval_steps_per_second": 9.335,
225
+ "step": 6000
226
+ },
227
+ {
228
+ "epoch": 0.78,
229
+ "grad_norm": 4.8725104331970215,
230
+ "learning_rate": 4.921909571909572e-05,
231
+ "loss": 0.4791,
232
+ "step": 6500
233
+ },
234
+ {
235
+ "epoch": 0.78,
236
+ "eval_f1": 0.5412559573187593,
237
+ "eval_loss": 0.7028306722640991,
238
+ "eval_precision": 0.6567775474615866,
239
+ "eval_recall": 0.520631196760229,
240
+ "eval_runtime": 6.3113,
241
+ "eval_samples_per_second": 139.116,
242
+ "eval_steps_per_second": 8.715,
243
+ "step": 6500
244
+ },
245
+ {
246
+ "epoch": 0.84,
247
+ "grad_norm": 1.4915893077850342,
248
+ "learning_rate": 4.915897065897066e-05,
249
+ "loss": 0.494,
250
+ "step": 7000
251
+ },
252
+ {
253
+ "epoch": 0.84,
254
+ "eval_f1": 0.5227267406470947,
255
+ "eval_loss": 0.8033522367477417,
256
+ "eval_precision": 0.6660302960734323,
257
+ "eval_recall": 0.5188623562817111,
258
+ "eval_runtime": 6.1252,
259
+ "eval_samples_per_second": 143.342,
260
+ "eval_steps_per_second": 8.979,
261
+ "step": 7000
262
+ },
263
+ {
264
+ "epoch": 0.9,
265
+ "grad_norm": 2.151014804840088,
266
+ "learning_rate": 4.90988455988456e-05,
267
+ "loss": 0.4861,
268
+ "step": 7500
269
+ },
270
+ {
271
+ "epoch": 0.9,
272
+ "eval_f1": 0.4541201667750796,
273
+ "eval_loss": 0.900325357913971,
274
+ "eval_precision": 0.5780562441152168,
275
+ "eval_recall": 0.4784564539403249,
276
+ "eval_runtime": 6.144,
277
+ "eval_samples_per_second": 142.903,
278
+ "eval_steps_per_second": 8.952,
279
+ "step": 7500
280
+ },
281
+ {
282
+ "epoch": 0.96,
283
+ "grad_norm": 4.770496368408203,
284
+ "learning_rate": 4.903872053872054e-05,
285
+ "loss": 0.4804,
286
+ "step": 8000
287
+ },
288
+ {
289
+ "epoch": 0.96,
290
+ "eval_f1": 0.5791890202588422,
291
+ "eval_loss": 0.773960530757904,
292
+ "eval_precision": 0.6238945275403609,
293
+ "eval_recall": 0.5775003491132523,
294
+ "eval_runtime": 6.556,
295
+ "eval_samples_per_second": 133.923,
296
+ "eval_steps_per_second": 8.389,
297
+ "step": 8000
298
+ },
299
+ {
300
+ "epoch": 1.02,
301
+ "grad_norm": 2.520460367202759,
302
+ "learning_rate": 4.897859547859548e-05,
303
+ "loss": 0.4614,
304
+ "step": 8500
305
+ },
306
+ {
307
+ "epoch": 1.02,
308
+ "eval_f1": 0.6470888284841774,
309
+ "eval_loss": 0.7397181391716003,
310
+ "eval_precision": 0.6848151355984641,
311
+ "eval_recall": 0.6312358609132803,
312
+ "eval_runtime": 6.1813,
313
+ "eval_samples_per_second": 142.042,
314
+ "eval_steps_per_second": 8.898,
315
+ "step": 8500
316
+ },
317
+ {
318
+ "epoch": 1.08,
319
+ "grad_norm": 4.375688552856445,
320
+ "learning_rate": 4.891847041847042e-05,
321
+ "loss": 0.4315,
322
+ "step": 9000
323
+ },
324
+ {
325
+ "epoch": 1.08,
326
+ "eval_f1": 0.614857769662433,
327
+ "eval_loss": 0.788919985294342,
328
+ "eval_precision": 0.6641593406916259,
329
+ "eval_recall": 0.6034743750872783,
330
+ "eval_runtime": 6.1798,
331
+ "eval_samples_per_second": 142.076,
332
+ "eval_steps_per_second": 8.9,
333
+ "step": 9000
334
+ },
335
+ {
336
+ "epoch": 1.14,
337
+ "grad_norm": 4.091088771820068,
338
+ "learning_rate": 4.885834535834536e-05,
339
+ "loss": 0.4506,
340
+ "step": 9500
341
+ },
342
+ {
343
+ "epoch": 1.14,
344
+ "eval_f1": 0.4967964786589283,
345
+ "eval_loss": 0.8783875703811646,
346
+ "eval_precision": 0.6387377173091459,
347
+ "eval_recall": 0.5016645719871526,
348
+ "eval_runtime": 5.9164,
349
+ "eval_samples_per_second": 148.401,
350
+ "eval_steps_per_second": 9.296,
351
+ "step": 9500
352
+ },
353
+ {
354
+ "epoch": 1.2,
355
+ "grad_norm": 3.3903276920318604,
356
+ "learning_rate": 4.87982202982203e-05,
357
+ "loss": 0.4489,
358
+ "step": 10000
359
+ },
360
+ {
361
+ "epoch": 1.2,
362
+ "eval_f1": 0.4949153076705755,
363
+ "eval_loss": 0.7994188070297241,
364
+ "eval_precision": 0.5340329579250159,
365
+ "eval_recall": 0.49638597961178615,
366
+ "eval_runtime": 5.9029,
367
+ "eval_samples_per_second": 148.74,
368
+ "eval_steps_per_second": 9.317,
369
+ "step": 10000
370
+ },
371
+ {
372
+ "epoch": 1.26,
373
+ "grad_norm": 3.929879903793335,
374
+ "learning_rate": 4.8738095238095235e-05,
375
+ "loss": 0.4466,
376
+ "step": 10500
377
+ },
378
+ {
379
+ "epoch": 1.26,
380
+ "eval_f1": 0.44642812881455524,
381
+ "eval_loss": 0.8109920024871826,
382
+ "eval_precision": 0.5776119229607602,
383
+ "eval_recall": 0.47351207931853095,
384
+ "eval_runtime": 5.9766,
385
+ "eval_samples_per_second": 146.907,
386
+ "eval_steps_per_second": 9.203,
387
+ "step": 10500
388
+ },
389
+ {
390
+ "epoch": 1.32,
391
+ "grad_norm": 6.443171501159668,
392
+ "learning_rate": 4.8677970177970176e-05,
393
+ "loss": 0.4319,
394
+ "step": 11000
395
+ },
396
+ {
397
+ "epoch": 1.32,
398
+ "eval_f1": 0.5481427288492505,
399
+ "eval_loss": 0.8068605661392212,
400
+ "eval_precision": 0.6612496177619213,
401
+ "eval_recall": 0.5399497276916632,
402
+ "eval_runtime": 5.9001,
403
+ "eval_samples_per_second": 148.811,
404
+ "eval_steps_per_second": 9.322,
405
+ "step": 11000
406
+ },
407
+ {
408
+ "epoch": 1.38,
409
+ "grad_norm": 7.633645057678223,
410
+ "learning_rate": 4.8617845117845116e-05,
411
+ "loss": 0.4243,
412
+ "step": 11500
413
+ },
414
+ {
415
+ "epoch": 1.38,
416
+ "eval_f1": 0.5797306372413114,
417
+ "eval_loss": 0.7941620349884033,
418
+ "eval_precision": 0.5948358635007136,
419
+ "eval_recall": 0.5704752595075175,
420
+ "eval_runtime": 6.145,
421
+ "eval_samples_per_second": 142.881,
422
+ "eval_steps_per_second": 8.95,
423
+ "step": 11500
424
+ },
425
+ {
426
+ "epoch": 1.44,
427
+ "grad_norm": 3.275371789932251,
428
+ "learning_rate": 4.8557720057720056e-05,
429
+ "loss": 0.4398,
430
+ "step": 12000
431
+ },
432
+ {
433
+ "epoch": 1.44,
434
+ "eval_f1": 0.5247242844808815,
435
+ "eval_loss": 0.9738017916679382,
436
+ "eval_precision": 0.5370369073777802,
437
+ "eval_recall": 0.6070139179816599,
438
+ "eval_runtime": 6.219,
439
+ "eval_samples_per_second": 141.18,
440
+ "eval_steps_per_second": 8.844,
441
+ "step": 12000
442
+ },
443
+ {
444
+ "epoch": 1.5,
445
+ "grad_norm": 2.4162724018096924,
446
+ "learning_rate": 4.8497594997595e-05,
447
+ "loss": 0.4526,
448
+ "step": 12500
449
+ },
450
+ {
451
+ "epoch": 1.5,
452
+ "eval_f1": 0.5589742980399895,
453
+ "eval_loss": 0.7195601463317871,
454
+ "eval_precision": 0.7046240283838195,
455
+ "eval_recall": 0.5477959316668994,
456
+ "eval_runtime": 6.3918,
457
+ "eval_samples_per_second": 137.363,
458
+ "eval_steps_per_second": 8.605,
459
+ "step": 12500
460
+ },
461
+ {
462
+ "epoch": 1.56,
463
+ "grad_norm": 6.926381587982178,
464
+ "learning_rate": 4.8437469937469944e-05,
465
+ "loss": 0.4529,
466
+ "step": 13000
467
+ },
468
+ {
469
+ "epoch": 1.56,
470
+ "eval_f1": 0.5863097712686139,
471
+ "eval_loss": 0.8049713969230652,
472
+ "eval_precision": 0.6419448505612538,
473
+ "eval_recall": 0.5730605595121724,
474
+ "eval_runtime": 6.3636,
475
+ "eval_samples_per_second": 137.971,
476
+ "eval_steps_per_second": 8.643,
477
+ "step": 13000
478
+ },
479
+ {
480
+ "epoch": 1.62,
481
+ "grad_norm": 1.8420650959014893,
482
+ "learning_rate": 4.837746512746513e-05,
483
+ "loss": 0.446,
484
+ "step": 13500
485
+ },
486
+ {
487
+ "epoch": 1.62,
488
+ "eval_f1": 0.6107236144330398,
489
+ "eval_loss": 0.7564206719398499,
490
+ "eval_precision": 0.6520992658162544,
491
+ "eval_recall": 0.5912358609132803,
492
+ "eval_runtime": 6.4128,
493
+ "eval_samples_per_second": 136.914,
494
+ "eval_steps_per_second": 8.577,
495
+ "step": 13500
496
+ },
497
+ {
498
+ "epoch": 1.68,
499
+ "grad_norm": 2.423569679260254,
500
+ "learning_rate": 4.831746031746032e-05,
501
+ "loss": 0.4315,
502
+ "step": 14000
503
+ },
504
+ {
505
+ "epoch": 1.68,
506
+ "eval_f1": 0.621245910301715,
507
+ "eval_loss": 0.751511812210083,
508
+ "eval_precision": 0.6474767054531395,
509
+ "eval_recall": 0.6069198901456967,
510
+ "eval_runtime": 5.9833,
511
+ "eval_samples_per_second": 146.741,
512
+ "eval_steps_per_second": 9.192,
513
+ "step": 14000
514
+ },
515
+ {
516
+ "epoch": 1.74,
517
+ "grad_norm": 6.773381233215332,
518
+ "learning_rate": 4.825733525733526e-05,
519
+ "loss": 0.4464,
520
+ "step": 14500
521
+ },
522
+ {
523
+ "epoch": 1.74,
524
+ "eval_f1": 0.559868694735591,
525
+ "eval_loss": 0.8307517170906067,
526
+ "eval_precision": 0.627583612882644,
527
+ "eval_recall": 0.5512991667830377,
528
+ "eval_runtime": 6.1679,
529
+ "eval_samples_per_second": 142.35,
530
+ "eval_steps_per_second": 8.917,
531
+ "step": 14500
532
+ },
533
+ {
534
+ "epoch": 1.8,
535
+ "grad_norm": 6.220128059387207,
536
+ "learning_rate": 4.8197330447330455e-05,
537
+ "loss": 0.4423,
538
+ "step": 15000
539
+ },
540
+ {
541
+ "epoch": 1.8,
542
+ "eval_f1": 0.5991996711711277,
543
+ "eval_loss": 0.798150360584259,
544
+ "eval_precision": 0.6176196711770697,
545
+ "eval_recall": 0.5936535865568123,
546
+ "eval_runtime": 6.0738,
547
+ "eval_samples_per_second": 144.556,
548
+ "eval_steps_per_second": 9.055,
549
+ "step": 15000
550
+ },
551
+ {
552
+ "epoch": 1.86,
553
+ "grad_norm": 1.1065833568572998,
554
+ "learning_rate": 4.8137205387205395e-05,
555
+ "loss": 0.4551,
556
+ "step": 15500
557
+ },
558
+ {
559
+ "epoch": 1.86,
560
+ "eval_f1": 0.6019748538222912,
561
+ "eval_loss": 0.822293221950531,
562
+ "eval_precision": 0.6355921902599784,
563
+ "eval_recall": 0.5933528836754642,
564
+ "eval_runtime": 6.1197,
565
+ "eval_samples_per_second": 143.472,
566
+ "eval_steps_per_second": 8.987,
567
+ "step": 15500
568
+ },
569
+ {
570
+ "epoch": 1.92,
571
+ "grad_norm": 8.631648063659668,
572
+ "learning_rate": 4.807708032708033e-05,
573
+ "loss": 0.4408,
574
+ "step": 16000
575
+ },
576
+ {
577
+ "epoch": 1.92,
578
+ "eval_f1": 0.5131249172090748,
579
+ "eval_loss": 0.7691208124160767,
580
+ "eval_precision": 0.608759764068229,
581
+ "eval_recall": 0.5147484057161477,
582
+ "eval_runtime": 6.3609,
583
+ "eval_samples_per_second": 138.031,
584
+ "eval_steps_per_second": 8.647,
585
+ "step": 16000
586
+ },
587
+ {
588
+ "epoch": 1.98,
589
+ "grad_norm": 6.755849361419678,
590
+ "learning_rate": 4.801695526695527e-05,
591
+ "loss": 0.4389,
592
+ "step": 16500
593
+ },
594
+ {
595
+ "epoch": 1.98,
596
+ "eval_f1": 0.6702519892656928,
597
+ "eval_loss": 0.6971784234046936,
598
+ "eval_precision": 0.6686766810877821,
599
+ "eval_recall": 0.6729106735558349,
600
+ "eval_runtime": 6.1341,
601
+ "eval_samples_per_second": 143.134,
602
+ "eval_steps_per_second": 8.966,
603
+ "step": 16500
604
+ },
605
+ {
606
+ "epoch": 2.04,
607
+ "grad_norm": 19.813188552856445,
608
+ "learning_rate": 4.795683020683021e-05,
609
+ "loss": 0.3886,
610
+ "step": 17000
611
+ },
612
+ {
613
+ "epoch": 2.04,
614
+ "eval_f1": 0.5543489692487942,
615
+ "eval_loss": 0.7798230648040771,
616
+ "eval_precision": 0.6125764375980934,
617
+ "eval_recall": 0.543671740445934,
618
+ "eval_runtime": 6.7491,
619
+ "eval_samples_per_second": 130.09,
620
+ "eval_steps_per_second": 8.149,
621
+ "step": 17000
622
+ },
623
+ {
624
+ "epoch": 2.1,
625
+ "grad_norm": 7.927220821380615,
626
+ "learning_rate": 4.789670514670515e-05,
627
+ "loss": 0.3883,
628
+ "step": 17500
629
+ },
630
+ {
631
+ "epoch": 2.1,
632
+ "eval_f1": 0.5978449313058904,
633
+ "eval_loss": 0.8385018110275269,
634
+ "eval_precision": 0.5948463716988197,
635
+ "eval_recall": 0.6225499231950845,
636
+ "eval_runtime": 6.122,
637
+ "eval_samples_per_second": 143.416,
638
+ "eval_steps_per_second": 8.984,
639
+ "step": 17500
640
+ },
641
+ {
642
+ "epoch": 2.16,
643
+ "grad_norm": 6.237366199493408,
644
+ "learning_rate": 4.783658008658009e-05,
645
+ "loss": 0.4011,
646
+ "step": 18000
647
+ },
648
+ {
649
+ "epoch": 2.16,
650
+ "eval_f1": 0.5914931472808443,
651
+ "eval_loss": 0.7754688858985901,
652
+ "eval_precision": 0.655128213311837,
653
+ "eval_recall": 0.578716194200065,
654
+ "eval_runtime": 6.558,
655
+ "eval_samples_per_second": 133.882,
656
+ "eval_steps_per_second": 8.387,
657
+ "step": 18000
658
+ },
659
+ {
660
+ "epoch": 2.22,
661
+ "grad_norm": 3.3301048278808594,
662
+ "learning_rate": 4.777645502645503e-05,
663
+ "loss": 0.3992,
664
+ "step": 18500
665
+ },
666
+ {
667
+ "epoch": 2.22,
668
+ "eval_f1": 0.5472455226037474,
669
+ "eval_loss": 0.788632333278656,
670
+ "eval_precision": 0.558195855728615,
671
+ "eval_recall": 0.5519042964204254,
672
+ "eval_runtime": 6.124,
673
+ "eval_samples_per_second": 143.371,
674
+ "eval_steps_per_second": 8.981,
675
+ "step": 18500
676
+ },
677
+ {
678
+ "epoch": 2.28,
679
+ "grad_norm": 8.471348762512207,
680
+ "learning_rate": 4.771645021645022e-05,
681
+ "loss": 0.393,
682
+ "step": 19000
683
+ },
684
+ {
685
+ "epoch": 2.28,
686
+ "eval_f1": 0.5889012942356766,
687
+ "eval_loss": 0.7660124897956848,
688
+ "eval_precision": 0.5901145289176211,
689
+ "eval_recall": 0.592326956197924,
690
+ "eval_runtime": 5.8572,
691
+ "eval_samples_per_second": 149.9,
692
+ "eval_steps_per_second": 9.39,
693
+ "step": 19000
694
+ },
695
+ {
696
+ "epoch": 2.34,
697
+ "grad_norm": 15.840304374694824,
698
+ "learning_rate": 4.765632515632516e-05,
699
+ "loss": 0.3891,
700
+ "step": 19500
701
+ },
702
+ {
703
+ "epoch": 2.34,
704
+ "eval_f1": 0.5354251462409856,
705
+ "eval_loss": 0.7701670527458191,
706
+ "eval_precision": 0.579215207029406,
707
+ "eval_recall": 0.5330605595121725,
708
+ "eval_runtime": 6.1187,
709
+ "eval_samples_per_second": 143.495,
710
+ "eval_steps_per_second": 8.989,
711
+ "step": 19500
712
+ },
713
+ {
714
+ "epoch": 2.41,
715
+ "grad_norm": 1.6515294313430786,
716
+ "learning_rate": 4.75962000962001e-05,
717
+ "loss": 0.4119,
718
+ "step": 20000
719
+ },
720
+ {
721
+ "epoch": 2.41,
722
+ "eval_f1": 0.5110658029804255,
723
+ "eval_loss": 0.8545361161231995,
724
+ "eval_precision": 0.5405823804957771,
725
+ "eval_recall": 0.5243262114229856,
726
+ "eval_runtime": 6.2418,
727
+ "eval_samples_per_second": 140.665,
728
+ "eval_steps_per_second": 8.812,
729
+ "step": 20000
730
+ },
731
+ {
732
+ "epoch": 2.47,
733
+ "grad_norm": 3.166147470474243,
734
+ "learning_rate": 4.753607503607504e-05,
735
+ "loss": 0.3981,
736
+ "step": 20500
737
+ },
738
+ {
739
+ "epoch": 2.47,
740
+ "eval_f1": 0.53639943040752,
741
+ "eval_loss": 0.864085853099823,
742
+ "eval_precision": 0.5695344700259635,
743
+ "eval_recall": 0.5536247265279522,
744
+ "eval_runtime": 5.9635,
745
+ "eval_samples_per_second": 147.229,
746
+ "eval_steps_per_second": 9.223,
747
+ "step": 20500
748
+ },
749
+ {
750
+ "epoch": 2.53,
751
+ "grad_norm": 4.143538475036621,
752
+ "learning_rate": 4.747594997594998e-05,
753
+ "loss": 0.4,
754
+ "step": 21000
755
+ },
756
+ {
757
+ "epoch": 2.53,
758
+ "eval_f1": 0.582186065915728,
759
+ "eval_loss": 0.8044998049736023,
760
+ "eval_precision": 0.5987904356270873,
761
+ "eval_recall": 0.5844826141600334,
762
+ "eval_runtime": 5.9156,
763
+ "eval_samples_per_second": 148.422,
764
+ "eval_steps_per_second": 9.298,
765
+ "step": 21000
766
+ },
767
+ {
768
+ "epoch": 2.59,
769
+ "grad_norm": 5.849362850189209,
770
+ "learning_rate": 4.741582491582492e-05,
771
+ "loss": 0.4059,
772
+ "step": 21500
773
+ },
774
+ {
775
+ "epoch": 2.59,
776
+ "eval_f1": 0.569600279809319,
777
+ "eval_loss": 0.8023470044136047,
778
+ "eval_precision": 0.6300909361955873,
779
+ "eval_recall": 0.5548880510170833,
780
+ "eval_runtime": 5.9073,
781
+ "eval_samples_per_second": 148.629,
782
+ "eval_steps_per_second": 9.31,
783
+ "step": 21500
784
+ },
785
+ {
786
+ "epoch": 2.65,
787
+ "grad_norm": 2.0296847820281982,
788
+ "learning_rate": 4.735582010582011e-05,
789
+ "loss": 0.3805,
790
+ "step": 22000
791
+ },
792
+ {
793
+ "epoch": 2.65,
794
+ "eval_f1": 0.5387095557628462,
795
+ "eval_loss": 0.8242425322532654,
796
+ "eval_precision": 0.5632921859195318,
797
+ "eval_recall": 0.536337569240795,
798
+ "eval_runtime": 6.1681,
799
+ "eval_samples_per_second": 142.345,
800
+ "eval_steps_per_second": 8.917,
801
+ "step": 22000
802
+ },
803
+ {
804
+ "epoch": 2.71,
805
+ "grad_norm": 5.022754192352295,
806
+ "learning_rate": 4.729569504569505e-05,
807
+ "loss": 0.4126,
808
+ "step": 22500
809
+ },
810
+ {
811
+ "epoch": 2.71,
812
+ "eval_f1": 0.525337187977395,
813
+ "eval_loss": 0.8866151571273804,
814
+ "eval_precision": 0.563019122327633,
815
+ "eval_recall": 0.5244211702276219,
816
+ "eval_runtime": 6.5791,
817
+ "eval_samples_per_second": 133.453,
818
+ "eval_steps_per_second": 8.36,
819
+ "step": 22500
820
+ },
821
+ {
822
+ "epoch": 2.77,
823
+ "grad_norm": 6.320919990539551,
824
+ "learning_rate": 4.7235690235690236e-05,
825
+ "loss": 0.3959,
826
+ "step": 23000
827
+ },
828
+ {
829
+ "epoch": 2.77,
830
+ "eval_f1": 0.5715827904573106,
831
+ "eval_loss": 0.922848641872406,
832
+ "eval_precision": 0.6485667793604627,
833
+ "eval_recall": 0.5569566634082763,
834
+ "eval_runtime": 6.5486,
835
+ "eval_samples_per_second": 134.075,
836
+ "eval_steps_per_second": 8.399,
837
+ "step": 23000
838
+ },
839
+ {
840
+ "epoch": 2.83,
841
+ "grad_norm": 3.2674639225006104,
842
+ "learning_rate": 4.717556517556518e-05,
843
+ "loss": 0.3972,
844
+ "step": 23500
845
+ },
846
+ {
847
+ "epoch": 2.83,
848
+ "eval_f1": 0.6330230633421515,
849
+ "eval_loss": 0.8297170400619507,
850
+ "eval_precision": 0.64149542011954,
851
+ "eval_recall": 0.633559558720849,
852
+ "eval_runtime": 6.1502,
853
+ "eval_samples_per_second": 142.759,
854
+ "eval_steps_per_second": 8.943,
855
+ "step": 23500
856
+ },
857
+ {
858
+ "epoch": 2.89,
859
+ "grad_norm": 5.248292922973633,
860
+ "learning_rate": 4.711544011544012e-05,
861
+ "loss": 0.3779,
862
+ "step": 24000
863
+ },
864
+ {
865
+ "epoch": 2.89,
866
+ "eval_f1": 0.5897470753706388,
867
+ "eval_loss": 0.8682935833930969,
868
+ "eval_precision": 0.6023327508623889,
869
+ "eval_recall": 0.5919508448540706,
870
+ "eval_runtime": 6.3839,
871
+ "eval_samples_per_second": 137.534,
872
+ "eval_steps_per_second": 8.615,
873
+ "step": 24000
874
+ },
875
+ {
876
+ "epoch": 2.95,
877
+ "grad_norm": 4.1834635734558105,
878
+ "learning_rate": 4.705531505531506e-05,
879
+ "loss": 0.3951,
880
+ "step": 24500
881
+ },
882
+ {
883
+ "epoch": 2.95,
884
+ "eval_f1": 0.5124969418380673,
885
+ "eval_loss": 0.8628427982330322,
886
+ "eval_precision": 0.5891878367677518,
887
+ "eval_recall": 0.5116492110040497,
888
+ "eval_runtime": 6.1272,
889
+ "eval_samples_per_second": 143.295,
890
+ "eval_steps_per_second": 8.976,
891
+ "step": 24500
892
+ },
893
+ {
894
+ "epoch": 3.01,
895
+ "grad_norm": 12.86809253692627,
896
+ "learning_rate": 4.699518999519e-05,
897
+ "loss": 0.3916,
898
+ "step": 25000
899
+ },
900
+ {
901
+ "epoch": 3.01,
902
+ "eval_f1": 0.5024144172335627,
903
+ "eval_loss": 0.9203388094902039,
904
+ "eval_precision": 0.6304846593419121,
905
+ "eval_recall": 0.5026001955034213,
906
+ "eval_runtime": 6.0613,
907
+ "eval_samples_per_second": 144.854,
908
+ "eval_steps_per_second": 9.074,
909
+ "step": 25000
910
+ },
911
+ {
912
+ "epoch": 3.07,
913
+ "grad_norm": 3.2101404666900635,
914
+ "learning_rate": 4.693506493506494e-05,
915
+ "loss": 0.3524,
916
+ "step": 25500
917
+ },
918
+ {
919
+ "epoch": 3.07,
920
+ "eval_f1": 0.5010573535401949,
921
+ "eval_loss": 0.9825400710105896,
922
+ "eval_precision": 0.6088672873311428,
923
+ "eval_recall": 0.5039249639249639,
924
+ "eval_runtime": 5.9279,
925
+ "eval_samples_per_second": 148.113,
926
+ "eval_steps_per_second": 9.278,
927
+ "step": 25500
928
+ },
929
+ {
930
+ "epoch": 3.13,
931
+ "grad_norm": 16.025983810424805,
932
+ "learning_rate": 4.687493987493988e-05,
933
+ "loss": 0.3332,
934
+ "step": 26000
935
+ },
936
+ {
937
+ "epoch": 3.13,
938
+ "eval_f1": 0.5814110917677252,
939
+ "eval_loss": 0.8755331635475159,
940
+ "eval_precision": 0.5979503457905185,
941
+ "eval_recall": 0.5711502117953731,
942
+ "eval_runtime": 6.5321,
943
+ "eval_samples_per_second": 134.413,
944
+ "eval_steps_per_second": 8.42,
945
+ "step": 26000
946
+ },
947
+ {
948
+ "epoch": 3.19,
949
+ "grad_norm": 12.575716972351074,
950
+ "learning_rate": 4.681481481481482e-05,
951
+ "loss": 0.3517,
952
+ "step": 26500
953
+ },
954
+ {
955
+ "epoch": 3.19,
956
+ "eval_f1": 0.6181463909269773,
957
+ "eval_loss": 0.9921577572822571,
958
+ "eval_precision": 0.6701390442386371,
959
+ "eval_recall": 0.5940511101801424,
960
+ "eval_runtime": 6.2002,
961
+ "eval_samples_per_second": 141.609,
962
+ "eval_steps_per_second": 8.871,
963
+ "step": 26500
964
+ },
965
+ {
966
+ "epoch": 3.25,
967
+ "grad_norm": 2.219468355178833,
968
+ "learning_rate": 4.675468975468976e-05,
969
+ "loss": 0.3534,
970
+ "step": 27000
971
+ },
972
+ {
973
+ "epoch": 3.25,
974
+ "eval_f1": 0.5242620258087817,
975
+ "eval_loss": 0.9572548866271973,
976
+ "eval_precision": 0.5652503976549385,
977
+ "eval_recall": 0.5174640413350091,
978
+ "eval_runtime": 6.4041,
979
+ "eval_samples_per_second": 137.101,
980
+ "eval_steps_per_second": 8.588,
981
+ "step": 27000
982
+ },
983
+ {
984
+ "epoch": 3.31,
985
+ "grad_norm": 2.1716973781585693,
986
+ "learning_rate": 4.6694684944684945e-05,
987
+ "loss": 0.3544,
988
+ "step": 27500
989
+ },
990
+ {
991
+ "epoch": 3.31,
992
+ "eval_f1": 0.5551290620723939,
993
+ "eval_loss": 0.9826774001121521,
994
+ "eval_precision": 0.5738657811880764,
995
+ "eval_recall": 0.5531322440999861,
996
+ "eval_runtime": 5.8897,
997
+ "eval_samples_per_second": 149.075,
998
+ "eval_steps_per_second": 9.338,
999
+ "step": 27500
1000
+ },
1001
+ {
1002
+ "epoch": 3.37,
1003
+ "grad_norm": 5.642761707305908,
1004
+ "learning_rate": 4.6634559884559885e-05,
1005
+ "loss": 0.3526,
1006
+ "step": 28000
1007
+ },
1008
+ {
1009
+ "epoch": 3.37,
1010
+ "eval_f1": 0.46574966897620484,
1011
+ "eval_loss": 0.9517427682876587,
1012
+ "eval_precision": 0.6019158514451703,
1013
+ "eval_recall": 0.4737364427687008,
1014
+ "eval_runtime": 6.2232,
1015
+ "eval_samples_per_second": 141.086,
1016
+ "eval_steps_per_second": 8.838,
1017
+ "step": 28000
1018
+ },
1019
+ {
1020
+ "epoch": 3.43,
1021
+ "grad_norm": 8.693815231323242,
1022
+ "learning_rate": 4.6574434824434825e-05,
1023
+ "loss": 0.3448,
1024
+ "step": 28500
1025
+ },
1026
+ {
1027
+ "epoch": 3.43,
1028
+ "eval_f1": 0.5231658522131929,
1029
+ "eval_loss": 0.955856204032898,
1030
+ "eval_precision": 0.5743577178625582,
1031
+ "eval_recall": 0.5138062654191686,
1032
+ "eval_runtime": 6.2254,
1033
+ "eval_samples_per_second": 141.036,
1034
+ "eval_steps_per_second": 8.835,
1035
+ "step": 28500
1036
+ },
1037
+ {
1038
+ "epoch": 3.49,
1039
+ "grad_norm": 10.058433532714844,
1040
+ "learning_rate": 4.6514309764309766e-05,
1041
+ "loss": 0.3662,
1042
+ "step": 29000
1043
+ },
1044
+ {
1045
+ "epoch": 3.49,
1046
+ "eval_f1": 0.6173176500366803,
1047
+ "eval_loss": 0.8469758033752441,
1048
+ "eval_precision": 0.6416565078769693,
1049
+ "eval_recall": 0.6176418563515337,
1050
+ "eval_runtime": 6.1339,
1051
+ "eval_samples_per_second": 143.14,
1052
+ "eval_steps_per_second": 8.967,
1053
+ "step": 29000
1054
+ },
1055
+ {
1056
+ "epoch": 3.55,
1057
+ "grad_norm": 9.207432746887207,
1058
+ "learning_rate": 4.645466570466571e-05,
1059
+ "loss": 0.3502,
1060
+ "step": 29500
1061
+ },
1062
+ {
1063
+ "epoch": 3.55,
1064
+ "eval_f1": 0.5911826792863208,
1065
+ "eval_loss": 0.8524171113967896,
1066
+ "eval_precision": 0.6606129937002267,
1067
+ "eval_recall": 0.577619513103384,
1068
+ "eval_runtime": 5.9367,
1069
+ "eval_samples_per_second": 147.893,
1070
+ "eval_steps_per_second": 9.264,
1071
+ "step": 29500
1072
+ },
1073
+ {
1074
+ "epoch": 3.61,
1075
+ "grad_norm": 2.538233757019043,
1076
+ "learning_rate": 4.639454064454065e-05,
1077
+ "loss": 0.3733,
1078
+ "step": 30000
1079
+ },
1080
+ {
1081
+ "epoch": 3.61,
1082
+ "eval_f1": 0.5466184654496565,
1083
+ "eval_loss": 0.9210164546966553,
1084
+ "eval_precision": 0.5577658998711631,
1085
+ "eval_recall": 0.5554857329050877,
1086
+ "eval_runtime": 6.4254,
1087
+ "eval_samples_per_second": 136.645,
1088
+ "eval_steps_per_second": 8.56,
1089
+ "step": 30000
1090
+ },
1091
+ {
1092
+ "epoch": 3.67,
1093
+ "grad_norm": 2.017235279083252,
1094
+ "learning_rate": 4.633441558441559e-05,
1095
+ "loss": 0.3424,
1096
+ "step": 30500
1097
+ },
1098
+ {
1099
+ "epoch": 3.67,
1100
+ "eval_f1": 0.5809192439862544,
1101
+ "eval_loss": 0.9294881820678711,
1102
+ "eval_precision": 0.5863171312403235,
1103
+ "eval_recall": 0.6100302564818694,
1104
+ "eval_runtime": 6.2949,
1105
+ "eval_samples_per_second": 139.477,
1106
+ "eval_steps_per_second": 8.737,
1107
+ "step": 30500
1108
+ },
1109
+ {
1110
+ "epoch": 3.73,
1111
+ "grad_norm": 7.538774490356445,
1112
+ "learning_rate": 4.627429052429053e-05,
1113
+ "loss": 0.3591,
1114
+ "step": 31000
1115
+ },
1116
+ {
1117
+ "epoch": 3.73,
1118
+ "eval_f1": 0.4588251776601326,
1119
+ "eval_loss": 0.970705509185791,
1120
+ "eval_precision": 0.5827537007312288,
1121
+ "eval_recall": 0.4768803239770982,
1122
+ "eval_runtime": 6.0168,
1123
+ "eval_samples_per_second": 145.925,
1124
+ "eval_steps_per_second": 9.141,
1125
+ "step": 31000
1126
+ },
1127
+ {
1128
+ "epoch": 3.79,
1129
+ "grad_norm": 4.64936637878418,
1130
+ "learning_rate": 4.621416546416546e-05,
1131
+ "loss": 0.3634,
1132
+ "step": 31500
1133
+ },
1134
+ {
1135
+ "epoch": 3.79,
1136
+ "eval_f1": 0.575160103511553,
1137
+ "eval_loss": 0.8524229526519775,
1138
+ "eval_precision": 0.6136046998053873,
1139
+ "eval_recall": 0.5680603267700042,
1140
+ "eval_runtime": 6.5694,
1141
+ "eval_samples_per_second": 133.651,
1142
+ "eval_steps_per_second": 8.372,
1143
+ "step": 31500
1144
+ },
1145
+ {
1146
+ "epoch": 3.79,
1147
+ "step": 31500,
1148
+ "total_flos": 1.3260126913238016e+17,
1149
+ "train_loss": 0.42908321610708083,
1150
+ "train_runtime": 7590.5884,
1151
+ "train_samples_per_second": 876.388,
1152
+ "train_steps_per_second": 54.778
1153
+ }
1154
+ ],
1155
+ "logging_steps": 500,
1156
+ "max_steps": 415800,
1157
+ "num_input_tokens_seen": 0,
1158
+ "num_train_epochs": 50,
1159
+ "save_steps": 500,
1160
+ "total_flos": 1.3260126913238016e+17,
1161
+ "train_batch_size": 16,
1162
+ "trial_name": null,
1163
+ "trial_params": null
1164
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06f5c815d879e81b472ce52bc0bc6843f93b2e6b15b32a07dfe75c40e595456a
3
+ size 4920