Shresthadev403 commited on
Commit
6ab019f
·
1 Parent(s): 1593c29

End of training

Browse files
README.md CHANGED
@@ -15,7 +15,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  This model is a fine-tuned version of [bert-base-uncased](https://huggingface.co/bert-base-uncased) on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 4.4993
19
 
20
  ## Model description
21
 
@@ -40,22 +40,62 @@ The following hyperparameters were used during training:
40
  - seed: 42
41
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
42
  - lr_scheduler_type: linear
43
- - num_epochs: 10
44
 
45
  ### Training results
46
 
47
  | Training Loss | Epoch | Step | Validation Loss |
48
  |:-------------:|:-----:|:----:|:---------------:|
49
- | 4.2775 | 1.0 | 1 | 4.4072 |
50
- | 4.0189 | 2.0 | 2 | 4.4238 |
51
- | 3.7789 | 3.0 | 3 | 4.4360 |
52
- | 3.6276 | 4.0 | 4 | 4.4511 |
53
- | 3.4433 | 5.0 | 5 | 4.4713 |
54
- | 3.3643 | 6.0 | 6 | 4.4851 |
55
- | 3.2763 | 7.0 | 7 | 4.4929 |
56
- | 3.1594 | 8.0 | 8 | 4.4962 |
57
- | 3.1192 | 9.0 | 9 | 4.4983 |
58
- | 3.0833 | 10.0 | 10 | 4.4993 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
 
61
  ### Framework versions
 
15
 
16
  This model is a fine-tuned version of [bert-base-uncased](https://huggingface.co/bert-base-uncased) on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: 4.5134
19
 
20
  ## Model description
21
 
 
40
  - seed: 42
41
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
42
  - lr_scheduler_type: linear
43
+ - num_epochs: 50
44
 
45
  ### Training results
46
 
47
  | Training Loss | Epoch | Step | Validation Loss |
48
  |:-------------:|:-----:|:----:|:---------------:|
49
+ | 4.3188 | 1.0 | 1 | 4.3048 |
50
+ | 4.0523 | 2.0 | 2 | 4.3228 |
51
+ | 3.7875 | 3.0 | 3 | 4.3474 |
52
+ | 3.621 | 4.0 | 4 | 4.3642 |
53
+ | 3.463 | 5.0 | 5 | 4.3595 |
54
+ | 3.2716 | 6.0 | 6 | 4.3646 |
55
+ | 3.1239 | 7.0 | 7 | 4.3788 |
56
+ | 2.95 | 8.0 | 8 | 4.3982 |
57
+ | 2.7756 | 9.0 | 9 | 4.4180 |
58
+ | 2.6549 | 10.0 | 10 | 4.4303 |
59
+ | 2.5244 | 11.0 | 11 | 4.4385 |
60
+ | 2.3896 | 12.0 | 12 | 4.4430 |
61
+ | 2.2981 | 13.0 | 13 | 4.4451 |
62
+ | 2.2074 | 14.0 | 14 | 4.4551 |
63
+ | 2.1064 | 15.0 | 15 | 4.4691 |
64
+ | 1.9709 | 16.0 | 16 | 4.4816 |
65
+ | 1.8962 | 17.0 | 17 | 4.4862 |
66
+ | 1.8365 | 18.0 | 18 | 4.4862 |
67
+ | 1.7521 | 19.0 | 19 | 4.4818 |
68
+ | 1.6779 | 20.0 | 20 | 4.4766 |
69
+ | 1.5921 | 21.0 | 21 | 4.4748 |
70
+ | 1.4871 | 22.0 | 22 | 4.4769 |
71
+ | 1.4557 | 23.0 | 23 | 4.4795 |
72
+ | 1.3541 | 24.0 | 24 | 4.4829 |
73
+ | 1.3201 | 25.0 | 25 | 4.4822 |
74
+ | 1.2871 | 26.0 | 26 | 4.4803 |
75
+ | 1.1979 | 27.0 | 27 | 4.4797 |
76
+ | 1.1499 | 28.0 | 28 | 4.4822 |
77
+ | 1.114 | 29.0 | 29 | 4.4855 |
78
+ | 1.0698 | 30.0 | 30 | 4.4885 |
79
+ | 1.0635 | 31.0 | 31 | 4.4903 |
80
+ | 1.0178 | 32.0 | 32 | 4.4908 |
81
+ | 0.976 | 33.0 | 33 | 4.4920 |
82
+ | 0.9467 | 34.0 | 34 | 4.4933 |
83
+ | 0.9269 | 35.0 | 35 | 4.4953 |
84
+ | 0.8948 | 36.0 | 36 | 4.4968 |
85
+ | 0.8524 | 37.0 | 37 | 4.4994 |
86
+ | 0.8592 | 38.0 | 38 | 4.5028 |
87
+ | 0.8301 | 39.0 | 39 | 4.5061 |
88
+ | 0.799 | 40.0 | 40 | 4.5076 |
89
+ | 0.7772 | 41.0 | 41 | 4.5092 |
90
+ | 0.7611 | 42.0 | 42 | 4.5109 |
91
+ | 0.7345 | 43.0 | 43 | 4.5127 |
92
+ | 0.8036 | 44.0 | 44 | 4.5138 |
93
+ | 0.7261 | 45.0 | 45 | 4.5143 |
94
+ | 0.7305 | 46.0 | 46 | 4.5140 |
95
+ | 0.6898 | 47.0 | 47 | 4.5136 |
96
+ | 0.6926 | 48.0 | 48 | 4.5134 |
97
+ | 0.7093 | 49.0 | 49 | 4.5134 |
98
+ | 0.7009 | 50.0 | 50 | 4.5134 |
99
 
100
 
101
  ### Framework versions
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1616f03a313d145cd3d624191b2cc4d24eea8aa5e570cdc0e245523832abdaa1
3
  size 435820636
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4a2d4374956c155ac9bee48ebb63b4df3368b7b92fd6771dd3c084b393a111f
3
  size 435820636
runs/Jan05_07-56-15_414819e23027/events.out.tfevents.1704441386.414819e23027.9579.12 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1c82d8815f8bf3674f1cf72721f1df697fe10d27e73cc771775941b1f31d805
3
+ size 28725
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "mask_token": "[MASK]",
48
+ "model_max_length": 512,
49
+ "pad_token": "[PAD]",
50
+ "sep_token": "[SEP]",
51
+ "strip_accents": null,
52
+ "tokenize_chinese_chars": true,
53
+ "tokenizer_class": "BertTokenizer",
54
+ "unk_token": "[UNK]"
55
+ }
trainer_state.json CHANGED
@@ -1,168 +1,728 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 10.0,
5
  "eval_steps": 1,
6
- "global_step": 10,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
- "learning_rate": 4.5e-05,
14
- "loss": 4.2775,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 1.0,
19
- "eval_loss": 4.40718936920166,
20
- "eval_runtime": 0.0266,
21
- "eval_samples_per_second": 75.136,
22
- "eval_steps_per_second": 37.568,
23
  "step": 1
24
  },
25
  {
26
  "epoch": 2.0,
27
- "learning_rate": 4e-05,
28
- "loss": 4.0189,
29
  "step": 2
30
  },
31
  {
32
  "epoch": 2.0,
33
- "eval_loss": 4.423794269561768,
34
- "eval_runtime": 0.0247,
35
- "eval_samples_per_second": 80.844,
36
- "eval_steps_per_second": 40.422,
37
  "step": 2
38
  },
39
  {
40
  "epoch": 3.0,
41
- "learning_rate": 3.5e-05,
42
- "loss": 3.7789,
43
  "step": 3
44
  },
45
  {
46
  "epoch": 3.0,
47
- "eval_loss": 4.435977458953857,
48
- "eval_runtime": 0.0252,
49
- "eval_samples_per_second": 79.233,
50
- "eval_steps_per_second": 39.616,
51
  "step": 3
52
  },
53
  {
54
  "epoch": 4.0,
55
- "learning_rate": 3e-05,
56
- "loss": 3.6276,
57
  "step": 4
58
  },
59
  {
60
  "epoch": 4.0,
61
- "eval_loss": 4.451086044311523,
62
- "eval_runtime": 0.0269,
63
- "eval_samples_per_second": 74.426,
64
- "eval_steps_per_second": 37.213,
65
  "step": 4
66
  },
67
  {
68
  "epoch": 5.0,
69
- "learning_rate": 2.5e-05,
70
- "loss": 3.4433,
71
  "step": 5
72
  },
73
  {
74
  "epoch": 5.0,
75
- "eval_loss": 4.471280097961426,
76
- "eval_runtime": 0.0268,
77
- "eval_samples_per_second": 74.615,
78
- "eval_steps_per_second": 37.308,
79
  "step": 5
80
  },
81
  {
82
  "epoch": 6.0,
83
- "learning_rate": 2e-05,
84
- "loss": 3.3643,
85
  "step": 6
86
  },
87
  {
88
  "epoch": 6.0,
89
- "eval_loss": 4.485055923461914,
90
- "eval_runtime": 0.0247,
91
- "eval_samples_per_second": 80.912,
92
- "eval_steps_per_second": 40.456,
93
  "step": 6
94
  },
95
  {
96
  "epoch": 7.0,
97
- "learning_rate": 1.5e-05,
98
- "loss": 3.2763,
99
  "step": 7
100
  },
101
  {
102
  "epoch": 7.0,
103
- "eval_loss": 4.492944240570068,
104
- "eval_runtime": 0.0253,
105
- "eval_samples_per_second": 79.031,
106
- "eval_steps_per_second": 39.516,
107
  "step": 7
108
  },
109
  {
110
  "epoch": 8.0,
111
- "learning_rate": 1e-05,
112
- "loss": 3.1594,
113
  "step": 8
114
  },
115
  {
116
  "epoch": 8.0,
117
- "eval_loss": 4.496211528778076,
118
- "eval_runtime": 0.0273,
119
- "eval_samples_per_second": 73.312,
120
- "eval_steps_per_second": 36.656,
121
  "step": 8
122
  },
123
  {
124
  "epoch": 9.0,
125
- "learning_rate": 5e-06,
126
- "loss": 3.1192,
127
  "step": 9
128
  },
129
  {
130
  "epoch": 9.0,
131
- "eval_loss": 4.498274326324463,
132
- "eval_runtime": 0.0287,
133
- "eval_samples_per_second": 69.616,
134
- "eval_steps_per_second": 34.808,
135
  "step": 9
136
  },
137
  {
138
  "epoch": 10.0,
139
- "learning_rate": 0.0,
140
- "loss": 3.0833,
141
  "step": 10
142
  },
143
  {
144
  "epoch": 10.0,
145
- "eval_loss": 4.4992828369140625,
146
- "eval_runtime": 0.026,
147
- "eval_samples_per_second": 76.897,
148
- "eval_steps_per_second": 38.448,
149
  "step": 10
150
  },
151
  {
152
- "epoch": 10.0,
153
- "step": 10,
154
- "total_flos": 5760493524000.0,
155
- "train_loss": 3.514873218536377,
156
- "train_runtime": 2.4835,
157
- "train_samples_per_second": 32.212,
158
- "train_steps_per_second": 4.027
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  }
160
  ],
161
  "logging_steps": 1,
162
- "max_steps": 10,
163
- "num_train_epochs": 10,
164
  "save_steps": 500,
165
- "total_flos": 5760493524000.0,
166
  "trial_name": null,
167
  "trial_params": null
168
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 50.0,
5
  "eval_steps": 1,
6
+ "global_step": 50,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
+ "learning_rate": 4.9e-05,
14
+ "loss": 4.3188,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 1.0,
19
+ "eval_loss": 4.304834842681885,
20
+ "eval_runtime": 0.0403,
21
+ "eval_samples_per_second": 49.662,
22
+ "eval_steps_per_second": 24.831,
23
  "step": 1
24
  },
25
  {
26
  "epoch": 2.0,
27
+ "learning_rate": 4.8e-05,
28
+ "loss": 4.0523,
29
  "step": 2
30
  },
31
  {
32
  "epoch": 2.0,
33
+ "eval_loss": 4.322819709777832,
34
+ "eval_runtime": 0.0337,
35
+ "eval_samples_per_second": 59.353,
36
+ "eval_steps_per_second": 29.677,
37
  "step": 2
38
  },
39
  {
40
  "epoch": 3.0,
41
+ "learning_rate": 4.7e-05,
42
+ "loss": 3.7875,
43
  "step": 3
44
  },
45
  {
46
  "epoch": 3.0,
47
+ "eval_loss": 4.347378730773926,
48
+ "eval_runtime": 0.0393,
49
+ "eval_samples_per_second": 50.859,
50
+ "eval_steps_per_second": 25.429,
51
  "step": 3
52
  },
53
  {
54
  "epoch": 4.0,
55
+ "learning_rate": 4.600000000000001e-05,
56
+ "loss": 3.621,
57
  "step": 4
58
  },
59
  {
60
  "epoch": 4.0,
61
+ "eval_loss": 4.364194869995117,
62
+ "eval_runtime": 0.0329,
63
+ "eval_samples_per_second": 60.728,
64
+ "eval_steps_per_second": 30.364,
65
  "step": 4
66
  },
67
  {
68
  "epoch": 5.0,
69
+ "learning_rate": 4.5e-05,
70
+ "loss": 3.463,
71
  "step": 5
72
  },
73
  {
74
  "epoch": 5.0,
75
+ "eval_loss": 4.35947847366333,
76
+ "eval_runtime": 0.0331,
77
+ "eval_samples_per_second": 60.333,
78
+ "eval_steps_per_second": 30.167,
79
  "step": 5
80
  },
81
  {
82
  "epoch": 6.0,
83
+ "learning_rate": 4.4000000000000006e-05,
84
+ "loss": 3.2716,
85
  "step": 6
86
  },
87
  {
88
  "epoch": 6.0,
89
+ "eval_loss": 4.364583492279053,
90
+ "eval_runtime": 0.0348,
91
+ "eval_samples_per_second": 57.43,
92
+ "eval_steps_per_second": 28.715,
93
  "step": 6
94
  },
95
  {
96
  "epoch": 7.0,
97
+ "learning_rate": 4.3e-05,
98
+ "loss": 3.1239,
99
  "step": 7
100
  },
101
  {
102
  "epoch": 7.0,
103
+ "eval_loss": 4.3788161277771,
104
+ "eval_runtime": 0.0323,
105
+ "eval_samples_per_second": 61.86,
106
+ "eval_steps_per_second": 30.93,
107
  "step": 7
108
  },
109
  {
110
  "epoch": 8.0,
111
+ "learning_rate": 4.2e-05,
112
+ "loss": 2.95,
113
  "step": 8
114
  },
115
  {
116
  "epoch": 8.0,
117
+ "eval_loss": 4.398151874542236,
118
+ "eval_runtime": 0.04,
119
+ "eval_samples_per_second": 49.976,
120
+ "eval_steps_per_second": 24.988,
121
  "step": 8
122
  },
123
  {
124
  "epoch": 9.0,
125
+ "learning_rate": 4.1e-05,
126
+ "loss": 2.7756,
127
  "step": 9
128
  },
129
  {
130
  "epoch": 9.0,
131
+ "eval_loss": 4.418046951293945,
132
+ "eval_runtime": 0.0418,
133
+ "eval_samples_per_second": 47.897,
134
+ "eval_steps_per_second": 23.949,
135
  "step": 9
136
  },
137
  {
138
  "epoch": 10.0,
139
+ "learning_rate": 4e-05,
140
+ "loss": 2.6549,
141
  "step": 10
142
  },
143
  {
144
  "epoch": 10.0,
145
+ "eval_loss": 4.430344581604004,
146
+ "eval_runtime": 0.0286,
147
+ "eval_samples_per_second": 69.986,
148
+ "eval_steps_per_second": 34.993,
149
  "step": 10
150
  },
151
  {
152
+ "epoch": 11.0,
153
+ "learning_rate": 3.9000000000000006e-05,
154
+ "loss": 2.5244,
155
+ "step": 11
156
+ },
157
+ {
158
+ "epoch": 11.0,
159
+ "eval_loss": 4.438453197479248,
160
+ "eval_runtime": 0.0276,
161
+ "eval_samples_per_second": 72.47,
162
+ "eval_steps_per_second": 36.235,
163
+ "step": 11
164
+ },
165
+ {
166
+ "epoch": 12.0,
167
+ "learning_rate": 3.8e-05,
168
+ "loss": 2.3896,
169
+ "step": 12
170
+ },
171
+ {
172
+ "epoch": 12.0,
173
+ "eval_loss": 4.4430317878723145,
174
+ "eval_runtime": 0.0287,
175
+ "eval_samples_per_second": 69.577,
176
+ "eval_steps_per_second": 34.788,
177
+ "step": 12
178
+ },
179
+ {
180
+ "epoch": 13.0,
181
+ "learning_rate": 3.7e-05,
182
+ "loss": 2.2981,
183
+ "step": 13
184
+ },
185
+ {
186
+ "epoch": 13.0,
187
+ "eval_loss": 4.445078372955322,
188
+ "eval_runtime": 0.0283,
189
+ "eval_samples_per_second": 70.789,
190
+ "eval_steps_per_second": 35.394,
191
+ "step": 13
192
+ },
193
+ {
194
+ "epoch": 14.0,
195
+ "learning_rate": 3.6e-05,
196
+ "loss": 2.2074,
197
+ "step": 14
198
+ },
199
+ {
200
+ "epoch": 14.0,
201
+ "eval_loss": 4.455099105834961,
202
+ "eval_runtime": 0.0298,
203
+ "eval_samples_per_second": 67.215,
204
+ "eval_steps_per_second": 33.608,
205
+ "step": 14
206
+ },
207
+ {
208
+ "epoch": 15.0,
209
+ "learning_rate": 3.5e-05,
210
+ "loss": 2.1064,
211
+ "step": 15
212
+ },
213
+ {
214
+ "epoch": 15.0,
215
+ "eval_loss": 4.469105243682861,
216
+ "eval_runtime": 0.0268,
217
+ "eval_samples_per_second": 74.497,
218
+ "eval_steps_per_second": 37.248,
219
+ "step": 15
220
+ },
221
+ {
222
+ "epoch": 16.0,
223
+ "learning_rate": 3.4000000000000007e-05,
224
+ "loss": 1.9709,
225
+ "step": 16
226
+ },
227
+ {
228
+ "epoch": 16.0,
229
+ "eval_loss": 4.4815754890441895,
230
+ "eval_runtime": 0.0324,
231
+ "eval_samples_per_second": 61.699,
232
+ "eval_steps_per_second": 30.85,
233
+ "step": 16
234
+ },
235
+ {
236
+ "epoch": 17.0,
237
+ "learning_rate": 3.3e-05,
238
+ "loss": 1.8962,
239
+ "step": 17
240
+ },
241
+ {
242
+ "epoch": 17.0,
243
+ "eval_loss": 4.486156940460205,
244
+ "eval_runtime": 0.028,
245
+ "eval_samples_per_second": 71.398,
246
+ "eval_steps_per_second": 35.699,
247
+ "step": 17
248
+ },
249
+ {
250
+ "epoch": 18.0,
251
+ "learning_rate": 3.2000000000000005e-05,
252
+ "loss": 1.8365,
253
+ "step": 18
254
+ },
255
+ {
256
+ "epoch": 18.0,
257
+ "eval_loss": 4.486203670501709,
258
+ "eval_runtime": 0.0375,
259
+ "eval_samples_per_second": 53.347,
260
+ "eval_steps_per_second": 26.674,
261
+ "step": 18
262
+ },
263
+ {
264
+ "epoch": 19.0,
265
+ "learning_rate": 3.1e-05,
266
+ "loss": 1.7521,
267
+ "step": 19
268
+ },
269
+ {
270
+ "epoch": 19.0,
271
+ "eval_loss": 4.481803894042969,
272
+ "eval_runtime": 0.0261,
273
+ "eval_samples_per_second": 76.657,
274
+ "eval_steps_per_second": 38.329,
275
+ "step": 19
276
+ },
277
+ {
278
+ "epoch": 20.0,
279
+ "learning_rate": 3e-05,
280
+ "loss": 1.6779,
281
+ "step": 20
282
+ },
283
+ {
284
+ "epoch": 20.0,
285
+ "eval_loss": 4.476602077484131,
286
+ "eval_runtime": 0.0289,
287
+ "eval_samples_per_second": 69.157,
288
+ "eval_steps_per_second": 34.579,
289
+ "step": 20
290
+ },
291
+ {
292
+ "epoch": 21.0,
293
+ "learning_rate": 2.9e-05,
294
+ "loss": 1.5921,
295
+ "step": 21
296
+ },
297
+ {
298
+ "epoch": 21.0,
299
+ "eval_loss": 4.474806785583496,
300
+ "eval_runtime": 0.0306,
301
+ "eval_samples_per_second": 65.323,
302
+ "eval_steps_per_second": 32.661,
303
+ "step": 21
304
+ },
305
+ {
306
+ "epoch": 22.0,
307
+ "learning_rate": 2.8000000000000003e-05,
308
+ "loss": 1.4871,
309
+ "step": 22
310
+ },
311
+ {
312
+ "epoch": 22.0,
313
+ "eval_loss": 4.4769287109375,
314
+ "eval_runtime": 0.0275,
315
+ "eval_samples_per_second": 72.678,
316
+ "eval_steps_per_second": 36.339,
317
+ "step": 22
318
+ },
319
+ {
320
+ "epoch": 23.0,
321
+ "learning_rate": 2.7000000000000002e-05,
322
+ "loss": 1.4557,
323
+ "step": 23
324
+ },
325
+ {
326
+ "epoch": 23.0,
327
+ "eval_loss": 4.479461193084717,
328
+ "eval_runtime": 0.0288,
329
+ "eval_samples_per_second": 69.4,
330
+ "eval_steps_per_second": 34.7,
331
+ "step": 23
332
+ },
333
+ {
334
+ "epoch": 24.0,
335
+ "learning_rate": 2.6000000000000002e-05,
336
+ "loss": 1.3541,
337
+ "step": 24
338
+ },
339
+ {
340
+ "epoch": 24.0,
341
+ "eval_loss": 4.482919216156006,
342
+ "eval_runtime": 0.0299,
343
+ "eval_samples_per_second": 66.98,
344
+ "eval_steps_per_second": 33.49,
345
+ "step": 24
346
+ },
347
+ {
348
+ "epoch": 25.0,
349
+ "learning_rate": 2.5e-05,
350
+ "loss": 1.3201,
351
+ "step": 25
352
+ },
353
+ {
354
+ "epoch": 25.0,
355
+ "eval_loss": 4.482161998748779,
356
+ "eval_runtime": 0.0318,
357
+ "eval_samples_per_second": 62.819,
358
+ "eval_steps_per_second": 31.409,
359
+ "step": 25
360
+ },
361
+ {
362
+ "epoch": 26.0,
363
+ "learning_rate": 2.4e-05,
364
+ "loss": 1.2871,
365
+ "step": 26
366
+ },
367
+ {
368
+ "epoch": 26.0,
369
+ "eval_loss": 4.480334281921387,
370
+ "eval_runtime": 0.028,
371
+ "eval_samples_per_second": 71.489,
372
+ "eval_steps_per_second": 35.745,
373
+ "step": 26
374
+ },
375
+ {
376
+ "epoch": 27.0,
377
+ "learning_rate": 2.3000000000000003e-05,
378
+ "loss": 1.1979,
379
+ "step": 27
380
+ },
381
+ {
382
+ "epoch": 27.0,
383
+ "eval_loss": 4.479716777801514,
384
+ "eval_runtime": 0.0295,
385
+ "eval_samples_per_second": 67.699,
386
+ "eval_steps_per_second": 33.849,
387
+ "step": 27
388
+ },
389
+ {
390
+ "epoch": 28.0,
391
+ "learning_rate": 2.2000000000000003e-05,
392
+ "loss": 1.1499,
393
+ "step": 28
394
+ },
395
+ {
396
+ "epoch": 28.0,
397
+ "eval_loss": 4.4821858406066895,
398
+ "eval_runtime": 0.0339,
399
+ "eval_samples_per_second": 59.02,
400
+ "eval_steps_per_second": 29.51,
401
+ "step": 28
402
+ },
403
+ {
404
+ "epoch": 29.0,
405
+ "learning_rate": 2.1e-05,
406
+ "loss": 1.114,
407
+ "step": 29
408
+ },
409
+ {
410
+ "epoch": 29.0,
411
+ "eval_loss": 4.485532283782959,
412
+ "eval_runtime": 0.0295,
413
+ "eval_samples_per_second": 67.701,
414
+ "eval_steps_per_second": 33.851,
415
+ "step": 29
416
+ },
417
+ {
418
+ "epoch": 30.0,
419
+ "learning_rate": 2e-05,
420
+ "loss": 1.0698,
421
+ "step": 30
422
+ },
423
+ {
424
+ "epoch": 30.0,
425
+ "eval_loss": 4.4885077476501465,
426
+ "eval_runtime": 0.0292,
427
+ "eval_samples_per_second": 68.609,
428
+ "eval_steps_per_second": 34.305,
429
+ "step": 30
430
+ },
431
+ {
432
+ "epoch": 31.0,
433
+ "learning_rate": 1.9e-05,
434
+ "loss": 1.0635,
435
+ "step": 31
436
+ },
437
+ {
438
+ "epoch": 31.0,
439
+ "eval_loss": 4.490349769592285,
440
+ "eval_runtime": 0.0278,
441
+ "eval_samples_per_second": 71.898,
442
+ "eval_steps_per_second": 35.949,
443
+ "step": 31
444
+ },
445
+ {
446
+ "epoch": 32.0,
447
+ "learning_rate": 1.8e-05,
448
+ "loss": 1.0178,
449
+ "step": 32
450
+ },
451
+ {
452
+ "epoch": 32.0,
453
+ "eval_loss": 4.4907612800598145,
454
+ "eval_runtime": 0.0278,
455
+ "eval_samples_per_second": 71.828,
456
+ "eval_steps_per_second": 35.914,
457
+ "step": 32
458
+ },
459
+ {
460
+ "epoch": 33.0,
461
+ "learning_rate": 1.7000000000000003e-05,
462
+ "loss": 0.976,
463
+ "step": 33
464
+ },
465
+ {
466
+ "epoch": 33.0,
467
+ "eval_loss": 4.491974830627441,
468
+ "eval_runtime": 0.0282,
469
+ "eval_samples_per_second": 71.029,
470
+ "eval_steps_per_second": 35.515,
471
+ "step": 33
472
+ },
473
+ {
474
+ "epoch": 34.0,
475
+ "learning_rate": 1.6000000000000003e-05,
476
+ "loss": 0.9467,
477
+ "step": 34
478
+ },
479
+ {
480
+ "epoch": 34.0,
481
+ "eval_loss": 4.493342876434326,
482
+ "eval_runtime": 0.0333,
483
+ "eval_samples_per_second": 60.02,
484
+ "eval_steps_per_second": 30.01,
485
+ "step": 34
486
+ },
487
+ {
488
+ "epoch": 35.0,
489
+ "learning_rate": 1.5e-05,
490
+ "loss": 0.9269,
491
+ "step": 35
492
+ },
493
+ {
494
+ "epoch": 35.0,
495
+ "eval_loss": 4.49529504776001,
496
+ "eval_runtime": 0.0269,
497
+ "eval_samples_per_second": 74.297,
498
+ "eval_steps_per_second": 37.149,
499
+ "step": 35
500
+ },
501
+ {
502
+ "epoch": 36.0,
503
+ "learning_rate": 1.4000000000000001e-05,
504
+ "loss": 0.8948,
505
+ "step": 36
506
+ },
507
+ {
508
+ "epoch": 36.0,
509
+ "eval_loss": 4.496817588806152,
510
+ "eval_runtime": 0.0305,
511
+ "eval_samples_per_second": 65.479,
512
+ "eval_steps_per_second": 32.74,
513
+ "step": 36
514
+ },
515
+ {
516
+ "epoch": 37.0,
517
+ "learning_rate": 1.3000000000000001e-05,
518
+ "loss": 0.8524,
519
+ "step": 37
520
+ },
521
+ {
522
+ "epoch": 37.0,
523
+ "eval_loss": 4.499395847320557,
524
+ "eval_runtime": 0.0281,
525
+ "eval_samples_per_second": 71.264,
526
+ "eval_steps_per_second": 35.632,
527
+ "step": 37
528
+ },
529
+ {
530
+ "epoch": 38.0,
531
+ "learning_rate": 1.2e-05,
532
+ "loss": 0.8592,
533
+ "step": 38
534
+ },
535
+ {
536
+ "epoch": 38.0,
537
+ "eval_loss": 4.502837657928467,
538
+ "eval_runtime": 0.0297,
539
+ "eval_samples_per_second": 67.239,
540
+ "eval_steps_per_second": 33.62,
541
+ "step": 38
542
+ },
543
+ {
544
+ "epoch": 39.0,
545
+ "learning_rate": 1.1000000000000001e-05,
546
+ "loss": 0.8301,
547
+ "step": 39
548
+ },
549
+ {
550
+ "epoch": 39.0,
551
+ "eval_loss": 4.506129264831543,
552
+ "eval_runtime": 0.0327,
553
+ "eval_samples_per_second": 61.108,
554
+ "eval_steps_per_second": 30.554,
555
+ "step": 39
556
+ },
557
+ {
558
+ "epoch": 40.0,
559
+ "learning_rate": 1e-05,
560
+ "loss": 0.799,
561
+ "step": 40
562
+ },
563
+ {
564
+ "epoch": 40.0,
565
+ "eval_loss": 4.507645130157471,
566
+ "eval_runtime": 0.0271,
567
+ "eval_samples_per_second": 73.666,
568
+ "eval_steps_per_second": 36.833,
569
+ "step": 40
570
+ },
571
+ {
572
+ "epoch": 41.0,
573
+ "learning_rate": 9e-06,
574
+ "loss": 0.7772,
575
+ "step": 41
576
+ },
577
+ {
578
+ "epoch": 41.0,
579
+ "eval_loss": 4.509157180786133,
580
+ "eval_runtime": 0.0285,
581
+ "eval_samples_per_second": 70.159,
582
+ "eval_steps_per_second": 35.08,
583
+ "step": 41
584
+ },
585
+ {
586
+ "epoch": 42.0,
587
+ "learning_rate": 8.000000000000001e-06,
588
+ "loss": 0.7611,
589
+ "step": 42
590
+ },
591
+ {
592
+ "epoch": 42.0,
593
+ "eval_loss": 4.510908603668213,
594
+ "eval_runtime": 0.0285,
595
+ "eval_samples_per_second": 70.164,
596
+ "eval_steps_per_second": 35.082,
597
+ "step": 42
598
+ },
599
+ {
600
+ "epoch": 43.0,
601
+ "learning_rate": 7.000000000000001e-06,
602
+ "loss": 0.7345,
603
+ "step": 43
604
+ },
605
+ {
606
+ "epoch": 43.0,
607
+ "eval_loss": 4.512662410736084,
608
+ "eval_runtime": 0.0295,
609
+ "eval_samples_per_second": 67.814,
610
+ "eval_steps_per_second": 33.907,
611
+ "step": 43
612
+ },
613
+ {
614
+ "epoch": 44.0,
615
+ "learning_rate": 6e-06,
616
+ "loss": 0.8036,
617
+ "step": 44
618
+ },
619
+ {
620
+ "epoch": 44.0,
621
+ "eval_loss": 4.51375150680542,
622
+ "eval_runtime": 0.0295,
623
+ "eval_samples_per_second": 67.686,
624
+ "eval_steps_per_second": 33.843,
625
+ "step": 44
626
+ },
627
+ {
628
+ "epoch": 45.0,
629
+ "learning_rate": 5e-06,
630
+ "loss": 0.7261,
631
+ "step": 45
632
+ },
633
+ {
634
+ "epoch": 45.0,
635
+ "eval_loss": 4.514307022094727,
636
+ "eval_runtime": 0.0366,
637
+ "eval_samples_per_second": 54.69,
638
+ "eval_steps_per_second": 27.345,
639
+ "step": 45
640
+ },
641
+ {
642
+ "epoch": 46.0,
643
+ "learning_rate": 4.000000000000001e-06,
644
+ "loss": 0.7305,
645
+ "step": 46
646
+ },
647
+ {
648
+ "epoch": 46.0,
649
+ "eval_loss": 4.514035701751709,
650
+ "eval_runtime": 0.0325,
651
+ "eval_samples_per_second": 61.45,
652
+ "eval_steps_per_second": 30.725,
653
+ "step": 46
654
+ },
655
+ {
656
+ "epoch": 47.0,
657
+ "learning_rate": 3e-06,
658
+ "loss": 0.6898,
659
+ "step": 47
660
+ },
661
+ {
662
+ "epoch": 47.0,
663
+ "eval_loss": 4.51362419128418,
664
+ "eval_runtime": 0.0279,
665
+ "eval_samples_per_second": 71.677,
666
+ "eval_steps_per_second": 35.838,
667
+ "step": 47
668
+ },
669
+ {
670
+ "epoch": 48.0,
671
+ "learning_rate": 2.0000000000000003e-06,
672
+ "loss": 0.6926,
673
+ "step": 48
674
+ },
675
+ {
676
+ "epoch": 48.0,
677
+ "eval_loss": 4.513439178466797,
678
+ "eval_runtime": 0.0286,
679
+ "eval_samples_per_second": 69.881,
680
+ "eval_steps_per_second": 34.94,
681
+ "step": 48
682
+ },
683
+ {
684
+ "epoch": 49.0,
685
+ "learning_rate": 1.0000000000000002e-06,
686
+ "loss": 0.7093,
687
+ "step": 49
688
+ },
689
+ {
690
+ "epoch": 49.0,
691
+ "eval_loss": 4.513415813446045,
692
+ "eval_runtime": 0.0336,
693
+ "eval_samples_per_second": 59.457,
694
+ "eval_steps_per_second": 29.728,
695
+ "step": 49
696
+ },
697
+ {
698
+ "epoch": 50.0,
699
+ "learning_rate": 0.0,
700
+ "loss": 0.7009,
701
+ "step": 50
702
+ },
703
+ {
704
+ "epoch": 50.0,
705
+ "eval_loss": 4.513373851776123,
706
+ "eval_runtime": 0.0276,
707
+ "eval_samples_per_second": 72.415,
708
+ "eval_steps_per_second": 36.208,
709
+ "step": 50
710
+ },
711
+ {
712
+ "epoch": 50.0,
713
+ "step": 50,
714
+ "total_flos": 28802467620000.0,
715
+ "train_loss": 1.6839551556110381,
716
+ "train_runtime": 12.3811,
717
+ "train_samples_per_second": 32.307,
718
+ "train_steps_per_second": 4.038
719
  }
720
  ],
721
  "logging_steps": 1,
722
+ "max_steps": 50,
723
+ "num_train_epochs": 50,
724
  "save_steps": 500,
725
+ "total_flos": 28802467620000.0,
726
  "trial_name": null,
727
  "trial_params": null
728
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:57f7585dd786be8c781109f854276009a7760d09f7b9631f8eb1474d4d2d507d
3
  size 4600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:058e290455a7ba56c355ecc012bae0490088b0f15056d1558ffb8bfa78af940f
3
  size 4600
vocab.txt ADDED
The diff for this file is too large to render. See raw diff