gyr66 commited on
Commit
c4fdd18
·
1 Parent(s): 0e7db07

Add parameters

Browse files
config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "THUDM/chatglm3-6b",
3
+ "add_bias_linear": false,
4
+ "add_qkv_bias": true,
5
+ "apply_query_key_layer_scaling": true,
6
+ "apply_residual_connection_post_layernorm": false,
7
+ "architectures": [
8
+ "ChatGLMForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.0,
11
+ "attention_softmax_in_fp32": true,
12
+ "auto_map": {
13
+ "AutoConfig": "THUDM/chatglm3-6b--configuration_chatglm.ChatGLMConfig",
14
+ "AutoModel": "THUDM/chatglm3-6b--modeling_chatglm.ChatGLMForConditionalGeneration",
15
+ "AutoModelForCausalLM": "THUDM/chatglm3-6b--modeling_chatglm.ChatGLMForConditionalGeneration",
16
+ "AutoModelForSeq2SeqLM": "THUDM/chatglm3-6b--modeling_chatglm.ChatGLMForConditionalGeneration",
17
+ "AutoModelForSequenceClassification": "THUDM/chatglm3-6b--modeling_chatglm.ChatGLMForSequenceClassification"
18
+ },
19
+ "bias_dropout_fusion": true,
20
+ "classifier_dropout": null,
21
+ "eos_token_id": 2,
22
+ "ffn_hidden_size": 13696,
23
+ "fp32_residual_connection": false,
24
+ "hidden_dropout": 0.0,
25
+ "hidden_size": 4096,
26
+ "kv_channels": 128,
27
+ "layernorm_epsilon": 1e-05,
28
+ "model_type": "chatglm",
29
+ "multi_query_attention": true,
30
+ "multi_query_group_num": 2,
31
+ "num_attention_heads": 32,
32
+ "num_layers": 28,
33
+ "original_rope": true,
34
+ "pad_token_id": 0,
35
+ "padded_vocab_size": 65024,
36
+ "post_layer_norm": true,
37
+ "pre_seq_len": 128,
38
+ "prefix_projection": false,
39
+ "quantization_bit": 0,
40
+ "rmsnorm": true,
41
+ "seq_length": 8192,
42
+ "tie_word_embeddings": false,
43
+ "torch_dtype": "float16",
44
+ "transformers_version": "4.37.1",
45
+ "use_cache": false,
46
+ "vocab_size": 65024
47
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": 2,
4
+ "pad_token_id": 0,
5
+ "transformers_version": "4.37.1",
6
+ "use_cache": false
7
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b97e5e600417c14384def2c941f4642664fc5fe152bef37c8770f97bed2295ae
3
+ size 7341306
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2
3
+ size 1018370
tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {},
3
+ "auto_map": {
4
+ "AutoTokenizer": [
5
+ "THUDM/chatglm3-6b--tokenization_chatglm.ChatGLMTokenizer",
6
+ null
7
+ ]
8
+ },
9
+ "chat_template": "{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|> \n {{ message['content'] }}{% else %}<|{{ message['role'] }}|> \n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
10
+ "clean_up_tokenization_spaces": false,
11
+ "do_lower_case": false,
12
+ "eos_token": "</s>",
13
+ "model_max_length": 1000000000000000019884624838656,
14
+ "pad_token": "<unk>",
15
+ "padding_side": "left",
16
+ "remove_space": false,
17
+ "tokenizer_class": "ChatGLMTokenizer",
18
+ "unk_token": "<unk>"
19
+ }
train.log ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0
  0%| | 0/100 [00:00<?, ?it/s]/home/vipuser/miniconda3/envs/GLM/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
 
 
1
  1%| | 1/100 [00:13<22:36, 13.70s/it]
2
 
 
3
  1%| | 1/100 [00:13<22:36, 13.70s/it]
4
  2%|▏ | 2/100 [00:25<20:33, 12.59s/it]
5
 
 
6
  2%|▏ | 2/100 [00:25<20:33, 12.59s/it]
7
  3%|▎ | 3/100 [00:37<19:50, 12.27s/it]
8
 
 
9
  3%|▎ | 3/100 [00:37<19:50, 12.27s/it]
10
  4%|▍ | 4/100 [00:49<19:25, 12.14s/it]
11
 
 
12
  4%|▍ | 4/100 [00:49<19:25, 12.14s/it]
13
  5%|▌ | 5/100 [01:01<19:09, 12.10s/it]
14
 
 
15
  5%|▌ | 5/100 [01:01<19:09, 12.10s/it]
16
  6%|▌ | 6/100 [01:13<18:55, 12.08s/it]
17
 
 
18
  6%|▌ | 6/100 [01:13<18:55, 12.08s/it]
19
  7%|▋ | 7/100 [01:25<18:45, 12.11s/it]
20
 
 
21
  7%|▋ | 7/100 [01:25<18:45, 12.11s/it]
22
  8%|▊ | 8/100 [01:37<18:35, 12.13s/it]
23
 
 
24
  8%|▊ | 8/100 [01:37<18:35, 12.13s/it]
25
  9%|▉ | 9/100 [01:49<18:25, 12.15s/it]
26
 
 
27
  9%|▉ | 9/100 [01:49<18:25, 12.15s/it]
28
  10%|█ | 10/100 [02:02<18:13, 12.15s/it]
29
 
 
30
  10%|█ | 10/100 [02:02<18:13, 12.15s/it]
31
  11%|█ | 11/100 [02:14<18:01, 12.16s/it]
32
 
 
33
  11%|█ | 11/100 [02:14<18:01, 12.16s/it]
34
  12%|█▏ | 12/100 [02:26<17:51, 12.18s/it]
35
 
 
36
  12%|█▏ | 12/100 [02:26<17:51, 12.18s/it]
37
  13%|█▎ | 13/100 [02:38<17:39, 12.18s/it]
38
 
 
39
  13%|█▎ | 13/100 [02:38<17:39, 12.18s/it]
40
  14%|█▍ | 14/100 [02:50<17:28, 12.19s/it]
41
 
 
42
  14%|█▍ | 14/100 [02:50<17:28, 12.19s/it]
43
  15%|█▌ | 15/100 [03:03<17:17, 12.20s/it]
44
 
 
45
  15%|█▌ | 15/100 [03:03<17:17, 12.20s/it]
46
  16%|█▌ | 16/100 [03:15<17:04, 12.20s/it]
47
 
 
48
  16%|█▌ | 16/100 [03:15<17:04, 12.20s/it]
49
  17%|█▋ | 17/100 [03:27<16:52, 12.20s/it]
50
 
 
51
  17%|█▋ | 17/100 [03:27<16:52, 12.20s/it]
52
  18%|█▊ | 18/100 [03:39<16:39, 12.19s/it]
53
 
 
54
  18%|█▊ | 18/100 [03:39<16:39, 12.19s/it]
55
  19%|█▉ | 19/100 [03:51<16:27, 12.19s/it]
56
 
 
57
  19%|█▉ | 19/100 [03:51<16:27, 12.19s/it]
58
  20%|██ | 20/100 [04:04<16:14, 12.18s/it]
59
 
 
60
  20%|██ | 20/100 [04:04<16:14, 12.18s/it]
61
  21%|██ | 21/100 [04:16<16:01, 12.17s/it]
62
 
 
63
  21%|██ | 21/100 [04:16<16:01, 12.17s/it]
64
  22%|██▏ | 22/100 [04:28<15:50, 12.18s/it]
65
 
 
66
  22%|██▏ | 22/100 [04:28<15:50, 12.18s/it]
67
  23%|██▎ | 23/100 [04:40<15:37, 12.18s/it]
68
 
 
69
  23%|██▎ | 23/100 [04:40<15:37, 12.18s/it]
70
  24%|██▍ | 24/100 [04:52<15:25, 12.17s/it]
71
 
 
72
  24%|██▍ | 24/100 [04:52<15:25, 12.17s/it]
73
  25%|██▌ | 25/100 [05:04<15:13, 12.18s/it]
74
 
 
75
  25%|██▌ | 25/100 [05:04<15:13, 12.18s/it]
76
  26%|██▌ | 26/100 [05:17<15:00, 12.17s/it]
77
 
 
78
  26%|██▌ | 26/100 [05:17<15:00, 12.17s/it]
79
  27%|██▋ | 27/100 [05:29<14:49, 12.18s/it]
80
 
 
81
  27%|██▋ | 27/100 [05:29<14:49, 12.18s/it]
82
  28%|██▊ | 28/100 [05:41<14:36, 12.17s/it]
83
 
 
84
  28%|██▊ | 28/100 [05:41<14:36, 12.17s/it]
85
  29%|██▉ | 29/100 [05:53<14:25, 12.19s/it]
86
 
 
87
  29%|██▉ | 29/100 [05:53<14:25, 12.19s/it]
88
  30%|███ | 30/100 [06:05<14:13, 12.19s/it]
89
 
 
90
  30%|███ | 30/100 [06:05<14:13, 12.19s/it]
91
  31%|███ | 31/100 [06:17<14:00, 12.18s/it]
92
 
 
93
  31%|███ | 31/100 [06:17<14:00, 12.18s/it]
94
  32%|███▏ | 32/100 [06:30<13:48, 12.18s/it]
95
 
 
96
  32%|███▏ | 32/100 [06:30<13:48, 12.18s/it]
97
  33%|███▎ | 33/100 [06:42<13:36, 12.18s/it]
98
 
 
99
  33%|███▎ | 33/100 [06:42<13:36, 12.18s/it]
100
  34%|███▍ | 34/100 [06:54<13:24, 12.18s/it]
101
 
 
102
  34%|███▍ | 34/100 [06:54<13:24, 12.18s/it]
103
  35%|███▌ | 35/100 [07:06<13:10, 12.16s/it]
104
 
 
105
  35%|███▌ | 35/100 [07:06<13:10, 12.16s/it]
106
  36%|███▌ | 36/100 [07:18<12:57, 12.16s/it]
107
 
 
108
  36%|███▌ | 36/100 [07:18<12:57, 12.16s/it]
109
  37%|███▋ | 37/100 [07:30<12:46, 12.17s/it]
110
 
 
111
  37%|███▋ | 37/100 [07:30<12:46, 12.17s/it]
112
  38%|███▊ | 38/100 [07:43<12:34, 12.17s/it]
113
 
 
114
  38%|███▊ | 38/100 [07:43<12:34, 12.17s/it]
115
  39%|███▉ | 39/100 [07:55<12:22, 12.17s/it]
116
 
 
117
  39%|███▉ | 39/100 [07:55<12:22, 12.17s/it]
118
  40%|████ | 40/100 [08:07<12:10, 12.17s/it]
119
 
 
120
  40%|████ | 40/100 [08:07<12:10, 12.17s/it]
121
  41%|████ | 41/100 [08:19<11:58, 12.18s/it]
122
 
 
123
  41%|████ | 41/100 [08:19<11:58, 12.18s/it]
124
  42%|████▏ | 42/100 [08:31<11:46, 12.17s/it]
125
 
 
126
  42%|████▏ | 42/100 [08:31<11:46, 12.17s/it]
127
  43%|████▎ | 43/100 [08:44<11:34, 12.18s/it]
128
 
 
129
  43%|████▎ | 43/100 [08:44<11:34, 12.18s/it]
130
  44%|████▍ | 44/100 [08:56<11:21, 12.17s/it]
131
 
 
132
  44%|████▍ | 44/100 [08:56<11:21, 12.17s/it]
133
  45%|████▌ | 45/100 [09:08<11:09, 12.17s/it]
134
 
 
135
  45%|████▌ | 45/100 [09:08<11:09, 12.17s/it]
136
  46%|████▌ | 46/100 [09:20<10:57, 12.18s/it]
137
 
 
138
  46%|████▌ | 46/100 [09:20<10:57, 12.18s/it]
139
  47%|████▋ | 47/100 [09:32<10:44, 12.17s/it]
140
 
 
141
  47%|████▋ | 47/100 [09:32<10:44, 12.17s/it]
142
  48%|████▊ | 48/100 [09:44<10:33, 12.18s/it]
143
 
 
144
  48%|████▊ | 48/100 [09:44<10:33, 12.18s/it]
145
  49%|████▉ | 49/100 [09:57<10:21, 12.19s/it]
146
 
 
147
  49%|████▉ | 49/100 [09:57<10:21, 12.19s/it]
148
  50%|█████ | 50/100 [10:09<10:09, 12.18s/it]
149
 
 
150
  50%|█████ | 50/100 [10:09<10:09, 12.18s/it]
151
  51%|█████ | 51/100 [10:21<09:56, 12.17s/it]
152
 
 
153
  51%|█████ | 51/100 [10:21<09:56, 12.17s/it]
154
  52%|█████▏ | 52/100 [10:33<09:44, 12.18s/it]
155
 
 
156
  52%|█████▏ | 52/100 [10:33<09:44, 12.18s/it]
157
  53%|█████▎ | 53/100 [10:45<09:32, 12.19s/it]
158
 
 
159
  53%|█████▎ | 53/100 [10:45<09:32, 12.19s/it]
160
  54%|█████▍ | 54/100 [10:58<09:20, 12.18s/it]
161
 
 
162
  54%|█████▍ | 54/100 [10:58<09:20, 12.18s/it]
163
  55%|█████▌ | 55/100 [11:10<09:08, 12.18s/it]
164
 
 
165
  55%|█████▌ | 55/100 [11:10<09:08, 12.18s/it]
166
  56%|█████▌ | 56/100 [11:22<08:56, 12.19s/it]
167
 
 
168
  56%|█████▌ | 56/100 [11:22<08:56, 12.19s/it]
169
  57%|█████▋ | 57/100 [11:34<08:43, 12.18s/it]
170
 
 
171
  57%|█████▋ | 57/100 [11:34<08:43, 12.18s/it]
172
  58%|█████▊ | 58/100 [11:46<08:31, 12.18s/it]
173
 
 
174
  58%|█████▊ | 58/100 [11:46<08:31, 12.18s/it]
175
  59%|█████▉ | 59/100 [11:58<08:19, 12.18s/it]
176
 
 
177
  59%|█████▉ | 59/100 [11:58<08:19, 12.18s/it]
178
  60%|██████ | 60/100 [12:11<08:07, 12.18s/it]
179
 
 
180
  60%|██████ | 60/100 [12:11<08:07, 12.18s/it]
181
  61%|██████ | 61/100 [12:23<07:55, 12.19s/it]
182
 
 
183
  61%|██████ | 61/100 [12:23<07:55, 12.19s/it]
184
  62%|██████▏ | 62/100 [12:35<07:42, 12.18s/it]
185
 
 
186
  62%|██████▏ | 62/100 [12:35<07:42, 12.18s/it]
187
  63%|██████▎ | 63/100 [12:47<07:30, 12.18s/it]
188
 
 
189
  63%|██████▎ | 63/100 [12:47<07:30, 12.18s/it]
190
  64%|██████▍ | 64/100 [12:59<07:18, 12.18s/it]
191
 
 
192
  64%|██████▍ | 64/100 [12:59<07:18, 12.18s/it]
193
  65%|██████▌ | 65/100 [13:12<07:06, 12.18s/it]
194
 
 
195
  65%|██████▌ | 65/100 [13:12<07:06, 12.18s/it]
196
  66%|██████▌ | 66/100 [13:24<06:54, 12.18s/it]
197
 
 
198
  66%|██████▌ | 66/100 [13:24<06:54, 12.18s/it]
199
  67%|██████▋ | 67/100 [13:36<06:42, 12.19s/it]
200
 
 
201
  67%|██████▋ | 67/100 [13:36<06:42, 12.19s/it]
202
  68%|██████▊ | 68/100 [13:48<06:29, 12.17s/it]
203
 
 
204
  68%|██████▊ | 68/100 [13:48<06:29, 12.17s/it]
205
  69%|██████▉ | 69/100 [14:00<06:17, 12.16s/it]
206
 
 
207
  69%|██████▉ | 69/100 [14:00<06:17, 12.16s/it]
208
  70%|███████ | 70/100 [14:12<06:04, 12.16s/it]
209
 
 
210
  70%|███████ | 70/100 [14:12<06:04, 12.16s/it]
211
  71%|███████ | 71/100 [14:25<05:52, 12.16s/it]
212
 
 
213
  71%|███████ | 71/100 [14:25<05:52, 12.16s/it]
214
  72%|███████▏ | 72/100 [14:37<05:41, 12.19s/it]
215
 
 
216
  72%|███████▏ | 72/100 [14:37<05:41, 12.19s/it]
217
  73%|███████▎ | 73/100 [14:49<05:28, 12.18s/it]
218
 
 
219
  73%|███████▎ | 73/100 [14:49<05:28, 12.18s/it]
220
  74%|███████▍ | 74/100 [15:01<05:16, 12.18s/it]
221
 
 
222
  74%|███████▍ | 74/100 [15:01<05:16, 12.18s/it]
223
  75%|███████▌ | 75/100 [15:13<05:04, 12.19s/it]
224
 
 
225
  75%|███████▌ | 75/100 [15:13<05:04, 12.19s/it]
226
  76%|███████▌ | 76/100 [15:26<04:52, 12.19s/it]
227
 
 
228
  76%|███████▌ | 76/100 [15:26<04:52, 12.19s/it]
229
  77%|███████▋ | 77/100 [15:38<04:40, 12.19s/it]
230
 
 
231
  77%|███████▋ | 77/100 [15:38<04:40, 12.19s/it]
232
  78%|███████▊ | 78/100 [15:50<04:28, 12.19s/it]
233
 
 
234
  78%|███████▊ | 78/100 [15:50<04:28, 12.19s/it]
235
  79%|███████▉ | 79/100 [16:02<04:16, 12.20s/it]
236
 
 
237
  79%|███████▉ | 79/100 [16:02<04:16, 12.20s/it]
238
  80%|████████ | 80/100 [16:14<04:03, 12.17s/it]
239
 
 
240
  80%|████████ | 80/100 [16:14<04:03, 12.17s/it]
241
  81%|████████ | 81/100 [16:26<03:51, 12.18s/it]
242
 
 
243
  81%|████████ | 81/100 [16:26<03:51, 12.18s/it]
244
  82%|████████▏ | 82/100 [16:39<03:39, 12.18s/it]
245
 
 
246
  82%|████████▏ | 82/100 [16:39<03:39, 12.18s/it]
247
  83%|████████▎ | 83/100 [16:51<03:26, 12.16s/it]
248
 
 
249
  83%|████████▎ | 83/100 [16:51<03:26, 12.16s/it]
250
  84%|████████▍ | 84/100 [17:03<03:14, 12.18s/it]
251
 
 
252
  84%|████████▍ | 84/100 [17:03<03:14, 12.18s/it]
253
  85%|████████▌ | 85/100 [17:15<03:02, 12.18s/it]
254
 
 
255
  85%|████████▌ | 85/100 [17:15<03:02, 12.18s/it]
256
  86%|████████▌ | 86/100 [17:27<02:50, 12.18s/it]
257
 
 
258
  86%|████████▌ | 86/100 [17:27<02:50, 12.18s/it]
259
  87%|████████▋ | 87/100 [17:39<02:38, 12.18s/it]
260
 
 
261
  87%|████████▋ | 87/100 [17:39<02:38, 12.18s/it]
262
  88%|████████▊ | 88/100 [17:52<02:26, 12.18s/it]
263
 
 
264
  88%|████████▊ | 88/100 [17:52<02:26, 12.18s/it]
265
  89%|████████▉ | 89/100 [18:04<02:13, 12.18s/it]
266
 
 
267
  89%|████████▉ | 89/100 [18:04<02:13, 12.18s/it]
268
  90%|█████████ | 90/100 [18:16<02:01, 12.18s/it]
269
 
 
270
  90%|█████████ | 90/100 [18:16<02:01, 12.18s/it]
271
  91%|█████████ | 91/100 [18:28<01:49, 12.18s/it]
272
 
 
273
  91%|█████████ | 91/100 [18:28<01:49, 12.18s/it]
274
  92%|█████████▏| 92/100 [18:40<01:37, 12.20s/it]
275
 
 
276
  92%|█████████▏| 92/100 [18:40<01:37, 12.20s/it]
277
  93%|█████████▎| 93/100 [18:53<01:25, 12.21s/it]
278
 
 
279
  93%|█████████▎| 93/100 [18:53<01:25, 12.21s/it]
280
  94%|█████████▍| 94/100 [19:05<01:13, 12.19s/it]
281
 
 
282
  94%|█████████▍| 94/100 [19:05<01:13, 12.19s/it]
283
  95%|█████████▌| 95/100 [19:17<01:00, 12.18s/it]
284
 
 
285
  95%|█████████▌| 95/100 [19:17<01:00, 12.18s/it]
286
  96%|█████████▌| 96/100 [19:29<00:48, 12.18s/it]
287
 
 
288
  96%|█████████▌| 96/100 [19:29<00:48, 12.18s/it]
289
  97%|█████████▋| 97/100 [19:41<00:36, 12.19s/it]
290
 
 
291
  97%|█████████▋| 97/100 [19:41<00:36, 12.19s/it]
292
  98%|█████████▊| 98/100 [19:54<00:24, 12.19s/it]
293
 
 
294
  98%|█████████▊| 98/100 [19:54<00:24, 12.19s/it]
295
  99%|█████████▉| 99/100 [20:06<00:12, 12.18s/it]
296
 
 
297
  99%|█████████▉| 99/100 [20:06<00:12, 12.18s/it]
298
 
 
 
 
 
 
 
299
 
 
 
 
 
 
 
 
 
1
+ [2024-01-26 12:54:39,523] torch.distributed.run: [WARNING] master_addr is only used for static rdzv_backend and when rdzv_endpoint is not specified.
2
+ 01/26/2024 12:54:44 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: False
3
+ 01/26/2024 12:54:44 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments(
4
+ _n_gpu=1,
5
+ adafactor=False,
6
+ adam_beta1=0.9,
7
+ adam_beta2=0.999,
8
+ adam_epsilon=1e-08,
9
+ auto_find_batch_size=False,
10
+ bf16=False,
11
+ bf16_full_eval=False,
12
+ data_seed=None,
13
+ dataloader_drop_last=False,
14
+ dataloader_num_workers=0,
15
+ dataloader_persistent_workers=False,
16
+ dataloader_pin_memory=True,
17
+ ddp_backend=None,
18
+ ddp_broadcast_buffers=None,
19
+ ddp_bucket_cap_mb=None,
20
+ ddp_find_unused_parameters=False,
21
+ ddp_timeout=1800,
22
+ debug=[],
23
+ deepspeed=None,
24
+ disable_tqdm=False,
25
+ dispatch_batches=None,
26
+ do_eval=False,
27
+ do_predict=False,
28
+ do_train=False,
29
+ eval_accumulation_steps=None,
30
+ eval_delay=0,
31
+ eval_steps=None,
32
+ evaluation_strategy=no,
33
+ fp16=False,
34
+ fp16_backend=auto,
35
+ fp16_full_eval=False,
36
+ fp16_opt_level=O1,
37
+ fsdp=[],
38
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
39
+ fsdp_min_num_params=0,
40
+ fsdp_transformer_layer_cls_to_wrap=None,
41
+ full_determinism=False,
42
+ generation_config=None,
43
+ generation_max_length=None,
44
+ generation_num_beams=None,
45
+ gradient_accumulation_steps=32,
46
+ gradient_checkpointing=False,
47
+ gradient_checkpointing_kwargs=None,
48
+ greater_is_better=None,
49
+ group_by_length=False,
50
+ half_precision_backend=auto,
51
+ hub_always_push=False,
52
+ hub_model_id=None,
53
+ hub_private_repo=False,
54
+ hub_strategy=every_save,
55
+ hub_token=<HUB_TOKEN>,
56
+ ignore_data_skip=False,
57
+ include_inputs_for_metrics=False,
58
+ include_num_input_tokens_seen=False,
59
+ include_tokens_per_second=False,
60
+ jit_mode_eval=False,
61
+ label_names=None,
62
+ label_smoothing_factor=0.0,
63
+ learning_rate=0.02,
64
+ length_column_name=length,
65
+ load_best_model_at_end=False,
66
+ local_rank=0,
67
+ log_level=passive,
68
+ log_level_replica=warning,
69
+ log_on_each_node=True,
70
+ logging_dir=output/privacy_detection_pt-20240126-125436-128-2e-2/runs/Jan26_12-54-44_ubuntu1804,
71
+ logging_first_step=False,
72
+ logging_nan_inf_filter=True,
73
+ logging_steps=1.0,
74
+ logging_strategy=steps,
75
+ lr_scheduler_kwargs={},
76
+ lr_scheduler_type=linear,
77
+ max_grad_norm=1.0,
78
+ max_steps=100,
79
+ metric_for_best_model=None,
80
+ mp_parameters=,
81
+ neftune_noise_alpha=None,
82
+ no_cuda=False,
83
+ num_train_epochs=3.0,
84
+ optim=adamw_torch,
85
+ optim_args=None,
86
+ output_dir=output/privacy_detection_pt-20240126-125436-128-2e-2,
87
+ overwrite_output_dir=False,
88
+ past_index=-1,
89
+ per_device_eval_batch_size=8,
90
+ per_device_train_batch_size=1,
91
+ predict_with_generate=False,
92
+ prediction_loss_only=False,
93
+ push_to_hub=False,
94
+ push_to_hub_model_id=None,
95
+ push_to_hub_organization=None,
96
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
97
+ ray_scope=last,
98
+ remove_unused_columns=True,
99
+ report_to=[],
100
+ resume_from_checkpoint=True,
101
+ run_name=output/privacy_detection_pt-20240126-125436-128-2e-2,
102
+ save_on_each_node=False,
103
+ save_only_model=False,
104
+ save_safetensors=False,
105
+ save_steps=500,
106
+ save_strategy=steps,
107
+ save_total_limit=None,
108
+ seed=42,
109
+ skip_memory_metrics=True,
110
+ sortish_sampler=False,
111
+ split_batches=False,
112
+ tf32=None,
113
+ torch_compile=False,
114
+ torch_compile_backend=None,
115
+ torch_compile_mode=None,
116
+ torchdynamo=None,
117
+ tpu_metrics_debug=False,
118
+ tpu_num_cores=None,
119
+ use_cpu=False,
120
+ use_ipex=False,
121
+ use_legacy_prediction_loop=False,
122
+ use_mps_device=False,
123
+ warmup_ratio=0.0,
124
+ warmup_steps=0,
125
+ weight_decay=0.0,
126
+ )
127
+ [INFO|configuration_utils.py:729] 2024-01-26 12:54:45,398 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--THUDM--chatglm3-6b/snapshots/37f2196f481f8989ea443be625d05f97043652ea/config.json
128
+ [INFO|configuration_utils.py:729] 2024-01-26 12:54:45,957 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--THUDM--chatglm3-6b/snapshots/37f2196f481f8989ea443be625d05f97043652ea/config.json
129
+ [INFO|configuration_utils.py:792] 2024-01-26 12:54:45,960 >> Model config ChatGLMConfig {
130
+ "_name_or_path": "THUDM/chatglm3-6b",
131
+ "add_bias_linear": false,
132
+ "add_qkv_bias": true,
133
+ "apply_query_key_layer_scaling": true,
134
+ "apply_residual_connection_post_layernorm": false,
135
+ "architectures": [
136
+ "ChatGLMModel"
137
+ ],
138
+ "attention_dropout": 0.0,
139
+ "attention_softmax_in_fp32": true,
140
+ "auto_map": {
141
+ "AutoConfig": "THUDM/chatglm3-6b--configuration_chatglm.ChatGLMConfig",
142
+ "AutoModel": "THUDM/chatglm3-6b--modeling_chatglm.ChatGLMForConditionalGeneration",
143
+ "AutoModelForCausalLM": "THUDM/chatglm3-6b--modeling_chatglm.ChatGLMForConditionalGeneration",
144
+ "AutoModelForSeq2SeqLM": "THUDM/chatglm3-6b--modeling_chatglm.ChatGLMForConditionalGeneration",
145
+ "AutoModelForSequenceClassification": "THUDM/chatglm3-6b--modeling_chatglm.ChatGLMForSequenceClassification"
146
+ },
147
+ "bias_dropout_fusion": true,
148
+ "classifier_dropout": null,
149
+ "eos_token_id": 2,
150
+ "ffn_hidden_size": 13696,
151
+ "fp32_residual_connection": false,
152
+ "hidden_dropout": 0.0,
153
+ "hidden_size": 4096,
154
+ "kv_channels": 128,
155
+ "layernorm_epsilon": 1e-05,
156
+ "model_type": "chatglm",
157
+ "multi_query_attention": true,
158
+ "multi_query_group_num": 2,
159
+ "num_attention_heads": 32,
160
+ "num_layers": 28,
161
+ "original_rope": true,
162
+ "pad_token_id": 0,
163
+ "padded_vocab_size": 65024,
164
+ "post_layer_norm": true,
165
+ "pre_seq_len": null,
166
+ "prefix_projection": false,
167
+ "quantization_bit": 0,
168
+ "rmsnorm": true,
169
+ "seq_length": 8192,
170
+ "tie_word_embeddings": false,
171
+ "torch_dtype": "float16",
172
+ "transformers_version": "4.37.1",
173
+ "use_cache": true,
174
+ "vocab_size": 65024
175
+ }
176
+
177
+ [INFO|tokenization_utils_base.py:2027] 2024-01-26 12:54:46,519 >> loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--THUDM--chatglm3-6b/snapshots/37f2196f481f8989ea443be625d05f97043652ea/tokenizer.model
178
+ [INFO|tokenization_utils_base.py:2027] 2024-01-26 12:54:46,519 >> loading file added_tokens.json from cache at None
179
+ [INFO|tokenization_utils_base.py:2027] 2024-01-26 12:54:46,519 >> loading file special_tokens_map.json from cache at None
180
+ [INFO|tokenization_utils_base.py:2027] 2024-01-26 12:54:46,519 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--THUDM--chatglm3-6b/snapshots/37f2196f481f8989ea443be625d05f97043652ea/tokenizer_config.json
181
+ [INFO|tokenization_utils_base.py:2027] 2024-01-26 12:54:46,519 >> loading file tokenizer.json from cache at None
182
+ [INFO|modeling_utils.py:3478] 2024-01-26 12:54:47,170 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--THUDM--chatglm3-6b/snapshots/37f2196f481f8989ea443be625d05f97043652ea/model.safetensors.index.json
183
+ [INFO|configuration_utils.py:826] 2024-01-26 12:54:47,177 >> Generate config GenerationConfig {
184
+ "eos_token_id": 2,
185
+ "pad_token_id": 0,
186
+ "use_cache": false
187
+ }
188
+
189
+
190
+ [INFO|modeling_utils.py:4352] 2024-01-26 12:55:07,172 >> All model checkpoint weights were used when initializing ChatGLMForConditionalGeneration.
191
+
192
+ [WARNING|modeling_utils.py:4354] 2024-01-26 12:55:07,173 >> Some weights of ChatGLMForConditionalGeneration were not initialized from the model checkpoint at THUDM/chatglm3-6b and are newly initialized: ['transformer.prefix_encoder.embedding.weight']
193
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
194
+ [INFO|modeling_utils.py:3897] 2024-01-26 12:55:07,458 >> Generation config file not found, using a generation config created from the model config.
195
+ Sanity Check >>>>>>>>>>>>>
196
+ '[gMASK]': 64790 -> -100
197
+ 'sop': 64792 -> -100
198
+ '': 30910 -> -100
199
+ '请': 55073 -> -100
200
+ '找出': 40369 -> -100
201
+ '下面': 33182 -> -100
202
+ '文本': 36704 -> -100
203
+ '中的': 31697 -> -100
204
+ 'position': 6523 -> -100
205
+ ':': 31211 -> -100
206
+ '艺术': 31835 -> -100
207
+ '是': 54532 -> -100
208
+ '相同的': 38815 -> -100
209
+ ',': 31123 -> -100
210
+ '音乐': 32000 -> -100
211
+ '美术': 33020 -> -100
212
+ '体育': 32214 -> -100
213
+ '三': 54645 -> -100
214
+ '样': 54741 -> -100
215
+ '都是': 31700 -> -100
216
+ '艺术': 31835 -> -100
217
+ '。,': 37843 -> -100
218
+ '三': 54645 -> -100
219
+ '样': 54741 -> -100
220
+ '艺术': 31835 -> -100
221
+ '都是': 31700 -> -100
222
+ '靠': 55518 -> -100
223
+ '感觉': 32044 -> -100
224
+ '的': 54530 -> -100
225
+ '。': 31155 -> -100
226
+ '感觉': 32044 -> -100
227
+ '好玩': 42814 -> -100
228
+ '起来': 31841 -> -100
229
+ '就很': 40030 -> -100
230
+ '轻松': 33550 -> -100
231
+ ',': 31123 -> -100
232
+ '所以': 31672 -> -100
233
+ '叫做': 35528 -> -100
234
+ '玩': 55409 -> -100
235
+ '艺术': 31835 -> -100
236
+ '。': 31155 -> -100
237
+ '没': 54721 -> -100
238
+ '感觉': 32044 -> -100
239
+ '找不到': 37779 -> -100
240
+ '北': 54760 -> -100
241
+ '的': 54530 -> -100
242
+ '干脆': 43396 -> -100
243
+ '别': 54835 -> -100
244
+ '玩': 55409 -> -100
245
+ '了': 54537 -> -100
246
+ '!': 31404 -> -100
247
+ ',': 31123 -> -100
248
+ '香港': 31776 -> -100
249
+ '电影': 31867 -> -100
250
+ '国语': 54385 -> -100
251
+ '配音': 40392 -> -100
252
+ '名家': 40465 -> -100
253
+ '周': 54896 -> -100
254
+ '思': 54872 -> -100
255
+ '平': 54678 -> -100
256
+ ',': 31123 -> -100
257
+ '代表作': 43527 -> -100
258
+ '有': 54536 -> -100
259
+ 'TVB': 42671 -> -100
260
+ '《': 54611 -> -100
261
+ '上海': 31770 -> -100
262
+ '滩': 56928 -> -100
263
+ '》': 54612 -> -100
264
+ '周': 54896 -> -100
265
+ '润': 55826 -> -100
266
+ '发': 54559 -> -100
267
+ '等': 54609 -> -100
268
+ '香港': 37944 -> 37944
269
+ '电影': 31867 -> 31867
270
+ '国语': 54385 -> 54385
271
+ '配音': 40392 -> 40392
272
+ '名家': 40465 -> 40465
273
+ '': 2 -> 2
274
+ <<<<<<<<<<<<< Sanity Check
275
+ 01/26/2024 12:55:08 - WARNING - accelerate.utils.other - Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
276
+ [INFO|trainer.py:522] 2024-01-26 12:55:20,019 >> max_steps is given, it will override any value given in num_train_epochs
277
+ [WARNING|modeling_utils.py:2134] 2024-01-26 12:55:20,020 >> You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.
278
+ [INFO|trainer.py:1721] 2024-01-26 12:55:21,544 >> ***** Running training *****
279
+ [INFO|trainer.py:1722] 2024-01-26 12:55:21,544 >> Num examples = 2,515
280
+ [INFO|trainer.py:1723] 2024-01-26 12:55:21,544 >> Num Epochs = 2
281
+ [INFO|trainer.py:1724] 2024-01-26 12:55:21,544 >> Instantaneous batch size per device = 1
282
+ [INFO|trainer.py:1727] 2024-01-26 12:55:21,544 >> Total train batch size (w. parallel, distributed & accumulation) = 32
283
+ [INFO|trainer.py:1728] 2024-01-26 12:55:21,544 >> Gradient Accumulation steps = 32
284
+ [INFO|trainer.py:1729] 2024-01-26 12:55:21,544 >> Total optimization steps = 100
285
+ [INFO|trainer.py:1730] 2024-01-26 12:55:21,545 >> Number of trainable parameters = 1,835,008
286
+
287
  0%| | 0/100 [00:00<?, ?it/s]/home/vipuser/miniconda3/envs/GLM/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
288
+ warnings.warn(
289
+
290
  1%| | 1/100 [00:13<22:36, 13.70s/it]
291
 
292
+
293
  1%| | 1/100 [00:13<22:36, 13.70s/it]
294
  2%|▏ | 2/100 [00:25<20:33, 12.59s/it]
295
 
296
+
297
  2%|▏ | 2/100 [00:25<20:33, 12.59s/it]
298
  3%|▎ | 3/100 [00:37<19:50, 12.27s/it]
299
 
300
+
301
  3%|▎ | 3/100 [00:37<19:50, 12.27s/it]
302
  4%|▍ | 4/100 [00:49<19:25, 12.14s/it]
303
 
304
+
305
  4%|▍ | 4/100 [00:49<19:25, 12.14s/it]
306
  5%|▌ | 5/100 [01:01<19:09, 12.10s/it]
307
 
308
+
309
  5%|▌ | 5/100 [01:01<19:09, 12.10s/it]
310
  6%|▌ | 6/100 [01:13<18:55, 12.08s/it]
311
 
312
+
313
  6%|▌ | 6/100 [01:13<18:55, 12.08s/it]
314
  7%|▋ | 7/100 [01:25<18:45, 12.11s/it]
315
 
316
+
317
  7%|▋ | 7/100 [01:25<18:45, 12.11s/it]
318
  8%|▊ | 8/100 [01:37<18:35, 12.13s/it]
319
 
320
+
321
  8%|▊ | 8/100 [01:37<18:35, 12.13s/it]
322
  9%|▉ | 9/100 [01:49<18:25, 12.15s/it]
323
 
324
+
325
  9%|▉ | 9/100 [01:49<18:25, 12.15s/it]
326
  10%|█ | 10/100 [02:02<18:13, 12.15s/it]
327
 
328
+
329
  10%|█ | 10/100 [02:02<18:13, 12.15s/it]
330
  11%|█ | 11/100 [02:14<18:01, 12.16s/it]
331
 
332
+
333
  11%|█ | 11/100 [02:14<18:01, 12.16s/it]
334
  12%|█▏ | 12/100 [02:26<17:51, 12.18s/it]
335
 
336
+
337
  12%|█▏ | 12/100 [02:26<17:51, 12.18s/it]
338
  13%|█▎ | 13/100 [02:38<17:39, 12.18s/it]
339
 
340
+
341
  13%|█▎ | 13/100 [02:38<17:39, 12.18s/it]
342
  14%|█▍ | 14/100 [02:50<17:28, 12.19s/it]
343
 
344
+
345
  14%|█▍ | 14/100 [02:50<17:28, 12.19s/it]
346
  15%|█▌ | 15/100 [03:03<17:17, 12.20s/it]
347
 
348
+
349
  15%|█▌ | 15/100 [03:03<17:17, 12.20s/it]
350
  16%|█▌ | 16/100 [03:15<17:04, 12.20s/it]
351
 
352
+
353
  16%|█▌ | 16/100 [03:15<17:04, 12.20s/it]
354
  17%|█▋ | 17/100 [03:27<16:52, 12.20s/it]
355
 
356
+
357
  17%|█▋ | 17/100 [03:27<16:52, 12.20s/it]
358
  18%|█▊ | 18/100 [03:39<16:39, 12.19s/it]
359
 
360
+
361
  18%|█▊ | 18/100 [03:39<16:39, 12.19s/it]
362
  19%|█▉ | 19/100 [03:51<16:27, 12.19s/it]
363
 
364
+
365
  19%|█▉ | 19/100 [03:51<16:27, 12.19s/it]
366
  20%|██ | 20/100 [04:04<16:14, 12.18s/it]
367
 
368
+
369
  20%|██ | 20/100 [04:04<16:14, 12.18s/it]
370
  21%|██ | 21/100 [04:16<16:01, 12.17s/it]
371
 
372
+
373
  21%|██ | 21/100 [04:16<16:01, 12.17s/it]
374
  22%|██▏ | 22/100 [04:28<15:50, 12.18s/it]
375
 
376
+
377
  22%|██▏ | 22/100 [04:28<15:50, 12.18s/it]
378
  23%|██▎ | 23/100 [04:40<15:37, 12.18s/it]
379
 
380
+
381
  23%|██▎ | 23/100 [04:40<15:37, 12.18s/it]
382
  24%|██▍ | 24/100 [04:52<15:25, 12.17s/it]
383
 
384
+
385
  24%|██▍ | 24/100 [04:52<15:25, 12.17s/it]
386
  25%|██▌ | 25/100 [05:04<15:13, 12.18s/it]
387
 
388
+
389
  25%|██▌ | 25/100 [05:04<15:13, 12.18s/it]
390
  26%|██▌ | 26/100 [05:17<15:00, 12.17s/it]
391
 
392
+
393
  26%|██▌ | 26/100 [05:17<15:00, 12.17s/it]
394
  27%|██▋ | 27/100 [05:29<14:49, 12.18s/it]
395
 
396
+
397
  27%|██▋ | 27/100 [05:29<14:49, 12.18s/it]
398
  28%|██▊ | 28/100 [05:41<14:36, 12.17s/it]
399
 
400
+
401
  28%|██▊ | 28/100 [05:41<14:36, 12.17s/it]
402
  29%|██▉ | 29/100 [05:53<14:25, 12.19s/it]
403
 
404
+
405
  29%|██▉ | 29/100 [05:53<14:25, 12.19s/it]
406
  30%|███ | 30/100 [06:05<14:13, 12.19s/it]
407
 
408
+
409
  30%|███ | 30/100 [06:05<14:13, 12.19s/it]
410
  31%|███ | 31/100 [06:17<14:00, 12.18s/it]
411
 
412
+
413
  31%|███ | 31/100 [06:17<14:00, 12.18s/it]
414
  32%|███▏ | 32/100 [06:30<13:48, 12.18s/it]
415
 
416
+
417
  32%|███▏ | 32/100 [06:30<13:48, 12.18s/it]
418
  33%|███▎ | 33/100 [06:42<13:36, 12.18s/it]
419
 
420
+
421
  33%|███▎ | 33/100 [06:42<13:36, 12.18s/it]
422
  34%|███▍ | 34/100 [06:54<13:24, 12.18s/it]
423
 
424
+
425
  34%|███▍ | 34/100 [06:54<13:24, 12.18s/it]
426
  35%|███▌ | 35/100 [07:06<13:10, 12.16s/it]
427
 
428
+
429
  35%|███▌ | 35/100 [07:06<13:10, 12.16s/it]
430
  36%|███▌ | 36/100 [07:18<12:57, 12.16s/it]
431
 
432
+
433
  36%|███▌ | 36/100 [07:18<12:57, 12.16s/it]
434
  37%|███▋ | 37/100 [07:30<12:46, 12.17s/it]
435
 
436
+
437
  37%|███▋ | 37/100 [07:30<12:46, 12.17s/it]
438
  38%|███▊ | 38/100 [07:43<12:34, 12.17s/it]
439
 
440
+
441
  38%|███▊ | 38/100 [07:43<12:34, 12.17s/it]
442
  39%|███▉ | 39/100 [07:55<12:22, 12.17s/it]
443
 
444
+
445
  39%|███▉ | 39/100 [07:55<12:22, 12.17s/it]
446
  40%|████ | 40/100 [08:07<12:10, 12.17s/it]
447
 
448
+
449
  40%|████ | 40/100 [08:07<12:10, 12.17s/it]
450
  41%|████ | 41/100 [08:19<11:58, 12.18s/it]
451
 
452
+
453
  41%|████ | 41/100 [08:19<11:58, 12.18s/it]
454
  42%|████▏ | 42/100 [08:31<11:46, 12.17s/it]
455
 
456
+
457
  42%|████▏ | 42/100 [08:31<11:46, 12.17s/it]
458
  43%|████▎ | 43/100 [08:44<11:34, 12.18s/it]
459
 
460
+
461
  43%|████▎ | 43/100 [08:44<11:34, 12.18s/it]
462
  44%|████▍ | 44/100 [08:56<11:21, 12.17s/it]
463
 
464
+
465
  44%|████▍ | 44/100 [08:56<11:21, 12.17s/it]
466
  45%|████▌ | 45/100 [09:08<11:09, 12.17s/it]
467
 
468
+
469
  45%|████▌ | 45/100 [09:08<11:09, 12.17s/it]
470
  46%|████▌ | 46/100 [09:20<10:57, 12.18s/it]
471
 
472
+
473
  46%|████▌ | 46/100 [09:20<10:57, 12.18s/it]
474
  47%|████▋ | 47/100 [09:32<10:44, 12.17s/it]
475
 
476
+
477
  47%|████▋ | 47/100 [09:32<10:44, 12.17s/it]
478
  48%|████▊ | 48/100 [09:44<10:33, 12.18s/it]
479
 
480
+
481
  48%|████▊ | 48/100 [09:44<10:33, 12.18s/it]
482
  49%|████▉ | 49/100 [09:57<10:21, 12.19s/it]
483
 
484
+
485
  49%|████▉ | 49/100 [09:57<10:21, 12.19s/it]
486
  50%|█████ | 50/100 [10:09<10:09, 12.18s/it]
487
 
488
+
489
  50%|█████ | 50/100 [10:09<10:09, 12.18s/it]
490
  51%|█████ | 51/100 [10:21<09:56, 12.17s/it]
491
 
492
+
493
  51%|█████ | 51/100 [10:21<09:56, 12.17s/it]
494
  52%|█████▏ | 52/100 [10:33<09:44, 12.18s/it]
495
 
496
+
497
  52%|█████▏ | 52/100 [10:33<09:44, 12.18s/it]
498
  53%|█████▎ | 53/100 [10:45<09:32, 12.19s/it]
499
 
500
+
501
  53%|█████▎ | 53/100 [10:45<09:32, 12.19s/it]
502
  54%|█████▍ | 54/100 [10:58<09:20, 12.18s/it]
503
 
504
+
505
  54%|█████▍ | 54/100 [10:58<09:20, 12.18s/it]
506
  55%|█████▌ | 55/100 [11:10<09:08, 12.18s/it]
507
 
508
+
509
  55%|█████▌ | 55/100 [11:10<09:08, 12.18s/it]
510
  56%|█████▌ | 56/100 [11:22<08:56, 12.19s/it]
511
 
512
+
513
  56%|█████▌ | 56/100 [11:22<08:56, 12.19s/it]
514
  57%|█████▋ | 57/100 [11:34<08:43, 12.18s/it]
515
 
516
+
517
  57%|█████▋ | 57/100 [11:34<08:43, 12.18s/it]
518
  58%|█████▊ | 58/100 [11:46<08:31, 12.18s/it]
519
 
520
+
521
  58%|█████▊ | 58/100 [11:46<08:31, 12.18s/it]
522
  59%|█████▉ | 59/100 [11:58<08:19, 12.18s/it]
523
 
524
+
525
  59%|█████▉ | 59/100 [11:58<08:19, 12.18s/it]
526
  60%|██████ | 60/100 [12:11<08:07, 12.18s/it]
527
 
528
+
529
  60%|██████ | 60/100 [12:11<08:07, 12.18s/it]
530
  61%|██████ | 61/100 [12:23<07:55, 12.19s/it]
531
 
532
+
533
  61%|██████ | 61/100 [12:23<07:55, 12.19s/it]
534
  62%|██████▏ | 62/100 [12:35<07:42, 12.18s/it]
535
 
536
+
537
  62%|██████▏ | 62/100 [12:35<07:42, 12.18s/it]
538
  63%|██████▎ | 63/100 [12:47<07:30, 12.18s/it]
539
 
540
+
541
  63%|██████▎ | 63/100 [12:47<07:30, 12.18s/it]
542
  64%|██████▍ | 64/100 [12:59<07:18, 12.18s/it]
543
 
544
+
545
  64%|██████▍ | 64/100 [12:59<07:18, 12.18s/it]
546
  65%|██████▌ | 65/100 [13:12<07:06, 12.18s/it]
547
 
548
+
549
  65%|██████▌ | 65/100 [13:12<07:06, 12.18s/it]
550
  66%|██████▌ | 66/100 [13:24<06:54, 12.18s/it]
551
 
552
+
553
  66%|██████▌ | 66/100 [13:24<06:54, 12.18s/it]
554
  67%|██████▋ | 67/100 [13:36<06:42, 12.19s/it]
555
 
556
+
557
  67%|██████▋ | 67/100 [13:36<06:42, 12.19s/it]
558
  68%|██████▊ | 68/100 [13:48<06:29, 12.17s/it]
559
 
560
+
561
  68%|██████▊ | 68/100 [13:48<06:29, 12.17s/it]
562
  69%|██████▉ | 69/100 [14:00<06:17, 12.16s/it]
563
 
564
+
565
  69%|██████▉ | 69/100 [14:00<06:17, 12.16s/it]
566
  70%|███████ | 70/100 [14:12<06:04, 12.16s/it]
567
 
568
+
569
  70%|███████ | 70/100 [14:12<06:04, 12.16s/it]
570
  71%|███████ | 71/100 [14:25<05:52, 12.16s/it]
571
 
572
+
573
  71%|███████ | 71/100 [14:25<05:52, 12.16s/it]
574
  72%|███████▏ | 72/100 [14:37<05:41, 12.19s/it]
575
 
576
+
577
  72%|███████▏ | 72/100 [14:37<05:41, 12.19s/it]
578
  73%|███████▎ | 73/100 [14:49<05:28, 12.18s/it]
579
 
580
+
581
  73%|███████▎ | 73/100 [14:49<05:28, 12.18s/it]
582
  74%|███████▍ | 74/100 [15:01<05:16, 12.18s/it]
583
 
584
+
585
  74%|███████▍ | 74/100 [15:01<05:16, 12.18s/it]
586
  75%|███████▌ | 75/100 [15:13<05:04, 12.19s/it]
587
 
588
+
589
  75%|███████▌ | 75/100 [15:13<05:04, 12.19s/it]
590
  76%|███████▌ | 76/100 [15:26<04:52, 12.19s/it]
591
 
592
+
593
  76%|███████▌ | 76/100 [15:26<04:52, 12.19s/it]
594
  77%|███████▋ | 77/100 [15:38<04:40, 12.19s/it]
595
 
596
+
597
  77%|███████▋ | 77/100 [15:38<04:40, 12.19s/it]
598
  78%|███████▊ | 78/100 [15:50<04:28, 12.19s/it]
599
 
600
+
601
  78%|███████▊ | 78/100 [15:50<04:28, 12.19s/it]
602
  79%|███████▉ | 79/100 [16:02<04:16, 12.20s/it]
603
 
604
+
605
  79%|███████▉ | 79/100 [16:02<04:16, 12.20s/it]
606
  80%|████████ | 80/100 [16:14<04:03, 12.17s/it]
607
 
608
+
609
  80%|████████ | 80/100 [16:14<04:03, 12.17s/it]
610
  81%|████████ | 81/100 [16:26<03:51, 12.18s/it]
611
 
612
+
613
  81%|████████ | 81/100 [16:26<03:51, 12.18s/it]
614
  82%|████████▏ | 82/100 [16:39<03:39, 12.18s/it]
615
 
616
+
617
  82%|████████▏ | 82/100 [16:39<03:39, 12.18s/it]
618
  83%|████████▎ | 83/100 [16:51<03:26, 12.16s/it]
619
 
620
+
621
  83%|████████▎ | 83/100 [16:51<03:26, 12.16s/it]
622
  84%|████████▍ | 84/100 [17:03<03:14, 12.18s/it]
623
 
624
+
625
  84%|████████▍ | 84/100 [17:03<03:14, 12.18s/it]
626
  85%|████████▌ | 85/100 [17:15<03:02, 12.18s/it]
627
 
628
+
629
  85%|████████▌ | 85/100 [17:15<03:02, 12.18s/it]
630
  86%|████████▌ | 86/100 [17:27<02:50, 12.18s/it]
631
 
632
+
633
  86%|████████▌ | 86/100 [17:27<02:50, 12.18s/it]
634
  87%|████████▋ | 87/100 [17:39<02:38, 12.18s/it]
635
 
636
+
637
  87%|████████▋ | 87/100 [17:39<02:38, 12.18s/it]
638
  88%|████████▊ | 88/100 [17:52<02:26, 12.18s/it]
639
 
640
+
641
  88%|████████▊ | 88/100 [17:52<02:26, 12.18s/it]
642
  89%|████████▉ | 89/100 [18:04<02:13, 12.18s/it]
643
 
644
+
645
  89%|████████▉ | 89/100 [18:04<02:13, 12.18s/it]
646
  90%|█████████ | 90/100 [18:16<02:01, 12.18s/it]
647
 
648
+
649
  90%|█████████ | 90/100 [18:16<02:01, 12.18s/it]
650
  91%|█████████ | 91/100 [18:28<01:49, 12.18s/it]
651
 
652
+
653
  91%|█████████ | 91/100 [18:28<01:49, 12.18s/it]
654
  92%|█████████▏| 92/100 [18:40<01:37, 12.20s/it]
655
 
656
+
657
  92%|█████████▏| 92/100 [18:40<01:37, 12.20s/it]
658
  93%|█████████▎| 93/100 [18:53<01:25, 12.21s/it]
659
 
660
+
661
  93%|█████████▎| 93/100 [18:53<01:25, 12.21s/it]
662
  94%|█████████▍| 94/100 [19:05<01:13, 12.19s/it]
663
 
664
+
665
  94%|█████████▍| 94/100 [19:05<01:13, 12.19s/it]
666
  95%|█████████▌| 95/100 [19:17<01:00, 12.18s/it]
667
 
668
+
669
  95%|█████████▌| 95/100 [19:17<01:00, 12.18s/it]
670
  96%|█████████▌| 96/100 [19:29<00:48, 12.18s/it]
671
 
672
+
673
  96%|█████████▌| 96/100 [19:29<00:48, 12.18s/it]
674
  97%|█████████▋| 97/100 [19:41<00:36, 12.19s/it]
675
 
676
+
677
  97%|█████████▋| 97/100 [19:41<00:36, 12.19s/it]
678
  98%|█████████▊| 98/100 [19:54<00:24, 12.19s/it]
679
 
680
+
681
  98%|█████████▊| 98/100 [19:54<00:24, 12.19s/it]
682
  99%|█████████▉| 99/100 [20:06<00:12, 12.18s/it]
683
 
684
+
685
  99%|█████████▉| 99/100 [20:06<00:12, 12.18s/it]
686
 
687
+
688
+
689
+ Training completed. Do not forget to share your model on huggingface.co/models =)
690
+
691
+
692
+
693
 
694
+
695
+ Saving PrefixEncoder
696
+ [INFO|configuration_utils.py:473] 2024-01-26 13:15:40,038 >> Configuration saved in output/privacy_detection_pt-20240126-125436-128-2e-2/config.json
697
+ [INFO|configuration_utils.py:594] 2024-01-26 13:15:40,039 >> Configuration saved in output/privacy_detection_pt-20240126-125436-128-2e-2/generation_config.json
698
+ [INFO|modeling_utils.py:2495] 2024-01-26 13:15:40,068 >> Model weights saved in output/privacy_detection_pt-20240126-125436-128-2e-2/pytorch_model.bin
699
+ [INFO|tokenization_utils_base.py:2433] 2024-01-26 13:15:40,069 >> tokenizer config file saved in output/privacy_detection_pt-20240126-125436-128-2e-2/tokenizer_config.json
700
+ [INFO|tokenization_utils_base.py:2442] 2024-01-26 13:15:40,069 >> Special tokens file saved in output/privacy_detection_pt-20240126-125436-128-2e-2/special_tokens_map.json
trainer_state.json ADDED
@@ -0,0 +1,630 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.2723658051689861,
5
+ "eval_steps": 500,
6
+ "global_step": 100,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.01,
13
+ "learning_rate": 0.0198,
14
+ "loss": 0.8181,
15
+ "step": 1
16
+ },
17
+ {
18
+ "epoch": 0.03,
19
+ "learning_rate": 0.0196,
20
+ "loss": 0.787,
21
+ "step": 2
22
+ },
23
+ {
24
+ "epoch": 0.04,
25
+ "learning_rate": 0.0194,
26
+ "loss": 1.0047,
27
+ "step": 3
28
+ },
29
+ {
30
+ "epoch": 0.05,
31
+ "learning_rate": 0.0192,
32
+ "loss": 0.8688,
33
+ "step": 4
34
+ },
35
+ {
36
+ "epoch": 0.06,
37
+ "learning_rate": 0.019,
38
+ "loss": 0.7173,
39
+ "step": 5
40
+ },
41
+ {
42
+ "epoch": 0.08,
43
+ "learning_rate": 0.0188,
44
+ "loss": 0.5175,
45
+ "step": 6
46
+ },
47
+ {
48
+ "epoch": 0.09,
49
+ "learning_rate": 0.018600000000000002,
50
+ "loss": 0.7559,
51
+ "step": 7
52
+ },
53
+ {
54
+ "epoch": 0.1,
55
+ "learning_rate": 0.0184,
56
+ "loss": 0.9278,
57
+ "step": 8
58
+ },
59
+ {
60
+ "epoch": 0.11,
61
+ "learning_rate": 0.0182,
62
+ "loss": 0.6011,
63
+ "step": 9
64
+ },
65
+ {
66
+ "epoch": 0.13,
67
+ "learning_rate": 0.018000000000000002,
68
+ "loss": 0.8014,
69
+ "step": 10
70
+ },
71
+ {
72
+ "epoch": 0.14,
73
+ "learning_rate": 0.0178,
74
+ "loss": 1.2581,
75
+ "step": 11
76
+ },
77
+ {
78
+ "epoch": 0.15,
79
+ "learning_rate": 0.0176,
80
+ "loss": 0.9886,
81
+ "step": 12
82
+ },
83
+ {
84
+ "epoch": 0.17,
85
+ "learning_rate": 0.0174,
86
+ "loss": 0.7866,
87
+ "step": 13
88
+ },
89
+ {
90
+ "epoch": 0.18,
91
+ "learning_rate": 0.0172,
92
+ "loss": 0.936,
93
+ "step": 14
94
+ },
95
+ {
96
+ "epoch": 0.19,
97
+ "learning_rate": 0.017,
98
+ "loss": 1.0503,
99
+ "step": 15
100
+ },
101
+ {
102
+ "epoch": 0.2,
103
+ "learning_rate": 0.0168,
104
+ "loss": 0.5689,
105
+ "step": 16
106
+ },
107
+ {
108
+ "epoch": 0.22,
109
+ "learning_rate": 0.0166,
110
+ "loss": 0.8576,
111
+ "step": 17
112
+ },
113
+ {
114
+ "epoch": 0.23,
115
+ "learning_rate": 0.016399999999999998,
116
+ "loss": 1.0946,
117
+ "step": 18
118
+ },
119
+ {
120
+ "epoch": 0.24,
121
+ "learning_rate": 0.016200000000000003,
122
+ "loss": 0.9075,
123
+ "step": 19
124
+ },
125
+ {
126
+ "epoch": 0.25,
127
+ "learning_rate": 0.016,
128
+ "loss": 1.1441,
129
+ "step": 20
130
+ },
131
+ {
132
+ "epoch": 0.27,
133
+ "learning_rate": 0.0158,
134
+ "loss": 0.7794,
135
+ "step": 21
136
+ },
137
+ {
138
+ "epoch": 0.28,
139
+ "learning_rate": 0.015600000000000001,
140
+ "loss": 0.9574,
141
+ "step": 22
142
+ },
143
+ {
144
+ "epoch": 0.29,
145
+ "learning_rate": 0.0154,
146
+ "loss": 0.8937,
147
+ "step": 23
148
+ },
149
+ {
150
+ "epoch": 0.31,
151
+ "learning_rate": 0.0152,
152
+ "loss": 0.709,
153
+ "step": 24
154
+ },
155
+ {
156
+ "epoch": 0.32,
157
+ "learning_rate": 0.015,
158
+ "loss": 0.8731,
159
+ "step": 25
160
+ },
161
+ {
162
+ "epoch": 0.33,
163
+ "learning_rate": 0.0148,
164
+ "loss": 0.719,
165
+ "step": 26
166
+ },
167
+ {
168
+ "epoch": 0.34,
169
+ "learning_rate": 0.0146,
170
+ "loss": 0.7419,
171
+ "step": 27
172
+ },
173
+ {
174
+ "epoch": 0.36,
175
+ "learning_rate": 0.0144,
176
+ "loss": 0.9224,
177
+ "step": 28
178
+ },
179
+ {
180
+ "epoch": 0.37,
181
+ "learning_rate": 0.014199999999999999,
182
+ "loss": 1.0802,
183
+ "step": 29
184
+ },
185
+ {
186
+ "epoch": 0.38,
187
+ "learning_rate": 0.013999999999999999,
188
+ "loss": 0.8187,
189
+ "step": 30
190
+ },
191
+ {
192
+ "epoch": 0.39,
193
+ "learning_rate": 0.0138,
194
+ "loss": 0.615,
195
+ "step": 31
196
+ },
197
+ {
198
+ "epoch": 0.41,
199
+ "learning_rate": 0.013600000000000001,
200
+ "loss": 0.5214,
201
+ "step": 32
202
+ },
203
+ {
204
+ "epoch": 0.42,
205
+ "learning_rate": 0.0134,
206
+ "loss": 0.649,
207
+ "step": 33
208
+ },
209
+ {
210
+ "epoch": 0.43,
211
+ "learning_rate": 0.013200000000000002,
212
+ "loss": 0.6523,
213
+ "step": 34
214
+ },
215
+ {
216
+ "epoch": 0.45,
217
+ "learning_rate": 0.013000000000000001,
218
+ "loss": 0.7002,
219
+ "step": 35
220
+ },
221
+ {
222
+ "epoch": 0.46,
223
+ "learning_rate": 0.0128,
224
+ "loss": 0.6161,
225
+ "step": 36
226
+ },
227
+ {
228
+ "epoch": 0.47,
229
+ "learning_rate": 0.0126,
230
+ "loss": 1.0374,
231
+ "step": 37
232
+ },
233
+ {
234
+ "epoch": 0.48,
235
+ "learning_rate": 0.0124,
236
+ "loss": 1.0328,
237
+ "step": 38
238
+ },
239
+ {
240
+ "epoch": 0.5,
241
+ "learning_rate": 0.0122,
242
+ "loss": 0.7637,
243
+ "step": 39
244
+ },
245
+ {
246
+ "epoch": 0.51,
247
+ "learning_rate": 0.012,
248
+ "loss": 0.6332,
249
+ "step": 40
250
+ },
251
+ {
252
+ "epoch": 0.52,
253
+ "learning_rate": 0.0118,
254
+ "loss": 0.74,
255
+ "step": 41
256
+ },
257
+ {
258
+ "epoch": 0.53,
259
+ "learning_rate": 0.0116,
260
+ "loss": 0.7284,
261
+ "step": 42
262
+ },
263
+ {
264
+ "epoch": 0.55,
265
+ "learning_rate": 0.011399999999999999,
266
+ "loss": 0.9198,
267
+ "step": 43
268
+ },
269
+ {
270
+ "epoch": 0.56,
271
+ "learning_rate": 0.011200000000000002,
272
+ "loss": 0.626,
273
+ "step": 44
274
+ },
275
+ {
276
+ "epoch": 0.57,
277
+ "learning_rate": 0.011000000000000001,
278
+ "loss": 0.628,
279
+ "step": 45
280
+ },
281
+ {
282
+ "epoch": 0.59,
283
+ "learning_rate": 0.0108,
284
+ "loss": 0.5322,
285
+ "step": 46
286
+ },
287
+ {
288
+ "epoch": 0.6,
289
+ "learning_rate": 0.0106,
290
+ "loss": 0.7844,
291
+ "step": 47
292
+ },
293
+ {
294
+ "epoch": 0.61,
295
+ "learning_rate": 0.010400000000000001,
296
+ "loss": 0.5957,
297
+ "step": 48
298
+ },
299
+ {
300
+ "epoch": 0.62,
301
+ "learning_rate": 0.0102,
302
+ "loss": 0.6681,
303
+ "step": 49
304
+ },
305
+ {
306
+ "epoch": 0.64,
307
+ "learning_rate": 0.01,
308
+ "loss": 0.8281,
309
+ "step": 50
310
+ },
311
+ {
312
+ "epoch": 0.65,
313
+ "learning_rate": 0.0098,
314
+ "loss": 0.5284,
315
+ "step": 51
316
+ },
317
+ {
318
+ "epoch": 0.66,
319
+ "learning_rate": 0.0096,
320
+ "loss": 0.8251,
321
+ "step": 52
322
+ },
323
+ {
324
+ "epoch": 0.67,
325
+ "learning_rate": 0.0094,
326
+ "loss": 0.9845,
327
+ "step": 53
328
+ },
329
+ {
330
+ "epoch": 0.69,
331
+ "learning_rate": 0.0092,
332
+ "loss": 0.9525,
333
+ "step": 54
334
+ },
335
+ {
336
+ "epoch": 0.7,
337
+ "learning_rate": 0.009000000000000001,
338
+ "loss": 0.9454,
339
+ "step": 55
340
+ },
341
+ {
342
+ "epoch": 0.71,
343
+ "learning_rate": 0.0088,
344
+ "loss": 0.4058,
345
+ "step": 56
346
+ },
347
+ {
348
+ "epoch": 0.73,
349
+ "learning_rate": 0.0086,
350
+ "loss": 0.5435,
351
+ "step": 57
352
+ },
353
+ {
354
+ "epoch": 0.74,
355
+ "learning_rate": 0.0084,
356
+ "loss": 0.6892,
357
+ "step": 58
358
+ },
359
+ {
360
+ "epoch": 0.75,
361
+ "learning_rate": 0.008199999999999999,
362
+ "loss": 0.6426,
363
+ "step": 59
364
+ },
365
+ {
366
+ "epoch": 0.76,
367
+ "learning_rate": 0.008,
368
+ "loss": 0.9414,
369
+ "step": 60
370
+ },
371
+ {
372
+ "epoch": 0.78,
373
+ "learning_rate": 0.0078000000000000005,
374
+ "loss": 0.7945,
375
+ "step": 61
376
+ },
377
+ {
378
+ "epoch": 0.79,
379
+ "learning_rate": 0.0076,
380
+ "loss": 0.6295,
381
+ "step": 62
382
+ },
383
+ {
384
+ "epoch": 0.8,
385
+ "learning_rate": 0.0074,
386
+ "loss": 0.7888,
387
+ "step": 63
388
+ },
389
+ {
390
+ "epoch": 0.81,
391
+ "learning_rate": 0.0072,
392
+ "loss": 0.5454,
393
+ "step": 64
394
+ },
395
+ {
396
+ "epoch": 0.83,
397
+ "learning_rate": 0.006999999999999999,
398
+ "loss": 0.711,
399
+ "step": 65
400
+ },
401
+ {
402
+ "epoch": 0.84,
403
+ "learning_rate": 0.0068000000000000005,
404
+ "loss": 0.713,
405
+ "step": 66
406
+ },
407
+ {
408
+ "epoch": 0.85,
409
+ "learning_rate": 0.006600000000000001,
410
+ "loss": 0.6058,
411
+ "step": 67
412
+ },
413
+ {
414
+ "epoch": 0.87,
415
+ "learning_rate": 0.0064,
416
+ "loss": 0.8203,
417
+ "step": 68
418
+ },
419
+ {
420
+ "epoch": 0.88,
421
+ "learning_rate": 0.0062,
422
+ "loss": 0.8275,
423
+ "step": 69
424
+ },
425
+ {
426
+ "epoch": 0.89,
427
+ "learning_rate": 0.006,
428
+ "loss": 0.4923,
429
+ "step": 70
430
+ },
431
+ {
432
+ "epoch": 0.9,
433
+ "learning_rate": 0.0058,
434
+ "loss": 0.5219,
435
+ "step": 71
436
+ },
437
+ {
438
+ "epoch": 0.92,
439
+ "learning_rate": 0.005600000000000001,
440
+ "loss": 0.9954,
441
+ "step": 72
442
+ },
443
+ {
444
+ "epoch": 0.93,
445
+ "learning_rate": 0.0054,
446
+ "loss": 0.6206,
447
+ "step": 73
448
+ },
449
+ {
450
+ "epoch": 0.94,
451
+ "learning_rate": 0.005200000000000001,
452
+ "loss": 0.6064,
453
+ "step": 74
454
+ },
455
+ {
456
+ "epoch": 0.95,
457
+ "learning_rate": 0.005,
458
+ "loss": 0.6584,
459
+ "step": 75
460
+ },
461
+ {
462
+ "epoch": 0.97,
463
+ "learning_rate": 0.0048,
464
+ "loss": 0.8461,
465
+ "step": 76
466
+ },
467
+ {
468
+ "epoch": 0.98,
469
+ "learning_rate": 0.0046,
470
+ "loss": 0.9615,
471
+ "step": 77
472
+ },
473
+ {
474
+ "epoch": 0.99,
475
+ "learning_rate": 0.0044,
476
+ "loss": 0.6508,
477
+ "step": 78
478
+ },
479
+ {
480
+ "epoch": 1.01,
481
+ "learning_rate": 0.0042,
482
+ "loss": 1.0089,
483
+ "step": 79
484
+ },
485
+ {
486
+ "epoch": 1.02,
487
+ "learning_rate": 0.004,
488
+ "loss": 0.7515,
489
+ "step": 80
490
+ },
491
+ {
492
+ "epoch": 1.03,
493
+ "learning_rate": 0.0038,
494
+ "loss": 0.4172,
495
+ "step": 81
496
+ },
497
+ {
498
+ "epoch": 1.04,
499
+ "learning_rate": 0.0036,
500
+ "loss": 0.7634,
501
+ "step": 82
502
+ },
503
+ {
504
+ "epoch": 1.06,
505
+ "learning_rate": 0.0034000000000000002,
506
+ "loss": 0.585,
507
+ "step": 83
508
+ },
509
+ {
510
+ "epoch": 1.07,
511
+ "learning_rate": 0.0032,
512
+ "loss": 0.7668,
513
+ "step": 84
514
+ },
515
+ {
516
+ "epoch": 1.08,
517
+ "learning_rate": 0.003,
518
+ "loss": 0.5403,
519
+ "step": 85
520
+ },
521
+ {
522
+ "epoch": 1.09,
523
+ "learning_rate": 0.0028000000000000004,
524
+ "loss": 0.5995,
525
+ "step": 86
526
+ },
527
+ {
528
+ "epoch": 1.11,
529
+ "learning_rate": 0.0026000000000000003,
530
+ "loss": 0.4515,
531
+ "step": 87
532
+ },
533
+ {
534
+ "epoch": 1.12,
535
+ "learning_rate": 0.0024,
536
+ "loss": 0.6288,
537
+ "step": 88
538
+ },
539
+ {
540
+ "epoch": 1.13,
541
+ "learning_rate": 0.0022,
542
+ "loss": 0.7387,
543
+ "step": 89
544
+ },
545
+ {
546
+ "epoch": 1.15,
547
+ "learning_rate": 0.002,
548
+ "loss": 0.6517,
549
+ "step": 90
550
+ },
551
+ {
552
+ "epoch": 1.16,
553
+ "learning_rate": 0.0018,
554
+ "loss": 0.5389,
555
+ "step": 91
556
+ },
557
+ {
558
+ "epoch": 1.17,
559
+ "learning_rate": 0.0016,
560
+ "loss": 0.4433,
561
+ "step": 92
562
+ },
563
+ {
564
+ "epoch": 1.18,
565
+ "learning_rate": 0.0014000000000000002,
566
+ "loss": 0.6643,
567
+ "step": 93
568
+ },
569
+ {
570
+ "epoch": 1.2,
571
+ "learning_rate": 0.0012,
572
+ "loss": 0.5825,
573
+ "step": 94
574
+ },
575
+ {
576
+ "epoch": 1.21,
577
+ "learning_rate": 0.001,
578
+ "loss": 0.7709,
579
+ "step": 95
580
+ },
581
+ {
582
+ "epoch": 1.22,
583
+ "learning_rate": 0.0008,
584
+ "loss": 0.562,
585
+ "step": 96
586
+ },
587
+ {
588
+ "epoch": 1.23,
589
+ "learning_rate": 0.0006,
590
+ "loss": 0.5581,
591
+ "step": 97
592
+ },
593
+ {
594
+ "epoch": 1.25,
595
+ "learning_rate": 0.0004,
596
+ "loss": 0.4679,
597
+ "step": 98
598
+ },
599
+ {
600
+ "epoch": 1.26,
601
+ "learning_rate": 0.0002,
602
+ "loss": 0.5063,
603
+ "step": 99
604
+ },
605
+ {
606
+ "epoch": 1.27,
607
+ "learning_rate": 0.0,
608
+ "loss": 0.5527,
609
+ "step": 100
610
+ },
611
+ {
612
+ "epoch": 1.27,
613
+ "step": 100,
614
+ "total_flos": 1.323218757484544e+17,
615
+ "train_loss": 0.7395605874061585,
616
+ "train_runtime": 1218.4689,
617
+ "train_samples_per_second": 2.626,
618
+ "train_steps_per_second": 0.082
619
+ }
620
+ ],
621
+ "logging_steps": 1.0,
622
+ "max_steps": 100,
623
+ "num_input_tokens_seen": 0,
624
+ "num_train_epochs": 2,
625
+ "save_steps": 500,
626
+ "total_flos": 1.323218757484544e+17,
627
+ "train_batch_size": 1,
628
+ "trial_name": null,
629
+ "trial_params": null
630
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af0586708ab25020ad621ec87f7cd1129b0243acf75571d59d2e719620fb9ef3
3
+ size 4920