2023-08-17 13:20:08,970 ----------------------------------------------------------------------------------------------------
2023-08-17 13:20:08,977 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): XLMRobertaModel(
      (embeddings): XLMRobertaEmbeddings(
        (word_embeddings): Embedding(250003, 768)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): XLMRobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x XLMRobertaLayer(
            (attention): XLMRobertaAttention(
              (self): XLMRobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): XLMRobertaSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
            (intermediate): XLMRobertaIntermediate(
              (dense): Linear(in_features=768, out_features=3072, bias=True)
              (intermediate_act_fn): GELUActivation()
            )
            (output): XLMRobertaOutput(
              (dense): Linear(in_features=3072, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
        )
      )
      (pooler): XLMRobertaPooler(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (activation): Tanh()
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (linear): Linear(in_features=768, out_features=158, bias=True)
  (loss_function): ViterbiLoss()
  (crf): CRF()
)"
2023-08-17 13:20:08,995 ----------------------------------------------------------------------------------------------------
2023-08-17 13:20:08,996 Corpus: "Corpus: 7767 train + 409 dev + 0 test sentences"
2023-08-17 13:20:08,997 ----------------------------------------------------------------------------------------------------
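For reference, a setup that yields the model and corpus printed above can be written in Flair roughly as follows. This is a minimal sketch, not the actual training script: the data folder, file names, column layout, and the label type 'pos' are all assumptions; only the fine-tuned xlm-roberta-base-sized encoder, the 768 -> 158 linear projection, and the CRF/ViterbiLoss head (i.e. use_rnn=False, use_crf=True) are read off the log.

    from flair.datasets import ColumnCorpus
    from flair.embeddings import TransformerWordEmbeddings
    from flair.models import SequenceTagger

    # Hypothetical data layout; the real PPCHY files are not shown in the log.
    corpus = ColumnCorpus(
        "data/ppchy-11-pos/split_final",
        column_format={0: "text", 1: "pos"},
        train_file="train.txt",
        dev_file="dev.txt",  # no test file, matching "0 test sentences"
    )

    embeddings = TransformerWordEmbeddings("xlm-roberta-base", fine_tune=True)

    tagger = SequenceTagger(
        hidden_size=256,  # ignored when use_rnn=False
        embeddings=embeddings,
        tag_dictionary=corpus.make_label_dictionary(label_type="pos"),
        tag_type="pos",
        use_rnn=False,  # no LSTM appears in the printed model
        use_crf=True,   # matches the CRF and ViterbiLoss modules
    )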
2023-08-17 13:20:08,997 Parameters:
2023-08-17 13:20:08,997 - learning_rate: "0.000050"
2023-08-17 13:20:08,998 - mini_batch_size: "32"
2023-08-17 13:20:08,998 - patience: "3"
2023-08-17 13:20:08,998 - anneal_factor: "0.5"
2023-08-17 13:20:08,999 - max_epochs: "2"
2023-08-17 13:20:08,999 - shuffle: "True"
2023-08-17 13:20:09,000 - train_with_dev: "False"
2023-08-17 13:20:09,000 - batch_growth_annealing: "False"
2023-08-17 13:20:09,000 ----------------------------------------------------------------------------------------------------
2023-08-17 13:20:09,001 Model training base path: "/scratch/skulick/ppchy-11-pos/xlmb-ck05-yid1/split_final/train"
2023-08-17 13:20:09,001 ----------------------------------------------------------------------------------------------------
2023-08-17 13:20:09,001 Device: cuda:0
2023-08-17 13:20:09,002 ----------------------------------------------------------------------------------------------------
2023-08-17 13:20:09,002 Embeddings storage mode: none
2023-08-17 13:20:09,002 ----------------------------------------------------------------------------------------------------
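The parameter block above matches a transformer fine-tuning run. A hedged sketch of the trainer call, assuming Flair's ModelTrainer.fine_tune, whose AdamW optimizer and linear warmup/decay schedule agree with the lr column in the iteration lines below; patience and anneal_factor are printed by Flair but play no role in this fixed two-epoch schedule:

    from flair.trainers import ModelTrainer

    trainer = ModelTrainer(tagger, corpus)

    # Hypothetical call reproducing the logged parameters.
    trainer.fine_tune(
        "/scratch/skulick/ppchy-11-pos/xlmb-ck05-yid1/split_final/train",
        learning_rate=5e-5,
        mini_batch_size=32,
        max_epochs=2,
    )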
2023-08-17 13:21:05,834 epoch 1 - iter 24/243 - loss 5.52841502 - time (sec): 56.83 - samples/sec: 131.44 - lr: 0.000025
2023-08-17 13:22:03,318 epoch 1 - iter 48/243 - loss 4.70686211 - time (sec): 114.32 - samples/sec: 130.45 - lr: 0.000050
2023-08-17 13:23:00,549 epoch 1 - iter 72/243 - loss 3.86110162 - time (sec): 171.55 - samples/sec: 131.94 - lr: 0.000047
2023-08-17 13:23:57,695 epoch 1 - iter 96/243 - loss 3.22106003 - time (sec): 228.69 - samples/sec: 132.37 - lr: 0.000045
2023-08-17 13:24:55,039 epoch 1 - iter 120/243 - loss 2.77518007 - time (sec): 286.04 - samples/sec: 132.92 - lr: 0.000042
2023-08-17 13:25:52,345 epoch 1 - iter 144/243 - loss 2.46009763 - time (sec): 343.34 - samples/sec: 133.06 - lr: 0.000039
2023-08-17 13:26:49,831 epoch 1 - iter 168/243 - loss 2.21288400 - time (sec): 400.83 - samples/sec: 134.04 - lr: 0.000036
2023-08-17 13:27:47,964 epoch 1 - iter 192/243 - loss 2.01670410 - time (sec): 458.96 - samples/sec: 134.63 - lr: 0.000034
2023-08-17 13:28:45,494 epoch 1 - iter 216/243 - loss 1.86783335 - time (sec): 516.49 - samples/sec: 134.47 - lr: 0.000031
2023-08-17 13:29:43,119 epoch 1 - iter 240/243 - loss 1.74523925 - time (sec): 574.12 - samples/sec: 135.25 - lr: 0.000028
2023-08-17 13:29:50,011 ----------------------------------------------------------------------------------------------------
2023-08-17 13:29:50,011 EPOCH 1 done: loss 1.7334 - lr 0.000028
2023-08-17 13:29:52,277 Evaluating as a multi-label problem: False
2023-08-17 13:29:52,376 DEV : loss 0.3509514629840851 - f1-score (micro avg) 0.9331
2023-08-17 13:29:52,410 saving best model
2023-08-17 13:29:54,774 ----------------------------------------------------------------------------------------------------
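The lr column traces linear warmup followed by linear decay: it climbs to the 5e-5 peak over roughly the first 10% of the 486 total steps (2 epochs x 243 iterations each) and then falls linearly to 0. A small sketch of that schedule, where the 0.1 warmup fraction is an assumption inferred from the logged values:

    # Approximates the lr values printed in the iteration lines.
    peak_lr = 5e-5
    total_steps = 2 * 243
    warmup_steps = 0.1 * total_steps  # assumed warmup fraction

    def lr_at(step: int) -> float:
        if step < warmup_steps:
            return peak_lr * step / warmup_steps  # linear warmup
        # linear decay from the peak down to 0 at the final step
        return peak_lr * (total_steps - step) / (total_steps - warmup_steps)

    # e.g. lr_at(24) ~ 2.5e-5 and lr_at(48) ~ 4.9e-5,
    # matching iters 24 and 48 of epoch 1 above.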
2023-08-17 13:30:44,972 epoch 2 - iter 24/243 - loss 0.58877620 - time (sec): 50.20 - samples/sec: 152.66 - lr: 0.000025
2023-08-17 13:31:36,455 epoch 2 - iter 48/243 - loss 0.60804646 - time (sec): 101.68 - samples/sec: 152.75 - lr: 0.000022
2023-08-17 13:32:27,132 epoch 2 - iter 72/243 - loss 0.60136722 - time (sec): 152.36 - samples/sec: 153.64 - lr: 0.000020
2023-08-17 13:33:17,902 epoch 2 - iter 96/243 - loss 0.59255541 - time (sec): 203.13 - samples/sec: 154.55 - lr: 0.000017
2023-08-17 13:34:08,949 epoch 2 - iter 120/243 - loss 0.58957421 - time (sec): 254.17 - samples/sec: 154.79 - lr: 0.000014
2023-08-17 13:35:00,256 epoch 2 - iter 144/243 - loss 0.58878210 - time (sec): 305.48 - samples/sec: 154.48 - lr: 0.000011
2023-08-17 13:35:51,214 epoch 2 - iter 168/243 - loss 0.58168957 - time (sec): 356.44 - samples/sec: 153.84 - lr: 0.000009
2023-08-17 13:36:42,167 epoch 2 - iter 192/243 - loss 0.57403444 - time (sec): 407.39 - samples/sec: 153.55 - lr: 0.000006
2023-08-17 13:37:32,761 epoch 2 - iter 216/243 - loss 0.57331317 - time (sec): 457.99 - samples/sec: 152.68 - lr: 0.000003
2023-08-17 13:38:23,745 epoch 2 - iter 240/243 - loss 0.56849021 - time (sec): 508.97 - samples/sec: 152.71 - lr: 0.000000
2023-08-17 13:38:29,500 ----------------------------------------------------------------------------------------------------
2023-08-17 13:38:29,500 EPOCH 2 done: loss 0.5679 - lr 0.000000
2023-08-17 13:38:31,769 Evaluating as a multi-label problem: False
2023-08-17 13:38:31,868 DEV : loss 0.23018118739128113 - f1-score (micro avg) 0.9562
2023-08-17 13:38:31,902 saving best model
2023-08-17 13:38:37,560 Test data not provided, setting final score to 0
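Since no test split was provided, the run ends after saving the model with the best dev score (micro-F1 0.9562). That checkpoint can then be loaded and applied in the usual Flair way; a minimal sketch, where the checkpoint path is assumed to sit under the training base path shown above:

    from flair.data import Sentence
    from flair.models import SequenceTagger

    tagger = SequenceTagger.load(
        "/scratch/skulick/ppchy-11-pos/xlmb-ck05-yid1/split_final/train/best-model.pt"
    )

    sentence = Sentence("a short example sentence")
    tagger.predict(sentence)  # adds 'pos' labels in place
    print(sentence.to_tagged_string())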