|
2023-08-17 13:20:08,970 ---------------------------------------------------------------------------------------------------- |
|
2023-08-17 13:20:08,977 Model: "SequenceTagger( |
|
(embeddings): TransformerWordEmbeddings( |
|
(model): XLMRobertaModel( |
|
(embeddings): XLMRobertaEmbeddings( |
|
(word_embeddings): Embedding(250003, 768) |
|
(position_embeddings): Embedding(514, 768, padding_idx=1) |
|
(token_type_embeddings): Embedding(1, 768) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(encoder): XLMRobertaEncoder( |
|
(layer): ModuleList( |
|
(0): XLMRobertaLayer( |
|
(attention): XLMRobertaAttention( |
|
(self): XLMRobertaSelfAttention( |
|
(query): Linear(in_features=768, out_features=768, bias=True) |
|
(key): Linear(in_features=768, out_features=768, bias=True) |
|
(value): Linear(in_features=768, out_features=768, bias=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(output): XLMRobertaSelfOutput( |
|
(dense): Linear(in_features=768, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(intermediate): XLMRobertaIntermediate( |
|
(dense): Linear(in_features=768, out_features=3072, bias=True) |
|
(intermediate_act_fn): GELUActivation() |
|
) |
|
(output): XLMRobertaOutput( |
|
(dense): Linear(in_features=3072, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(1): XLMRobertaLayer( |
|
(attention): XLMRobertaAttention( |
|
(self): XLMRobertaSelfAttention( |
|
(query): Linear(in_features=768, out_features=768, bias=True) |
|
(key): Linear(in_features=768, out_features=768, bias=True) |
|
(value): Linear(in_features=768, out_features=768, bias=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(output): XLMRobertaSelfOutput( |
|
(dense): Linear(in_features=768, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(intermediate): XLMRobertaIntermediate( |
|
(dense): Linear(in_features=768, out_features=3072, bias=True) |
|
(intermediate_act_fn): GELUActivation() |
|
) |
|
(output): XLMRobertaOutput( |
|
(dense): Linear(in_features=3072, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(2): XLMRobertaLayer( |
|
(attention): XLMRobertaAttention( |
|
(self): XLMRobertaSelfAttention( |
|
(query): Linear(in_features=768, out_features=768, bias=True) |
|
(key): Linear(in_features=768, out_features=768, bias=True) |
|
(value): Linear(in_features=768, out_features=768, bias=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(output): XLMRobertaSelfOutput( |
|
(dense): Linear(in_features=768, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(intermediate): XLMRobertaIntermediate( |
|
(dense): Linear(in_features=768, out_features=3072, bias=True) |
|
(intermediate_act_fn): GELUActivation() |
|
) |
|
(output): XLMRobertaOutput( |
|
(dense): Linear(in_features=3072, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(3): XLMRobertaLayer( |
|
(attention): XLMRobertaAttention( |
|
(self): XLMRobertaSelfAttention( |
|
(query): Linear(in_features=768, out_features=768, bias=True) |
|
(key): Linear(in_features=768, out_features=768, bias=True) |
|
(value): Linear(in_features=768, out_features=768, bias=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(output): XLMRobertaSelfOutput( |
|
(dense): Linear(in_features=768, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(intermediate): XLMRobertaIntermediate( |
|
(dense): Linear(in_features=768, out_features=3072, bias=True) |
|
(intermediate_act_fn): GELUActivation() |
|
) |
|
(output): XLMRobertaOutput( |
|
(dense): Linear(in_features=3072, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(4): XLMRobertaLayer( |
|
(attention): XLMRobertaAttention( |
|
(self): XLMRobertaSelfAttention( |
|
(query): Linear(in_features=768, out_features=768, bias=True) |
|
(key): Linear(in_features=768, out_features=768, bias=True) |
|
(value): Linear(in_features=768, out_features=768, bias=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(output): XLMRobertaSelfOutput( |
|
(dense): Linear(in_features=768, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(intermediate): XLMRobertaIntermediate( |
|
(dense): Linear(in_features=768, out_features=3072, bias=True) |
|
(intermediate_act_fn): GELUActivation() |
|
) |
|
(output): XLMRobertaOutput( |
|
(dense): Linear(in_features=3072, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(5): XLMRobertaLayer( |
|
(attention): XLMRobertaAttention( |
|
(self): XLMRobertaSelfAttention( |
|
(query): Linear(in_features=768, out_features=768, bias=True) |
|
(key): Linear(in_features=768, out_features=768, bias=True) |
|
(value): Linear(in_features=768, out_features=768, bias=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(output): XLMRobertaSelfOutput( |
|
(dense): Linear(in_features=768, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(intermediate): XLMRobertaIntermediate( |
|
(dense): Linear(in_features=768, out_features=3072, bias=True) |
|
(intermediate_act_fn): GELUActivation() |
|
) |
|
(output): XLMRobertaOutput( |
|
(dense): Linear(in_features=3072, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(6): XLMRobertaLayer( |
|
(attention): XLMRobertaAttention( |
|
(self): XLMRobertaSelfAttention( |
|
(query): Linear(in_features=768, out_features=768, bias=True) |
|
(key): Linear(in_features=768, out_features=768, bias=True) |
|
(value): Linear(in_features=768, out_features=768, bias=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(output): XLMRobertaSelfOutput( |
|
(dense): Linear(in_features=768, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(intermediate): XLMRobertaIntermediate( |
|
(dense): Linear(in_features=768, out_features=3072, bias=True) |
|
(intermediate_act_fn): GELUActivation() |
|
) |
|
(output): XLMRobertaOutput( |
|
(dense): Linear(in_features=3072, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(7): XLMRobertaLayer( |
|
(attention): XLMRobertaAttention( |
|
(self): XLMRobertaSelfAttention( |
|
(query): Linear(in_features=768, out_features=768, bias=True) |
|
(key): Linear(in_features=768, out_features=768, bias=True) |
|
(value): Linear(in_features=768, out_features=768, bias=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(output): XLMRobertaSelfOutput( |
|
(dense): Linear(in_features=768, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(intermediate): XLMRobertaIntermediate( |
|
(dense): Linear(in_features=768, out_features=3072, bias=True) |
|
(intermediate_act_fn): GELUActivation() |
|
) |
|
(output): XLMRobertaOutput( |
|
(dense): Linear(in_features=3072, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(8): XLMRobertaLayer( |
|
(attention): XLMRobertaAttention( |
|
(self): XLMRobertaSelfAttention( |
|
(query): Linear(in_features=768, out_features=768, bias=True) |
|
(key): Linear(in_features=768, out_features=768, bias=True) |
|
(value): Linear(in_features=768, out_features=768, bias=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(output): XLMRobertaSelfOutput( |
|
(dense): Linear(in_features=768, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(intermediate): XLMRobertaIntermediate( |
|
(dense): Linear(in_features=768, out_features=3072, bias=True) |
|
(intermediate_act_fn): GELUActivation() |
|
) |
|
(output): XLMRobertaOutput( |
|
(dense): Linear(in_features=3072, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(9): XLMRobertaLayer( |
|
(attention): XLMRobertaAttention( |
|
(self): XLMRobertaSelfAttention( |
|
(query): Linear(in_features=768, out_features=768, bias=True) |
|
(key): Linear(in_features=768, out_features=768, bias=True) |
|
(value): Linear(in_features=768, out_features=768, bias=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(output): XLMRobertaSelfOutput( |
|
(dense): Linear(in_features=768, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(intermediate): XLMRobertaIntermediate( |
|
(dense): Linear(in_features=768, out_features=3072, bias=True) |
|
(intermediate_act_fn): GELUActivation() |
|
) |
|
(output): XLMRobertaOutput( |
|
(dense): Linear(in_features=3072, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(10): XLMRobertaLayer( |
|
(attention): XLMRobertaAttention( |
|
(self): XLMRobertaSelfAttention( |
|
(query): Linear(in_features=768, out_features=768, bias=True) |
|
(key): Linear(in_features=768, out_features=768, bias=True) |
|
(value): Linear(in_features=768, out_features=768, bias=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(output): XLMRobertaSelfOutput( |
|
(dense): Linear(in_features=768, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(intermediate): XLMRobertaIntermediate( |
|
(dense): Linear(in_features=768, out_features=3072, bias=True) |
|
(intermediate_act_fn): GELUActivation() |
|
) |
|
(output): XLMRobertaOutput( |
|
(dense): Linear(in_features=3072, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(11): XLMRobertaLayer( |
|
(attention): XLMRobertaAttention( |
|
(self): XLMRobertaSelfAttention( |
|
(query): Linear(in_features=768, out_features=768, bias=True) |
|
(key): Linear(in_features=768, out_features=768, bias=True) |
|
(value): Linear(in_features=768, out_features=768, bias=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(output): XLMRobertaSelfOutput( |
|
(dense): Linear(in_features=768, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
(intermediate): XLMRobertaIntermediate( |
|
(dense): Linear(in_features=768, out_features=3072, bias=True) |
|
(intermediate_act_fn): GELUActivation() |
|
) |
|
(output): XLMRobertaOutput( |
|
(dense): Linear(in_features=3072, out_features=768, bias=True) |
|
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
|
(dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
) |
|
) |
|
) |
|
(pooler): XLMRobertaPooler( |
|
(dense): Linear(in_features=768, out_features=768, bias=True) |
|
(activation): Tanh() |
|
) |
|
) |
|
) |
|
(word_dropout): WordDropout(p=0.05) |
|
(locked_dropout): LockedDropout(p=0.5) |
|
(linear): Linear(in_features=768, out_features=158, bias=True) |
|
(loss_function): ViterbiLoss() |
|
(crf): CRF() |
|
)" |
|
2023-08-17 13:20:08,995 ---------------------------------------------------------------------------------------------------- |
|
2023-08-17 13:20:08,996 Corpus: "Corpus: 7767 train + 409 dev + 0 test sentences" |
|
2023-08-17 13:20:08,997 ---------------------------------------------------------------------------------------------------- |
|
2023-08-17 13:20:08,997 Parameters: |
|
2023-08-17 13:20:08,997 - learning_rate: "0.000050" |
|
2023-08-17 13:20:08,998 - mini_batch_size: "32" |
|
2023-08-17 13:20:08,998 - patience: "3" |
|
2023-08-17 13:20:08,998 - anneal_factor: "0.5" |
|
2023-08-17 13:20:08,999 - max_epochs: "2" |
|
2023-08-17 13:20:08,999 - shuffle: "True" |
|
2023-08-17 13:20:09,000 - train_with_dev: "False" |
|
2023-08-17 13:20:09,000 - batch_growth_annealing: "False" |
|
2023-08-17 13:20:09,000 ---------------------------------------------------------------------------------------------------- |
|
2023-08-17 13:20:09,001 Model training base path: "/scratch/skulick/ppchy-11-pos/xlmb-ck05-yid1/split_final/train" |
|
2023-08-17 13:20:09,001 ---------------------------------------------------------------------------------------------------- |
|
2023-08-17 13:20:09,001 Device: cuda:0 |
|
2023-08-17 13:20:09,002 ---------------------------------------------------------------------------------------------------- |
|
2023-08-17 13:20:09,002 Embeddings storage mode: none |
|
2023-08-17 13:20:09,002 ---------------------------------------------------------------------------------------------------- |
|
2023-08-17 13:21:05,834 epoch 1 - iter 24/243 - loss 5.52841502 - time (sec): 56.83 - samples/sec: 131.44 - lr: 0.000025 |
|
2023-08-17 13:22:03,318 epoch 1 - iter 48/243 - loss 4.70686211 - time (sec): 114.32 - samples/sec: 130.45 - lr: 0.000050 |
|
2023-08-17 13:23:00,549 epoch 1 - iter 72/243 - loss 3.86110162 - time (sec): 171.55 - samples/sec: 131.94 - lr: 0.000047 |
|
2023-08-17 13:23:57,695 epoch 1 - iter 96/243 - loss 3.22106003 - time (sec): 228.69 - samples/sec: 132.37 - lr: 0.000045 |
|
2023-08-17 13:24:55,039 epoch 1 - iter 120/243 - loss 2.77518007 - time (sec): 286.04 - samples/sec: 132.92 - lr: 0.000042 |
|
2023-08-17 13:25:52,345 epoch 1 - iter 144/243 - loss 2.46009763 - time (sec): 343.34 - samples/sec: 133.06 - lr: 0.000039 |
|
2023-08-17 13:26:49,831 epoch 1 - iter 168/243 - loss 2.21288400 - time (sec): 400.83 - samples/sec: 134.04 - lr: 0.000036 |
|
2023-08-17 13:27:47,964 epoch 1 - iter 192/243 - loss 2.01670410 - time (sec): 458.96 - samples/sec: 134.63 - lr: 0.000034 |
|
2023-08-17 13:28:45,494 epoch 1 - iter 216/243 - loss 1.86783335 - time (sec): 516.49 - samples/sec: 134.47 - lr: 0.000031 |
|
2023-08-17 13:29:43,119 epoch 1 - iter 240/243 - loss 1.74523925 - time (sec): 574.12 - samples/sec: 135.25 - lr: 0.000028 |
|
2023-08-17 13:29:50,011 ---------------------------------------------------------------------------------------------------- |
|
2023-08-17 13:29:50,011 EPOCH 1 done: loss 1.7334 - lr 0.000028 |
|
2023-08-17 13:29:52,277 Evaluating as a multi-label problem: False |
|
2023-08-17 13:29:52,376 DEV : loss 0.3509514629840851 - f1-score (micro avg) 0.9331 |
|
2023-08-17 13:29:52,410 saving best model |
|
2023-08-17 13:29:54,774 ---------------------------------------------------------------------------------------------------- |
|
2023-08-17 13:30:44,972 epoch 2 - iter 24/243 - loss 0.58877620 - time (sec): 50.20 - samples/sec: 152.66 - lr: 0.000025 |
|
2023-08-17 13:31:36,455 epoch 2 - iter 48/243 - loss 0.60804646 - time (sec): 101.68 - samples/sec: 152.75 - lr: 0.000022 |
|
2023-08-17 13:32:27,132 epoch 2 - iter 72/243 - loss 0.60136722 - time (sec): 152.36 - samples/sec: 153.64 - lr: 0.000020 |
|
2023-08-17 13:33:17,902 epoch 2 - iter 96/243 - loss 0.59255541 - time (sec): 203.13 - samples/sec: 154.55 - lr: 0.000017 |
|
2023-08-17 13:34:08,949 epoch 2 - iter 120/243 - loss 0.58957421 - time (sec): 254.17 - samples/sec: 154.79 - lr: 0.000014 |
|
2023-08-17 13:35:00,256 epoch 2 - iter 144/243 - loss 0.58878210 - time (sec): 305.48 - samples/sec: 154.48 - lr: 0.000011 |
|
2023-08-17 13:35:51,214 epoch 2 - iter 168/243 - loss 0.58168957 - time (sec): 356.44 - samples/sec: 153.84 - lr: 0.000009 |
|
2023-08-17 13:36:42,167 epoch 2 - iter 192/243 - loss 0.57403444 - time (sec): 407.39 - samples/sec: 153.55 - lr: 0.000006 |
|
2023-08-17 13:37:32,761 epoch 2 - iter 216/243 - loss 0.57331317 - time (sec): 457.99 - samples/sec: 152.68 - lr: 0.000003 |
|
2023-08-17 13:38:23,745 epoch 2 - iter 240/243 - loss 0.56849021 - time (sec): 508.97 - samples/sec: 152.71 - lr: 0.000000 |
|
2023-08-17 13:38:29,500 ---------------------------------------------------------------------------------------------------- |
|
2023-08-17 13:38:29,500 EPOCH 2 done: loss 0.5679 - lr 0.000000 |
|
2023-08-17 13:38:31,769 Evaluating as a multi-label problem: False |
|
2023-08-17 13:38:31,868 DEV : loss 0.23018118739128113 - f1-score (micro avg) 0.9562 |
|
2023-08-17 13:38:31,902 saving best model |
|
2023-08-17 13:38:37,560 Test data not provided setting final score to 0 |
|
|