stefan-it's picture
Upload ./training.log with huggingface_hub
1d797e4
2023-10-24 18:08:00,796 ----------------------------------------------------------------------------------------------------
2023-10-24 18:08:00,797 Model: "SequenceTagger(
(embeddings): TransformerWordEmbeddings(
(model): BertModel(
(embeddings): BertEmbeddings(
(word_embeddings): Embedding(64001, 768)
(position_embeddings): Embedding(512, 768)
(token_type_embeddings): Embedding(2, 768)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(encoder): BertEncoder(
(layer): ModuleList(
(0): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(1): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(2): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(3): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(4): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(5): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(6): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(7): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(8): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(9): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(10): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(11): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
)
(pooler): BertPooler(
(dense): Linear(in_features=768, out_features=768, bias=True)
(activation): Tanh()
)
)
)
(locked_dropout): LockedDropout(p=0.5)
(linear): Linear(in_features=768, out_features=13, bias=True)
(loss_function): CrossEntropyLoss()
)"
2023-10-24 18:08:00,797 ----------------------------------------------------------------------------------------------------
2023-10-24 18:08:00,797 MultiCorpus: 7936 train + 992 dev + 992 test sentences
- NER_ICDAR_EUROPEANA Corpus: 7936 train + 992 dev + 992 test sentences - /home/ubuntu/.flair/datasets/ner_icdar_europeana/fr
2023-10-24 18:08:00,797 ----------------------------------------------------------------------------------------------------
2023-10-24 18:08:00,797 Train: 7936 sentences
2023-10-24 18:08:00,797 (train_with_dev=False, train_with_test=False)
2023-10-24 18:08:00,797 ----------------------------------------------------------------------------------------------------
2023-10-24 18:08:00,797 Training Params:
2023-10-24 18:08:00,797 - learning_rate: "3e-05"
2023-10-24 18:08:00,797 - mini_batch_size: "4"
2023-10-24 18:08:00,797 - max_epochs: "10"
2023-10-24 18:08:00,797 - shuffle: "True"
2023-10-24 18:08:00,797 ----------------------------------------------------------------------------------------------------
2023-10-24 18:08:00,797 Plugins:
2023-10-24 18:08:00,797 - TensorboardLogger
2023-10-24 18:08:00,797 - LinearScheduler | warmup_fraction: '0.1'
2023-10-24 18:08:00,797 ----------------------------------------------------------------------------------------------------
2023-10-24 18:08:00,797 Final evaluation on model from best epoch (best-model.pt)
2023-10-24 18:08:00,797 - metric: "('micro avg', 'f1-score')"
2023-10-24 18:08:00,797 ----------------------------------------------------------------------------------------------------
2023-10-24 18:08:00,797 Computation:
2023-10-24 18:08:00,797 - compute on device: cuda:0
2023-10-24 18:08:00,798 - embedding storage: none
2023-10-24 18:08:00,798 ----------------------------------------------------------------------------------------------------
2023-10-24 18:08:00,798 Model training base path: "hmbench-icdar/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs4-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-4"
2023-10-24 18:08:00,798 ----------------------------------------------------------------------------------------------------
2023-10-24 18:08:00,798 ----------------------------------------------------------------------------------------------------
2023-10-24 18:08:00,798 Logging anything other than scalars to TensorBoard is currently not supported.
2023-10-24 18:08:12,645 epoch 1 - iter 198/1984 - loss 1.65883842 - time (sec): 11.85 - samples/sec: 1399.30 - lr: 0.000003 - momentum: 0.000000
2023-10-24 18:08:24,650 epoch 1 - iter 396/1984 - loss 0.99109184 - time (sec): 23.85 - samples/sec: 1395.13 - lr: 0.000006 - momentum: 0.000000
2023-10-24 18:08:36,790 epoch 1 - iter 594/1984 - loss 0.74189793 - time (sec): 35.99 - samples/sec: 1363.42 - lr: 0.000009 - momentum: 0.000000
2023-10-24 18:08:48,710 epoch 1 - iter 792/1984 - loss 0.60171309 - time (sec): 47.91 - samples/sec: 1355.98 - lr: 0.000012 - momentum: 0.000000
2023-10-24 18:09:00,807 epoch 1 - iter 990/1984 - loss 0.51096698 - time (sec): 60.01 - samples/sec: 1357.21 - lr: 0.000015 - momentum: 0.000000
2023-10-24 18:09:12,838 epoch 1 - iter 1188/1984 - loss 0.45403133 - time (sec): 72.04 - samples/sec: 1348.75 - lr: 0.000018 - momentum: 0.000000
2023-10-24 18:09:25,213 epoch 1 - iter 1386/1984 - loss 0.41042045 - time (sec): 84.41 - samples/sec: 1348.01 - lr: 0.000021 - momentum: 0.000000
2023-10-24 18:09:37,257 epoch 1 - iter 1584/1984 - loss 0.37518830 - time (sec): 96.46 - samples/sec: 1351.04 - lr: 0.000024 - momentum: 0.000000
2023-10-24 18:09:49,215 epoch 1 - iter 1782/1984 - loss 0.34872736 - time (sec): 108.42 - samples/sec: 1350.48 - lr: 0.000027 - momentum: 0.000000
2023-10-24 18:10:01,661 epoch 1 - iter 1980/1984 - loss 0.32450599 - time (sec): 120.86 - samples/sec: 1354.91 - lr: 0.000030 - momentum: 0.000000
2023-10-24 18:10:01,887 ----------------------------------------------------------------------------------------------------
2023-10-24 18:10:01,887 EPOCH 1 done: loss 0.3243 - lr: 0.000030
2023-10-24 18:10:04,980 DEV : loss 0.08822248876094818 - f1-score (micro avg) 0.7194
2023-10-24 18:10:04,995 saving best model
2023-10-24 18:10:05,463 ----------------------------------------------------------------------------------------------------
2023-10-24 18:10:17,722 epoch 2 - iter 198/1984 - loss 0.11538531 - time (sec): 12.26 - samples/sec: 1406.65 - lr: 0.000030 - momentum: 0.000000
2023-10-24 18:10:29,871 epoch 2 - iter 396/1984 - loss 0.11078638 - time (sec): 24.41 - samples/sec: 1382.17 - lr: 0.000029 - momentum: 0.000000
2023-10-24 18:10:41,861 epoch 2 - iter 594/1984 - loss 0.10902064 - time (sec): 36.40 - samples/sec: 1369.84 - lr: 0.000029 - momentum: 0.000000
2023-10-24 18:10:53,970 epoch 2 - iter 792/1984 - loss 0.11119279 - time (sec): 48.51 - samples/sec: 1366.96 - lr: 0.000029 - momentum: 0.000000
2023-10-24 18:11:06,235 epoch 2 - iter 990/1984 - loss 0.10982936 - time (sec): 60.77 - samples/sec: 1372.84 - lr: 0.000028 - momentum: 0.000000
2023-10-24 18:11:18,421 epoch 2 - iter 1188/1984 - loss 0.11198539 - time (sec): 72.96 - samples/sec: 1366.86 - lr: 0.000028 - momentum: 0.000000
2023-10-24 18:11:30,288 epoch 2 - iter 1386/1984 - loss 0.11253700 - time (sec): 84.82 - samples/sec: 1355.61 - lr: 0.000028 - momentum: 0.000000
2023-10-24 18:11:42,371 epoch 2 - iter 1584/1984 - loss 0.11167056 - time (sec): 96.91 - samples/sec: 1350.00 - lr: 0.000027 - momentum: 0.000000
2023-10-24 18:11:54,534 epoch 2 - iter 1782/1984 - loss 0.10945571 - time (sec): 109.07 - samples/sec: 1353.54 - lr: 0.000027 - momentum: 0.000000
2023-10-24 18:12:06,503 epoch 2 - iter 1980/1984 - loss 0.10890702 - time (sec): 121.04 - samples/sec: 1348.29 - lr: 0.000027 - momentum: 0.000000
2023-10-24 18:12:06,922 ----------------------------------------------------------------------------------------------------
2023-10-24 18:12:06,922 EPOCH 2 done: loss 0.1089 - lr: 0.000027
2023-10-24 18:12:10,353 DEV : loss 0.10544043034315109 - f1-score (micro avg) 0.7355
2023-10-24 18:12:10,368 saving best model
2023-10-24 18:12:10,938 ----------------------------------------------------------------------------------------------------
2023-10-24 18:12:22,889 epoch 3 - iter 198/1984 - loss 0.07127408 - time (sec): 11.95 - samples/sec: 1348.12 - lr: 0.000026 - momentum: 0.000000
2023-10-24 18:12:35,106 epoch 3 - iter 396/1984 - loss 0.06552621 - time (sec): 24.17 - samples/sec: 1369.18 - lr: 0.000026 - momentum: 0.000000
2023-10-24 18:12:47,225 epoch 3 - iter 594/1984 - loss 0.07374619 - time (sec): 36.29 - samples/sec: 1341.65 - lr: 0.000026 - momentum: 0.000000
2023-10-24 18:12:59,160 epoch 3 - iter 792/1984 - loss 0.07723250 - time (sec): 48.22 - samples/sec: 1325.49 - lr: 0.000025 - momentum: 0.000000
2023-10-24 18:13:11,313 epoch 3 - iter 990/1984 - loss 0.07632000 - time (sec): 60.37 - samples/sec: 1334.15 - lr: 0.000025 - momentum: 0.000000
2023-10-24 18:13:23,489 epoch 3 - iter 1188/1984 - loss 0.07708131 - time (sec): 72.55 - samples/sec: 1340.39 - lr: 0.000025 - momentum: 0.000000
2023-10-24 18:13:35,706 epoch 3 - iter 1386/1984 - loss 0.07772303 - time (sec): 84.77 - samples/sec: 1351.33 - lr: 0.000024 - momentum: 0.000000
2023-10-24 18:13:47,929 epoch 3 - iter 1584/1984 - loss 0.07931530 - time (sec): 96.99 - samples/sec: 1350.57 - lr: 0.000024 - momentum: 0.000000
2023-10-24 18:14:00,114 epoch 3 - iter 1782/1984 - loss 0.08200348 - time (sec): 109.17 - samples/sec: 1353.32 - lr: 0.000024 - momentum: 0.000000
2023-10-24 18:14:12,130 epoch 3 - iter 1980/1984 - loss 0.08111184 - time (sec): 121.19 - samples/sec: 1349.68 - lr: 0.000023 - momentum: 0.000000
2023-10-24 18:14:12,387 ----------------------------------------------------------------------------------------------------
2023-10-24 18:14:12,387 EPOCH 3 done: loss 0.0810 - lr: 0.000023
2023-10-24 18:14:15,515 DEV : loss 0.13007180392742157 - f1-score (micro avg) 0.7582
2023-10-24 18:14:15,530 saving best model
2023-10-24 18:14:16,148 ----------------------------------------------------------------------------------------------------
2023-10-24 18:14:28,115 epoch 4 - iter 198/1984 - loss 0.04311242 - time (sec): 11.97 - samples/sec: 1296.25 - lr: 0.000023 - momentum: 0.000000
2023-10-24 18:14:40,318 epoch 4 - iter 396/1984 - loss 0.05582813 - time (sec): 24.17 - samples/sec: 1338.58 - lr: 0.000023 - momentum: 0.000000
2023-10-24 18:14:52,490 epoch 4 - iter 594/1984 - loss 0.05639418 - time (sec): 36.34 - samples/sec: 1361.89 - lr: 0.000022 - momentum: 0.000000
2023-10-24 18:15:04,460 epoch 4 - iter 792/1984 - loss 0.05593021 - time (sec): 48.31 - samples/sec: 1341.54 - lr: 0.000022 - momentum: 0.000000
2023-10-24 18:15:16,489 epoch 4 - iter 990/1984 - loss 0.05809953 - time (sec): 60.34 - samples/sec: 1333.17 - lr: 0.000022 - momentum: 0.000000
2023-10-24 18:15:28,801 epoch 4 - iter 1188/1984 - loss 0.05764319 - time (sec): 72.65 - samples/sec: 1344.01 - lr: 0.000021 - momentum: 0.000000
2023-10-24 18:15:40,962 epoch 4 - iter 1386/1984 - loss 0.05706164 - time (sec): 84.81 - samples/sec: 1341.85 - lr: 0.000021 - momentum: 0.000000
2023-10-24 18:15:53,083 epoch 4 - iter 1584/1984 - loss 0.05865109 - time (sec): 96.93 - samples/sec: 1341.23 - lr: 0.000021 - momentum: 0.000000
2023-10-24 18:16:05,301 epoch 4 - iter 1782/1984 - loss 0.05768982 - time (sec): 109.15 - samples/sec: 1349.70 - lr: 0.000020 - momentum: 0.000000
2023-10-24 18:16:17,487 epoch 4 - iter 1980/1984 - loss 0.05892127 - time (sec): 121.34 - samples/sec: 1348.47 - lr: 0.000020 - momentum: 0.000000
2023-10-24 18:16:17,741 ----------------------------------------------------------------------------------------------------
2023-10-24 18:16:17,741 EPOCH 4 done: loss 0.0588 - lr: 0.000020
2023-10-24 18:16:21,164 DEV : loss 0.14774079620838165 - f1-score (micro avg) 0.7464
2023-10-24 18:16:21,179 ----------------------------------------------------------------------------------------------------
2023-10-24 18:16:33,451 epoch 5 - iter 198/1984 - loss 0.05564537 - time (sec): 12.27 - samples/sec: 1372.89 - lr: 0.000020 - momentum: 0.000000
2023-10-24 18:16:45,465 epoch 5 - iter 396/1984 - loss 0.04733119 - time (sec): 24.29 - samples/sec: 1359.12 - lr: 0.000019 - momentum: 0.000000
2023-10-24 18:16:57,576 epoch 5 - iter 594/1984 - loss 0.04636178 - time (sec): 36.40 - samples/sec: 1366.23 - lr: 0.000019 - momentum: 0.000000
2023-10-24 18:17:09,967 epoch 5 - iter 792/1984 - loss 0.04486463 - time (sec): 48.79 - samples/sec: 1379.59 - lr: 0.000019 - momentum: 0.000000
2023-10-24 18:17:22,045 epoch 5 - iter 990/1984 - loss 0.04414787 - time (sec): 60.86 - samples/sec: 1367.00 - lr: 0.000018 - momentum: 0.000000
2023-10-24 18:17:33,989 epoch 5 - iter 1188/1984 - loss 0.04391772 - time (sec): 72.81 - samples/sec: 1357.52 - lr: 0.000018 - momentum: 0.000000
2023-10-24 18:17:46,001 epoch 5 - iter 1386/1984 - loss 0.04395576 - time (sec): 84.82 - samples/sec: 1352.18 - lr: 0.000018 - momentum: 0.000000
2023-10-24 18:17:58,166 epoch 5 - iter 1584/1984 - loss 0.04447887 - time (sec): 96.99 - samples/sec: 1354.57 - lr: 0.000017 - momentum: 0.000000
2023-10-24 18:18:10,146 epoch 5 - iter 1782/1984 - loss 0.04456227 - time (sec): 108.97 - samples/sec: 1347.63 - lr: 0.000017 - momentum: 0.000000
2023-10-24 18:18:22,438 epoch 5 - iter 1980/1984 - loss 0.04356891 - time (sec): 121.26 - samples/sec: 1349.39 - lr: 0.000017 - momentum: 0.000000
2023-10-24 18:18:22,692 ----------------------------------------------------------------------------------------------------
2023-10-24 18:18:22,693 EPOCH 5 done: loss 0.0435 - lr: 0.000017
2023-10-24 18:18:25,801 DEV : loss 0.1785285472869873 - f1-score (micro avg) 0.7691
2023-10-24 18:18:25,816 saving best model
2023-10-24 18:18:26,434 ----------------------------------------------------------------------------------------------------
2023-10-24 18:18:38,509 epoch 6 - iter 198/1984 - loss 0.03770600 - time (sec): 12.07 - samples/sec: 1341.65 - lr: 0.000016 - momentum: 0.000000
2023-10-24 18:18:50,512 epoch 6 - iter 396/1984 - loss 0.03533865 - time (sec): 24.08 - samples/sec: 1337.14 - lr: 0.000016 - momentum: 0.000000
2023-10-24 18:19:02,726 epoch 6 - iter 594/1984 - loss 0.03400472 - time (sec): 36.29 - samples/sec: 1352.00 - lr: 0.000016 - momentum: 0.000000
2023-10-24 18:19:14,776 epoch 6 - iter 792/1984 - loss 0.03310510 - time (sec): 48.34 - samples/sec: 1338.78 - lr: 0.000015 - momentum: 0.000000
2023-10-24 18:19:27,111 epoch 6 - iter 990/1984 - loss 0.03249866 - time (sec): 60.68 - samples/sec: 1350.34 - lr: 0.000015 - momentum: 0.000000
2023-10-24 18:19:39,316 epoch 6 - iter 1188/1984 - loss 0.03090049 - time (sec): 72.88 - samples/sec: 1340.35 - lr: 0.000015 - momentum: 0.000000
2023-10-24 18:19:51,893 epoch 6 - iter 1386/1984 - loss 0.03241745 - time (sec): 85.46 - samples/sec: 1340.48 - lr: 0.000014 - momentum: 0.000000
2023-10-24 18:20:03,917 epoch 6 - iter 1584/1984 - loss 0.03297873 - time (sec): 97.48 - samples/sec: 1338.72 - lr: 0.000014 - momentum: 0.000000
2023-10-24 18:20:16,009 epoch 6 - iter 1782/1984 - loss 0.03333018 - time (sec): 109.57 - samples/sec: 1338.46 - lr: 0.000014 - momentum: 0.000000
2023-10-24 18:20:28,225 epoch 6 - iter 1980/1984 - loss 0.03294409 - time (sec): 121.79 - samples/sec: 1344.26 - lr: 0.000013 - momentum: 0.000000
2023-10-24 18:20:28,466 ----------------------------------------------------------------------------------------------------
2023-10-24 18:20:28,467 EPOCH 6 done: loss 0.0329 - lr: 0.000013
2023-10-24 18:20:31,597 DEV : loss 0.1928633749485016 - f1-score (micro avg) 0.7655
2023-10-24 18:20:31,612 ----------------------------------------------------------------------------------------------------
2023-10-24 18:20:44,023 epoch 7 - iter 198/1984 - loss 0.02473915 - time (sec): 12.41 - samples/sec: 1414.46 - lr: 0.000013 - momentum: 0.000000
2023-10-24 18:20:56,221 epoch 7 - iter 396/1984 - loss 0.02715590 - time (sec): 24.61 - samples/sec: 1392.39 - lr: 0.000013 - momentum: 0.000000
2023-10-24 18:21:08,255 epoch 7 - iter 594/1984 - loss 0.02685085 - time (sec): 36.64 - samples/sec: 1381.08 - lr: 0.000012 - momentum: 0.000000
2023-10-24 18:21:20,492 epoch 7 - iter 792/1984 - loss 0.02566708 - time (sec): 48.88 - samples/sec: 1369.54 - lr: 0.000012 - momentum: 0.000000
2023-10-24 18:21:32,601 epoch 7 - iter 990/1984 - loss 0.02548493 - time (sec): 60.99 - samples/sec: 1361.78 - lr: 0.000012 - momentum: 0.000000
2023-10-24 18:21:44,601 epoch 7 - iter 1188/1984 - loss 0.02537405 - time (sec): 72.99 - samples/sec: 1354.50 - lr: 0.000011 - momentum: 0.000000
2023-10-24 18:21:56,645 epoch 7 - iter 1386/1984 - loss 0.02423187 - time (sec): 85.03 - samples/sec: 1348.01 - lr: 0.000011 - momentum: 0.000000
2023-10-24 18:22:08,738 epoch 7 - iter 1584/1984 - loss 0.02555620 - time (sec): 97.12 - samples/sec: 1347.41 - lr: 0.000011 - momentum: 0.000000
2023-10-24 18:22:20,847 epoch 7 - iter 1782/1984 - loss 0.02502907 - time (sec): 109.23 - samples/sec: 1349.96 - lr: 0.000010 - momentum: 0.000000
2023-10-24 18:22:32,913 epoch 7 - iter 1980/1984 - loss 0.02462085 - time (sec): 121.30 - samples/sec: 1348.75 - lr: 0.000010 - momentum: 0.000000
2023-10-24 18:22:33,203 ----------------------------------------------------------------------------------------------------
2023-10-24 18:22:33,203 EPOCH 7 done: loss 0.0246 - lr: 0.000010
2023-10-24 18:22:36,323 DEV : loss 0.20923519134521484 - f1-score (micro avg) 0.7735
2023-10-24 18:22:36,338 saving best model
2023-10-24 18:22:36,954 ----------------------------------------------------------------------------------------------------
2023-10-24 18:22:49,099 epoch 8 - iter 198/1984 - loss 0.01338123 - time (sec): 12.14 - samples/sec: 1349.93 - lr: 0.000010 - momentum: 0.000000
2023-10-24 18:23:01,404 epoch 8 - iter 396/1984 - loss 0.01458755 - time (sec): 24.45 - samples/sec: 1372.58 - lr: 0.000009 - momentum: 0.000000
2023-10-24 18:23:13,503 epoch 8 - iter 594/1984 - loss 0.01472531 - time (sec): 36.55 - samples/sec: 1363.60 - lr: 0.000009 - momentum: 0.000000
2023-10-24 18:23:25,875 epoch 8 - iter 792/1984 - loss 0.01510012 - time (sec): 48.92 - samples/sec: 1352.70 - lr: 0.000009 - momentum: 0.000000
2023-10-24 18:23:38,187 epoch 8 - iter 990/1984 - loss 0.01631973 - time (sec): 61.23 - samples/sec: 1362.06 - lr: 0.000008 - momentum: 0.000000
2023-10-24 18:23:50,209 epoch 8 - iter 1188/1984 - loss 0.01696770 - time (sec): 73.25 - samples/sec: 1358.52 - lr: 0.000008 - momentum: 0.000000
2023-10-24 18:24:02,466 epoch 8 - iter 1386/1984 - loss 0.01742849 - time (sec): 85.51 - samples/sec: 1352.90 - lr: 0.000008 - momentum: 0.000000
2023-10-24 18:24:14,519 epoch 8 - iter 1584/1984 - loss 0.01641470 - time (sec): 97.56 - samples/sec: 1345.67 - lr: 0.000007 - momentum: 0.000000
2023-10-24 18:24:26,773 epoch 8 - iter 1782/1984 - loss 0.01644926 - time (sec): 109.82 - samples/sec: 1349.03 - lr: 0.000007 - momentum: 0.000000
2023-10-24 18:24:38,678 epoch 8 - iter 1980/1984 - loss 0.01625596 - time (sec): 121.72 - samples/sec: 1344.35 - lr: 0.000007 - momentum: 0.000000
2023-10-24 18:24:38,916 ----------------------------------------------------------------------------------------------------
2023-10-24 18:24:38,916 EPOCH 8 done: loss 0.0162 - lr: 0.000007
2023-10-24 18:24:42,041 DEV : loss 0.22515711188316345 - f1-score (micro avg) 0.7676
2023-10-24 18:24:42,056 ----------------------------------------------------------------------------------------------------
2023-10-24 18:24:54,265 epoch 9 - iter 198/1984 - loss 0.00852019 - time (sec): 12.21 - samples/sec: 1354.85 - lr: 0.000006 - momentum: 0.000000
2023-10-24 18:25:06,179 epoch 9 - iter 396/1984 - loss 0.01328350 - time (sec): 24.12 - samples/sec: 1336.12 - lr: 0.000006 - momentum: 0.000000
2023-10-24 18:25:18,591 epoch 9 - iter 594/1984 - loss 0.01339687 - time (sec): 36.53 - samples/sec: 1348.93 - lr: 0.000006 - momentum: 0.000000
2023-10-24 18:25:30,817 epoch 9 - iter 792/1984 - loss 0.01234555 - time (sec): 48.76 - samples/sec: 1347.68 - lr: 0.000005 - momentum: 0.000000
2023-10-24 18:25:42,890 epoch 9 - iter 990/1984 - loss 0.01167028 - time (sec): 60.83 - samples/sec: 1352.82 - lr: 0.000005 - momentum: 0.000000
2023-10-24 18:25:54,838 epoch 9 - iter 1188/1984 - loss 0.01071810 - time (sec): 72.78 - samples/sec: 1344.58 - lr: 0.000005 - momentum: 0.000000
2023-10-24 18:26:07,055 epoch 9 - iter 1386/1984 - loss 0.01111198 - time (sec): 85.00 - samples/sec: 1348.21 - lr: 0.000004 - momentum: 0.000000
2023-10-24 18:26:19,193 epoch 9 - iter 1584/1984 - loss 0.01162792 - time (sec): 97.14 - samples/sec: 1349.38 - lr: 0.000004 - momentum: 0.000000
2023-10-24 18:26:31,487 epoch 9 - iter 1782/1984 - loss 0.01233370 - time (sec): 109.43 - samples/sec: 1347.80 - lr: 0.000004 - momentum: 0.000000
2023-10-24 18:26:43,546 epoch 9 - iter 1980/1984 - loss 0.01214635 - time (sec): 121.49 - samples/sec: 1347.15 - lr: 0.000003 - momentum: 0.000000
2023-10-24 18:26:43,791 ----------------------------------------------------------------------------------------------------
2023-10-24 18:26:43,791 EPOCH 9 done: loss 0.0122 - lr: 0.000003
2023-10-24 18:26:46,921 DEV : loss 0.2285650223493576 - f1-score (micro avg) 0.7685
2023-10-24 18:26:46,936 ----------------------------------------------------------------------------------------------------
2023-10-24 18:26:59,335 epoch 10 - iter 198/1984 - loss 0.00472620 - time (sec): 12.40 - samples/sec: 1283.74 - lr: 0.000003 - momentum: 0.000000
2023-10-24 18:27:11,443 epoch 10 - iter 396/1984 - loss 0.00589847 - time (sec): 24.51 - samples/sec: 1336.66 - lr: 0.000003 - momentum: 0.000000
2023-10-24 18:27:23,523 epoch 10 - iter 594/1984 - loss 0.00695023 - time (sec): 36.59 - samples/sec: 1327.88 - lr: 0.000002 - momentum: 0.000000
2023-10-24 18:27:35,805 epoch 10 - iter 792/1984 - loss 0.00775105 - time (sec): 48.87 - samples/sec: 1333.36 - lr: 0.000002 - momentum: 0.000000
2023-10-24 18:27:48,037 epoch 10 - iter 990/1984 - loss 0.00815130 - time (sec): 61.10 - samples/sec: 1337.31 - lr: 0.000002 - momentum: 0.000000
2023-10-24 18:28:00,332 epoch 10 - iter 1188/1984 - loss 0.00801025 - time (sec): 73.40 - samples/sec: 1342.12 - lr: 0.000001 - momentum: 0.000000
2023-10-24 18:28:12,341 epoch 10 - iter 1386/1984 - loss 0.00793725 - time (sec): 85.40 - samples/sec: 1342.25 - lr: 0.000001 - momentum: 0.000000
2023-10-24 18:28:24,359 epoch 10 - iter 1584/1984 - loss 0.00790952 - time (sec): 97.42 - samples/sec: 1340.60 - lr: 0.000001 - momentum: 0.000000
2023-10-24 18:28:36,593 epoch 10 - iter 1782/1984 - loss 0.00760356 - time (sec): 109.66 - samples/sec: 1347.44 - lr: 0.000000 - momentum: 0.000000
2023-10-24 18:28:48,671 epoch 10 - iter 1980/1984 - loss 0.00764524 - time (sec): 121.73 - samples/sec: 1344.84 - lr: 0.000000 - momentum: 0.000000
2023-10-24 18:28:48,912 ----------------------------------------------------------------------------------------------------
2023-10-24 18:28:48,912 EPOCH 10 done: loss 0.0076 - lr: 0.000000
2023-10-24 18:28:52,025 DEV : loss 0.23562917113304138 - f1-score (micro avg) 0.7713
2023-10-24 18:28:52,525 ----------------------------------------------------------------------------------------------------
2023-10-24 18:28:52,526 Loading model from best epoch ...
2023-10-24 18:28:53,997 SequenceTagger predicts: Dictionary with 13 tags: O, S-PER, B-PER, E-PER, I-PER, S-LOC, B-LOC, E-LOC, I-LOC, S-ORG, B-ORG, E-ORG, I-ORG
2023-10-24 18:28:57,085
Results:
- F-score (micro) 0.7853
- F-score (macro) 0.6936
- Accuracy 0.6639
By class:
precision recall f1-score support
LOC 0.8353 0.8595 0.8473 655
PER 0.7250 0.7803 0.7516 223
ORG 0.5699 0.4173 0.4818 127
micro avg 0.7845 0.7861 0.7853 1005
macro avg 0.7101 0.6857 0.6936 1005
weighted avg 0.7773 0.7861 0.7799 1005
2023-10-24 18:28:57,085 ----------------------------------------------------------------------------------------------------