2023-10-24 18:08:00,796 ----------------------------------------------------------------------------------------------------
2023-10-24 18:08:00,797 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(64001, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
            (intermediate): BertIntermediate(
              (dense): Linear(in_features=768, out_features=3072, bias=True)
              (intermediate_act_fn): GELUActivation()
            )
            (output): BertOutput(
              (dense): Linear(in_features=3072, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
        )
      )
      (pooler): BertPooler(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (activation): Tanh()
      )
    )
  )
  (locked_dropout): LockedDropout(p=0.5)
  (linear): Linear(in_features=768, out_features=13, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2023-10-24 18:08:00,797 ----------------------------------------------------------------------------------------------------
2023-10-24 18:08:00,797 MultiCorpus: 7936 train + 992 dev + 992 test sentences
 - NER_ICDAR_EUROPEANA Corpus: 7936 train + 992 dev + 992 test sentences - /home/ubuntu/.flair/datasets/ner_icdar_europeana/fr
2023-10-24 18:08:00,797 ----------------------------------------------------------------------------------------------------
2023-10-24 18:08:00,797 Train:  7936 sentences
2023-10-24 18:08:00,797         (train_with_dev=False, train_with_test=False)
2023-10-24 18:08:00,797 ----------------------------------------------------------------------------------------------------
2023-10-24 18:08:00,797 Training Params:
2023-10-24 18:08:00,797  - learning_rate: "3e-05"
2023-10-24 18:08:00,797  - mini_batch_size: "4"
2023-10-24 18:08:00,797  - max_epochs: "10"
2023-10-24 18:08:00,797  - shuffle: "True"
2023-10-24 18:08:00,797 ----------------------------------------------------------------------------------------------------
2023-10-24 18:08:00,797 Plugins:
2023-10-24 18:08:00,797  - TensorboardLogger
2023-10-24 18:08:00,797  - LinearScheduler | warmup_fraction: '0.1'
2023-10-24 18:08:00,797 ----------------------------------------------------------------------------------------------------
2023-10-24 18:08:00,797 Final evaluation on model from best epoch (best-model.pt)
2023-10-24 18:08:00,797  - metric: "('micro avg', 'f1-score')"
2023-10-24 18:08:00,797 ----------------------------------------------------------------------------------------------------
2023-10-24 18:08:00,797 Computation:
2023-10-24 18:08:00,797  - compute on device: cuda:0
2023-10-24 18:08:00,798  - embedding storage: none
2023-10-24 18:08:00,798 ----------------------------------------------------------------------------------------------------
2023-10-24 18:08:00,798 Model training base path: "hmbench-icdar/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs4-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-4"
2023-10-24 18:08:00,798 ----------------------------------------------------------------------------------------------------
2023-10-24 18:08:00,798 ----------------------------------------------------------------------------------------------------
2023-10-24 18:08:00,798 Logging anything other than scalars to TensorBoard is currently not supported.
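[Editor's note] The training script itself is not part of this log. The following is a minimal sketch of a Flair fine-tuning run consistent with the model, corpus, and training parameters printed above (learning_rate 3e-05, mini_batch_size 4, max_epochs 10, first-subtoken pooling of the last layer, no CRF). The exact options used in the original run are assumptions inferred from the base path.

# Minimal sketch (not the original script) of a Flair fine-tuning run
# matching the hyperparameters logged above.
from flair.datasets import NER_ICDAR_EUROPEANA
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

corpus = NER_ICDAR_EUROPEANA(language="fr")  # 7936 train / 992 dev / 992 test
label_dict = corpus.make_label_dictionary(label_type="ner")

# "poolingfirst-layers-1" in the base path suggests first-subtoken pooling
# of the last transformer layer only, with fine-tuning enabled.
embeddings = TransformerWordEmbeddings(
    model="dbmdz/bert-base-historic-multilingual-64k-td-cased",
    layers="-1",
    subtoken_pooling="first",
    fine_tune=True,
)

# "crfFalse" in the base path: a plain linear projection with
# CrossEntropyLoss, matching the printed model (no CRF, no RNN).
tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type="ner",
    use_crf=False,
    use_rnn=False,
    reproject_embeddings=False,
)

trainer = ModelTrainer(tagger, corpus)
trainer.fine_tune(
    "hmbench-icdar/fr-...",  # base path elided; see the log line above
    learning_rate=3e-5,
    mini_batch_size=4,
    max_epochs=10,
)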
2023-10-24 18:08:12,645 epoch 1 - iter 198/1984 - loss 1.65883842 - time (sec): 11.85 - samples/sec: 1399.30 - lr: 0.000003 - momentum: 0.000000
2023-10-24 18:08:24,650 epoch 1 - iter 396/1984 - loss 0.99109184 - time (sec): 23.85 - samples/sec: 1395.13 - lr: 0.000006 - momentum: 0.000000
2023-10-24 18:08:36,790 epoch 1 - iter 594/1984 - loss 0.74189793 - time (sec): 35.99 - samples/sec: 1363.42 - lr: 0.000009 - momentum: 0.000000
2023-10-24 18:08:48,710 epoch 1 - iter 792/1984 - loss 0.60171309 - time (sec): 47.91 - samples/sec: 1355.98 - lr: 0.000012 - momentum: 0.000000
2023-10-24 18:09:00,807 epoch 1 - iter 990/1984 - loss 0.51096698 - time (sec): 60.01 - samples/sec: 1357.21 - lr: 0.000015 - momentum: 0.000000
2023-10-24 18:09:12,838 epoch 1 - iter 1188/1984 - loss 0.45403133 - time (sec): 72.04 - samples/sec: 1348.75 - lr: 0.000018 - momentum: 0.000000
2023-10-24 18:09:25,213 epoch 1 - iter 1386/1984 - loss 0.41042045 - time (sec): 84.41 - samples/sec: 1348.01 - lr: 0.000021 - momentum: 0.000000
2023-10-24 18:09:37,257 epoch 1 - iter 1584/1984 - loss 0.37518830 - time (sec): 96.46 - samples/sec: 1351.04 - lr: 0.000024 - momentum: 0.000000
2023-10-24 18:09:49,215 epoch 1 - iter 1782/1984 - loss 0.34872736 - time (sec): 108.42 - samples/sec: 1350.48 - lr: 0.000027 - momentum: 0.000000
2023-10-24 18:10:01,661 epoch 1 - iter 1980/1984 - loss 0.32450599 - time (sec): 120.86 - samples/sec: 1354.91 - lr: 0.000030 - momentum: 0.000000
2023-10-24 18:10:01,887 ----------------------------------------------------------------------------------------------------
2023-10-24 18:10:01,887 EPOCH 1 done: loss 0.3243 - lr: 0.000030
2023-10-24 18:10:04,980 DEV : loss 0.08822248876094818 - f1-score (micro avg) 0.7194
2023-10-24 18:10:04,995 saving best model
2023-10-24 18:10:05,463 ----------------------------------------------------------------------------------------------------
2023-10-24 18:10:17,722 epoch 2 - iter 198/1984 - loss 0.11538531 - time (sec): 12.26 - samples/sec: 1406.65 - lr: 0.000030 - momentum: 0.000000
2023-10-24 18:10:29,871 epoch 2 - iter 396/1984 - loss 0.11078638 - time (sec): 24.41 - samples/sec: 1382.17 - lr: 0.000029 - momentum: 0.000000
2023-10-24 18:10:41,861 epoch 2 - iter 594/1984 - loss 0.10902064 - time (sec): 36.40 - samples/sec: 1369.84 - lr: 0.000029 - momentum: 0.000000
2023-10-24 18:10:53,970 epoch 2 - iter 792/1984 - loss 0.11119279 - time (sec): 48.51 - samples/sec: 1366.96 - lr: 0.000029 - momentum: 0.000000
2023-10-24 18:11:06,235 epoch 2 - iter 990/1984 - loss 0.10982936 - time (sec): 60.77 - samples/sec: 1372.84 - lr: 0.000028 - momentum: 0.000000
2023-10-24 18:11:18,421 epoch 2 - iter 1188/1984 - loss 0.11198539 - time (sec): 72.96 - samples/sec: 1366.86 - lr: 0.000028 - momentum: 0.000000
2023-10-24 18:11:30,288 epoch 2 - iter 1386/1984 - loss 0.11253700 - time (sec): 84.82 - samples/sec: 1355.61 - lr: 0.000028 - momentum: 0.000000
2023-10-24 18:11:42,371 epoch 2 - iter 1584/1984 - loss 0.11167056 - time (sec): 96.91 - samples/sec: 1350.00 - lr: 0.000027 - momentum: 0.000000
2023-10-24 18:11:54,534 epoch 2 - iter 1782/1984 - loss 0.10945571 - time (sec): 109.07 - samples/sec: 1353.54 - lr: 0.000027 - momentum: 0.000000
2023-10-24 18:12:06,503 epoch 2 - iter 1980/1984 - loss 0.10890702 - time (sec): 121.04 - samples/sec: 1348.29 - lr: 0.000027 - momentum: 0.000000
2023-10-24 18:12:06,922 ----------------------------------------------------------------------------------------------------
2023-10-24 18:12:06,922 EPOCH 2 done: loss 0.1089 - lr: 0.000027
2023-10-24 18:12:10,353 DEV : loss 0.10544043034315109 - f1-score (micro avg) 0.7355
2023-10-24 18:12:10,368 saving best model
2023-10-24 18:12:10,938 ----------------------------------------------------------------------------------------------------
2023-10-24 18:12:22,889 epoch 3 - iter 198/1984 - loss 0.07127408 - time (sec): 11.95 - samples/sec: 1348.12 - lr: 0.000026 - momentum: 0.000000
2023-10-24 18:12:35,106 epoch 3 - iter 396/1984 - loss 0.06552621 - time (sec): 24.17 - samples/sec: 1369.18 - lr: 0.000026 - momentum: 0.000000
2023-10-24 18:12:47,225 epoch 3 - iter 594/1984 - loss 0.07374619 - time (sec): 36.29 - samples/sec: 1341.65 - lr: 0.000026 - momentum: 0.000000
2023-10-24 18:12:59,160 epoch 3 - iter 792/1984 - loss 0.07723250 - time (sec): 48.22 - samples/sec: 1325.49 - lr: 0.000025 - momentum: 0.000000
2023-10-24 18:13:11,313 epoch 3 - iter 990/1984 - loss 0.07632000 - time (sec): 60.37 - samples/sec: 1334.15 - lr: 0.000025 - momentum: 0.000000
2023-10-24 18:13:23,489 epoch 3 - iter 1188/1984 - loss 0.07708131 - time (sec): 72.55 - samples/sec: 1340.39 - lr: 0.000025 - momentum: 0.000000
2023-10-24 18:13:35,706 epoch 3 - iter 1386/1984 - loss 0.07772303 - time (sec): 84.77 - samples/sec: 1351.33 - lr: 0.000024 - momentum: 0.000000
2023-10-24 18:13:47,929 epoch 3 - iter 1584/1984 - loss 0.07931530 - time (sec): 96.99 - samples/sec: 1350.57 - lr: 0.000024 - momentum: 0.000000
2023-10-24 18:14:00,114 epoch 3 - iter 1782/1984 - loss 0.08200348 - time (sec): 109.17 - samples/sec: 1353.32 - lr: 0.000024 - momentum: 0.000000
2023-10-24 18:14:12,130 epoch 3 - iter 1980/1984 - loss 0.08111184 - time (sec): 121.19 - samples/sec: 1349.68 - lr: 0.000023 - momentum: 0.000000
2023-10-24 18:14:12,387 ----------------------------------------------------------------------------------------------------
2023-10-24 18:14:12,387 EPOCH 3 done: loss 0.0810 - lr: 0.000023
2023-10-24 18:14:15,515 DEV : loss 0.13007180392742157 - f1-score (micro avg) 0.7582
2023-10-24 18:14:15,530 saving best model
2023-10-24 18:14:16,148 ----------------------------------------------------------------------------------------------------
2023-10-24 18:14:28,115 epoch 4 - iter 198/1984 - loss 0.04311242 - time (sec): 11.97 - samples/sec: 1296.25 - lr: 0.000023 - momentum: 0.000000
2023-10-24 18:14:40,318 epoch 4 - iter 396/1984 - loss 0.05582813 - time (sec): 24.17 - samples/sec: 1338.58 - lr: 0.000023 - momentum: 0.000000
2023-10-24 18:14:52,490 epoch 4 - iter 594/1984 - loss 0.05639418 - time (sec): 36.34 - samples/sec: 1361.89 - lr: 0.000022 - momentum: 0.000000
2023-10-24 18:15:04,460 epoch 4 - iter 792/1984 - loss 0.05593021 - time (sec): 48.31 - samples/sec: 1341.54 - lr: 0.000022 - momentum: 0.000000
2023-10-24 18:15:16,489 epoch 4 - iter 990/1984 - loss 0.05809953 - time (sec): 60.34 - samples/sec: 1333.17 - lr: 0.000022 - momentum: 0.000000
2023-10-24 18:15:28,801 epoch 4 - iter 1188/1984 - loss 0.05764319 - time (sec): 72.65 - samples/sec: 1344.01 - lr: 0.000021 - momentum: 0.000000
2023-10-24 18:15:40,962 epoch 4 - iter 1386/1984 - loss 0.05706164 - time (sec): 84.81 - samples/sec: 1341.85 - lr: 0.000021 - momentum: 0.000000
2023-10-24 18:15:53,083 epoch 4 - iter 1584/1984 - loss 0.05865109 - time (sec): 96.93 - samples/sec: 1341.23 - lr: 0.000021 - momentum: 0.000000
2023-10-24 18:16:05,301 epoch 4 - iter 1782/1984 - loss 0.05768982 - time (sec): 109.15 - samples/sec: 1349.70 - lr: 0.000020 - momentum: 0.000000
2023-10-24 18:16:17,487 epoch 4 - iter 1980/1984 - loss 0.05892127 - time (sec): 121.34 - samples/sec: 1348.47 - lr: 0.000020 - momentum: 0.000000
2023-10-24 18:16:17,741 ----------------------------------------------------------------------------------------------------
2023-10-24 18:16:17,741 EPOCH 4 done: loss 0.0588 - lr: 0.000020
2023-10-24 18:16:21,164 DEV : loss 0.14774079620838165 - f1-score (micro avg) 0.7464
2023-10-24 18:16:21,179 ----------------------------------------------------------------------------------------------------
2023-10-24 18:16:33,451 epoch 5 - iter 198/1984 - loss 0.05564537 - time (sec): 12.27 - samples/sec: 1372.89 - lr: 0.000020 - momentum: 0.000000
2023-10-24 18:16:45,465 epoch 5 - iter 396/1984 - loss 0.04733119 - time (sec): 24.29 - samples/sec: 1359.12 - lr: 0.000019 - momentum: 0.000000
2023-10-24 18:16:57,576 epoch 5 - iter 594/1984 - loss 0.04636178 - time (sec): 36.40 - samples/sec: 1366.23 - lr: 0.000019 - momentum: 0.000000
2023-10-24 18:17:09,967 epoch 5 - iter 792/1984 - loss 0.04486463 - time (sec): 48.79 - samples/sec: 1379.59 - lr: 0.000019 - momentum: 0.000000
2023-10-24 18:17:22,045 epoch 5 - iter 990/1984 - loss 0.04414787 - time (sec): 60.86 - samples/sec: 1367.00 - lr: 0.000018 - momentum: 0.000000
2023-10-24 18:17:33,989 epoch 5 - iter 1188/1984 - loss 0.04391772 - time (sec): 72.81 - samples/sec: 1357.52 - lr: 0.000018 - momentum: 0.000000
2023-10-24 18:17:46,001 epoch 5 - iter 1386/1984 - loss 0.04395576 - time (sec): 84.82 - samples/sec: 1352.18 - lr: 0.000018 - momentum: 0.000000
2023-10-24 18:17:58,166 epoch 5 - iter 1584/1984 - loss 0.04447887 - time (sec): 96.99 - samples/sec: 1354.57 - lr: 0.000017 - momentum: 0.000000
2023-10-24 18:18:10,146 epoch 5 - iter 1782/1984 - loss 0.04456227 - time (sec): 108.97 - samples/sec: 1347.63 - lr: 0.000017 - momentum: 0.000000
2023-10-24 18:18:22,438 epoch 5 - iter 1980/1984 - loss 0.04356891 - time (sec): 121.26 - samples/sec: 1349.39 - lr: 0.000017 - momentum: 0.000000
2023-10-24 18:18:22,692 ----------------------------------------------------------------------------------------------------
2023-10-24 18:18:22,693 EPOCH 5 done: loss 0.0435 - lr: 0.000017
2023-10-24 18:18:25,801 DEV : loss 0.1785285472869873 - f1-score (micro avg) 0.7691
2023-10-24 18:18:25,816 saving best model
2023-10-24 18:18:26,434 ----------------------------------------------------------------------------------------------------
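[Editor's note] The lr column follows the "LinearScheduler | warmup_fraction: '0.1'" plugin listed above: with 1984 mini-batches per epoch over 10 epochs (19,840 steps), the first 10% of steps (1,984, exactly epoch 1) ramp the rate linearly up to the 3e-05 peak, after which it decays linearly toward zero. This matches the logged values (0.000030 at the end of epoch 1, about 0.000017 by the end of epoch 5). A small sketch of that schedule, under the stated warmup/decay assumption:

# Sketch of the linear warmup/decay schedule implied by the plugin line
# "LinearScheduler | warmup_fraction: '0.1'" (assumption: warmup covers
# the first 10% of total steps, then linear decay to zero).
def linear_lr(step: int, total_steps: int = 19840,
              warmup_fraction: float = 0.1, peak_lr: float = 3e-5) -> float:
    warmup_steps = int(total_steps * warmup_fraction)  # 1984 = one epoch here
    if step < warmup_steps:
        return peak_lr * step / warmup_steps
    return peak_lr * (total_steps - step) / (total_steps - warmup_steps)

# linear_lr(1984) -> 3.0e-05 (end of epoch 1); linear_lr(9920) -> ~1.7e-05
# (end of epoch 5), matching the lr column in the log.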
2023-10-24 18:18:38,509 epoch 6 - iter 198/1984 - loss 0.03770600 - time (sec): 12.07 - samples/sec: 1341.65 - lr: 0.000016 - momentum: 0.000000
2023-10-24 18:18:50,512 epoch 6 - iter 396/1984 - loss 0.03533865 - time (sec): 24.08 - samples/sec: 1337.14 - lr: 0.000016 - momentum: 0.000000
2023-10-24 18:19:02,726 epoch 6 - iter 594/1984 - loss 0.03400472 - time (sec): 36.29 - samples/sec: 1352.00 - lr: 0.000016 - momentum: 0.000000
2023-10-24 18:19:14,776 epoch 6 - iter 792/1984 - loss 0.03310510 - time (sec): 48.34 - samples/sec: 1338.78 - lr: 0.000015 - momentum: 0.000000
2023-10-24 18:19:27,111 epoch 6 - iter 990/1984 - loss 0.03249866 - time (sec): 60.68 - samples/sec: 1350.34 - lr: 0.000015 - momentum: 0.000000
2023-10-24 18:19:39,316 epoch 6 - iter 1188/1984 - loss 0.03090049 - time (sec): 72.88 - samples/sec: 1340.35 - lr: 0.000015 - momentum: 0.000000
2023-10-24 18:19:51,893 epoch 6 - iter 1386/1984 - loss 0.03241745 - time (sec): 85.46 - samples/sec: 1340.48 - lr: 0.000014 - momentum: 0.000000
2023-10-24 18:20:03,917 epoch 6 - iter 1584/1984 - loss 0.03297873 - time (sec): 97.48 - samples/sec: 1338.72 - lr: 0.000014 - momentum: 0.000000
2023-10-24 18:20:16,009 epoch 6 - iter 1782/1984 - loss 0.03333018 - time (sec): 109.57 - samples/sec: 1338.46 - lr: 0.000014 - momentum: 0.000000
2023-10-24 18:20:28,225 epoch 6 - iter 1980/1984 - loss 0.03294409 - time (sec): 121.79 - samples/sec: 1344.26 - lr: 0.000013 - momentum: 0.000000
2023-10-24 18:20:28,466 ----------------------------------------------------------------------------------------------------
2023-10-24 18:20:28,467 EPOCH 6 done: loss 0.0329 - lr: 0.000013
2023-10-24 18:20:31,597 DEV : loss 0.1928633749485016 - f1-score (micro avg) 0.7655
2023-10-24 18:20:31,612 ----------------------------------------------------------------------------------------------------
2023-10-24 18:20:44,023 epoch 7 - iter 198/1984 - loss 0.02473915 - time (sec): 12.41 - samples/sec: 1414.46 - lr: 0.000013 - momentum: 0.000000
2023-10-24 18:20:56,221 epoch 7 - iter 396/1984 - loss 0.02715590 - time (sec): 24.61 - samples/sec: 1392.39 - lr: 0.000013 - momentum: 0.000000
2023-10-24 18:21:08,255 epoch 7 - iter 594/1984 - loss 0.02685085 - time (sec): 36.64 - samples/sec: 1381.08 - lr: 0.000012 - momentum: 0.000000
2023-10-24 18:21:20,492 epoch 7 - iter 792/1984 - loss 0.02566708 - time (sec): 48.88 - samples/sec: 1369.54 - lr: 0.000012 - momentum: 0.000000
2023-10-24 18:21:32,601 epoch 7 - iter 990/1984 - loss 0.02548493 - time (sec): 60.99 - samples/sec: 1361.78 - lr: 0.000012 - momentum: 0.000000
2023-10-24 18:21:44,601 epoch 7 - iter 1188/1984 - loss 0.02537405 - time (sec): 72.99 - samples/sec: 1354.50 - lr: 0.000011 - momentum: 0.000000
2023-10-24 18:21:56,645 epoch 7 - iter 1386/1984 - loss 0.02423187 - time (sec): 85.03 - samples/sec: 1348.01 - lr: 0.000011 - momentum: 0.000000
2023-10-24 18:22:08,738 epoch 7 - iter 1584/1984 - loss 0.02555620 - time (sec): 97.12 - samples/sec: 1347.41 - lr: 0.000011 - momentum: 0.000000
2023-10-24 18:22:20,847 epoch 7 - iter 1782/1984 - loss 0.02502907 - time (sec): 109.23 - samples/sec: 1349.96 - lr: 0.000010 - momentum: 0.000000
2023-10-24 18:22:32,913 epoch 7 - iter 1980/1984 - loss 0.02462085 - time (sec): 121.30 - samples/sec: 1348.75 - lr: 0.000010 - momentum: 0.000000
2023-10-24 18:22:33,203 ----------------------------------------------------------------------------------------------------
2023-10-24 18:22:33,203 EPOCH 7 done: loss 0.0246 - lr: 0.000010
2023-10-24 18:22:36,323 DEV : loss 0.20923519134521484 - f1-score (micro avg) 0.7735
2023-10-24 18:22:36,338 saving best model
2023-10-24 18:22:36,954 ----------------------------------------------------------------------------------------------------
2023-10-24 18:22:49,099 epoch 8 - iter 198/1984 - loss 0.01338123 - time (sec): 12.14 - samples/sec: 1349.93 - lr: 0.000010 - momentum: 0.000000
2023-10-24 18:23:01,404 epoch 8 - iter 396/1984 - loss 0.01458755 - time (sec): 24.45 - samples/sec: 1372.58 - lr: 0.000009 - momentum: 0.000000
2023-10-24 18:23:13,503 epoch 8 - iter 594/1984 - loss 0.01472531 - time (sec): 36.55 - samples/sec: 1363.60 - lr: 0.000009 - momentum: 0.000000
2023-10-24 18:23:25,875 epoch 8 - iter 792/1984 - loss 0.01510012 - time (sec): 48.92 - samples/sec: 1352.70 - lr: 0.000009 - momentum: 0.000000
2023-10-24 18:23:38,187 epoch 8 - iter 990/1984 - loss 0.01631973 - time (sec): 61.23 - samples/sec: 1362.06 - lr: 0.000008 - momentum: 0.000000
2023-10-24 18:23:50,209 epoch 8 - iter 1188/1984 - loss 0.01696770 - time (sec): 73.25 - samples/sec: 1358.52 - lr: 0.000008 - momentum: 0.000000
2023-10-24 18:24:02,466 epoch 8 - iter 1386/1984 - loss 0.01742849 - time (sec): 85.51 - samples/sec: 1352.90 - lr: 0.000008 - momentum: 0.000000
2023-10-24 18:24:14,519 epoch 8 - iter 1584/1984 - loss 0.01641470 - time (sec): 97.56 - samples/sec: 1345.67 - lr: 0.000007 - momentum: 0.000000
2023-10-24 18:24:26,773 epoch 8 - iter 1782/1984 - loss 0.01644926 - time (sec): 109.82 - samples/sec: 1349.03 - lr: 0.000007 - momentum: 0.000000
2023-10-24 18:24:38,678 epoch 8 - iter 1980/1984 - loss 0.01625596 - time (sec): 121.72 - samples/sec: 1344.35 - lr: 0.000007 - momentum: 0.000000
2023-10-24 18:24:38,916 ----------------------------------------------------------------------------------------------------
2023-10-24 18:24:38,916 EPOCH 8 done: loss 0.0162 - lr: 0.000007
2023-10-24 18:24:42,041 DEV : loss 0.22515711188316345 - f1-score (micro avg) 0.7676
2023-10-24 18:24:42,056 ----------------------------------------------------------------------------------------------------
2023-10-24 18:24:54,265 epoch 9 - iter 198/1984 - loss 0.00852019 - time (sec): 12.21 - samples/sec: 1354.85 - lr: 0.000006 - momentum: 0.000000
2023-10-24 18:25:06,179 epoch 9 - iter 396/1984 - loss 0.01328350 - time (sec): 24.12 - samples/sec: 1336.12 - lr: 0.000006 - momentum: 0.000000
2023-10-24 18:25:18,591 epoch 9 - iter 594/1984 - loss 0.01339687 - time (sec): 36.53 - samples/sec: 1348.93 - lr: 0.000006 - momentum: 0.000000
2023-10-24 18:25:30,817 epoch 9 - iter 792/1984 - loss 0.01234555 - time (sec): 48.76 - samples/sec: 1347.68 - lr: 0.000005 - momentum: 0.000000
2023-10-24 18:25:42,890 epoch 9 - iter 990/1984 - loss 0.01167028 - time (sec): 60.83 - samples/sec: 1352.82 - lr: 0.000005 - momentum: 0.000000
2023-10-24 18:25:54,838 epoch 9 - iter 1188/1984 - loss 0.01071810 - time (sec): 72.78 - samples/sec: 1344.58 - lr: 0.000005 - momentum: 0.000000
2023-10-24 18:26:07,055 epoch 9 - iter 1386/1984 - loss 0.01111198 - time (sec): 85.00 - samples/sec: 1348.21 - lr: 0.000004 - momentum: 0.000000
2023-10-24 18:26:19,193 epoch 9 - iter 1584/1984 - loss 0.01162792 - time (sec): 97.14 - samples/sec: 1349.38 - lr: 0.000004 - momentum: 0.000000
2023-10-24 18:26:31,487 epoch 9 - iter 1782/1984 - loss 0.01233370 - time (sec): 109.43 - samples/sec: 1347.80 - lr: 0.000004 - momentum: 0.000000
2023-10-24 18:26:43,546 epoch 9 - iter 1980/1984 - loss 0.01214635 - time (sec): 121.49 - samples/sec: 1347.15 - lr: 0.000003 - momentum: 0.000000
2023-10-24 18:26:43,791 ----------------------------------------------------------------------------------------------------
2023-10-24 18:26:43,791 EPOCH 9 done: loss 0.0122 - lr: 0.000003
2023-10-24 18:26:46,921 DEV : loss 0.2285650223493576 - f1-score (micro avg) 0.7685
2023-10-24 18:26:46,936 ----------------------------------------------------------------------------------------------------
2023-10-24 18:26:59,335 epoch 10 - iter 198/1984 - loss 0.00472620 - time (sec): 12.40 - samples/sec: 1283.74 - lr: 0.000003 - momentum: 0.000000
2023-10-24 18:27:11,443 epoch 10 - iter 396/1984 - loss 0.00589847 - time (sec): 24.51 - samples/sec: 1336.66 - lr: 0.000003 - momentum: 0.000000
2023-10-24 18:27:23,523 epoch 10 - iter 594/1984 - loss 0.00695023 - time (sec): 36.59 - samples/sec: 1327.88 - lr: 0.000002 - momentum: 0.000000
2023-10-24 18:27:35,805 epoch 10 - iter 792/1984 - loss 0.00775105 - time (sec): 48.87 - samples/sec: 1333.36 - lr: 0.000002 - momentum: 0.000000
2023-10-24 18:27:48,037 epoch 10 - iter 990/1984 - loss 0.00815130 - time (sec): 61.10 - samples/sec: 1337.31 - lr: 0.000002 - momentum: 0.000000
2023-10-24 18:28:00,332 epoch 10 - iter 1188/1984 - loss 0.00801025 - time (sec): 73.40 - samples/sec: 1342.12 - lr: 0.000001 - momentum: 0.000000
2023-10-24 18:28:12,341 epoch 10 - iter 1386/1984 - loss 0.00793725 - time (sec): 85.40 - samples/sec: 1342.25 - lr: 0.000001 - momentum: 0.000000
2023-10-24 18:28:24,359 epoch 10 - iter 1584/1984 - loss 0.00790952 - time (sec): 97.42 - samples/sec: 1340.60 - lr: 0.000001 - momentum: 0.000000
2023-10-24 18:28:36,593 epoch 10 - iter 1782/1984 - loss 0.00760356 - time (sec): 109.66 - samples/sec: 1347.44 - lr: 0.000000 - momentum: 0.000000
2023-10-24 18:28:48,671 epoch 10 - iter 1980/1984 - loss 0.00764524 - time (sec): 121.73 - samples/sec: 1344.84 - lr: 0.000000 - momentum: 0.000000
2023-10-24 18:28:48,912 ----------------------------------------------------------------------------------------------------
2023-10-24 18:28:48,912 EPOCH 10 done: loss 0.0076 - lr: 0.000000
2023-10-24 18:28:52,025 DEV : loss 0.23562917113304138 - f1-score (micro avg) 0.7713
2023-10-24 18:28:52,525 ----------------------------------------------------------------------------------------------------
2023-10-24 18:28:52,526 Loading model from best epoch ...
2023-10-24 18:28:53,997 SequenceTagger predicts: Dictionary with 13 tags: O, S-PER, B-PER, E-PER, I-PER, S-LOC, B-LOC, E-LOC, I-LOC, S-ORG, B-ORG, E-ORG, I-ORG
2023-10-24 18:28:57,085 Results:
- F-score (micro) 0.7853
- F-score (macro) 0.6936
- Accuracy 0.6639

By class:
              precision    recall  f1-score   support

         LOC     0.8353    0.8595    0.8473       655
         PER     0.7250    0.7803    0.7516       223
         ORG     0.5699    0.4173    0.4818       127

   micro avg     0.7845    0.7861    0.7853      1005
   macro avg     0.7101    0.6857    0.6936      1005
weighted avg     0.7773    0.7861    0.7799      1005

2023-10-24 18:28:57,085 ----------------------------------------------------------------------------------------------------
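[Editor's note] The best checkpoint (best-model.pt, dev micro-F1 0.7735 from epoch 7) is what the final test evaluation above was run on. A minimal sketch of loading it for inference with Flair's standard API follows; the path is elided as in the log, and the French sentence is purely illustrative:

from flair.data import Sentence
from flair.models import SequenceTagger

# Load the checkpoint written at the "saving best model" steps above.
tagger = SequenceTagger.load("hmbench-icdar/fr-.../best-model.pt")  # path elided

sentence = Sentence("Le maire de Marseille a rencontré M. Dupont .")
tagger.predict(sentence)
for entity in sentence.get_spans("ner"):
    # Spans are decoded from the 13-tag BIOES dictionary listed above
    # (PER / LOC / ORG plus O).
    print(entity)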