2023-10-24 19:20:16,509 ----------------------------------------------------------------------------------------------------
2023-10-24 19:20:16,510 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(64001, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
            (intermediate): BertIntermediate(
              (dense): Linear(in_features=768, out_features=3072, bias=True)
              (intermediate_act_fn): GELUActivation()
            )
            (output): BertOutput(
              (dense): Linear(in_features=3072, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
        )
      )
      (pooler): BertPooler(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (activation): Tanh()
      )
    )
  )
  (locked_dropout): LockedDropout(p=0.5)
  (linear): Linear(in_features=768, out_features=13, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2023-10-24 19:20:16,511 ----------------------------------------------------------------------------------------------------
2023-10-24 19:20:16,511 MultiCorpus: 7936 train + 992 dev + 992 test sentences
 - NER_ICDAR_EUROPEANA Corpus: 7936 train + 992 dev + 992 test sentences - /home/ubuntu/.flair/datasets/ner_icdar_europeana/fr
2023-10-24 19:20:16,512 ----------------------------------------------------------------------------------------------------
2023-10-24 19:20:16,512 Train: 7936 sentences
2023-10-24 19:20:16,512 (train_with_dev=False, train_with_test=False)
2023-10-24 19:20:16,512 ----------------------------------------------------------------------------------------------------
2023-10-24 19:20:16,512 Training Params:
2023-10-24 19:20:16,512  - learning_rate: "3e-05"
2023-10-24 19:20:16,512  - mini_batch_size: "4"
2023-10-24 19:20:16,512  - max_epochs: "10"
2023-10-24 19:20:16,512  - shuffle: "True"
2023-10-24 19:20:16,512 ----------------------------------------------------------------------------------------------------
2023-10-24 19:20:16,512 Plugins:
2023-10-24 19:20:16,512  - TensorboardLogger
2023-10-24 19:20:16,512  - LinearScheduler | warmup_fraction: '0.1'
2023-10-24 19:20:16,512 ----------------------------------------------------------------------------------------------------
2023-10-24 19:20:16,512 Final evaluation on model from best epoch (best-model.pt)
2023-10-24 19:20:16,512  - metric: "('micro avg', 'f1-score')"
2023-10-24 19:20:16,512 ----------------------------------------------------------------------------------------------------
2023-10-24 19:20:16,512 Computation:
2023-10-24 19:20:16,512  - compute on device: cuda:0
2023-10-24 19:20:16,512  - embedding storage: none
2023-10-24 19:20:16,512 ----------------------------------------------------------------------------------------------------
2023-10-24 19:20:16,512 Model training base path: "hmbench-icdar/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs4-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-5"
2023-10-24 19:20:16,512 ----------------------------------------------------------------------------------------------------
2023-10-24 19:20:16,513 ----------------------------------------------------------------------------------------------------
2023-10-24 19:20:16,513 Logging anything other than scalars to TensorBoard is currently not supported.
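The setup logged above corresponds roughly to the following Flair sketch. This is a minimal reconstruction from the Training Params, Plugins, and base-path flags only, not the actual hmBench training script; the NER_ICDAR_EUROPEANA dataset class and argument names should be checked against the installed Flair version, and the mapping of base-path flags (wsFalse, poolingfirst, layers-1, crfFalse) to constructor arguments is an assumption.

    # Minimal sketch of the run logged above (assumed setup, not the hmBench script).
    from flair.datasets import NER_ICDAR_EUROPEANA
    from flair.embeddings import TransformerWordEmbeddings
    from flair.models import SequenceTagger
    from flair.trainers import ModelTrainer

    # French ICDAR-Europeana NER corpus: 7936 train / 992 dev / 992 test sentences.
    corpus = NER_ICDAR_EUROPEANA(language="fr")
    label_dict = corpus.make_label_dictionary(label_type="ner")

    # "layers-1" / "poolingfirst" in the base path: last transformer layer only,
    # first-subtoken pooling. "wsFalse" presumably means no document context.
    embeddings = TransformerWordEmbeddings(
        model="dbmdz/bert-base-historic-multilingual-64k-td-cased",
        layers="-1",
        subtoken_pooling="first",
        fine_tune=True,
        use_context=False,
    )

    # "crfFalse" and the model dump above (no RNN, Linear 768 -> 13, CrossEntropyLoss):
    # a plain linear projection over BIOES tags.
    tagger = SequenceTagger(
        embeddings=embeddings,
        tag_dictionary=label_dict,
        tag_type="ner",
        use_crf=False,
        use_rnn=False,
        reproject_embeddings=False,
    )

    # fine_tune() uses a linear LR schedule with 10% warmup by default,
    # matching the LinearScheduler | warmup_fraction: '0.1' plugin above.
    trainer = ModelTrainer(tagger, corpus)
    trainer.fine_tune(
        "hmbench-icdar/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs4-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-5",
        learning_rate=3e-5,
        mini_batch_size=4,
        max_epochs=10,
    )

The warmup is visible in the epoch 1 records below: the lr column ramps from 0.000003 up to the peak 0.000030 over the first tenth of training, then decays linearly to zero by epoch 10.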
2023-10-24 19:20:28,549 epoch 1 - iter 198/1984 - loss 1.32014092 - time (sec): 12.04 - samples/sec: 1372.54 - lr: 0.000003 - momentum: 0.000000
2023-10-24 19:20:40,509 epoch 1 - iter 396/1984 - loss 0.82590169 - time (sec): 24.00 - samples/sec: 1347.40 - lr: 0.000006 - momentum: 0.000000
2023-10-24 19:20:52,633 epoch 1 - iter 594/1984 - loss 0.62234473 - time (sec): 36.12 - samples/sec: 1359.25 - lr: 0.000009 - momentum: 0.000000
2023-10-24 19:21:04,703 epoch 1 - iter 792/1984 - loss 0.51779669 - time (sec): 48.19 - samples/sec: 1349.30 - lr: 0.000012 - momentum: 0.000000
2023-10-24 19:21:16,796 epoch 1 - iter 990/1984 - loss 0.44752052 - time (sec): 60.28 - samples/sec: 1352.27 - lr: 0.000015 - momentum: 0.000000
2023-10-24 19:21:28,869 epoch 1 - iter 1188/1984 - loss 0.39941383 - time (sec): 72.36 - samples/sec: 1352.98 - lr: 0.000018 - momentum: 0.000000
2023-10-24 19:21:40,973 epoch 1 - iter 1386/1984 - loss 0.36252978 - time (sec): 84.46 - samples/sec: 1355.89 - lr: 0.000021 - momentum: 0.000000
2023-10-24 19:21:53,022 epoch 1 - iter 1584/1984 - loss 0.33296148 - time (sec): 96.51 - samples/sec: 1351.32 - lr: 0.000024 - momentum: 0.000000
2023-10-24 19:22:05,124 epoch 1 - iter 1782/1984 - loss 0.31002868 - time (sec): 108.61 - samples/sec: 1353.86 - lr: 0.000027 - momentum: 0.000000
2023-10-24 19:22:17,309 epoch 1 - iter 1980/1984 - loss 0.29254745 - time (sec): 120.80 - samples/sec: 1355.42 - lr: 0.000030 - momentum: 0.000000
2023-10-24 19:22:17,540 ----------------------------------------------------------------------------------------------------
2023-10-24 19:22:17,540 EPOCH 1 done: loss 0.2922 - lr: 0.000030
2023-10-24 19:22:20,599 DEV : loss 0.11393631994724274 - f1-score (micro avg) 0.7142
2023-10-24 19:22:20,614 saving best model
2023-10-24 19:22:21,081 ----------------------------------------------------------------------------------------------------
2023-10-24 19:22:33,239 epoch 2 - iter 198/1984 - loss 0.11109567 - time (sec): 12.16 - samples/sec: 1354.61 - lr: 0.000030 - momentum: 0.000000
2023-10-24 19:22:45,371 epoch 2 - iter 396/1984 - loss 0.11188014 - time (sec): 24.29 - samples/sec: 1357.95 - lr: 0.000029 - momentum: 0.000000
2023-10-24 19:22:57,459 epoch 2 - iter 594/1984 - loss 0.12088148 - time (sec): 36.38 - samples/sec: 1369.44 - lr: 0.000029 - momentum: 0.000000
2023-10-24 19:23:09,475 epoch 2 - iter 792/1984 - loss 0.12005888 - time (sec): 48.39 - samples/sec: 1355.48 - lr: 0.000029 - momentum: 0.000000
2023-10-24 19:23:21,419 epoch 2 - iter 990/1984 - loss 0.11907747 - time (sec): 60.34 - samples/sec: 1344.35 - lr: 0.000028 - momentum: 0.000000
2023-10-24 19:23:33,494 epoch 2 - iter 1188/1984 - loss 0.11881279 - time (sec): 72.41 - samples/sec: 1342.66 - lr: 0.000028 - momentum: 0.000000
2023-10-24 19:23:45,448 epoch 2 - iter 1386/1984 - loss 0.11809499 - time (sec): 84.37 - samples/sec: 1344.77 - lr: 0.000028 - momentum: 0.000000
2023-10-24 19:23:57,836 epoch 2 - iter 1584/1984 - loss 0.11496911 - time (sec): 96.75 - samples/sec: 1350.51 - lr: 0.000027 - momentum: 0.000000
2023-10-24 19:24:10,044 epoch 2 - iter 1782/1984 - loss 0.11353484 - time (sec): 108.96 - samples/sec: 1347.87 - lr: 0.000027 - momentum: 0.000000
2023-10-24 19:24:22,155 epoch 2 - iter 1980/1984 - loss 0.11233027 - time (sec): 121.07 - samples/sec: 1353.14 - lr: 0.000027 - momentum: 0.000000
2023-10-24 19:24:22,383 ----------------------------------------------------------------------------------------------------
2023-10-24 19:24:22,383 EPOCH 2 done: loss 0.1126 - lr: 0.000027
2023-10-24 19:24:25,798 DEV : loss 0.11593124270439148 - f1-score (micro avg) 0.7271
2023-10-24 19:24:25,813 saving best model
2023-10-24 19:24:26,407 ----------------------------------------------------------------------------------------------------
2023-10-24 19:24:38,620 epoch 3 - iter 198/1984 - loss 0.08265116 - time (sec): 12.21 - samples/sec: 1416.54 - lr: 0.000026 - momentum: 0.000000
2023-10-24 19:24:50,809 epoch 3 - iter 396/1984 - loss 0.07804517 - time (sec): 24.40 - samples/sec: 1401.12 - lr: 0.000026 - momentum: 0.000000
2023-10-24 19:25:03,281 epoch 3 - iter 594/1984 - loss 0.08404645 - time (sec): 36.87 - samples/sec: 1385.28 - lr: 0.000026 - momentum: 0.000000
2023-10-24 19:25:15,312 epoch 3 - iter 792/1984 - loss 0.08231776 - time (sec): 48.90 - samples/sec: 1359.00 - lr: 0.000025 - momentum: 0.000000
2023-10-24 19:25:27,283 epoch 3 - iter 990/1984 - loss 0.08429583 - time (sec): 60.87 - samples/sec: 1354.01 - lr: 0.000025 - momentum: 0.000000
2023-10-24 19:25:39,410 epoch 3 - iter 1188/1984 - loss 0.08375321 - time (sec): 73.00 - samples/sec: 1341.90 - lr: 0.000025 - momentum: 0.000000
2023-10-24 19:25:51,397 epoch 3 - iter 1386/1984 - loss 0.08571122 - time (sec): 84.99 - samples/sec: 1343.09 - lr: 0.000024 - momentum: 0.000000
2023-10-24 19:26:03,534 epoch 3 - iter 1584/1984 - loss 0.08551290 - time (sec): 97.13 - samples/sec: 1344.76 - lr: 0.000024 - momentum: 0.000000
2023-10-24 19:26:15,614 epoch 3 - iter 1782/1984 - loss 0.08579605 - time (sec): 109.21 - samples/sec: 1348.07 - lr: 0.000024 - momentum: 0.000000
2023-10-24 19:26:27,719 epoch 3 - iter 1980/1984 - loss 0.08549504 - time (sec): 121.31 - samples/sec: 1349.14 - lr: 0.000023 - momentum: 0.000000
2023-10-24 19:26:27,962 ----------------------------------------------------------------------------------------------------
2023-10-24 19:26:27,962 EPOCH 3 done: loss 0.0857 - lr: 0.000023
2023-10-24 19:26:31,075 DEV : loss 0.12287832796573639 - f1-score (micro avg) 0.756
2023-10-24 19:26:31,090 saving best model
2023-10-24 19:26:31,684 ----------------------------------------------------------------------------------------------------
2023-10-24 19:26:43,776 epoch 4 - iter 198/1984 - loss 0.05492174 - time (sec): 12.09 - samples/sec: 1306.03 - lr: 0.000023 - momentum: 0.000000
2023-10-24 19:26:55,794 epoch 4 - iter 396/1984 - loss 0.06173660 - time (sec): 24.11 - samples/sec: 1325.23 - lr: 0.000023 - momentum: 0.000000
2023-10-24 19:27:07,916 epoch 4 - iter 594/1984 - loss 0.06108648 - time (sec): 36.23 - samples/sec: 1329.97 - lr: 0.000022 - momentum: 0.000000
2023-10-24 19:27:19,998 epoch 4 - iter 792/1984 - loss 0.06054768 - time (sec): 48.31 - samples/sec: 1333.58 - lr: 0.000022 - momentum: 0.000000
2023-10-24 19:27:32,167 epoch 4 - iter 990/1984 - loss 0.06244785 - time (sec): 60.48 - samples/sec: 1341.29 - lr: 0.000022 - momentum: 0.000000
2023-10-24 19:27:44,204 epoch 4 - iter 1188/1984 - loss 0.06144580 - time (sec): 72.52 - samples/sec: 1342.52 - lr: 0.000021 - momentum: 0.000000
2023-10-24 19:27:56,373 epoch 4 - iter 1386/1984 - loss 0.06113227 - time (sec): 84.69 - samples/sec: 1348.41 - lr: 0.000021 - momentum: 0.000000
2023-10-24 19:28:09,135 epoch 4 - iter 1584/1984 - loss 0.06036745 - time (sec): 97.45 - samples/sec: 1352.40 - lr: 0.000021 - momentum: 0.000000
2023-10-24 19:28:21,273 epoch 4 - iter 1782/1984 - loss 0.06099629 - time (sec): 109.59 - samples/sec: 1351.83 - lr: 0.000020 - momentum: 0.000000
2023-10-24 19:28:33,327 epoch 4 - iter 1980/1984 - loss 0.06067905 - time (sec): 121.64 - samples/sec: 1346.46 - lr: 0.000020 - momentum: 0.000000
2023-10-24 19:28:33,553 ----------------------------------------------------------------------------------------------------
2023-10-24 19:28:33,553 EPOCH 4 done: loss 0.0607 - lr: 0.000020
2023-10-24 19:28:36,684 DEV : loss 0.1927175521850586 - f1-score (micro avg) 0.7183
2023-10-24 19:28:36,699 ----------------------------------------------------------------------------------------------------
2023-10-24 19:28:48,910 epoch 5 - iter 198/1984 - loss 0.04671831 - time (sec): 12.21 - samples/sec: 1361.09 - lr: 0.000020 - momentum: 0.000000
2023-10-24 19:29:01,027 epoch 5 - iter 396/1984 - loss 0.04574779 - time (sec): 24.33 - samples/sec: 1356.48 - lr: 0.000019 - momentum: 0.000000
2023-10-24 19:29:13,073 epoch 5 - iter 594/1984 - loss 0.04539830 - time (sec): 36.37 - samples/sec: 1356.89 - lr: 0.000019 - momentum: 0.000000
2023-10-24 19:29:25,286 epoch 5 - iter 792/1984 - loss 0.04680807 - time (sec): 48.59 - samples/sec: 1358.20 - lr: 0.000019 - momentum: 0.000000
2023-10-24 19:29:37,625 epoch 5 - iter 990/1984 - loss 0.04441270 - time (sec): 60.93 - samples/sec: 1373.34 - lr: 0.000018 - momentum: 0.000000
2023-10-24 19:29:49,703 epoch 5 - iter 1188/1984 - loss 0.04380522 - time (sec): 73.00 - samples/sec: 1369.44 - lr: 0.000018 - momentum: 0.000000
2023-10-24 19:30:02,105 epoch 5 - iter 1386/1984 - loss 0.04443524 - time (sec): 85.41 - samples/sec: 1371.38 - lr: 0.000018 - momentum: 0.000000
2023-10-24 19:30:14,134 epoch 5 - iter 1584/1984 - loss 0.04578146 - time (sec): 97.43 - samples/sec: 1363.32 - lr: 0.000017 - momentum: 0.000000
2023-10-24 19:30:26,194 epoch 5 - iter 1782/1984 - loss 0.04603563 - time (sec): 109.49 - samples/sec: 1350.91 - lr: 0.000017 - momentum: 0.000000
2023-10-24 19:30:38,245 epoch 5 - iter 1980/1984 - loss 0.04522297 - time (sec): 121.55 - samples/sec: 1347.11 - lr: 0.000017 - momentum: 0.000000
2023-10-24 19:30:38,479 ----------------------------------------------------------------------------------------------------
2023-10-24 19:30:38,479 EPOCH 5 done: loss 0.0455 - lr: 0.000017
2023-10-24 19:30:41,600 DEV : loss 0.1995469629764557 - f1-score (micro avg) 0.7543
2023-10-24 19:30:41,615 ----------------------------------------------------------------------------------------------------
2023-10-24 19:30:53,777 epoch 6 - iter 198/1984 - loss 0.03231291 - time (sec): 12.16 - samples/sec: 1357.15 - lr: 0.000016 - momentum: 0.000000
2023-10-24 19:31:05,778 epoch 6 - iter 396/1984 - loss 0.03397784 - time (sec): 24.16 - samples/sec: 1328.30 - lr: 0.000016 - momentum: 0.000000
2023-10-24 19:31:17,884 epoch 6 - iter 594/1984 - loss 0.03041311 - time (sec): 36.27 - samples/sec: 1346.65 - lr: 0.000016 - momentum: 0.000000
2023-10-24 19:31:30,370 epoch 6 - iter 792/1984 - loss 0.03190695 - time (sec): 48.75 - samples/sec: 1370.84 - lr: 0.000015 - momentum: 0.000000
2023-10-24 19:31:42,871 epoch 6 - iter 990/1984 - loss 0.03341697 - time (sec): 61.25 - samples/sec: 1348.17 - lr: 0.000015 - momentum: 0.000000
2023-10-24 19:31:54,884 epoch 6 - iter 1188/1984 - loss 0.03375744 - time (sec): 73.27 - samples/sec: 1340.20 - lr: 0.000015 - momentum: 0.000000
2023-10-24 19:32:06,956 epoch 6 - iter 1386/1984 - loss 0.03312953 - time (sec): 85.34 - samples/sec: 1337.39 - lr: 0.000014 - momentum: 0.000000
2023-10-24 19:32:19,204 epoch 6 - iter 1584/1984 - loss 0.03383901 - time (sec): 97.59 - samples/sec: 1339.64 - lr: 0.000014 - momentum: 0.000000
2023-10-24 19:32:31,450 epoch 6 - iter 1782/1984 - loss 0.03389974 - time (sec): 109.83 - samples/sec: 1343.73 - lr: 0.000014 - momentum: 0.000000
2023-10-24 19:32:43,633 epoch 6 - iter 1980/1984 - loss 0.03407852 - time (sec): 122.02 - samples/sec: 1341.61 - lr: 0.000013 - momentum: 0.000000
2023-10-24 19:32:43,869 ----------------------------------------------------------------------------------------------------
2023-10-24 19:32:43,870 EPOCH 6 done: loss 0.0340 - lr: 0.000013
2023-10-24 19:32:46,992 DEV : loss 0.20763596892356873 - f1-score (micro avg) 0.774
2023-10-24 19:32:47,007 saving best model
2023-10-24 19:32:47,627 ----------------------------------------------------------------------------------------------------
2023-10-24 19:32:59,661 epoch 7 - iter 198/1984 - loss 0.02133725 - time (sec): 12.03 - samples/sec: 1354.69 - lr: 0.000013 - momentum: 0.000000
2023-10-24 19:33:11,642 epoch 7 - iter 396/1984 - loss 0.02091982 - time (sec): 24.01 - samples/sec: 1330.34 - lr: 0.000013 - momentum: 0.000000
2023-10-24 19:33:23,661 epoch 7 - iter 594/1984 - loss 0.02302967 - time (sec): 36.03 - samples/sec: 1331.94 - lr: 0.000012 - momentum: 0.000000
2023-10-24 19:33:36,085 epoch 7 - iter 792/1984 - loss 0.02423421 - time (sec): 48.46 - samples/sec: 1350.67 - lr: 0.000012 - momentum: 0.000000
2023-10-24 19:33:48,050 epoch 7 - iter 990/1984 - loss 0.02379088 - time (sec): 60.42 - samples/sec: 1344.83 - lr: 0.000012 - momentum: 0.000000
2023-10-24 19:34:00,094 epoch 7 - iter 1188/1984 - loss 0.02343269 - time (sec): 72.47 - samples/sec: 1345.70 - lr: 0.000011 - momentum: 0.000000
2023-10-24 19:34:12,236 epoch 7 - iter 1386/1984 - loss 0.02524471 - time (sec): 84.61 - samples/sec: 1347.56 - lr: 0.000011 - momentum: 0.000000
2023-10-24 19:34:24,424 epoch 7 - iter 1584/1984 - loss 0.02464132 - time (sec): 96.80 - samples/sec: 1347.23 - lr: 0.000011 - momentum: 0.000000
2023-10-24 19:34:36,539 epoch 7 - iter 1782/1984 - loss 0.02431973 - time (sec): 108.91 - samples/sec: 1347.67 - lr: 0.000010 - momentum: 0.000000
2023-10-24 19:34:48,863 epoch 7 - iter 1980/1984 - loss 0.02469436 - time (sec): 121.23 - samples/sec: 1350.73 - lr: 0.000010 - momentum: 0.000000
2023-10-24 19:34:49,087 ----------------------------------------------------------------------------------------------------
2023-10-24 19:34:49,087 EPOCH 7 done: loss 0.0247 - lr: 0.000010
2023-10-24 19:34:52,192 DEV : loss 0.22793228924274445 - f1-score (micro avg) 0.7628
2023-10-24 19:34:52,207 ----------------------------------------------------------------------------------------------------
2023-10-24 19:35:04,591 epoch 8 - iter 198/1984 - loss 0.01770350 - time (sec): 12.38 - samples/sec: 1317.10 - lr: 0.000010 - momentum: 0.000000
2023-10-24 19:35:16,521 epoch 8 - iter 396/1984 - loss 0.01496629 - time (sec): 24.31 - samples/sec: 1309.49 - lr: 0.000009 - momentum: 0.000000
2023-10-24 19:35:28,965 epoch 8 - iter 594/1984 - loss 0.01607839 - time (sec): 36.76 - samples/sec: 1345.79 - lr: 0.000009 - momentum: 0.000000
2023-10-24 19:35:40,927 epoch 8 - iter 792/1984 - loss 0.01770098 - time (sec): 48.72 - samples/sec: 1335.77 - lr: 0.000009 - momentum: 0.000000
2023-10-24 19:35:53,081 epoch 8 - iter 990/1984 - loss 0.01788262 - time (sec): 60.87 - samples/sec: 1342.78 - lr: 0.000008 - momentum: 0.000000
2023-10-24 19:36:05,273 epoch 8 - iter 1188/1984 - loss 0.01779408 - time (sec): 73.06 - samples/sec: 1342.96 - lr: 0.000008 - momentum: 0.000000
2023-10-24 19:36:17,387 epoch 8 - iter 1386/1984 - loss 0.01703299 - time (sec): 85.18 - samples/sec: 1345.43 - lr: 0.000008 - momentum: 0.000000
2023-10-24 19:36:29,612 epoch 8 - iter 1584/1984 - loss 0.01716121 - time (sec): 97.40 - samples/sec: 1341.02 - lr: 0.000007 - momentum: 0.000000
2023-10-24 19:36:41,523 epoch 8 - iter 1782/1984 - loss 0.01672368 - time (sec): 109.31 - samples/sec: 1345.12 - lr: 0.000007 - momentum: 0.000000
2023-10-24 19:36:53,631 epoch 8 - iter 1980/1984 - loss 0.01650254 - time (sec): 121.42 - samples/sec: 1347.56 - lr: 0.000007 - momentum: 0.000000
2023-10-24 19:36:53,873 ----------------------------------------------------------------------------------------------------
2023-10-24 19:36:53,873 EPOCH 8 done: loss 0.0165 - lr: 0.000007
2023-10-24 19:36:56,983 DEV : loss 0.23265020549297333 - f1-score (micro avg) 0.765
2023-10-24 19:36:56,998 ----------------------------------------------------------------------------------------------------
2023-10-24 19:37:08,934 epoch 9 - iter 198/1984 - loss 0.01460665 - time (sec): 11.94 - samples/sec: 1310.79 - lr: 0.000006 - momentum: 0.000000
2023-10-24 19:37:20,956 epoch 9 - iter 396/1984 - loss 0.01489846 - time (sec): 23.96 - samples/sec: 1330.22 - lr: 0.000006 - momentum: 0.000000
2023-10-24 19:37:32,982 epoch 9 - iter 594/1984 - loss 0.01459467 - time (sec): 35.98 - samples/sec: 1306.08 - lr: 0.000006 - momentum: 0.000000
2023-10-24 19:37:45,075 epoch 9 - iter 792/1984 - loss 0.01270058 - time (sec): 48.08 - samples/sec: 1327.28 - lr: 0.000005 - momentum: 0.000000
2023-10-24 19:37:57,137 epoch 9 - iter 990/1984 - loss 0.01265837 - time (sec): 60.14 - samples/sec: 1334.58 - lr: 0.000005 - momentum: 0.000000
2023-10-24 19:38:09,538 epoch 9 - iter 1188/1984 - loss 0.01297891 - time (sec): 72.54 - samples/sec: 1344.49 - lr: 0.000005 - momentum: 0.000000
2023-10-24 19:38:21,385 epoch 9 - iter 1386/1984 - loss 0.01248831 - time (sec): 84.39 - samples/sec: 1340.15 - lr: 0.000004 - momentum: 0.000000
2023-10-24 19:38:33,965 epoch 9 - iter 1584/1984 - loss 0.01258510 - time (sec): 96.97 - samples/sec: 1356.27 - lr: 0.000004 - momentum: 0.000000
2023-10-24 19:38:46,104 epoch 9 - iter 1782/1984 - loss 0.01225636 - time (sec): 109.11 - samples/sec: 1353.36 - lr: 0.000004 - momentum: 0.000000
2023-10-24 19:38:58,070 epoch 9 - iter 1980/1984 - loss 0.01210063 - time (sec): 121.07 - samples/sec: 1351.07 - lr: 0.000003 - momentum: 0.000000
2023-10-24 19:38:58,346 ----------------------------------------------------------------------------------------------------
2023-10-24 19:38:58,346 EPOCH 9 done: loss 0.0121 - lr: 0.000003
2023-10-24 19:39:01,778 DEV : loss 0.244042307138443 - f1-score (micro avg) 0.7587
2023-10-24 19:39:01,793 ----------------------------------------------------------------------------------------------------
2023-10-24 19:39:14,043 epoch 10 - iter 198/1984 - loss 0.01126873 - time (sec): 12.25 - samples/sec: 1403.09 - lr: 0.000003 - momentum: 0.000000
2023-10-24 19:39:26,290 epoch 10 - iter 396/1984 - loss 0.01079960 - time (sec): 24.50 - samples/sec: 1376.43 - lr: 0.000003 - momentum: 0.000000
2023-10-24 19:39:38,492 epoch 10 - iter 594/1984 - loss 0.00920357 - time (sec): 36.70 - samples/sec: 1391.31 - lr: 0.000002 - momentum: 0.000000
2023-10-24 19:39:50,933 epoch 10 - iter 792/1984 - loss 0.00891660 - time (sec): 49.14 - samples/sec: 1389.24 - lr: 0.000002 - momentum: 0.000000
2023-10-24 19:40:02,800 epoch 10 - iter 990/1984 - loss 0.00864161 - time (sec): 61.01 - samples/sec: 1368.26 - lr: 0.000002 - momentum: 0.000000
2023-10-24 19:40:14,924 epoch 10 - iter 1188/1984 - loss 0.00926189 - time (sec): 73.13 - samples/sec: 1364.12 - lr: 0.000001 - momentum: 0.000000
2023-10-24 19:40:26,822 epoch 10 - iter 1386/1984 - loss 0.00915410 - time (sec): 85.03 - samples/sec: 1349.75 - lr: 0.000001 - momentum: 0.000000
2023-10-24 19:40:38,890 epoch 10 - iter 1584/1984 - loss 0.00899681 - time (sec): 97.10 - samples/sec: 1349.54 - lr: 0.000001 - momentum: 0.000000
2023-10-24 19:40:51,050 epoch 10 - iter 1782/1984 - loss 0.00863704 - time (sec): 109.26 - samples/sec: 1350.21 - lr: 0.000000 - momentum: 0.000000
2023-10-24 19:41:03,100 epoch 10 - iter 1980/1984 - loss 0.00877194 - time (sec): 121.31 - samples/sec: 1348.60 - lr: 0.000000 - momentum: 0.000000
2023-10-24 19:41:03,350 ----------------------------------------------------------------------------------------------------
2023-10-24 19:41:03,350 EPOCH 10 done: loss 0.0088 - lr: 0.000000
2023-10-24 19:41:06,471 DEV : loss 0.25166356563568115 - f1-score (micro avg) 0.7624
2023-10-24 19:41:06,956 ----------------------------------------------------------------------------------------------------
2023-10-24 19:41:06,956 Loading model from best epoch ...
2023-10-24 19:41:08,420 SequenceTagger predicts: Dictionary with 13 tags: O, S-PER, B-PER, E-PER, I-PER, S-LOC, B-LOC, E-LOC, I-LOC, S-ORG, B-ORG, E-ORG, I-ORG
2023-10-24 19:41:11,490 Results:
- F-score (micro) 0.7761
- F-score (macro) 0.6778
- Accuracy 0.6586

By class:
              precision    recall  f1-score   support

         LOC     0.8447    0.8305    0.8376       655
         PER     0.6996    0.7937    0.7437       223
         ORG     0.6250    0.3543    0.4523       127

   micro avg     0.7905    0.7622    0.7761      1005
   macro avg     0.7231    0.6595    0.6778      1005
weighted avg     0.7848    0.7622    0.7680      1005

2023-10-24 19:41:11,490 ----------------------------------------------------------------------------------------------------
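The best checkpoint (epoch 6, dev micro-F1 0.774) is saved as best-model.pt under the training base path and can be used for tagging along these lines. This is a minimal usage sketch; the example sentence is illustrative, not taken from the corpus.

    # Minimal sketch: load the best checkpoint logged above and tag a sentence.
    from flair.data import Sentence
    from flair.models import SequenceTagger

    tagger = SequenceTagger.load(
        "hmbench-icdar/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-"
        "bs4-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-5/best-model.pt"
    )

    # Illustrative French sentence (hypothetical input, not from the test set).
    sentence = Sentence("M. Jules Ferry est arrivé à Paris hier soir .")
    tagger.predict(sentence)

    # The model predicts the 13 BIOES tags listed above; get_spans("ner")
    # merges them into PER / LOC / ORG entity spans with a confidence score.
    for span in sentence.get_spans("ner"):
        print(span.text, span.get_label("ner").value, span.get_label("ner").score)

The per-class table above suggests where such predictions are reliable: LOC spans are strongest (F1 0.8376), while ORG suffers from low recall (0.3543) on only 127 support instances.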