2023-10-25 08:56:05,291 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:56:05,292 Model: "SequenceTagger( (embeddings): TransformerWordEmbeddings( (model): BertModel( (embeddings): BertEmbeddings( (word_embeddings): Embedding(64001, 768) (position_embeddings): Embedding(512, 768) (token_type_embeddings): Embedding(2, 768) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): BertEncoder( (layer): ModuleList( (0): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (1): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (2): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (3): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (4): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (5): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (6): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (7): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (8): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (9): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (10): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (11): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (pooler): BertPooler( (dense): Linear(in_features=768, out_features=768, bias=True) (activation): Tanh() ) ) ) (locked_dropout): LockedDropout(p=0.5) (linear): Linear(in_features=768, out_features=13, bias=True) (loss_function): CrossEntropyLoss() )" 2023-10-25 08:56:05,292 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:56:05,292 MultiCorpus: 14465 train + 1392 dev + 2432 test sentences - NER_HIPE_2022 Corpus: 14465 train + 1392 dev + 2432 test sentences - /home/ubuntu/.flair/datasets/ner_hipe_2022/v2.1/letemps/fr/with_doc_seperator 2023-10-25 08:56:05,292 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:56:05,292 Train: 14465 sentences 2023-10-25 08:56:05,292 (train_with_dev=False, train_with_test=False) 2023-10-25 08:56:05,292 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:56:05,292 Training Params: 2023-10-25 08:56:05,292 - learning_rate: "3e-05" 2023-10-25 08:56:05,292 - mini_batch_size: "4" 2023-10-25 08:56:05,292 - max_epochs: "10" 2023-10-25 08:56:05,292 - shuffle: "True" 2023-10-25 08:56:05,292 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:56:05,292 Plugins: 2023-10-25 08:56:05,292 - TensorboardLogger 2023-10-25 08:56:05,292 - LinearScheduler | warmup_fraction: '0.1' 2023-10-25 08:56:05,292 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:56:05,292 Final evaluation on model from best epoch (best-model.pt) 2023-10-25 08:56:05,292 - metric: "('micro avg', 'f1-score')" 2023-10-25 08:56:05,292 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:56:05,292 Computation: 2023-10-25 08:56:05,292 - compute on device: cuda:0 2023-10-25 08:56:05,292 - embedding storage: none 2023-10-25 08:56:05,292 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:56:05,292 Model training base path: "hmbench-letemps/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs4-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-1" 2023-10-25 08:56:05,293 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:56:05,293 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:56:05,293 Logging anything other than scalars to TensorBoard is currently not supported. 2023-10-25 08:56:27,757 epoch 1 - iter 361/3617 - loss 1.29876432 - time (sec): 22.46 - samples/sec: 1685.27 - lr: 0.000003 - momentum: 0.000000 2023-10-25 08:56:50,077 epoch 1 - iter 722/3617 - loss 0.75385707 - time (sec): 44.78 - samples/sec: 1679.34 - lr: 0.000006 - momentum: 0.000000 2023-10-25 08:57:12,715 epoch 1 - iter 1083/3617 - loss 0.54794241 - time (sec): 67.42 - samples/sec: 1685.85 - lr: 0.000009 - momentum: 0.000000 2023-10-25 08:57:35,339 epoch 1 - iter 1444/3617 - loss 0.44521586 - time (sec): 90.05 - samples/sec: 1685.05 - lr: 0.000012 - momentum: 0.000000 2023-10-25 08:57:57,806 epoch 1 - iter 1805/3617 - loss 0.38171890 - time (sec): 112.51 - samples/sec: 1680.63 - lr: 0.000015 - momentum: 0.000000 2023-10-25 08:58:20,893 epoch 1 - iter 2166/3617 - loss 0.33646336 - time (sec): 135.60 - samples/sec: 1675.78 - lr: 0.000018 - momentum: 0.000000 2023-10-25 08:58:43,512 epoch 1 - iter 2527/3617 - loss 0.30293723 - time (sec): 158.22 - samples/sec: 1678.77 - lr: 0.000021 - momentum: 0.000000 2023-10-25 08:59:06,150 epoch 1 - iter 2888/3617 - loss 0.27974922 - time (sec): 180.86 - samples/sec: 1676.54 - lr: 0.000024 - momentum: 0.000000 2023-10-25 08:59:28,993 epoch 1 - iter 3249/3617 - loss 0.26111850 - time (sec): 203.70 - samples/sec: 1675.74 - lr: 0.000027 - momentum: 0.000000 2023-10-25 08:59:51,479 epoch 1 - iter 3610/3617 - loss 0.24710193 - time (sec): 226.19 - samples/sec: 1675.77 - lr: 0.000030 - momentum: 0.000000 2023-10-25 08:59:51,943 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:59:51,944 EPOCH 1 done: loss 0.2467 - lr: 0.000030 2023-10-25 08:59:56,474 DEV : loss 0.14493782818317413 - f1-score (micro avg) 0.5921 2023-10-25 08:59:56,496 saving best model 2023-10-25 08:59:56,967 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:00:19,557 epoch 2 - iter 361/3617 - loss 0.09481662 - time (sec): 22.59 - samples/sec: 1695.11 - lr: 0.000030 - momentum: 0.000000 2023-10-25 09:00:42,519 epoch 2 - iter 722/3617 - loss 0.10727292 - time (sec): 45.55 - samples/sec: 1694.09 - lr: 0.000029 - momentum: 0.000000 2023-10-25 09:01:05,260 epoch 2 - iter 1083/3617 - loss 0.10816822 - time (sec): 68.29 - samples/sec: 1694.24 - lr: 0.000029 - momentum: 0.000000 2023-10-25 09:01:27,920 epoch 2 - iter 1444/3617 - loss 0.10358701 - time (sec): 90.95 - samples/sec: 1688.21 - lr: 0.000029 - momentum: 0.000000 2023-10-25 09:01:50,585 epoch 2 - iter 1805/3617 - loss 0.10315773 - time (sec): 113.62 - samples/sec: 1685.42 - lr: 0.000028 - momentum: 0.000000 2023-10-25 09:02:13,160 epoch 2 - iter 2166/3617 - loss 0.10157426 - time (sec): 136.19 - samples/sec: 1679.24 - lr: 0.000028 - momentum: 0.000000 2023-10-25 09:02:35,597 epoch 2 - iter 2527/3617 - loss 0.10001100 - time (sec): 158.63 - samples/sec: 1675.89 - lr: 0.000028 - momentum: 0.000000 2023-10-25 09:02:58,307 epoch 2 - iter 2888/3617 - loss 0.09732052 - time (sec): 181.34 - samples/sec: 1677.76 - lr: 0.000027 - momentum: 0.000000 2023-10-25 09:03:21,029 epoch 2 - iter 3249/3617 - loss 0.09809576 - time (sec): 204.06 - samples/sec: 1675.59 - lr: 0.000027 - momentum: 0.000000 2023-10-25 09:03:43,417 epoch 2 - iter 3610/3617 - loss 0.09810723 - time (sec): 226.45 - samples/sec: 1674.16 - lr: 0.000027 - momentum: 0.000000 2023-10-25 09:03:43,852 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:03:43,852 EPOCH 2 done: loss 0.0980 - lr: 0.000027 2023-10-25 09:03:49,086 DEV : loss 0.1498355269432068 - f1-score (micro avg) 0.6537 2023-10-25 09:03:49,108 saving best model 2023-10-25 09:03:49,728 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:04:12,340 epoch 3 - iter 361/3617 - loss 0.08371286 - time (sec): 22.61 - samples/sec: 1661.67 - lr: 0.000026 - momentum: 0.000000 2023-10-25 09:04:35,198 epoch 3 - iter 722/3617 - loss 0.08266269 - time (sec): 45.47 - samples/sec: 1671.47 - lr: 0.000026 - momentum: 0.000000 2023-10-25 09:04:57,535 epoch 3 - iter 1083/3617 - loss 0.07533014 - time (sec): 67.81 - samples/sec: 1676.54 - lr: 0.000026 - momentum: 0.000000 2023-10-25 09:05:20,055 epoch 3 - iter 1444/3617 - loss 0.07921444 - time (sec): 90.33 - samples/sec: 1672.99 - lr: 0.000025 - momentum: 0.000000 2023-10-25 09:05:42,693 epoch 3 - iter 1805/3617 - loss 0.07689623 - time (sec): 112.96 - samples/sec: 1679.89 - lr: 0.000025 - momentum: 0.000000 2023-10-25 09:06:05,757 epoch 3 - iter 2166/3617 - loss 0.07594405 - time (sec): 136.03 - samples/sec: 1684.69 - lr: 0.000025 - momentum: 0.000000 2023-10-25 09:06:28,208 epoch 3 - iter 2527/3617 - loss 0.07505941 - time (sec): 158.48 - samples/sec: 1678.13 - lr: 0.000024 - momentum: 0.000000 2023-10-25 09:06:51,076 epoch 3 - iter 2888/3617 - loss 0.07488029 - time (sec): 181.35 - samples/sec: 1685.62 - lr: 0.000024 - momentum: 0.000000 2023-10-25 09:07:13,857 epoch 3 - iter 3249/3617 - loss 0.07618760 - time (sec): 204.13 - samples/sec: 1680.18 - lr: 0.000024 - momentum: 0.000000 2023-10-25 09:07:36,286 epoch 3 - iter 3610/3617 - loss 0.07650149 - time (sec): 226.56 - samples/sec: 1674.18 - lr: 0.000023 - momentum: 0.000000 2023-10-25 09:07:36,709 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:07:36,709 EPOCH 3 done: loss 0.0764 - lr: 0.000023 2023-10-25 09:07:41,464 DEV : loss 0.19308863580226898 - f1-score (micro avg) 0.6209 2023-10-25 09:07:41,486 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:08:04,147 epoch 4 - iter 361/3617 - loss 0.04740247 - time (sec): 22.66 - samples/sec: 1676.03 - lr: 0.000023 - momentum: 0.000000 2023-10-25 09:08:27,052 epoch 4 - iter 722/3617 - loss 0.04393513 - time (sec): 45.57 - samples/sec: 1694.68 - lr: 0.000023 - momentum: 0.000000 2023-10-25 09:08:49,468 epoch 4 - iter 1083/3617 - loss 0.04673719 - time (sec): 67.98 - samples/sec: 1670.64 - lr: 0.000022 - momentum: 0.000000 2023-10-25 09:09:12,096 epoch 4 - iter 1444/3617 - loss 0.04771808 - time (sec): 90.61 - samples/sec: 1670.66 - lr: 0.000022 - momentum: 0.000000 2023-10-25 09:09:34,752 epoch 4 - iter 1805/3617 - loss 0.04805421 - time (sec): 113.27 - samples/sec: 1672.54 - lr: 0.000022 - momentum: 0.000000 2023-10-25 09:09:57,527 epoch 4 - iter 2166/3617 - loss 0.04811747 - time (sec): 136.04 - samples/sec: 1675.32 - lr: 0.000021 - momentum: 0.000000 2023-10-25 09:10:20,097 epoch 4 - iter 2527/3617 - loss 0.04985558 - time (sec): 158.61 - samples/sec: 1672.88 - lr: 0.000021 - momentum: 0.000000 2023-10-25 09:10:43,113 epoch 4 - iter 2888/3617 - loss 0.04967931 - time (sec): 181.63 - samples/sec: 1666.35 - lr: 0.000021 - momentum: 0.000000 2023-10-25 09:11:05,900 epoch 4 - iter 3249/3617 - loss 0.04983072 - time (sec): 204.41 - samples/sec: 1665.34 - lr: 0.000020 - momentum: 0.000000 2023-10-25 09:11:28,853 epoch 4 - iter 3610/3617 - loss 0.05188277 - time (sec): 227.37 - samples/sec: 1667.33 - lr: 0.000020 - momentum: 0.000000 2023-10-25 09:11:29,293 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:11:29,293 EPOCH 4 done: loss 0.0518 - lr: 0.000020 2023-10-25 09:11:34,065 DEV : loss 0.25538942217826843 - f1-score (micro avg) 0.6376 2023-10-25 09:11:34,087 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:11:56,512 epoch 5 - iter 361/3617 - loss 0.03131284 - time (sec): 22.42 - samples/sec: 1629.91 - lr: 0.000020 - momentum: 0.000000 2023-10-25 09:12:19,313 epoch 5 - iter 722/3617 - loss 0.03223206 - time (sec): 45.22 - samples/sec: 1639.25 - lr: 0.000019 - momentum: 0.000000 2023-10-25 09:12:42,036 epoch 5 - iter 1083/3617 - loss 0.03088082 - time (sec): 67.95 - samples/sec: 1652.27 - lr: 0.000019 - momentum: 0.000000 2023-10-25 09:13:04,677 epoch 5 - iter 1444/3617 - loss 0.03409690 - time (sec): 90.59 - samples/sec: 1655.84 - lr: 0.000019 - momentum: 0.000000 2023-10-25 09:13:27,356 epoch 5 - iter 1805/3617 - loss 0.03218071 - time (sec): 113.27 - samples/sec: 1668.62 - lr: 0.000018 - momentum: 0.000000 2023-10-25 09:13:49,986 epoch 5 - iter 2166/3617 - loss 0.03391101 - time (sec): 135.90 - samples/sec: 1665.03 - lr: 0.000018 - momentum: 0.000000 2023-10-25 09:14:12,624 epoch 5 - iter 2527/3617 - loss 0.03493067 - time (sec): 158.54 - samples/sec: 1662.52 - lr: 0.000018 - momentum: 0.000000 2023-10-25 09:14:35,385 epoch 5 - iter 2888/3617 - loss 0.03495628 - time (sec): 181.30 - samples/sec: 1670.98 - lr: 0.000017 - momentum: 0.000000 2023-10-25 09:14:58,001 epoch 5 - iter 3249/3617 - loss 0.03497871 - time (sec): 203.91 - samples/sec: 1670.29 - lr: 0.000017 - momentum: 0.000000 2023-10-25 09:15:20,904 epoch 5 - iter 3610/3617 - loss 0.03564780 - time (sec): 226.82 - samples/sec: 1672.37 - lr: 0.000017 - momentum: 0.000000 2023-10-25 09:15:21,319 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:15:21,319 EPOCH 5 done: loss 0.0357 - lr: 0.000017 2023-10-25 09:15:26,608 DEV : loss 0.3036385476589203 - f1-score (micro avg) 0.6379 2023-10-25 09:15:26,630 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:15:49,295 epoch 6 - iter 361/3617 - loss 0.01822029 - time (sec): 22.66 - samples/sec: 1605.12 - lr: 0.000016 - momentum: 0.000000 2023-10-25 09:16:12,087 epoch 6 - iter 722/3617 - loss 0.02217639 - time (sec): 45.46 - samples/sec: 1662.06 - lr: 0.000016 - momentum: 0.000000 2023-10-25 09:16:34,918 epoch 6 - iter 1083/3617 - loss 0.02506345 - time (sec): 68.29 - samples/sec: 1664.01 - lr: 0.000016 - momentum: 0.000000 2023-10-25 09:16:57,390 epoch 6 - iter 1444/3617 - loss 0.02414606 - time (sec): 90.76 - samples/sec: 1655.24 - lr: 0.000015 - momentum: 0.000000 2023-10-25 09:17:20,050 epoch 6 - iter 1805/3617 - loss 0.02424517 - time (sec): 113.42 - samples/sec: 1662.94 - lr: 0.000015 - momentum: 0.000000 2023-10-25 09:17:42,507 epoch 6 - iter 2166/3617 - loss 0.02407469 - time (sec): 135.88 - samples/sec: 1663.03 - lr: 0.000015 - momentum: 0.000000 2023-10-25 09:18:05,243 epoch 6 - iter 2527/3617 - loss 0.02329897 - time (sec): 158.61 - samples/sec: 1665.13 - lr: 0.000014 - momentum: 0.000000 2023-10-25 09:18:28,017 epoch 6 - iter 2888/3617 - loss 0.02317000 - time (sec): 181.39 - samples/sec: 1670.08 - lr: 0.000014 - momentum: 0.000000 2023-10-25 09:18:50,676 epoch 6 - iter 3249/3617 - loss 0.02253595 - time (sec): 204.04 - samples/sec: 1670.18 - lr: 0.000014 - momentum: 0.000000 2023-10-25 09:19:13,356 epoch 6 - iter 3610/3617 - loss 0.02298512 - time (sec): 226.73 - samples/sec: 1671.45 - lr: 0.000013 - momentum: 0.000000 2023-10-25 09:19:13,810 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:19:13,810 EPOCH 6 done: loss 0.0230 - lr: 0.000013 2023-10-25 09:19:19,090 DEV : loss 0.3258330523967743 - f1-score (micro avg) 0.6394 2023-10-25 09:19:19,113 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:19:41,742 epoch 7 - iter 361/3617 - loss 0.01238420 - time (sec): 22.63 - samples/sec: 1691.85 - lr: 0.000013 - momentum: 0.000000 2023-10-25 09:20:04,086 epoch 7 - iter 722/3617 - loss 0.01141601 - time (sec): 44.97 - samples/sec: 1678.52 - lr: 0.000013 - momentum: 0.000000 2023-10-25 09:20:26,635 epoch 7 - iter 1083/3617 - loss 0.01410956 - time (sec): 67.52 - samples/sec: 1670.11 - lr: 0.000012 - momentum: 0.000000 2023-10-25 09:20:49,285 epoch 7 - iter 1444/3617 - loss 0.01436451 - time (sec): 90.17 - samples/sec: 1676.14 - lr: 0.000012 - momentum: 0.000000 2023-10-25 09:21:12,335 epoch 7 - iter 1805/3617 - loss 0.01504339 - time (sec): 113.22 - samples/sec: 1693.17 - lr: 0.000012 - momentum: 0.000000 2023-10-25 09:21:34,746 epoch 7 - iter 2166/3617 - loss 0.01505583 - time (sec): 135.63 - samples/sec: 1683.15 - lr: 0.000011 - momentum: 0.000000 2023-10-25 09:21:57,667 epoch 7 - iter 2527/3617 - loss 0.01548792 - time (sec): 158.55 - samples/sec: 1678.86 - lr: 0.000011 - momentum: 0.000000 2023-10-25 09:22:20,310 epoch 7 - iter 2888/3617 - loss 0.01540908 - time (sec): 181.20 - samples/sec: 1677.51 - lr: 0.000011 - momentum: 0.000000 2023-10-25 09:22:43,073 epoch 7 - iter 3249/3617 - loss 0.01583643 - time (sec): 203.96 - samples/sec: 1679.41 - lr: 0.000010 - momentum: 0.000000 2023-10-25 09:23:05,722 epoch 7 - iter 3610/3617 - loss 0.01543481 - time (sec): 226.61 - samples/sec: 1673.87 - lr: 0.000010 - momentum: 0.000000 2023-10-25 09:23:06,127 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:23:06,128 EPOCH 7 done: loss 0.0155 - lr: 0.000010 2023-10-25 09:23:10,894 DEV : loss 0.3687475621700287 - f1-score (micro avg) 0.6512 2023-10-25 09:23:10,917 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:23:34,320 epoch 8 - iter 361/3617 - loss 0.01011873 - time (sec): 23.40 - samples/sec: 1642.59 - lr: 0.000010 - momentum: 0.000000 2023-10-25 09:23:57,093 epoch 8 - iter 722/3617 - loss 0.01183084 - time (sec): 46.18 - samples/sec: 1647.32 - lr: 0.000009 - momentum: 0.000000 2023-10-25 09:24:19,987 epoch 8 - iter 1083/3617 - loss 0.01114849 - time (sec): 69.07 - samples/sec: 1675.33 - lr: 0.000009 - momentum: 0.000000 2023-10-25 09:24:42,267 epoch 8 - iter 1444/3617 - loss 0.01144658 - time (sec): 91.35 - samples/sec: 1671.61 - lr: 0.000009 - momentum: 0.000000 2023-10-25 09:25:04,971 epoch 8 - iter 1805/3617 - loss 0.01085694 - time (sec): 114.05 - samples/sec: 1671.04 - lr: 0.000008 - momentum: 0.000000 2023-10-25 09:25:27,776 epoch 8 - iter 2166/3617 - loss 0.01113943 - time (sec): 136.86 - samples/sec: 1670.33 - lr: 0.000008 - momentum: 0.000000 2023-10-25 09:25:50,272 epoch 8 - iter 2527/3617 - loss 0.01110272 - time (sec): 159.35 - samples/sec: 1665.95 - lr: 0.000008 - momentum: 0.000000 2023-10-25 09:26:13,117 epoch 8 - iter 2888/3617 - loss 0.01112695 - time (sec): 182.20 - samples/sec: 1667.86 - lr: 0.000007 - momentum: 0.000000 2023-10-25 09:26:35,738 epoch 8 - iter 3249/3617 - loss 0.01071467 - time (sec): 204.82 - samples/sec: 1667.74 - lr: 0.000007 - momentum: 0.000000 2023-10-25 09:26:58,274 epoch 8 - iter 3610/3617 - loss 0.01074639 - time (sec): 227.36 - samples/sec: 1668.14 - lr: 0.000007 - momentum: 0.000000 2023-10-25 09:26:58,691 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:26:58,691 EPOCH 8 done: loss 0.0107 - lr: 0.000007 2023-10-25 09:27:03,463 DEV : loss 0.38349881768226624 - f1-score (micro avg) 0.6433 2023-10-25 09:27:03,486 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:27:26,470 epoch 9 - iter 361/3617 - loss 0.00556864 - time (sec): 22.98 - samples/sec: 1698.80 - lr: 0.000006 - momentum: 0.000000 2023-10-25 09:27:49,214 epoch 9 - iter 722/3617 - loss 0.00783730 - time (sec): 45.73 - samples/sec: 1713.36 - lr: 0.000006 - momentum: 0.000000 2023-10-25 09:28:11,732 epoch 9 - iter 1083/3617 - loss 0.00688603 - time (sec): 68.25 - samples/sec: 1699.80 - lr: 0.000006 - momentum: 0.000000 2023-10-25 09:28:34,228 epoch 9 - iter 1444/3617 - loss 0.00661452 - time (sec): 90.74 - samples/sec: 1681.63 - lr: 0.000005 - momentum: 0.000000 2023-10-25 09:28:57,185 epoch 9 - iter 1805/3617 - loss 0.00671017 - time (sec): 113.70 - samples/sec: 1690.74 - lr: 0.000005 - momentum: 0.000000 2023-10-25 09:29:19,774 epoch 9 - iter 2166/3617 - loss 0.00667753 - time (sec): 136.29 - samples/sec: 1681.32 - lr: 0.000005 - momentum: 0.000000 2023-10-25 09:29:42,402 epoch 9 - iter 2527/3617 - loss 0.00799751 - time (sec): 158.92 - samples/sec: 1675.07 - lr: 0.000004 - momentum: 0.000000 2023-10-25 09:30:05,056 epoch 9 - iter 2888/3617 - loss 0.00813035 - time (sec): 181.57 - samples/sec: 1675.38 - lr: 0.000004 - momentum: 0.000000 2023-10-25 09:30:28,208 epoch 9 - iter 3249/3617 - loss 0.00804585 - time (sec): 204.72 - samples/sec: 1670.75 - lr: 0.000004 - momentum: 0.000000 2023-10-25 09:30:50,683 epoch 9 - iter 3610/3617 - loss 0.00784812 - time (sec): 227.20 - samples/sec: 1668.02 - lr: 0.000003 - momentum: 0.000000 2023-10-25 09:30:51,156 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:30:51,156 EPOCH 9 done: loss 0.0079 - lr: 0.000003 2023-10-25 09:30:55,937 DEV : loss 0.3988388478755951 - f1-score (micro avg) 0.6402 2023-10-25 09:30:55,959 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:31:18,550 epoch 10 - iter 361/3617 - loss 0.00169395 - time (sec): 22.59 - samples/sec: 1691.67 - lr: 0.000003 - momentum: 0.000000 2023-10-25 09:31:41,128 epoch 10 - iter 722/3617 - loss 0.00257176 - time (sec): 45.17 - samples/sec: 1691.41 - lr: 0.000003 - momentum: 0.000000 2023-10-25 09:32:03,905 epoch 10 - iter 1083/3617 - loss 0.00388498 - time (sec): 67.95 - samples/sec: 1670.61 - lr: 0.000002 - momentum: 0.000000 2023-10-25 09:32:26,672 epoch 10 - iter 1444/3617 - loss 0.00415693 - time (sec): 90.71 - samples/sec: 1674.51 - lr: 0.000002 - momentum: 0.000000 2023-10-25 09:32:49,198 epoch 10 - iter 1805/3617 - loss 0.00422595 - time (sec): 113.24 - samples/sec: 1665.99 - lr: 0.000002 - momentum: 0.000000 2023-10-25 09:33:11,815 epoch 10 - iter 2166/3617 - loss 0.00444188 - time (sec): 135.86 - samples/sec: 1665.22 - lr: 0.000001 - momentum: 0.000000 2023-10-25 09:33:34,466 epoch 10 - iter 2527/3617 - loss 0.00456308 - time (sec): 158.51 - samples/sec: 1659.07 - lr: 0.000001 - momentum: 0.000000 2023-10-25 09:33:57,358 epoch 10 - iter 2888/3617 - loss 0.00457433 - time (sec): 181.40 - samples/sec: 1663.72 - lr: 0.000001 - momentum: 0.000000 2023-10-25 09:34:20,142 epoch 10 - iter 3249/3617 - loss 0.00465404 - time (sec): 204.18 - samples/sec: 1668.71 - lr: 0.000000 - momentum: 0.000000 2023-10-25 09:34:42,847 epoch 10 - iter 3610/3617 - loss 0.00478068 - time (sec): 226.89 - samples/sec: 1672.23 - lr: 0.000000 - momentum: 0.000000 2023-10-25 09:34:43,247 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:34:43,247 EPOCH 10 done: loss 0.0048 - lr: 0.000000 2023-10-25 09:34:48,560 DEV : loss 0.42030808329582214 - f1-score (micro avg) 0.6507 2023-10-25 09:34:49,057 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:34:49,058 Loading model from best epoch ... 2023-10-25 09:34:50,737 SequenceTagger predicts: Dictionary with 13 tags: O, S-loc, B-loc, E-loc, I-loc, S-pers, B-pers, E-pers, I-pers, S-org, B-org, E-org, I-org 2023-10-25 09:34:56,439 Results: - F-score (micro) 0.6562 - F-score (macro) 0.4469 - Accuracy 0.499 By class: precision recall f1-score support loc 0.6340 0.8088 0.7108 591 pers 0.5688 0.7059 0.6300 357 org 0.0000 0.0000 0.0000 79 micro avg 0.6093 0.7108 0.6562 1027 macro avg 0.4009 0.5049 0.4469 1027 weighted avg 0.5626 0.7108 0.6280 1027 2023-10-25 09:34:56,439 ----------------------------------------------------------------------------------------------------