2023-10-25 09:35:15,970 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:35:15,971 Model: "SequenceTagger( (embeddings): TransformerWordEmbeddings( (model): BertModel( (embeddings): BertEmbeddings( (word_embeddings): Embedding(64001, 768) (position_embeddings): Embedding(512, 768) (token_type_embeddings): Embedding(2, 768) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): BertEncoder( (layer): ModuleList( (0): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (1): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (2): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (3): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (4): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (5): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (6): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (7): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (8): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (9): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (10): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (11): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (pooler): BertPooler( (dense): Linear(in_features=768, out_features=768, bias=True) (activation): Tanh() ) ) ) (locked_dropout): LockedDropout(p=0.5) (linear): Linear(in_features=768, out_features=13, bias=True) (loss_function): CrossEntropyLoss() )" 2023-10-25 09:35:15,971 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:35:15,971 MultiCorpus: 14465 train + 1392 dev + 2432 test sentences - NER_HIPE_2022 Corpus: 14465 train + 1392 dev + 2432 test sentences - /home/ubuntu/.flair/datasets/ner_hipe_2022/v2.1/letemps/fr/with_doc_seperator 2023-10-25 09:35:15,971 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:35:15,971 Train: 14465 sentences 2023-10-25 09:35:15,971 (train_with_dev=False, train_with_test=False) 2023-10-25 09:35:15,971 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:35:15,971 Training Params: 2023-10-25 09:35:15,971 - learning_rate: "5e-05" 2023-10-25 09:35:15,971 - mini_batch_size: "4" 2023-10-25 09:35:15,971 - max_epochs: "10" 2023-10-25 09:35:15,971 - shuffle: "True" 2023-10-25 09:35:15,971 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:35:15,971 Plugins: 2023-10-25 09:35:15,971 - TensorboardLogger 2023-10-25 09:35:15,971 - LinearScheduler | warmup_fraction: '0.1' 2023-10-25 09:35:15,971 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:35:15,971 Final evaluation on model from best epoch (best-model.pt) 2023-10-25 09:35:15,971 - metric: "('micro avg', 'f1-score')" 2023-10-25 09:35:15,971 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:35:15,971 Computation: 2023-10-25 09:35:15,971 - compute on device: cuda:0 2023-10-25 09:35:15,971 - embedding storage: none 2023-10-25 09:35:15,971 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:35:15,971 Model training base path: "hmbench-letemps/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs4-wsFalse-e10-lr5e-05-poolingfirst-layers-1-crfFalse-1" 2023-10-25 09:35:15,971 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:35:15,971 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:35:15,972 Logging anything other than scalars to TensorBoard is currently not supported. 2023-10-25 09:35:38,470 epoch 1 - iter 361/3617 - loss 1.05695090 - time (sec): 22.50 - samples/sec: 1682.68 - lr: 0.000005 - momentum: 0.000000 2023-10-25 09:36:00,826 epoch 1 - iter 722/3617 - loss 0.61904920 - time (sec): 44.85 - samples/sec: 1676.74 - lr: 0.000010 - momentum: 0.000000 2023-10-25 09:36:23,508 epoch 1 - iter 1083/3617 - loss 0.45672288 - time (sec): 67.54 - samples/sec: 1682.98 - lr: 0.000015 - momentum: 0.000000 2023-10-25 09:36:46,172 epoch 1 - iter 1444/3617 - loss 0.37587099 - time (sec): 90.20 - samples/sec: 1682.16 - lr: 0.000020 - momentum: 0.000000 2023-10-25 09:37:08,680 epoch 1 - iter 1805/3617 - loss 0.32508458 - time (sec): 112.71 - samples/sec: 1677.72 - lr: 0.000025 - momentum: 0.000000 2023-10-25 09:37:31,312 epoch 1 - iter 2166/3617 - loss 0.28960332 - time (sec): 135.34 - samples/sec: 1679.00 - lr: 0.000030 - momentum: 0.000000 2023-10-25 09:37:53,870 epoch 1 - iter 2527/3617 - loss 0.26362773 - time (sec): 157.90 - samples/sec: 1682.18 - lr: 0.000035 - momentum: 0.000000 2023-10-25 09:38:16,458 epoch 1 - iter 2888/3617 - loss 0.24619176 - time (sec): 180.49 - samples/sec: 1679.99 - lr: 0.000040 - momentum: 0.000000 2023-10-25 09:38:39,263 epoch 1 - iter 3249/3617 - loss 0.23199189 - time (sec): 203.29 - samples/sec: 1679.11 - lr: 0.000045 - momentum: 0.000000 2023-10-25 09:39:01,722 epoch 1 - iter 3610/3617 - loss 0.22149161 - time (sec): 225.75 - samples/sec: 1679.01 - lr: 0.000050 - momentum: 0.000000 2023-10-25 09:39:02,184 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:39:02,184 EPOCH 1 done: loss 0.2211 - lr: 0.000050 2023-10-25 09:39:06,714 DEV : loss 0.164424329996109 - f1-score (micro avg) 0.5744 2023-10-25 09:39:06,737 saving best model 2023-10-25 09:39:07,206 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:39:29,790 epoch 2 - iter 361/3617 - loss 0.10128829 - time (sec): 22.58 - samples/sec: 1695.54 - lr: 0.000049 - momentum: 0.000000 2023-10-25 09:39:52,798 epoch 2 - iter 722/3617 - loss 0.11212638 - time (sec): 45.59 - samples/sec: 1692.60 - lr: 0.000049 - momentum: 0.000000 2023-10-25 09:40:15,575 epoch 2 - iter 1083/3617 - loss 0.11116111 - time (sec): 68.37 - samples/sec: 1692.36 - lr: 0.000048 - momentum: 0.000000 2023-10-25 09:40:38,290 epoch 2 - iter 1444/3617 - loss 0.10814252 - time (sec): 91.08 - samples/sec: 1685.77 - lr: 0.000048 - momentum: 0.000000 2023-10-25 09:41:01,016 epoch 2 - iter 1805/3617 - loss 0.10845616 - time (sec): 113.81 - samples/sec: 1682.58 - lr: 0.000047 - momentum: 0.000000 2023-10-25 09:41:23,650 epoch 2 - iter 2166/3617 - loss 0.10793655 - time (sec): 136.44 - samples/sec: 1676.13 - lr: 0.000047 - momentum: 0.000000 2023-10-25 09:41:46,137 epoch 2 - iter 2527/3617 - loss 0.10723312 - time (sec): 158.93 - samples/sec: 1672.72 - lr: 0.000046 - momentum: 0.000000 2023-10-25 09:42:08,851 epoch 2 - iter 2888/3617 - loss 0.10438015 - time (sec): 181.64 - samples/sec: 1674.94 - lr: 0.000046 - momentum: 0.000000 2023-10-25 09:42:31,577 epoch 2 - iter 3249/3617 - loss 0.10418652 - time (sec): 204.37 - samples/sec: 1673.05 - lr: 0.000045 - momentum: 0.000000 2023-10-25 09:42:53,958 epoch 2 - iter 3610/3617 - loss 0.10451376 - time (sec): 226.75 - samples/sec: 1671.92 - lr: 0.000044 - momentum: 0.000000 2023-10-25 09:42:54,390 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:42:54,391 EPOCH 2 done: loss 0.1044 - lr: 0.000044 2023-10-25 09:42:59,654 DEV : loss 0.1753711998462677 - f1-score (micro avg) 0.6163 2023-10-25 09:42:59,677 saving best model 2023-10-25 09:43:00,248 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:43:22,867 epoch 3 - iter 361/3617 - loss 0.09187774 - time (sec): 22.62 - samples/sec: 1661.09 - lr: 0.000044 - momentum: 0.000000 2023-10-25 09:43:45,716 epoch 3 - iter 722/3617 - loss 0.08807253 - time (sec): 45.47 - samples/sec: 1671.54 - lr: 0.000043 - momentum: 0.000000 2023-10-25 09:44:08,034 epoch 3 - iter 1083/3617 - loss 0.07979957 - time (sec): 67.79 - samples/sec: 1677.05 - lr: 0.000043 - momentum: 0.000000 2023-10-25 09:44:30,500 epoch 3 - iter 1444/3617 - loss 0.08408592 - time (sec): 90.25 - samples/sec: 1674.39 - lr: 0.000042 - momentum: 0.000000 2023-10-25 09:44:53,092 epoch 3 - iter 1805/3617 - loss 0.08186299 - time (sec): 112.84 - samples/sec: 1681.67 - lr: 0.000042 - momentum: 0.000000 2023-10-25 09:45:16,103 epoch 3 - iter 2166/3617 - loss 0.08187069 - time (sec): 135.85 - samples/sec: 1686.84 - lr: 0.000041 - momentum: 0.000000 2023-10-25 09:45:38,530 epoch 3 - iter 2527/3617 - loss 0.08256031 - time (sec): 158.28 - samples/sec: 1680.22 - lr: 0.000041 - momentum: 0.000000 2023-10-25 09:46:01,369 epoch 3 - iter 2888/3617 - loss 0.08357263 - time (sec): 181.12 - samples/sec: 1687.72 - lr: 0.000040 - momentum: 0.000000 2023-10-25 09:46:24,129 epoch 3 - iter 3249/3617 - loss 0.08633465 - time (sec): 203.88 - samples/sec: 1682.22 - lr: 0.000039 - momentum: 0.000000 2023-10-25 09:46:46,514 epoch 3 - iter 3610/3617 - loss 0.08683966 - time (sec): 226.27 - samples/sec: 1676.34 - lr: 0.000039 - momentum: 0.000000 2023-10-25 09:46:46,935 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:46:46,935 EPOCH 3 done: loss 0.0867 - lr: 0.000039 2023-10-25 09:46:52,201 DEV : loss 0.1687808334827423 - f1-score (micro avg) 0.6053 2023-10-25 09:46:52,224 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:47:14,850 epoch 4 - iter 361/3617 - loss 0.05413728 - time (sec): 22.63 - samples/sec: 1678.58 - lr: 0.000038 - momentum: 0.000000 2023-10-25 09:47:37,725 epoch 4 - iter 722/3617 - loss 0.05722728 - time (sec): 45.50 - samples/sec: 1697.08 - lr: 0.000038 - momentum: 0.000000 2023-10-25 09:48:00,109 epoch 4 - iter 1083/3617 - loss 0.07150763 - time (sec): 67.88 - samples/sec: 1673.02 - lr: 0.000037 - momentum: 0.000000 2023-10-25 09:48:22,695 epoch 4 - iter 1444/3617 - loss 0.06878229 - time (sec): 90.47 - samples/sec: 1673.24 - lr: 0.000037 - momentum: 0.000000 2023-10-25 09:48:45,328 epoch 4 - iter 1805/3617 - loss 0.06707460 - time (sec): 113.10 - samples/sec: 1674.93 - lr: 0.000036 - momentum: 0.000000 2023-10-25 09:49:08,058 epoch 4 - iter 2166/3617 - loss 0.06431722 - time (sec): 135.83 - samples/sec: 1677.87 - lr: 0.000036 - momentum: 0.000000 2023-10-25 09:49:30,598 epoch 4 - iter 2527/3617 - loss 0.06497242 - time (sec): 158.37 - samples/sec: 1675.39 - lr: 0.000035 - momentum: 0.000000 2023-10-25 09:49:53,041 epoch 4 - iter 2888/3617 - loss 0.06441663 - time (sec): 180.82 - samples/sec: 1673.82 - lr: 0.000034 - momentum: 0.000000 2023-10-25 09:50:15,774 epoch 4 - iter 3249/3617 - loss 0.06478123 - time (sec): 203.55 - samples/sec: 1672.41 - lr: 0.000034 - momentum: 0.000000 2023-10-25 09:50:38,665 epoch 4 - iter 3610/3617 - loss 0.06560130 - time (sec): 226.44 - samples/sec: 1674.16 - lr: 0.000033 - momentum: 0.000000 2023-10-25 09:50:39,104 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:50:39,104 EPOCH 4 done: loss 0.0655 - lr: 0.000033 2023-10-25 09:50:43,863 DEV : loss 0.2523341476917267 - f1-score (micro avg) 0.6151 2023-10-25 09:50:43,887 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:51:06,754 epoch 5 - iter 361/3617 - loss 0.04758910 - time (sec): 22.87 - samples/sec: 1598.33 - lr: 0.000033 - momentum: 0.000000 2023-10-25 09:51:29,489 epoch 5 - iter 722/3617 - loss 0.04378816 - time (sec): 45.60 - samples/sec: 1625.72 - lr: 0.000032 - momentum: 0.000000 2023-10-25 09:51:52,162 epoch 5 - iter 1083/3617 - loss 0.04314076 - time (sec): 68.27 - samples/sec: 1644.36 - lr: 0.000032 - momentum: 0.000000 2023-10-25 09:52:14,724 epoch 5 - iter 1444/3617 - loss 0.04929634 - time (sec): 90.84 - samples/sec: 1651.33 - lr: 0.000031 - momentum: 0.000000 2023-10-25 09:52:37,353 epoch 5 - iter 1805/3617 - loss 0.07597635 - time (sec): 113.47 - samples/sec: 1665.71 - lr: 0.000031 - momentum: 0.000000 2023-10-25 09:52:59,935 epoch 5 - iter 2166/3617 - loss 0.10924113 - time (sec): 136.05 - samples/sec: 1663.20 - lr: 0.000030 - momentum: 0.000000 2023-10-25 09:53:22,507 epoch 5 - iter 2527/3617 - loss 0.13627939 - time (sec): 158.62 - samples/sec: 1661.65 - lr: 0.000029 - momentum: 0.000000 2023-10-25 09:53:45,215 epoch 5 - iter 2888/3617 - loss 0.15088807 - time (sec): 181.33 - samples/sec: 1670.70 - lr: 0.000029 - momentum: 0.000000 2023-10-25 09:54:07,794 epoch 5 - iter 3249/3617 - loss 0.16557653 - time (sec): 203.91 - samples/sec: 1670.34 - lr: 0.000028 - momentum: 0.000000 2023-10-25 09:54:30,658 epoch 5 - iter 3610/3617 - loss 0.16418110 - time (sec): 226.77 - samples/sec: 1672.70 - lr: 0.000028 - momentum: 0.000000 2023-10-25 09:54:31,073 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:54:31,074 EPOCH 5 done: loss 0.1641 - lr: 0.000028 2023-10-25 09:54:35,815 DEV : loss 0.2738305926322937 - f1-score (micro avg) 0.4385 2023-10-25 09:54:35,838 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:54:58,458 epoch 6 - iter 361/3617 - loss 0.07602831 - time (sec): 22.62 - samples/sec: 1608.27 - lr: 0.000027 - momentum: 0.000000 2023-10-25 09:55:21,211 epoch 6 - iter 722/3617 - loss 0.06715999 - time (sec): 45.37 - samples/sec: 1665.10 - lr: 0.000027 - momentum: 0.000000 2023-10-25 09:55:44,081 epoch 6 - iter 1083/3617 - loss 0.06021834 - time (sec): 68.24 - samples/sec: 1665.09 - lr: 0.000026 - momentum: 0.000000 2023-10-25 09:56:06,585 epoch 6 - iter 1444/3617 - loss 0.05494355 - time (sec): 90.75 - samples/sec: 1655.47 - lr: 0.000026 - momentum: 0.000000 2023-10-25 09:56:29,260 epoch 6 - iter 1805/3617 - loss 0.05247836 - time (sec): 113.42 - samples/sec: 1662.91 - lr: 0.000025 - momentum: 0.000000 2023-10-25 09:56:51,734 epoch 6 - iter 2166/3617 - loss 0.05169263 - time (sec): 135.90 - samples/sec: 1662.80 - lr: 0.000024 - momentum: 0.000000 2023-10-25 09:57:14,488 epoch 6 - iter 2527/3617 - loss 0.05000458 - time (sec): 158.65 - samples/sec: 1664.74 - lr: 0.000024 - momentum: 0.000000 2023-10-25 09:57:37,297 epoch 6 - iter 2888/3617 - loss 0.04799543 - time (sec): 181.46 - samples/sec: 1669.41 - lr: 0.000023 - momentum: 0.000000 2023-10-25 09:58:00,021 epoch 6 - iter 3249/3617 - loss 0.04620394 - time (sec): 204.18 - samples/sec: 1669.05 - lr: 0.000023 - momentum: 0.000000 2023-10-25 09:58:23,282 epoch 6 - iter 3610/3617 - loss 0.04597887 - time (sec): 227.44 - samples/sec: 1666.18 - lr: 0.000022 - momentum: 0.000000 2023-10-25 09:58:23,734 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:58:23,735 EPOCH 6 done: loss 0.0460 - lr: 0.000022 2023-10-25 09:58:28,505 DEV : loss 0.2893408536911011 - f1-score (micro avg) 0.6033 2023-10-25 09:58:28,528 ---------------------------------------------------------------------------------------------------- 2023-10-25 09:58:51,138 epoch 7 - iter 361/3617 - loss 0.02052274 - time (sec): 22.61 - samples/sec: 1693.32 - lr: 0.000022 - momentum: 0.000000 2023-10-25 09:59:13,437 epoch 7 - iter 722/3617 - loss 0.02181682 - time (sec): 44.91 - samples/sec: 1680.96 - lr: 0.000021 - momentum: 0.000000 2023-10-25 09:59:35,956 epoch 7 - iter 1083/3617 - loss 0.02519344 - time (sec): 67.43 - samples/sec: 1672.46 - lr: 0.000021 - momentum: 0.000000 2023-10-25 09:59:58,626 epoch 7 - iter 1444/3617 - loss 0.02594583 - time (sec): 90.10 - samples/sec: 1677.52 - lr: 0.000020 - momentum: 0.000000 2023-10-25 10:00:21,677 epoch 7 - iter 1805/3617 - loss 0.02582872 - time (sec): 113.15 - samples/sec: 1694.28 - lr: 0.000019 - momentum: 0.000000 2023-10-25 10:00:44,093 epoch 7 - iter 2166/3617 - loss 0.02645313 - time (sec): 135.56 - samples/sec: 1684.00 - lr: 0.000019 - momentum: 0.000000 2023-10-25 10:01:07,030 epoch 7 - iter 2527/3617 - loss 0.02674202 - time (sec): 158.50 - samples/sec: 1679.42 - lr: 0.000018 - momentum: 0.000000 2023-10-25 10:01:29,669 epoch 7 - iter 2888/3617 - loss 0.02630636 - time (sec): 181.14 - samples/sec: 1678.04 - lr: 0.000018 - momentum: 0.000000 2023-10-25 10:01:52,432 epoch 7 - iter 3249/3617 - loss 0.02602543 - time (sec): 203.90 - samples/sec: 1679.87 - lr: 0.000017 - momentum: 0.000000 2023-10-25 10:02:15,089 epoch 7 - iter 3610/3617 - loss 0.02686103 - time (sec): 226.56 - samples/sec: 1674.23 - lr: 0.000017 - momentum: 0.000000 2023-10-25 10:02:15,493 ---------------------------------------------------------------------------------------------------- 2023-10-25 10:02:15,493 EPOCH 7 done: loss 0.0269 - lr: 0.000017 2023-10-25 10:02:20,803 DEV : loss 0.362984299659729 - f1-score (micro avg) 0.6264 2023-10-25 10:02:20,826 saving best model 2023-10-25 10:02:21,400 ---------------------------------------------------------------------------------------------------- 2023-10-25 10:02:44,280 epoch 8 - iter 361/3617 - loss 0.01775868 - time (sec): 22.88 - samples/sec: 1680.20 - lr: 0.000016 - momentum: 0.000000 2023-10-25 10:03:07,052 epoch 8 - iter 722/3617 - loss 0.01658963 - time (sec): 45.65 - samples/sec: 1666.28 - lr: 0.000016 - momentum: 0.000000 2023-10-25 10:03:29,927 epoch 8 - iter 1083/3617 - loss 0.01602400 - time (sec): 68.53 - samples/sec: 1688.63 - lr: 0.000015 - momentum: 0.000000 2023-10-25 10:03:52,202 epoch 8 - iter 1444/3617 - loss 0.01659828 - time (sec): 90.80 - samples/sec: 1681.73 - lr: 0.000014 - momentum: 0.000000 2023-10-25 10:04:14,886 epoch 8 - iter 1805/3617 - loss 0.01706020 - time (sec): 113.48 - samples/sec: 1679.42 - lr: 0.000014 - momentum: 0.000000 2023-10-25 10:04:37,673 epoch 8 - iter 2166/3617 - loss 0.01752539 - time (sec): 136.27 - samples/sec: 1677.52 - lr: 0.000013 - momentum: 0.000000 2023-10-25 10:05:00,183 epoch 8 - iter 2527/3617 - loss 0.01711630 - time (sec): 158.78 - samples/sec: 1671.96 - lr: 0.000013 - momentum: 0.000000 2023-10-25 10:05:23,054 epoch 8 - iter 2888/3617 - loss 0.01739202 - time (sec): 181.65 - samples/sec: 1672.88 - lr: 0.000012 - momentum: 0.000000 2023-10-25 10:05:45,686 epoch 8 - iter 3249/3617 - loss 0.01664947 - time (sec): 204.28 - samples/sec: 1672.12 - lr: 0.000012 - momentum: 0.000000 2023-10-25 10:06:08,304 epoch 8 - iter 3610/3617 - loss 0.01666693 - time (sec): 226.90 - samples/sec: 1671.47 - lr: 0.000011 - momentum: 0.000000 2023-10-25 10:06:08,723 ---------------------------------------------------------------------------------------------------- 2023-10-25 10:06:08,724 EPOCH 8 done: loss 0.0166 - lr: 0.000011 2023-10-25 10:06:14,036 DEV : loss 0.3784872889518738 - f1-score (micro avg) 0.6276 2023-10-25 10:06:14,060 saving best model 2023-10-25 10:06:14,654 ---------------------------------------------------------------------------------------------------- 2023-10-25 10:06:37,630 epoch 9 - iter 361/3617 - loss 0.00871238 - time (sec): 22.97 - samples/sec: 1699.47 - lr: 0.000011 - momentum: 0.000000 2023-10-25 10:07:00,371 epoch 9 - iter 722/3617 - loss 0.01127845 - time (sec): 45.72 - samples/sec: 1713.82 - lr: 0.000010 - momentum: 0.000000 2023-10-25 10:07:22,893 epoch 9 - iter 1083/3617 - loss 0.00954485 - time (sec): 68.24 - samples/sec: 1700.01 - lr: 0.000009 - momentum: 0.000000 2023-10-25 10:07:45,389 epoch 9 - iter 1444/3617 - loss 0.01037918 - time (sec): 90.73 - samples/sec: 1681.78 - lr: 0.000009 - momentum: 0.000000 2023-10-25 10:08:08,359 epoch 9 - iter 1805/3617 - loss 0.01022566 - time (sec): 113.70 - samples/sec: 1690.66 - lr: 0.000008 - momentum: 0.000000 2023-10-25 10:08:30,955 epoch 9 - iter 2166/3617 - loss 0.01048162 - time (sec): 136.30 - samples/sec: 1681.17 - lr: 0.000008 - momentum: 0.000000 2023-10-25 10:08:53,592 epoch 9 - iter 2527/3617 - loss 0.01062263 - time (sec): 158.94 - samples/sec: 1674.84 - lr: 0.000007 - momentum: 0.000000 2023-10-25 10:09:16,256 epoch 9 - iter 2888/3617 - loss 0.01069743 - time (sec): 181.60 - samples/sec: 1675.08 - lr: 0.000007 - momentum: 0.000000 2023-10-25 10:09:38,894 epoch 9 - iter 3249/3617 - loss 0.01061481 - time (sec): 204.24 - samples/sec: 1674.70 - lr: 0.000006 - momentum: 0.000000 2023-10-25 10:10:01,366 epoch 9 - iter 3610/3617 - loss 0.01090475 - time (sec): 226.71 - samples/sec: 1671.60 - lr: 0.000006 - momentum: 0.000000 2023-10-25 10:10:01,837 ---------------------------------------------------------------------------------------------------- 2023-10-25 10:10:01,837 EPOCH 9 done: loss 0.0109 - lr: 0.000006 2023-10-25 10:10:07,142 DEV : loss 0.36856386065483093 - f1-score (micro avg) 0.6283 2023-10-25 10:10:07,165 saving best model 2023-10-25 10:10:07,759 ---------------------------------------------------------------------------------------------------- 2023-10-25 10:10:30,329 epoch 10 - iter 361/3617 - loss 0.00788874 - time (sec): 22.57 - samples/sec: 1693.26 - lr: 0.000005 - momentum: 0.000000 2023-10-25 10:10:52,894 epoch 10 - iter 722/3617 - loss 0.00932594 - time (sec): 45.13 - samples/sec: 1692.70 - lr: 0.000004 - momentum: 0.000000 2023-10-25 10:11:15,621 epoch 10 - iter 1083/3617 - loss 0.00852378 - time (sec): 67.86 - samples/sec: 1672.67 - lr: 0.000004 - momentum: 0.000000 2023-10-25 10:11:38,372 epoch 10 - iter 1444/3617 - loss 0.00785181 - time (sec): 90.61 - samples/sec: 1676.36 - lr: 0.000003 - momentum: 0.000000 2023-10-25 10:12:00,876 epoch 10 - iter 1805/3617 - loss 0.00792484 - time (sec): 113.12 - samples/sec: 1667.79 - lr: 0.000003 - momentum: 0.000000 2023-10-25 10:12:23,473 epoch 10 - iter 2166/3617 - loss 0.00740207 - time (sec): 135.71 - samples/sec: 1666.96 - lr: 0.000002 - momentum: 0.000000 2023-10-25 10:12:46,105 epoch 10 - iter 2527/3617 - loss 0.00754265 - time (sec): 158.34 - samples/sec: 1660.76 - lr: 0.000002 - momentum: 0.000000 2023-10-25 10:13:08,972 epoch 10 - iter 2888/3617 - loss 0.00739471 - time (sec): 181.21 - samples/sec: 1665.43 - lr: 0.000001 - momentum: 0.000000 2023-10-25 10:13:31,727 epoch 10 - iter 3249/3617 - loss 0.00769671 - time (sec): 203.97 - samples/sec: 1670.47 - lr: 0.000001 - momentum: 0.000000 2023-10-25 10:13:54,401 epoch 10 - iter 3610/3617 - loss 0.00741003 - time (sec): 226.64 - samples/sec: 1674.04 - lr: 0.000000 - momentum: 0.000000 2023-10-25 10:13:54,800 ---------------------------------------------------------------------------------------------------- 2023-10-25 10:13:54,801 EPOCH 10 done: loss 0.0074 - lr: 0.000000 2023-10-25 10:13:59,572 DEV : loss 0.4016081988811493 - f1-score (micro avg) 0.626 2023-10-25 10:14:00,063 ---------------------------------------------------------------------------------------------------- 2023-10-25 10:14:00,063 Loading model from best epoch ... 2023-10-25 10:14:01,739 SequenceTagger predicts: Dictionary with 13 tags: O, S-loc, B-loc, E-loc, I-loc, S-pers, B-pers, E-pers, I-pers, S-org, B-org, E-org, I-org 2023-10-25 10:14:07,996 Results: - F-score (micro) 0.6347 - F-score (macro) 0.5196 - Accuracy 0.4797 By class: precision recall f1-score support loc 0.6325 0.7310 0.6782 591 pers 0.5637 0.7563 0.6459 357 org 0.2289 0.2405 0.2346 79 micro avg 0.5791 0.7020 0.6347 1027 macro avg 0.4750 0.5759 0.5196 1027 weighted avg 0.5775 0.7020 0.6328 1027 2023-10-25 10:14:07,996 ----------------------------------------------------------------------------------------------------