{
  "dataset_reader": {
    "type": "multitask",
    "readers": {
      "ud": {
        "type": "universal_dependencies",
        "token_indexers": {
          "transformer": {
            "type": "pretrained_transformer_mismatched",
            "max_length": 512,
            "model_name": "MLRS/BERTu"
          }
        }
      }
    }
  },
  "model": {
    "type": "multitask",
    "arg_name_mapping": {
      "backbone": {
        "tokens": "text",
        "words": "text"
      }
    },
    "backbone": {
      "type": "embedder_and_mask",
      "text_field_embedder": {
        "token_embedders": {
          "transformer": {
            "type": "pretrained_transformer_mismatched_with_dropout",
            "last_layer_only": false,
            "layer_dropout": 0.1,
            "max_length": 512,
            "model_name": "MLRS/BERTu",
            "tokenizer_kwargs": {},
            "train_parameters": true
          }
        }
      }
    },
    "heads": {
      "ud": {
        "type": "biaffine_parser",
        "arc_representation_dim": 100,
        "dropout": 0.3,
        "encoder": {
          "type": "pass_through",
          "input_dim": 768
        },
        "initializer": {
          "regexes": [
            [
              ".*projection.*weight",
              {
                "type": "xavier_uniform"
              }
            ],
            [
              ".*projection.*bias",
              {
                "type": "zero"
              }
            ],
            [
              ".*tag_bilinear.*weight",
              {
                "type": "xavier_uniform"
              }
            ],
            [
              ".*tag_bilinear.*bias",
              {
                "type": "zero"
              }
            ],
            [
              ".*weight_ih.*",
              {
                "type": "xavier_uniform"
              }
            ],
            [
              ".*weight_hh.*",
              {
                "type": "orthogonal"
              }
            ],
            [
              ".*bias_ih.*",
              {
                "type": "zero"
              }
            ],
            [
              ".*bias_hh.*",
              {
                "type": "lstm_hidden_bias"
              }
            ]
          ]
        },
        "input_dropout": 0.3,
        "tag_representation_dim": 100,
        "use_mst_decoding_for_validation": true
      }
    }
  },
  "train_data_path": {
    "ud": "ud-treebanks-v2.8/UD_Maltese-MUDT/mt_mudt-ud-train.conllu"
  },
  "validation_data_path": {
    "ud": "ud-treebanks-v2.8/UD_Maltese-MUDT/mt_mudt-ud-dev.conllu"
  },
  "trainer": {
    "callbacks": [
      {
        "tensorboard_writer": {
          "should_log_learning_rate": true,
          "should_log_parameter_statistics": true
        },
        "type": "tensorboard"
      }
    ],
    "cuda_device": 0,
    "grad_norm": 5,
    "learning_rate_scheduler": {
      "type": "ulmfit_sqrt",
      "affected_group_count": 2,
      "decay_factor": 0.05,
      "discriminative_fine_tuning": true,
      "factor": 5,
      "gradual_unfreezing": true,
      "model_size": 1,
      "start_step": 9,
      "warmup_steps": 9
    },
    "num_epochs": 200,
    "optimizer": {
      "type": "huggingface_adamw",
      "betas": [
        0.9,
        0.999
      ],
      "correct_bias": false,
      "lr": 0.0005,
      "parameter_groups": [
        [
          [
            "text_field_embedder.*transformer_model.embeddings.*_embeddings.*",
            "text_field_embedder.*transformer_model.encoder.*.(key|query|value|dense).weight"
          ],
          {}
        ],
        [
          [
            "text_field_embedder.*transformer_model.embeddings.LayerNorm.*",
            "text_field_embedder.*transformer_model.encoder.*.output.LayerNorm.*",
            "text_field_embedder.*transformer_model.encoder.*.(key|query|value|dense).bias",
            "text_field_embedder.*transformer_model.pooler.dense.bias"
          ],
          {
            "weight_decay": 0
          }
        ],
        [
          [
            "text_field_embedder.*._scalar_mix.*",
            "text_field_embedder.*transformer_model.pooler.dense.weight",
            "_head_sentinel",
            "head_arc_feedforward._linear_layers.*.weight",
            "child_arc_feedforward._linear_layers.*.weight",
            "head_tag_feedforward._linear_layers.*.weight",
            "child_tag_feedforward._linear_layers.*.weight",
            "arc_attention._weight_matrix",
            "tag_bilinear.weight",
            "tag_projection_layer._module.weight",
            "crf",
            "linear.weight",
            "tagger_linear.weight"
          ],
          {}
        ],
        [
          [
            "head_arc_feedforward._linear_layers.*.bias",
            "child_arc_feedforward._linear_layers.*.bias",
            "head_tag_feedforward._linear_layers.*.bias",
            "child_tag_feedforward._linear_layers.*.bias",
            "arc_attention._bias",
            "tag_bilinear.bias",
            "tag_projection_layer._module.bias",
            "linear.bias",
            "tagger_linear.bias"
          ],
          {
            "weight_decay": 0
          }
        ]
      ],
      "weight_decay": 0.01
    },
    "patience": 20,
    "validation_metric": [
      "+ud_LAS"
    ]
  },
  "data_loader": {
    "type": "multitask",
    "scheduler": {
      "type": "unbalanced_homogeneous_roundrobin",
      "batch_size": 128,
      "dataset_sizes": {
        "ud": 1123
      }
    },
    "shuffle": true
  },
  "numpy_seed": 2460,
  "pytorch_seed": 246,
  "random_seed": 24601,
  "validation_data_loader": {
    "type": "multitask",
    "scheduler": {
      "type": "homogeneous_roundrobin",
      "batch_size": 128
    },
    "shuffle": true
  }
}