{
  "architectures": ["RobertaForSequenceClassification"],
  "model_type": "roberta",
  "num_classes": 2,
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "hidden_size": 1024,
  "intermediate_size": 4096,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "attention_probs_dropout_prob": 0.1,
  "max_position_embeddings": 514,
  "type_vocab_size": 1,
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "classifier_dropout": null,
  "task_specific_params": {
    "sentence_prediction": {
      "dropout": 0.1,
      "classifier_dropout": 0.1
    }
  },
  "lr": 1e-05,
  "total_num_updates": 7812,
  "warmup_updates": 469,
  "classification_head_name": "imdb_head",
  "optimizer": "adam",
  "adam_betas": [0.9, 0.98],
  "adam_eps": 1e-06,
  "weight_decay": 0.1,
  "learning_rate_scheduler": "polynomial_decay",
  "max_sentences": 32,
  "max_tokens": 4400,
  "batch_size": 32,
  "init_token": 0,
  "separator_token": 2
}
|