{ "architectures": ["RobertaForSequenceClassification"], "model_type": "roberta", "num_classes": 2, "num_attention_heads": 16, "num_hidden_layers": 24, "hidden_size": 1024, "intermediate_size": 4096, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "attention_probs_dropout_prob": 0.1, "max_position_embeddings": 514, "type_vocab_size": 1, "initializer_range": 0.02, "layer_norm_eps": 1e-05, "classifier_dropout": null, "task_specific_params": { "sentence_prediction": { "dropout": 0.1, "classifier_dropout": 0.1 } }, "lr": 5e-05, "total_num_updates": 7812, "warmup_updates": 469, "classification_head_name": "imdb_head", "optimizer": "adam", "adam_betas": [0.9, 0.98], "adam_eps": 1e-06, "weight_decay": 0.1, "learning_rate_scheduler": "polynomial_decay", "max_sentences": 32, "max_tokens": 4400, "batch_size": 32, "init_token": 0, "separator_token": 2 }