""" |
|
RoBERTa: A Robustly Optimized BERT Pretraining Approach. |
|
""" |
|
|
|
import logging |
|
|
|
import torch |
|
import torch.nn as nn |
|
import torch.nn.functional as F |
|
from fairseq import utils |
|
from fairseq.model_parallel.models.transformer import ModelParallelTransformerEncoder |
|
from fairseq.models import register_model, register_model_architecture |
|
from fairseq.models.roberta import ( |
|
roberta_base_architecture, |
|
roberta_prenorm_architecture, |
|
RobertaEncoder, |
|
RobertaModel, |
|
) |
|
from fairseq.modules import LayerNorm |
|
|
|
|
|
# Megatron's model-parallel primitives live in an optional git submodule
# (fairseq/model_parallel/megatron); fall back gracefully if it has not
# been checked out.
try:
    from fairseq.model_parallel.megatron.mpu import (
        copy_to_model_parallel_region,
        gather_from_model_parallel_region,
        ColumnParallelLinear,
        VocabParallelEmbedding,
    )

    has_megatron_submodule = True
except (ImportError, ModuleNotFoundError):
    has_megatron_submodule = False

logger = logging.getLogger(__name__)


@register_model("model_parallel_roberta")
class ModelParallelRobertaModel(RobertaModel):
    def __init__(self, args, encoder):
        super().__init__(args, encoder)

        self.classification_heads = nn.ModuleDict()

    @staticmethod
    def add_args(parser):
        RobertaModel.add_args(parser)
        parser.add_argument(
            "--no-final-layer-norm",
            action="store_true",
            help=(
                "don't add final layernorm (only applicable when "
                "--encoder-normalize-before=True)"
            ),
        )

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""

        # make sure all arguments are present
        base_architecture(args)

        # pad the vocabulary so that VocabParallelEmbedding can split it
        # evenly across model-parallel workers
        task.source_dictionary.pad_to_multiple_(args.model_parallel_size * 8)
        task.target_dictionary.pad_to_multiple_(args.model_parallel_size * 8)

        if not hasattr(args, "max_positions"):
            args.max_positions = args.tokens_per_sample

        if getattr(args, "untie_weights_roberta", False):
            raise NotImplementedError(
                "--untie-weights-roberta is not supported in model parallel mode"
            )

        encoder = ModelParallelRobertaEncoder(args, task.source_dictionary)
        return cls(args, encoder)

    def forward(
        self,
        src_tokens,
        features_only=False,
        return_all_hiddens=False,
        classification_head_name=None,
        **kwargs
    ):
        if classification_head_name is not None:
            features_only = True

        x, extra = self.encoder(src_tokens, features_only, return_all_hiddens, **kwargs)

        if classification_head_name is not None:
            x = self.classification_heads[classification_head_name](x)
        return x, extra

    def register_classification_head(
        self, name, num_classes=None, inner_dim=None, **kwargs
    ):
        """Register a classification head."""
        if name in self.classification_heads:
            prev_num_classes = self.classification_heads[name].out_proj.out_features
            prev_inner_dim = self.classification_heads[name].dense.out_features
            if num_classes != prev_num_classes or inner_dim != prev_inner_dim:
                logger.warning(
                    're-registering head "{}" with num_classes {} (prev: {}) '
                    "and inner_dim {} (prev: {})".format(
                        name, num_classes, prev_num_classes, inner_dim, prev_inner_dim
                    )
                )
        self.classification_heads[name] = ModelParallelRobertaClassificationHead(
            self.args.encoder_embed_dim,
            inner_dim or self.args.encoder_embed_dim,
            num_classes,
            self.args.pooler_activation_fn,
            self.args.pooler_dropout,
        )
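

# A minimal, hypothetical sketch of how a sentence-classification head is
# attached to and queried on a ModelParallelRobertaModel; the head name
# ("sst2"), num_classes=2, and the `model`/`src_tokens` variables below are
# illustrative placeholders, not part of this module:
#
#     model.register_classification_head("sst2", num_classes=2)
#     logits, _ = model(src_tokens, classification_head_name="sst2")

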
class ModelParallelRobertaLMHead(nn.Module):
    """Head for masked language modeling."""

    def __init__(self, embed_dim, output_dim, activation_fn, weight=None):
        super().__init__()
        self.dense = ColumnParallelLinear(embed_dim, embed_dim, gather_output=True)
        self.activation_fn = utils.get_activation_fn(activation_fn)
        self.layer_norm = LayerNorm(embed_dim)

        if weight is None:
            weight = nn.Linear(embed_dim, output_dim, bias=False).weight
        self.weight = weight
        self.bias = nn.Parameter(torch.zeros(output_dim))

    def forward(self, features, masked_tokens=None, **kwargs):
        # Only project the unmasked tokens while training,
        # saves both memory and computation
        if masked_tokens is not None:
            features = features[masked_tokens, :]

        x = self.dense(features)
        x = self.activation_fn(x)
        x = self.layer_norm(x)

        # project back to size of vocabulary with bias
        x = copy_to_model_parallel_region(x)
        x = F.linear(x, self.weight)
        x = gather_from_model_parallel_region(x).contiguous()
        x = x + self.bias
        return x
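

# Informal note on ModelParallelRobertaLMHead.forward (added summary, not part
# of the original fairseq code): when the output weight is the tied,
# vocab-parallel embedding matrix, each model-parallel rank holds a
# [vocab_shard, embed_dim] slice, so F.linear produces logits only for that
# rank's shard of the vocabulary; copy_to_model_parallel_region makes the
# backward pass all-reduce the feature gradients across ranks, and
# gather_from_model_parallel_region concatenates the shards so every rank
# returns full-vocabulary logits.

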
class ModelParallelRobertaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(
        self, input_dim, inner_dim, num_classes, activation_fn, pooler_dropout
    ):
        super().__init__()
        self.dense = ColumnParallelLinear(input_dim, inner_dim, gather_output=True)
        self.activation_fn = utils.get_activation_fn(activation_fn)
        self.dropout = nn.Dropout(p=pooler_dropout)
        self.out_proj = nn.Linear(inner_dim, num_classes)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = self.activation_fn(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class ModelParallelRobertaEncoder(RobertaEncoder):
    """RoBERTa encoder."""

    def __init__(self, args, dictionary):
        super().__init__(args, dictionary)
        assert not self.args.untie_weights_roberta

    def build_embedding(self, vocab_size, embedding_dim, padding_idx):
        # shard the token embedding matrix along the vocabulary dimension
        return VocabParallelEmbedding(vocab_size, embedding_dim, padding_idx)

    def build_encoder(self, args, dictionary, embed_tokens):
        return ModelParallelTransformerEncoder(args, dictionary, embed_tokens)

    def build_lm_head(self, embed_dim, output_dim, activation_fn, weight):
        return ModelParallelRobertaLMHead(embed_dim, output_dim, activation_fn, weight)


@register_model_architecture("model_parallel_roberta", "model_parallel_roberta")
def base_architecture(args):
    args.no_final_layer_norm = getattr(args, "no_final_layer_norm", False)
    # model parallel RoBERTa defaults to the "Pre-LN" formulation
    roberta_prenorm_architecture(args)


# earlier versions of model parallel RoBERTa didn't add the final layer norm
@register_model_architecture("model_parallel_roberta", "model_parallel_roberta_v1")
def model_parallel_roberta_v1_architecture(args):
    args.no_final_layer_norm = getattr(args, "no_final_layer_norm", True)
    base_architecture(args)


@register_model_architecture(
    "model_parallel_roberta", "model_parallel_roberta_postnorm"
)
def model_parallel_roberta_postnorm_architecture(args):
    # the original BERT/RoBERTa uses the "Post-LN" formulation
    roberta_base_architecture(args)


@register_model_architecture("model_parallel_roberta", "model_parallel_roberta_base")
def model_parallel_roberta_base_architecture(args):
    base_architecture(args)


@register_model_architecture("model_parallel_roberta", "model_parallel_roberta_large")
def model_parallel_roberta_large_architecture(args):
    args.encoder_layers = getattr(args, "encoder_layers", 24)
    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096)
    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
    base_architecture(args)
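

# A rough, untested sketch of how the architectures registered above are
# selected from the command line; the data path, hyper-parameters, and
# parallel size below are placeholders, and --model-parallel-size assumes
# fairseq's Megatron submodule has been initialized:
#
#     fairseq-train /path/to/data \
#         --task masked_lm --criterion masked_lm \
#         --arch model_parallel_roberta_large \
#         --model-parallel-size 2 \
#         --tokens-per-sample 512 --batch-size 16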