diff --git a/fairseq/fairseq/models/__pycache__/__init__.cpython-310.pyc b/fairseq/fairseq/models/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c29dd14f82d34f6072ab9e5953d11d23e3a38402 Binary files /dev/null and b/fairseq/fairseq/models/__pycache__/__init__.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/__pycache__/composite_encoder.cpython-310.pyc b/fairseq/fairseq/models/__pycache__/composite_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eeb9fb1211330500148bc3c3ed506ade646edf86 Binary files /dev/null and b/fairseq/fairseq/models/__pycache__/composite_encoder.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/__pycache__/distributed_fairseq_model.cpython-310.pyc b/fairseq/fairseq/models/__pycache__/distributed_fairseq_model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..438b5e9fe338cbe863f63ed006eb11570700db2b Binary files /dev/null and b/fairseq/fairseq/models/__pycache__/distributed_fairseq_model.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/__pycache__/fairseq_decoder.cpython-310.pyc b/fairseq/fairseq/models/__pycache__/fairseq_decoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4359178e9ae1ef9ddabe8f73d66fc2ea7bcdcdcc Binary files /dev/null and b/fairseq/fairseq/models/__pycache__/fairseq_decoder.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/__pycache__/fairseq_encoder.cpython-310.pyc b/fairseq/fairseq/models/__pycache__/fairseq_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84efd36b9375eb40a146d926ae2287eb12b63885 Binary files /dev/null and b/fairseq/fairseq/models/__pycache__/fairseq_encoder.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/__pycache__/fairseq_incremental_decoder.cpython-310.pyc b/fairseq/fairseq/models/__pycache__/fairseq_incremental_decoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..31d8349040d17414bc781f1f55663f173ba23558 Binary files /dev/null and b/fairseq/fairseq/models/__pycache__/fairseq_incremental_decoder.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/__pycache__/fairseq_model.cpython-310.pyc b/fairseq/fairseq/models/__pycache__/fairseq_model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f2f3d3941c9075ce8ca6186145e5608004142958 Binary files /dev/null and b/fairseq/fairseq/models/__pycache__/fairseq_model.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/__pycache__/fconv.cpython-310.pyc b/fairseq/fairseq/models/__pycache__/fconv.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4f1161b26fc4df12e431f3275fa9f0d59feadd0 Binary files /dev/null and b/fairseq/fairseq/models/__pycache__/fconv.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/__pycache__/fconv_lm.cpython-310.pyc b/fairseq/fairseq/models/__pycache__/fconv_lm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e938e4d029f6e2509cbd9437b132814c5dbca69f Binary files /dev/null and b/fairseq/fairseq/models/__pycache__/fconv_lm.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/__pycache__/fconv_self_att.cpython-310.pyc b/fairseq/fairseq/models/__pycache__/fconv_self_att.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6db497c31c9c552b9af9af60a341beea203e4476 Binary files /dev/null and b/fairseq/fairseq/models/__pycache__/fconv_self_att.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/__pycache__/lightconv.cpython-310.pyc b/fairseq/fairseq/models/__pycache__/lightconv.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6afa441f08d276101a2b51c28b965037c5c8487b Binary files /dev/null and b/fairseq/fairseq/models/__pycache__/lightconv.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/__pycache__/lightconv_lm.cpython-310.pyc b/fairseq/fairseq/models/__pycache__/lightconv_lm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b51fb4f3a943f01bb73c7ee8f28b7cab1f909f3b Binary files /dev/null and b/fairseq/fairseq/models/__pycache__/lightconv_lm.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/__pycache__/lstm.cpython-310.pyc b/fairseq/fairseq/models/__pycache__/lstm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c0ca1f14ecf455a5d21a08ef63e086ab212461bd Binary files /dev/null and b/fairseq/fairseq/models/__pycache__/lstm.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/__pycache__/lstm_lm.cpython-310.pyc b/fairseq/fairseq/models/__pycache__/lstm_lm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bbdde4d48eac2809a3cc3f256d883b4dcad99750 Binary files /dev/null and b/fairseq/fairseq/models/__pycache__/lstm_lm.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/__pycache__/masked_lm.cpython-310.pyc b/fairseq/fairseq/models/__pycache__/masked_lm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c12838cdc3005a2c30057df3da388080884f6b8f Binary files /dev/null and b/fairseq/fairseq/models/__pycache__/masked_lm.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/__pycache__/model_utils.cpython-310.pyc b/fairseq/fairseq/models/__pycache__/model_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76565ee25cb7e7a1fa5063287a28bad46b5d13c6 Binary files /dev/null and b/fairseq/fairseq/models/__pycache__/model_utils.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/__pycache__/multilingual_transformer.cpython-310.pyc b/fairseq/fairseq/models/__pycache__/multilingual_transformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65b2ccaaf8a3e9c38361e6d62736c0aff5ca1f14 Binary files /dev/null and b/fairseq/fairseq/models/__pycache__/multilingual_transformer.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/__pycache__/transformer_align.cpython-310.pyc b/fairseq/fairseq/models/__pycache__/transformer_align.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4502290033752142b9f3b3ec9df8fa53500ef1ba Binary files /dev/null and b/fairseq/fairseq/models/__pycache__/transformer_align.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/__pycache__/transformer_from_pretrained_xlm.cpython-310.pyc b/fairseq/fairseq/models/__pycache__/transformer_from_pretrained_xlm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6e84908ddbabf7b84f626ea094c4545e40a7b1b7 Binary files /dev/null and b/fairseq/fairseq/models/__pycache__/transformer_from_pretrained_xlm.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/__pycache__/transformer_lm.cpython-310.pyc b/fairseq/fairseq/models/__pycache__/transformer_lm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd19aa0a8549054de02ecdc6409e3e6377a3f46a Binary files /dev/null and b/fairseq/fairseq/models/__pycache__/transformer_lm.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/__pycache__/transformer_ulm.cpython-310.pyc b/fairseq/fairseq/models/__pycache__/transformer_ulm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd63b127a01fe4319422ca8f7e589b0fea551619 Binary files /dev/null and b/fairseq/fairseq/models/__pycache__/transformer_ulm.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/text_to_speech/__pycache__/codehifigan.cpython-310.pyc b/fairseq/fairseq/models/text_to_speech/__pycache__/codehifigan.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9cf6eca6b68ee9a35b81a2850850fe958eaed3a0 Binary files /dev/null and b/fairseq/fairseq/models/text_to_speech/__pycache__/codehifigan.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/text_to_speech/__pycache__/fastspeech2.cpython-310.pyc b/fairseq/fairseq/models/text_to_speech/__pycache__/fastspeech2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..73b31b0e8f6fded8646d31a2798f8762a35a9509 Binary files /dev/null and b/fairseq/fairseq/models/text_to_speech/__pycache__/fastspeech2.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/text_to_speech/__pycache__/hifigan.cpython-310.pyc b/fairseq/fairseq/models/text_to_speech/__pycache__/hifigan.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e807d01b7e838731daedf249c8db338e2cb6291 Binary files /dev/null and b/fairseq/fairseq/models/text_to_speech/__pycache__/hifigan.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/text_to_speech/__pycache__/hub_interface.cpython-310.pyc b/fairseq/fairseq/models/text_to_speech/__pycache__/hub_interface.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..64082e5e64b9fc7d2166bc6ca8ddb6230f55500e Binary files /dev/null and b/fairseq/fairseq/models/text_to_speech/__pycache__/hub_interface.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/text_to_speech/__pycache__/tts_transformer.cpython-310.pyc b/fairseq/fairseq/models/text_to_speech/__pycache__/tts_transformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..427154f71633cbb4ac661fe03b9818606c1d32d1 Binary files /dev/null and b/fairseq/fairseq/models/text_to_speech/__pycache__/tts_transformer.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/text_to_speech/__pycache__/vocoder.cpython-310.pyc b/fairseq/fairseq/models/text_to_speech/__pycache__/vocoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1294811cdaa0929de5496f652540e90e14c4c157 Binary files /dev/null and b/fairseq/fairseq/models/text_to_speech/__pycache__/vocoder.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/text_to_speech/tts_transformer.py b/fairseq/fairseq/models/text_to_speech/tts_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..19afc2b717b3036805e9d9bcfdb9a366b9cc19a5 --- /dev/null +++ b/fairseq/fairseq/models/text_to_speech/tts_transformer.py @@ -0,0 +1,454 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from typing import List, Optional + +import torch +from torch import nn + +from fairseq import utils +from fairseq.data.data_utils import lengths_to_padding_mask +from fairseq.models import ( + FairseqEncoder, + FairseqEncoderDecoderModel, + FairseqIncrementalDecoder, + register_model, + register_model_architecture, +) +from fairseq.models.text_to_speech.hub_interface import TTSHubInterface +from fairseq.models.text_to_speech.tacotron2 import Postnet, Prenet +from fairseq.modules import ( + FairseqDropout, + LayerNorm, + PositionalEmbedding, + TransformerDecoderLayer, + TransformerEncoderLayer, +) + +logger = logging.getLogger(__name__) + + +def encoder_init(m): + if isinstance(m, nn.Conv1d): + nn.init.xavier_uniform_(m.weight, torch.nn.init.calculate_gain("relu")) + + +def Embedding(num_embeddings, embedding_dim): + m = nn.Embedding(num_embeddings, embedding_dim) + nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5) + return m + + +class TTSTransformerEncoder(FairseqEncoder): + def __init__(self, args, src_dict, embed_speaker): + super().__init__(src_dict) + self.padding_idx = src_dict.pad() + self.embed_speaker = embed_speaker + self.spk_emb_proj = None + if embed_speaker is not None: + self.spk_emb_proj = nn.Linear( + args.encoder_embed_dim + args.speaker_embed_dim, args.encoder_embed_dim + ) + + self.dropout_module = FairseqDropout( + p=args.dropout, module_name=self.__class__.__name__ + ) + self.embed_tokens = nn.Embedding( + len(src_dict), args.encoder_embed_dim, padding_idx=self.padding_idx + ) + assert args.encoder_conv_kernel_size % 2 == 1 + self.prenet = nn.ModuleList( + nn.Sequential( + nn.Conv1d( + args.encoder_embed_dim, + args.encoder_embed_dim, + kernel_size=args.encoder_conv_kernel_size, + padding=((args.encoder_conv_kernel_size - 1) // 2), + ), + nn.BatchNorm1d(args.encoder_embed_dim), + nn.ReLU(), + nn.Dropout(args.encoder_dropout), + ) + for _ in range(args.encoder_conv_layers) + ) + self.prenet_proj = nn.Linear(args.encoder_embed_dim, args.encoder_embed_dim) + self.embed_positions = PositionalEmbedding( + args.max_source_positions, args.encoder_embed_dim, self.padding_idx + ) + self.pos_emb_alpha = nn.Parameter(torch.ones(1)) + + self.transformer_layers = nn.ModuleList( + TransformerEncoderLayer(args) + for _ in range(args.encoder_transformer_layers) + ) + if args.encoder_normalize_before: + self.layer_norm = LayerNorm(args.encoder_embed_dim) + else: + self.layer_norm = None + + self.apply(encoder_init) + + def forward(self, src_tokens, src_lengths=None, speaker=None, **kwargs): + x = self.embed_tokens(src_tokens) + x = x.transpose(1, 2).contiguous() # B x T x C -> B x C x T + for conv in self.prenet: + x = conv(x) + x = x.transpose(1, 2).contiguous() # B x C x T -> B x T x C + x = self.prenet_proj(x) + + padding_mask = src_tokens.eq(self.padding_idx) + positions = self.embed_positions(padding_mask) + x += self.pos_emb_alpha * positions + x = self.dropout_module(x) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + for layer in self.transformer_layers: + x = layer(x, padding_mask) + + if self.layer_norm is not None: + x = self.layer_norm(x) + + if self.embed_speaker is not None: + seq_len, bsz, _ = x.size() + emb = self.embed_speaker(speaker).transpose(0, 1) + emb = emb.expand(seq_len, bsz, -1) + x = self.spk_emb_proj(torch.cat([x, emb], dim=2)) + + return { + "encoder_out": [x], # T x B x C + "encoder_padding_mask": [padding_mask] + if padding_mask.any() + else [], # B x T + "encoder_embedding": [], # B x T x C + "encoder_states": [], # List[T x B x C] + "src_tokens": [], + "src_lengths": [], + } + + +def decoder_init(m): + if isinstance(m, torch.nn.Conv1d): + nn.init.xavier_uniform_(m.weight, torch.nn.init.calculate_gain("tanh")) + + +class TTSTransformerDecoder(FairseqIncrementalDecoder): + def __init__(self, args, src_dict, padding_idx=1): + super().__init__(None) + self._future_mask = torch.empty(0) + + self.args = args + self.padding_idx = src_dict.pad() if src_dict else padding_idx + self.n_frames_per_step = args.n_frames_per_step + self.out_dim = args.output_frame_dim * args.n_frames_per_step + + self.dropout_module = FairseqDropout( + args.dropout, module_name=self.__class__.__name__ + ) + self.embed_positions = PositionalEmbedding( + args.max_target_positions, args.decoder_embed_dim, self.padding_idx + ) + self.pos_emb_alpha = nn.Parameter(torch.ones(1)) + self.prenet = nn.Sequential( + Prenet( + self.out_dim, args.prenet_layers, args.prenet_dim, args.prenet_dropout + ), + nn.Linear(args.prenet_dim, args.decoder_embed_dim), + ) + + self.n_transformer_layers = args.decoder_transformer_layers + self.transformer_layers = nn.ModuleList( + TransformerDecoderLayer(args) for _ in range(self.n_transformer_layers) + ) + if args.decoder_normalize_before: + self.layer_norm = LayerNorm(args.decoder_embed_dim) + else: + self.layer_norm = None + + self.feat_proj = nn.Linear(args.decoder_embed_dim, self.out_dim) + self.eos_proj = nn.Linear(args.decoder_embed_dim, 1) + + self.postnet = Postnet( + self.out_dim, + args.postnet_conv_dim, + args.postnet_conv_kernel_size, + args.postnet_layers, + args.postnet_dropout, + ) + + self.ctc_proj = None + if getattr(args, "ctc_weight", 0.0) > 0.0: + self.ctc_proj = nn.Linear(self.out_dim, len(src_dict)) + + self.apply(decoder_init) + + def extract_features( + self, + prev_outputs, + encoder_out=None, + incremental_state=None, + target_lengths=None, + speaker=None, + **kwargs, + ): + alignment_layer = self.n_transformer_layers - 1 + self_attn_padding_mask = lengths_to_padding_mask(target_lengths) + positions = self.embed_positions( + self_attn_padding_mask, incremental_state=incremental_state + ) + + if incremental_state is not None: + prev_outputs = prev_outputs[:, -1:, :] + self_attn_padding_mask = self_attn_padding_mask[:, -1:] + if positions is not None: + positions = positions[:, -1:] + + x = self.prenet(prev_outputs) + x += self.pos_emb_alpha * positions + x = self.dropout_module(x) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + if not self_attn_padding_mask.any(): + self_attn_padding_mask = None + + attn: Optional[torch.Tensor] = None + inner_states: List[Optional[torch.Tensor]] = [x] + for idx, transformer_layer in enumerate(self.transformer_layers): + if incremental_state is None: + self_attn_mask = self.buffered_future_mask(x) + else: + self_attn_mask = None + + x, layer_attn, _ = transformer_layer( + x, + encoder_out["encoder_out"][0] + if (encoder_out is not None and len(encoder_out["encoder_out"]) > 0) + else None, + encoder_out["encoder_padding_mask"][0] + if ( + encoder_out is not None + and len(encoder_out["encoder_padding_mask"]) > 0 + ) + else None, + incremental_state, + self_attn_mask=self_attn_mask, + self_attn_padding_mask=self_attn_padding_mask, + need_attn=bool((idx == alignment_layer)), + need_head_weights=bool((idx == alignment_layer)), + ) + inner_states.append(x) + if layer_attn is not None and idx == alignment_layer: + attn = layer_attn.float().to(x) + + if attn is not None: + # average probabilities over heads, transpose to + # (B, src_len, tgt_len) + attn = attn.mean(dim=0).transpose(2, 1) + + if self.layer_norm is not None: + x = self.layer_norm(x) + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + return x, {"attn": attn, "inner_states": inner_states} + + def forward( + self, + prev_output_tokens, + encoder_out=None, + incremental_state=None, + target_lengths=None, + speaker=None, + **kwargs, + ): + x, extra = self.extract_features( + prev_output_tokens, + encoder_out=encoder_out, + incremental_state=incremental_state, + target_lengths=target_lengths, + speaker=speaker, + **kwargs, + ) + attn = extra["attn"] + feat_out = self.feat_proj(x) + bsz, seq_len, _ = x.size() + eos_out = self.eos_proj(x) + post_feat_out = feat_out + self.postnet(feat_out) + return ( + post_feat_out, + eos_out, + { + "attn": attn, + "feature_out": feat_out, + "inner_states": extra["inner_states"], + }, + ) + + def get_normalized_probs(self, net_output, log_probs, sample): + logits = self.ctc_proj(net_output[2]["feature_out"]) + if log_probs: + return utils.log_softmax(logits.float(), dim=-1) + else: + return utils.softmax(logits.float(), dim=-1) + + def buffered_future_mask(self, tensor): + dim = tensor.size(0) + # self._future_mask.device != tensor.device is not working in TorchScript. This is a workaround. + if ( + self._future_mask.size(0) == 0 + or (not self._future_mask.device == tensor.device) + or self._future_mask.size(0) < dim + ): + self._future_mask = torch.triu( + utils.fill_with_neg_inf(torch.zeros([dim, dim])), 1 + ) + self._future_mask = self._future_mask.to(tensor) + return self._future_mask[:dim, :dim] + + +@register_model("tts_transformer") +class TTSTransformerModel(FairseqEncoderDecoderModel): + """ + Implementation for https://arxiv.org/pdf/1809.08895.pdf + """ + + @classmethod + def hub_models(cls): + base_url = "http://dl.fbaipublicfiles.com/fairseq/s2" + model_ids = [ + "tts_transformer-en-ljspeech", + "tts_transformer-en-200_speaker-cv4", + "tts_transformer-es-css10", + "tts_transformer-fr-cv7_css10", + "tts_transformer-ru-cv7_css10", + "tts_transformer-zh-cv7_css10", + "tts_transformer-ar-cv7_css10", + "tts_transformer-tr-cv7_css10", + "tts_transformer-vi-cv7", + ] + return {i: f"{base_url}/{i}.tar.gz" for i in model_ids} + + @classmethod + def from_pretrained( + cls, + model_name_or_path, + checkpoint_file="model.pt", + data_name_or_path=".", + config_yaml="config.yaml", + vocoder: str = "griffin_lim", + fp16: bool = False, + **kwargs, + ): + from fairseq import hub_utils + + x = hub_utils.from_pretrained( + model_name_or_path, + checkpoint_file, + data_name_or_path, + archive_map=cls.hub_models(), + config_yaml=config_yaml, + vocoder=vocoder, + fp16=fp16, + **kwargs, + ) + return TTSHubInterface(x["args"], x["task"], x["models"][0]) + + @staticmethod + def add_args(parser): + parser.add_argument("--dropout", type=float) + parser.add_argument("--output-frame-dim", type=int) + parser.add_argument("--speaker-embed-dim", type=int) + # encoder prenet + parser.add_argument("--encoder-dropout", type=float) + parser.add_argument("--encoder-conv-layers", type=int) + parser.add_argument("--encoder-conv-kernel-size", type=int) + # encoder transformer layers + parser.add_argument("--encoder-transformer-layers", type=int) + parser.add_argument("--encoder-embed-dim", type=int) + parser.add_argument("--encoder-ffn-embed-dim", type=int) + parser.add_argument("--encoder-normalize-before", action="store_true") + parser.add_argument("--encoder-attention-heads", type=int) + parser.add_argument("--attention-dropout", type=float) + parser.add_argument("--activation-dropout", "--relu-dropout", type=float) + parser.add_argument("--activation-fn", type=str, default="relu") + # decoder prenet + parser.add_argument("--prenet-dropout", type=float) + parser.add_argument("--prenet-layers", type=int) + parser.add_argument("--prenet-dim", type=int) + # decoder postnet + parser.add_argument("--postnet-dropout", type=float) + parser.add_argument("--postnet-layers", type=int) + parser.add_argument("--postnet-conv-dim", type=int) + parser.add_argument("--postnet-conv-kernel-size", type=int) + # decoder transformer layers + parser.add_argument("--decoder-transformer-layers", type=int) + parser.add_argument("--decoder-embed-dim", type=int) + parser.add_argument("--decoder-ffn-embed-dim", type=int) + parser.add_argument("--decoder-normalize-before", action="store_true") + parser.add_argument("--decoder-attention-heads", type=int) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._num_updates = 0 + + @classmethod + def build_model(cls, args, task): + embed_speaker = task.get_speaker_embeddings(args) + encoder = TTSTransformerEncoder(args, task.src_dict, embed_speaker) + decoder = TTSTransformerDecoder(args, task.src_dict) + return cls(encoder, decoder) + + def forward_encoder(self, src_tokens, src_lengths, speaker=None, **kwargs): + return self.encoder( + src_tokens, src_lengths=src_lengths, speaker=speaker, **kwargs + ) + + def set_num_updates(self, num_updates): + super().set_num_updates(num_updates) + self._num_updates = num_updates + + +@register_model_architecture("tts_transformer", "tts_transformer") +def base_architecture(args): + args.dropout = getattr(args, "dropout", 0.1) + args.output_frame_dim = getattr(args, "output_frame_dim", 80) + args.speaker_embed_dim = getattr(args, "speaker_embed_dim", 64) + # encoder prenet + args.encoder_dropout = getattr(args, "encoder_dropout", 0.5) + args.encoder_conv_layers = getattr(args, "encoder_conv_layers", 3) + args.encoder_conv_kernel_size = getattr(args, "encoder_conv_kernel_size", 5) + # encoder transformer layers + args.encoder_transformer_layers = getattr(args, "encoder_transformer_layers", 6) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) + args.encoder_ffn_embed_dim = getattr( + args, "encoder_ffn_embed_dim", 4 * args.encoder_embed_dim + ) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) + args.attention_dropout = getattr(args, "attention_dropout", 0.0) + args.activation_dropout = getattr(args, "activation_dropout", 0.0) + args.activation_fn = getattr(args, "activation_fn", "relu") + # decoder prenet + args.prenet_dropout = getattr(args, "prenet_dropout", 0.5) + args.prenet_layers = getattr(args, "prenet_layers", 2) + args.prenet_dim = getattr(args, "prenet_dim", 256) + # decoder postnet + args.postnet_dropout = getattr(args, "postnet_dropout", 0.5) + args.postnet_layers = getattr(args, "postnet_layers", 5) + args.postnet_conv_dim = getattr(args, "postnet_conv_dim", 512) + args.postnet_conv_kernel_size = getattr(args, "postnet_conv_kernel_size", 5) + # decoder transformer layers + args.decoder_transformer_layers = getattr(args, "decoder_transformer_layers", 6) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) + args.decoder_ffn_embed_dim = getattr( + args, "decoder_ffn_embed_dim", 4 * args.decoder_embed_dim + ) + args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4) diff --git a/fairseq/fairseq/models/transformer/__init__.py b/fairseq/fairseq/models/transformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..681fca3d4553f6832a65f61fc186793bc4ee0679 --- /dev/null +++ b/fairseq/fairseq/models/transformer/__init__.py @@ -0,0 +1,50 @@ +# Copyright (c) Facebook Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +"""isort:skip_file""" + +from .transformer_config import ( + TransformerConfig, + DEFAULT_MAX_SOURCE_POSITIONS, + DEFAULT_MAX_TARGET_POSITIONS, + DEFAULT_MIN_PARAMS_TO_WRAP, +) +from .transformer_decoder import TransformerDecoder, TransformerDecoderBase, Linear +from .transformer_encoder import TransformerEncoder, TransformerEncoderBase +from .transformer_legacy import ( + TransformerModel, + base_architecture, + tiny_architecture, + transformer_iwslt_de_en, + transformer_wmt_en_de, + transformer_vaswani_wmt_en_de_big, + transformer_vaswani_wmt_en_fr_big, + transformer_wmt_en_de_big, + transformer_wmt_en_de_big_t2t, +) +from .transformer_base import TransformerModelBase, Embedding + + +__all__ = [ + "TransformerModelBase", + "TransformerConfig", + "TransformerDecoder", + "TransformerDecoderBase", + "TransformerEncoder", + "TransformerEncoderBase", + "TransformerModel", + "Embedding", + "Linear", + "base_architecture", + "tiny_architecture", + "transformer_iwslt_de_en", + "transformer_wmt_en_de", + "transformer_vaswani_wmt_en_de_big", + "transformer_vaswani_wmt_en_fr_big", + "transformer_wmt_en_de_big", + "transformer_wmt_en_de_big_t2t", + "DEFAULT_MAX_SOURCE_POSITIONS", + "DEFAULT_MAX_TARGET_POSITIONS", + "DEFAULT_MIN_PARAMS_TO_WRAP", +] diff --git a/fairseq/fairseq/models/transformer/__pycache__/__init__.cpython-310.pyc b/fairseq/fairseq/models/transformer/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a465b9d723cdce6d047af54430a1cd638036c9c Binary files /dev/null and b/fairseq/fairseq/models/transformer/__pycache__/__init__.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/transformer/__pycache__/transformer_base.cpython-310.pyc b/fairseq/fairseq/models/transformer/__pycache__/transformer_base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d3be393ca8443859be67932bfb93cbd278f29f60 Binary files /dev/null and b/fairseq/fairseq/models/transformer/__pycache__/transformer_base.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/transformer/__pycache__/transformer_config.cpython-310.pyc b/fairseq/fairseq/models/transformer/__pycache__/transformer_config.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df4389a917ff262b7981f9d656e9054774996221 Binary files /dev/null and b/fairseq/fairseq/models/transformer/__pycache__/transformer_config.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/transformer/__pycache__/transformer_decoder.cpython-310.pyc b/fairseq/fairseq/models/transformer/__pycache__/transformer_decoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f787d29dde2cfa0fcf32bf101745591b51e4b1b8 Binary files /dev/null and b/fairseq/fairseq/models/transformer/__pycache__/transformer_decoder.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/transformer/__pycache__/transformer_decoder_aug.cpython-310.pyc b/fairseq/fairseq/models/transformer/__pycache__/transformer_decoder_aug.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b2e1e7c40b02be1fb486e2d044aa2c0f31a645a Binary files /dev/null and b/fairseq/fairseq/models/transformer/__pycache__/transformer_decoder_aug.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/transformer/__pycache__/transformer_encoder.cpython-310.pyc b/fairseq/fairseq/models/transformer/__pycache__/transformer_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..69375627f63dd14ff3763ca32c523e730c3d6420 Binary files /dev/null and b/fairseq/fairseq/models/transformer/__pycache__/transformer_encoder.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/transformer/__pycache__/transformer_legacy.cpython-310.pyc b/fairseq/fairseq/models/transformer/__pycache__/transformer_legacy.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6de57b672fb664ca564993f6958173e84449ff9b Binary files /dev/null and b/fairseq/fairseq/models/transformer/__pycache__/transformer_legacy.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/transformer/transformer_base.py b/fairseq/fairseq/models/transformer/transformer_base.py new file mode 100644 index 0000000000000000000000000000000000000000..f9f097f04b15ff9d87a7d95c2bbfd9e81a58b813 --- /dev/null +++ b/fairseq/fairseq/models/transformer/transformer_base.py @@ -0,0 +1,193 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict, List, Optional, Tuple + +import torch +import torch.nn as nn +from torch import Tensor + +import logging + +from fairseq import utils +from fairseq.dataclass.utils import gen_parser_from_dataclass +from fairseq.distributed import fsdp_wrap +from fairseq.models import FairseqEncoderDecoderModel +from fairseq.models.transformer import ( + TransformerConfig, + TransformerDecoderBase, + TransformerEncoderBase, +) + + +logger = logging.getLogger(__name__) + + +class TransformerModelBase(FairseqEncoderDecoderModel): + """ + Transformer model from `"Attention Is All You Need" (Vaswani, et al, 2017) + `_. + + Args: + encoder (TransformerEncoder): the encoder + decoder (TransformerDecoder): the decoder + + The Transformer model provides the following named architectures and + command-line arguments: + + .. argparse:: + :ref: fairseq.models.transformer_parser + :prog: + """ + + def __init__(self, cfg, encoder, decoder): + super().__init__(encoder, decoder) + self.cfg = cfg + self.supports_align_args = True + + @classmethod + def add_args(cls, parser): + """Add model-specific arguments to the parser.""" + # we want to build the args recursively in this case. + gen_parser_from_dataclass( + parser, TransformerConfig(), delete_default=False, with_prefix="" + ) + + @classmethod + def build_model(cls, cfg, task): + """Build a new model instance.""" + + # -- TODO T96535332 + # bug caused by interaction between OmegaConf II and argparsing + cfg.decoder.input_dim = int(cfg.decoder.input_dim) + cfg.decoder.output_dim = int(cfg.decoder.output_dim) + # -- + + if cfg.encoder.layers_to_keep: + cfg.encoder.layers = len(cfg.encoder.layers_to_keep.split(",")) + if cfg.decoder.layers_to_keep: + cfg.decoder.layers = len(cfg.decoder.layers_to_keep.split(",")) + + src_dict, tgt_dict = task.source_dictionary, task.target_dictionary + + if cfg.share_all_embeddings: + if src_dict != tgt_dict: + raise ValueError("--share-all-embeddings requires a joined dictionary") + if cfg.encoder.embed_dim != cfg.decoder.embed_dim: + raise ValueError( + "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim" + ) + if cfg.decoder.embed_path and ( + cfg.decoder.embed_path != cfg.encoder.embed_path + ): + raise ValueError( + "--share-all-embeddings not compatible with --decoder-embed-path" + ) + encoder_embed_tokens = cls.build_embedding( + cfg, src_dict, cfg.encoder.embed_dim, cfg.encoder.embed_path + ) + decoder_embed_tokens = encoder_embed_tokens + cfg.share_decoder_input_output_embed = True + elif cfg.merge_src_tgt_embed: + logger.info(f"source dict size: {len(src_dict)}") + logger.info(f"target dict size: {len(tgt_dict)}") + src_dict.update(tgt_dict) + task.src_dict = src_dict + task.tgt_dict = src_dict + logger.info(f"merged dict size: {len(src_dict)}") + encoder_embed_tokens = cls.build_embedding( + cfg, src_dict, cfg.encoder.embed_dim + ) + decoder_embed_tokens = encoder_embed_tokens + cfg.share_decoder_input_output_embed = True + else: + encoder_embed_tokens = cls.build_embedding( + cfg, src_dict, cfg.encoder.embed_dim, cfg.encoder.embed_path + ) + decoder_embed_tokens = cls.build_embedding( + cfg, tgt_dict, cfg.decoder.embed_dim, cfg.decoder.embed_path + ) + if cfg.offload_activations: + cfg.checkpoint_activations = True # offloading implies checkpointing + encoder = cls.build_encoder(cfg, src_dict, encoder_embed_tokens) + decoder = cls.build_decoder(cfg, tgt_dict, decoder_embed_tokens) + return cls(cfg, encoder, decoder) + + @classmethod + def build_embedding(cls, cfg, dictionary, embed_dim, path=None): + num_embeddings = len(dictionary) + padding_idx = dictionary.pad() + + emb = Embedding(num_embeddings, embed_dim, padding_idx) + # if provided, load from preloaded dictionaries + if path: + embed_dict = utils.parse_embedding(path) + utils.load_embedding(embed_dict, dictionary, emb) + return emb + + @classmethod + def build_encoder(cls, cfg, src_dict, embed_tokens): + return TransformerEncoderBase(cfg, src_dict, embed_tokens) + + @classmethod + def build_decoder(cls, cfg, tgt_dict, embed_tokens): + return TransformerDecoderBase( + cfg, + tgt_dict, + embed_tokens, + no_encoder_attn=cfg.no_cross_attention, + ) + + # TorchScript doesn't support optional arguments with variable length (**kwargs). + # Current workaround is to add union of all arguments in child classes. + def forward( + self, + src_tokens, + src_lengths, + prev_output_tokens, + return_all_hiddens: bool = True, + features_only: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + ): + """ + Run the forward pass for an encoder-decoder model. + + Copied from the base class, but without ``**kwargs``, + which are not supported by TorchScript. + """ + encoder_out = self.encoder( + src_tokens, src_lengths=src_lengths, return_all_hiddens=return_all_hiddens + ) + decoder_out = self.decoder( + prev_output_tokens, + encoder_out=encoder_out, + features_only=features_only, + alignment_layer=alignment_layer, + alignment_heads=alignment_heads, + src_lengths=src_lengths, + return_all_hiddens=return_all_hiddens, + ) + return decoder_out + + # Since get_normalized_probs is in the Fairseq Model which is not scriptable, + # I rewrite the get_normalized_probs from Base Class to call the + # helper function in the Base Class. + @torch.jit.export + def get_normalized_probs( + self, + net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], + log_probs: bool, + sample: Optional[Dict[str, Tensor]] = None, + ): + """Get normalized probabilities (or log probs) from a net's output.""" + return self.get_normalized_probs_scriptable(net_output, log_probs, sample) + + +def Embedding(num_embeddings, embedding_dim, padding_idx): + m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) + nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5) + nn.init.constant_(m.weight[padding_idx], 0) + return m diff --git a/fairseq/fairseq/models/transformer/transformer_config.py b/fairseq/fairseq/models/transformer/transformer_config.py new file mode 100644 index 0000000000000000000000000000000000000000..4650de2e1710a7bb790431dea855a82ac9b49def --- /dev/null +++ b/fairseq/fairseq/models/transformer/transformer_config.py @@ -0,0 +1,341 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +import re +from dataclasses import dataclass, field, fields +from typing import List, Optional + +from omegaconf import II + +from fairseq import utils +from fairseq.dataclass import ChoiceEnum, FairseqDataclass +from fairseq.utils import safe_getattr, safe_hasattr + +DEFAULT_MAX_SOURCE_POSITIONS = 1024 +DEFAULT_MAX_TARGET_POSITIONS = 1024 + +DEFAULT_MIN_PARAMS_TO_WRAP = int(1e8) + +_NAME_PARSER = r"(decoder|encoder|quant_noise)_(.*)" + + +@dataclass +class EncDecBaseConfig(FairseqDataclass): + embed_path: Optional[str] = field( + default=None, metadata={"help": "path to pre-trained embedding"} + ) + embed_dim: Optional[int] = field( + default=512, metadata={"help": "embedding dimension"} + ) + ffn_embed_dim: int = field( + default=2048, metadata={"help": "embedding dimension for FFN"} + ) + layers: int = field(default=6, metadata={"help": "number of layers"}) + attention_heads: int = field( + default=8, metadata={"help": "number of attention heads"} + ) + normalize_before: bool = field( + default=False, metadata={"help": "apply layernorm before each block"} + ) + learned_pos: bool = field( + default=False, metadata={"help": "use learned positional embeddings"} + ) + # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) + layerdrop: float = field(default=0, metadata={"help": "LayerDrop probability"}) + layers_to_keep: Optional[List[int]] = field( + default=None, metadata={"help": "which layers to *keep* when pruning"} + ) + + xformers_att_config: Optional[str] = field( + default=None, + metadata={ + "help": "config for xFormers attention, defined in xformers.components.attention.AttentionConfig" + }, + ) + + +@dataclass +class DecoderConfig(EncDecBaseConfig): + input_dim: int = II("model.decoder.embed_dim") + output_dim: int = field( + default=II("model.decoder.embed_dim"), + metadata={ + "help": "decoder output dimension (extra linear layer if different from decoder embed dim)" + }, + ) + + def __post_init__(self): + # II doesn't work if we are just creating the object outside of hydra so fix that + if self.input_dim == II("model.decoder.embed_dim"): + self.input_dim = self.embed_dim + if self.output_dim == II("model.decoder.embed_dim"): + self.output_dim = self.embed_dim + + +@dataclass +class QuantNoiseConfig(FairseqDataclass): + pq: float = field( + default=0.0, + metadata={"help": "iterative PQ quantization noise at training time"}, + ) + pq_block_size: int = field( + default=8, + metadata={"help": "block size of quantization noise at training time"}, + ) + scalar: float = field( + default=0.0, + metadata={ + "help": "scalar quantization noise and scalar quantization at training time" + }, + ) + + +@dataclass +class TransformerConfig(FairseqDataclass): + activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field( + default="relu", + metadata={"help": "activation function to use"}, + ) + dropout: float = field(default=0.1, metadata={"help": "dropout probability"}) + attention_dropout: float = field( + default=0.0, metadata={"help": "dropout probability for attention weights"} + ) + activation_dropout: float = field( + default=0.0, + metadata={ + "help": "dropout probability after activation in FFN.", + "alias": "--relu-dropout", + }, + ) + adaptive_input: bool = False + encoder: EncDecBaseConfig = EncDecBaseConfig() + # TODO should really be in the encoder config + max_source_positions: int = field( + default=DEFAULT_MAX_SOURCE_POSITIONS, + metadata={"help": "Maximum input length supported by the encoder"}, + ) + decoder: DecoderConfig = DecoderConfig() + # TODO should really be in the decoder config + max_target_positions: int = field( + default=DEFAULT_MAX_TARGET_POSITIONS, + metadata={"help": "Maximum output length supported by the decoder"}, + ) + share_decoder_input_output_embed: bool = field( + default=False, metadata={"help": "share decoder input and output embeddings"} + ) + share_all_embeddings: bool = field( + default=False, + metadata={ + "help": "share encoder, decoder and output embeddings (requires shared dictionary and embed dim)" + }, + ) + merge_src_tgt_embed: bool = field( + default=False, + metadata={ + "help": "if true then the source and target embedding table is " + "merged into one table. This is going to make the model smaller but " + "it might hurt performance." + }, + ) + no_token_positional_embeddings: bool = field( + default=False, + metadata={ + "help": "if True, disables positional embeddings (outside self attention)" + }, + ) + adaptive_softmax_cutoff: Optional[List[int]] = field( + default=None, + metadata={ + "help": "list of adaptive softmax cutoff points. Must be used with adaptive_loss criterion" + }, + ) + adaptive_softmax_dropout: float = field( + default=0.0, + metadata={"help": "sets adaptive softmax dropout for the tail projections"}, + ) + adaptive_softmax_factor: float = field( + default=4, metadata={"help": "adaptive input factor"} + ) + layernorm_embedding: bool = field( + default=False, metadata={"help": "add layernorm to embedding"} + ) + tie_adaptive_weights: bool = field( + default=False, + metadata={ + "help": "if set, ties the weights of adaptive softmax and adaptive input" + }, + ) + tie_adaptive_proj: bool = field( + default=False, + metadata={ + "help": "if set, ties the projection weights of adaptive softmax and adaptive input" + }, + ) + no_scale_embedding: bool = field( + default=False, metadata={"help": "if True, dont scale embeddings"} + ) + checkpoint_activations: bool = field( + default=False, + metadata={ + "help": "checkpoint activations at each layer, which saves GPU memory usage at the cost of some additional compute" + }, + ) + offload_activations: bool = field( + default=False, + metadata={ + "help": "checkpoint activations at each layer, then save to gpu. Sets --checkpoint-activations." + }, + ) + # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019) + no_cross_attention: bool = field( + default=False, metadata={"help": "do not perform cross-attention"} + ) + cross_self_attention: bool = field( + default=False, metadata={"help": "perform cross+self-attention"} + ) + # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020) + quant_noise: QuantNoiseConfig = field(default=QuantNoiseConfig()) + min_params_to_wrap: int = field( + default=DEFAULT_MIN_PARAMS_TO_WRAP, + metadata={ + "help": "minimum number of params for a layer to be wrapped with FSDP() when " + "training with --ddp-backend=fully_sharded. Smaller values will " + "improve memory efficiency, but may make torch.distributed " + "communication less efficient due to smaller input sizes. This option " + "is set to 0 (i.e., always wrap) when --checkpoint-activations or " + "--offload-activations are passed." + }, + ) + # DEPRECATED field, but some old checkpoints might have it + char_inputs: bool = field( + default=False, metadata={"help": "if set, model takes character ids as input"} + ) + relu_dropout: float = 0.0 + # config for "BASE Layers: Simplifying Training of Large, Sparse Models" + base_layers: Optional[int] = field( + default=0, metadata={"help": "number of BASE layers in total"} + ) + base_sublayers: Optional[int] = field( + default=1, metadata={"help": "number of sublayers in each BASE layer"} + ) + base_shuffle: Optional[int] = field( + default=1, + metadata={"help": "shuffle tokens between workers before computing assignment"}, + ) + + export: bool = field( + default=False, + metadata={"help": "make the layernorm exportable with torchscript."}, + ) + + # copied from transformer_lm but expected in transformer_decoder: + no_decoder_final_norm: bool = field( + default=False, + metadata={"help": "don't add an extra layernorm after the last decoder block"}, + ) + + # We need to make this hierarchical dataclass like the flat namespace + # __getattr__ and __setattr__ here allow backward compatibility + # for subclasses of Transformer(Legacy) that depend on read/write on + # the flat namespace. + + def __getattr__(self, name): + match = re.match(_NAME_PARSER, name) + if match: + sub = safe_getattr(self, match[1]) + return safe_getattr(sub, match[2]) + raise AttributeError(f"invalid argument {name}.") + + def __setattr__(self, name, value): + match = re.match(_NAME_PARSER, name) + if match: + sub = safe_getattr(self, match[1]) + setattr(sub, match[2], value) + else: + super().__setattr__(name, value) + + @staticmethod + def _copy_keys(args, cls, prefix, seen): + """ + copy the prefixed keys (decoder_embed_dim) to the DC fields: decoder.embed_dim + """ + cfg = cls() + for fld in fields(cls): + # for all the fields in the DC, find the fields (e.g. embed_dim) + # in the namespace with the prefix (e.g. decoder) + # and set it on the dc. + args_key = f"{prefix}_{fld.name}" + if safe_hasattr(args, args_key): + seen.add(args_key) + setattr(cfg, fld.name, safe_getattr(args, args_key)) + if safe_hasattr(args, fld.name): + seen.add(fld.name) + setattr(cfg, fld.name, safe_getattr(args, fld.name)) + return cfg + + @classmethod + def from_namespace(cls, args): + if args is None: + return None + if not isinstance(args, cls): + seen = set() + config = cls() + # currently, we can go generically from DC fields to args hierarchically + # but we can't easily deconstruct a flat namespace to a hierarchical + # DC. Mostly because we could have a sub-dc called `decoder-foo` that should not + # go to the sub struct called `decoder`. There are ways to go around this, but let's keep it simple + # for now. + for fld in fields(cls): + # concretelly, the transformer_config know what sub-dc it has, so we go through all the dc fields + # and if it's one that has a sub-dc, we build that sub-dc with `copy_keys()` + if fld.name == "decoder": + if safe_hasattr(args, "decoder"): + # in some cases, the args we receive is already structured (as DictConfigs), so let's just build the correct DC + seen.add("decoder") + config.decoder = DecoderConfig(**args.decoder) + else: + config.decoder = cls._copy_keys( + args, DecoderConfig, "decoder", seen + ) + elif fld.name == "encoder": + # same but for encoder + if safe_hasattr(args, "encoder"): + seen.add("encoder") + config.encoder = EncDecBaseConfig(**args.encoder) + else: + config.encoder = cls._copy_keys( + args, EncDecBaseConfig, "encoder", seen + ) + elif fld.name == "quant_noise": + # same but for quant_noise + if safe_hasattr(args, "quant_noise"): + seen.add("quant_noise") + config.quant_noise = QuantNoiseConfig(**args.quant_noise) + else: + config.quant_noise = cls._copy_keys( + args, QuantNoiseConfig, "quant_noise", seen + ) + elif safe_hasattr(args, fld.name): + # if it's not a structure field, it's just a normal field, copy it over + seen.add(fld.name) + setattr(config, fld.name, safe_getattr(args, fld.name)) + # we got all the fields defined in the dataclass, but + # the argparse namespace might have extra args for two reasons: + # - we are in a legacy class so all the args are not declared in the dataclass. Ideally once everyone has defined a dataclass for their model, we won't need this + # - some places expect args to be there but never define them + args_dict = ( + args._asdict() + if safe_hasattr(args, "_asdict") + else vars(args) + if safe_hasattr(args, "__dict__") + else {} + ) # namedtupled doesn't have __dict__ :-/ + for key, value in args_dict.items(): + if key not in seen: + setattr(config, key, value) + return config + else: + return args diff --git a/fairseq/fairseq/models/transformer/transformer_decoder.py b/fairseq/fairseq/models/transformer/transformer_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..744c73f4f8cda6f0e9d6899d146ca1343d514b45 --- /dev/null +++ b/fairseq/fairseq/models/transformer/transformer_decoder.py @@ -0,0 +1,474 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +from typing import Any, Dict, List, Optional + +import torch +import torch.nn as nn +from torch import Tensor + +from fairseq import utils +from fairseq.distributed import fsdp_wrap +from fairseq.models import FairseqIncrementalDecoder +from fairseq.models.transformer import TransformerConfig +from fairseq.modules import ( + AdaptiveSoftmax, + BaseLayer, + FairseqDropout, + LayerDropModuleList, + LayerNorm, + PositionalEmbedding, + SinusoidalPositionalEmbedding, + transformer_layer, +) +from fairseq.modules.checkpoint_activations import checkpoint_wrapper +from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_ + + +# rewrite name for backward compatibility in `make_generation_fast_` +def module_name_fordropout(module_name: str) -> str: + if module_name == "TransformerDecoderBase": + return "TransformerDecoder" + else: + return module_name + + +class TransformerDecoderBase(FairseqIncrementalDecoder): + """ + Transformer decoder consisting of *cfg.decoder.layers* layers. Each layer + is a :class:`TransformerDecoderLayer`. + + Args: + cfg (argparse.Namespace): parsed command-line arguments + dictionary (~fairseq.data.Dictionary): decoding dictionary + embed_tokens (torch.nn.Embedding): output embedding + no_encoder_attn (bool, optional): whether to attend to encoder outputs + (default: False). + """ + + def __init__( + self, + cfg, + dictionary, + embed_tokens, + no_encoder_attn=False, + output_projection=None, + ): + self.cfg = cfg + super().__init__(dictionary) + self.register_buffer("version", torch.Tensor([3])) + self._future_mask = torch.empty(0) + + self.dropout_module = FairseqDropout( + cfg.dropout, module_name=module_name_fordropout(self.__class__.__name__) + ) + self.decoder_layerdrop = cfg.decoder.layerdrop + self.share_input_output_embed = cfg.share_decoder_input_output_embed + + input_embed_dim = embed_tokens.embedding_dim + embed_dim = cfg.decoder.embed_dim + self.embed_dim = embed_dim + self.output_embed_dim = cfg.decoder.output_dim + + self.padding_idx = embed_tokens.padding_idx + self.max_target_positions = cfg.max_target_positions + + self.embed_tokens = embed_tokens + + self.embed_scale = 1.0 if cfg.no_scale_embedding else math.sqrt(embed_dim) + + if not cfg.adaptive_input and cfg.quant_noise.pq > 0: + self.quant_noise = apply_quant_noise_( + nn.Linear(embed_dim, embed_dim, bias=False), + cfg.quant_noise.pq, + cfg.quant_noise.pq_block_size, + ) + else: + self.quant_noise = None + + self.project_in_dim = ( + Linear(input_embed_dim, embed_dim, bias=False) + if embed_dim != input_embed_dim + else None + ) + self.embed_positions = ( + PositionalEmbedding( + self.max_target_positions, + embed_dim, + self.padding_idx, + learned=cfg.decoder.learned_pos, + ) + if not cfg.no_token_positional_embeddings + else None + ) + if cfg.layernorm_embedding: + self.layernorm_embedding = LayerNorm(embed_dim, export=cfg.export) + else: + self.layernorm_embedding = None + + self.cross_self_attention = cfg.cross_self_attention + + if self.decoder_layerdrop > 0.0: + self.layers = LayerDropModuleList(p=self.decoder_layerdrop) + else: + self.layers = nn.ModuleList([]) + self.layers.extend( + [ + self.build_decoder_layer(cfg, no_encoder_attn) + for _ in range(cfg.decoder.layers) + ] + ) + self.num_layers = len(self.layers) + + if cfg.decoder.normalize_before and not cfg.no_decoder_final_norm: + self.layer_norm = LayerNorm(embed_dim, export=cfg.export) + else: + self.layer_norm = None + + self.project_out_dim = ( + Linear(embed_dim, self.output_embed_dim, bias=False) + if embed_dim != self.output_embed_dim and not cfg.tie_adaptive_weights + else None + ) + + self.adaptive_softmax = None + self.output_projection = output_projection + if self.output_projection is None: + self.build_output_projection(cfg, dictionary, embed_tokens) + + def build_output_projection(self, cfg, dictionary, embed_tokens): + if cfg.adaptive_softmax_cutoff is not None: + self.adaptive_softmax = AdaptiveSoftmax( + len(dictionary), + self.output_embed_dim, + utils.eval_str_list(cfg.adaptive_softmax_cutoff, type=int), + dropout=cfg.adaptive_softmax_dropout, + adaptive_inputs=embed_tokens if cfg.tie_adaptive_weights else None, + factor=cfg.adaptive_softmax_factor, + tie_proj=cfg.tie_adaptive_proj, + ) + elif self.share_input_output_embed: + self.output_projection = nn.Linear( + self.embed_tokens.weight.shape[1], + self.embed_tokens.weight.shape[0], + bias=False, + ) + self.output_projection.weight = self.embed_tokens.weight + else: + self.output_projection = nn.Linear( + self.output_embed_dim, len(dictionary), bias=False + ) + nn.init.normal_( + self.output_projection.weight, mean=0, std=self.output_embed_dim**-0.5 + ) + num_base_layers = cfg.base_layers + for i in range(num_base_layers): + self.layers.insert( + ((i + 1) * cfg.decoder.layers) // (num_base_layers + 1), + BaseLayer(cfg), + ) + + def build_decoder_layer(self, cfg, no_encoder_attn=False): + layer = transformer_layer.TransformerDecoderLayerBase(cfg, no_encoder_attn) + checkpoint = cfg.checkpoint_activations + if checkpoint: + offload_to_cpu = cfg.offload_activations + layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu) + # if we are checkpointing, enforce that FSDP always wraps the + # checkpointed layer, regardless of layer size + min_params_to_wrap = cfg.min_params_to_wrap if not checkpoint else 0 + layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap) + return layer + + def forward( + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + features_only: bool = False, + full_context_alignment: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + src_lengths: Optional[Any] = None, + return_all_hiddens: bool = False, + ): + """ + Args: + prev_output_tokens (LongTensor): previous decoder outputs of shape + `(batch, tgt_len)`, for teacher forcing + encoder_out (optional): output from the encoder, used for + encoder-side attention, should be of size T x B x C + incremental_state (dict): dictionary used for storing state during + :ref:`Incremental decoding` + features_only (bool, optional): only return features without + applying output layer (default: False). + full_context_alignment (bool, optional): don't apply + auto-regressive mask to self-attention (default: False). + + Returns: + tuple: + - the decoder's output of shape `(batch, tgt_len, vocab)` + - a dictionary with any model-specific outputs + """ + + x, extra = self.extract_features( + prev_output_tokens, + encoder_out=encoder_out, + incremental_state=incremental_state, + full_context_alignment=full_context_alignment, + alignment_layer=alignment_layer, + alignment_heads=alignment_heads, + ) + + if not features_only: + x = self.output_layer(x) + return x, extra + + def extract_features( + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]], + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + full_context_alignment: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + ): + return self.extract_features_scriptable( + prev_output_tokens, + encoder_out, + incremental_state, + full_context_alignment, + alignment_layer, + alignment_heads, + ) + + """ + A scriptable subclass of this class has an extract_features method and calls + super().extract_features, but super() is not supported in torchscript. A copy of + this function is made to be used in the subclass instead. + """ + + def extract_features_scriptable( + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]], + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + full_context_alignment: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + ): + """ + Similar to *forward* but only return features. + + Includes several features from "Jointly Learning to Align and + Translate with Transformer Models" (Garg et al., EMNLP 2019). + + Args: + full_context_alignment (bool, optional): don't apply + auto-regressive mask to self-attention (default: False). + alignment_layer (int, optional): return mean alignment over + heads at this layer (default: last layer). + alignment_heads (int, optional): only average alignment over + this many heads (default: all heads). + + Returns: + tuple: + - the decoder's features of shape `(batch, tgt_len, embed_dim)` + - a dictionary with any model-specific outputs + """ + bs, slen = prev_output_tokens.size() + if alignment_layer is None: + alignment_layer = self.num_layers - 1 + + enc: Optional[Tensor] = None + padding_mask: Optional[Tensor] = None + if encoder_out is not None and len(encoder_out["encoder_out"]) > 0: + enc = encoder_out["encoder_out"][0] + if encoder_out is not None and len(encoder_out["encoder_padding_mask"]) > 0: + padding_mask = encoder_out["encoder_padding_mask"][0] + + # embed positions + positions = None + if self.embed_positions is not None: + positions = self.embed_positions( + prev_output_tokens, incremental_state=incremental_state + ) + + if incremental_state is not None: + prev_output_tokens = prev_output_tokens[:, -1:] + if positions is not None: + positions = positions[:, -1:] + + # Prevent torchscript exporting issue for dynamic quant embedding + prev_output_tokens = prev_output_tokens.contiguous() + # embed tokens and positions + x = self.embed_scale * self.embed_tokens(prev_output_tokens) + + if self.quant_noise is not None: + x = self.quant_noise(x) + + if self.project_in_dim is not None: + x = self.project_in_dim(x) + + if positions is not None: + x += positions + + if self.layernorm_embedding is not None: + x = self.layernorm_embedding(x) + + x = self.dropout_module(x) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + self_attn_padding_mask: Optional[Tensor] = None + if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any(): + self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) + + # decoder layers + attn: Optional[Tensor] = None + inner_states: List[Optional[Tensor]] = [x] + for idx, layer in enumerate(self.layers): + if incremental_state is None and not full_context_alignment: + self_attn_mask = self.buffered_future_mask(x) + else: + self_attn_mask = None + + x, layer_attn, _ = layer( + x, + enc, + padding_mask, + incremental_state, + self_attn_mask=self_attn_mask, + self_attn_padding_mask=self_attn_padding_mask, + need_attn=bool((idx == alignment_layer)), + need_head_weights=bool((idx == alignment_layer)), + ) + inner_states.append(x) + if layer_attn is not None and idx == alignment_layer: + attn = layer_attn.float().to(x) + + if attn is not None: + if alignment_heads is not None: + attn = attn[:alignment_heads] + + # average probabilities over heads + attn = attn.mean(dim=0) + + if self.layer_norm is not None: + x = self.layer_norm(x) + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + if self.project_out_dim is not None: + x = self.project_out_dim(x) + + return x, {"attn": [attn], "inner_states": inner_states} + + def output_layer(self, features): + """Project features to the vocabulary size.""" + if self.adaptive_softmax is None: + # project back to size of vocabulary + return self.output_projection(features) + else: + return features + + def max_positions(self): + """Maximum output length supported by the decoder.""" + if self.embed_positions is None: + return self.max_target_positions + return min(self.max_target_positions, self.embed_positions.max_positions) + + def buffered_future_mask(self, tensor): + dim = tensor.size(0) + # self._future_mask.device != tensor.device is not working in TorchScript. This is a workaround. + if ( + self._future_mask.size(0) == 0 + or (not self._future_mask.device == tensor.device) + or self._future_mask.size(0) < dim + ): + self._future_mask = torch.triu( + utils.fill_with_neg_inf(torch.zeros([dim, dim])), 1 + ) + self._future_mask = self._future_mask.to(tensor) + return self._future_mask[:dim, :dim] + + def upgrade_state_dict_named(self, state_dict, name): + """Upgrade a (possibly old) state dict for new versions of fairseq.""" + if f"{name}.output_projection.weight" not in state_dict: + if self.share_input_output_embed: + embed_out_key = f"{name}.embed_tokens.weight" + else: + embed_out_key = f"{name}.embed_out" + if embed_out_key in state_dict: + state_dict[f"{name}.output_projection.weight"] = state_dict[ + embed_out_key + ] + if not self.share_input_output_embed: + del state_dict[embed_out_key] + + for i in range(self.num_layers): + # update layer norms + layer_norm_map = { + "0": "self_attn_layer_norm", + "1": "encoder_attn_layer_norm", + "2": "final_layer_norm", + } + for old, new in layer_norm_map.items(): + for m in ("weight", "bias"): + k = "{}.layers.{}.layer_norms.{}.{}".format(name, i, old, m) + if k in state_dict: + state_dict[ + "{}.layers.{}.{}.{}".format(name, i, new, m) + ] = state_dict[k] + del state_dict[k] + + version_key = "{}.version".format(name) + if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) <= 2: + # earlier checkpoints did not normalize after the stack of layers + self.layer_norm = None + self.normalize = False + state_dict[version_key] = torch.Tensor([1]) + + return state_dict + + +def Linear(in_features, out_features, bias=True): + m = nn.Linear(in_features, out_features, bias) + nn.init.xavier_uniform_(m.weight) + if bias: + nn.init.constant_(m.bias, 0.0) + return m + + +class TransformerDecoder(TransformerDecoderBase): + def __init__( + self, + args, + dictionary, + embed_tokens, + no_encoder_attn=False, + output_projection=None, + ): + self.args = args + super().__init__( + TransformerConfig.from_namespace(args), + dictionary, + embed_tokens, + no_encoder_attn=no_encoder_attn, + output_projection=output_projection, + ) + + def build_output_projection(self, args, dictionary, embed_tokens): + super().build_output_projection( + TransformerConfig.from_namespace(args), dictionary, embed_tokens + ) + + def build_decoder_layer(self, args, no_encoder_attn=False): + return super().build_decoder_layer( + TransformerConfig.from_namespace(args), no_encoder_attn=no_encoder_attn + ) diff --git a/fairseq/fairseq/models/transformer/transformer_decoder_aug.py b/fairseq/fairseq/models/transformer/transformer_decoder_aug.py new file mode 100644 index 0000000000000000000000000000000000000000..b73c06e02a0a9749120e50ce6e9f65c35a0462dc --- /dev/null +++ b/fairseq/fairseq/models/transformer/transformer_decoder_aug.py @@ -0,0 +1,384 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Any, Dict, List, Optional + +import torch +import torch.nn as nn +from torch import Tensor + +from fairseq import utils +from fairseq.distributed import fsdp_wrap +from fairseq.models.transformer import TransformerConfig +from fairseq.models.transformer.transformer_decoder import TransformerDecoderBase +from fairseq.modules import ( + LayerDropModuleList, + SinusoidalPositionalEmbedding, + transformer_layer_aug, +) +from fairseq.modules.checkpoint_activations import checkpoint_wrapper + + +class AugTransformerDecoderBase(TransformerDecoderBase): + """ + Transformer decoder augmented with an additional cross-attention. Each layer + is a :class:`AugTransformerDecoderLayerBase`. + + Args: + cfg (argparse.Namespace): parsed command-line arguments + dictionary (~fairseq.data.Dictionary): decoding dictionary + embed_tokens (torch.nn.Embedding): output embedding + encoder_attn_merge_type (str, optional): the way to combine outputs from + two cross-attention modules. If "sequential" is set, two cross-attention + modules are stacked sequentially. If "parallel" is set, they are processed + in parallel and combined before feeding it to FFN (default: sequential). + dropnet_ratio (float, optional): a probability to drop each cross-attention + module during training (default: 0.0). + """ + + def __init__( + self, + cfg, + dictionary, + embed_tokens, + output_projection=None, + encoder_attn_merge_type="sequential", + dropnet_ratio=0.0, + ): + super().__init__( + cfg, + dictionary, + embed_tokens, + no_encoder_attn=False, + output_projection=output_projection, + ) + # assert cfg.cross_self_attention + self.cross_self_attention = cfg.cross_self_attention + + if self.decoder_layerdrop > 0.0: + self.layers = LayerDropModuleList(p=self.decoder_layerdrop) + else: + self.layers = nn.ModuleList([]) + self.layers.extend( + [ + self.build_decoder_layer(cfg, encoder_attn_merge_type, dropnet_ratio) + for _ in range(cfg.decoder.layers) + ] + ) + + def build_decoder_layer( + self, + cfg, + encoder_attn_merge_type="sequential", + dropnet_ratio=0, + ): + layer = transformer_layer_aug.AugTransformerDecoderLayerBase( + cfg, + no_encoder_attn=False, + encoder_attn_merge_type=encoder_attn_merge_type, + dropnet_ratio=dropnet_ratio, + ) + checkpoint = cfg.checkpoint_activations + if checkpoint: + offload_to_cpu = cfg.offload_activations + layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu) + # if we are checkpointing, enforce that FSDP always wraps the + # checkpointed layer, regardless of layer size + min_params_to_wrap = cfg.min_params_to_wrap if not checkpoint else 0 + layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap) + return layer + + def forward( + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]] = None, + encoder_out_aug: Optional[Dict[str, List[Tensor]]] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + features_only: bool = False, + full_context_alignment: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + src_lengths: Optional[Any] = None, + return_all_hiddens: bool = False, + ): + """ + Args: + prev_output_tokens (LongTensor): previous decoder outputs of shape + `(batch, tgt_len)`, for teacher forcing + encoder_out (optional): output from the encoder, used for + encoder-side attention, should be of size T x B x C + incremental_state (dict): dictionary used for storing state during + :ref:`Incremental decoding` + features_only (bool, optional): only return features without + applying output layer (default: False). + full_context_alignment (bool, optional): don't apply + auto-regressive mask to self-attention (default: False). + + Returns: + tuple: + - the decoder's output of shape `(batch, tgt_len, vocab)` + - a dictionary with any model-specific outputs + """ + + x, extra = self.extract_features( + prev_output_tokens, + encoder_out=encoder_out, + encoder_out_aug=encoder_out_aug, + incremental_state=incremental_state, + full_context_alignment=full_context_alignment, + alignment_layer=alignment_layer, + alignment_heads=alignment_heads, + ) + + if not features_only: + x = self.output_layer(x) + return x, extra + + def extract_features( + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]], + encoder_out_aug: Optional[Dict[str, List[Tensor]]], + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + full_context_alignment: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + ): + return self.extract_features_scriptable( + prev_output_tokens, + encoder_out, + encoder_out_aug, + incremental_state, + full_context_alignment, + alignment_layer, + alignment_heads, + ) + + """ + A scriptable subclass of this class has an extract_features method and calls + super().extract_features, but super() is not supported in torchscript. A copy of + this function is made to be used in the subclass instead. + """ + + def extract_features_scriptable( + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]], + encoder_out_aug: Optional[Dict[str, List[Tensor]]], + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + full_context_alignment: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + ): + """ + Similar to *forward* but only return features. + + Includes several features from "Jointly Learning to Align and + Translate with Transformer Models" (Garg et al., EMNLP 2019). + + Args: + full_context_alignment (bool, optional): don't apply + auto-regressive mask to self-attention (default: False). + alignment_layer (int, optional): return mean alignment over + heads at this layer (default: last layer). + alignment_heads (int, optional): only average alignment over + this many heads (default: all heads). + + Returns: + tuple: + - the decoder's features of shape `(batch, tgt_len, embed_dim)` + - a dictionary with any model-specific outputs + """ + bs, slen = prev_output_tokens.size() + if alignment_layer is None: + alignment_layer = self.num_layers - 1 + + enc: Optional[Tensor] = None + padding_mask: Optional[Tensor] = None + if encoder_out is not None and len(encoder_out["encoder_out"]) > 0: + enc = encoder_out["encoder_out"][0] + if encoder_out is not None and len(encoder_out["encoder_padding_mask"]) > 0: + padding_mask = encoder_out["encoder_padding_mask"][0] + + enc_aug: Optional[Tensor] = None + padding_mask_aug: Optional[Tensor] = None + if encoder_out_aug is not None and len(encoder_out_aug["encoder_out"]) > 0: + enc_aug = encoder_out_aug["encoder_out"][0] + if ( + encoder_out_aug is not None + and len(encoder_out_aug["encoder_padding_mask"]) > 0 + ): + padding_mask_aug = encoder_out_aug["encoder_padding_mask"][0] + + # embed positions + positions = None + if self.embed_positions is not None: + positions = self.embed_positions( + prev_output_tokens, incremental_state=incremental_state + ) + + if incremental_state is not None: + prev_output_tokens = prev_output_tokens[:, -1:] + if positions is not None: + positions = positions[:, -1:] + + # Prevent torchscript exporting issue for dynamic quant embedding + prev_output_tokens = prev_output_tokens.contiguous() + # embed tokens and positions + x = self.embed_scale * self.embed_tokens(prev_output_tokens) + + if self.quant_noise is not None: + x = self.quant_noise(x) + + if self.project_in_dim is not None: + x = self.project_in_dim(x) + + if positions is not None: + x += positions + + if self.layernorm_embedding is not None: + x = self.layernorm_embedding(x) + + x = self.dropout_module(x) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + self_attn_padding_mask: Optional[Tensor] = None + if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any(): + self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) + + # decoder layers + attn: Optional[Tensor] = None + attn_aug: Optional[Tensor] = None + inner_states: List[Optional[Tensor]] = [x] + for idx, layer in enumerate(self.layers): + if incremental_state is None and not full_context_alignment: + self_attn_mask = self.buffered_future_mask(x) + else: + self_attn_mask = None + + x, layer_attn, layer_attn_aug, _ = layer( + x, + enc, + padding_mask, + enc_aug, + padding_mask_aug, + incremental_state, + self_attn_mask=self_attn_mask, + self_attn_padding_mask=self_attn_padding_mask, + need_attn=bool((idx == alignment_layer)), + need_head_weights=bool((idx == alignment_layer)), + ) + inner_states.append(x) + if layer_attn is not None and idx == alignment_layer: + attn = layer_attn.float().to(x) + if layer_attn_aug is not None and idx == alignment_layer: + attn_aug = layer_attn_aug.float().to(x) + + if attn is not None: + if alignment_heads is not None: + attn = attn[:alignment_heads] + + # average probabilities over heads + attn = attn.mean(dim=0) + + if attn_aug is not None: + if alignment_heads is not None: + attn_aug = attn_aug[:alignment_heads] + + # average probabilities over heads + attn_aug = attn_aug.mean(dim=0) + + if self.layer_norm is not None: + x = self.layer_norm(x) + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + if self.project_out_dim is not None: + x = self.project_out_dim(x) + + return x, {"attn": [attn], "attn_aug": [attn_aug], "inner_states": inner_states} + + def upgrade_state_dict_named(self, state_dict, name): + """Upgrade a (possibly old) state dict for new versions of fairseq.""" + if f"{name}.output_projection.weight" not in state_dict: + if self.share_input_output_embed: + embed_out_key = f"{name}.embed_tokens.weight" + else: + embed_out_key = f"{name}.embed_out" + if embed_out_key in state_dict: + state_dict[f"{name}.output_projection.weight"] = state_dict[ + embed_out_key + ] + if not self.share_input_output_embed: + del state_dict[embed_out_key] + + for i in range(self.num_layers): + # update layer norms + layer_norm_map = { + "0": "self_attn_layer_norm", + "1": "encoder_attn_layer_norm", + "2": "encoder_attn_layer_norm2", + "3": "final_layer_norm", + } + for old, new in layer_norm_map.items(): + for m in ("weight", "bias"): + k = "{}.layers.{}.layer_norms.{}.{}".format(name, i, old, m) + if k in state_dict: + state_dict[ + "{}.layers.{}.{}.{}".format(name, i, new, m) + ] = state_dict[k] + del state_dict[k] + + version_key = "{}.version".format(name) + if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) <= 2: + # earlier checkpoints did not normalize after the stack of layers + self.layer_norm = None + self.normalize = False + state_dict[version_key] = torch.Tensor([1]) + + return state_dict + + +class AugTransformerDecoder(AugTransformerDecoderBase): + def __init__( + self, + args, + dictionary, + embed_tokens, + output_projection=None, + ): + self.args = args + super().__init__( + TransformerConfig.from_namespace(args), + dictionary, + embed_tokens, + no_encoder_attn=False, + output_projection=output_projection, + encoder_attn_merge_type=getattr( + args, "synthesizer_augmented_cross_attention_merge_type", "sequential" + ), + dropnet_ratio=getattr(args, "dropnet_ratio", 0), + ) + + def build_output_projection(self, args, dictionary, embed_tokens): + super().build_output_projection( + TransformerConfig.from_namespace(args), dictionary, embed_tokens + ) + + def build_decoder_layer( + self, + args, + encoder_attn_merge_type="sequential", + dropnet_ratio=0, + ): + return super().build_decoder_layer( + TransformerConfig.from_namespace(args), + no_encoder_attn=False, + encoder_attn_merge_type=encoder_attn_merge_type, + dropnet_ratio=dropnet_ratio, + ) diff --git a/fairseq/fairseq/models/transformer/transformer_encoder.py b/fairseq/fairseq/models/transformer/transformer_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..a684fcb448e9ac6a7a6440c825bbe3212789cf88 --- /dev/null +++ b/fairseq/fairseq/models/transformer/transformer_encoder.py @@ -0,0 +1,362 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +from typing import Dict, List, Optional + +import torch +import torch.nn as nn +from torch import Tensor + +from fairseq import utils +from fairseq.distributed import fsdp_wrap +from fairseq.models import FairseqEncoder +from fairseq.models.transformer import TransformerConfig +from fairseq.modules import ( + FairseqDropout, + LayerDropModuleList, + LayerNorm, + PositionalEmbedding, + SinusoidalPositionalEmbedding, + transformer_layer, +) +from fairseq.modules.checkpoint_activations import checkpoint_wrapper +from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_ + + +# rewrite name for backward compatibility in `make_generation_fast_` +def module_name_fordropout(module_name: str) -> str: + if module_name == "TransformerEncoderBase": + return "TransformerEncoder" + else: + return module_name + + +class TransformerEncoderBase(FairseqEncoder): + """ + Transformer encoder consisting of *cfg.encoder.layers* layers. Each layer + is a :class:`TransformerEncoderLayer`. + + Args: + args (argparse.Namespace): parsed command-line arguments + dictionary (~fairseq.data.Dictionary): encoding dictionary + embed_tokens (torch.nn.Embedding): input embedding + """ + + def __init__(self, cfg, dictionary, embed_tokens, return_fc=False): + self.cfg = cfg + super().__init__(dictionary) + self.register_buffer("version", torch.Tensor([3])) + + self.dropout_module = FairseqDropout( + cfg.dropout, module_name=module_name_fordropout(self.__class__.__name__) + ) + self.encoder_layerdrop = cfg.encoder.layerdrop + self.return_fc = return_fc + + embed_dim = embed_tokens.embedding_dim + self.padding_idx = embed_tokens.padding_idx + self.max_source_positions = cfg.max_source_positions + + self.embed_tokens = embed_tokens + + self.embed_scale = 1.0 if cfg.no_scale_embedding else math.sqrt(embed_dim) + + self.embed_positions = ( + PositionalEmbedding( + cfg.max_source_positions, + embed_dim, + self.padding_idx, + learned=cfg.encoder.learned_pos, + ) + if not cfg.no_token_positional_embeddings + else None + ) + if cfg.layernorm_embedding: + self.layernorm_embedding = LayerNorm(embed_dim, export=cfg.export) + else: + self.layernorm_embedding = None + + if not cfg.adaptive_input and cfg.quant_noise.pq > 0: + self.quant_noise = apply_quant_noise_( + nn.Linear(embed_dim, embed_dim, bias=False), + cfg.quant_noise.pq, + cfg.quant_noise.pq_block_size, + ) + else: + self.quant_noise = None + + if self.encoder_layerdrop > 0.0: + self.layers = LayerDropModuleList(p=self.encoder_layerdrop) + else: + self.layers = nn.ModuleList([]) + self.layers.extend( + [self.build_encoder_layer(cfg) for i in range(cfg.encoder.layers)] + ) + self.num_layers = len(self.layers) + + if cfg.encoder.normalize_before: + self.layer_norm = LayerNorm(embed_dim, export=cfg.export) + else: + self.layer_norm = None + + def build_encoder_layer(self, cfg): + layer = transformer_layer.TransformerEncoderLayerBase( + cfg, return_fc=self.return_fc + ) + checkpoint = cfg.checkpoint_activations + if checkpoint: + offload_to_cpu = cfg.offload_activations + layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu) + # if we are checkpointing, enforce that FSDP always wraps the + # checkpointed layer, regardless of layer size + min_params_to_wrap = cfg.min_params_to_wrap if not checkpoint else 0 + layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap) + return layer + + def forward_embedding( + self, src_tokens, token_embedding: Optional[torch.Tensor] = None + ): + # embed tokens and positions + if token_embedding is None: + token_embedding = self.embed_tokens(src_tokens) + x = embed = self.embed_scale * token_embedding + if self.embed_positions is not None: + x = embed + self.embed_positions(src_tokens) + if self.layernorm_embedding is not None: + x = self.layernorm_embedding(x) + x = self.dropout_module(x) + if self.quant_noise is not None: + x = self.quant_noise(x) + return x, embed + + def forward( + self, + src_tokens, + src_lengths: Optional[torch.Tensor] = None, + return_all_hiddens: bool = False, + token_embeddings: Optional[torch.Tensor] = None, + ): + """ + Args: + src_tokens (LongTensor): tokens in the source language of shape + `(batch, src_len)` + src_lengths (torch.LongTensor): lengths of each source sentence of + shape `(batch)` + return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). + token_embeddings (torch.Tensor, optional): precomputed embeddings + default `None` will recompute embeddings + + Returns: + dict: + - **encoder_out** (Tensor): the last encoder layer's output of + shape `(src_len, batch, embed_dim)` + - **encoder_padding_mask** (ByteTensor): the positions of + padding elements of shape `(batch, src_len)` + - **encoder_embedding** (Tensor): the (scaled) embedding lookup + of shape `(batch, src_len, embed_dim)` + - **encoder_states** (List[Tensor]): all intermediate + hidden states of shape `(src_len, batch, embed_dim)`. + Only populated if *return_all_hiddens* is True. + """ + return self.forward_scriptable( + src_tokens, src_lengths, return_all_hiddens, token_embeddings + ) + + # TorchScript doesn't support super() method so that the scriptable Subclass + # can't access the base class model in Torchscript. + # Current workaround is to add a helper function with different name and + # call the helper function from scriptable Subclass. + def forward_scriptable( + self, + src_tokens, + src_lengths: Optional[torch.Tensor] = None, + return_all_hiddens: bool = False, + token_embeddings: Optional[torch.Tensor] = None, + ): + """ + Args: + src_tokens (LongTensor): tokens in the source language of shape + `(batch, src_len)` + src_lengths (torch.LongTensor): lengths of each source sentence of + shape `(batch)` + return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). + token_embeddings (torch.Tensor, optional): precomputed embeddings + default `None` will recompute embeddings + + Returns: + dict: + - **encoder_out** (Tensor): the last encoder layer's output of + shape `(src_len, batch, embed_dim)` + - **encoder_padding_mask** (ByteTensor): the positions of + padding elements of shape `(batch, src_len)` + - **encoder_embedding** (Tensor): the (scaled) embedding lookup + of shape `(batch, src_len, embed_dim)` + - **encoder_states** (List[Tensor]): all intermediate + hidden states of shape `(src_len, batch, embed_dim)`. + Only populated if *return_all_hiddens* is True. + """ + # compute padding mask + encoder_padding_mask = src_tokens.eq(self.padding_idx) + has_pads = ( + torch.tensor(src_tokens.device.type == "xla") or encoder_padding_mask.any() + ) + # Torchscript doesn't handle bool Tensor correctly, so we need to work around. + if torch.jit.is_scripting(): + has_pads = torch.tensor(1) if has_pads else torch.tensor(0) + + x, encoder_embedding = self.forward_embedding(src_tokens, token_embeddings) + + # account for padding while computing the representation + x = x * ( + 1 - encoder_padding_mask.unsqueeze(-1).type_as(x) * has_pads.type_as(x) + ) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + encoder_states = [] + fc_results = [] + + if return_all_hiddens: + encoder_states.append(x) + + # encoder layers + for layer in self.layers: + lr = layer( + x, encoder_padding_mask=encoder_padding_mask if has_pads else None + ) + + if isinstance(lr, tuple) and len(lr) == 2: + x, fc_result = lr + else: + x = lr + fc_result = None + + if return_all_hiddens and not torch.jit.is_scripting(): + assert encoder_states is not None + encoder_states.append(x) + fc_results.append(fc_result) + + if self.layer_norm is not None: + x = self.layer_norm(x) + + # The Pytorch Mobile lite interpreter does not supports returning NamedTuple in + # `forward` so we use a dictionary instead. + # TorchScript does not support mixed values so the values are all lists. + # The empty list is equivalent to None. + src_lengths = ( + src_tokens.ne(self.padding_idx) + .sum(dim=1, dtype=torch.int32) + .reshape(-1, 1) + .contiguous() + ) + return { + "encoder_out": [x], # T x B x C + "encoder_padding_mask": [encoder_padding_mask], # B x T + "encoder_embedding": [encoder_embedding], # B x T x C + "encoder_states": encoder_states, # List[T x B x C] + "fc_results": fc_results, # List[T x B x C] + "src_tokens": [], + "src_lengths": [src_lengths], + } + + @torch.jit.export + def reorder_encoder_out(self, encoder_out: Dict[str, List[Tensor]], new_order): + """ + Reorder encoder output according to *new_order*. + + Args: + encoder_out: output from the ``forward()`` method + new_order (LongTensor): desired order + + Returns: + *encoder_out* rearranged according to *new_order* + """ + if len(encoder_out["encoder_out"]) == 0: + new_encoder_out = [] + else: + new_encoder_out = [encoder_out["encoder_out"][0].index_select(1, new_order)] + if len(encoder_out["encoder_padding_mask"]) == 0: + new_encoder_padding_mask = [] + else: + new_encoder_padding_mask = [ + encoder_out["encoder_padding_mask"][0].index_select(0, new_order) + ] + if len(encoder_out["encoder_embedding"]) == 0: + new_encoder_embedding = [] + else: + new_encoder_embedding = [ + encoder_out["encoder_embedding"][0].index_select(0, new_order) + ] + + if len(encoder_out["src_tokens"]) == 0: + src_tokens = [] + else: + src_tokens = [(encoder_out["src_tokens"][0]).index_select(0, new_order)] + + if len(encoder_out["src_lengths"]) == 0: + src_lengths = [] + else: + src_lengths = [(encoder_out["src_lengths"][0]).index_select(0, new_order)] + + encoder_states = encoder_out["encoder_states"] + if len(encoder_states) > 0: + for idx, state in enumerate(encoder_states): + encoder_states[idx] = state.index_select(1, new_order) + + return { + "encoder_out": new_encoder_out, # T x B x C + "encoder_padding_mask": new_encoder_padding_mask, # B x T + "encoder_embedding": new_encoder_embedding, # B x T x C + "encoder_states": encoder_states, # List[T x B x C] + "src_tokens": src_tokens, # B x T + "src_lengths": src_lengths, # B x 1 + } + + @torch.jit.export + def _reorder_encoder_out(self, encoder_out: Dict[str, List[Tensor]], new_order): + """Dummy re-order function for beamable enc-dec attention""" + return encoder_out + + def max_positions(self): + """Maximum input length supported by the encoder.""" + if self.embed_positions is None: + return self.max_source_positions + return min(self.max_source_positions, self.embed_positions.max_positions) + + def upgrade_state_dict_named(self, state_dict, name): + """Upgrade a (possibly old) state dict for new versions of fairseq.""" + for i in range(self.num_layers): + # update layer norms + self.layers[i].upgrade_state_dict_named( + state_dict, "{}.layers.{}".format(name, i) + ) + + version_key = "{}.version".format(name) + if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) < 2: + # earlier checkpoints did not normalize after the stack of layers + self.layer_norm = None + self.normalize = False + state_dict[version_key] = torch.Tensor([1]) + return state_dict + + +class TransformerEncoder(TransformerEncoderBase): + def __init__(self, args, dictionary, embed_tokens, return_fc=False): + self.args = args + super().__init__( + TransformerConfig.from_namespace(args), + dictionary, + embed_tokens, + return_fc=return_fc, + ) + + def build_encoder_layer(self, args): + return super().build_encoder_layer( + TransformerConfig.from_namespace(args), + ) diff --git a/fairseq/fairseq/models/transformer/transformer_legacy.py b/fairseq/fairseq/models/transformer/transformer_legacy.py new file mode 100644 index 0000000000000000000000000000000000000000..00d14a7dde778a4b3b2faecde50eecdd5115a864 --- /dev/null +++ b/fairseq/fairseq/models/transformer/transformer_legacy.py @@ -0,0 +1,277 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from fairseq.dataclass.utils import gen_parser_from_dataclass +from fairseq.models import ( + register_model, + register_model_architecture, +) +from fairseq.models.transformer.transformer_config import ( + TransformerConfig, + DEFAULT_MAX_SOURCE_POSITIONS, + DEFAULT_MAX_TARGET_POSITIONS, + DEFAULT_MIN_PARAMS_TO_WRAP, +) +from fairseq.models.transformer.transformer_base import ( + TransformerModelBase, +) + + +@register_model("transformer") +class TransformerModel(TransformerModelBase): + """ + This is the legacy implementation of the transformer model that + uses argparse for configuration. + """ + + @classmethod + def hub_models(cls): + # fmt: off + + def moses_subword(path): + return { + 'path': path, + 'tokenizer': 'moses', + 'bpe': 'subword_nmt', + } + + def moses_fastbpe(path): + return { + 'path': path, + 'tokenizer': 'moses', + 'bpe': 'fastbpe', + } + + def spm(path): + return { + 'path': path, + 'bpe': 'sentencepiece', + 'tokenizer': 'space', + } + + return { + 'transformer.wmt14.en-fr': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-fr.joined-dict.transformer.tar.bz2'), + 'transformer.wmt16.en-de': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt16.en-de.joined-dict.transformer.tar.bz2', + 'transformer.wmt18.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/wmt18.en-de.ensemble.tar.gz'), + 'transformer.wmt19.en-de': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz'), + 'transformer.wmt19.en-ru': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz'), + 'transformer.wmt19.de-en': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz'), + 'transformer.wmt19.ru-en': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz'), + 'transformer.wmt19.en-de.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.single_model.tar.gz'), + 'transformer.wmt19.en-ru.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.single_model.tar.gz'), + 'transformer.wmt19.de-en.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.single_model.tar.gz'), + 'transformer.wmt19.ru-en.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.single_model.tar.gz'), + 'transformer.wmt20.en-ta': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.en-ta.single.tar.gz'), + 'transformer.wmt20.en-iu.news': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.en-iu.news.single.tar.gz'), + 'transformer.wmt20.en-iu.nh': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.en-iu.nh.single.tar.gz'), + 'transformer.wmt20.ta-en': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.ta-en.single.tar.gz'), + 'transformer.wmt20.iu-en.news': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.iu-en.news.single.tar.gz'), + 'transformer.wmt20.iu-en.nh': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.iu-en.nh.single.tar.gz'), + 'transformer.flores101.mm100.615M': spm('https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_615M.tar.gz'), + 'transformer.flores101.mm100.175M': spm('https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_175M.tar.gz'), + } + # fmt: on + + def __init__(self, args, encoder, decoder): + cfg = TransformerConfig.from_namespace(args) + super().__init__(cfg, encoder, decoder) + self.args = args + + @classmethod + def add_args(cls, parser): + """Add model-specific arguments to the parser.""" + # we want to build the args recursively in this case. + # do not set defaults so that settings defaults from various architectures still works + gen_parser_from_dataclass( + parser, TransformerConfig(), delete_default=True, with_prefix="" + ) + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + + # make sure all arguments are present in older models + base_architecture(args) + + if args.encoder_layers_to_keep: + args.encoder_layers = len(args.encoder_layers_to_keep.split(",")) + if args.decoder_layers_to_keep: + args.decoder_layers = len(args.decoder_layers_to_keep.split(",")) + + if getattr(args, "max_source_positions", None) is None: + args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS + if getattr(args, "max_target_positions", None) is None: + args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS + + src_dict, tgt_dict = task.source_dictionary, task.target_dictionary + + if args.share_all_embeddings: + if src_dict != tgt_dict: + raise ValueError("--share-all-embeddings requires a joined dictionary") + if args.encoder_embed_dim != args.decoder_embed_dim: + raise ValueError( + "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim" + ) + if args.decoder_embed_path and ( + args.decoder_embed_path != args.encoder_embed_path + ): + raise ValueError( + "--share-all-embeddings not compatible with --decoder-embed-path" + ) + args.share_decoder_input_output_embed = True + + if getattr(args, "offload_activations", False): + args.checkpoint_activations = True # offloading implies checkpointing + + if not args.share_all_embeddings: + args.min_params_to_wrap = getattr( + args, "min_params_to_wrap", DEFAULT_MIN_PARAMS_TO_WRAP + ) + cfg = TransformerConfig.from_namespace(args) + return super().build_model(cfg, task) + + @classmethod + def build_embedding(cls, args, dictionary, embed_dim, path=None): + return super().build_embedding( + TransformerConfig.from_namespace(args), dictionary, embed_dim, path + ) + + @classmethod + def build_encoder(cls, args, src_dict, embed_tokens): + return super().build_encoder( + TransformerConfig.from_namespace(args), src_dict, embed_tokens + ) + + @classmethod + def build_decoder(cls, args, tgt_dict, embed_tokens): + return super().build_decoder( + TransformerConfig.from_namespace(args), tgt_dict, embed_tokens + ) + + +# architectures + + +@register_model_architecture("transformer", "transformer_tiny") +def tiny_architecture(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 64) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 64) + args.encoder_layers = getattr(args, "encoder_layers", 2) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 2) + args.decoder_layers = getattr(args, "decoder_layers", 2) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 2) + return base_architecture(args) + + +@register_model_architecture("transformer", "transformer") +def base_architecture(args): + args.encoder_embed_path = getattr(args, "encoder_embed_path", None) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) + args.encoder_layers = getattr(args, "encoder_layers", 6) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) + args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) + + args.decoder_embed_path = getattr(args, "decoder_embed_path", None) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) + args.decoder_ffn_embed_dim = getattr( + args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim + ) + args.decoder_layers = getattr(args, "decoder_layers", 6) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) + args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) + args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) + args.attention_dropout = getattr(args, "attention_dropout", 0.0) + args.activation_dropout = getattr(args, "activation_dropout", 0.0) + args.activation_fn = getattr(args, "activation_fn", "relu") + args.dropout = getattr(args, "dropout", 0.1) + args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) + args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) + args.share_decoder_input_output_embed = getattr( + args, "share_decoder_input_output_embed", False + ) + args.share_all_embeddings = getattr(args, "share_all_embeddings", False) + args.merge_src_tgt_embed = getattr(args, "merge_src_tgt_embed", False) + args.no_token_positional_embeddings = getattr( + args, "no_token_positional_embeddings", False + ) + args.adaptive_input = getattr(args, "adaptive_input", False) + args.no_cross_attention = getattr(args, "no_cross_attention", False) + args.cross_self_attention = getattr(args, "cross_self_attention", False) + + args.decoder_output_dim = getattr( + args, "decoder_output_dim", args.decoder_embed_dim + ) + args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) + + args.no_scale_embedding = getattr(args, "no_scale_embedding", False) + args.layernorm_embedding = getattr(args, "layernorm_embedding", False) + args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False) + args.checkpoint_activations = getattr(args, "checkpoint_activations", False) + args.offload_activations = getattr(args, "offload_activations", False) + if args.offload_activations: + args.checkpoint_activations = True + args.encoder_layers_to_keep = getattr(args, "encoder_layers_to_keep", None) + args.decoder_layers_to_keep = getattr(args, "decoder_layers_to_keep", None) + args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0) + args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0) + args.quant_noise_pq = getattr(args, "quant_noise_pq", 0) + args.quant_noise_pq_block_size = getattr(args, "quant_noise_pq_block_size", 8) + args.quant_noise_scalar = getattr(args, "quant_noise_scalar", 0) + + +@register_model_architecture("transformer", "transformer_iwslt_de_en") +def transformer_iwslt_de_en(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) + args.encoder_layers = getattr(args, "encoder_layers", 6) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) + args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 1024) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4) + args.decoder_layers = getattr(args, "decoder_layers", 6) + base_architecture(args) + + +@register_model_architecture("transformer", "transformer_wmt_en_de") +def transformer_wmt_en_de(args): + base_architecture(args) + + +# parameters used in the "Attention Is All You Need" paper (Vaswani et al., 2017) +@register_model_architecture("transformer", "transformer_vaswani_wmt_en_de_big") +def transformer_vaswani_wmt_en_de_big(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024) + args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) + args.dropout = getattr(args, "dropout", 0.3) + base_architecture(args) + + +@register_model_architecture("transformer", "transformer_vaswani_wmt_en_fr_big") +def transformer_vaswani_wmt_en_fr_big(args): + args.dropout = getattr(args, "dropout", 0.1) + transformer_vaswani_wmt_en_de_big(args) + + +@register_model_architecture("transformer", "transformer_wmt_en_de_big") +def transformer_wmt_en_de_big(args): + args.attention_dropout = getattr(args, "attention_dropout", 0.1) + transformer_vaswani_wmt_en_de_big(args) + + +# default parameters used in tensor2tensor implementation +@register_model_architecture("transformer", "transformer_wmt_en_de_big_t2t") +def transformer_wmt_en_de_big_t2t(args): + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True) + args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True) + args.attention_dropout = getattr(args, "attention_dropout", 0.1) + args.activation_dropout = getattr(args, "activation_dropout", 0.1) + transformer_vaswani_wmt_en_de_big(args) diff --git a/fairseq/fairseq/models/wav2vec/__init__.py b/fairseq/fairseq/models/wav2vec/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b756e4580b82b42c735d9e097c62f63cb7798f5e --- /dev/null +++ b/fairseq/fairseq/models/wav2vec/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from .wav2vec import * # noqa +from .wav2vec2 import * # noqa +from .wav2vec2_asr import * # noqa +from .wav2vec2_laser import * # noqa +from .wav2vec2_classification import * # noqa diff --git a/fairseq/fairseq/models/wav2vec/__pycache__/__init__.cpython-310.pyc b/fairseq/fairseq/models/wav2vec/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bdfd96d926ef139f77cc089652c553f4744467b9 Binary files /dev/null and b/fairseq/fairseq/models/wav2vec/__pycache__/__init__.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/wav2vec/__pycache__/utils.cpython-310.pyc b/fairseq/fairseq/models/wav2vec/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4555d6edda81c2ec66bcc5af34e38f53d5ccb04a Binary files /dev/null and b/fairseq/fairseq/models/wav2vec/__pycache__/utils.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/wav2vec/__pycache__/wav2vec.cpython-310.pyc b/fairseq/fairseq/models/wav2vec/__pycache__/wav2vec.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2e9d1dca9630f3fef4ce1f35157143b9835d3c02 Binary files /dev/null and b/fairseq/fairseq/models/wav2vec/__pycache__/wav2vec.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/wav2vec/__pycache__/wav2vec2.cpython-310.pyc b/fairseq/fairseq/models/wav2vec/__pycache__/wav2vec2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d04ea47fe7a8ecfcaed8aee06d8594a9297696a Binary files /dev/null and b/fairseq/fairseq/models/wav2vec/__pycache__/wav2vec2.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/wav2vec/__pycache__/wav2vec2_asr.cpython-310.pyc b/fairseq/fairseq/models/wav2vec/__pycache__/wav2vec2_asr.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe1f4aeb6924676212b23fc5912e1d30a241f189 Binary files /dev/null and b/fairseq/fairseq/models/wav2vec/__pycache__/wav2vec2_asr.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/wav2vec/__pycache__/wav2vec2_classification.cpython-310.pyc b/fairseq/fairseq/models/wav2vec/__pycache__/wav2vec2_classification.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54c078f72573521e7805b40653e3d9f144a0e9c8 Binary files /dev/null and b/fairseq/fairseq/models/wav2vec/__pycache__/wav2vec2_classification.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/wav2vec/__pycache__/wav2vec2_laser.cpython-310.pyc b/fairseq/fairseq/models/wav2vec/__pycache__/wav2vec2_laser.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a18d491a8499c61853d4ffcb1024d5074118ff5c Binary files /dev/null and b/fairseq/fairseq/models/wav2vec/__pycache__/wav2vec2_laser.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/wav2vec/utils.py b/fairseq/fairseq/models/wav2vec/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..dd52d86242ec5289b480cb085b50a68a713cc306 --- /dev/null +++ b/fairseq/fairseq/models/wav2vec/utils.py @@ -0,0 +1,21 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +import torch.nn.functional as F + + +def pad_to_multiple(x, multiple, dim=-1, value=0): + # Inspired from https://github.com/lucidrains/local-attention/blob/master/local_attention/local_attention.py#L41 + if x is None: + return None, 0 + tsz = x.size(dim) + m = tsz / multiple + remainder = math.ceil(m) * multiple - tsz + if m.is_integer(): + return x, 0 + pad_offset = (0,) * (-1 - dim) * 2 + + return F.pad(x, (*pad_offset, 0, remainder), value=value), remainder diff --git a/fairseq/fairseq/models/wav2vec/wav2vec.py b/fairseq/fairseq/models/wav2vec/wav2vec.py new file mode 100644 index 0000000000000000000000000000000000000000..af6604da10f504baabff50bf14a6eb2214bffef3 --- /dev/null +++ b/fairseq/fairseq/models/wav2vec/wav2vec.py @@ -0,0 +1,630 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from dataclasses import dataclass, field +import logging +import math +from typing import Optional, Tuple +from omegaconf import II +import sys + +import torch +import torch.nn as nn +import torch.nn.functional as F +from fairseq.dataclass import ChoiceEnum, FairseqDataclass +from fairseq.models import BaseFairseqModel, register_model +from fairseq.modules import ( + Fp32GroupNorm, + Fp32LayerNorm, + GumbelVectorQuantizer, + KmeansVectorQuantizer, + TransposeLast, +) +from fairseq.tasks import FairseqTask +from fairseq.utils import buffered_arange + + +logger = logging.getLogger(__name__) + + +AGGREGATOR_CHOICES = ChoiceEnum(["cnn", "gru"]) +PROJECT_FEATURES_CHOICES = ChoiceEnum(["none", "same", "new"]) +ACTIVATION_CHOICES = ChoiceEnum(["relu", "gelu"]) +VQ_TYPE_CHOICES = ChoiceEnum(["none", "gumbel", "kmeans"]) + + +@dataclass +class Wav2VecConfig(FairseqDataclass): + prediction_steps: int = field( + default=12, metadata={"help": "number of steps ahead to predict"} + ) + sample_distance: Optional[int] = field( + default=None, + metadata={ + "help": "sample distance from target. does not work properly with cross-sampling" + }, + ) + cross_sample_negatives: int = field( + default=0, metadata={"help": "num of cross sampled negatives"} + ) + num_negatives: int = field( + default=10, metadata={"help": "num of sampled negatives"} + ) + conv_feature_layers: str = field( + default="[(512, 10, 5), (512, 8, 4), (512, 4, 2), (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1), (512, 1, 1)]", + metadata={ + "help": "convolutional feature extraction layers [(dim, kernel_size, stride), ...]" + }, + ) + conv_aggregator_layers: str = field( + default="[(512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)]", + metadata={ + "help": "convolutional aggregator layers [(dim, kernel_size, stride), ...]" + }, + ) + dropout: float = field( + default=0.0, metadata={"help": "dropout to apply within the model"} + ) + dropout_features: float = field( + default=0.0, metadata={"help": "dropout to apply to the features"} + ) + dropout_agg: float = field( + default=0.0, metadata={"help": "dropout to apply after aggregation step"} + ) + aggregator: AGGREGATOR_CHOICES = field( + default="cnn", metadata={"help": "type of aggregator to use"} + ) + gru_dim: int = field(default=512, metadata={"help": "GRU dimensionality"}) + no_conv_bias: bool = field( + default=False, metadata={"help": "if set, does not learn bias for conv layers"} + ) + agg_zero_pad: bool = field( + default=False, + metadata={"help": "if set, zero pads in aggregator instead of repl pad"}, + ) + skip_connections_feat: bool = field( + default=False, + metadata={"help": "if set, adds skip connections to the feature extractor"}, + ) + skip_connections_agg: bool = field( + default=True, + metadata={"help": "if set, adds skip connections to the aggregator"}, + ) + residual_scale: float = field( + default=0.5, metadata={"help": "scales residual by sqrt(value)"} + ) + log_compression: bool = field( + default=True, + metadata={"help": "if set, adds a log compression to feature extractor"}, + ) + balanced_classes: bool = field( + default=False, + metadata={"help": "if set, loss is scaled to balance for number of negatives"}, + ) + project_features: PROJECT_FEATURES_CHOICES = field( + default="none", + metadata={ + "help": "if not none, features are projected using the (same or new) aggregator" + }, + ) + non_affine_group_norm: bool = field( + default=False, metadata={"help": "if set, group norm is not affine"} + ) + offset: str = field( + default="auto", + metadata={ + "help": "if set to 'auto', it is computed automatically from the receptive field, else set to int value" + }, + ) + activation: ACTIVATION_CHOICES = field( + default="relu", + metadata={ + "help": "if set to 'auto', it is computed automatically from the receptive field, else set to int value" + }, + ) + vq_type: VQ_TYPE_CHOICES = field( + default="none", metadata={"help": "which type of quantizer to use"} + ) + vq_vars: int = field( + default=320, + metadata={"help": "project to this many vector quantized variables per group"}, + ) + vq_groups: int = field( + default=2, metadata={"help": "number of groups of latent variables"} + ) + vq_dim: int = field( + default=0, + metadata={ + "help": "uses this dimensionality for quantized vectors. 0 to use model dim // groups" + }, + ) + vq_depth: int = field( + default=1, metadata={"help": "number of layers for vq weight projection"} + ) + combine_groups: bool = field( + default=False, metadata={"help": "if set, variables are shared among groups"} + ) + vq_temp: Tuple[float, float, float] = field( + default=(2.0, 0.5, 0.999995), + metadata={ + "help": "temperature for latent variable sampling with gumbel softmax. should be a tuple of 3 values (start, end, decay)" + }, + ) + vq_gamma: float = field( + default=0.25, + metadata={"help": "gamma parameter for kmeans style vector quantization"}, + ) + infonce: bool = II("criterion.infonce") + + +@register_model("wav2vec", dataclass=Wav2VecConfig) +class Wav2VecModel(BaseFairseqModel): + @classmethod + def build_model(cls, cfg: Wav2VecConfig, task: FairseqTask): + """Build a new model instance.""" + + model = Wav2VecModel(cfg) + logger.info(model) + return model + + def __init__(self, cfg: Wav2VecConfig): + super().__init__() + + self.prediction_steps = cfg.prediction_steps + offset = cfg.offset + + if cfg.activation == "relu": + activation = nn.ReLU() + elif cfg.activation == "gelu": + activation = nn.GELU() + else: + raise Exception("unknown activation " + cfg.activation) + + feature_enc_layers = eval(cfg.conv_feature_layers) + self.feature_extractor = ConvFeatureExtractionModel( + conv_layers=feature_enc_layers, + dropout=0.0, + log_compression=cfg.log_compression, + skip_connections=cfg.skip_connections_feat, + residual_scale=cfg.residual_scale, + non_affine_group_norm=cfg.non_affine_group_norm, + activation=activation, + ) + embed = feature_enc_layers[-1][0] + + self.vector_quantizer = None + if cfg.vq_type == "gumbel": + self.vector_quantizer = GumbelVectorQuantizer( + dim=embed, + num_vars=cfg.vq_vars, + temp=cfg.vq_temp, + groups=cfg.vq_groups, + combine_groups=cfg.combine_groups, + vq_dim=cfg.vq_dim if cfg.vq_dim > 0 else embed, + time_first=False, + activation=activation, + weight_proj_depth=cfg.vq_depth, + weight_proj_factor=2, + ) + elif cfg.vq_type == "kmeans": + self.vector_quantizer = KmeansVectorQuantizer( + dim=embed, + num_vars=cfg.vq_vars, + groups=cfg.vq_groups, + combine_groups=cfg.combine_groups, + vq_dim=cfg.vq_dim if cfg.vq_dim > 0 else embed, + time_first=False, + gamma=cfg.vq_gamma, + ) + else: + assert ( + cfg.vq_type == "none" or cfg.vq_type is None + ), "Unknown quantizer type" + + if cfg.offset == "auto": + jin = 0 + rin = 0 + for _, k, stride in feature_enc_layers: + if rin == 0: + rin = k + rin = rin + (k - 1) * jin + if jin == 0: + jin = stride + else: + jin *= stride + offset = math.ceil(rin / jin) + + offset = int(offset) + + def make_aggregator(): + if cfg.aggregator == "cnn": + agg_layers = eval(cfg.conv_aggregator_layers) + agg_dim = agg_layers[-1][0] + feature_aggregator = ConvAggegator( + conv_layers=agg_layers, + embed=embed, + dropout=cfg.dropout, + skip_connections=cfg.skip_connections_agg, + residual_scale=cfg.residual_scale, + non_affine_group_norm=cfg.non_affine_group_norm, + conv_bias=not cfg.no_conv_bias, + zero_pad=cfg.agg_zero_pad, + activation=activation, + ) + elif cfg.aggregator == "gru": + agg_dim = cfg.gru_dim + feature_aggregator = nn.Sequential( + TransposeLast(), + nn.GRU( + input_size=embed, + hidden_size=agg_dim, + num_layers=1, + dropout=cfg.dropout, + ), + TransposeLast(deconstruct_idx=0), + ) + else: + raise Exception("unknown aggregator type " + cfg.aggregator) + + return feature_aggregator, agg_dim + + self.feature_aggregator, agg_dim = make_aggregator() + + self.wav2vec_predictions = Wav2VecPredictionsModel( + in_dim=agg_dim, + out_dim=embed, + prediction_steps=cfg.prediction_steps, + n_negatives=cfg.num_negatives, + cross_sample_negatives=cfg.cross_sample_negatives, + sample_distance=cfg.sample_distance, + dropout=cfg.dropout, + offset=offset, + balanced_classes=cfg.balanced_classes, + infonce=cfg.infonce, + ) + + self.dropout_feats = nn.Dropout(p=cfg.dropout_features) + self.dropout_agg = nn.Dropout(p=cfg.dropout_agg) + + if cfg.project_features == "none": + self.project_features = None + elif cfg.project_features == "same": + self.project_features = self.feature_aggregator + elif cfg.project_features == "new": + self.project_features, _ = make_aggregator() + + def forward(self, source): + result = {} + + features = self.feature_extractor(source) + if self.vector_quantizer: + q_res = self.vector_quantizer(features) + features = q_res["x"] + for k in q_res.keys(): + if k != "x": + result[k] = q_res[k] + + x = self.dropout_feats(features) + x = self.feature_aggregator(x) + x = self.dropout_agg(x) + + if self.project_features is not None: + features = self.project_features(features) + x, targets = self.wav2vec_predictions(x, features) + result["cpc_logits"] = x + result["cpc_targets"] = targets + + return result + + def upgrade_state_dict_named(self, state_dict, name): + super().upgrade_state_dict_named(state_dict, name) + + def max_positions(self): + """Maximum length supported by the model.""" + return sys.maxsize + + def get_logits(self, net_output): + logits = net_output["cpc_logits"] + return logits + + def get_targets(self, sample, net_output): + t = net_output["cpc_targets"] + if isinstance(t, tuple): + t = t[0] + return t.contiguous() + + def get_target_weights(self, targets, net_output): + targets = net_output["cpc_targets"] + if isinstance(targets, tuple) and targets[-1] is not None: + return targets[-1] + return None + + def get_extra_losses(self, net_output): + loss = None + if "prob_perplexity" in net_output: + loss = net_output["num_vars"] - net_output["prob_perplexity"] + elif "kmeans_loss" in net_output: + loss = net_output["kmeans_loss"] + + return loss + + +def norm_block(is_layer_norm, dim, affine=True): + if is_layer_norm: + mod = nn.Sequential( + TransposeLast(), + Fp32LayerNorm(dim, elementwise_affine=affine), + TransposeLast(), + ) + else: + mod = Fp32GroupNorm(1, dim, affine=affine) + + return mod + + +class ConvFeatureExtractionModel(nn.Module): + def __init__( + self, + conv_layers, + dropout, + log_compression, + skip_connections, + residual_scale, + non_affine_group_norm, + activation, + ): + super().__init__() + + def block(n_in, n_out, k, stride): + return nn.Sequential( + nn.Conv1d(n_in, n_out, k, stride=stride, bias=False), + nn.Dropout(p=dropout), + norm_block( + is_layer_norm=False, dim=n_out, affine=not non_affine_group_norm + ), + activation, + ) + + in_d = 1 + self.conv_layers = nn.ModuleList() + for dim, k, stride in conv_layers: + self.conv_layers.append(block(in_d, dim, k, stride)) + in_d = dim + + self.log_compression = log_compression + self.skip_connections = skip_connections + self.residual_scale = math.sqrt(residual_scale) + + def forward(self, x): + # BxT -> BxCxT + x = x.unsqueeze(1) + + for conv in self.conv_layers: + residual = x + x = conv(x) + if self.skip_connections and x.size(1) == residual.size(1): + tsz = x.size(2) + r_tsz = residual.size(2) + residual = residual[..., :: r_tsz // tsz][..., :tsz] + x = (x + residual) * self.residual_scale + + if self.log_compression: + x = x.abs() + x = x + 1 + x = x.log() + + return x + + +class ZeroPad1d(nn.Module): + def __init__(self, pad_left, pad_right): + super().__init__() + self.pad_left = pad_left + self.pad_right = pad_right + + def forward(self, x): + return F.pad(x, (self.pad_left, self.pad_right)) + + +class ConvAggegator(nn.Module): + def __init__( + self, + conv_layers, + embed, + dropout, + skip_connections, + residual_scale, + non_affine_group_norm, + conv_bias, + zero_pad, + activation, + ): + super().__init__() + + def block(n_in, n_out, k, stride): + # padding dims only really make sense for stride = 1 + ka = k // 2 + kb = ka - 1 if k % 2 == 0 else ka + + pad = ( + ZeroPad1d(ka + kb, 0) if zero_pad else nn.ReplicationPad1d((ka + kb, 0)) + ) + + return nn.Sequential( + pad, + nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias), + nn.Dropout(p=dropout), + norm_block(False, n_out, affine=not non_affine_group_norm), + activation, + ) + + in_d = embed + self.conv_layers = nn.ModuleList() + self.residual_proj = nn.ModuleList() + for dim, k, stride in conv_layers: + if in_d != dim and skip_connections: + self.residual_proj.append(nn.Conv1d(in_d, dim, 1, bias=False)) + else: + self.residual_proj.append(None) + + self.conv_layers.append(block(in_d, dim, k, stride)) + in_d = dim + self.conv_layers = nn.Sequential(*self.conv_layers) + self.skip_connections = skip_connections + self.residual_scale = math.sqrt(residual_scale) + + def forward(self, x): + for rproj, conv in zip(self.residual_proj, self.conv_layers): + residual = x + x = conv(x) + if self.skip_connections: + if rproj is not None: + residual = rproj(residual) + x = (x + residual) * self.residual_scale + return x + + +class Wav2VecPredictionsModel(nn.Module): + def __init__( + self, + in_dim, + out_dim, + prediction_steps, + n_negatives, + cross_sample_negatives, + sample_distance, + dropout, + offset, + balanced_classes, + infonce, + ): + super().__init__() + + self.n_negatives = n_negatives + self.cross_sample_negatives = cross_sample_negatives + self.sample_distance = sample_distance + self.project_to_steps = nn.ConvTranspose2d( + in_dim, out_dim, (1, prediction_steps) + ) + self.dropout = nn.Dropout(p=dropout) + self.offset = offset + self.balanced_classes = balanced_classes + self.infonce = infonce + + def sample_negatives(self, y): + bsz, fsz, tsz = y.shape + + y = y.transpose(0, 1) # BCT -> CBT + y = y.contiguous().view(fsz, -1) # CBT => C(BxT) + + cross_high = tsz * bsz + high = tsz if self.sample_distance is None else min(tsz, self.sample_distance) + assert high > 1 + + neg_idxs = torch.randint(low=0, high=high, size=(bsz, self.n_negatives * tsz)) + + with torch.no_grad(): + if self.n_negatives > 0: + tszs = ( + buffered_arange(tsz) + .unsqueeze(-1) + .expand(-1, self.n_negatives) + .flatten() + ) + + neg_idxs = torch.randint( + low=0, high=high - 1, size=(bsz, self.n_negatives * tsz) + ) + neg_idxs[neg_idxs >= tszs] += 1 + + if self.cross_sample_negatives > 0: + tszs = ( + buffered_arange(tsz) + .unsqueeze(-1) + .expand(-1, self.cross_sample_negatives) + .flatten() + ) + + cross_neg_idxs = torch.randint( + low=0, + high=cross_high - 1, + size=(bsz, self.cross_sample_negatives * tsz), + ) + cross_neg_idxs[cross_neg_idxs >= tszs] += 1 + + if self.n_negatives > 0: + for i in range(1, bsz): + neg_idxs[i] += i * high + else: + neg_idxs = cross_neg_idxs + + if self.cross_sample_negatives > 0 and self.n_negatives > 0: + neg_idxs = torch.cat([neg_idxs, cross_neg_idxs], dim=1) + + negs = y[..., neg_idxs.view(-1)] + negs = negs.view( + fsz, bsz, self.n_negatives + self.cross_sample_negatives, tsz + ).permute( + 2, 1, 0, 3 + ) # to NxBxCxT + + return negs + + def forward(self, x, y): + + x = x.unsqueeze(-1) + x = self.project_to_steps(x) # BxCxTxS + x = self.dropout(x) + + negatives = self.sample_negatives(y) + y = y.unsqueeze(0) + targets = torch.cat([y, negatives], dim=0) # Copies x B x C x T + + copies = targets.size(0) + bsz, dim, tsz, steps = x.shape + steps = min(steps, tsz - self.offset) + + predictions = x.new( + bsz * copies * (tsz - self.offset + 1) * steps + - ((steps + 1) * steps // 2) * copies * bsz + ) + if self.infonce: + labels = predictions.new_full( + (predictions.shape[0] // copies,), 0, dtype=torch.long + ) + else: + labels = torch.zeros_like(predictions) + weights = ( + torch.full_like(labels, 1 / self.n_negatives) + if self.balanced_classes and not self.infonce + else None + ) + + start = end = 0 + for i in range(steps): + offset = i + self.offset + end = start + (tsz - offset) * bsz * copies + if self.infonce: + predictions[start:end] = torch.einsum( + "bct,nbct->tbn", x[..., :-offset, i], targets[..., offset:] + ).flatten() + else: + pos_num = (end - start) // copies + predictions[start:end] = torch.einsum( + "bct,nbct->nbt", x[..., :-offset, i], targets[..., offset:] + ).flatten() + labels[start : start + pos_num] = 1.0 + if weights is not None: + weights[start : start + pos_num] = 1.0 + start = end + assert end == predictions.numel(), "{} != {}".format(end, predictions.numel()) + + if self.infonce: + predictions = predictions.view(-1, copies) + else: + if weights is not None: + labels = (labels, weights) + + return predictions, labels diff --git a/fairseq/fairseq/models/wav2vec/wav2vec2.py b/fairseq/fairseq/models/wav2vec/wav2vec2.py new file mode 100644 index 0000000000000000000000000000000000000000..0faba77f8be8e5acfc9358972606714ee2da5da6 --- /dev/null +++ b/fairseq/fairseq/models/wav2vec/wav2vec2.py @@ -0,0 +1,1499 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +from dataclasses import dataclass, field +from typing import List, Tuple + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from fairseq import utils +from fairseq.data.data_utils import compute_mask_indices +from fairseq.dataclass import ChoiceEnum, FairseqDataclass +from fairseq.distributed import fsdp_wrap +from fairseq.models import BaseFairseqModel, register_model +from fairseq.distributed.fully_sharded_data_parallel import FullyShardedDataParallel +from fairseq.modules import ( + Fp32GroupNorm, + Fp32LayerNorm, + GradMultiply, + GumbelVectorQuantizer, + LayerNorm, + MultiheadAttention, + RelPositionalEncoding, + SamePad, + TransposeLast, +) +from fairseq.modules.checkpoint_activations import checkpoint_wrapper +from fairseq.modules.conformer_layer import ConformerWav2Vec2EncoderLayer +from fairseq.modules.transformer_sentence_encoder import init_bert_params +from fairseq.utils import buffered_arange, index_put, is_xla_tensor + +from .utils import pad_to_multiple + +EXTRACTOR_MODE_CHOICES = ChoiceEnum(["default", "layer_norm"]) +MASKING_DISTRIBUTION_CHOICES = ChoiceEnum(["static", "uniform", "normal", "poisson"]) +LAYER_TYPE_CHOICES = ChoiceEnum(["transformer", "conformer", "trf_adp"]) + + +@dataclass +class Wav2Vec2Config(FairseqDataclass): + extractor_mode: EXTRACTOR_MODE_CHOICES = field( + default="default", + metadata={ + "help": "mode for feature extractor. default has a single group norm with d " + "groups in the first conv block, whereas layer_norm has layer norms in " + "every block (meant to use with normalize=True)" + }, + ) + encoder_layers: int = field( + default=12, metadata={"help": "num encoder layers in the transformer"} + ) + encoder_embed_dim: int = field( + default=768, metadata={"help": "encoder embedding dimension"} + ) + encoder_ffn_embed_dim: int = field( + default=3072, metadata={"help": "encoder embedding dimension for FFN"} + ) + encoder_attention_heads: int = field( + default=12, metadata={"help": "num encoder attention heads"} + ) + activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field( + default="gelu", metadata={"help": "activation function to use"} + ) + layer_type: LAYER_TYPE_CHOICES = field( + default="transformer", metadata={"help": "layer type in encoder"} + ) + # dropouts + dropout: float = field( + default=0.1, metadata={"help": "dropout probability for the transformer"} + ) + attention_dropout: float = field( + default=0.1, metadata={"help": "dropout probability for attention weights"} + ) + activation_dropout: float = field( + default=0.0, metadata={"help": "dropout probability after activation in FFN"} + ) + encoder_layerdrop: float = field( + default=0.0, metadata={"help": "probability of dropping a tarnsformer layer"} + ) + dropout_input: float = field( + default=0.0, + metadata={"help": "dropout to apply to the input (after feat extr)"}, + ) + dropout_features: float = field( + default=0.0, + metadata={"help": "dropout to apply to the features (after feat extr)"}, + ) + + final_dim: int = field( + default=0, + metadata={ + "help": "project final representations and targets to this many dimensions." + "set to encoder_embed_dim is <= 0" + }, + ) + layer_norm_first: bool = field( + default=False, metadata={"help": "apply layernorm first in the transformer"} + ) + conv_feature_layers: str = field( + default="[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]", + metadata={ + "help": "string describing convolutional feature extraction layers in form of a python list that contains " + "[(dim, kernel_size, stride), ...]" + }, + ) + conv_bias: bool = field( + default=False, metadata={"help": "include bias in conv encoder"} + ) + logit_temp: float = field( + default=0.1, metadata={"help": "temperature to divide logits by"} + ) + quantize_targets: bool = field( + default=False, metadata={"help": "use quantized targets"} + ) + quantize_input: bool = field( + default=False, metadata={"help": "use quantized inputs"} + ) + same_quantizer: bool = field( + default=False, metadata={"help": "use same quantizer for inputs and targets"} + ) + target_glu: bool = field( + default=False, metadata={"help": "adds projection + glu to targets"} + ) + feature_grad_mult: float = field( + default=1.0, metadata={"help": "multiply feature extractor var grads by this"} + ) + quantizer_depth: int = field( + default=1, + metadata={"help": "number of quantizer layers"}, + ) + quantizer_factor: int = field( + default=3, + metadata={ + "help": "dimensionality increase for inner quantizer layers (if depth > 1)" + }, + ) + latent_vars: int = field( + default=320, + metadata={"help": "number of latent variables V in each group of the codebook"}, + ) + latent_groups: int = field( + default=2, + metadata={"help": "number of groups G of latent variables in the codebook"}, + ) + latent_dim: int = field( + default=0, + metadata={ + "help": "if > 0, uses this dimensionality for latent variables. " + "otherwise uses final_dim / latent_groups" + }, + ) + + # masking + mask_length: int = field(default=10, metadata={"help": "mask length"}) + mask_prob: float = field( + default=0.65, metadata={"help": "probability of replacing a token with mask"} + ) + mask_selection: MASKING_DISTRIBUTION_CHOICES = field( + default="static", metadata={"help": "how to choose mask length"} + ) + mask_other: float = field( + default=0, + metadata={ + "help": "secondary mask argument (used for more complex distributions), " + "see help in compute_mask_indices" + }, + ) + no_mask_overlap: bool = field( + default=False, metadata={"help": "whether to allow masks to overlap"} + ) + mask_min_space: int = field( + default=1, + metadata={"help": "min space between spans (if no overlap is enabled)"}, + ) + require_same_masks: bool = field( + default=True, + metadata={ + "help": "whether to number of masked timesteps must be the same across all " + "examples in a batch" + }, + ) + mask_dropout: float = field( + default=0.0, + metadata={"help": "percent of masks to unmask for each sample"}, + ) + + # channel masking + mask_channel_length: int = field( + default=10, metadata={"help": "length of the mask for features (channels)"} + ) + mask_channel_prob: float = field( + default=0.0, metadata={"help": "probability of replacing a feature with 0"} + ) + mask_channel_before: bool = False + mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field( + default="static", + metadata={"help": "how to choose mask length for channel masking"}, + ) + mask_channel_other: float = field( + default=0, + metadata={ + "help": "secondary mask argument (used for more complex distributions), " + "see help in compute_mask_indicesh" + }, + ) + no_mask_channel_overlap: bool = field( + default=False, metadata={"help": "whether to allow channel masks to overlap"} + ) + mask_channel_min_space: int = field( + default=1, + metadata={"help": "min space between spans (if no overlap is enabled)"}, + ) + + # negative selection + num_negatives: int = field( + default=100, + metadata={"help": "number of negative examples from the same sample"}, + ) + negatives_from_everywhere: bool = field( + default=False, + metadata={"help": "sample negatives from everywhere, not just masked states"}, + ) + cross_sample_negatives: int = field( + default=0, metadata={"help": "number of negative examples from the any sample"} + ) + codebook_negatives: int = field( + default=0, metadata={"help": "number of negative examples codebook"} + ) + + # positional embeddings + conv_pos: int = field( + default=128, + metadata={"help": "number of filters for convolutional positional embeddings"}, + ) + conv_pos_groups: int = field( + default=16, + metadata={"help": "number of groups for convolutional positional embedding"}, + ) + pos_conv_depth: int = field( + default=1, + metadata={"help": "depth of positional encoder network"}, + ) + + latent_temp: Tuple[float, float, float] = field( + default=(2, 0.5, 0.999995), + metadata={ + "help": "temperature for latent variable sampling. " + "can be tuple of 3 values (start, end, decay)" + }, + ) + max_positions: int = field(default=100000, metadata={"help": "Max positions"}) + checkpoint_activations: bool = field( + default=False, + metadata={"help": "recompute activations and save memory for extra compute"}, + ) + + # FP16 optimization + required_seq_len_multiple: int = field( + default=2, + metadata={ + "help": "pad the input to encoder such that the sequence length is divisible by multiple" + }, + ) + crop_seq_to_multiple: int = field( + default=1, + metadata={ + "help": "crop convolutional feature extractor output such that the sequence length is divisible by multiple" + }, + ) + + # Conformer + depthwise_conv_kernel_size: int = field( + default=31, + metadata={ + "help": "depthwise-conv-kernel-size for convolution in conformer layer" + }, + ) + attn_type: str = field( + default="", + metadata={"help": "if espnet use ESPNET MHA"}, + ) + pos_enc_type: str = field( + default="abs", + metadata={"help": "Positional encoding type to use in conformer"}, + ) + fp16: bool = field(default=False, metadata={"help": "If fp16 is being used"}) + + # Adapter num + adp_num: int = field( + default=-1 + ) + adp_dim: int = field( + default=64 + ) + adp_act_fn: str = field( + default="relu" + ) + adp_trf_idx: str = field( + default="all", + ) + + +@register_model("wav2vec2", dataclass=Wav2Vec2Config) +class Wav2Vec2Model(BaseFairseqModel): + def __init__(self, cfg: Wav2Vec2Config): + super().__init__() + self.cfg = cfg + + feature_enc_layers = eval(cfg.conv_feature_layers) + self.embed = feature_enc_layers[-1][0] + + self.feature_extractor = ConvFeatureExtractionModel( + conv_layers=feature_enc_layers, + dropout=0.0, + mode=cfg.extractor_mode, + conv_bias=cfg.conv_bias, + ) + + self.post_extract_proj = ( + nn.Linear(self.embed, cfg.encoder_embed_dim) + if self.embed != cfg.encoder_embed_dim and not cfg.quantize_input + else None + ) + + self.crop_seq_to_multiple = cfg.crop_seq_to_multiple + + self.mask_prob = cfg.mask_prob + self.mask_selection = cfg.mask_selection + self.mask_other = cfg.mask_other + self.mask_length = cfg.mask_length + self.no_mask_overlap = cfg.no_mask_overlap + self.mask_min_space = cfg.mask_min_space + + self.mask_channel_prob = cfg.mask_channel_prob + self.mask_channel_before = cfg.mask_channel_before + self.mask_channel_selection = cfg.mask_channel_selection + self.mask_channel_other = cfg.mask_channel_other + self.mask_channel_length = cfg.mask_channel_length + self.no_mask_channel_overlap = cfg.no_mask_channel_overlap + self.mask_channel_min_space = cfg.mask_channel_min_space + + self.dropout_input = nn.Dropout(cfg.dropout_input) + self.dropout_features = nn.Dropout(cfg.dropout_features) + + self.feature_grad_mult = cfg.feature_grad_mult + + self.quantizer = None + self.input_quantizer = None + + self.n_negatives = cfg.num_negatives + self.cross_sample_negatives = cfg.cross_sample_negatives + self.codebook_negatives = cfg.codebook_negatives + self.negatives_from_everywhere = cfg.negatives_from_everywhere + + self.logit_temp = cfg.logit_temp + + final_dim = cfg.final_dim if cfg.final_dim > 0 else cfg.encoder_embed_dim + + if cfg.quantize_targets: + vq_dim = cfg.latent_dim if cfg.latent_dim > 0 else final_dim + self.quantizer = GumbelVectorQuantizer( + dim=self.embed, + num_vars=cfg.latent_vars, + temp=cfg.latent_temp, + groups=cfg.latent_groups, + combine_groups=False, + vq_dim=vq_dim, + time_first=True, + weight_proj_depth=cfg.quantizer_depth, + weight_proj_factor=cfg.quantizer_factor, + ) + self.project_q = nn.Linear(vq_dim, final_dim) + else: + self.project_q = nn.Linear(self.embed, final_dim) + + if cfg.quantize_input: + if cfg.same_quantizer and self.quantizer is not None: + vq_dim = final_dim + self.input_quantizer = self.quantizer + else: + vq_dim = cfg.latent_dim if cfg.latent_dim > 0 else cfg.encoder_embed_dim + self.input_quantizer = GumbelVectorQuantizer( + dim=self.embed, + num_vars=cfg.latent_vars, + temp=cfg.latent_temp, + groups=cfg.latent_groups, + combine_groups=False, + vq_dim=vq_dim, + time_first=True, + weight_proj_depth=cfg.quantizer_depth, + weight_proj_factor=cfg.quantizer_factor, + ) + self.project_inp = nn.Linear(vq_dim, cfg.encoder_embed_dim) + + self.mask_emb = nn.Parameter( + torch.FloatTensor(cfg.encoder_embed_dim).uniform_() + ) + encoder_cls = TransformerEncoder + if cfg.layer_type == "conformer" and cfg.pos_enc_type in ["rel_pos", "rope"]: + encoder_cls = ConformerEncoder + + self.encoder = encoder_cls(cfg) + self.layer_norm = LayerNorm(self.embed) + + self.target_glu = None + if cfg.target_glu: + self.target_glu = nn.Sequential( + nn.Linear(final_dim, final_dim * 2), nn.GLU() + ) + + self.final_proj = nn.Linear(cfg.encoder_embed_dim, final_dim) + + def upgrade_state_dict_named(self, state_dict, name): + super().upgrade_state_dict_named(state_dict, name) + """Upgrade a (possibly old) state dict for new versions of fairseq.""" + return state_dict + + @classmethod + def build_model(cls, cfg: Wav2Vec2Config, task=None): + """Build a new model instance.""" + + return cls(cfg) + + def apply_mask( + self, + x, + padding_mask, + mask_indices=None, + mask_channel_indices=None, + ): + B, T, C = x.shape + + if self.mask_channel_prob > 0 and self.mask_channel_before: + mask_channel_indices = compute_mask_indices( + (B, C), + None, + self.mask_channel_prob, + self.mask_channel_length, + self.mask_channel_selection, + self.mask_channel_other, + no_overlap=self.no_mask_channel_overlap, + min_space=self.mask_channel_min_space, + ) + mask_channel_indices = ( + torch.from_numpy(mask_channel_indices) + .to(x.device) + .unsqueeze(1) + .expand(-1, T, -1) + ) + x[mask_channel_indices] = 0 + + if self.mask_prob > 0: + if mask_indices is None: + mask_indices = compute_mask_indices( + (B, T), + padding_mask, + self.mask_prob, + self.mask_length, + self.mask_selection, + self.mask_other, + min_masks=2, + no_overlap=self.no_mask_overlap, + min_space=self.mask_min_space, + require_same_masks=self.cfg.require_same_masks, + mask_dropout=self.cfg.mask_dropout, + ) + mask_indices = torch.from_numpy(mask_indices).to(x.device) + x = index_put(x, mask_indices, self.mask_emb) + else: + mask_indices = None + + if self.mask_channel_prob > 0 and not self.mask_channel_before: + if mask_channel_indices is None: + mask_channel_indices = compute_mask_indices( + (B, C), + None, + self.mask_channel_prob, + self.mask_channel_length, + self.mask_channel_selection, + self.mask_channel_other, + no_overlap=self.no_mask_channel_overlap, + min_space=self.mask_channel_min_space, + ) + mask_channel_indices = ( + torch.from_numpy(mask_channel_indices) + .to(x.device) + .unsqueeze(1) + .expand(-1, T, -1) + ) + x = index_put(x, mask_channel_indices, 0) + + return x, mask_indices + + def sample_negatives(self, y, num, padding_count=None): + + if self.n_negatives == 0 and self.cross_sample_negatives == 0: + return y.new(0) + + bsz, tsz, fsz = y.shape + y = y.view(-1, fsz) # BTC => (BxT)C + + # FIXME: what happens if padding_count is specified? + cross_high = tsz * bsz + high = tsz - (padding_count or 0) + with torch.no_grad(): + assert high > 1, f"{bsz,tsz,fsz}" + + if self.n_negatives > 0: + tszs = ( + buffered_arange(num) + .unsqueeze(-1) + .expand(-1, self.n_negatives) + .flatten() + ) + + neg_idxs = torch.randint( + low=0, high=high - 1, size=(bsz, self.n_negatives * num) + ) + neg_idxs[neg_idxs >= tszs] += 1 + + if self.cross_sample_negatives > 0: + tszs = ( + buffered_arange(num) + .unsqueeze(-1) + .expand(-1, self.cross_sample_negatives) + .flatten() + ) + + cross_neg_idxs = torch.randint( + low=0, + high=cross_high - 1, + size=(bsz, self.cross_sample_negatives * num), + ) + cross_neg_idxs[cross_neg_idxs >= tszs] += 1 + + if self.n_negatives > 0: + neg_idxs = neg_idxs + (torch.arange(bsz).unsqueeze(1) * high) + else: + neg_idxs = cross_neg_idxs + + if self.cross_sample_negatives > 0 and self.n_negatives > 0: + neg_idxs = torch.cat([neg_idxs, cross_neg_idxs], dim=1) + + negs = y[neg_idxs.view(-1)] + negs = negs.view( + bsz, num, self.n_negatives + self.cross_sample_negatives, fsz + ).permute( + 2, 0, 1, 3 + ) # to NxBxTxC + return negs, neg_idxs + + def compute_preds(self, x, y, negatives): + + neg_is_pos = (y == negatives).all(-1) + y = y.unsqueeze(0) + targets = torch.cat([y, negatives], dim=0) + + logits = torch.cosine_similarity(x.float(), targets.float(), dim=-1) + logits = logits / self.logit_temp + logits = logits.type_as(x) + + if is_xla_tensor(logits) or neg_is_pos.any(): + if not hasattr(self, "_inftensor"): + fillval = -float(2**30) + self._inftensor = ( + torch.tensor(fillval).to(x.device) + if is_xla_tensor(logits) + else float("-inf") + ) + logits[1:] = index_put(logits[1:], neg_is_pos, self._inftensor) + + return logits + + def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor): + """ + Computes the output length of the convolutional layers + """ + + def _conv_out_length(input_length, kernel_size, stride): + return torch.floor((input_length - kernel_size) / stride + 1) + + conv_cfg_list = eval(self.cfg.conv_feature_layers) + + for i in range(len(conv_cfg_list)): + input_lengths = _conv_out_length( + input_lengths, conv_cfg_list[i][1], conv_cfg_list[i][2] + ) + + return input_lengths.to(torch.long) + + def forward( + self, + source, + padding_mask=None, + mask=True, + features_only=False, + layer=None, + mask_indices=None, + mask_channel_indices=None, + padding_count=None, + corpus_key=None, + ): + + if self.feature_grad_mult > 0: + features = self.feature_extractor(source) + if self.feature_grad_mult != 1.0: + features = GradMultiply.apply(features, self.feature_grad_mult) + else: + with torch.no_grad(): + features = self.feature_extractor(source) + + features_pen = features.float().pow(2).mean() + + features = features.transpose(1, 2) + features = self.layer_norm(features) + unmasked_features = features.clone() + + if padding_mask is not None and padding_mask.any(): + input_lengths = (1 - padding_mask.long()).sum(-1) + # apply conv formula to get real output_lengths + output_lengths = self._get_feat_extract_output_lengths(input_lengths) + + padding_mask = torch.zeros( + features.shape[:2], dtype=features.dtype, device=features.device + ) + + # these two operations makes sure that all values + # before the output lengths indices are attended to + padding_mask[ + ( + torch.arange(padding_mask.shape[0], device=padding_mask.device), + output_lengths - 1, + ) + ] = 1 + padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool() + else: + padding_mask = None + + time_steps_to_drop = features.size(1) % self.crop_seq_to_multiple + if time_steps_to_drop != 0: + features = features[:, :-time_steps_to_drop] + unmasked_features = unmasked_features[:, :-time_steps_to_drop] + if padding_mask is not None: + padding_mask = padding_mask[:, :-time_steps_to_drop] + + if self.post_extract_proj is not None: + features = self.post_extract_proj(features) + + features = self.dropout_input(features) + unmasked_features = self.dropout_features(unmasked_features) + + num_vars = None + code_ppl = None + prob_ppl = None + curr_temp = None + + if self.input_quantizer: + q = self.input_quantizer(features, produce_targets=False) + features = q["x"] + num_vars = q["num_vars"] + code_ppl = q["code_perplexity"] + prob_ppl = q["prob_perplexity"] + curr_temp = q["temp"] + features = self.project_inp(features) + + if mask: + x, mask_indices = self.apply_mask( + features, + padding_mask, + mask_indices=mask_indices, + mask_channel_indices=mask_channel_indices, + ) + if not is_xla_tensor(x) and mask_indices is not None: + # tpu-comment: reducing the size in a dynamic way causes + # too many recompilations on xla. + y = unmasked_features[mask_indices].view( + unmasked_features.size(0), -1, unmasked_features.size(-1) + ) + else: + y = unmasked_features + else: + x = features + y = unmasked_features + mask_indices = None + + x, layer_results = self.encoder( + x, padding_mask=padding_mask, layer=layer, corpus_key=corpus_key + ) + + if features_only: + return { + "x": x, + "padding_mask": padding_mask, + "features": unmasked_features, + "layer_results": layer_results, + } + + if self.quantizer: + if self.negatives_from_everywhere: + q = self.quantizer(unmasked_features, produce_targets=False) + y = q["x"] + num_vars = q["num_vars"] + code_ppl = q["code_perplexity"] + prob_ppl = q["prob_perplexity"] + curr_temp = q["temp"] + y = self.project_q(y) + + negs, _ = self.sample_negatives( + y, + mask_indices[0].sum(), + padding_count=padding_count, + ) + y = y[mask_indices].view(y.size(0), -1, y.size(-1)) + + else: + q = self.quantizer(y, produce_targets=False) + y = q["x"] + num_vars = q["num_vars"] + code_ppl = q["code_perplexity"] + prob_ppl = q["prob_perplexity"] + curr_temp = q["temp"] + + y = self.project_q(y) + + negs, _ = self.sample_negatives( + y, + y.size(1), + padding_count=padding_count, + ) + + if self.codebook_negatives > 0: + cb_negs = self.quantizer.sample_from_codebook( + y.size(0) * y.size(1), self.codebook_negatives + ) + cb_negs = cb_negs.view( + self.codebook_negatives, y.size(0), y.size(1), -1 + ) # order doesnt matter + cb_negs = self.project_q(cb_negs) + negs = torch.cat([negs, cb_negs], dim=0) + else: + y = self.project_q(y) + + if self.negatives_from_everywhere: + negs, _ = self.sample_negatives( + unmasked_features, + y.size(1), + padding_count=padding_count, + ) + negs = self.project_q(negs) + else: + negs, _ = self.sample_negatives( + y, + y.size(1), + padding_count=padding_count, + ) + + if not is_xla_tensor(x): + # tpu-comment: reducing the size in a dynamic way causes + # too many recompilations on xla. + x = x[mask_indices].view(x.size(0), -1, x.size(-1)) + + if self.target_glu: + y = self.target_glu(y) + negs = self.target_glu(negs) + + x = self.final_proj(x) + x = self.compute_preds(x, y, negs) + + result = { + "x": x, + "padding_mask": padding_mask, + "features_pen": features_pen, + } + + if prob_ppl is not None: + result["prob_perplexity"] = prob_ppl + result["code_perplexity"] = code_ppl + result["num_vars"] = num_vars + result["temp"] = curr_temp + + return result + + def quantize(self, x): + assert self.quantizer is not None + x = self.feature_extractor(x) + x = x.transpose(1, 2) + x = self.layer_norm(x) + return self.quantizer.forward_idx(x) + + def extract_features( + self, source, padding_mask, mask=False, layer=None, corpus_key=None + ): + res = self.forward( + source, + padding_mask, + mask=mask, + features_only=True, + layer=layer, + corpus_key=corpus_key, + ) + return res + + def get_logits(self, net_output): + logits = net_output["x"] + logits = logits.transpose(0, 2) + logits = logits.reshape(-1, logits.size(-1)) + return logits + + def get_targets(self, sample, net_output, expand_steps=True): + x = net_output["x"] + return x.new_zeros(x.size(1) * x.size(2), dtype=torch.long) + + def get_extra_losses(self, net_output): + pen = [] + + if "prob_perplexity" in net_output: + pen.append( + (net_output["num_vars"] - net_output["prob_perplexity"]) + / net_output["num_vars"] + ) + + if "features_pen" in net_output: + pen.append(net_output["features_pen"]) + + return pen + + def remove_pretraining_modules(self, last_layer=None): + self.quantizer = None + self.project_q = None + self.target_glu = None + self.final_proj = None + + if last_layer is not None: + self.encoder.layers = nn.ModuleList( + l for i, l in enumerate(self.encoder.layers) if i <= last_layer + ) + + +class ConvFeatureExtractionModel(nn.Module): + def __init__( + self, + conv_layers: List[Tuple[int, int, int]], + dropout: float = 0.0, + mode: str = "default", + conv_bias: bool = False, + ): + super().__init__() + + assert mode in {"default", "layer_norm"} + + def block( + n_in, + n_out, + k, + stride, + is_layer_norm=False, + is_group_norm=False, + conv_bias=False, + ): + def make_conv(): + conv = nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias) + nn.init.kaiming_normal_(conv.weight) + return conv + + assert ( + is_layer_norm and is_group_norm + ) == False, "layer norm and group norm are exclusive" + + if is_layer_norm: + return nn.Sequential( + make_conv(), + nn.Dropout(p=dropout), + nn.Sequential( + TransposeLast(), + Fp32LayerNorm(dim, elementwise_affine=True), + TransposeLast(), + ), + nn.GELU(), + ) + elif is_group_norm: + return nn.Sequential( + make_conv(), + nn.Dropout(p=dropout), + Fp32GroupNorm(dim, dim, affine=True), + nn.GELU(), + ) + else: + return nn.Sequential(make_conv(), nn.Dropout(p=dropout), nn.GELU()) + + in_d = 1 + self.conv_layers = nn.ModuleList() + for i, cl in enumerate(conv_layers): + assert len(cl) == 3, "invalid conv definition: " + str(cl) + (dim, k, stride) = cl + + self.conv_layers.append( + block( + in_d, + dim, + k, + stride, + is_layer_norm=mode == "layer_norm", + is_group_norm=mode == "default" and i == 0, + conv_bias=conv_bias, + ) + ) + in_d = dim + + def forward(self, x): + + # BxT -> BxCxT + x = x.unsqueeze(1) + + for conv in self.conv_layers: + x = conv(x) + + return x + + +def make_conv_pos(e, k, g, is_batch_norm=False): + pos_conv = nn.Conv1d( + e, + e, + kernel_size=k, + padding=k // 2, + groups=g, + ) + dropout = 0 + std = math.sqrt((4 * (1.0 - dropout)) / (k * e)) + nn.init.normal_(pos_conv.weight, mean=0, std=std) + nn.init.constant_(pos_conv.bias, 0) + + if not is_batch_norm: + pos_conv = nn.utils.weight_norm(pos_conv, name="weight", dim=2) + pos_conv = nn.Sequential(pos_conv, SamePad(k), nn.GELU()) + else: + batch_norm = nn.BatchNorm1d(e) + pos_conv = nn.Sequential(batch_norm, pos_conv, SamePad(k), nn.GELU()) + + return pos_conv + + +class TransformerEncoder(nn.Module): + def build_encoder_layer(self, args: Wav2Vec2Config, **kwargs): + if args.layer_type == "transformer": + layer = TransformerSentenceEncoderLayer( + embedding_dim=self.embedding_dim, + ffn_embedding_dim=args.encoder_ffn_embed_dim, + num_attention_heads=args.encoder_attention_heads, + dropout=self.dropout, + attention_dropout=args.attention_dropout, + activation_dropout=args.activation_dropout, + activation_fn=args.activation_fn, + layer_norm_first=args.layer_norm_first, + ) + elif args.layer_type == "conformer": + layer = ConformerWav2Vec2EncoderLayer( + embed_dim=self.embedding_dim, + ffn_embed_dim=args.encoder_ffn_embed_dim, + attention_heads=args.encoder_attention_heads, + dropout=args.dropout, + depthwise_conv_kernel_size=args.depthwise_conv_kernel_size, + activation_fn="swish", + attn_type=args.attn_type, + use_fp16=args.fp16, + pos_enc_type="abs", + ) + elif args.layer_type == "trf_adp": + use_adp = False + if args.adp_trf_idx == "all": + use_adp = True + else: + adp_trf_idx = list(range(*[int(g) for g in args.adp_trf_idx.split(":")])) + if kwargs.get("layer_idx", None) in adp_trf_idx: + use_adp = True + if use_adp: + layer = TransformerSentenceEncoderWithAdapterLayer( + embedding_dim=self.embedding_dim, + ffn_embedding_dim=args.encoder_ffn_embed_dim, + num_attention_heads=args.encoder_attention_heads, + dropout=self.dropout, + attention_dropout=args.attention_dropout, + activation_dropout=args.activation_dropout, + activation_fn=args.activation_fn, + layer_norm_first=args.layer_norm_first, + adapter_num=args.adp_num, + adapter_dim=args.adp_dim, + adapter_act_fn=args.adp_act_fn, + ) + else: + layer = TransformerSentenceEncoderLayer( + embedding_dim=self.embedding_dim, + ffn_embedding_dim=args.encoder_ffn_embed_dim, + num_attention_heads=args.encoder_attention_heads, + dropout=self.dropout, + attention_dropout=args.attention_dropout, + activation_dropout=args.activation_dropout, + activation_fn=args.activation_fn, + layer_norm_first=args.layer_norm_first, + ) + + layer = fsdp_wrap(layer) + if args.checkpoint_activations: + layer = checkpoint_wrapper(layer) + return layer + + def __init__(self, args: Wav2Vec2Config, skip_pos_conv: bool = False, override_encoder_layer: int = None): + super().__init__() + + self.dropout = args.dropout + self.embedding_dim = args.encoder_embed_dim + self.required_seq_len_multiple = args.required_seq_len_multiple + + pos_conv_depth = getattr(args, "pos_conv_depth", 1) + if pos_conv_depth > 1: + num_layers = args.pos_conv_depth + k = max(3, args.conv_pos // num_layers) + + def make_conv_block(e, k, g, l): + return nn.Sequential( + *[ + nn.Sequential( + nn.Conv1d( + e, + e, + kernel_size=k, + padding=k // 2, + groups=g, + ), + SamePad(k), + TransposeLast(), + LayerNorm(e, elementwise_affine=False), + TransposeLast(), + nn.GELU(), + ) + for _ in range(l) + ] + ) + + self.pos_conv = make_conv_block( + self.embedding_dim, k, args.conv_pos_groups, num_layers + ) + elif skip_pos_conv: + self.pos_conv = None + else: + self.pos_conv = make_conv_pos( + self.embedding_dim, + args.conv_pos, + args.conv_pos_groups, + is_batch_norm=args.conv_pos_batch_norm + if hasattr(args, "conv_pos_batch_norm") + else False, + ) + + if override_encoder_layer is None: + encoder_layers = args.encoder_layers + else: + encoder_layers = override_encoder_layer + + self.layers = nn.ModuleList( + [self.build_encoder_layer(args, layer_idx=ii) for ii in range(encoder_layers)] + ) + self.layer_norm_first = args.layer_norm_first + self.layer_norm = LayerNorm(self.embedding_dim) + self.layerdrop = args.encoder_layerdrop + + self.apply(init_bert_params) + + def forward(self, x, padding_mask=None, layer=None, corpus_key=None): + x, layer_results = self.extract_features( + x, padding_mask, layer, corpus_key=corpus_key + ) + + if self.layer_norm_first and layer is None: + x = self.layer_norm(x) + + return x, layer_results + + def extract_features( + self, + x, + padding_mask=None, + tgt_layer=None, + min_layer=0, + corpus_key=None, + ): + + if padding_mask is not None: + x = index_put(x, padding_mask, 0) + + if self.pos_conv is not None: + x_conv = self.pos_conv(x.transpose(1, 2)) + x_conv = x_conv.transpose(1, 2) + x = x + x_conv + + if not self.layer_norm_first: + x = self.layer_norm(x) + + # pad to the sequence length dimension + x, pad_length = pad_to_multiple( + x, self.required_seq_len_multiple, dim=-2, value=0 + ) + if pad_length > 0 and padding_mask is None: + padding_mask = x.new_zeros((x.size(0), x.size(1)), dtype=torch.bool) + padding_mask[:, -pad_length:] = True + else: + padding_mask, _ = pad_to_multiple( + padding_mask, self.required_seq_len_multiple, dim=-1, value=True + ) + x = F.dropout(x, p=self.dropout, training=self.training) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + layer_results = [] + r = None + + for i, layer in enumerate(self.layers): + dropout_probability = np.random.random() if self.layerdrop > 0 else 1 + if not self.training or (dropout_probability > self.layerdrop): + layer_check = layer + if isinstance(layer, FullyShardedDataParallel): + layer_check = layer.unwrapped_module + if (corpus_key is None) or ( + not isinstance(layer_check, ( + TransformerSentenceEncoderWithAdapterLayer, + ) + ) + ): + x, (z, lr) = layer( + x, self_attn_padding_mask=padding_mask, need_weights=False + ) + else: + x, (z, lr) = layer( + x, + self_attn_padding_mask=padding_mask, + need_weights=False, + corpus_key=corpus_key, + ) + if i >= min_layer: + layer_results.append((x, z, lr)) + if i == tgt_layer: + r = x + break + + if r is not None: + x = r + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + # undo paddding + if pad_length > 0: + x = x[:, :-pad_length] + + def undo_pad(a, b, c): + return ( + a[:-pad_length], + b[:-pad_length] if b is not None else b, + c[:-pad_length], + ) + + layer_results = [undo_pad(*u) for u in layer_results] + + return x, layer_results + + def max_positions(self): + """Maximum output length supported by the encoder.""" + return self.args.max_positions + + def upgrade_state_dict_named(self, state_dict, name): + """Upgrade a (possibly old) state dict for new versions of fairseq.""" + return state_dict + + +class ConformerEncoder(TransformerEncoder): + def build_encoder_layer(self, args): + layer = ConformerWav2Vec2EncoderLayer( + embed_dim=self.embedding_dim, + ffn_embed_dim=args.encoder_ffn_embed_dim, + attention_heads=args.encoder_attention_heads, + dropout=args.dropout, + depthwise_conv_kernel_size=args.depthwise_conv_kernel_size, + activation_fn="swish", + attn_type=args.attn_type, + pos_enc_type=args.pos_enc_type, + use_fp16=args.fp16, # only used for rope + ) + layer = fsdp_wrap(layer) + if args.checkpoint_activations: + layer = checkpoint_wrapper(layer) + return layer + + def __init__(self, args): + super().__init__(args) + self.args = args + self.dropout = args.dropout + self.embedding_dim = args.encoder_embed_dim + self.pos_enc_type = args.pos_enc_type + max_source_positions = self.max_positions() + + if self.pos_enc_type == "rel_pos": + self.embed_positions = RelPositionalEncoding( + max_source_positions, self.embedding_dim + ) + elif self.pos_enc_type == "rope": + self.embed_positions = None + else: + raise Exception("Unsupported positional encoding type") + + self.layers = nn.ModuleList( + [self.build_encoder_layer(args) for _ in range(args.encoder_layers)] + ) + self.layer_norm_first = args.layer_norm_first + self.layer_norm = LayerNorm(self.embedding_dim) + self.layerdrop = args.encoder_layerdrop + + self.apply(init_bert_params) + + def extract_features(self, x, padding_mask=None, tgt_layer=None): + if padding_mask is not None: + x = index_put(x, padding_mask, 0) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + # B X T X C here + position_emb = None + if self.pos_enc_type == "rel_pos": + position_emb = self.embed_positions(x) + + if not self.layer_norm_first: + x = self.layer_norm(x) + + x = F.dropout(x, p=self.dropout, training=self.training) + + layer_results = [] + r = None + for i, layer in enumerate(self.layers): + dropout_probability = np.random.random() + if not self.training or (dropout_probability > self.layerdrop): + x, z = layer( + x, + self_attn_padding_mask=padding_mask, + need_weights=False, + position_emb=position_emb, + ) + if tgt_layer is not None: + layer_results.append((x, z)) + if i == tgt_layer: + r = x + break + + if r is not None: + x = r + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + return x, layer_results + + +class TransformerSentenceEncoderLayer(nn.Module): + """ + Implements a Transformer Encoder Layer used in BERT/XLM style pre-trained + models. + """ + + def __init__( + self, + embedding_dim: float = 768, + ffn_embedding_dim: float = 3072, + num_attention_heads: int = 8, + dropout: float = 0.1, + attention_dropout: float = 0.1, + activation_dropout: float = 0.1, + activation_fn: str = "relu", + layer_norm_first: bool = False, + ) -> None: + + super().__init__() + # Initialize parameters + self.embedding_dim = embedding_dim + self.dropout = dropout + self.activation_dropout = activation_dropout + + # Initialize blocks + self.activation_fn = utils.get_activation_fn(activation_fn) + self.self_attn = MultiheadAttention( + self.embedding_dim, + num_attention_heads, + dropout=attention_dropout, + self_attention=True, + ) + + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(self.activation_dropout) + self.dropout3 = nn.Dropout(dropout) + + self.layer_norm_first = layer_norm_first + + # layer norm associated with the self attention layer + self.self_attn_layer_norm = LayerNorm(self.embedding_dim) + self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim) + self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim) + + # layer norm associated with the position wise feed-forward NN + self.final_layer_norm = LayerNorm(self.embedding_dim) + + def forward( + self, + x: torch.Tensor, + self_attn_mask: torch.Tensor = None, + self_attn_padding_mask: torch.Tensor = None, + need_weights: bool = False, + att_args=None, + ): + """ + LayerNorm is applied either before or after the self-attention/ffn + modules similar to the original Transformer imlementation. + """ + residual = x + + if self.layer_norm_first: + x = self.self_attn_layer_norm(x) + x, attn = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=self_attn_padding_mask, + attn_mask=self_attn_mask, + need_weights=False, + ) + x = self.dropout1(x) + x = residual + x + + residual = x + x = self.final_layer_norm(x) + x = self.activation_fn(self.fc1(x)) + x = self.dropout2(x) + x = self.fc2(x) + + layer_result = x + + x = self.dropout3(x) + x = residual + x + else: + x, attn = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=self_attn_padding_mask, + need_weights=False, + ) + + x = self.dropout1(x) + x = residual + x + + x = self.self_attn_layer_norm(x) + + residual = x + x = self.activation_fn(self.fc1(x)) + x = self.dropout2(x) + x = self.fc2(x) + + layer_result = x + + x = self.dropout3(x) + x = residual + x + x = self.final_layer_norm(x) + + return x, (attn, layer_result) + + +class AdapterFast(nn.Module): + def __init__(self, adapter_num, input_dim, hidden_dim, act_fn): + """ + Implements adapter modules directly with 3D tensor weight as parameters + and without using ModuleList orto speed up training throughput. + """ + super().__init__() + + self.adapter_num = adapter_num + self.input_dim = input_dim + self.hidden_dim = hidden_dim + self.W_a = nn.Parameter(torch.empty(adapter_num, hidden_dim, input_dim)) + self.W_b = nn.Parameter(torch.empty(adapter_num, input_dim, hidden_dim)) + self.b_a = nn.Parameter(torch.empty(adapter_num, hidden_dim)) + self.b_b = nn.Parameter(torch.empty(adapter_num, input_dim)) + + self.ln_W = nn.Parameter(torch.empty(adapter_num, input_dim)) + self.ln_b = nn.Parameter(torch.empty(adapter_num, input_dim)) + self.act_fn = nn.Identity() + if act_fn == "relu": + self.act_fn = nn.ReLU() + elif act_fn == "gelu": + self.act_fn = nn.GELU() + elif act_fn == "selu": + self.act_fn = nn.SELU() + else: + raise ValueError(f"unsupported {act_fn}") + + + self.input_dim = input_dim + self.reset_parameters() + + def reset_parameters(self): + for ii in range(self.adapter_num): + nn.init.kaiming_uniform_(self.W_a[ii], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.W_b[ii], a=math.sqrt(5)) + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.W_a[ii]) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + nn.init.uniform_(self.b_a[ii], -bound, bound) + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.W_b[ii]) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + nn.init.uniform_(self.b_b[ii], -bound, bound) + + nn.init.ones_(self.ln_W) + nn.init.zeros_(self.ln_b) + + def forward(self, x, adapter_id): + ii = adapter_id + h = x + h = F.layer_norm(h, (self.input_dim, ), self.ln_W[ii], self.ln_b[ii]) + h = F.linear(h, self.W_a[ii], self.b_a[ii]) + h = self.act_fn(h) + h = F.linear(h, self.W_b[ii], self.b_b[ii]) + outputs = h + return outputs + + def extra_repr(self): + return ('adapter={}, input_dim={}, hidden_dim={}'.format(self.adapter_num, self.input_dim, self.hidden_dim)) + + + +class TransformerSentenceEncoderWithAdapterLayer(TransformerSentenceEncoderLayer): + """ + Implements a Transformer Encoder Layer with adapters used in BERT/XLM style pre-trained + models. An adapter module is added along with vanilla Transformer module. + """ + + def __init__( + self, + embedding_dim: float = 768, + ffn_embedding_dim: float = 3072, + num_attention_heads: int = 8, + dropout: float = 0.1, + attention_dropout: float = 0.1, + activation_dropout: float = 0.1, + activation_fn: str = "relu", + layer_norm_first: bool = False, + adapter_num=201, + adapter_dim=64, + adapter_act_fn="relu", + ) -> None: + + super().__init__( + embedding_dim=embedding_dim, + ffn_embedding_dim=ffn_embedding_dim, + num_attention_heads=num_attention_heads, + dropout=dropout, + attention_dropout=attention_dropout, + activation_dropout=activation_dropout, + activation_fn=activation_fn, + layer_norm_first=layer_norm_first, + + ) + + self.adapter_num = adapter_num + self.adapter_dim = adapter_dim + self.adapter_layer = AdapterFast(adapter_num, self.embedding_dim, self.adapter_dim, adapter_act_fn) + + def forward( + self, + x: torch.Tensor, + self_attn_mask: torch.Tensor = None, + self_attn_padding_mask: torch.Tensor = None, + need_weights: bool = False, + att_args=None, + corpus_key=None, + ): + + x, (attn, layer_result) = super().forward( + x=x, + self_attn_mask=self_attn_mask, + self_attn_padding_mask=self_attn_padding_mask, + need_weights=need_weights, + att_args=att_args, + ) + assert corpus_key is not None + assert len(set(corpus_key)) == 1, f"corpus_key items are not same {corpus_key}" + y = self.adapter_layer(x, corpus_key[0]) + x = x + y + return x, (attn, layer_result) diff --git a/fairseq/fairseq/models/wav2vec/wav2vec2_asr.py b/fairseq/fairseq/models/wav2vec/wav2vec2_asr.py new file mode 100644 index 0000000000000000000000000000000000000000..0403efebb9b4b48c4d8f518e127e7663395329e8 --- /dev/null +++ b/fairseq/fairseq/models/wav2vec/wav2vec2_asr.py @@ -0,0 +1,878 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import contextlib +import copy +import logging +import math +import re +from argparse import Namespace +from dataclasses import dataclass, field +from typing import Any, Optional + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from omegaconf import II, MISSING, open_dict + +from fairseq import checkpoint_utils, tasks, utils +from fairseq.dataclass import FairseqDataclass +from fairseq.dataclass.utils import convert_namespace_to_omegaconf +from fairseq.models import ( + BaseFairseqModel, + FairseqEncoder, + FairseqEncoderDecoderModel, + FairseqIncrementalDecoder, + register_model, +) +from fairseq.models.wav2vec.wav2vec2 import MASKING_DISTRIBUTION_CHOICES, LAYER_TYPE_CHOICES, AdapterFast +from fairseq.modules import LayerNorm, PositionalEmbedding, TransformerDecoderLayer +from fairseq.tasks import FairseqTask + +logger = logging.getLogger(__name__) + + +@dataclass +class Wav2Vec2AsrConfig(FairseqDataclass): + w2v_path: str = field( + default=MISSING, metadata={"help": "path to wav2vec 2.0 model"} + ) + no_pretrained_weights: bool = field( + default=False, metadata={"help": "if true, does not load pretrained weights"} + ) + dropout_input: float = field( + default=0.0, + metadata={"help": "dropout to apply to the input (after feat extr)"}, + ) + + final_dropout: float = field( + default=0.0, + metadata={"help": "dropout after transformer and before final projection"}, + ) + dropout: float = field( + default=0.0, metadata={"help": "dropout probability inside wav2vec 2.0 model"} + ) + attention_dropout: float = field( + default=0.0, + metadata={ + "help": "dropout probability for attention weights inside wav2vec 2.0 model" + }, + ) + activation_dropout: float = field( + default=0.0, + metadata={ + "help": "dropout probability after activation in FFN inside wav2vec 2.0 model" + }, + ) + + # masking + apply_mask: bool = field( + default=False, metadata={"help": "apply masking during fine-tuning"} + ) + mask_length: int = field( + default=10, metadata={"help": "repeat the mask indices multiple times"} + ) + mask_prob: float = field( + default=0.5, + metadata={ + "help": "probability of replacing a token with mask (normalized by length)" + }, + ) + mask_selection: MASKING_DISTRIBUTION_CHOICES = field( + default="static", metadata={"help": "how to choose masks"} + ) + mask_other: float = field( + default=0, + metadata={ + "help": "secondary mask argument (used for more complex distributions), " + "see help in compute_mask_indices" + }, + ) + no_mask_overlap: bool = field( + default=False, metadata={"help": "whether to allow masks to overlap"} + ) + mask_min_space: Optional[int] = field( + default=1, + metadata={"help": "min space between spans (if no overlap is enabled)"}, + ) + require_same_masks: bool = field( + default=True, + metadata={ + "help": "whether to number of masked timesteps must be the same across all " + "examples in a batch" + }, + ) + mask_dropout: float = field( + default=0.0, + metadata={"help": "percent of masks to unmask for each sample"}, + ) + + # channel masking + mask_channel_length: int = field( + default=10, metadata={"help": "length of the mask for features (channels)"} + ) + mask_channel_prob: float = field( + default=0.0, metadata={"help": "probability of replacing a feature with 0"} + ) + mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field( + default="static", + metadata={"help": "how to choose mask length for channel masking"}, + ) + mask_channel_other: float = field( + default=0, + metadata={ + "help": "secondary mask argument (used for more complex distributions), " + "see help in compute_mask_indicesh" + }, + ) + no_mask_channel_overlap: bool = field( + default=False, metadata={"help": "whether to allow channel masks to overlap"} + ) + freeze_finetune_updates: int = field( + default=0, metadata={"help": "dont finetune wav2vec for this many updates"} + ) + feature_grad_mult: float = field( + default=0.0, metadata={"help": "reset feature grad mult in wav2vec 2.0 to this"} + ) + layerdrop: float = field( + default=0.0, metadata={"help": "probability of dropping a layer in wav2vec 2.0"} + ) + drop_path: float = 0 + mask_channel_min_space: Optional[int] = field( + default=1, + metadata={"help": "min space between spans (if no overlap is enabled)"}, + ) + mask_channel_before: bool = False + normalize: bool = II("task.normalize") + update_alibi: bool = True + data: str = II("task.data") + # this holds the loaded wav2vec args + w2v_args: Any = None + offload_activations: bool = field( + default=False, metadata={"help": "offload_activations"} + ) + min_params_to_wrap: int = field( + default=int(1e8), + metadata={ + "help": "minimum number of params for a layer to be wrapped with FSDP() when " + "training with --ddp-backend=fully_sharded. Smaller values will " + "improve memory efficiency, but may make torch.distributed " + "communication less efficient due to smaller input sizes. This option " + "is set to 0 (i.e., always wrap) when --checkpoint-activations or " + "--offload-activations are passed." + }, + ) + + checkpoint_activations: bool = field( + default=False, + metadata={"help": "recompute activations and save memory for extra compute"}, + ) + ddp_backend: str = II("distributed_training.ddp_backend") + + zero_mask: bool = False + load_ema: bool = False + + layer_decay: float = 1 + + + layer_type: LAYER_TYPE_CHOICES = field( + default="transformer", metadata={"help": "layer type in encoder"} + ) + # Adapter num + adp_num: int = field( + default=-1 + ) + adp_dim: int = field( + default=64 + ) + adp_act_fn: str = field( + default="relu" + ) + adp_trf_idx: str = field( + default="all", + ) + + freeze_regex: Optional[str] = field( + default=None, + ) + +@dataclass +class Wav2Vec2CtcConfig(Wav2Vec2AsrConfig): + blank_weight: float = 0 + blank_mode: str = "add" + + +@register_model("wav2vec_ctc", dataclass=Wav2Vec2CtcConfig) +class Wav2VecCtc(BaseFairseqModel): + def __init__(self, cfg: Wav2Vec2CtcConfig, w2v_encoder: BaseFairseqModel): + super().__init__() + self.cfg = cfg + self.w2v_encoder = w2v_encoder + self.blank_weight = cfg.blank_weight + self.blank_mode = cfg.blank_mode + + def upgrade_state_dict_named(self, state_dict, name): + super().upgrade_state_dict_named(state_dict, name) + return state_dict + + @classmethod + def build_model(cls, cfg: Wav2Vec2CtcConfig, task: FairseqTask): + """Build a new model instance.""" + w2v_encoder = Wav2VecEncoder(cfg, len(task.target_dictionary)) + return cls(cfg, w2v_encoder) + + def get_logits(self, net_output, normalize=False): + logits = net_output["encoder_out"] + if self.blank_weight != 0: + if self.blank_mode == "add": + logits[..., 0] += self.blank_weight + elif self.blank_mode == "set": + logits[..., 0] = self.blank_weight + else: + raise Exception(f"invalid blank mode {self.blank_mode}") + + if net_output["padding_mask"] is not None and net_output["padding_mask"].any(): + number_of_classes = logits.size(-1) + masking_tensor = torch.ones( + number_of_classes, device=logits.device + ) * float("-inf") + masking_tensor[0] = 0 + + if logits.size(0) > net_output["padding_mask"].size(1): + net_output["padding_mask"] = F.pad( + net_output["padding_mask"], (1, 0), value=False + ) + + logits[net_output["padding_mask"].T] = masking_tensor.type_as(logits) + + if normalize: + logits = utils.log_softmax(logits.float(), dim=-1) + + return logits + + def get_normalized_probs(self, net_output, log_probs): + """Get normalized probabilities (or log probs) from a net's output.""" + + logits = self.get_logits(net_output) + + if log_probs: + return utils.log_softmax(logits.float(), dim=-1) + else: + return utils.softmax(logits.float(), dim=-1) + + def forward(self, **kwargs): + x = self.w2v_encoder(**kwargs) + return x + + +@dataclass +class Wav2Vec2Seq2SeqConfig(Wav2Vec2AsrConfig): + decoder_embed_dim: int = field( + default=768, metadata={"help": "decoder embedding dimension"} + ) + decoder_ffn_embed_dim: int = field( + default=3072, metadata={"help": "decoder embedding dimension for FFN"} + ) + decoder_layers: int = field(default=6, metadata={"help": "num of decoder layers"}) + decoder_layerdrop: float = field( + default=0.0, metadata={"help": "decoder layerdrop chance"} + ) + decoder_attention_heads: int = field( + default=4, metadata={"help": "num decoder attention heads"} + ) + decoder_learned_pos: bool = field( + default=False, + metadata={"help": "use learned positional embeddings in the decoder"}, + ) + decoder_normalize_before: bool = field( + default=False, metadata={"help": "apply layernorm before each decoder block"} + ) + no_token_positional_embeddings: bool = field( + default=False, + metadata={ + "help": "if set, disables positional embeddings (outside self attention)" + }, + ) + decoder_dropout: float = field( + default=0.0, metadata={"help": "dropout probability in the decoder"} + ) + decoder_attention_dropout: float = field( + default=0.0, + metadata={ + "help": "dropout probability for attention weights inside the decoder" + }, + ) + decoder_activation_dropout: float = field( + default=0.0, + metadata={ + "help": "dropout probability after activation in FFN inside the decoder" + }, + ) + max_target_positions: int = field( + default=2048, metadata={"help": "max target positions"} + ) + share_decoder_input_output_embed: bool = field( + default=False, metadata={"help": "share decoder input and output embeddings"} + ) + autoregressive: bool = II("task.autoregressive") + + +@register_model("wav2vec_seq2seq", dataclass=Wav2Vec2Seq2SeqConfig) +class Wav2Vec2Seq2SeqModel(FairseqEncoderDecoderModel): + def __init__(self, encoder, decoder): + super().__init__(encoder, decoder) + + @classmethod + def build_model(cls, cfg: Wav2Vec2Seq2SeqConfig, task: FairseqTask): + """Build a new model instance.""" + + assert ( + cfg.autoregressive + ), "Please set task.autoregressive=true for seq2seq asr models" + + src_dict, tgt_dict = task.source_dictionary, task.target_dictionary + + def build_embedding(dictionary, embed_dim): + num_embeddings = len(dictionary) + padding_idx = dictionary.pad() + emb = Embedding(num_embeddings, embed_dim, padding_idx) + return emb + + decoder_embed_tokens = build_embedding(tgt_dict, cfg.decoder_embed_dim) + + encoder = cls.build_encoder(cfg) + decoder = cls.build_decoder(cfg, tgt_dict, decoder_embed_tokens) + + return Wav2Vec2Seq2SeqModel(encoder, decoder) + + @classmethod + def build_encoder(cls, cfg: Wav2Vec2AsrConfig): + return Wav2VecEncoder(cfg) + + @classmethod + def build_decoder(cls, cfg: Wav2Vec2Seq2SeqConfig, tgt_dict, embed_tokens): + return TransformerDecoder(cfg, tgt_dict, embed_tokens) + + def forward(self, **kwargs): + encoder_out = self.encoder(**kwargs) + decoder_out = self.decoder(encoder_out=encoder_out, **kwargs) + return decoder_out + + def upgrade_state_dict_named(self, state_dict, name): + super().upgrade_state_dict_named(state_dict, name) + return state_dict + + +class Wav2VecEncoder(FairseqEncoder): + def __init__(self, cfg: Wav2Vec2AsrConfig, output_size=None): + self.apply_mask = cfg.apply_mask + + arg_overrides = { + "dropout": cfg.dropout, + "activation_dropout": cfg.activation_dropout, + "dropout_input": cfg.dropout_input, + "attention_dropout": cfg.attention_dropout, + "mask_length": cfg.mask_length, + "mask_prob": cfg.mask_prob, + "require_same_masks": getattr(cfg, "require_same_masks", True), + "pct_holes": getattr(cfg, "mask_dropout", 0), + "mask_selection": cfg.mask_selection, + "mask_other": cfg.mask_other, + "no_mask_overlap": cfg.no_mask_overlap, + "mask_channel_length": cfg.mask_channel_length, + "mask_channel_prob": cfg.mask_channel_prob, + "mask_channel_before": cfg.mask_channel_before, + "mask_channel_selection": cfg.mask_channel_selection, + "mask_channel_other": cfg.mask_channel_other, + "no_mask_channel_overlap": cfg.no_mask_channel_overlap, + "encoder_layerdrop": cfg.layerdrop, + "feature_grad_mult": cfg.feature_grad_mult, + "checkpoint_activations": cfg.checkpoint_activations, + "offload_activations": cfg.offload_activations, + "min_params_to_wrap": cfg.min_params_to_wrap, + # d2v multi args + "encoder_dropout": cfg.dropout, + "drop_path": getattr(cfg, "drop_path", 0), + "mask_dropout": getattr(cfg, "mask_dropout", 0), + "zero_mask": getattr(cfg, "zero_mask", False), + "local_grad_mult": cfg.feature_grad_mult, + "layerdrop": cfg.layerdrop, + "prenet_layerdrop": cfg.layerdrop, + "prenet_dropout": cfg.dropout, + "post_mlp_drop": cfg.dropout, + "encoder_zero_mask": getattr(cfg, "zero_mask", False), + "inverse_mask": False, + "learned_alibi_scale": getattr(cfg, "update_alibi", True), + } + + if cfg.w2v_args is None: + state = checkpoint_utils.load_checkpoint_to_cpu(cfg.w2v_path, arg_overrides) + w2v_args = state.get("cfg", None) + if w2v_args is None: + w2v_args = convert_namespace_to_omegaconf(state["args"]) + w2v_args.criterion = None + w2v_args.lr_scheduler = None + + cfg.w2v_args = w2v_args + + logger.info(w2v_args) + + else: + state = None + w2v_args = cfg.w2v_args + if isinstance(w2v_args, Namespace): + cfg.w2v_args = w2v_args = convert_namespace_to_omegaconf(w2v_args) + + self.is_d2v_multi = "data2vec_multi" in w2v_args.model.get("_name", None) + + if not self.is_d2v_multi: + model_normalized = w2v_args.task.get( + "normalize", w2v_args.model.get("normalize", False) + ) + assert cfg.normalize == model_normalized, ( + "Fine-tuning works best when data normalization is the same. " + "Please check that --normalize is set or unset for both pre-training and here" + ) + + with open_dict(w2v_args): + args_replacement = ["checkpoint_activations", "layer_type", + "adp_num", "adp_dim", + "adp_act_fn", "adp_trf_idx"] + for _args in args_replacement: + if hasattr(cfg, _args) and getattr(cfg, _args, None) is not None: + w2v_args.model[_args] = getattr(cfg, _args, None) + + if hasattr(cfg, "checkpoint_activations") and cfg.checkpoint_activations: + with open_dict(w2v_args): + w2v_args.model.checkpoint_activations = cfg.checkpoint_activations + + w2v_args.task.data = cfg.data + task = tasks.setup_task(w2v_args.task, from_checkpoint=True) + model = task.build_model(w2v_args.model, from_checkpoint=True) + model.remove_pretraining_modules() + d = w2v_args.model.encoder_embed_dim + else: + assert cfg.normalize + + if hasattr(w2v_args.task, "audio"): + w2v_args.task.audio.data = cfg.data + else: + w2v_args.task.data = cfg.data + task = tasks.setup_task(w2v_args.task, from_checkpoint=True) + + model = task.build_model(w2v_args.model, from_checkpoint=True) + + model.remove_pretraining_modules(modality="audio") + d = w2v_args.model.embed_dim + + if state is not None and not cfg.no_pretrained_weights: + if cfg.load_ema: + assert "_ema" in state["model"] + for k in state["model"]["_ema"]: + mk = "encoder." + k + assert mk in state["model"], mk + state["model"][mk] = state["model"]["_ema"][k] + self.load_model_weights(state, model, cfg) + + super().__init__(task.source_dictionary) + + self.w2v_model = model + + self.final_dropout = nn.Dropout(cfg.final_dropout) + self.freeze_finetune_updates = cfg.freeze_finetune_updates + self.num_updates = 0 + + targ_d = None + self.proj = None + + if output_size is not None: + targ_d = output_size + elif getattr(cfg, "decoder_embed_dim", d) != d: + targ_d = cfg.decoder_embed_dim + + if targ_d is not None: + self.proj = Linear(d, targ_d) + + if cfg.freeze_regex is not None: + self.freeze_regex(cfg.freeze_regex) + + layer_decay = getattr(cfg, "layer_decay", 1) + if layer_decay < 1: + mod_encs = list(model.modality_encoders.values()) + assert len(mod_encs) == 1, len(mod_encs) + blocks = list(mod_encs[0].context_encoder.blocks) + list(model.blocks) + num_layers = len(blocks) + 1 + layer_scales = list( + layer_decay ** (num_layers - i) for i in range(num_layers + 1) + ) + + for i, b in enumerate(blocks): + lid = i + 1 + if layer_scales[lid] == 1.0: + continue + + for n, p in b.named_parameters(): + optim_override = getattr(p, "optim_overrides", {}) + if "optimizer" not in optim_override: + optim_override["optimizer"] = {} + + optim_override["optimizer"]["lr_scale"] = layer_scales[lid] + p.optim_overrides = optim_override + + def freeze_regex(self, pattern): + unfrozen_names = [] + for name, param in self.named_parameters(): + if re.fullmatch(pattern, name) is not None: + param.requires_grad_(False) + else: + unfrozen_names.append(name) + + def load_model_weights(self, state, model, cfg): + if cfg.ddp_backend == "fully_sharded": + from fairseq.distributed import FullyShardedDataParallel + + for name, module in model.named_modules(): + if "encoder.layers" in name and len(name.split(".")) == 3: + # Only for layers, we do a special handling and load the weights one by one + # We dont load all weights together as that wont be memory efficient and may + # cause oom + new_dict = { + k.replace(name + ".", ""): v + for (k, v) in state["model"].items() + if name + "." in k + } + assert isinstance(module, FullyShardedDataParallel) + with module.summon_full_params(): + module.load_state_dict(new_dict, strict=True) + module._reset_lazy_init() + + # Once layers are loaded, filter them out and load everything else. + r = re.compile("encoder.layers.\d.") + filtered_list = list(filter(r.match, state["model"].keys())) + + new_big_dict = { + k: v for (k, v) in state["model"].items() if k not in filtered_list + } + + model.load_state_dict(new_big_dict, strict=False) + else: + to_delete = {"_ema", "target_proj", "decoder"} + for k in to_delete: + if k in state["model"]: + del state["model"][k] + + if hasattr(model, "modality_encoders"): + if "modality_encoders.AUDIO.encoder_mask" not in state["model"]: + model.modality_encoders["AUDIO"].encoder_mask = None + elif not cfg.zero_mask: + model.modality_encoders["AUDIO"].encoder_mask = None + del state["model"]["modality_encoders.AUDIO.encoder_mask"] + + for k in list(state["model"].keys()): + if k.startswith("modality_encoders.") and not k.startswith( + "modality_encoders.AUDIO" + ): + del state["model"][k] + + print(model) + model.load_state_dict(state["model"], strict=True) + + def set_num_updates(self, num_updates): + """Set the number of parameters updates.""" + super().set_num_updates(num_updates) + self.num_updates = num_updates + + def forward(self, source, padding_mask, **kwargs): + + w2v_args = { + "source": source, + "padding_mask": padding_mask, + "mask": self.apply_mask and self.training, + } + if "corpus_key" in kwargs: + w2v_args["corpus_key"] = kwargs["corpus_key"] + + if self.is_d2v_multi: + w2v_args["mode"] = "AUDIO" + + ft = self.freeze_finetune_updates <= self.num_updates + + with torch.no_grad() if not ft else contextlib.ExitStack(): + res = self.w2v_model.extract_features(**w2v_args) + + x = res["x"] + padding_mask = res["padding_mask"] + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + x = self.final_dropout(x) + + if self.proj: + x = self.proj(x) + + return { + "encoder_out": x, # T x B x C + "padding_mask": padding_mask, # B x T, + "layer_results": res["layer_results"], + } + + def forward_torchscript(self, net_input): + if torch.jit.is_scripting(): + return self.forward(net_input["source"], net_input["padding_mask"]) + else: + return self.forward_non_torchscript(net_input) + + def reorder_encoder_out(self, encoder_out, new_order): + if encoder_out["encoder_out"] is not None: + encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select( + 1, new_order + ) + if encoder_out["padding_mask"] is not None: + encoder_out["padding_mask"] = encoder_out["padding_mask"].index_select( + 0, new_order + ) + return encoder_out + + def max_positions(self): + """Maximum input length supported by the encoder.""" + return None + + def upgrade_state_dict_named(self, state_dict, name): + return state_dict + + +class TransformerDecoder(FairseqIncrementalDecoder): + """ + Transformer decoder consisting of *args.decoder_layers* layers. Each layer + is a :class:`TransformerDecoderLayer`. + + Args: + args (argparse.Namespace): parsed command-line arguments + dictionary (~fairseq.data.Dictionary): decoding dictionary + embed_tokens (torch.nn.Embedding): output embedding + no_encoder_attn (bool, optional): whether to attend to encoder outputs + (default: False). + """ + + def __init__( + self, + cfg: Wav2Vec2Seq2SeqConfig, + dictionary, + embed_tokens, + no_encoder_attn=False, + ): + super().__init__(dictionary) + + self.dropout = cfg.decoder_dropout + self.share_input_output_embed = cfg.share_decoder_input_output_embed + + input_embed_dim = embed_tokens.embedding_dim + embed_dim = cfg.decoder_embed_dim + self.output_embed_dim = cfg.decoder_embed_dim + + self.layerdrop = cfg.decoder_layerdrop + + self.padding_idx = embed_tokens.padding_idx + self.max_target_positions = cfg.max_target_positions + + self.embed_tokens = embed_tokens + self.embed_scale = math.sqrt(embed_dim) # todo: try with input_embed_dim + + self.project_in_dim = ( + Linear(input_embed_dim, embed_dim, bias=False) + if embed_dim != input_embed_dim + else None + ) + + self.embed_positions = ( + PositionalEmbedding( + cfg.max_target_positions, + embed_dim, + self.padding_idx, + learned=cfg.decoder_learned_pos, + ) + if not cfg.no_token_positional_embeddings + else None + ) + + # TODO: update this when transformer gets converted to dataclass configs + transformer_cfg = copy.deepcopy(cfg) + with open_dict(transformer_cfg): + transformer_cfg.dropout = transformer_cfg.decoder_dropout + transformer_cfg.attention_dropout = ( + transformer_cfg.decoder_attention_dropout + ) + transformer_cfg.activation_dropout = ( + transformer_cfg.decoder_activation_dropout + ) + + self.layers = nn.ModuleList([]) + self.layers.extend( + [ + TransformerDecoderLayer(transformer_cfg, no_encoder_attn) + for _ in range(transformer_cfg.decoder_layers) + ] + ) + + if not self.share_input_output_embed: + self.embed_out = nn.Parameter( + torch.Tensor(len(dictionary), self.output_embed_dim) + ) + nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim**-0.5) + + if transformer_cfg.decoder_normalize_before: + self.layer_norm = LayerNorm(embed_dim) + else: + self.layer_norm = None + + def forward( + self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused + ): + """ + Args: + prev_output_tokens (LongTensor): previous decoder outputs of shape + `(batch, tgt_len)`, for teacher forcing + encoder_out (Tensor, optional): output from the encoder, used for + encoder-side attention + incremental_state (dict): dictionary used for storing state during + :ref:`Incremental decoding` + + Returns: + tuple: + - the decoder's output of shape `(batch, tgt_len, vocab)` + - a dictionary with any model-specific outputs + """ + + if type(prev_output_tokens) == list: + max_len = max((len(x) for x in prev_output_tokens)) + tmp = torch.zeros( + [len(prev_output_tokens), max_len], device=prev_output_tokens[0].device + ) + for (i, p) in enumerate(prev_output_tokens): + tmp[i, : len(p)] = p + prev_output_tokens = tmp + + prev_output_tokens = prev_output_tokens.long() + x, extra = self.extract_features( + prev_output_tokens, encoder_out, incremental_state + ) + x = self.output_layer(x) + return x, extra + + def extract_features( + self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused + ): + """ + Similar to *forward* but only return features. + + Returns: + tuple: + - the decoder's features of shape `(batch, tgt_len, embed_dim)` + - a dictionary with any model-specific outputs + """ + + # embed positions + positions = ( + self.embed_positions( + prev_output_tokens, incremental_state=incremental_state + ) + if self.embed_positions is not None + else None + ) + + if incremental_state is not None: + prev_output_tokens = prev_output_tokens[:, -1:] + if positions is not None: + positions = positions[:, -1:] + + # embed tokens and positions + x = self.embed_scale * self.embed_tokens(prev_output_tokens) + + if self.project_in_dim is not None: + x = self.project_in_dim(x) + + if positions is not None: + x += positions + x = F.dropout(x, p=self.dropout, training=self.training) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + attn = None + + inner_states = [x] + + # decoder layers + self_attn_padding_mask = None + if prev_output_tokens.eq(self.padding_idx).any(): + self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) + for layer in self.layers: + dropout_probability = np.random.random() + if not self.training or (dropout_probability > self.layerdrop): + x, attn, _ = layer( + x, + encoder_out["encoder_out"] if encoder_out is not None else None, + encoder_out["padding_mask"] if encoder_out is not None else None, + incremental_state, + self_attn_mask=self.buffered_future_mask(x) + if incremental_state is None + else None, + self_attn_padding_mask=self_attn_padding_mask, + ) + inner_states.append(x) + + if self.layer_norm: + x = self.layer_norm(x) + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + return x, {"attn": attn, "inner_states": inner_states} + + def output_layer(self, features, **kwargs): + """Project features to the vocabulary size.""" + # project back to size of vocabulary + if self.share_input_output_embed: + return F.linear(features, self.embed_tokens.weight) + else: + return F.linear(features, self.embed_out) + + def max_positions(self): + """Maximum output length supported by the decoder.""" + if self.embed_positions is None: + return self.max_target_positions + return min(self.max_target_positions, self.embed_positions.max_positions) + + def buffered_future_mask(self, tensor): + dim = tensor.size(0) + if ( + not hasattr(self, "_future_mask") + or self._future_mask is None + or self._future_mask.device != tensor.device + or self._future_mask.size(0) < dim + ): + self._future_mask = torch.triu( + utils.fill_with_neg_inf(tensor.new(dim, dim)), 1 + ) + return self._future_mask[:dim, :dim] + + def upgrade_state_dict_named(self, state_dict, name): + return state_dict + + +def Embedding(num_embeddings, embedding_dim, padding_idx): + m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) + nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5) + nn.init.constant_(m.weight[padding_idx], 0) + return m + + +def Linear(in_features, out_features, bias=True): + m = nn.Linear(in_features, out_features, bias) + nn.init.xavier_uniform_(m.weight) + if bias: + nn.init.constant_(m.bias, 0.0) + return m diff --git a/fairseq/fairseq/models/wav2vec/wav2vec2_classification.py b/fairseq/fairseq/models/wav2vec/wav2vec2_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..c9bbaab28e16e2a8eab153b16d7f003979934d03 --- /dev/null +++ b/fairseq/fairseq/models/wav2vec/wav2vec2_classification.py @@ -0,0 +1,348 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import contextlib +import logging +from argparse import Namespace +from dataclasses import dataclass, field +from typing import Any, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from omegaconf import II, MISSING, open_dict + +from fairseq import checkpoint_utils, tasks, utils +from fairseq.dataclass import ChoiceEnum, FairseqDataclass +from fairseq.dataclass.utils import convert_namespace_to_omegaconf +from fairseq.models import BaseFairseqModel, FairseqEncoder, register_model +from fairseq.models.wav2vec.wav2vec2 import MASKING_DISTRIBUTION_CHOICES, Wav2Vec2Config +from fairseq.models.wav2vec.wav2vec2_asr import Embedding, Linear, Wav2VecEncoder, Wav2Vec2AsrConfig +from fairseq.tasks import FairseqTask + +logging.basicConfig(level=logging.DEBUG) + + +@dataclass +class Wav2Vec2ClassificationConfig(Wav2Vec2AsrConfig): + latent_embed_dim: Optional[int] = field( + default=None, metadata={"help": "latent dim (encoder w2v -> latent -> class"} + ) + pooling: str = field( + default="first_token", + metadata={"help": "pooling layer choices"}, + ) + activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field( + default="gelu", metadata={"help": "activation function to use"} + ) + + +@register_model("wav2vec_classification", dataclass=Wav2Vec2ClassificationConfig) +class Wav2VecClassification(BaseFairseqModel): + # TODO: Can be shared/merged with ASR model class as w2v_encoder params are common. + def __init__( + self, + cfg: Wav2Vec2ClassificationConfig, + w2v_encoder: BaseFairseqModel, + pooling_layer, + ): + super().__init__() + self.cfg = cfg + self.w2v_encoder = w2v_encoder + self.pooling_layer = pooling_layer + + def upgrade_state_dict_named(self, state_dict, name): + super().upgrade_state_dict_named(state_dict, name) + return state_dict + + @classmethod + def build_model(cls, cfg: Wav2Vec2ClassificationConfig, task: FairseqTask): + """Build a new model instance.""" + w2v_encoder = Wav2VecEncoder(cfg, None) + pooling_layer = get_pooling_layer( + cfg, + w2v_encoder.w2v_model.encoder.layers[-1].embedding_dim, + len(task.target_dictionary), + len(w2v_encoder.w2v_model.encoder.layers), + ) + return cls(cfg, w2v_encoder, pooling_layer) + + def get_normalized_probs(self, net_output, log_probs): + """Get normalized probabilities (or log probs) from a net's output.""" + logits = net_output + + if log_probs: + return utils.log_softmax(logits.float(), dim=-1) + else: + return utils.softmax(logits.float(), dim=-1) + + def get_logits(self, net_output): + return net_output + + def forward(self, **kwargs): + encoder_out_dict = self.w2v_encoder(**kwargs) + w2v_encoder_out = encoder_out_dict["encoder_out"] # TxBxC + w2v_encoder_padding_mask = encoder_out_dict["padding_mask"] # BxT + # w2v_encoder_layer_results = encoder_out_dict["layer_results"] + return self.pooling_layer( + last_layer_feats=w2v_encoder_out, + padding_mask=w2v_encoder_padding_mask, + # all_layer_feats=w2v_encoder_layer_results, + ) + + # def forward_latent(self, **kwargs): + # encoder_out_dict = self.w2v_encoder(**kwargs) + # w2v_encoder_out = encoder_out_dict["encoder_out"] + # w2v_encoder_padding_mask = encoder_out_dict["encoder_padding_mask"] + # w2v_encoder_layer_results = encoder_out_dict["layer_results"] + # return self.pooling_layer.forward_latent( + # last_layer_feats=w2v_encoder_out, + # padding_mask=w2v_encoder_padding_mask, + # all_layer_feats=w2v_encoder_layer_results, + # ) + + +def get_pooling_layer( + cfg: Wav2Vec2ClassificationConfig, + encoder_embed_dim: int, + num_targets: int, + encoder_layers: int, +): + assert cfg.pooling == 'mean' + if cfg.pooling == "first_token": + return FirstToken(cfg, encoder_embed_dim, num_targets) + # elif cfg.pooling == "mean": + # return MeanPooling(cfg, encoder_embed_dim, num_targets) + elif cfg.pooling == "mean": + return MeanPoolingFast(cfg, encoder_embed_dim, num_targets) + elif cfg.pooling == "mean_amsoftmax": + return MeanPoolingFastAMSoftmax(cfg, encoder_embed_dim, num_targets) + elif cfg.pooling == "max": + return MaxPoolingFast(cfg, encoder_embed_dim, num_targets) + elif cfg.pooling == "elmo": + return LayerWeightedMeanPooling( + cfg, encoder_embed_dim, num_targets, encoder_layers + ) + else: + raise NotImplementedError(f"{cfg.pooling} has not been implemented yet.") + + +class Pooling(nn.Module): + def __init__( + self, + cfg: Wav2Vec2ClassificationConfig, + encoder_embed_dim: int, + num_targets: int, + ): + super().__init__() + self.projection = Linear(encoder_embed_dim, num_targets) + + def forward(self, last_layer_feats, **kwargs): + raise NotImplementedError() + + +class FirstToken(Pooling): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def forward(self, last_layer_feats, **kwargs): + return self.projection(last_layer_feats[:, 0]) + + +# class MeanPooling(Pooling): +# def __init__( +# self, +# cfg: Wav2VecClassificationConfig, +# encoder_embed_dim: int, +# num_targets: int, +# **kwargs, +# ): +# super().__init__(cfg, encoder_embed_dim, num_targets) +# self.activation_fn = utils.get_activation_fn(cfg.activation_fn) +# self.linear = Linear(encoder_embed_dim, encoder_embed_dim) + +# def forward(self, last_layer_feats, padding_mask, **kwargs): +# # last_layer_feats: [BxTxD] +# # padding_mask: [BxT] +# last_layer_feats = self.linear(self.activation_fn(last_layer_feats)) +# input_lengths = (1 - padding_mask.long()).sum(-1) +# pooled_feature_list = [] +# for i in range(len(last_layer_feats)): +# length = input_lengths[i] +# pooled_feature = torch.mean(last_layer_feats[i][:length], dim=0) +# pooled_feature_list.append(pooled_feature) +# return self.projection(torch.stack(pooled_feature_list)) + + +def fn_mean(x, mask): + """ + Args: + x: TxBxD + mask: BxT + Return: + y: BxD + """ + if mask is not None: + mask = mask.t()[:, :, None] + return (x * mask).sum(0) / mask.sum(0) + else: + return x.sum(0) / x.shape[0] + + +class MeanPoolingFast(nn.Module): + def __init__( + self, + cfg: Wav2Vec2ClassificationConfig, + encoder_embed_dim: int, + num_targets: int, + **kwargs, + ): + super().__init__() + self.activation_fn = utils.get_activation_fn(cfg.activation_fn) + self.latent_embed_dim = ( + cfg.latent_embed_dim + if cfg.latent_embed_dim is not None + else encoder_embed_dim + ) + logging.debug(f"| {self.latent_embed_dim=}") + self.linear = Linear(encoder_embed_dim, self.latent_embed_dim) + self.projection = Linear(self.latent_embed_dim, num_targets) + + def forward(self, last_layer_feats, padding_mask, **kwargs): + """ + Arguments + features - [TxBxD] Acoustic feature with shape + padding_mask - [BxT] Padding Mask + """ + if padding_mask is not None: + feat_mask = (~padding_mask).to(last_layer_feats.dtype) + else: + feat_mask = None + feat = self.linear(last_layer_feats) + feat = fn_mean(feat, feat_mask) + feat = self.activation_fn(feat) + return self.projection(feat) + + def forward_latent(self, last_layer_feats, padding_mask, **kwargs): + """ + Arguments + features - [TxBxD] Acoustic feature with shape + padding_mask - [BxT] Padding Mask + """ + if padding_mask is not None: + feat_mask = (~padding_mask).to(last_layer_feats.dtype) + else: + feat_mask = None + feat = self.linear(last_layer_feats) + feat = fn_mean(feat, feat_mask) + return feat + + +class MeanPoolingFastAMSoftmax(MeanPoolingFast): + def __init__( + self, + cfg: Wav2Vec2ClassificationConfig, + encoder_embed_dim: int, + num_targets: int, + **kwargs, + ): + super().__init__(cfg, encoder_embed_dim, num_targets, **kwargs) + self.projection = Linear(self.latent_embed_dim, num_targets, bias=False) + nn.init.xavier_normal_(self.projection.weight, gain=1) + + def forward(self, last_layer_feats, padding_mask, **kwargs): + + """ + Arguments + features - [BxTxD] Acoustic feature with shape + padding_mask - [BxT] Padding Mask + """ + feat_mask = (~padding_mask).to(last_layer_feats.dtype) # T,B -> B,T + feat = self.linear(last_layer_feats) # B,T,D + feat = fn_mean(feat, feat_mask) # B,D + feat = self.activation_fn(feat) + # normalize feat + feat_norm = F.normalize(feat, p=2, dim=-1) # B,D + weight_norm = F.normalize(self.projection.weight.t(), p=2, dim=-1) # D,K + cos_fw = feat_norm @ weight_norm + return cos_fw + + +def fn_max(x, mask): + """ + Args: + x: TxBxD + mask: BxT + Return: + y: BxD + """ + mask = mask.t()[:, :, None].to(torch.bool) + return x.masked_fill(~mask, -1e-8).max(0)[0] + + +class MaxPoolingFast(Pooling): + def __init__( + self, + cfg: Wav2Vec2ClassificationConfig, + encoder_embed_dim: int, + num_targets: int, + **kwargs, + ): + super().__init__(cfg, encoder_embed_dim, num_targets) + self.activation_fn = utils.get_activation_fn(cfg.activation_fn) + self.linear = Linear(encoder_embed_dim, encoder_embed_dim) + + def forward(self, last_layer_feats, padding_mask, **kwargs): + + """ + Arguments + features - [TxBxD] Acoustic feature with shape + padding_mask - [BxT] Padding Mask + """ + feat_mask = (~padding_mask).to(last_layer_feats.dtype) + feat = self.linear(last_layer_feats) + feat = fn_max(feat, feat_mask) + feat = self.activation_fn(feat) + return self.projection(feat) + + +class LayerWeightedMeanPooling(MeanPoolingFast): + """Elmo-style weighted average representation.""" + + def __init__( + self, + cfg: Wav2Vec2ClassificationConfig, + encoder_embed_dim: int, + num_targets: int, + encoder_layers: int, + ): + super().__init__(cfg, encoder_embed_dim, num_targets) + self.num_layers = encoder_layers + self.weights = nn.Parameter(torch.ones(encoder_layers)) + + def forward(self, last_layer_feats, padding_mask, all_layer_feats): + # last_layer_feats: [BxTxD] + # padding_mask: [BxT] + if not self.training: + msg = ( + f"Number of layers in input features = {len(all_layer_feats)}." + f" Expected {self.num_layers} layers." + ) + assert len(all_layer_feats) == self.num_layers, msg + + # Stack up all layers and reshape to (num_layers, features) + all_layer_feats_stacked = torch.stack(all_layer_feats, dim=0) + num_layers, *original_feat_shape = all_layer_feats_stacked.shape + all_layer_feats_stacked_flat = all_layer_feats_stacked.view(num_layers, -1) + + # Weighted average + normalized_weights = F.softmax(self.weights, dim=-1) + weighted_avg_features = ( + normalized_weights.unsqueeze(-1) * all_layer_feats_stacked_flat + ).sum(dim=0) + weighted_avg_features = weighted_avg_features.view(*original_feat_shape) + + # Mean Pooling on weighted average features. + return super().forward(weighted_avg_features, padding_mask) \ No newline at end of file diff --git a/fairseq/fairseq/models/wav2vec/wav2vec2_laser.py b/fairseq/fairseq/models/wav2vec/wav2vec2_laser.py new file mode 100644 index 0000000000000000000000000000000000000000..ff89759d38cc1837270a8571d279f11cf1edfa75 --- /dev/null +++ b/fairseq/fairseq/models/wav2vec/wav2vec2_laser.py @@ -0,0 +1,39 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from fairseq.models import BaseFairseqModel, register_model +from fairseq.models.wav2vec.wav2vec2_asr import ( + Wav2Vec2CtcConfig, + Wav2VecCtc, + Wav2VecEncoder, +) +from fairseq.tasks import FairseqTask + + +@register_model("wav2vec2_laser", dataclass=Wav2Vec2CtcConfig) +class Wav2VecLaser(Wav2VecCtc): + def __init__(self, cfg: Wav2Vec2CtcConfig, w2v_encoder: BaseFairseqModel): + super().__init__(cfg, w2v_encoder) + self.num_updates = 0 + self.freeze_finetune_updates = cfg.freeze_finetune_updates + + @classmethod + def build_model(cls, cfg: Wav2Vec2CtcConfig, task: FairseqTask): + """Build a new model instance.""" + w2v_encoder = Wav2VecEncoder(cfg, 1024) + return cls(cfg, w2v_encoder) + + def forward(self, **kwargs): + output = super().forward(**kwargs) + x_out = output["encoder_out"] * 0.01 + out_pad_mask = output["padding_mask"] + # Set padded outputs to -inf so they are not selected by max-pooling + if out_pad_mask is not None and out_pad_mask.any(): + x_out = ( + x_out.float() + .masked_fill_(out_pad_mask.T.unsqueeze(-1), float("-inf")) + .type_as(x_out) + ) + return x_out.max(dim=0)[0] diff --git a/fairseq/fairseq/models/xmod/__init__.py b/fairseq/fairseq/models/xmod/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bbf7694920eb00bed27e17dac272611be1ab44f9 --- /dev/null +++ b/fairseq/fairseq/models/xmod/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from .model import * # noqa +from .transformer_layer_xmod import * # noqa diff --git a/fairseq/fairseq/models/xmod/__pycache__/__init__.cpython-310.pyc b/fairseq/fairseq/models/xmod/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4be44e4e01a53f5a94c33a5ba6dfaeca3d70bd6f Binary files /dev/null and b/fairseq/fairseq/models/xmod/__pycache__/__init__.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/xmod/__pycache__/hub_interface.cpython-310.pyc b/fairseq/fairseq/models/xmod/__pycache__/hub_interface.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ac8bb2f30e313c78a62f02a324c325f5ed54004a Binary files /dev/null and b/fairseq/fairseq/models/xmod/__pycache__/hub_interface.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/xmod/__pycache__/model.cpython-310.pyc b/fairseq/fairseq/models/xmod/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..73980f1666a1ffd99e85027bea0a5d56829f664d Binary files /dev/null and b/fairseq/fairseq/models/xmod/__pycache__/model.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/xmod/__pycache__/transformer_layer_xmod.cpython-310.pyc b/fairseq/fairseq/models/xmod/__pycache__/transformer_layer_xmod.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..69c4f7e850b43064ac86910b1a8d230f39d66a48 Binary files /dev/null and b/fairseq/fairseq/models/xmod/__pycache__/transformer_layer_xmod.cpython-310.pyc differ diff --git a/fairseq/fairseq/models/xmod/hub_interface.py b/fairseq/fairseq/models/xmod/hub_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..909bb423cac449b7f48f11e47126730aeadee919 --- /dev/null +++ b/fairseq/fairseq/models/xmod/hub_interface.py @@ -0,0 +1,51 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +from fairseq.models.roberta.hub_interface import RobertaHubInterface +import torch +import torch.nn.functional as F + + +class XMODHubInterface(RobertaHubInterface): + def extract_features( + self, + tokens: torch.LongTensor, + return_all_hiddens: bool = False, + lang_id=None, + ) -> torch.Tensor: + if tokens.dim() == 1: + tokens = tokens.unsqueeze(0) + if tokens.size(-1) > self.model.max_positions(): + raise ValueError( + "tokens exceeds maximum length: {} > {}".format( + tokens.size(-1), self.model.max_positions() + ) + ) + features, extra = self.model( + tokens.to(device=self.device), + features_only=True, + return_all_hiddens=return_all_hiddens, + lang_id=lang_id, + ) + if return_all_hiddens: + # convert from T x B x C -> B x T x C + inner_states = extra["inner_states"] + return [inner_state.transpose(0, 1) for inner_state in inner_states] + else: + return features # just the last layer's features + + def predict( + self, + head: str, + tokens: torch.LongTensor, + return_logits: bool = False, + lang_id=None, + ): + features = self.extract_features(tokens.to(device=self.device), lang_id=lang_id) + logits = self.model.classification_heads[head](features) + if return_logits: + return logits + return F.log_softmax(logits, dim=-1) diff --git a/fairseq/fairseq/models/xmod/model.py b/fairseq/fairseq/models/xmod/model.py new file mode 100644 index 0000000000000000000000000000000000000000..fb6c7a8deac1656855e6cdca9d4cf156942e10b2 --- /dev/null +++ b/fairseq/fairseq/models/xmod/model.py @@ -0,0 +1,742 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from ..roberta.model_xlmr import XLMRModel +from fairseq.models.xmod.transformer_layer_xmod import XMODTransformerEncoderLayerBase +from ..roberta.model import base_architecture, RobertaEncoder +from fairseq.models.transformer import TransformerEncoder +from fairseq.modules.transformer_sentence_encoder import init_bert_params +from typing import Optional +from fairseq.models.xmod.hub_interface import XMODHubInterface +import torch +from fairseq.distributed import fsdp_wrap +from fairseq.models import ( + register_model, + register_model_architecture, +) + +from fairseq.modules.checkpoint_activations import checkpoint_wrapper + +DEFAULT_MIN_PARAMS_TO_WRAP = int(1e8) + + +@register_model("xmod") +class XMODModel(XLMRModel): + @classmethod + def hub_models(cls): + return { + "xmod.base": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.81.1M.tar.gz", + "xmod.large.prenorm": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.large.prenorm.81.500k.tar.gz", + "xmod.base.13.125k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.13.125k.tar.gz", + "xmod.base.30.125k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.30.125k.tar.gz", + "xmod.base.30.195k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.30.195k.tar.gz", + "xmod.base.60.125k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.60.125k.tar.gz", + "xmod.base.60.265k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.60.265k.tar.gz", + "xmod.base.75.125k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.75.125k.tar.gz", + "xmod.base.75.269k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.75.269k.tar.gz", + } + + @classmethod + def from_pretrained( + cls, + model_name_or_path, + checkpoint_file="model.pt", + data_name_or_path=".", + bpe="sentencepiece", + **kwargs, + ): + from fairseq import hub_utils + + x = hub_utils.from_pretrained( + model_name_or_path, + checkpoint_file, + data_name_or_path, + archive_map=cls.hub_models(), + bpe=bpe, + load_checkpoint_heads=True, + **kwargs, + ) + return XMODHubInterface(x["args"], x["task"], x["models"][0]) + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + + from omegaconf import OmegaConf + + if OmegaConf.is_config(args): + OmegaConf.set_struct(args, False) + + # make sure all arguments are present + base_architecture(args) + + if not hasattr(args, "max_positions"): + if not hasattr(args, "tokens_per_sample"): + args.tokens_per_sample = task.max_positions() + args.max_positions = args.tokens_per_sample + + encoder = XMODEncoder(args, task.source_dictionary) + + if OmegaConf.is_config(args): + OmegaConf.set_struct(args, True) + + return cls(args, encoder) + + def forward( + self, + src_tokens, + features_only=False, + return_all_hiddens=False, + classification_head_name=None, + lang_id=None, + **kwargs, + ): + if classification_head_name is not None: + features_only = True + x, extra = self.encoder( + src_tokens, features_only, return_all_hiddens, lang_id=lang_id, **kwargs + ) + + if classification_head_name is not None: + x = self.classification_heads[classification_head_name](x) + return x, extra + + +class XMODEncoder(RobertaEncoder): + """XMOD encoder.""" + + def build_encoder(self, args, dictionary, embed_tokens): + encoder = XMODTransformerEncoder(args, dictionary, embed_tokens) + encoder.apply(init_bert_params) + return encoder + + def forward( + self, + src_tokens, + features_only=False, + return_all_hiddens=False, + masked_tokens=None, + lang_id=None, + **unused, + ): + """ + Args: + src_tokens (LongTensor): input tokens of shape `(batch, src_len)` + features_only (bool, optional): skip LM head and just return + features. If True, the output will be of shape + `(batch, src_len, embed_dim)`. + return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). + + Returns: + tuple: + - the LM output of shape `(batch, src_len, vocab)` + - a dictionary of additional data, where 'inner_states' + is a list of hidden states. Note that the hidden + states have shape `(src_len, batch, vocab)`. + """ + x, extra = self.extract_features( + src_tokens, return_all_hiddens=return_all_hiddens, lang_id=lang_id + ) + if not features_only: + x = self.output_layer(x, masked_tokens=masked_tokens) + return x, extra + + def extract_features( + self, src_tokens, return_all_hiddens=False, lang_id=None, **kwargs + ): + encoder_out = self.sentence_encoder( + src_tokens, + return_all_hiddens=return_all_hiddens, + lang_id=lang_id, + token_embeddings=kwargs.get("token_embeddings", None), + ) + # T x B x C -> B x T x C + features = encoder_out["encoder_out"][0].transpose(0, 1) + inner_states = encoder_out["encoder_states"] if return_all_hiddens else None + return features, {"inner_states": inner_states} + + +class XMODTransformerEncoder(TransformerEncoder): + def build_encoder_layer(self, cfg): + layer = XMODTransformerEncoderLayerBase(cfg) + checkpoint = cfg.checkpoint_activations + if checkpoint: + offload_to_cpu = cfg.offload_activations + layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu) + # if we are checkpointing, enforce that FSDP always wraps the + # checkpointed layer, regardless of layer size + min_params_to_wrap = cfg.min_params_to_wrap if not checkpoint else 0 + layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap) + return layer + + def forward( + self, + src_tokens, + src_lengths: Optional[torch.Tensor] = None, + return_all_hiddens: bool = False, + token_embeddings: Optional[torch.Tensor] = None, + lang_id=None, + ): + """ + Args: + src_tokens (LongTensor): tokens in the source language of shape + `(batch, src_len)` + src_lengths (torch.LongTensor): lengths of each source sentence of + shape `(batch)` + return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). + token_embeddings (torch.Tensor, optional): precomputed embeddings + default `None` will recompute embeddings + + Returns: + dict: + - **encoder_out** (Tensor): the last encoder layer's output of + shape `(src_len, batch, embed_dim)` + - **encoder_padding_mask** (ByteTensor): the positions of + padding elements of shape `(batch, src_len)` + - **encoder_embedding** (Tensor): the (scaled) embedding lookup + of shape `(batch, src_len, embed_dim)` + - **encoder_states** (List[Tensor]): all intermediate + hidden states of shape `(src_len, batch, embed_dim)`. + Only populated if *return_all_hiddens* is True. + """ + return self.forward_scriptable( + src_tokens, + src_lengths, + return_all_hiddens, + token_embeddings, + lang_id=lang_id, + ) + # TorchScript doesn't support super() method so that the scriptable Subclass + # can't access the base class model in Torchscript. + # Current workaround is to add a helper function with different name and + # call the helper function from scriptable Subclass. + + def forward_scriptable( + self, + src_tokens, + src_lengths: Optional[torch.Tensor] = None, + return_all_hiddens: bool = False, + token_embeddings: Optional[torch.Tensor] = None, + lang_id=None, + ): + """ + Args: + src_tokens (LongTensor): tokens in the source language of shape + `(batch, src_len)` + src_lengths (torch.LongTensor): lengths of each source sentence of + shape `(batch)` + return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). + token_embeddings (torch.Tensor, optional): precomputed embeddings + default `None` will recompute embeddings + + Returns: + dict: + - **encoder_out** (Tensor): the last encoder layer's output of + shape `(src_len, batch, embed_dim)` + - **encoder_padding_mask** (ByteTensor): the positions of + padding elements of shape `(batch, src_len)` + - **encoder_embedding** (Tensor): the (scaled) embedding lookup + of shape `(batch, src_len, embed_dim)` + - **encoder_states** (List[Tensor]): all intermediate + hidden states of shape `(src_len, batch, embed_dim)`. + Only populated if *return_all_hiddens* is True. + """ + # compute padding mask + encoder_padding_mask = src_tokens.eq(self.padding_idx) + has_pads = src_tokens.device.type == "xla" or encoder_padding_mask.any() + + x, encoder_embedding = self.forward_embedding(src_tokens, token_embeddings) + + # account for padding while computing the representation + if has_pads: + x = x * (1 - encoder_padding_mask.unsqueeze(-1).type_as(x)) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + encoder_states = [] + + if return_all_hiddens: + encoder_states.append(x) + + # encoder layers + for layer in self.layers: + x = layer( + x, + encoder_padding_mask=encoder_padding_mask if has_pads else None, + lang_id=lang_id, + ) + if return_all_hiddens: + assert encoder_states is not None + encoder_states.append(x) + + if self.layer_norm is not None: + x = self.layer_norm(x) + + # The Pytorch Mobile lite interpreter does not supports returning NamedTuple in + # `forward` so we use a dictionary instead. + # TorchScript does not support mixed values so the values are all lists. + # The empty list is equivalent to None. + src_lengths = ( + src_tokens.ne(self.padding_idx) + .sum(dim=1, dtype=torch.int32) + .reshape(-1, 1) + .contiguous() + ) + return { + "encoder_out": [x], # T x B x C + "encoder_padding_mask": [encoder_padding_mask], # B x T + "encoder_embedding": [encoder_embedding], # B x T x C + "encoder_states": encoder_states, # List[T x B x C] + "src_tokens": [], + "src_lengths": [src_lengths], + } + + +@register_model_architecture("xmod", "xmod_base_13") +def roberta_base_architecture(args): + args.ffn_modules = getattr(args, "ffn_modules", False) + args.adapter_modules = getattr(args, "adapter_modules", True) + args.adapter_layer_norm = getattr(args, "adapter_layer_norm", False) + args.adapter_reuse_layer_norm = getattr(args, "adapter_reuse_layer_norm", True) + args.ln_before_adapter = getattr(args, "ln_before_adapter", True) + args.languages = getattr( + args, + "languages", + [ + "ar_AR", + "en_XX", + "fi_FI", + "fr_XX", + "hi_IN", + "id_ID", + "ka_GE", + "ko_KR", + "ru_RU", + "sw_KE", + "ta_IN", + "th_TH", + "vi_VN", + ], + ) + base_architecture(args) + + +@register_model_architecture("xmod", "xmod_base_30") +def roberta_base_architecture(args): + args.ffn_modules = getattr(args, "ffn_modules", False) + args.adapter_modules = getattr(args, "adapter_modules", True) + args.adapter_layer_norm = getattr(args, "adapter_layer_norm", False) + args.adapter_reuse_layer_norm = getattr(args, "adapter_reuse_layer_norm", True) + args.ln_before_adapter = getattr(args, "ln_before_adapter", True) + args.languages = getattr( + args, + "languages", + [ + "ar_AR", + "cs_CZ", + "en_XX", + "eu_ES", + "fi_FI", + "fr_XX", + "hi_IN", + "hr_HR", + "hu_HU", + "hy_AM", + "id_ID", + "it_IT", + "ka_GE", + "ko_KR", + "lt_LT", + "ml_IN", + "mn_MN", + "ms_MY", + "pl_PL", + "ro_RO", + "ru_RU", + "si_LK", + "sk_SK", + "sq_AL", + "sv_SE", + "sw_KE", + "ta_IN", + "th_TH", + "tl_XX", + "vi_VN", + ], + ) + base_architecture(args) + + +@register_model_architecture("xmod", "xmod_base_60") +def roberta_base_architecture(args): + args.ffn_modules = getattr(args, "ffn_modules", False) + args.adapter_modules = getattr(args, "adapter_modules", True) + args.adapter_layer_norm = getattr(args, "adapter_layer_norm", False) + args.adapter_reuse_layer_norm = getattr(args, "adapter_reuse_layer_norm", True) + args.ln_before_adapter = getattr(args, "ln_before_adapter", True) + args.languages = getattr( + args, + "languages", + [ + "af_ZA", + "am_ET", + "ar_AR", + "be_BY", + "bn_IN", + "ca_ES", + "cs_CZ", + "cy_GB", + "da_DK", + "en_XX", + "eo_EO", + "et_EE", + "eu_ES", + "fa_IR", + "fi_FI", + "fr_XX", + "ga_IE", + "gl_ES", + "gu_IN", + "ha_NG", + "hi_IN", + "hr_HR", + "hu_HU", + "hy_AM", + "id_ID", + "is_IS", + "it_IT", + "ka_GE", + "ko_KR", + "ku_TR", + "la_VA", + "lt_LT", + "lv_LV", + "mk_MK", + "ml_IN", + "mn_MN", + "ms_MY", + "ne_NP", + "nl_XX", + "no_XX", + "pl_PL", + "ps_AF", + "pt_XX", + "ro_RO", + "ru_RU", + "sa_IN", + "sd_PK", + "si_LK", + "sk_SK", + "sl_SI", + "so_SO", + "sq_AL", + "sr_RS", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "vi_VN", + ], + ) + base_architecture(args) + + +@register_model_architecture("xmod", "xmod_base_75") +def roberta_base_architecture(args): + args.ffn_modules = getattr(args, "ffn_modules", False) + args.adapter_modules = getattr(args, "adapter_modules", True) + args.adapter_layer_norm = getattr(args, "adapter_layer_norm", False) + args.adapter_reuse_layer_norm = getattr(args, "adapter_reuse_layer_norm", True) + args.ln_before_adapter = getattr(args, "ln_before_adapter", True) + args.languages = getattr( + args, + "languages", + [ + "af_ZA", + "am_ET", + "ar_AR", + "as_IN", + "be_BY", + "bn_IN", + "br_FR", + "bs_BA", + "ca_ES", + "cs_CZ", + "cy_GB", + "da_DK", + "en_XX", + "eo_EO", + "et_EE", + "eu_ES", + "fa_IR", + "fi_FI", + "fr_XX", + "fy_NL", + "ga_IE", + "gd_GB", + "gl_ES", + "gu_IN", + "ha_NG", + "hi_IN", + "hr_HR", + "hu_HU", + "hy_AM", + "id_ID", + "is_IS", + "it_IT", + "jv_ID", + "ka_GE", + "kn_IN", + "ko_KR", + "ku_TR", + "la_VA", + "lt_LT", + "lv_LV", + "mg_MG", + "mk_MK", + "ml_IN", + "mn_MN", + "mr_IN", + "ms_MY", + "ne_NP", + "nl_XX", + "no_XX", + "om_KE", + "or_IN", + "pa_IN", + "pl_PL", + "ps_AF", + "pt_XX", + "ro_RO", + "ru_RU", + "sa_IN", + "sd_PK", + "si_LK", + "sk_SK", + "sl_SI", + "so_SO", + "sq_AL", + "sr_RS", + "su_ID", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "vi_VN", + "xh_ZA", + "yi_DE", + ], + ) + base_architecture(args) + + +@register_model_architecture("xmod", "xmod_base") +def roberta_base_architecture(args): + args.ffn_modules = getattr(args, "ffn_modules", False) + args.adapter_modules = getattr(args, "adapter_modules", True) + args.adapter_layer_norm = getattr(args, "adapter_layer_norm", False) + args.adapter_reuse_layer_norm = getattr(args, "adapter_reuse_layer_norm", True) + args.ln_before_adapter = getattr(args, "ln_before_adapter", True) + args.languages = getattr( + args, + "languages", + [ + "en_XX", + "id_ID", + "vi_VN", + "ru_RU", + "fa_IR", + "sv_SE", + "ja_XX", + "fr_XX", + "de_DE", + "ro_RO", + "ko_KR", + "hu_HU", + "es_XX", + "fi_FI", + "uk_UA", + "da_DK", + "pt_XX", + "no_XX", + "th_TH", + "pl_PL", + "bg_BG", + "nl_XX", + "zh_CN", + "he_IL", + "el_GR", + "it_IT", + "sk_SK", + "hr_HR", + "tr_TR", + "ar_AR", + "cs_CZ", + "lt_LT", + "hi_IN", + "zh_TW", + "ca_ES", + "ms_MY", + "sl_SI", + "lv_LV", + "ta_IN", + "bn_IN", + "et_EE", + "az_AZ", + "sq_AL", + "sr_RS", + "kk_KZ", + "ka_GE", + "tl_XX", + "ur_PK", + "is_IS", + "hy_AM", + "ml_IN", + "mk_MK", + "be_BY", + "la_VA", + "te_IN", + "eu_ES", + "gl_ES", + "mn_MN", + "kn_IN", + "ne_NP", + "sw_KE", + "si_LK", + "mr_IN", + "af_ZA", + "gu_IN", + "cy_GB", + "eo_EO", + "km_KH", + "ky_KG", + "uz_UZ", + "ps_AF", + "pa_IN", + "ga_IE", + "ha_NG", + "am_ET", + "lo_LA", + "ku_TR", + "so_SO", + "my_MM", + "or_IN", + "sa_IN", + ], + ) + base_architecture(args) + + +@register_model_architecture("xmod", "xmod_large_prenorm") +def roberta_base_architecture(args): + args.ffn_modules = getattr(args, "ffn_modules", False) + args.adapter_modules = getattr(args, "adapter_modules", True) + args.adapter_layer_norm = getattr(args, "adapter_layer_norm", True) + args.adapter_reuse_layer_norm = getattr(args, "adapter_reuse_layer_norm", False) + args.ln_before_adapter = getattr(args, "ln_before_adapter", False) + # args.bottleneck = getattr(args, "bottleneck", 8) + args.bottleneck = getattr(args, "bottleneck", 4) + args.languages = getattr( + args, + "languages", + [ + "en_XX", + "id_ID", + "vi_VN", + "ru_RU", + "fa_IR", + "sv_SE", + "ja_XX", + "fr_XX", + "de_DE", + "ro_RO", + "ko_KR", + "hu_HU", + "es_XX", + "fi_FI", + "uk_UA", + "da_DK", + "pt_XX", + "no_XX", + "th_TH", + "pl_PL", + "bg_BG", + "nl_XX", + "zh_CN", + "he_IL", + "el_GR", + "it_IT", + "sk_SK", + "hr_HR", + "tr_TR", + "ar_AR", + "cs_CZ", + "lt_LT", + "hi_IN", + "zh_TW", + "ca_ES", + "ms_MY", + "sl_SI", + "lv_LV", + "ta_IN", + "bn_IN", + "et_EE", + "az_AZ", + "sq_AL", + "sr_RS", + "kk_KZ", + "ka_GE", + "tl_XX", + "ur_PK", + "is_IS", + "hy_AM", + "ml_IN", + "mk_MK", + "be_BY", + "la_VA", + "te_IN", + "eu_ES", + "gl_ES", + "mn_MN", + "kn_IN", + "ne_NP", + "sw_KE", + "si_LK", + "mr_IN", + "af_ZA", + "gu_IN", + "cy_GB", + "eo_EO", + "km_KH", + "ky_KG", + "uz_UZ", + "ps_AF", + "pa_IN", + "ga_IE", + "ha_NG", + "am_ET", + "lo_LA", + "ku_TR", + "so_SO", + "my_MM", + "or_IN", + "sa_IN", + ], + ) + + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True) + args.encoder_layers = getattr(args, "encoder_layers", 24) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) + base_architecture(args) diff --git a/fairseq/fairseq/models/xmod/transformer_layer_xmod.py b/fairseq/fairseq/models/xmod/transformer_layer_xmod.py new file mode 100644 index 0000000000000000000000000000000000000000..47a91cdc2339f235efe0cecc85d6a7a260d19c6a --- /dev/null +++ b/fairseq/fairseq/models/xmod/transformer_layer_xmod.py @@ -0,0 +1,179 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from fairseq.modules.transformer_layer import TransformerEncoderLayer +from typing import Optional +import torch +import torch.nn as nn +from fairseq import utils +from fairseq.modules import LayerNorm +from fairseq.modules.fairseq_dropout import FairseqDropout +from fairseq.modules.quant_noise import quant_noise +from torch import Tensor + + +class Adapter(nn.Module): + def __init__(self, cfg, red_fac=2): + super(Adapter, self).__init__() + self.cfg = cfg + self.embed_dim = cfg.encoder_embed_dim + self.quant_noise = getattr(cfg, "quant_noise_pq", 0) + self.quant_noise_block_size = getattr(cfg, "quant_noise_pq_block_size", 8) or 8 + self.activation_fn = utils.get_activation_fn( + activation=getattr(cfg, "activation_fn", "relu") or "relu" + ) + self.fc1 = quant_noise( + nn.Linear(self.embed_dim, self.embed_dim // red_fac), + p=self.quant_noise, + block_size=self.quant_noise_block_size, + ) + self.fc2 = quant_noise( + nn.Linear(self.embed_dim // red_fac, self.embed_dim), + p=self.quant_noise, + block_size=self.quant_noise_block_size, + ) + activation_dropout_p = getattr(cfg, "activation_dropout", 0) or 0 + if activation_dropout_p == 0: + # for backwards compatibility with models that use cfg.relu_dropout + activation_dropout_p = getattr(cfg, "relu_dropout", 0) or 0 + self.activation_dropout_module = FairseqDropout( + float(activation_dropout_p), module_name=self.__class__.__name__ + ) + + def forward(self, x): + x = self.activation_fn(self.fc1(x)) + if not hasattr(self.cfg, "adapter_dropout") or self.cfg.adapter_dropout: + x = self.activation_dropout_module(x) + x = self.fc2(x) + return x + + +class XMODTransformerEncoderLayerBase(TransformerEncoderLayer): + """Encoder layer block. + + In the original paper each operation (multi-head attention or FFN) is + postprocessed with: `dropout -> add residual -> layernorm`. In the + tensor2tensor code they suggest that learning is more robust when + preprocessing each layer with layernorm and postprocessing with: + `dropout -> add residual`. We default to the approach in the paper, but the + tensor2tensor approach can be enabled by setting + *cfg.encoder.normalize_before* to ``True``. + + Args: + args (argparse.Namespace): parsed command-line arguments + """ + + def __init__(self, cfg): + super().__init__(cfg) + if hasattr(cfg, "adapter_modules") and cfg.adapter_modules: + export = getattr(cfg, "export", False) + if cfg.adapter_layer_norm: + self.adapter_layer_norm = LayerNorm(self.embed_dim, export=export) + self.adapter_modules = nn.ModuleDict(dict()) + if hasattr(self.cfg, "bottleneck"): + bottleneck = self.cfg.bottleneck + else: + bottleneck = 2 + for language in cfg.languages: + self.adapter_modules[str(language)] = Adapter(cfg, red_fac=bottleneck) + + def lang_adapter(self, lang_id, x): + # If language adapters exist pass throught them + if hasattr(self.cfg, "adapter_modules") and self.cfg.adapter_modules: + if lang_id is None: + lang_id = ["en_XX"] * x.shape[1] + d_langs = [lang_id[0]] + lang_lengths = [1] + for lang in lang_id[1:]: + if lang == d_langs[-1]: + lang_lengths[-1] += 1 + else: + d_langs.append(lang) + lang_lengths.append(1) + + if ( + not hasattr(self.cfg, "ln_before_adapter") + or not self.cfg.ln_before_adapter + ): + residual = x + if self.cfg.adapter_layer_norm: + x = self.adapter_layer_norm(x) + elif self.cfg.adapter_reuse_layer_norm: + x = self.final_layer_norm(x) + if hasattr(self.cfg, "ln_before_adapter") and self.cfg.ln_before_adapter: + residual = x + + split_x = torch.split(x, lang_lengths, 1) + x_ = [] + for i, (lang, s_x) in enumerate(zip(d_langs, split_x)): + lang = lang.replace("_rom", "").replace("_zaw", "") + x_.append(self.adapter_modules[str(lang)](s_x)) + x = torch.cat(x_, 1) + + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + + return x + + def forward( + self, + x, + encoder_padding_mask: Optional[Tensor], + attn_mask: Optional[Tensor] = None, + lang_id: Optional[list] = None, + ): + """ + Args: + x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_padding_mask (ByteTensor): binary ByteTensor of shape + `(batch, seq_len)` where padding elements are indicated by ``1``. + attn_mask (ByteTensor): binary tensor of shape `(tgt_len, src_len)`, + where `tgt_len` is the length of output and `src_len` is the + length of input, though here both are equal to `seq_len`. + `attn_mask[tgt_i, src_j] = 1` means that when calculating the + embedding for `tgt_i`, we exclude (mask out) `src_j`. This is + useful for strided self-attention. + + Returns: + encoded output of shape `(seq_len, batch, embed_dim)` + """ + # anything in original attn_mask = 1, becomes -1e8 + # anything in original attn_mask = 0, becomes 0 + # Note that we cannot use -inf here, because at some edge cases, + # the attention weight (before softmax) for some padded element in query + # will become -inf, which results in NaN in model parameters + if attn_mask is not None: + attn_mask = attn_mask.masked_fill(attn_mask.to(torch.bool), -1e8) + + residual = x + if self.normalize_before: + x = self.self_attn_layer_norm(x) + x, _ = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=encoder_padding_mask, + need_weights=False, + attn_mask=attn_mask, + ) + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.self_attn_layer_norm(x) + + residual = x + if self.normalize_before: + x = self.final_layer_norm(x) + x = self.activation_fn(self.fc1(x)) + x = self.activation_dropout_module(x) + x = self.fc2(x) + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + + x = self.lang_adapter(lang_id, x) + + if not self.normalize_before: + x = self.final_layer_norm(x) + return x diff --git a/fairseq/fairseq/modules/__init__.py b/fairseq/fairseq/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dcfda9b82aa74819d99dc9305fcad437fd90456f --- /dev/null +++ b/fairseq/fairseq/modules/__init__.py @@ -0,0 +1,106 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +"""isort:skip_file""" + +from .adaptive_input import AdaptiveInput +from .adaptive_softmax import AdaptiveSoftmax +from .base_layer import BaseLayer +from .beamable_mm import BeamableMM +from .character_token_embedder import CharacterTokenEmbedder +from .conv_tbc import ConvTBC +from .cross_entropy import cross_entropy +from .downsampled_multihead_attention import DownsampledMultiHeadAttention +from .dynamic_convolution import DynamicConv, DynamicConv1dTBC, DynamicConv_scripatable +from .dynamic_crf_layer import DynamicCRF +from .ema_module import EMAModuleConfig, EMAModule +from .fairseq_dropout import FairseqDropout +from .fp32_batch_norm import Fp32BatchNorm +from .fp32_group_norm import Fp32GroupNorm +from .fp32_instance_norm import Fp32InstanceNorm +from .gelu import gelu, gelu_accurate +from .grad_multiply import GradMultiply +from .gumbel_vector_quantizer import GumbelVectorQuantizer +from .kmeans_vector_quantizer import KmeansVectorQuantizer +from .layer_drop import LayerDropModuleList +from .layer_norm import Fp32LayerNorm, LayerNorm +from .learned_positional_embedding import LearnedPositionalEmbedding +from .lightweight_convolution import LightweightConv, LightweightConv1dTBC +from .linearized_convolution import LinearizedConvolution +from .location_attention import LocationAttention +from .lstm_cell_with_zoneout import LSTMCellWithZoneOut +from .multihead_attention import MultiheadAttention +from .positional_embedding import PositionalEmbedding +from .same_pad import SamePad, SamePad2d +from .scalar_bias import ScalarBias +from .sinusoidal_positional_embedding import SinusoidalPositionalEmbedding +from .transformer_sentence_encoder_layer import TransformerSentenceEncoderLayer +from .transformer_sentence_encoder import TransformerSentenceEncoder +from .transpose_last import TransposeLast +from .unfold import unfold1d +from .transformer_layer import TransformerDecoderLayer, TransformerEncoderLayer +from .vggblock import VGGBlock +from .espnet_multihead_attention import ( + ESPNETMultiHeadedAttention, + RelPositionMultiHeadedAttention, + RotaryPositionMultiHeadedAttention, +) +from .rotary_positional_embedding import RotaryPositionalEmbedding +from .positional_encoding import ( + RelPositionalEncoding, +) + +__all__ = [ + "AdaptiveInput", + "AdaptiveSoftmax", + "BaseLayer", + "BeamableMM", + "CharacterTokenEmbedder", + "ConvTBC", + "cross_entropy", + "DownsampledMultiHeadAttention", + "DynamicConv1dTBC", + "DynamicConv", + "DynamicConv_scripatable", + "DynamicCRF", + "EMAModule", + "EMAModuleConfig", + "FairseqDropout", + "Fp32BatchNorm", + "Fp32GroupNorm", + "Fp32LayerNorm", + "Fp32InstanceNorm", + "gelu", + "gelu_accurate", + "GradMultiply", + "GumbelVectorQuantizer", + "KmeansVectorQuantizer", + "LayerDropModuleList", + "LayerNorm", + "LearnedPositionalEmbedding", + "LightweightConv1dTBC", + "LightweightConv", + "LinearizedConvolution", + "LocationAttention", + "LSTMCellWithZoneOut", + "MultiheadAttention", + "PositionalEmbedding", + "SamePad", + "SamePad2d", + "ScalarBias", + "SinusoidalPositionalEmbedding", + "TransformerSentenceEncoderLayer", + "TransformerSentenceEncoder", + "TransformerDecoderLayer", + "TransformerEncoderLayer", + "TransposeLast", + "VGGBlock", + "unfold1d", + "ESPNETMultiheadedAttention", + "PositionalEmbedding", + "RelPositionMultiHeadedAttention", + "RelPositionalEncoding", + "RotaryPositionalEmbedding", + "RotaryPositionMultiHeadedAttention", +] diff --git a/fairseq/fairseq/modules/adaptive_input.py b/fairseq/fairseq/modules/adaptive_input.py new file mode 100644 index 0000000000000000000000000000000000000000..01ac4accaccc3e4625ee2ca6a17cf88df1db0ca1 --- /dev/null +++ b/fairseq/fairseq/modules/adaptive_input.py @@ -0,0 +1,81 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +from typing import List + +import torch +from torch import nn + +from fairseq.modules.quant_noise import quant_noise + + +class AdaptiveInput(nn.Module): + def __init__( + self, + vocab_size: int, + padding_idx: int, + initial_dim: int, + factor: float, + output_dim: int, + cutoff: List[int], + q_noise: float = 0, + qn_block_size: int = 8, + ): + super().__init__() + + if vocab_size > cutoff[-1]: + cutoff = cutoff + [vocab_size] + else: + assert ( + vocab_size == cutoff[-1] + ), "cannot specify cutoff larger than vocab size" + + self.cutoff = cutoff + self.embedding_dim = output_dim + self.padding_idx = padding_idx + + self.embeddings = nn.ModuleList() + for i in range(len(self.cutoff)): + prev = self.cutoff[i - 1] if i > 0 else 0 + size = self.cutoff[i] - prev + dim = int(initial_dim // (factor**i)) + seq = nn.Sequential( + nn.Embedding(size, dim, self.padding_idx), + quant_noise( + nn.Linear(dim, output_dim, bias=False), q_noise, qn_block_size + ), + ) + + self.embeddings.append(seq) + self.padding_idx = None + self.padding_idx = padding_idx + + def init_weights(m): + if isinstance(m, nn.Embedding): + nn.init.normal_(m.weight, mean=0, std=m.weight.shape[1] ** -0.5) + nn.init.constant_(m.weight[padding_idx], 0) + elif hasattr(m, "weight"): + nn.init.xavier_uniform_(m.weight) + + self.apply(init_weights) + + self.register_buffer("_float_tensor", torch.FloatTensor(1)) + + def weights_for_band(self, band: int): + return self.embeddings[band][0].weight, self.embeddings[band][1].weight + + def forward(self, input: torch.Tensor): + result = self._float_tensor.new(input.shape + (self.embedding_dim,)) + for i in range(len(self.cutoff)): + mask = input.lt(self.cutoff[i]) + if i > 0: + mask.mul_(input.ge(self.cutoff[i - 1])) + chunk_input = input[mask] - self.cutoff[i - 1] + else: + chunk_input = input[mask] + if mask.any(): + result[mask] = self.embeddings[i](chunk_input) + return result diff --git a/fairseq/fairseq/modules/adaptive_softmax.py b/fairseq/fairseq/modules/adaptive_softmax.py new file mode 100644 index 0000000000000000000000000000000000000000..ae0c77ba0f6ee98501306d66cbc4a948b4ade0f7 --- /dev/null +++ b/fairseq/fairseq/modules/adaptive_softmax.py @@ -0,0 +1,268 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import functools +import operator + +import torch +import torch.nn.functional as F +from fairseq.modules.fairseq_dropout import FairseqDropout +from fairseq.modules.quant_noise import quant_noise +from torch import nn + + +class TiedLinear(nn.Module): + def __init__(self, weight, transpose): + super().__init__() + self.weight = weight + self.transpose = transpose + + def forward(self, input): + return F.linear(input, self.weight.t() if self.transpose else self.weight) + + +class TiedHeadModule(nn.Module): + def __init__(self, weights, input_dim, num_classes, q_noise, qn_block_size): + super().__init__() + tied_emb, _ = weights + self.num_words, emb_dim = tied_emb.size() + + self.word_proj = quant_noise( + TiedLinear(tied_emb, transpose=False), q_noise, qn_block_size + ) + if input_dim != emb_dim: + self.word_proj = nn.Sequential( + quant_noise( + nn.Linear(input_dim, emb_dim, bias=False), q_noise, qn_block_size + ), + self.word_proj, + ) + + self.class_proj = quant_noise( + nn.Linear(input_dim, num_classes, bias=False), q_noise, qn_block_size + ) + self.out_dim = self.num_words + num_classes + + self.register_buffer("_float_tensor", torch.FloatTensor(1)) + + def forward(self, input): + inp_sz = functools.reduce(operator.mul, input.shape[:-1], 1) + out = self._float_tensor.new(inp_sz, self.out_dim) + out[:, : self.num_words] = self.word_proj(input.view(inp_sz, -1)) + out[:, self.num_words :] = self.class_proj(input.view(inp_sz, -1)) + return out + + +class AdaptiveSoftmax(nn.Module): + """ + This is an implementation of the efficient softmax approximation for + graphical processing units (GPU), described in the paper "Efficient softmax + approximation for GPUs" (http://arxiv.org/abs/1609.04309). + """ + + def __init__( + self, + vocab_size, + input_dim, + cutoff, + dropout, + factor=4.0, + adaptive_inputs=None, + tie_proj=False, + q_noise=0, + qn_block_size=8, + ): + super().__init__() + + if vocab_size > cutoff[-1]: + cutoff = cutoff + [vocab_size] + else: + assert ( + vocab_size == cutoff[-1] + ), "cannot specify cutoff larger than vocab size" + + output_dim = cutoff[0] + len(cutoff) - 1 + + self.vocab_size = vocab_size + self.cutoff = cutoff + self.dropout_module = FairseqDropout( + dropout, module_name=self.__class__.__name__ + ) + self.input_dim = input_dim + self.factor = factor + self.q_noise = q_noise + self.qn_block_size = qn_block_size + + self.lsm = nn.LogSoftmax(dim=1) + + if adaptive_inputs is not None: + self.head = TiedHeadModule( + adaptive_inputs.weights_for_band(0), + input_dim, + len(cutoff) - 1, + self.q_noise, + self.qn_block_size, + ) + else: + self.head = quant_noise( + nn.Linear(input_dim, output_dim, bias=False), + self.q_noise, + self.qn_block_size, + ) + + self._make_tail(adaptive_inputs, tie_proj) + + def init_weights(m): + if ( + hasattr(m, "weight") + and not isinstance(m, TiedLinear) + and not isinstance(m, TiedHeadModule) + ): + nn.init.xavier_uniform_(m.weight) + + self.apply(init_weights) + + self.register_buffer("version", torch.LongTensor([1])) + + def _make_tail(self, adaptive_inputs=None, tie_proj=False): + self.tail = nn.ModuleList() + for i in range(len(self.cutoff) - 1): + dim = int(self.input_dim // self.factor ** (i + 1)) + + tied_emb, tied_proj = ( + adaptive_inputs.weights_for_band(i + 1) + if adaptive_inputs is not None + else (None, None) + ) + + if tied_proj is not None: + if tie_proj: + proj = quant_noise( + TiedLinear(tied_proj, transpose=True), + self.q_noise, + self.qn_block_size, + ) + else: + proj = quant_noise( + nn.Linear(tied_proj.size(0), tied_proj.size(1), bias=False), + self.q_noise, + self.qn_block_size, + ) + else: + proj = quant_noise( + nn.Linear(self.input_dim, dim, bias=False), + self.q_noise, + self.qn_block_size, + ) + + if tied_emb is None: + out_proj = nn.Linear( + dim, self.cutoff[i + 1] - self.cutoff[i], bias=False + ) + else: + out_proj = TiedLinear(tied_emb, transpose=False) + + m = nn.Sequential( + proj, + nn.Dropout(self.dropout_module.p), + quant_noise(out_proj, self.q_noise, self.qn_block_size), + ) + + self.tail.append(m) + + def upgrade_state_dict_named(self, state_dict, name): + version_name = name + ".version" + if version_name not in state_dict: + raise Exception("This version of the model is no longer supported") + + def adapt_target(self, target): + """ + In order to be efficient, the AdaptiveSoftMax does not compute the + scores for all the word of the vocabulary for all the examples. It is + thus necessary to call the method adapt_target of the AdaptiveSoftMax + layer inside each forward pass. + """ + + target = target.view(-1) + new_target = [target.clone()] + target_idxs = [] + + for i in range(len(self.cutoff) - 1): + mask = target.ge(self.cutoff[i]).mul(target.lt(self.cutoff[i + 1])) + new_target[0][mask] = self.cutoff[0] + i + + if mask.any(): + target_idxs.append(mask.nonzero(as_tuple=False).squeeze(1)) + new_target.append(target[mask].add(-self.cutoff[i])) + else: + target_idxs.append(None) + new_target.append(None) + + return new_target, target_idxs + + def forward(self, input, target): + """ + Args: + input: (b x t x d) + target: (b x t) + Returns: + 2 lists: output for each cutoff section and new targets by cut off + """ + + input = input.contiguous().view(-1, input.size(-1)) + input = self.dropout_module(input) + + new_target, target_idxs = self.adapt_target(target) + output = [self.head(input)] + + for i in range(len(target_idxs)): + if target_idxs[i] is not None: + output.append(self.tail[i](input.index_select(0, target_idxs[i]))) + else: + output.append(None) + + return output, new_target + + def get_log_prob(self, input, target): + """ + Computes the log probabilities for all the words of the vocabulary, + given a 2D tensor of hidden vectors. + """ + + bsz, length, dim = input.size() + input = input.contiguous().view(-1, dim) + + if target is not None: + _, target_idxs = self.adapt_target(target) + else: + target_idxs = None + + head_y = self.head(input) + log_probs = head_y.new_zeros(input.size(0), self.vocab_size) + + head_sz = self.cutoff[0] + len(self.tail) + log_probs[:, :head_sz] = self.lsm(head_y) + tail_priors = log_probs[:, self.cutoff[0] : head_sz].clone() + + for i in range(len(self.tail)): + start = self.cutoff[i] + end = self.cutoff[i + 1] + + if target_idxs is None: + tail_out = log_probs[:, start:end] + tail_out.copy_(self.tail[i](input)) + log_probs[:, start:end] = self.lsm(tail_out).add_( + tail_priors[:, i, None] + ) + elif target_idxs[i] is not None: + idxs = target_idxs[i] + tail_out = log_probs[idxs, start:end] + tail_out.copy_(self.tail[i](input[idxs])) + log_probs[idxs, start:end] = self.lsm(tail_out).add_( + tail_priors[idxs, i, None] + ) + + log_probs = log_probs.view(bsz, length, -1) + return log_probs diff --git a/fairseq/fairseq/modules/base_layer.py b/fairseq/fairseq/modules/base_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..e823f7bae2c2e9b7e2277b0161cf26b453e7216c --- /dev/null +++ b/fairseq/fairseq/modules/base_layer.py @@ -0,0 +1,170 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch.nn as nn +import torch +import sys +from fairseq import utils +from fairseq.distributed import utils as distributed_utils +from fairseq.modules.layer_norm import LayerNorm + + +class BaseLayer(nn.Module): + def __init__(self, args): + super().__init__() + self.num_workers = distributed_utils.get_data_parallel_world_size() + expert_centroids = torch.empty(self.num_workers, args.decoder_embed_dim) + torch.nn.init.orthogonal_(expert_centroids, gain=0.1) + self.register_parameter( + "expert_centroids", torch.nn.Parameter(expert_centroids) + ) + self.expert_network = nn.Sequential( + *([BaseSublayer(args) for _ in range(args.base_sublayers)]) + ) + self.expert_id = distributed_utils.get_data_parallel_rank() + self.shuffle = args.base_shuffle + self.cpp = self.load_assignment() + + # Add a special attribute to the expert parameters, so we know not to sync their gradients + for param in self.expert_network.parameters(): + param.expert = True + + def forward(self, input_features, *args, **kwargs): + features = input_features.reshape(-1, input_features.size(-1)) + is_training = input_features.requires_grad + + if self.shuffle and is_training: + # Send each token to a random worker, to break correlations within the batch + shuffle_sort = torch.randperm(features.size(0), device=features.device) + features = All2All.apply(features[shuffle_sort]) + + with torch.no_grad(): + # Compute similarity of each token to each expert, for routing + token_expert_affinities = features.matmul( + self.expert_centroids.transpose(0, 1) + ) + + # Compute which token goes to which expert + sort_by_expert, input_splits, output_splits = ( + self.balanced_assignment(token_expert_affinities) + if is_training + else self.greedy_assignment(token_expert_affinities) + ) + # Swap these tokens for the right ones for our expert + routed_features = All2All.apply( + features[sort_by_expert], output_splits, input_splits + ) + + if routed_features.size(0) > 0: + # Mix in the expert network based on how appropriate it is for these tokens + alpha = torch.sigmoid( + routed_features.mv(self.expert_centroids[self.expert_id]) + ).unsqueeze(1) + routed_features = ( + alpha * self.expert_network(routed_features) + + (1 - alpha) * routed_features + ) + # Return to original worker and ordering + result = All2All.apply(routed_features, input_splits, output_splits)[ + self.inverse_sort(sort_by_expert) + ] + + if self.shuffle and is_training: + # Undo shuffling + result = All2All.apply(result)[self.inverse_sort(shuffle_sort)] + + # Return additional Nones for compatibility with TransformerDecoderLayer + return result.view(input_features.size()), None, None + + def inverse_sort(self, order): + # Creates an index that undoes a sort: xs==xs[order][inverse_sort(order)] + return torch.empty_like(order).scatter_( + 0, order, torch.arange(0, order.size(0), device=order.device) + ) + + def balanced_assignment(self, scores): + ok = scores.isfinite() + if not ok.all(): + # NaNs here can break the assignment algorithm + scores[~ok] = scores[ok].min() + return self.cpp.balanced_assignment(scores), None, None + + # Assigns each token to the top k experts + def greedy_assignment(self, scores, k=1): + token_to_workers = torch.topk(scores, dim=1, k=k, largest=True).indices.view(-1) + token_to_workers, sort_ordering = torch.sort(token_to_workers) + worker2token = sort_ordering // k + + # Find how many tokens we're sending to each other worker (being careful for sending 0 tokens to some workers) + output_splits = torch.zeros( + (self.num_workers,), dtype=torch.long, device=scores.device + ) + workers, counts = torch.unique_consecutive(token_to_workers, return_counts=True) + output_splits[workers] = counts + # Tell other workers how many tokens to expect from us + input_splits = All2All.apply(output_splits) + return worker2token, input_splits.tolist(), output_splits.tolist() + + def load_assignment(self): + try: + from fairseq import libbase + + return libbase + + except ImportError as e: + sys.stderr.write( + "ERROR: missing libbase. run `python setup.py build_ext --inplace`\n" + ) + raise e + + +class BaseSublayer(nn.Module): + def __init__(self, args): + super().__init__() + self.activation_fn = utils.get_activation_fn( + activation=getattr(args, "activation_fn", "relu") or "relu" + ) + self.norm = LayerNorm(args.decoder_embed_dim, export=False) + self.ff1 = torch.nn.Linear(args.decoder_embed_dim, args.decoder_ffn_embed_dim) + self.ff2 = torch.nn.Linear(args.decoder_ffn_embed_dim, args.decoder_embed_dim) + self.ff2.weight.data.zero_() + + def forward(self, xs): + return xs + self.ff2(self.activation_fn(self.ff1(self.norm(xs)))) + + +# Wraps torch.distributed.all_to_all_single as a function that supports autograd +class All2All(torch.autograd.Function): + @staticmethod + def forward(ctx, xs, input_splits=None, output_splits=None): + ctx.input_splits = input_splits + ctx.output_splits = output_splits + + ys = ( + torch.empty_like(xs) + if output_splits is None + else xs.new_empty(size=[sum(output_splits)] + list(xs.size()[1:])) + ) + torch.distributed.all_to_all_single( + ys, xs, output_split_sizes=output_splits, input_split_sizes=input_splits + ) + return ys + + @staticmethod + def backward(ctx, grad_output): + result = ( + torch.empty_like(grad_output) + if ctx.input_splits is None + else grad_output.new_empty( + size=[sum(ctx.input_splits)] + list(grad_output.size()[1:]) + ) + ) + torch.distributed.all_to_all_single( + result, + grad_output, + output_split_sizes=ctx.input_splits, + input_split_sizes=ctx.output_splits, + ) + return result, None, None diff --git a/fairseq/fairseq/modules/beamable_mm.py b/fairseq/fairseq/modules/beamable_mm.py new file mode 100644 index 0000000000000000000000000000000000000000..eff1a4607f600c71210e6b914985dc48731aae86 --- /dev/null +++ b/fairseq/fairseq/modules/beamable_mm.py @@ -0,0 +1,49 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn + + +class BeamableMM(nn.Module): + """This module provides an optimized MM for beam decoding with attention. + + It leverage the fact that the source-side of the input is replicated beam + times and the target-side of the input is of width one. This layer speeds up + inference by replacing the inputs {(bsz x 1 x nhu), (bsz x sz2 x nhu)} + with smaller inputs {(bsz/beam x beam x nhu), (bsz/beam x sz2 x nhu)}. + """ + + def __init__(self, beam_size=None): + super(BeamableMM, self).__init__() + self.beam_size = beam_size + + def forward(self, input1, input2): + if ( + not self.training + and self.beam_size is not None # test mode + and input1.dim() == 3 # beam size is set + and input1.size(1) # only support batched input + == 1 # single time step update + ): + bsz, beam = input1.size(0), self.beam_size + + # bsz x 1 x nhu --> bsz/beam x beam x nhu + input1 = input1[:, 0, :].unfold(0, beam, beam).transpose(2, 1) + + # bsz x sz2 x nhu --> bsz/beam x sz2 x nhu + input2 = input2.unfold(0, beam, beam)[:, :, :, 0] + + # use non batched operation if bsz = beam + if input1.size(0) == 1: + output = torch.mm(input1[0, :, :], input2[0, :, :]) + else: + output = input1.bmm(input2) + return output.view(bsz, 1, -1) + else: + return input1.bmm(input2) + + def set_beam_size(self, beam_size): + self.beam_size = beam_size diff --git a/fairseq/fairseq/modules/character_token_embedder.py b/fairseq/fairseq/modules/character_token_embedder.py new file mode 100644 index 0000000000000000000000000000000000000000..181221b61b9f76453b67e3b848b198620dce912c --- /dev/null +++ b/fairseq/fairseq/modules/character_token_embedder.py @@ -0,0 +1,214 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from typing import List, Tuple + +import torch +import torch.nn.functional as F +from fairseq.data import Dictionary +from torch import nn + + +CHAR_PAD_IDX = 0 +CHAR_EOS_IDX = 257 + + +logger = logging.getLogger(__name__) + + +class CharacterTokenEmbedder(torch.nn.Module): + def __init__( + self, + vocab: Dictionary, + filters: List[Tuple[int, int]], + char_embed_dim: int, + word_embed_dim: int, + highway_layers: int, + max_char_len: int = 50, + char_inputs: bool = False, + ): + super(CharacterTokenEmbedder, self).__init__() + + self.onnx_trace = False + self.embedding_dim = word_embed_dim + self.max_char_len = max_char_len + self.char_embeddings = nn.Embedding(257, char_embed_dim, padding_idx=0) + self.symbol_embeddings = nn.Parameter(torch.FloatTensor(2, word_embed_dim)) + self.eos_idx, self.unk_idx = 0, 1 + self.char_inputs = char_inputs + + self.convolutions = nn.ModuleList() + for width, out_c in filters: + self.convolutions.append( + nn.Conv1d(char_embed_dim, out_c, kernel_size=width) + ) + + last_dim = sum(f[1] for f in filters) + + self.highway = Highway(last_dim, highway_layers) if highway_layers > 0 else None + + self.projection = nn.Linear(last_dim, word_embed_dim) + + assert ( + vocab is not None or char_inputs + ), "vocab must be set if not using char inputs" + self.vocab = None + if vocab is not None: + self.set_vocab(vocab, max_char_len) + + self.reset_parameters() + + def prepare_for_onnx_export_(self): + self.onnx_trace = True + + def set_vocab(self, vocab, max_char_len): + word_to_char = torch.LongTensor(len(vocab), max_char_len) + + truncated = 0 + for i in range(len(vocab)): + if i < vocab.nspecial: + char_idxs = [0] * max_char_len + else: + chars = vocab[i].encode() + # +1 for padding + char_idxs = [c + 1 for c in chars] + [0] * (max_char_len - len(chars)) + if len(char_idxs) > max_char_len: + truncated += 1 + char_idxs = char_idxs[:max_char_len] + word_to_char[i] = torch.LongTensor(char_idxs) + + if truncated > 0: + logger.info( + "truncated {} words longer than {} characters".format( + truncated, max_char_len + ) + ) + + self.vocab = vocab + self.word_to_char = word_to_char + + @property + def padding_idx(self): + return Dictionary().pad() if self.vocab is None else self.vocab.pad() + + def reset_parameters(self): + nn.init.xavier_normal_(self.char_embeddings.weight) + nn.init.xavier_normal_(self.symbol_embeddings) + nn.init.xavier_uniform_(self.projection.weight) + + nn.init.constant_( + self.char_embeddings.weight[self.char_embeddings.padding_idx], 0.0 + ) + nn.init.constant_(self.projection.bias, 0.0) + + def forward( + self, + input: torch.Tensor, + ): + if self.char_inputs: + chars = input.view(-1, self.max_char_len) + pads = chars[:, 0].eq(CHAR_PAD_IDX) + eos = chars[:, 0].eq(CHAR_EOS_IDX) + if eos.any(): + if self.onnx_trace: + chars = torch.where(eos.unsqueeze(1), chars.new_zeros(1), chars) + else: + chars[eos] = 0 + + unk = None + else: + flat_words = input.view(-1) + chars = self.word_to_char[flat_words.type_as(self.word_to_char)].type_as( + input + ) + pads = flat_words.eq(self.vocab.pad()) + eos = flat_words.eq(self.vocab.eos()) + unk = flat_words.eq(self.vocab.unk()) + + word_embs = self._convolve(chars) + if self.onnx_trace: + if pads.any(): + word_embs = torch.where( + pads.unsqueeze(1), word_embs.new_zeros(1), word_embs + ) + if eos.any(): + word_embs = torch.where( + eos.unsqueeze(1), self.symbol_embeddings[self.eos_idx], word_embs + ) + if unk is not None and unk.any(): + word_embs = torch.where( + unk.unsqueeze(1), self.symbol_embeddings[self.unk_idx], word_embs + ) + else: + if pads.any(): + word_embs[pads] = 0 + if eos.any(): + word_embs[eos] = self.symbol_embeddings[self.eos_idx] + if unk is not None and unk.any(): + word_embs[unk] = self.symbol_embeddings[self.unk_idx] + + return word_embs.view(input.size()[:2] + (-1,)) + + def _convolve( + self, + char_idxs: torch.Tensor, + ): + char_embs = self.char_embeddings(char_idxs) + char_embs = char_embs.transpose(1, 2) # BTC -> BCT + + conv_result = [] + + for conv in self.convolutions: + x = conv(char_embs) + x, _ = torch.max(x, -1) + x = F.relu(x) + conv_result.append(x) + + x = torch.cat(conv_result, dim=-1) + + if self.highway is not None: + x = self.highway(x) + x = self.projection(x) + + return x + + +class Highway(torch.nn.Module): + """ + A `Highway layer `_. + Adopted from the AllenNLP implementation. + """ + + def __init__(self, input_dim: int, num_layers: int = 1): + super(Highway, self).__init__() + self.input_dim = input_dim + self.layers = nn.ModuleList( + [nn.Linear(input_dim, input_dim * 2) for _ in range(num_layers)] + ) + self.activation = nn.ReLU() + + self.reset_parameters() + + def reset_parameters(self): + for layer in self.layers: + # As per comment in AllenNLP: + # We should bias the highway layer to just carry its input forward. We do that by + # setting the bias on `B(x)` to be positive, because that means `g` will be biased to + # be high, so we will carry the input forward. The bias on `B(x)` is the second half + # of the bias vector in each Linear layer. + nn.init.constant_(layer.bias[self.input_dim :], 1) + + nn.init.constant_(layer.bias[: self.input_dim], 0) + nn.init.xavier_normal_(layer.weight) + + def forward(self, x: torch.Tensor): + for layer in self.layers: + projection = layer(x) + proj_x, gate = projection.chunk(2, dim=-1) + proj_x = self.activation(proj_x) + gate = torch.sigmoid(gate) + x = gate * x + (gate.new_tensor([1]) - gate) * proj_x + return x diff --git a/fairseq/fairseq/modules/checkpoint_activations.py b/fairseq/fairseq/modules/checkpoint_activations.py new file mode 100644 index 0000000000000000000000000000000000000000..aa0b5929a39d5acec91038d7335276b4e5098aed --- /dev/null +++ b/fairseq/fairseq/modules/checkpoint_activations.py @@ -0,0 +1,242 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import functools +from typing import Any, Dict, List, Tuple, Union + +import torch +import torch.utils.checkpoint as checkpoint +from fairseq import utils + + +def checkpoint_wrapper(m, offload_to_cpu=False): + """ + A friendlier wrapper for performing activation checkpointing. + + Compared to the PyTorch version, this version: + - wraps an nn.Module, so that all subsequent calls will use checkpointing + - handles keyword arguments in the forward + - handles non-Tensor outputs from the forward + + Usage:: + + checkpointed_module = checkpoint_wrapper(my_module, offload_to_cpu=True) + a, b = checkpointed_module(x, y=3, z=torch.Tensor([1])) + """ + # should I check whether original_forward has already been set? + assert not hasattr( + m, "precheckpoint_forward" + ), "checkpoint function has already been applied?" + m.precheckpoint_forward = m.forward + m.forward = functools.partial( + _checkpointed_forward, + m.precheckpoint_forward, # original_forward + offload_to_cpu, + ) + return m + + +def unwrap_checkpoint(m: torch.nn.Module): + """ + unwrap a module and its children from checkpoint_wrapper + """ + for module in m.modules(): + if hasattr(module, "precheckpoint_forward"): + module.forward = module.precheckpoint_forward + del module.precheckpoint_forward + if hasattr(module, "old_deepcopy_method"): + module.__deepcopy__ = module.old_deepcopy_method + del module.old_deepcopy_method + return m + + +def _checkpointed_forward(original_forward, offload_to_cpu, *args, **kwargs): + # Autograd Functions in PyTorch work best with positional args, since + # the backward must return gradients (or None) for every input argument. + # We can flatten keyword arguments to make this easier. + kwarg_keys, flat_args = pack_kwargs(*args, **kwargs) + parent_ctx_dict = {"offload": offload_to_cpu} + output = CheckpointFunction.apply( + original_forward, parent_ctx_dict, kwarg_keys, *flat_args + ) + if isinstance(output, torch.Tensor): + return output + else: + packed_non_tensor_outputs = parent_ctx_dict["packed_non_tensor_outputs"] + if packed_non_tensor_outputs: + output = unpack_non_tensors(output, packed_non_tensor_outputs) + return output + + +def pack_kwargs(*args, **kwargs) -> Tuple[List[str], List[Any]]: + """ + Usage:: + + kwarg_keys, flat_args = pack_kwargs(1, 2, a=3, b=4) + args, kwargs = unpack_kwargs(kwarg_keys, flat_args) + assert args == [1, 2] + assert kwargs == {"a": 3, "b": 4} + """ + kwarg_keys = [] + flat_args = list(args) + for k, v in kwargs.items(): + kwarg_keys.append(k) + flat_args.append(v) + return kwarg_keys, flat_args + + +def unpack_kwargs( + kwarg_keys: List[str], flat_args: List[Any] +) -> Tuple[List[Any], Dict[str, Any]]: + if len(kwarg_keys) == 0: + return flat_args, {} + args = flat_args[: -len(kwarg_keys)] + kwargs = {k: v for k, v in zip(kwarg_keys, flat_args[-len(kwarg_keys) :])} + return args, kwargs + + +def split_non_tensors( + mixed: Union[torch.Tensor, Tuple[Any]] +) -> Tuple[Tuple[torch.Tensor], Dict[str, List[Any]]]: + """ + Usage:: + + x = torch.Tensor([1]) + y = torch.Tensor([2]) + tensors, packed_non_tensors = split_non_tensors((x, y, None, 3)) + recon = unpack_non_tensors(tensors, packed_non_tensors) + assert recon == (x, y, None, 3) + """ + if isinstance(mixed, torch.Tensor): + return (mixed,), None + tensors = [] + packed_non_tensors = {"is_tensor": [], "objects": []} + for o in mixed: + if isinstance(o, torch.Tensor): + packed_non_tensors["is_tensor"].append(True) + tensors.append(o) + else: + packed_non_tensors["is_tensor"].append(False) + packed_non_tensors["objects"].append(o) + return tuple(tensors), packed_non_tensors + + +def unpack_non_tensors( + tensors: Tuple[torch.Tensor], + packed_non_tensors: Dict[str, List[Any]], +) -> Tuple[Any]: + if packed_non_tensors is None: + return tensors + assert isinstance(packed_non_tensors, dict) + mixed = [] + is_tensor_list = packed_non_tensors["is_tensor"] + objects = packed_non_tensors["objects"] + assert len(tensors) + len(objects) == len(is_tensor_list) + obj_i = tnsr_i = 0 + for is_tensor in is_tensor_list: + if is_tensor: + mixed.append(tensors[tnsr_i]) + tnsr_i += 1 + else: + mixed.append(objects[obj_i]) + obj_i += 1 + return tuple(mixed) + + +class CheckpointFunction(torch.autograd.Function): + """Similar to the torch version, but support non-Tensor outputs. + + The caller is expected to provide a dict (*parent_ctx_dict*) that will hold + the non-Tensor outputs. These should be combined with the Tensor *outputs* + by calling ``unpack_non_tensors``. + """ + + @staticmethod + def forward(ctx, run_function, parent_ctx_dict, kwarg_keys, *args): + if torch.is_grad_enabled(): # grad may be disabled, e.g., during validation + checkpoint.check_backward_validity(args) + + ctx.run_function = run_function + ctx.kwarg_keys = kwarg_keys + ctx.fwd_rng_state = utils.get_rng_state() + + tensor_inputs, packed_non_tensor_inputs = split_non_tensors(args) + if parent_ctx_dict["offload"]: + ctx.fwd_device = tuple(x.device for x in tensor_inputs) + ctx.grad_requirements = tuple(x.requires_grad for x in tensor_inputs) + tensor_inputs = tuple( + x.to(torch.device("cpu"), non_blocking=True) for x in tensor_inputs + ) + + else: + ctx.fwd_device, ctx.grad_requirements = None, None + + ctx.save_for_backward(*tensor_inputs) + ctx.packed_non_tensor_inputs = packed_non_tensor_inputs + + with torch.no_grad(): + unpacked_args, unpacked_kwargs = unpack_kwargs(kwarg_keys, args) + outputs = run_function(*unpacked_args, **unpacked_kwargs) + + if isinstance(outputs, torch.Tensor): + return outputs + else: + # Autograd Functions don't like non-Tensor outputs. We can split the + # non-Tensor and Tensor outputs, returning the former by reference + # through *parent_ctx_dict* and returning the latter directly. + outputs, packed_non_tensor_outputs = split_non_tensors(outputs) + parent_ctx_dict["packed_non_tensor_outputs"] = packed_non_tensor_outputs + return outputs + + @staticmethod + def backward(ctx, *args): + if not torch.autograd._is_checkpoint_valid(): + raise RuntimeError( + "Checkpointing is not compatible with .grad(), please use .backward() if possible" + ) + + tensor_inputs: Tuple = ctx.saved_tensors + tensor_inputs = checkpoint.detach_variable(tensor_inputs) + if ctx.fwd_device is not None: + tensor_inputs = [ + t.to(ctx.fwd_device[i], non_blocking=True) + for i, t in enumerate(tensor_inputs) + ] + for i, need_grad in enumerate(ctx.grad_requirements): + tensor_inputs[i].requires_grad = need_grad + inputs = unpack_non_tensors(tensor_inputs, ctx.packed_non_tensor_inputs) + + # Store the current states. + bwd_rng_state = utils.get_rng_state() + + # Set the states to what it used to be before the forward pass. + utils.set_rng_state(ctx.fwd_rng_state) + + with torch.enable_grad(): + unpacked_args, unpacked_kwargs = unpack_kwargs(ctx.kwarg_keys, inputs) + outputs = ctx.run_function(*unpacked_args, **unpacked_kwargs) + tensor_outputs, _ = split_non_tensors(outputs) + # Set the states back to what it was at the start of this function. + utils.set_rng_state(bwd_rng_state) + + # Run backward() with only Tensors that require grad + outputs_with_grad = [] + args_with_grad = [] + for i in range(len(tensor_outputs)): + if tensor_outputs[i].requires_grad: + outputs_with_grad.append(tensor_outputs[i]) + args_with_grad.append(args[i]) + if len(outputs_with_grad) == 0: + raise RuntimeError( + "None of the outputs have requires_grad=True, " + "this checkpoint() is not necessary" + ) + + torch.autograd.backward(outputs_with_grad, args_with_grad) + + grads = tuple( + inp.grad if isinstance(inp, torch.Tensor) else None for inp in inputs + ) + return (None, None, None) + grads diff --git a/fairseq/fairseq/modules/conformer_layer.py b/fairseq/fairseq/modules/conformer_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..964af243ec8fb5e09e0b61da688a9657277d39c1 --- /dev/null +++ b/fairseq/fairseq/modules/conformer_layer.py @@ -0,0 +1,301 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +from typing import Optional + +import torch + +from fairseq.modules import ( + ESPNETMultiHeadedAttention, + LayerNorm, + MultiheadAttention, + RelPositionMultiHeadedAttention, + RotaryPositionMultiHeadedAttention, +) +from fairseq.utils import get_activation_fn + + +class ConvolutionModule(torch.nn.Module): + """Convolution block used in the conformer block""" + + def __init__( + self, + embed_dim, + channels, + depthwise_kernel_size, + dropout, + activation_fn="swish", + bias=False, + export=False, + ): + """ + Args: + embed_dim: Embedding dimension + channels: Number of channels in depthwise conv layers + depthwise_kernel_size: Depthwise conv layer kernel size + dropout: dropout value + activation_fn: Activation function to use after depthwise convolution kernel + bias: If bias should be added to conv layers + export: If layernorm should be exported to jit + """ + super(ConvolutionModule, self).__init__() + assert ( + depthwise_kernel_size - 1 + ) % 2 == 0, "kernel_size should be a odd number for 'SAME' padding" + self.layer_norm = LayerNorm(embed_dim, export=export) + self.pointwise_conv1 = torch.nn.Conv1d( + embed_dim, + 2 * channels, + kernel_size=1, + stride=1, + padding=0, + bias=bias, + ) + self.glu = torch.nn.GLU(dim=1) + self.depthwise_conv = torch.nn.Conv1d( + channels, + channels, + depthwise_kernel_size, + stride=1, + padding=(depthwise_kernel_size - 1) // 2, + groups=channels, + bias=bias, + ) + self.batch_norm = torch.nn.BatchNorm1d(channels) + self.activation = get_activation_fn(activation_fn)(channels) + self.pointwise_conv2 = torch.nn.Conv1d( + channels, + embed_dim, + kernel_size=1, + stride=1, + padding=0, + bias=bias, + ) + self.dropout = torch.nn.Dropout(dropout) + + def forward(self, x): + """ + Args: + x: Input of shape B X T X C + Returns: + Tensor of shape B X T X C + """ + x = self.layer_norm(x) + # exchange the temporal dimension and the feature dimension + x = x.transpose(1, 2) + + # GLU mechanism + x = self.pointwise_conv1(x) # (batch, 2*channel, dim) + x = self.glu(x) # (batch, channel, dim) + + # 1D Depthwise Conv + x = self.depthwise_conv(x) + x = self.batch_norm(x) + x = self.activation(x) + + x = self.pointwise_conv2(x) + x = self.dropout(x) + return x.transpose(1, 2) + + +class FeedForwardModule(torch.nn.Module): + """Positionwise feed forward layer used in conformer""" + + def __init__( + self, + input_feat, + hidden_units, + dropout1, + dropout2, + activation_fn="swish", + bias=True, + ): + """ + Args: + input_feat: Input feature dimension + hidden_units: Hidden unit dimension + dropout1: dropout value for layer1 + dropout2: dropout value for layer2 + activation_fn: Name of activation function + bias: If linear layers should have bias + """ + + super(FeedForwardModule, self).__init__() + self.layer_norm = LayerNorm(input_feat) + self.w_1 = torch.nn.Linear(input_feat, hidden_units, bias=bias) + self.w_2 = torch.nn.Linear(hidden_units, input_feat, bias=bias) + self.dropout1 = torch.nn.Dropout(dropout1) + self.dropout2 = torch.nn.Dropout(dropout2) + self.activation = get_activation_fn(activation_fn)(hidden_units) + + def forward(self, x): + """ + Args: + x: Input Tensor of shape T X B X C + Returns: + Tensor of shape T X B X C + """ + x = self.layer_norm(x) + x = self.w_1(x) + x = self.activation(x) + x = self.dropout1(x) + x = self.w_2(x) + return self.dropout2(x) + + +class ConformerEncoderLayer(torch.nn.Module): + """Conformer block based on https://arxiv.org/abs/2005.08100. We currently don't support relative positional encoding in MHA""" + + def __init__( + self, + embed_dim, + ffn_embed_dim, + attention_heads, + dropout, + use_fp16, + depthwise_conv_kernel_size=31, + activation_fn="swish", + attn_type=None, + pos_enc_type="abs", + ): + """ + Args: + embed_dim: Input embedding dimension + ffn_embed_dim: FFN layer dimension + attention_heads: Number of attention heads in MHA + dropout: dropout value + depthwise_conv_kernel_size: Size of kernel in depthwise conv layer in convolution module + activation_fn: Activation function name to use in convulation block and feed forward block + attn_type: MHA implementation from ESPNET vs fairseq + pos_enc_type: Positional encoding type - abs, rope, rel_pos + """ + self.pos_enc_type = pos_enc_type + super(ConformerEncoderLayer, self).__init__() + + self.ffn1 = FeedForwardModule( + embed_dim, + ffn_embed_dim, + dropout, + dropout, + ) + + self.self_attn_layer_norm = LayerNorm(embed_dim, export=False) + self.self_attn_dropout = torch.nn.Dropout(dropout) + if attn_type == "espnet": + if self.pos_enc_type == "rel_pos": + self.self_attn = RelPositionMultiHeadedAttention( + embed_dim, + attention_heads, + dropout=dropout, + ) + elif self.pos_enc_type == "rope": + self.self_attn = RotaryPositionMultiHeadedAttention( + embed_dim, attention_heads, dropout=dropout, precision=use_fp16 + ) + elif self.pos_enc_type == "abs": + self.self_attn = ESPNETMultiHeadedAttention( + embed_dim, + attention_heads, + dropout=dropout, + ) + else: + raise Exception(f"Unsupported attention type {self.pos_enc_type}") + else: + # Default to fairseq MHA + self.self_attn = MultiheadAttention( + embed_dim, + attention_heads, + dropout=dropout, + ) + + self.conv_module = ConvolutionModule( + embed_dim=embed_dim, + channels=embed_dim, + depthwise_kernel_size=depthwise_conv_kernel_size, + dropout=dropout, + activation_fn=activation_fn, + ) + + self.ffn2 = FeedForwardModule( + embed_dim, + ffn_embed_dim, + dropout, + dropout, + activation_fn=activation_fn, + ) + self.final_layer_norm = LayerNorm(embed_dim, export=False) + + def forward( + self, + x, + encoder_padding_mask: Optional[torch.Tensor], + position_emb: Optional[torch.Tensor] = None, + ): + """ + Args: + x: Tensor of shape T X B X C + encoder_padding_mask: Optional mask tensor + positions: + Returns: + Tensor of shape T X B X C + """ + residual = x + x = self.ffn1(x) + x = x * 0.5 + residual + residual = x + x = self.self_attn_layer_norm(x) + if self.pos_enc_type == "rel_pos": + x, attn = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=encoder_padding_mask, + pos_emb=position_emb, + need_weights=False, + ) + else: + x, attn = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=encoder_padding_mask, + need_weights=False, + ) + x = self.self_attn_dropout(x) + x = x + residual + + residual = x + # TBC to BTC + x = x.transpose(0, 1) + x = self.conv_module(x) + # BTC to TBC + x = x.transpose(0, 1) + x = residual + x + + residual = x + x = self.ffn2(x) + + layer_result = x + + x = x * 0.5 + residual + + x = self.final_layer_norm(x) + return x, (attn, layer_result) + + +class ConformerWav2Vec2EncoderLayer(ConformerEncoderLayer): + """Encoder layer for Wav2vec2 encoder""" + + def forward( + self, + x: torch.Tensor, + self_attn_mask: torch.Tensor = None, + self_attn_padding_mask: torch.Tensor = None, + need_weights: bool = False, + att_args=None, + position_emb=None, + ): + return super().forward(x, self_attn_padding_mask, position_emb) diff --git a/fairseq/fairseq/modules/conv_tbc.py b/fairseq/fairseq/modules/conv_tbc.py new file mode 100644 index 0000000000000000000000000000000000000000..65e17ec94f7e595cb657b3d2daaa1052a95d0677 --- /dev/null +++ b/fairseq/fairseq/modules/conv_tbc.py @@ -0,0 +1,53 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from torch import nn +from torch.nn.modules.utils import _single +from torch import Tensor + + +class ConvTBC(torch.nn.Module): + """1D convolution over an input of shape (time x batch x channel) + + The implementation uses gemm to perform the convolution. This implementation + is faster than cuDNN for small kernel sizes. + """ + + def __init__(self, in_channels, out_channels, kernel_size, padding=0): + super(ConvTBC, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _single(kernel_size) + self.padding = _single(padding) + + self.weight = torch.nn.Parameter( + torch.Tensor(self.kernel_size[0], in_channels, out_channels) + ) + self.bias = torch.nn.Parameter(torch.Tensor(out_channels)) + + self.reset_parameters() + + def reset_parameters(self): + nn.init.xavier_normal_(self.weight) + nn.init.zeros_(self.bias) + + def conv_tbc(self, input: Tensor): + return torch.conv_tbc( + input.contiguous(), self.weight, self.bias, self.padding[0] + ) + + def forward(self, input: Tensor): + return self.conv_tbc(input) + + def __repr__(self): + s = ( + "{name}({in_channels}, {out_channels}, kernel_size={kernel_size}" + ", padding={padding}" + ) + if self.bias is None: + s += ", bias=False" + s += ")" + return s.format(name=self.__class__.__name__, **self.__dict__) diff --git a/fairseq/fairseq/modules/cuda_utils.cu b/fairseq/fairseq/modules/cuda_utils.cu new file mode 100644 index 0000000000000000000000000000000000000000..924f852758ee654e462546274db4b5e7199a9c90 --- /dev/null +++ b/fairseq/fairseq/modules/cuda_utils.cu @@ -0,0 +1,202 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +template +constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) { + return (a + b - 1) / b; +} + +template +__inline__ __device__ void zeroSharedMem(scalar_t* data) { + /* + Given an array of length FS + SB, zero out the first padding_l and last + (FS - padding_l) values in the array + */ + + int tid = threadIdx.x; + + if (FS < SB) { + // zero all if we have enough threads in a block to do all of them + if (tid < padding_l || tid > SB - FS + padding_l - 1) { + data[tid] = scalar_t(0.0); + } + } else { + // otherwise zero out one block at a time + const int numIterations = divUp(FS, SB); + for (int i = 0; i < numIterations; i++) { + int offset = i * SB; + if (tid + offset < padding_l) { + data[tid + offset] = scalar_t(0.0); + } else if (tid + offset < FS) { + data[SB + tid + offset] = scalar_t(0.0); + } + } + } +} + +template +__inline__ __device__ scalar_t warpReduce(scalar_t data) { + /* + Reduce an array within each warp. After processing all values in warp will + caontain the sum of all original values in that warp. + + data - pointer to data to reduce + */ + data += __shfl_xor_sync(SHFL_MASK, data, 16); + data += __shfl_xor_sync(SHFL_MASK, data, 8); + data += __shfl_xor_sync(SHFL_MASK, data, 4); + data += __shfl_xor_sync(SHFL_MASK, data, 2); + data += __shfl_xor_sync(SHFL_MASK, data, 1); + return data; +} + +template +__inline__ __device__ scalar_t blockReduce(scalar_t data) { + /* + Reduce an entire array on the block level. After processing, the + first value in the array will contain the reduced sum. + + data - pointer to data to reduce + */ + + static __shared__ scalar_t warpSum[32]; + const int tid = threadIdx.x; + int wid = tid / 32; + int lane = tid % 32; + + __syncthreads(); + + // reduce each warp then write to shared memory + scalar_t sum = warpReduce(data); + if (lane == 0) { + warpSum[wid] = sum; + } + + __syncthreads(); + + scalar_t v; + // perform final sum of partial warp sums + if (tid < blockDim.x / 32) { + v = warpSum[lane]; + } else { + v = scalar_t(0.0); + } + + if (wid == 0) { + v = warpReduce(v); + } + __syncthreads(); + + return v; +} + +void checkCudaStatus(cudaError_t status, int lineNumber = -1) { + if (status != cudaSuccess) { + std::cout << cudaGetErrorString(status) << " at line " << lineNumber + << std::endl; + std::cout << "Exiting" << std::endl; + exit(1); + } +} + +template +__device__ void load_input_to_shared( + const scalar_t* input, // global memory + int inputOffset, + int sequenceLength, + int iteration, + int numIterations, + bool no_prev, + scalar_t* output /* shared memory */) { + /* + Load a block size of input into shared memory with + right and left overhang of total size FS. If previously + loaded memory, overlap will be shifted over to reduce + global memory access + + input - pointer to start of channel sequence + inputOffset - how far in the sequence to start loading + sequenceLength - total length of sequence + iteration - which block of sequence we are loading + numIterations - total number of blocks to load + no_prev - whether to load the whole block if the previous block + wasn't loaded + output - shared memory to write input to + */ + + const int tid = threadIdx.x; + + // Load the left "overhang" of input + if (iteration > 0) { + if (padding_l < SB) { + // load all at once + if (tid < padding_l) { + output[tid] = + (no_prev) ? input[inputOffset - padding_l + tid] : output[tid + SB]; + } + } else { + // load in chunks of size SB + int numIterations = divUp(padding_l, SB); + for (int i = 0; i < numIterations; i++) { + int offset = i * SB; + if ((tid + offset) < padding_l) { + output[tid + offset] = (no_prev) + ? input[inputOffset - padding_l + tid + offset] + : output[tid + offset + SB]; + } + } + } + } + + // Load the right "overhang" of input + if (iteration < (numIterations - 1)) { + const int elementsLeft = sequenceLength - (iteration + 1) * SB; + + if ((FS - padding_l) < SB) { + // load all at once + if (tid < (FS - padding_l)) { + output[padding_l + SB + tid] = (tid < elementsLeft) + ? input[inputOffset + SB + tid] + : scalar_t(0.0); + } + } else { + // load in chunks of size SB + int numIterations = divUp(FS - padding_l, SB); + for (int i = 0; i < numIterations; i++) { + int offset = i * SB; + if ((tid + offset) < (FS - padding_l)) { + output[padding_l + SB + tid + offset] = + ((tid + offset) < elementsLeft) + ? input[inputOffset + SB + tid + offset] + : scalar_t(0.0); + } + } + } + } + + // We should also clear out the right "overhang" + if (iteration == (numIterations - 1)) { + if ((FS - padding_l) < SB) { + // clear out all at once + if (tid < (FS - padding_l)) { + output[padding_l + SB + tid] = scalar_t(0.0); + } + } else { + // clear in chunks of size SB + int numIterations = divUp(FS - padding_l, SB); + for (int i = 0; i < numIterations; i++) { + int offset = i * SB; + if ((tid + offset) < (FS - padding_l)) { + output[padding_l + SB + tid + offset] = scalar_t(0.0); + } + } + } + } + output[tid + padding_l] = ((inputOffset + tid) < sequenceLength) + ? input[inputOffset + tid] + : scalar_t(0.0); +} diff --git a/fairseq/fairseq/modules/espnet_multihead_attention.py b/fairseq/fairseq/modules/espnet_multihead_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..82bc0d7b452b47b3135548f89c098272c8699c0a --- /dev/null +++ b/fairseq/fairseq/modules/espnet_multihead_attention.py @@ -0,0 +1,256 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2019 Shigeki Karita +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +"""Multi-Head Attention layer definition.""" + +import math + +import torch +from torch import nn + +from fairseq.modules.rotary_positional_embedding import ( + RotaryPositionalEmbedding, + apply_rotary_pos_emb, +) + + +class ESPNETMultiHeadedAttention(nn.Module): + """Multi-Head Attention layer. + Args: + n_head: The number of heads. + n_feat: The number of features. + dropout: Dropout rate. + """ + + def __init__(self, n_feat, n_head, dropout): + """Construct an MultiHeadedAttention object.""" + super(ESPNETMultiHeadedAttention, self).__init__() + assert n_feat % n_head == 0 + # We assume d_v always equals d_k + self.d_k = n_feat // n_head + self.h = n_head + self.linear_q = nn.Linear(n_feat, n_feat) + self.linear_k = nn.Linear(n_feat, n_feat) + self.linear_v = nn.Linear(n_feat, n_feat) + self.linear_out = nn.Linear(n_feat, n_feat) + self.attn = None + self.dropout = nn.Dropout(p=dropout) + + def forward_qkv(self, query, key, value, **kwargs): + """Transform query, key and value. + Args: + query: Query tensor B X T1 X C + key: Key tensor B X T2 X C + value: Value tensor B X T2 X C + Returns: + torch.Tensor: Transformed query tensor B X n_head X T1 X d_k + torch.Tensor: Transformed key tensor B X n_head X T2 X d_k + torch.Tensor: Transformed value tensor B X n_head X T2 X d_k + """ + n_batch = query.size(0) + q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) + k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) + v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) + q = q.transpose(1, 2) # (batch, head, time1, d_k) + k = k.transpose(1, 2) # (batch, head, time2, d_k) + v = v.transpose(1, 2) # (batch, head, time2, d_k) + return q, k, v + + def forward_attention(self, value, scores, mask): + """Compute attention context vector. + Args: + value: Transformed value B X n_head X T2 X d_k. + scores: Attention score B X n_head X T1 X T2 + mask: Mask T2 X B + Returns: + torch.Tensor: Transformed value B X T1 X d_model + weighted by the attention score B X T1 X T2 + """ + n_batch = value.size(0) + if mask is not None: + scores = scores.masked_fill( + mask.unsqueeze(1).unsqueeze(2).to(bool), + float("-inf"), # (batch, head, time1, time2) + ) + self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) + + else: + self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) + p_attn = self.dropout(self.attn) + x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) + x = ( + x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k) + ) # (batch, time1, d_model) + + return self.linear_out(x) # (batch, time1, d_model) + + def forward(self, query, key, value, key_padding_mask=None, **kwargs): + """Compute scaled dot product attention. + Args: + query (torch.Tensor): Query tensor T X B X C + key (torch.Tensor): Key tensor T X B X C + value (torch.Tensor): Value tensor T X B X C + mask (torch.Tensor): Mask tensor T X B + Returns: + torch.Tensor: Output tensor T X B X D. + """ + query = query.transpose(0, 1) + key = key.transpose(0, 1) + value = value.transpose(0, 1) + + q, k, v = self.forward_qkv(query, key, value) + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) + scores = self.forward_attention(v, scores, key_padding_mask) + scores = scores.transpose(0, 1) + return scores, None + + +class RelPositionMultiHeadedAttention(ESPNETMultiHeadedAttention): + """Multi-Head Attention layer with relative position encoding. + Paper: https://arxiv.org/abs/1901.02860 + Args: + n_head: The number of heads. + n_feat: The number of features. + dropout: Dropout rate. + zero_triu: Whether to zero the upper triangular part of attention matrix. + """ + + def __init__(self, n_feat, n_head, dropout, zero_triu=False): + """Construct an RelPositionMultiHeadedAttention object.""" + super().__init__(n_feat, n_head, dropout) + self.zero_triu = zero_triu + # linear transformation for positional encoding + self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) + # these two learnable bias are used in matrix c and matrix d + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + self.pos_bias_u = nn.Parameter(torch.zeros(self.h, self.d_k)) + self.pos_bias_v = nn.Parameter(torch.zeros(self.h, self.d_k)) + torch.nn.init.xavier_uniform_(self.pos_bias_u) + torch.nn.init.xavier_uniform_(self.pos_bias_v) + + def rel_shift(self, x): + """Compute relative positional encoding. + Args: + x: Input tensor B X n_head X T X 2T-1 + Returns: + torch.Tensor: Output tensor. + """ + zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=-1) + + x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2)) + x = x_padded[:, :, 1:].view_as(x)[ + :, :, :, : x.size(-1) // 2 + 1 + ] # only keep the positions from 0 to time2 + + if self.zero_triu: + ones = torch.ones((x.size(2), x.size(3)), device=x.device) + x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] + + return x + + def forward(self, query, key, value, pos_emb, key_padding_mask=None, **kwargs): + """Compute scaled dot product attention. + Args: + query: Query tensor T X B X C + key: Key tensor T X B X C + value: Value tensor T X B X C + pos_emb: Positional embedding tensor B X 2T-1 X C + key_padding_mask: Mask tensor T X B + Returns: + torch.Tensor: Output tensor T X B X C. + """ + query = query.transpose(0, 1) + key = key.transpose(0, 1) + value = value.transpose(0, 1) + pos_emb = pos_emb.transpose(0, 1) + q, k, v = self.forward_qkv(query, key, value) + q = q.transpose(1, 2) # (batch, time1, head, d_k) + n_batch_pos = pos_emb.size(0) + p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) + p = p.transpose(1, 2) # (batch, head, 2*time1-1, d_k) + + # (batch, head, time1, d_k) + q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) + # (batch, head, time1, d_k) + q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) + + # compute attention score + # first compute matrix a and matrix c + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + # (batch, head, time1, time2) + matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) + + # compute matrix b and matrix d + # (batch, head, time1, 2*time1-1) + matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) + matrix_bd = self.rel_shift(matrix_bd) + + scores = (matrix_ac + matrix_bd) / math.sqrt( + self.d_k + ) # (batch, head, time1, time2) + + scores = self.forward_attention(v, scores, key_padding_mask) + scores = scores.transpose(0, 1) + return scores, None + + +class RotaryPositionMultiHeadedAttention(ESPNETMultiHeadedAttention): + def __init__( + self, + n_feat, + n_head, + dropout, + precision, + rotary_emd_base=10000, + ): + """Construct an RotaryPositionMultiHeadedAttention object.""" + super().__init__(n_feat, n_head, dropout) + precision = torch.float + self.rotary_ndims = self.d_k # also try self.d_k//2 + if precision == "fp16": + precision = torch.half + + self.rotary_emb = RotaryPositionalEmbedding( + self.rotary_ndims, base=rotary_emd_base, precision=precision + ) + + def forward(self, query, key, value, key_padding_mask=None, **kwargs): + """Compute rotary position attention. + Args: + query: Query tensor T X B X C + key: Key tensor T X B X C + value: Value tensor T X B X C + key_padding_mask: Mask tensor T X B + Returns: + torch.Tensor: Output tensor T X B X D. + Notes: + Assumes self attn + """ + + T, B, C = value.size() + query = query.view(T, B, self.h, self.d_k) + key = key.view(T, B, self.h, self.d_k) + value = value.view(T, B, self.h, self.d_k) + cos, sin = self.rotary_emb(value, seq_len=T) + query, key = apply_rotary_pos_emb( + query, key, cos, sin, offset=0 + ) # offset is based on layer_past + + query = query.view(T, B, self.h * self.d_k) + key = key.view(T, B, self.h * self.d_k) + value = value.view(T, B, self.h * self.d_k) + + # TBD to BTD + query = query.transpose(0, 1) + key = key.transpose(0, 1) + value = value.transpose(0, 1) + + q, k, v = self.forward_qkv(query, key, value) + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) + scores = self.forward_attention(v, scores, key_padding_mask) + scores = scores.transpose(0, 1) + return scores, None