# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import logging
import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from collections import namedtuple
from dataclasses import dataclass
from functools import partial
from omegaconf import MISSING, II
from typing import Optional, Callable

from funasr_detach.models.emotion2vec.fairseq_modules import compute_mask_indices
from funasr_detach.models.emotion2vec.fairseq_modules import GradMultiply
from funasr_detach.models.emotion2vec.fairseq_modules import index_put


logger = logging.getLogger(__name__)


MaskSeed = namedtuple("MaskSeed", ["seed", "update", "ids"])
MaskInfo = namedtuple("MaskInfo", ["x_unmasked", "mask", "ids_restore", "ids_keep"])


class ModalitySpecificEncoder(nn.Module):
    def __init__(
        self,
        modality_cfg,
        embed_dim: int,
        local_encoder: nn.Module,
        project_features: nn.Module,
        fixed_positional_encoder: Optional[nn.Module],
        relative_positional_encoder: Optional[nn.Module],
        context_encoder: nn.Module,
        decoder: nn.Module,
        get_alibi_bias: Optional[Callable[[int, int, str, str], torch.Tensor]],
    ):
        super().__init__()

        self.modality_cfg = modality_cfg
        self.local_encoder = local_encoder
        self.project_features = project_features
        self.fixed_positional_encoder = fixed_positional_encoder
        self.relative_positional_encoder = relative_positional_encoder
        self.context_encoder = context_encoder

        self.decoder = decoder
        self.get_alibi_bias = get_alibi_bias if modality_cfg.use_alibi_encoder else None

        self.local_grad_mult = self.modality_cfg.local_grad_mult

        self.extra_tokens = None
        if modality_cfg.num_extra_tokens > 0:
            self.extra_tokens = nn.Parameter(
                torch.zeros(1, modality_cfg.num_extra_tokens, embed_dim)
            )
            if not modality_cfg.init_extra_token_zero:
                nn.init.normal_(self.extra_tokens)
            elif self.extra_tokens.size(1) > 1:
                nn.init.normal_(self.extra_tokens[:, 1:])

        self.alibi_scale = None
        if self.get_alibi_bias is not None:
            self.alibi_scale = nn.Parameter(
                torch.full(
                    (
                        (
                            (modality_cfg.prenet_depth + modality_cfg.model_depth)
                            if modality_cfg.learned_alibi_scale_per_layer
                            else 1
                        ),
                        1,
                        (
                            self.modality_cfg.num_alibi_heads
                            if modality_cfg.learned_alibi_scale_per_head
                            else 1
                        ),
                        1,
                        1,
                    ),
                    modality_cfg.alibi_scale,
                    dtype=torch.float,
                ),
                requires_grad=modality_cfg.learned_alibi_scale,
            )

        if modality_cfg.learned_alibi and self.get_alibi_bias is not None:
            assert modality_cfg.alibi_max_pos is not None
            alibi_bias = self.get_alibi_bias(
                batch_size=1,
                time_steps=modality_cfg.alibi_max_pos,
                heads=modality_cfg.num_alibi_heads,
                scale=1.0,
                dtype=torch.float,
                device="cpu",
            )
            self.alibi_bias = nn.Parameter(alibi_bias)
            self.get_alibi_bias = partial(
                _learned_alibi_bias, alibi_bias=self.alibi_bias
            )

    def upgrade_state_dict_named(self, state_dict, name):
        k = f"{name}.alibi_scale"
        if k in state_dict and state_dict[k].dim() == 4:
            state_dict[k] = state_dict[k].unsqueeze(0)

        return state_dict

    def convert_padding_mask(self, x, padding_mask):
        return padding_mask

    def decoder_input(self, x, mask_info: MaskInfo):
        inp_drop = self.modality_cfg.decoder.input_dropout
        if inp_drop > 0:
            x = F.dropout(x, inp_drop, training=self.training, inplace=True)

        num_extra = self.modality_cfg.num_extra_tokens

        if mask_info is not None:
            num_masked = mask_info.ids_restore.shape[1] - x.shape[1] + num_extra

            mask_tokens = x.new_empty(
                x.size(0),
                num_masked,
                x.size(-1),
            ).normal_(0, self.modality_cfg.mask_noise_std)

            x_ = torch.cat([x[:, num_extra:], mask_tokens], dim=1)
            x = torch.gather(x_, dim=1, index=mask_info.ids_restore)

            if self.modality_cfg.decoder.add_positions_masked:
                assert self.fixed_positional_encoder is not None
                pos = self.fixed_positional_encoder(x, None)
                x = x + (pos * mask_info.mask.unsqueeze(-1))
        else:
            x = x[:, num_extra:]

        if self.modality_cfg.decoder.add_positions_all:
            assert self.fixed_positional_encoder is not None
            x = x + self.fixed_positional_encoder(x, None)

        return x, mask_info

    def local_features(self, features):
        if self.local_grad_mult > 0:
            if self.local_grad_mult == 1.0:
                x = self.local_encoder(features)
            else:
                x = GradMultiply.apply(
                    self.local_encoder(features), self.local_grad_mult
                )
        else:
            with torch.no_grad():
                x = self.local_encoder(features)

        x = self.project_features(x)
        return x

    def contextualized_features(
        self,
        x,
        padding_mask,
        mask,
        remove_masked,
        clone_batch: int = 1,
        mask_seeds: Optional[torch.Tensor] = None,
        precomputed_mask=None,
    ):
        if padding_mask is not None:
            padding_mask = self.convert_padding_mask(x, padding_mask)

        local_features = x
        if mask and clone_batch == 1:
            local_features = local_features.clone()

        orig_B, orig_T, _ = x.shape
        pre_mask_B = orig_B
        mask_info = None

        x_pos = None
        if self.fixed_positional_encoder is not None:
            x = x + self.fixed_positional_encoder(x, padding_mask)

        if mask:
            if clone_batch > 1:
                x = x.repeat_interleave(clone_batch, 0)
                if mask_seeds is not None:
                    clone_hash = [
                        int(hash((mask_seeds.seed, ind)) % 1e10)
                        for ind in range(clone_batch - 1)
                    ]
                    clone_hash = torch.tensor([0] + clone_hash).long().view(1, -1)

                    id = mask_seeds.ids
                    id = id.repeat_interleave(clone_batch, 0)
                    id = id.view(-1, clone_batch) + clone_hash.to(id)
                    id = id.view(-1)
                    mask_seeds = MaskSeed(
                        seed=mask_seeds.seed, update=mask_seeds.update, ids=id
                    )
                if padding_mask is not None:
                    padding_mask = padding_mask.repeat_interleave(clone_batch, 0)

            x, mask_info = self.compute_mask(
                x,
                padding_mask,
                mask_seed=mask_seeds,
                apply=self.relative_positional_encoder is not None or not remove_masked,
                precomputed_mask=precomputed_mask,
            )

        if self.relative_positional_encoder is not None:
            x_pos = self.relative_positional_encoder(x)

        masked_padding_mask = padding_mask
        if mask and remove_masked:
            x = mask_info.x_unmasked
            if x_pos is not None:
                x = x + gather_unmasked(x_pos, mask_info)

            if padding_mask is not None and padding_mask.any():
                masked_padding_mask = gather_unmasked_mask(padding_mask, mask_info)
                if not masked_padding_mask.any():
                    masked_padding_mask = None
            else:
                masked_padding_mask = None

        elif x_pos is not None:
            x = x + x_pos

        alibi_bias = None
        alibi_scale = self.alibi_scale

        if self.get_alibi_bias is not None:
            alibi_bias = self.get_alibi_bias(
                batch_size=pre_mask_B,
                time_steps=orig_T,
                heads=self.modality_cfg.num_alibi_heads,
                dtype=torch.float32,
                device=x.device,
            )

            if alibi_scale is not None:
                alibi_scale = alibi_scale.clamp_min(0)
                if alibi_scale.size(0) == 1:
                    alibi_bias = alibi_bias * alibi_scale.squeeze(0).type_as(alibi_bias)
                    alibi_scale = None

            if clone_batch > 1:
                alibi_bias = alibi_bias.repeat_interleave(clone_batch, 0)

            if mask_info is not None and remove_masked:
                alibi_bias = masked_alibi(alibi_bias, mask_info)

        if self.extra_tokens is not None:
            num = self.extra_tokens.size(1)
            x = torch.cat([self.extra_tokens.expand(x.size(0), -1, -1), x], dim=1)
            if masked_padding_mask is not None:
                # B x T
                masked_padding_mask = F.pad(masked_padding_mask, (num, 0))
            if alibi_bias is not None:
                # B x H x T x T
                alibi_bias = F.pad(alibi_bias, (num, 0, num, 0))

        x = self.context_encoder(
            x,
            masked_padding_mask,
            alibi_bias,
            (
                alibi_scale[: self.modality_cfg.prenet_depth]
                if alibi_scale is not None
                else None
            ),
        )

        return {
            "x": x,
            "local_features": local_features,
            "padding_mask": masked_padding_mask,
            "alibi_bias": alibi_bias,
            "alibi_scale": (
                alibi_scale[self.modality_cfg.prenet_depth :]
                if alibi_scale is not None and alibi_scale.size(0) > 1
                else alibi_scale
            ),
            "encoder_mask": mask_info,
        }

    def forward(
        self,
        features,
        padding_mask,
        mask: bool,
        remove_masked: bool,
        clone_batch: int = 1,
        mask_seeds: Optional[torch.Tensor] = None,
        precomputed_mask=None,
    ):
        x = self.local_features(features)
        return self.contextualized_features(
            x,
            padding_mask,
            mask,
            remove_masked,
            clone_batch,
            mask_seeds,
            precomputed_mask,
        )

    def reset_parameters(self):
        pass

    def compute_mask(
        self,
        x,
        padding_mask,
        mask_seed: Optional[MaskSeed],
        apply,
        precomputed_mask,
    ):
        if precomputed_mask is not None:
            mask = precomputed_mask
            mask_info = self.make_maskinfo(x, mask)
        else:
            B, T, C = x.shape
            cfg = self.modality_cfg

            mask_prob = cfg.mask_prob

            if (
                cfg.mask_prob_min is not None
                and cfg.mask_prob_min >= 0
                and cfg.mask_prob_min < mask_prob
            ):
                mask_prob = np.random.uniform(cfg.mask_prob_min, mask_prob)

            if mask_prob > 0:
                if cfg.mask_length == 1:
                    mask_info = random_masking(x, mask_prob, mask_seed)
                else:
                    if self.modality_cfg.inverse_mask:
                        mask_prob = 1 - mask_prob

                    mask = compute_mask_indices(
                        (B, T),
                        padding_mask,
                        mask_prob,
                        cfg.mask_length,
                        min_masks=1,
                        require_same_masks=True,
                        mask_dropout=cfg.mask_dropout,
                        add_masks=cfg.add_masks,
                        seed=mask_seed.seed if mask_seed is not None else None,
                        epoch=mask_seed.update if mask_seed is not None else None,
                        indices=mask_seed.ids if mask_seed is not None else None,
                    )

                    mask = torch.from_numpy(mask).to(device=x.device)
                    if self.modality_cfg.inverse_mask:
                        mask = 1 - mask
                    mask_info = self.make_maskinfo(x, mask)
            else:
                mask_info = None

        if apply:
            x = self.apply_mask(x, mask_info)

        return x, mask_info

    def make_maskinfo(self, x, mask, shape=None):
        if shape is None:
            B, T, D = x.shape
        else:
            B, T, D = shape

        mask = mask.to(torch.uint8)
        ids_shuffle = mask.argsort(dim=1)
        ids_restore = ids_shuffle.argsort(dim=1).unsqueeze(-1).expand(-1, -1, D)

        len_keep = T - mask[0].sum()
        if self.modality_cfg.keep_masked_pct > 0:
            len_keep += round((T - int(len_keep)) * self.modality_cfg.keep_masked_pct)

        ids_keep = ids_shuffle[:, :len_keep]

        if shape is not None:
            x_unmasked = None
        else:
            ids_keep = ids_keep.unsqueeze(-1).expand(-1, -1, D)
            x_unmasked = torch.gather(x, dim=1, index=ids_keep)

        mask_info = MaskInfo(
            x_unmasked=x_unmasked,
            mask=mask,
            ids_restore=ids_restore,
            ids_keep=ids_keep,
        )
        return mask_info

    def apply_mask(self, x, mask_info):
        cfg = self.modality_cfg
        B, T, C = x.shape

        if mask_info is not None:
            mask = mask_info.mask
            if cfg.encoder_zero_mask:
                x = x * (1 - mask.type_as(x).unsqueeze(-1))
            else:
                num_masks = mask.sum().item()
                masks = x.new_empty(num_masks, x.size(-1)).normal_(
                    0, cfg.mask_noise_std
                )
                x = index_put(x, mask, masks)
        if cfg.mask_channel_prob > 0:
            mask_channel = compute_mask_indices(
                (B, C),
                None,
                cfg.mask_channel_prob,
                cfg.mask_channel_length,
            )
            mask_channel = (
                torch.from_numpy(mask_channel)
                .to(x.device)
                .unsqueeze(1)
                .expand(-1, T, -1)
            )
            x = index_put(x, mask_channel, 0)
        return x

    def remove_pretraining_modules(self, keep_decoder=False):
        if not keep_decoder:
            self.decoder = None


def get_annealed_rate(start, end, curr_step, total_steps):
    if curr_step >= total_steps:
        return end
    r = end - start
    pct_remaining = 1 - curr_step / total_steps
    return end - r * pct_remaining
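
# get_annealed_rate above anneals linearly: e.g. get_annealed_rate(0.9, 0.1, 50, 100)
# returns 0.5, halfway between `start` at step 0 and `end` at `total_steps`.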


# adapted from MAE
def random_masking(x, mask_ratio, mask_seed: Optional[MaskSeed]):
    N, L, D = x.shape  # batch, length, dim
    len_keep = int(L * (1 - mask_ratio))

    generator = None
    if mask_seed is not None:
        seed = int(
            hash((mask_seed.seed, mask_seed.update, mask_seed.ids.sum().item())) % 1e6
        )
        generator = torch.Generator(device=x.device)
        generator.manual_seed(seed)

    noise = torch.rand(N, L, generator=generator, device=x.device)  # noise in [0, 1]

    # sort noise for each sample
    ids_shuffle = noise.argsort(dim=1)  # ascend: small is keep, large is remove
    ids_restore = ids_shuffle.argsort(dim=1)

    # keep the first subset
    ids_keep = ids_shuffle[:, :len_keep]
    ids_keep = ids_keep.unsqueeze(-1).expand(-1, -1, D)
    x_unmasked = torch.gather(x, dim=1, index=ids_keep)

    # generate the binary mask: 0 is keep, 1 is remove
    mask = torch.ones([N, L], dtype=x.dtype, device=x.device)
    mask[:, :len_keep] = 0
    # unshuffle to get the binary mask
    mask = torch.gather(mask, dim=1, index=ids_restore)

    ids_restore = ids_restore.unsqueeze(-1).expand(-1, -1, D)

    return MaskInfo(
        x_unmasked=x_unmasked, mask=mask, ids_restore=ids_restore, ids_keep=ids_keep
    )


def gather_unmasked(x: torch.Tensor, mask_info: MaskInfo) -> torch.Tensor:
    return torch.gather(
        x,
        dim=1,
        index=mask_info.ids_keep,
    )


def gather_unmasked_mask(x: torch.Tensor, mask_info: MaskInfo) -> torch.Tensor:
    return torch.gather(
        x,
        dim=1,
        index=mask_info.ids_keep[..., 0],  # ignore the feature dimension
    )


def get_alibi(
    max_positions: int,
    attention_heads: int,
    dims: int = 1,
    distance: str = "manhattan",
):
    def get_slopes(n):
        def get_slopes_power_of_2(n):
            start = 2 ** (-(2 ** -(math.log2(n) - 3)))
            ratio = start
            return [start * ratio**i for i in range(n)]

        # In the paper, we only train models that have 2^a heads for some
        # a. This function has some good properties that only occur when
        # the input is a power of 2. To maintain that even when the number
        # of heads is not a power of 2, we use this workaround.
        if math.log2(n).is_integer():
            return get_slopes_power_of_2(n)
        else:
            closest_power_of_2 = 2 ** math.floor(math.log2(n))
            return (
                get_slopes_power_of_2(closest_power_of_2)
                + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2]
            )

    maxpos = max_positions
    attn_heads = attention_heads
    slopes = torch.Tensor(get_slopes(attn_heads))

    if dims == 1:
        # prepare alibi position linear bias. Note that wav2vec2 is a
        # non-autoregressive model, so we want a symmetric mask with 0 on the
        # diagonal and otherwise linearly decreasing values
        pos_bias = (
            torch.abs(
                torch.arange(maxpos).unsqueeze(0) - torch.arange(maxpos).unsqueeze(1)
            )
            * -1
        )
    elif dims == 2:
        if distance == "manhattan":
            df = lambda x1, y1, x2, y2: abs(x1 - x2) + abs(y1 - y2)
        elif distance == "euclidean":
            df = lambda x1, y1, x2, y2: math.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)

        n = math.sqrt(max_positions)
        assert n.is_integer(), n
        n = int(n)

        pos_bias = torch.zeros((max_positions, max_positions))

        for i in range(n):
            for j in range(n):
                for k in range(n):
                    for l in range(n):
                        new_x = i * n + j
                        new_y = k * n + l
                        pos_bias[new_x, new_y] = -df(i, j, k, l)

    else:
        raise Exception(f"unsupported number of alibi dims: {dims}")

    alibi_bias = slopes.unsqueeze(1).unsqueeze(1) * pos_bias.unsqueeze(0).expand(
        attn_heads, -1, -1
    )

    return alibi_bias


def get_alibi_bias(
    alibi_biases,
    batch_size,
    time_steps,
    heads,
    dtype,
    device,
    dims=1,
    distance="manhattan",
):
    cache_key = f"{dims}_{heads}_{distance}"

    buffered = alibi_biases.get(cache_key, None)

    target_size = heads * batch_size
    if (
        buffered is None
        or buffered.size(0) < target_size
        or buffered.size(1) < time_steps
        or buffered.dtype != dtype
        or buffered.device != device
    ):
        bt = max(time_steps, buffered.size(1) if buffered is not None else 0)
        bn = max(target_size, buffered.size(0) if buffered is not None else 0) // heads

        buffered = (
            get_alibi(bt, heads, dims=dims, distance=distance)
            .to(dtype=dtype, device=device)
            .repeat(bn, 1, 1)
        )

        alibi_biases[cache_key] = buffered

    b = buffered[:target_size, :time_steps, :time_steps]
    b = b.view(batch_size, heads, time_steps, time_steps)
    return b


def _learned_alibi_bias(
    alibi_bias,
    batch_size,
    time_steps,
    heads,
    scale,
    dtype,
    device,
):
    assert alibi_bias.size(1) == heads, alibi_bias.shape
    assert alibi_bias.dtype == dtype, alibi_bias.dtype
    assert alibi_bias.device == device, alibi_bias.device

    if alibi_bias.size(-1) < time_steps:
        psz = math.ceil((time_steps - alibi_bias.size(-1)) / 2)
        alibi_bias = F.pad(alibi_bias, (psz, psz, psz, psz), mode="replicate")

    alibi_bias = alibi_bias.expand(batch_size, -1, -1, -1) * scale
    return alibi_bias[..., :time_steps, :time_steps]


def masked_alibi(alibi_bias, mask_info):
    H = alibi_bias.size(1)

    orig_bias = alibi_bias

    index = mask_info.ids_keep.unsqueeze(1)[..., 0].unsqueeze(-1)
    alibi_bias = torch.gather(
        orig_bias,
        dim=-2,
        index=index.expand(-1, H, -1, mask_info.ids_restore.size(1)),
    )
    alibi_bias = torch.gather(
        alibi_bias,
        dim=-1,
        index=index.transpose(-1, -2).expand(-1, H, alibi_bias.size(-2), -1),
    )

    return alibi_bias
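

# Minimal usage sketch for the standalone helpers above. The shapes and
# hyper-parameters are illustrative values only, and running it assumes the
# imports at the top of this module resolve in your environment.
if __name__ == "__main__":
    torch.manual_seed(0)

    # 1-D ALiBi bias: one symmetric (T x T) penalty matrix per attention head,
    # zero on the diagonal and increasingly negative with distance.
    bias = get_alibi(max_positions=16, attention_heads=8)
    print("alibi bias shape:", tuple(bias.shape))  # (8, 16, 16)

    # MAE-style random masking of a dummy (B, T, D) feature tensor; with
    # mask_ratio=0.5, half of the 16 time steps are kept in each sample.
    feats = torch.randn(2, 16, 32)
    info = random_masking(feats, mask_ratio=0.5, mask_seed=None)
    print("kept tokens:", tuple(info.x_unmasked.shape))  # (2, 8, 32)
    print("masked per sample:", info.mask.sum(dim=1))  # tensor([8., 8.])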