Spaces:

stepfun-ai
/

Step-Audio

Running

Step-Audio / funasr_detach /models /emotion2vec /base.py

martin

initial

67c46fd 5 months ago

21.2 kB

	# Copyright (c) Facebook, Inc. and its affiliates.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.

	import logging
	import math
	import numpy as np
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from collections import namedtuple
	from dataclasses import dataclass
	from functools import partial
	from omegaconf import MISSING, II
	from typing import Optional, Callable
	from funasr_detach.models.emotion2vec.fairseq_modules import compute_mask_indices
	from funasr_detach.models.emotion2vec.fairseq_modules import GradMultiply
	from funasr_detach.models.emotion2vec.fairseq_modules import index_put


	# Module-level logger named after this module.
	logger = logging.getLogger(__name__)


	# Seeding information for mask generation. In this file the fields are
	# forwarded to compute_mask_indices as seed=seed, epoch=update, indices=ids,
	# and hashed together by random_masking to seed its torch.Generator.
	MaskSeed = namedtuple("MaskSeed", ["seed", "update", "ids"])
	# Result of masking a batch:
	#   x_unmasked  - input entries gathered at the kept positions (None when
	#                 make_maskinfo is called with an explicit shape)
	#   mask        - per-position mask; 0 is keep, 1 is remove
	#   ids_restore - argsort of the shuffle order, expanded over the feature dim
	#   ids_keep    - indices of the kept positions
	MaskInfo = namedtuple("MaskInfo", ["x_unmasked", "mask", "ids_restore", "ids_keep"])


	class ModalitySpecificEncoder(nn.Module):
	    """Encoder front-end for a single modality.

	    The forward pass runs ``local_encoder`` and ``project_features`` on the
	    raw input, optionally adds positional encodings, optionally masks the
	    sequence, prepends learned extra tokens, and finally runs
	    ``context_encoder``. The class also prepares the decoder input
	    (``decoder_input``) and owns the optional ALiBi scale/bias parameters.
	    """

	    def __init__(
	        self,
	        modality_cfg,
	        embed_dim: int,
	        local_encoder: nn.Module,
	        project_features: nn.Module,
	        fixed_positional_encoder: Optional[nn.Module],
	        relative_positional_encoder: Optional[nn.Module],
	        context_encoder: nn.Module,
	        decoder: nn.Module,
	        get_alibi_bias: Optional[Callable[[int, int, str, str], torch.Tensor]],
	    ):
	        """Store the sub-modules and create the optional learned parameters.

	        Args:
	            modality_cfg: Configuration object; the attributes read by this
	                class include ``use_alibi_encoder``, ``local_grad_mult``,
	                ``num_extra_tokens``, ``init_extra_token_zero``,
	                ``prenet_depth``, ``model_depth``, ``num_alibi_heads``,
	                ``alibi_scale``, the ``learned_alibi*`` flags, the masking
	                options and ``decoder``.
	            embed_dim: Feature size of the extra tokens.
	            local_encoder: Module applied to the raw features.
	            project_features: Module applied to the local encoder output.
	            fixed_positional_encoder: Optional module called as
	                ``enc(x, padding_mask)``; its output is added to ``x``.
	            relative_positional_encoder: Optional module called as
	                ``enc(x)``; its output is added to ``x``.
	            context_encoder: Module called with
	                ``(x, padding_mask, alibi_bias, alibi_scale)``.
	            decoder: Decoder module; only stored here.
	            get_alibi_bias: Optional callable producing the ALiBi bias. It
	                is kept only when ``modality_cfg.use_alibi_encoder`` is set.
	        """
	        super().__init__()

	        self.modality_cfg = modality_cfg
	        self.local_encoder = local_encoder
	        self.project_features = project_features
	        self.fixed_positional_encoder = fixed_positional_encoder
	        self.relative_positional_encoder = relative_positional_encoder
	        self.context_encoder = context_encoder

	        self.decoder = decoder
	        self.get_alibi_bias = get_alibi_bias if modality_cfg.use_alibi_encoder else None

	        self.local_grad_mult = self.modality_cfg.local_grad_mult

	        # Learned tokens of shape (1, num_extra_tokens, embed_dim) that are
	        # prepended to the sequence in contextualized_features.
	        self.extra_tokens = None
	        if modality_cfg.num_extra_tokens > 0:
	            self.extra_tokens = nn.Parameter(
	                torch.zeros(1, modality_cfg.num_extra_tokens, embed_dim)
	            )
	            if not modality_cfg.init_extra_token_zero:
	                nn.init.normal_(self.extra_tokens)
	            elif self.extra_tokens.size(1) > 1:
	                # The first token stays zero; the remaining ones are
	                # normally initialised.
	                nn.init.normal_(self.extra_tokens[:, 1:])

	        # ALiBi scale of shape (layers or 1, 1, heads or 1, 1, 1), filled
	        # with modality_cfg.alibi_scale; trainable only when
	        # learned_alibi_scale is set.
	        self.alibi_scale = None
	        if self.get_alibi_bias is not None:
	            self.alibi_scale = nn.Parameter(
	                torch.full(
	                    (
	                        (
	                            (modality_cfg.prenet_depth + modality_cfg.model_depth)
	                            if modality_cfg.learned_alibi_scale_per_layer
	                            else 1
	                        ),
	                        1,
	                        (
	                            self.modality_cfg.num_alibi_heads
	                            if modality_cfg.learned_alibi_scale_per_head
	                            else 1
	                        ),
	                        1,
	                        1,
	                    ),
	                    modality_cfg.alibi_scale,
	                    dtype=torch.float,
	                ),
	                requires_grad=modality_cfg.learned_alibi_scale,
	            )

	        if modality_cfg.learned_alibi and self.get_alibi_bias is not None:
	            assert modality_cfg.alibi_max_pos is not None
	            # Materialise the bias once for alibi_max_pos positions and turn
	            # it into a parameter.
	            # NOTE(review): this call passes ``scale=1.0``; the supplied
	            # callable must accept that keyword - confirm against the caller.
	            alibi_bias = self.get_alibi_bias(
	                batch_size=1,
	                time_steps=modality_cfg.alibi_max_pos,
	                heads=modality_cfg.num_alibi_heads,
	                scale=1.0,
	                dtype=torch.float,
	                device="cpu",
	            )
	            self.alibi_bias = nn.Parameter(alibi_bias)
	            # From here on the bias comes from the learned parameter.
	            self.get_alibi_bias = partial(
	                _learned_alibi_bias, alibi_bias=self.alibi_bias
	            )

	    def upgrade_state_dict_named(self, state_dict, name):
	        """Upgrade a 4-D ``{name}.alibi_scale`` entry by adding a leading dim.

	        Returns the (mutated) ``state_dict``.
	        """
	        k = f"{name}.alibi_scale"
	        if k in state_dict and state_dict[k].dim() == 4:
	            state_dict[k] = state_dict[k].unsqueeze(0)

	        return state_dict

	    def convert_padding_mask(self, x, padding_mask):
	        """Return ``padding_mask`` unchanged (presumably a subclass hook)."""
	        return padding_mask

	    def decoder_input(self, x, mask_info: MaskInfo):
	        """Build the decoder input from the encoder output.

	        Applies input dropout, strips the extra tokens and, when
	        ``mask_info`` is given, re-inserts noise tokens at the removed
	        positions so the sequence is back at its original order and length.

	        Returns:
	            Tuple ``(x, mask_info)``.
	        """
	        inp_drop = self.modality_cfg.decoder.input_dropout
	        if inp_drop > 0:
	            x = F.dropout(x, inp_drop, training=self.training, inplace=True)

	        num_extra = self.modality_cfg.num_extra_tokens

	        if mask_info is not None:
	            # Number of positions missing from x relative to the full
	            # sequence length recorded in ids_restore.
	            num_masked = mask_info.ids_restore.shape[1] - x.shape[1] + num_extra

	            # Mask tokens are fresh Gaussian noise with std mask_noise_std.
	            mask_tokens = x.new_empty(
	                x.size(0),
	                num_masked,
	                x.size(-1),
	            ).normal_(0, self.modality_cfg.mask_noise_std)

	            # Append the mask tokens after the kept ones, then undo the
	            # shuffle with ids_restore.
	            x_ = torch.cat([x[:, num_extra:], mask_tokens], dim=1)
	            x = torch.gather(x_, dim=1, index=mask_info.ids_restore)

	            if self.modality_cfg.decoder.add_positions_masked:
	                assert self.fixed_positional_encoder is not None
	                pos = self.fixed_positional_encoder(x, None)
	                # Positions are added only where mask is non-zero.
	                x = x + (pos * mask_info.mask.unsqueeze(-1))
	        else:
	            x = x[:, num_extra:]

	        if self.modality_cfg.decoder.add_positions_all:
	            assert self.fixed_positional_encoder is not None
	            x = x + self.fixed_positional_encoder(x, None)

	        return x, mask_info

	    def local_features(self, features):
	        """Run the local encoder and the feature projection.

	        ``local_grad_mult`` controls gradients into the local encoder:
	        ``<= 0`` runs it under ``torch.no_grad()``, ``1.0`` runs it normally,
	        and any other positive value wraps the output in ``GradMultiply``.
	        """
	        if self.local_grad_mult > 0:
	            if self.local_grad_mult == 1.0:
	                x = self.local_encoder(features)
	            else:
	                x = GradMultiply.apply(
	                    self.local_encoder(features), self.local_grad_mult
	                )
	        else:
	            with torch.no_grad():
	                x = self.local_encoder(features)

	        x = self.project_features(x)
	        return x

	    def contextualized_features(
	        self,
	        x,
	        padding_mask,
	        mask,
	        remove_masked,
	        clone_batch: int = 1,
	        mask_seeds: Optional[torch.Tensor] = None,
	        precomputed_mask=None,
	    ):
	        """Add positions, optionally mask, and run the context encoder.

	        Args:
	            x: Local features; unpacked as ``(B, T, _)``, so 3-D.
	            padding_mask: Optional padding mask (``.any()`` is called on it;
	                presumably boolean ``B x T`` with True marking padding -
	                TODO confirm).
	            mask: Whether to compute a mask for ``x``.
	            remove_masked: When masking, feed only the kept positions
	                (``mask_info.x_unmasked``) to the context encoder.
	            clone_batch: Each sample is repeated this many times along dim 0
	                before masking.
	            mask_seeds: Optional seeding info. Annotated as a tensor, but
	                ``.seed``, ``.update`` and ``.ids`` are accessed, i.e. it is
	                used like a ``MaskSeed``.
	            precomputed_mask: Optional mask used instead of sampling one.

	        Returns:
	            Dict with keys ``"x"``, ``"local_features"``, ``"padding_mask"``,
	            ``"alibi_bias"``, ``"alibi_scale"`` and ``"encoder_mask"``.
	        """

	        if padding_mask is not None:
	            padding_mask = self.convert_padding_mask(x, padding_mask)

	        local_features = x
	        if mask and clone_batch == 1:
	            # Presumably protects the returned local features from later
	            # in-place masking of x - TODO confirm.
	            local_features = local_features.clone()

	        orig_B, orig_T, _ = x.shape
	        pre_mask_B = orig_B
	        mask_info = None

	        x_pos = None
	        if self.fixed_positional_encoder is not None:
	            x = x + self.fixed_positional_encoder(x, padding_mask)

	        if mask:
	            if clone_batch > 1:
	                x = x.repeat_interleave(clone_batch, 0)
	                if mask_seeds is not None:
	                    # One id offset per clone; the first clone keeps offset 0
	                    # so its ids are unchanged.
	                    clone_hash = [
	                        int(hash((mask_seeds.seed, ind)) % 1e10)
	                        for ind in range(clone_batch - 1)
	                    ]
	                    clone_hash = torch.tensor([0] + clone_hash).long().view(1, -1)

	                    # NOTE(review): ``id`` shadows the builtin of the same name.
	                    id = mask_seeds.ids
	                    id = id.repeat_interleave(clone_batch, 0)
	                    id = id.view(-1, clone_batch) + clone_hash.to(id)
	                    id = id.view(-1)
	                    mask_seeds = MaskSeed(
	                        seed=mask_seeds.seed, update=mask_seeds.update, ids=id
	                    )
	                if padding_mask is not None:
	                    padding_mask = padding_mask.repeat_interleave(clone_batch, 0)

	            # The mask is written into x only when a relative positional
	            # encoder will consume x, or when masked positions stay in the
	            # sequence (remove_masked is False).
	            x, mask_info = self.compute_mask(
	                x,
	                padding_mask,
	                mask_seed=mask_seeds,
	                apply=self.relative_positional_encoder is not None or not remove_masked,
	                precomputed_mask=precomputed_mask,
	            )

	        if self.relative_positional_encoder is not None:
	            x_pos = self.relative_positional_encoder(x)

	        masked_padding_mask = padding_mask
	        if mask and remove_masked:
	            # Continue with the kept positions only.
	            x = mask_info.x_unmasked
	            if x_pos is not None:
	                x = x + gather_unmasked(x_pos, mask_info)

	            if padding_mask is not None and padding_mask.any():
	                masked_padding_mask = gather_unmasked_mask(padding_mask, mask_info)
	                if not masked_padding_mask.any():
	                    masked_padding_mask = None
	            else:
	                masked_padding_mask = None

	        elif x_pos is not None:
	            x = x + x_pos

	        alibi_bias = None
	        alibi_scale = self.alibi_scale

	        if self.get_alibi_bias is not None:
	            # The bias is built for the pre-clone batch size and the full
	            # (pre-mask) sequence length.
	            # NOTE(review): no ``scale`` is passed here; confirm that the
	            # learned-ALiBi callable installed in __init__ can be called
	            # without it.
	            alibi_bias = self.get_alibi_bias(
	                batch_size=pre_mask_B,
	                time_steps=orig_T,
	                heads=self.modality_cfg.num_alibi_heads,
	                dtype=torch.float32,
	                device=x.device,
	            )

	            if alibi_scale is not None:
	                alibi_scale = alibi_scale.clamp_min(0)
	                if alibi_scale.size(0) == 1:
	                    # A single shared scale is folded into the bias and is
	                    # not passed on separately.
	                    alibi_bias = alibi_bias * alibi_scale.squeeze(0).type_as(alibi_bias)
	                    alibi_scale = None

	            if clone_batch > 1:
	                alibi_bias = alibi_bias.repeat_interleave(clone_batch, 0)

	            if mask_info is not None and remove_masked:
	                # Restrict the bias to the kept rows and columns.
	                alibi_bias = masked_alibi(alibi_bias, mask_info)

	        if self.extra_tokens is not None:
	            num = self.extra_tokens.size(1)
	            x = torch.cat([self.extra_tokens.expand(x.size(0), -1, -1), x], dim=1)
	            # The mask and bias are left-padded so they line up with the
	            # prepended extra tokens.
	            if masked_padding_mask is not None:
	                # B x T
	                masked_padding_mask = F.pad(masked_padding_mask, (num, 0))
	            if alibi_bias is not None:
	                # B x H x T x T
	                alibi_bias = F.pad(alibi_bias, (num, 0, num, 0))

	        # With a per-layer scale, the first prenet_depth entries go to the
	        # context encoder and the remainder is returned below.
	        x = self.context_encoder(
	            x,
	            masked_padding_mask,
	            alibi_bias,
	            (
	                alibi_scale[: self.modality_cfg.prenet_depth]
	                if alibi_scale is not None
	                else None
	            ),
	        )

	        return {
	            "x": x,
	            "local_features": local_features,
	            "padding_mask": masked_padding_mask,
	            "alibi_bias": alibi_bias,
	            "alibi_scale": (
	                alibi_scale[self.modality_cfg.prenet_depth :]
	                if alibi_scale is not None and alibi_scale.size(0) > 1
	                else alibi_scale
	            ),
	            "encoder_mask": mask_info,
	        }

	    def forward(
	        self,
	        features,
	        padding_mask,
	        mask: bool,
	        remove_masked: bool,
	        clone_batch: int = 1,
	        mask_seeds: Optional[torch.Tensor] = None,
	        precomputed_mask=None,
	    ):
	        """Run ``local_features`` followed by ``contextualized_features``.

	        All arguments except ``features`` are forwarded unchanged; the return
	        value is the dict produced by ``contextualized_features``.
	        """
	        x = self.local_features(features)
	        return self.contextualized_features(
	            x,
	            padding_mask,
	            mask,
	            remove_masked,
	            clone_batch,
	            mask_seeds,
	            precomputed_mask,
	        )

	    def reset_parameters(self):
	        """No-op."""
	        pass

	    def compute_mask(
	        self,
	        x,
	        padding_mask,
	        mask_seed: Optional[MaskSeed],
	        apply,
	        precomputed_mask,
	    ):
	        """Build the mask for ``x`` and optionally apply it.

	        Args:
	            x: Input unpacked as ``(B, T, C)``.
	            padding_mask: Forwarded to ``compute_mask_indices``.
	            mask_seed: Optional seeding info for reproducible masks.
	            apply: When true, ``apply_mask`` is run on ``x``.
	            precomputed_mask: If given, used instead of sampling a mask.

	        Returns:
	            Tuple ``(x, mask_info)``; ``mask_info`` is None when no mask was
	            sampled because the mask probability is not positive.
	        """
	        if precomputed_mask is not None:
	            mask = precomputed_mask
	            mask_info = self.make_maskinfo(x, mask)
	        else:
	            B, T, C = x.shape
	            cfg = self.modality_cfg

	            mask_prob = cfg.mask_prob

	            # Optionally draw the mask probability uniformly from
	            # [mask_prob_min, mask_prob).
	            if (
	                cfg.mask_prob_min is not None
	                and cfg.mask_prob_min >= 0
	                and cfg.mask_prob_min < mask_prob
	            ):
	                mask_prob = np.random.uniform(cfg.mask_prob_min, mask_prob)

	            if mask_prob > 0:
	                if cfg.mask_length == 1:
	                    # Length-1 masks use per-position random masking.
	                    mask_info = random_masking(x, mask_prob, mask_seed)
	                else:
	                    # With inverse_mask the complement probability is
	                    # sampled and the resulting mask is flipped below.
	                    if self.modality_cfg.inverse_mask:
	                        mask_prob = 1 - mask_prob

	                    mask = compute_mask_indices(
	                        (B, T),
	                        padding_mask,
	                        mask_prob,
	                        cfg.mask_length,
	                        min_masks=1,
	                        require_same_masks=True,
	                        mask_dropout=cfg.mask_dropout,
	                        add_masks=cfg.add_masks,
	                        seed=mask_seed.seed if mask_seed is not None else None,
	                        epoch=mask_seed.update if mask_seed is not None else None,
	                        indices=mask_seed.ids if mask_seed is not None else None,
	                    )

	                    mask = torch.from_numpy(mask).to(device=x.device)
	                    if self.modality_cfg.inverse_mask:
	                        mask = 1 - mask
	                    mask_info = self.make_maskinfo(x, mask)
	            else:
	                mask_info = None

	        if apply:
	            x = self.apply_mask(x, mask_info)

	        return x, mask_info

	    def make_maskinfo(self, x, mask, shape=None):
	        """Derive a ``MaskInfo`` from a per-position mask.

	        Args:
	            x: Input whose shape supplies ``(B, T, D)`` when ``shape`` is
	                None; also the tensor the kept entries are gathered from.
	            mask: Mask indexed as ``mask[0]`` and sorted along dim 1;
	                non-zero entries mark removed positions.
	            shape: Optional explicit ``(B, T, D)``. When given, nothing is
	                gathered: ``x_unmasked`` is None and ``ids_keep`` stays 2-D.
	        """
	        if shape is None:
	            B, T, D = x.shape
	        else:
	            B, T, D = shape

	        mask = mask.to(torch.uint8)
	        # Ascending argsort places the zero (kept) entries first.
	        ids_shuffle = mask.argsort(dim=1)
	        ids_restore = ids_shuffle.argsort(dim=1).unsqueeze(-1).expand(-1, -1, D)

	        # The kept length is taken from the first row only, i.e. every row
	        # is assumed to mask the same number of positions.
	        len_keep = T - mask[0].sum()
	        if self.modality_cfg.keep_masked_pct > 0:
	            # Additionally keep a fraction of the masked positions.
	            len_keep += round((T - int(len_keep)) * self.modality_cfg.keep_masked_pct)

	        ids_keep = ids_shuffle[:, :len_keep]

	        if shape is not None:
	            x_unmasked = None
	        else:
	            ids_keep = ids_keep.unsqueeze(-1).expand(-1, -1, D)
	            x_unmasked = torch.gather(x, dim=1, index=ids_keep)

	        mask_info = MaskInfo(
	            x_unmasked=x_unmasked,
	            mask=mask,
	            ids_restore=ids_restore,
	            ids_keep=ids_keep,
	        )
	        return mask_info

	    def apply_mask(self, x, mask_info):
	        """Overwrite masked positions of ``x`` and optionally mask channels.

	        Masked positions are zeroed when ``encoder_zero_mask`` is set and
	        replaced by Gaussian noise (std ``mask_noise_std``) otherwise. When
	        ``mask_channel_prob > 0`` a channel mask is sampled and the selected
	        channels are set to 0 across all time steps.
	        """
	        cfg = self.modality_cfg
	        B, T, C = x.shape

	        if mask_info is not None:
	            mask = mask_info.mask
	            if cfg.encoder_zero_mask:
	                x = x * (1 - mask.type_as(x).unsqueeze(-1))
	            else:
	                # One noise vector per masked position.
	                num_masks = mask.sum().item()
	                masks = x.new_empty(num_masks, x.size(-1)).normal_(
	                    0, cfg.mask_noise_std
	                )
	                x = index_put(x, mask, masks)
	        if cfg.mask_channel_prob > 0:
	            mask_channel = compute_mask_indices(
	                (B, C),
	                None,
	                cfg.mask_channel_prob,
	                cfg.mask_channel_length,
	            )
	            # Broadcast the (B, C) channel mask over the time dimension.
	            mask_channel = (
	                torch.from_numpy(mask_channel)
	                .to(x.device)
	                .unsqueeze(1)
	                .expand(-1, T, -1)
	            )
	            x = index_put(x, mask_channel, 0)
	        return x

	    def remove_pretraining_modules(self, keep_decoder=False):
	        """Drop the decoder unless ``keep_decoder`` is true."""
	        if not keep_decoder:
	            self.decoder = None


	def get_annealed_rate(start, end, curr_step, total_steps):
	    """Linearly interpolate from ``start`` to ``end`` over ``total_steps``.

	    Returns ``end`` once ``curr_step`` has reached ``total_steps``; before
	    that, returns the value on the straight line between the two rates.
	    """
	    if curr_step < total_steps:
	        span = end - start
	        fraction_left = 1 - curr_step / total_steps
	        return end - span * fraction_left
	    return end


	# adapted from MAE
	def random_masking(x, mask_ratio, mask_seed: Optional[MaskSeed]):
	    """Mask a random subset of positions independently for every sample.

	    Args:
	        x: Tensor unpacked as (batch, length, dim).
	        mask_ratio: Fraction of positions to remove; ``int(length * (1 -
	            mask_ratio))`` positions are kept per sample.
	        mask_seed: Optional seeding info. When given, a dedicated generator
	            is seeded from a hash of its ``seed``, ``update`` and the sum of
	            its ``ids``; otherwise the global RNG is used.

	    Returns:
	        MaskInfo whose ``mask`` uses 0 for keep and 1 for remove.
	    """
	    batch, length, dim = x.shape
	    num_keep = int(length * (1 - mask_ratio))

	    if mask_seed is None:
	        rng = None
	    else:
	        digest = hash((mask_seed.seed, mask_seed.update, mask_seed.ids.sum().item()))
	        rng = torch.Generator(device=x.device)
	        rng.manual_seed(int(digest % 1e6))

	    # Uniform scores in [0, 1); the positions with the smallest scores are
	    # the ones that get kept.
	    scores = torch.rand(batch, length, generator=rng, device=x.device)
	    order = scores.argsort(dim=1)
	    inverse = order.argsort(dim=1)

	    keep_index = order[:, :num_keep].unsqueeze(-1).expand(-1, -1, dim)
	    kept = torch.gather(x, dim=1, index=keep_index)

	    # Build the mask in shuffled order (kept first), then undo the shuffle.
	    shuffled_mask = torch.ones([batch, length], dtype=x.dtype, device=x.device)
	    shuffled_mask[:, :num_keep] = 0
	    binary_mask = torch.gather(shuffled_mask, dim=1, index=inverse)

	    return MaskInfo(
	        x_unmasked=kept,
	        mask=binary_mask,
	        ids_restore=inverse.unsqueeze(-1).expand(-1, -1, dim),
	        ids_keep=keep_index,
	    )


	def gather_unmasked(x: torch.Tensor, mask_info: MaskInfo) -> torch.Tensor:
	    """Pick the kept entries of ``x`` along dim 1 using ``mask_info.ids_keep``."""
	    keep_index = mask_info.ids_keep
	    return x.gather(1, keep_index)


	def gather_unmasked_mask(x: torch.Tensor, mask_info: MaskInfo) -> torch.Tensor:
	    """Pick the kept entries of a mask-like ``x`` along dim 1.

	    ``mask_info.ids_keep`` carries a trailing feature dimension; only its
	    first slice is used as the index here.
	    """
	    keep_index = mask_info.ids_keep[..., 0]
	    return x.gather(1, keep_index)


	def get_alibi(
	    max_positions: int,
	    attention_heads: int,
	    dims: int = 1,
	    distance: str = "manhattan",
	):
	    """Build the static ALiBi attention bias for every head.

	    Args:
	        max_positions: Number of positions. For ``dims == 2`` this must be a
	            perfect square (positions are laid out on an n x n grid).
	        attention_heads: Number of attention heads; one slope per head.
	        dims: 1 for a sequence, 2 for a square grid.
	        distance: ``"manhattan"`` or ``"euclidean"``; only used for
	            ``dims == 2``.

	    Returns:
	        Tensor of shape (attention_heads, max_positions, max_positions) that
	        is zero on the diagonal and decreases linearly with distance, scaled
	        by the per-head slope.

	    Raises:
	        ValueError: If ``dims == 2`` and ``distance`` is not supported.
	        Exception: If ``dims`` is neither 1 nor 2.
	    """

	    def get_slopes(n):
	        def get_slopes_power_of_2(n):
	            # Geometric sequence starting at 2^(-8/n) with the same ratio.
	            start = 2 ** (-(2 ** -(math.log2(n) - 3)))
	            ratio = start
	            return [start * ratio**i for i in range(n)]

	        # In the paper, we only train models that have 2^a heads for some
	        # a. This function has some good properties that only occur when
	        # the input is a power of 2. To maintain that even when the number
	        # of heads is not a power of 2, we use this workaround.
	        if math.log2(n).is_integer():
	            return get_slopes_power_of_2(n)
	        else:
	            closest_power_of_2 = 2 ** math.floor(math.log2(n))
	            return (
	                get_slopes_power_of_2(closest_power_of_2)
	                + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2]
	            )

	    maxpos = max_positions
	    attn_heads = attention_heads
	    slopes = torch.Tensor(get_slopes(attn_heads))

	    if dims == 1:
	        # Prepare the ALiBi position linear bias. The model is
	        # non-autoregressive, so we want a symmetric mask with 0 on the
	        # diagonal and otherwise linearly decreasing values.
	        pos_bias = (
	            torch.abs(
	                torch.arange(maxpos).unsqueeze(0) - torch.arange(maxpos).unsqueeze(1)
	            )
	            * -1
	        )
	    elif dims == 2:
	        if distance == "manhattan":
	            df = lambda x1, y1, x2, y2: abs(x1 - x2) + abs(y1 - y2)
	        elif distance == "euclidean":
	            df = lambda x1, y1, x2, y2: math.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)
	        else:
	            # Previously this fell through and failed later on an unbound
	            # ``df``; fail early with a clear message instead.
	            raise ValueError(f"unsupported alibi distance: {distance}")

	        n = math.sqrt(max_positions)
	        assert n.is_integer(), n
	        n = int(n)

	        pos_bias = torch.zeros((max_positions, max_positions))

	        # Flattened index of grid cell (row, col) is row * n + col.
	        for i in range(n):
	            for j in range(n):
	                for k in range(n):
	                    for l in range(n):
	                        new_x = i * n + j
	                        new_y = k * n + l
	                        pos_bias[new_x, new_y] = -df(i, j, k, l)

	    else:
	        raise Exception(f"unsupported number of alibi dims: {dims}")

	    alibi_bias = slopes.unsqueeze(1).unsqueeze(1) * pos_bias.unsqueeze(0).expand(
	        attn_heads, -1, -1
	    )

	    return alibi_bias


	def get_alibi_bias(
	    alibi_biases,
	    batch_size,
	    time_steps,
	    heads,
	    dtype,
	    device,
	    dims=1,
	    distance="manhattan",
	):
	    """Return a (batch_size, heads, time_steps, time_steps) ALiBi bias.

	    ``alibi_biases`` is a dict used as a cache keyed by
	    ``"{dims}_{heads}_{distance}"``. The cached buffer is rebuilt (never
	    shrunk) when it is missing, too small, or has a different dtype or
	    device; otherwise a slice of it is returned.
	    """
	    key = f"{dims}_{heads}_{distance}"
	    cached = alibi_biases.get(key)
	    rows_needed = heads * batch_size

	    needs_rebuild = (
	        cached is None
	        or cached.size(0) < rows_needed
	        or cached.size(1) < time_steps
	        or cached.dtype != dtype
	        or cached.device != device
	    )
	    if needs_rebuild:
	        if cached is None:
	            prev_rows, prev_steps = 0, 0
	        else:
	            prev_rows, prev_steps = cached.size(0), cached.size(1)

	        # Grow to cover both the old buffer and the new request.
	        steps = max(time_steps, prev_steps)
	        repeats = max(rows_needed, prev_rows) // heads

	        fresh = get_alibi(steps, heads, dims=dims, distance=distance)
	        cached = fresh.to(dtype=dtype, device=device).repeat(repeats, 1, 1)
	        alibi_biases[key] = cached

	    window = cached[:rows_needed, :time_steps, :time_steps]
	    return window.view(batch_size, heads, time_steps, time_steps)


	def _learned_alibi_bias(
	alibi_bias,
	batch_size,
	time_steps,
	heads,
	scale,
	dtype,
	device,
	):
	assert alibi_bias.size(1) == heads, alibi_bias.shape
	assert alibi_bias.dtype == dtype, alibi_bias.dtype
	assert alibi_bias.device == device, alibi_bias.device

	if alibi_bias.size(-1) < time_steps:
	psz = math.ceil((time_steps - alibi_bias.size(-1)) / 2)
	alibi_bias = F.pad(alibi_bias, (psz, psz, psz, psz), mode="replicate")

	alibi_bias = alibi_bias.expand(batch_size, -1, -1, -1) * scale
	return alibi_bias[..., :time_steps, :time_steps]


	def masked_alibi(alibi_bias, mask_info):
	    """Restrict an ALiBi bias to the kept positions.

	    Rows and then columns of the last two dims of ``alibi_bias`` are
	    gathered with the kept indices from ``mask_info.ids_keep`` (its
	    trailing feature dimension is ignored), so entry ``[i, j]`` of the
	    result is the bias between kept position ``i`` and kept position ``j``.
	    """
	    num_heads = alibi_bias.size(1)
	    full_len = mask_info.ids_restore.size(1)

	    # Kept indices shaped so they broadcast over heads: B x 1 x K x 1.
	    row_index = mask_info.ids_keep.unsqueeze(1)[..., 0].unsqueeze(-1)

	    rows = torch.gather(
	        alibi_bias,
	        dim=-2,
	        index=row_index.expand(-1, num_heads, -1, full_len),
	    )

	    col_index = row_index.transpose(-1, -2)
	    return torch.gather(
	        rows,
	        dim=-1,
	        index=col_index.expand(-1, num_heads, rows.size(-2), -1),
	    )