# Seed-VC: modules/quantize.py
import torch
import torchaudio
import torchaudio.functional as audio_F
import numpy as np
from torch import nn, sin, pow
from torch.nn.utils import weight_norm
from einops.layers.torch import Rearrange

from dac.nn.quantize import ResidualVectorQuantize
from dac.model.encodec import SConv1d
from modules.wavenet import WN
from .alias_free_torch import *  # provides Activation1d


def init_weights(m):
if isinstance(m, nn.Conv1d):
nn.init.trunc_normal_(m.weight, std=0.02)
nn.init.constant_(m.bias, 0)
def WNConv1d(*args, **kwargs):
return weight_norm(nn.Conv1d(*args, **kwargs))
def WNConvTranspose1d(*args, **kwargs):
return weight_norm(nn.ConvTranspose1d(*args, **kwargs))
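

# A minimal usage sketch (assumed, not from the original file): WNConv1d simply
# wraps nn.Conv1d with weight normalization, so it accepts the same arguments.
#
#     conv = WNConv1d(16, 32, kernel_size=3, padding=1)
#     y = conv(torch.randn(4, 16, 100))   # -> (4, 32, 100)
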
class SnakeBeta(nn.Module):
"""
A modified Snake function which uses separate parameters for the magnitude of the periodic components
Shape:
- Input: (B, C, T)
- Output: (B, C, T), same shape as the input
Parameters:
- alpha - trainable parameter that controls frequency
- beta - trainable parameter that controls magnitude
References:
- This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
https://arxiv.org/abs/2006.08195
    Examples:
        >>> a1 = SnakeBeta(256)
        >>> x = torch.randn(2, 256, 100)  # (B, C, T)
        >>> x = a1(x)
"""
def __init__(
self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False
):
"""
Initialization.
INPUT:
- in_features: shape of the input
- alpha - trainable parameter that controls frequency
- beta - trainable parameter that controls magnitude
alpha is initialized to 1 by default, higher values = higher-frequency.
beta is initialized to 1 by default, higher values = higher-magnitude.
alpha will be trained along with the rest of your model.
"""
super(SnakeBeta, self).__init__()
self.in_features = in_features
# initialize alpha
self.alpha_logscale = alpha_logscale
        if self.alpha_logscale:  # log-scale alphas initialized to zeros (exp(0) = 1 at forward time)
            self.alpha = nn.Parameter(torch.zeros(in_features) * alpha)
            self.beta = nn.Parameter(torch.zeros(in_features) * alpha)
        else:  # linear-scale alphas initialized to ones
            self.alpha = nn.Parameter(torch.ones(in_features) * alpha)
            self.beta = nn.Parameter(torch.ones(in_features) * alpha)
self.alpha.requires_grad = alpha_trainable
self.beta.requires_grad = alpha_trainable
        self.no_div_by_zero = 1e-9  # guards against division by zero when beta == 0
def forward(self, x):
"""
Forward pass of the function.
Applies the function to the input elementwise.
SnakeBeta := x + 1/b * sin^2 (xa)
"""
alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
beta = self.beta.unsqueeze(0).unsqueeze(-1)
if self.alpha_logscale:
alpha = torch.exp(alpha)
beta = torch.exp(beta)
x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
return x
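

# Hedged usage sketch (not part of the original module): SnakeBeta acts
# channel-wise on (B, C, T) tensors, so `in_features` must equal C.
#
#     act = SnakeBeta(64, alpha_logscale=True)
#     x = torch.randn(8, 64, 100)   # (B, C, T)
#     y = act(x)                    # same shape; y = x + (1/beta) * sin^2(alpha * x)
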
class ResidualUnit(nn.Module):
def __init__(self, dim: int = 16, dilation: int = 1):
super().__init__()
pad = ((7 - 1) * dilation) // 2
self.block = nn.Sequential(
Activation1d(activation=SnakeBeta(dim, alpha_logscale=True)),
WNConv1d(dim, dim, kernel_size=7, dilation=dilation, padding=pad),
Activation1d(activation=SnakeBeta(dim, alpha_logscale=True)),
WNConv1d(dim, dim, kernel_size=1),
)
def forward(self, x):
return x + self.block(x)
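

# Shape sketch (illustrative, assuming the Activation1d wrapper from
# alias_free_torch preserves sequence length): ResidualUnit keeps (B, dim, T)
# unchanged, which is what makes the `x + self.block(x)` skip connection valid.
#
#     unit = ResidualUnit(dim=64, dilation=3)
#     y = unit(torch.randn(2, 64, 200))   # -> (2, 64, 200)
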
class CNNLSTM(nn.Module):
def __init__(self, indim, outdim, head, global_pred=False):
super().__init__()
self.global_pred = global_pred
self.model = nn.Sequential(
ResidualUnit(indim, dilation=1),
ResidualUnit(indim, dilation=2),
ResidualUnit(indim, dilation=3),
Activation1d(activation=SnakeBeta(indim, alpha_logscale=True)),
Rearrange("b c t -> b t c"),
)
        self.heads = nn.ModuleList([nn.Linear(indim, outdim) for _ in range(head)])
    def forward(self, x):
        # x: [B, C, T]
        x = self.model(x)  # -> [B, T, C]
        if self.global_pred:
            x = torch.mean(x, dim=1, keepdim=False)  # average over time for utterance-level predictions
        outs = [head(x) for head in self.heads]
        return outs
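

# Usage sketch (an assumption, not from the original file): despite its name,
# the module is a stack of convolutional ResidualUnits followed by per-head
# linear projections. With global_pred=True each head returns one vector per
# utterance; otherwise one vector per frame.
#
#     model = CNNLSTM(indim=64, outdim=10, head=2, global_pred=True)
#     outs = model(torch.randn(4, 64, 100))   # list of 2 tensors, each (4, 10)
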
def sequence_mask(length, max_length=None):
if max_length is None:
max_length = length.max()
x = torch.arange(max_length, dtype=length.dtype, device=length.device)
return x.unsqueeze(0) < length.unsqueeze(1)
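

# Example of the mask produced above (a sketch; values chosen arbitrarily):
#
#     sequence_mask(torch.tensor([2, 4]), max_length=5)
#     # tensor([[ True,  True, False, False, False],
#     #         [ True,  True,  True,  True, False]])
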
class FAquantizer(nn.Module):
    def __init__(
        self,
        in_dim=1024,
        n_p_codebooks=1,
        n_c_codebooks=2,
        n_t_codebooks=2,
        n_r_codebooks=3,
        codebook_size=1024,
        codebook_dim=8,
        quantizer_dropout=0.5,
        causal=False,
        separate_prosody_encoder=False,
        timbre_norm=False,
    ):
super(FAquantizer, self).__init__()
        conv1d_type = SConv1d  # if causal else nn.Conv1d
self.prosody_quantizer = ResidualVectorQuantize(
input_dim=in_dim,
n_codebooks=n_p_codebooks,
codebook_size=codebook_size,
codebook_dim=codebook_dim,
quantizer_dropout=quantizer_dropout,
)
self.content_quantizer = ResidualVectorQuantize(
input_dim=in_dim,
n_codebooks=n_c_codebooks,
codebook_size=codebook_size,
codebook_dim=codebook_dim,
quantizer_dropout=quantizer_dropout,
)
self.residual_quantizer = ResidualVectorQuantize(
input_dim=in_dim,
n_codebooks=n_r_codebooks,
codebook_size=codebook_size,
codebook_dim=codebook_dim,
quantizer_dropout=quantizer_dropout,
)
self.melspec_linear = conv1d_type(in_channels=20, out_channels=256, kernel_size=1, causal=causal)
self.melspec_encoder = WN(hidden_channels=256, kernel_size=5, dilation_rate=1, n_layers=8, gin_channels=0, p_dropout=0.2, causal=causal)
self.melspec_linear2 = conv1d_type(in_channels=256, out_channels=1024, kernel_size=1, causal=causal)
self.prob_random_mask_residual = 0.75
SPECT_PARAMS = {
"n_fft": 2048,
"win_length": 1200,
"hop_length": 300,
}
MEL_PARAMS = {
"n_mels": 80,
}
self.to_mel = torchaudio.transforms.MelSpectrogram(
n_mels=MEL_PARAMS["n_mels"], sample_rate=24000, **SPECT_PARAMS
)
self.mel_mean, self.mel_std = -4, 4
self.frame_rate = 24000 / 300
self.hop_length = 300
    def preprocess(self, wave_tensor, n_bins=20):
        mel_tensor = self.to_mel(wave_tensor.squeeze(1))
        mel_tensor = (torch.log(1e-5 + mel_tensor) - self.mel_mean) / self.mel_std
        # keep only the lowest n_bins mel bands and trim to a whole number of hops
        return mel_tensor[:, :n_bins, :int(wave_tensor.size(-1) / self.hop_length)]
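
    # Rough bookkeeping sketch (derived from the constants above, not from the
    # original comments): at 24 kHz with hop_length=300 the mel frame rate is
    # 80 Hz, so a 1-second wave segment of shape (B, 1, 24000) yields a prosody
    # feature of shape (B, 20, 80) from preprocess().
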
def forward(self, x, wave_segments):
        outs = 0  # running (detached) sum of the quantized streams; accumulated below but not returned
prosody_feature = self.preprocess(wave_segments)
        f0_input = prosody_feature  # (B, 20, T)
f0_input = self.melspec_linear(f0_input)
        f0_input = self.melspec_encoder(
            f0_input,
            torch.ones(f0_input.shape[0], 1, f0_input.shape[2]).to(f0_input.device).bool(),
        )
        f0_input = self.melspec_linear2(f0_input)
common_min_size = min(f0_input.size(2), x.size(2))
f0_input = f0_input[:, :, :common_min_size]
x = x[:, :, :common_min_size]
z_p, codes_p, latents_p, commitment_loss_p, codebook_loss_p = self.prosody_quantizer(
f0_input, 1
)
outs += z_p.detach()
z_c, codes_c, latents_c, commitment_loss_c, codebook_loss_c = self.content_quantizer(
x, 2
)
outs += z_c.detach()
        residual_feature = x - z_p.detach() - z_c.detach()  # what the prosody and content streams did not explain
z_r, codes_r, latents_r, commitment_loss_r, codebook_loss_r = self.residual_quantizer(
residual_feature, 3
)
quantized = [z_p, z_c, z_r]
codes = [codes_p, codes_c, codes_r]
return quantized, codes
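

# End-to-end usage sketch (hypothetical shapes; `in_dim` and the segment length
# are assumptions, not taken from the original file): the forward pass quantizes
# a frame-level feature `x` of shape (B, in_dim, T) together with the raw wave
# segments it was extracted from, and returns the quantized features and code
# indices for the prosody, content, and residual streams.
#
#     quantizer = FAquantizer(in_dim=1024)
#     x = torch.randn(2, 1024, 80)              # frame-level features at 80 Hz
#     waves = torch.randn(2, 1, 24000)          # matching 1 s of 24 kHz audio
#     quantized, codes = quantizer(x, waves)    # [z_p, z_c, z_r], [codes_p, codes_c, codes_r]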