from functools import reduce
import math

import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch.backends.cuda import sdp_kernel
from packaging import version

from .nn.layers import Snake1d


# Residual wrapper: forward(x) = main(x) + skip(x).
class ResidualBlock(nn.Module):
    def __init__(self, main, skip=None):
        super().__init__()
        self.main = nn.Sequential(*main)
        self.skip = skip if skip else nn.Identity()

    def forward(self, input):
        return self.main(input) + self.skip(input)


# Two conv -> GroupNorm -> activation stages with a residual skip (a 1x1 conv
# when the channel count changes). Snake1d can replace GELU as the activation.
class ResConvBlock(ResidualBlock):
    def __init__(self, c_in, c_mid, c_out, is_last=False, kernel_size=5, conv_bias=True, use_snake=False):
        skip = None if c_in == c_out else nn.Conv1d(c_in, c_out, 1, bias=False)
        super().__init__([
            nn.Conv1d(c_in, c_mid, kernel_size, padding=kernel_size // 2, bias=conv_bias),
            nn.GroupNorm(1, c_mid),
            Snake1d(c_mid) if use_snake else nn.GELU(),
            nn.Conv1d(c_mid, c_out, kernel_size, padding=kernel_size // 2, bias=conv_bias),
            nn.GroupNorm(1, c_out) if not is_last else nn.Identity(),
            (Snake1d(c_out) if use_snake else nn.GELU()) if not is_last else nn.Identity(),
        ], skip)
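
# A minimal usage sketch (illustrative only): a ResConvBlock maps a
# [batch, channels, time] tensor to the same time length, changing only the
# channel count. The sizes below are assumptions for the example.
#
#   block = ResConvBlock(c_in=64, c_mid=128, c_out=64, use_snake=False)
#   x = torch.randn(2, 64, 1024)           # [batch, channels, time]
#   y = block(x)                           # -> torch.Size([2, 64, 1024])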


class SelfAttention1d(nn.Module):
    def __init__(self, c_in, n_head=1, dropout_rate=0.):
        super().__init__()
        assert c_in % n_head == 0
        self.norm = nn.GroupNorm(1, c_in)
        self.n_head = n_head
        self.qkv_proj = nn.Conv1d(c_in, c_in * 3, 1)
        self.out_proj = nn.Conv1d(c_in, c_in, 1)
        self.dropout = nn.Dropout(dropout_rate, inplace=True)

        self.use_flash = torch.cuda.is_available() and version.parse(torch.__version__) >= version.parse('2.0.0')

        if not self.use_flash:
            return

        device_properties = torch.cuda.get_device_properties(torch.device('cuda'))

        if device_properties.major == 8 and device_properties.minor == 0:
            # A100-class GPUs (compute capability 8.0): use the flash backend only.
            self.sdp_kernel_config = (True, False, False)
        else:
            # Other GPUs: use the math and memory-efficient backends instead.
            self.sdp_kernel_config = (False, True, True)

    def forward(self, input):
        n, c, s = input.shape
        qkv = self.qkv_proj(self.norm(input))
        qkv = qkv.view(
            [n, self.n_head * 3, c // self.n_head, s]).transpose(2, 3)
        q, k, v = qkv.chunk(3, dim=1)
        scale = k.shape[3]**-0.25

        if self.use_flash:
            with sdp_kernel(*self.sdp_kernel_config):
                # transpose(2, 3) moves the head/channel dims back in front of the
                # sequence dim so the reshape matches the non-flash path below.
                y = F.scaled_dot_product_attention(q, k, v, is_causal=False).transpose(2, 3).contiguous().view([n, c, s])
        else:
            att = ((q * scale) @ (k.transpose(2, 3) * scale)).softmax(3)
            y = (att @ v).transpose(2, 3).contiguous().view([n, c, s])

        return input + self.dropout(self.out_proj(y))
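
# A minimal usage sketch (illustrative only); the sizes are assumptions.
# SelfAttention1d is residual, so the output shape equals the input shape.
#
#   attn = SelfAttention1d(c_in=64, n_head=4)
#   x = torch.randn(2, 64, 256)            # [batch, channels, time]
#   y = attn(x)                            # -> torch.Size([2, 64, 256])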


# Concatenates the wrapped module's output with its input along the channel
# dim (U-Net style skip), so downstream layers see the combined channel count.
class SkipBlock(nn.Module):
    def __init__(self, *main):
        super().__init__()
        self.main = nn.Sequential(*main)

    def forward(self, input):
        return torch.cat([self.main(input), input], dim=1)


# Gaussian random Fourier feature embedding: projects the input (typically a
# timestep) through a random matrix and returns concatenated cos/sin features.
class FourierFeatures(nn.Module):
    def __init__(self, in_features, out_features, std=1.):
        super().__init__()
        assert out_features % 2 == 0
        self.weight = nn.Parameter(torch.randn(
            [out_features // 2, in_features]) * std)

    def forward(self, input):
        f = 2 * math.pi * input @ self.weight.T
        return torch.cat([f.cos(), f.sin()], dim=-1)


# Broadcast a [batch, features] conditioning tensor to [batch, features, length]
# so it can be concatenated with a 1d feature map of the given shape.
def expand_to_planes(input, shape):
    return input[..., None].repeat([1, 1, shape[2]])
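
# A minimal usage sketch (illustrative only): embedding a timestep and
# broadcasting it across a 1d feature map. All sizes here are assumptions.
#
#   timestep_embed = FourierFeatures(1, 16)
#   t = torch.rand(2, 1)                               # [batch, 1]
#   x = torch.randn(2, 32, 1024)                       # [batch, channels, time]
#   embed = expand_to_planes(timestep_embed(t), x.shape)
#   h = torch.cat([x, embed], dim=1)                   # -> [2, 48, 1024]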


# 1d resampling filter taps (linear, cubic, and Lanczos-3) used by
# Downsample1d and Upsample1d below.
_kernels = {
    'linear':
        [1 / 8, 3 / 8, 3 / 8, 1 / 8],
    'cubic':
        [-0.01171875, -0.03515625, 0.11328125, 0.43359375,
         0.43359375, 0.11328125, -0.03515625, -0.01171875],
    'lanczos3':
        [0.003689131001010537, 0.015056144446134567, -0.03399861603975296,
         -0.066637322306633, 0.13550527393817902, 0.44638532400131226,
         0.44638532400131226, 0.13550527393817902, -0.066637322306633,
         -0.03399861603975296, 0.015056144446134567, 0.003689131001010537]
}


# Fixed-filter 2x downsampling: pad, then a stride-2 convolution with the
# chosen FIR kernel applied per channel (the weight is a diagonal kernel bank).
class Downsample1d(nn.Module):
    def __init__(self, kernel='linear', pad_mode='reflect', channels_last=False):
        super().__init__()
        self.pad_mode = pad_mode
        kernel_1d = torch.tensor(_kernels[kernel])
        self.pad = kernel_1d.shape[0] // 2 - 1
        self.register_buffer('kernel', kernel_1d)
        self.channels_last = channels_last

    def forward(self, x):
        if self.channels_last:
            x = x.permute(0, 2, 1)
        x = F.pad(x, (self.pad,) * 2, self.pad_mode)
        weight = x.new_zeros([x.shape[1], x.shape[1], self.kernel.shape[0]])
        indices = torch.arange(x.shape[1], device=x.device)
        weight[indices, indices] = self.kernel.to(weight)
        x = F.conv1d(x, weight, stride=2)
        if self.channels_last:
            x = x.permute(0, 2, 1)
        return x


# Fixed-filter 2x upsampling: the transposed counterpart of Downsample1d,
# with the kernel scaled by 2 to preserve signal magnitude.
class Upsample1d(nn.Module):
    def __init__(self, kernel='linear', pad_mode='reflect', channels_last=False):
        super().__init__()
        self.pad_mode = pad_mode
        kernel_1d = torch.tensor(_kernels[kernel]) * 2
        self.pad = kernel_1d.shape[0] // 2 - 1
        self.register_buffer('kernel', kernel_1d)
        self.channels_last = channels_last

    def forward(self, x):
        if self.channels_last:
            x = x.permute(0, 2, 1)
        x = F.pad(x, ((self.pad + 1) // 2,) * 2, self.pad_mode)
        weight = x.new_zeros([x.shape[1], x.shape[1], self.kernel.shape[0]])
        indices = torch.arange(x.shape[1], device=x.device)
        weight[indices, indices] = self.kernel.to(weight)
        x = F.conv_transpose1d(x, weight, stride=2, padding=self.pad * 2 + 1)
        if self.channels_last:
            x = x.permute(0, 2, 1)
        return x
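
# A minimal usage sketch (illustrative only); sizes are assumptions. The fixed
# 2x resamplers halve or double the time dimension and leave channels alone.
#
#   down = Downsample1d('cubic')
#   up = Upsample1d('cubic')
#   x = torch.randn(2, 32, 1024)           # [batch, channels, time]
#   down(x).shape                          # -> torch.Size([2, 32, 512])
#   up(x).shape                            # -> torch.Size([2, 32, 2048])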


# Learned downsampling by an integer factor via a strided convolution.
def Downsample1d_2(
    in_channels: int, out_channels: int, factor: int, kernel_multiplier: int = 2
) -> nn.Module:
    assert kernel_multiplier % 2 == 0, "Kernel multiplier must be even"

    return nn.Conv1d(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=factor * kernel_multiplier + 1,
        stride=factor,
        padding=factor * (kernel_multiplier // 2),
    )


# Learned upsampling by an integer factor, either nearest-neighbor + conv or a
# transposed convolution; factor 1 reduces to a plain 3-tap convolution.
def Upsample1d_2(
    in_channels: int, out_channels: int, factor: int, use_nearest: bool = False
) -> nn.Module:

    if factor == 1:
        return nn.Conv1d(
            in_channels=in_channels, out_channels=out_channels, kernel_size=3, padding=1
        )

    if use_nearest:
        return nn.Sequential(
            nn.Upsample(scale_factor=factor, mode="nearest"),
            nn.Conv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=3,
                padding=1,
            ),
        )
    else:
        return nn.ConvTranspose1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=factor * 2,
            stride=factor,
            padding=factor // 2 + factor % 2,
            output_padding=factor % 2,
        )
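
# A minimal usage sketch (illustrative only); the sizes and the factor of 4 are
# assumptions. These factories return plain nn.Module layers.
#
#   down = Downsample1d_2(in_channels=32, out_channels=64, factor=4)
#   up = Upsample1d_2(in_channels=64, out_channels=32, factor=4)
#   x = torch.randn(2, 32, 1024)
#   h = down(x)                            # -> torch.Size([2, 64, 256])
#   y = up(h)                              # -> torch.Size([2, 32, 1024])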


def zero_init(layer):
    nn.init.zeros_(layer.weight)
    if layer.bias is not None:
        nn.init.zeros_(layer.bias)
    return layer


def rms_norm(x, scale, eps):
    dtype = reduce(torch.promote_types, (x.dtype, scale.dtype, torch.float32))
    mean_sq = torch.mean(x.to(dtype)**2, dim=-1, keepdim=True)
    scale = scale.to(dtype) * torch.rsqrt(mean_sq + eps)
    return x * scale.to(x.dtype)


# Adaptive RMSNorm: the per-feature scale is predicted from a conditioning
# vector. The linear layer is zero-initialized, so the initial scale is 1.
class AdaRMSNorm(nn.Module):
    def __init__(self, features, cond_features, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.linear = zero_init(nn.Linear(cond_features, features, bias=False))

    def extra_repr(self):
        return f"eps={self.eps}"

    def forward(self, x, cond):
        return rms_norm(x, self.linear(cond)[:, None, :] + 1, self.eps)
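
# A minimal usage sketch (illustrative only); the feature sizes are
# assumptions. Note the channels-last layout: rms_norm normalizes over the
# last dim, and the conditioning scale is broadcast over dim 1.
#
#   norm = AdaRMSNorm(features=64, cond_features=128)
#   x = torch.randn(2, 256, 64)            # [batch, time, features]
#   cond = torch.randn(2, 128)             # [batch, cond_features]
#   y = norm(x, cond)                      # -> torch.Size([2, 256, 64])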


# Normalize each slice of x along dim 0 to (approximately) unit
# root-mean-square magnitude; the norm is taken over all remaining dims.
def normalize(x, eps=1e-4):
    dim = list(range(1, x.ndim))
    n = torch.linalg.vector_norm(x, dim=dim, keepdim=True)
    alpha = np.sqrt(n.numel() / x.numel())
    return x / torch.add(eps, n, alpha=alpha)


# 1d convolution with forced weight normalization: the stored weight is
# re-normalized in place during training, and the effective weight is the
# normalized weight scaled by 1 / sqrt(fan_in).
class ForcedWNConv1d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=1):
        super().__init__()
        self.weight = nn.Parameter(torch.randn([out_channels, in_channels, kernel_size]))

    def forward(self, x):
        if self.training:
            with torch.no_grad():
                self.weight.copy_(normalize(self.weight))

        fan_in = self.weight[0].numel()

        w = normalize(self.weight) / math.sqrt(fan_in)

        return F.conv1d(x, w, padding='same')


# Module-level switch for torch.compile. The compile() helper returns the
# original function when compilation is disabled or raises a RuntimeError
# (e.g. on unsupported platforms).
use_compile = True


def compile(function, *args, **kwargs):
    if not use_compile:
        return function
    try:
        return torch.compile(function, *args, **kwargs)
    except RuntimeError:
        return function


# GEGLU feed-forward primitive: project to 2x width, split, and gate the first
# half with GELU of the second half.
@compile
def linear_geglu(x, weight, bias=None):
    x = x @ weight.mT
    if bias is not None:
        x = x + bias
    x, gate = x.chunk(2, dim=-1)
    return x * F.gelu(gate)


# Compiled RMS normalization; this redefinition replaces the uncompiled
# rms_norm above for all callers once the module has finished importing.
@compile
def rms_norm(x, scale, eps):
    dtype = reduce(torch.promote_types, (x.dtype, scale.dtype, torch.float32))
    mean_sq = torch.mean(x.to(dtype)**2, dim=-1, keepdim=True)
    scale = scale.to(dtype) * torch.rsqrt(mean_sq + eps)
    return x * scale.to(x.dtype)


# nn.Linear variant with a built-in GEGLU: the internal weight produces
# 2 * out_features, and forward() returns out_features after gating.
class LinearGEGLU(nn.Linear):
    def __init__(self, in_features, out_features, bias=True):
        super().__init__(in_features, out_features * 2, bias=bias)
        self.out_features = out_features

    def forward(self, x):
        return linear_geglu(x, self.weight, self.bias)


# RMS normalization over the last dim with a learned (or fixed) per-feature
# scale.
class RMSNorm(nn.Module):
    def __init__(self, shape, fix_scale=False, eps=1e-6):
        super().__init__()
        self.eps = eps

        if fix_scale:
            self.register_buffer("scale", torch.ones(shape))
        else:
            self.scale = nn.Parameter(torch.ones(shape))

    def extra_repr(self):
        return f"shape={tuple(self.scale.shape)}, eps={self.eps}"

    def forward(self, x):
        return rms_norm(x, self.scale, self.eps)
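
# A minimal usage sketch (illustrative only); sizes are assumptions. Both
# modules operate on the last (feature) dimension.
#
#   ff = LinearGEGLU(in_features=64, out_features=256)
#   norm = RMSNorm(64)
#   x = torch.randn(2, 128, 64)            # [batch, time, features]
#   ff(norm(x)).shape                      # -> torch.Size([2, 128, 256])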


# Snake-beta activation: x + (1 / beta) * sin^2(alpha * x), with a small
# epsilon to keep the division stable when beta is near zero.
@torch.jit.script
def snake_beta(x, alpha, beta):
    return x + (1.0 / (beta + 0.000000001)) * pow(torch.sin(x * alpha), 2)


# SnakeBeta activation module with per-channel trainable alpha (frequency) and
# beta (magnitude) parameters, optionally parameterized on a log scale.
class SnakeBeta(nn.Module):
    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=True):
        super().__init__()
        self.in_features = in_features

        self.alpha_logscale = alpha_logscale
        if self.alpha_logscale:
            # Log-scale parameters start at zero, i.e. exp(0) = 1 in forward().
            self.alpha = nn.Parameter(torch.zeros(in_features) * alpha)
            self.beta = nn.Parameter(torch.zeros(in_features) * alpha)
        else:
            # Linear-scale parameters start at alpha.
            self.alpha = nn.Parameter(torch.ones(in_features) * alpha)
            self.beta = nn.Parameter(torch.ones(in_features) * alpha)

        self.alpha.requires_grad = alpha_trainable
        self.beta.requires_grad = alpha_trainable

    def forward(self, x):
        # Reshape [C] parameters to [1, C, 1] so they broadcast over [B, C, T].
        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)
        beta = self.beta.unsqueeze(0).unsqueeze(-1)
        if self.alpha_logscale:
            alpha = torch.exp(alpha)
            beta = torch.exp(beta)
        x = snake_beta(x, alpha, beta)

        return x
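
# A minimal usage sketch (illustrative only); sizes are assumptions. SnakeBeta
# is elementwise, so the output shape matches the input.
#
#   act = SnakeBeta(in_features=32)
#   x = torch.randn(2, 32, 1024)           # [batch, channels, time]
#   act(x).shape                           # -> torch.Size([2, 32, 1024])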