Upload 86 files

a257816 verified about 1 month ago

23.1 kB

	import torch
	import torch.nn as nn
	from einops import pack, rearrange, repeat

	import math
	from typing import Optional

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from conformer import ConformerBlock
	from diffusers.models.activations import get_activation

	from VietTTS.transformer.transformer import BasicTransformerBlock


	class SinusoidalPosEmb(torch.nn.Module):
	    """Sinusoidal embedding of a batch of scalar positions / timesteps.

	    The output concatenates ``dim // 2`` sine features followed by
	    ``dim // 2`` cosine features, using geometrically spaced frequencies
	    between ``1`` and ``1 / 10000``.

	    Args:
	        dim: size of the embedding produced by ``forward``; must be even.
	    """

	    def __init__(self, dim):
	        super().__init__()
	        self.dim = dim
	        assert self.dim % 2 == 0, "SinusoidalPosEmb requires dim to be even"

	    def forward(self, x, scale=1000):
	        """Embed ``x``.

	        Args:
	            x: 0-d or 1-d tensor of positions; a 0-d tensor is treated as a
	                batch of one.
	            scale: multiplier applied to ``x`` before the sin/cos.

	        Returns:
	            Tensor of shape ``(len(x), dim)`` laid out as ``[sin | cos]``.
	        """
	        if x.ndim < 1:
	            x = x.unsqueeze(0)
	        half_dim = self.dim // 2
	        # With dim == 2 there is a single frequency and `half_dim - 1` is 0;
	        # clamp the denominator so that case yields frequency 1 instead of
	        # raising ZeroDivisionError.
	        log_step = math.log(10000) / max(half_dim - 1, 1)
	        freqs = torch.exp(torch.arange(half_dim, device=x.device).float() * -log_step)
	        angles = scale * x.unsqueeze(1) * freqs.unsqueeze(0)
	        return torch.cat((angles.sin(), angles.cos()), dim=-1)


	class Block1D(torch.nn.Module):
	    """Masked ``Conv1d(k=3) -> GroupNorm -> Mish`` unit.

	    Args:
	        dim: number of input channels.
	        dim_out: number of output channels.
	        groups: number of groups passed to ``GroupNorm``.
	    """

	    def __init__(self, dim, dim_out, groups=8):
	        super().__init__()
	        layers = [
	            torch.nn.Conv1d(dim, dim_out, 3, padding=1),
	            torch.nn.GroupNorm(groups, dim_out),
	            nn.Mish(),
	        ]
	        self.block = torch.nn.Sequential(*layers)

	    def forward(self, x, mask):
	        """Apply the unit to ``x * mask`` and mask the result again."""
	        masked_input = x * mask
	        return self.block(masked_input) * mask


	class ResnetBlock1D(torch.nn.Module):
	    """Two masked ``Block1D`` units with a time-embedding bias and a 1x1 residual.

	    Args:
	        dim: number of input channels.
	        dim_out: number of output channels.
	        time_emb_dim: size of the time embedding fed to the projection MLP.
	        groups: number of groups forwarded to each ``Block1D``.
	    """

	    def __init__(self, dim, dim_out, time_emb_dim, groups=8):
	        super().__init__()
	        # Projects the time embedding to one bias value per output channel.
	        self.mlp = torch.nn.Sequential(
	            nn.Mish(),
	            torch.nn.Linear(time_emb_dim, dim_out),
	        )

	        self.block1 = Block1D(dim, dim_out, groups=groups)
	        self.block2 = Block1D(dim_out, dim_out, groups=groups)

	        # 1x1 convolution so the residual branch matches `dim_out` channels.
	        self.res_conv = torch.nn.Conv1d(dim, dim_out, 1)

	    def forward(self, x, mask, time_emb):
	        """Return ``block2(block1(x) + mlp(time_emb)) + res_conv(x * mask)``."""
	        hidden = self.block1(x, mask)
	        time_bias = self.mlp(time_emb).unsqueeze(-1)  # broadcast over the last axis
	        hidden += time_bias
	        hidden = self.block2(hidden, mask)
	        shortcut = self.res_conv(x * mask)
	        return hidden + shortcut


	class Downsample1D(nn.Module):
	    """Strided convolution (kernel 3, stride 2, padding 1) keeping the channel count.

	    Args:
	        dim: number of input and output channels.
	    """

	    def __init__(self, dim):
	        super().__init__()
	        self.conv = torch.nn.Conv1d(dim, dim, kernel_size=3, stride=2, padding=1)

	    def forward(self, x):
	        """Return the stride-2 convolution of ``x``."""
	        downsampled = self.conv(x)
	        return downsampled


	class TimestepEmbedding(nn.Module):
	    """Two-layer MLP applied to a timestep embedding.

	    Computes ``linear_2(act(linear_1(sample)))`` with an optional trailing
	    activation and an optional additive projection of a conditioning input.

	    Args:
	        in_channels: size of the incoming embedding.
	        time_embed_dim: hidden size of the MLP.
	        act_fn: name handed to ``get_activation`` for the hidden activation.
	        out_dim: output size; falls back to ``time_embed_dim`` when ``None``.
	        post_act_fn: optional name handed to ``get_activation`` for an
	            activation applied after the second linear layer.
	        cond_proj_dim: when given, a bias-free linear layer of this input size
	            is created to project ``condition`` onto ``in_channels``.
	    """

	    def __init__(
	        self,
	        in_channels: int,
	        time_embed_dim: int,
	        act_fn: str = "silu",
	        out_dim: int = None,
	        post_act_fn: Optional[str] = None,
	        cond_proj_dim=None,
	    ):
	        super().__init__()

	        self.linear_1 = nn.Linear(in_channels, time_embed_dim)
	        self.cond_proj = (
	            nn.Linear(cond_proj_dim, in_channels, bias=False) if cond_proj_dim is not None else None
	        )
	        self.act = get_activation(act_fn)

	        final_dim = time_embed_dim if out_dim is None else out_dim
	        self.linear_2 = nn.Linear(time_embed_dim, final_dim)

	        self.post_act = get_activation(post_act_fn) if post_act_fn is not None else None

	    def forward(self, sample, condition=None):
	        """Run the MLP on ``sample``, first adding ``cond_proj(condition)`` if given."""
	        if condition is not None:
	            sample = sample + self.cond_proj(condition)

	        hidden = self.linear_1(sample)
	        if self.act is not None:
	            hidden = self.act(hidden)
	        hidden = self.linear_2(hidden)

	        if self.post_act is None:
	            return hidden
	        return self.post_act(hidden)


	class Upsample1D(nn.Module):
	    """A 1D layer that doubles the length of its input.

	    Parameters:
	        channels (`int`):
	            number of channels expected in the inputs.
	        use_conv (`bool`, default `False`):
	            when `use_conv_transpose` is off, follow the nearest-neighbour
	            interpolation with a kernel-3 convolution.
	        use_conv_transpose (`bool`, default `True`):
	            upsample with a transposed convolution (kernel 4, stride 2,
	            padding 1) instead of interpolation; takes precedence over
	            `use_conv`.
	        out_channels (`int`, optional):
	            number of output channels. Defaults to `channels`.
	        name (`str`, default `"conv"`):
	            stored on the instance as `self.name`.
	    """

	    def __init__(self, channels, use_conv=False, use_conv_transpose=True, out_channels=None, name="conv"):
	        super().__init__()
	        self.channels = channels
	        self.out_channels = out_channels or channels
	        self.use_conv = use_conv
	        self.use_conv_transpose = use_conv_transpose
	        self.name = name

	        if use_conv_transpose:
	            conv = nn.ConvTranspose1d(channels, self.out_channels, 4, 2, 1)
	        elif use_conv:
	            conv = nn.Conv1d(self.channels, self.out_channels, 3, padding=1)
	        else:
	            conv = None
	        self.conv = conv

	    def forward(self, inputs):
	        """Upsample `inputs` along the last axis by a factor of two."""
	        assert inputs.shape[1] == self.channels
	        if self.use_conv_transpose:
	            return self.conv(inputs)

	        stretched = F.interpolate(inputs, scale_factor=2.0, mode="nearest")
	        return self.conv(stretched) if self.use_conv else stretched


	class ConformerWrapper(ConformerBlock):
	    """``ConformerBlock`` exposed through the keyword interface used by the decoder.

	    ``forward`` accepts the same keyword arguments the decoder passes to its
	    transformer blocks and forwards only the hidden states and the mask.
	    """

	    def __init__(  # pylint: disable=useless-super-delegation
	        self,
	        *,
	        dim,
	        dim_head=64,
	        heads=8,
	        ff_mult=4,
	        conv_expansion_factor=2,
	        conv_kernel_size=31,
	        attn_dropout=0,
	        ff_dropout=0,
	        conv_dropout=0,
	        conv_causal=False,
	    ):
	        block_kwargs = dict(
	            dim=dim,
	            dim_head=dim_head,
	            heads=heads,
	            ff_mult=ff_mult,
	            conv_expansion_factor=conv_expansion_factor,
	            conv_kernel_size=conv_kernel_size,
	            attn_dropout=attn_dropout,
	            ff_dropout=ff_dropout,
	            conv_dropout=conv_dropout,
	            conv_causal=conv_causal,
	        )
	        super().__init__(**block_kwargs)

	    def forward(
	        self,
	        hidden_states,
	        attention_mask,
	        encoder_hidden_states=None,
	        encoder_attention_mask=None,
	        timestep=None,
	    ):
	        """Run the conformer block; the encoder and timestep arguments are ignored."""
	        bool_mask = attention_mask.bool()
	        return super().forward(x=hidden_states, mask=bool_mask)


	class Decoder(nn.Module):
	    """1D U-Net style decoder with time conditioning and skip connections.

	    The network is a stack of down blocks, mid blocks and up blocks. Every
	    block is a ``ResnetBlock1D`` followed by ``n_blocks`` attention blocks;
	    down/up blocks additionally end with a resampling layer (a plain
	    kernel-3 convolution for the last block of each path).

	    Args:
	        in_channels: channel count of the concatenated input seen by the
	            first resnet block; also used as the sinusoidal time-embedding
	            size, so it must be even.
	        out_channels: channel count produced by the final projection.
	        channels: output width of each down block; the up path mirrors it.
	        dropout: dropout rate forwarded to the attention blocks.
	        attention_head_dim: per-head dimension of the attention blocks.
	        n_blocks: number of attention blocks after each resnet block.
	        num_mid_blocks: number of mid blocks.
	        num_heads: number of attention heads.
	        act_fn: activation name forwarded to ``BasicTransformerBlock``.
	        down_block_type: ``"transformer"`` or ``"conformer"`` for the down path.
	        mid_block_type: ``"transformer"`` or ``"conformer"`` for the mid path.
	        up_block_type: ``"transformer"`` or ``"conformer"`` for the up path.
	    """

	    def __init__(
	        self,
	        in_channels,
	        out_channels,
	        channels=(256, 256),
	        dropout=0.05,
	        attention_head_dim=64,
	        n_blocks=1,
	        num_mid_blocks=2,
	        num_heads=4,
	        act_fn="snake",
	        down_block_type="transformer",
	        mid_block_type="transformer",
	        up_block_type="transformer",
	    ):
	        super().__init__()
	        channels = tuple(channels)
	        self.in_channels = in_channels
	        self.out_channels = out_channels

	        self.time_embeddings = SinusoidalPosEmb(in_channels)
	        time_embed_dim = channels[0] * 4
	        self.time_mlp = TimestepEmbedding(
	            in_channels=in_channels,
	            time_embed_dim=time_embed_dim,
	            act_fn="silu",
	        )

	        self.down_blocks = nn.ModuleList([])
	        self.mid_blocks = nn.ModuleList([])
	        self.up_blocks = nn.ModuleList([])

	        def make_stack(block_type, dim):
	            # `n_blocks` attention blocks of the requested type and width.
	            return nn.ModuleList(
	                [
	                    self.get_block(block_type, dim, attention_head_dim, num_heads, dropout, act_fn)
	                    for _ in range(n_blocks)
	                ]
	            )

	        # ---- down path -------------------------------------------------
	        output_channel = in_channels
	        last_down = len(channels) - 1
	        for i, width in enumerate(channels):
	            input_channel = output_channel
	            output_channel = width
	            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
	            transformer_blocks = make_stack(down_block_type, output_channel)
	            if i == last_down:
	                # The last down block keeps the temporal resolution.
	                downsample = nn.Conv1d(output_channel, output_channel, 3, padding=1)
	            else:
	                downsample = Downsample1D(output_channel)

	            self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))

	        # ---- mid path --------------------------------------------------
	        for _ in range(num_mid_blocks):
	            input_channel = channels[-1]
	            # The original assigned `out_channels` here (a dead store); the
	            # intended target is `output_channel`, which already had this value.
	            output_channel = channels[-1]

	            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
	            transformer_blocks = make_stack(mid_block_type, output_channel)

	            self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))

	        # ---- up path ---------------------------------------------------
	        channels = channels[::-1] + (channels[0],)
	        last_up = len(channels) - 2
	        for i, (input_channel, output_channel) in enumerate(zip(channels[:-1], channels[1:])):
	            resnet = ResnetBlock1D(
	                dim=2 * input_channel,  # doubled by the concatenated skip connection
	                dim_out=output_channel,
	                time_emb_dim=time_embed_dim,
	            )
	            transformer_blocks = make_stack(up_block_type, output_channel)
	            if i == last_up:
	                upsample = nn.Conv1d(output_channel, output_channel, 3, padding=1)
	            else:
	                upsample = Upsample1D(output_channel, use_conv_transpose=True)

	            self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))

	        self.final_block = Block1D(channels[-1], channels[-1])
	        self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)

	        self.initialize_weights()

	    @staticmethod
	    def get_block(block_type, dim, attention_head_dim, num_heads, dropout, act_fn):
	        """Build one attention block.

	        Args:
	            block_type: ``"conformer"`` or ``"transformer"``.
	            dim: channel width of the block.
	            attention_head_dim: per-head dimension.
	            num_heads: number of attention heads.
	            dropout: dropout rate used throughout the block.
	            act_fn: activation name (only used by the transformer block).

	        Raises:
	            ValueError: if ``block_type`` is not one of the two known names.
	        """
	        if block_type == "conformer":
	            block = ConformerWrapper(
	                dim=dim,
	                dim_head=attention_head_dim,
	                heads=num_heads,
	                ff_mult=1,
	                conv_expansion_factor=2,
	                ff_dropout=dropout,
	                attn_dropout=dropout,
	                conv_dropout=dropout,
	                conv_kernel_size=31,
	            )
	        elif block_type == "transformer":
	            block = BasicTransformerBlock(
	                dim=dim,
	                num_attention_heads=num_heads,
	                attention_head_dim=attention_head_dim,
	                dropout=dropout,
	                activation_fn=act_fn,
	            )
	        else:
	            raise ValueError(f"Unknown block type {block_type}")

	        return block

	    def initialize_weights(self):
	        """Kaiming-initialise Conv1d/Linear weights and reset GroupNorm to identity."""
	        for m in self.modules():
	            if isinstance(m, (nn.Conv1d, nn.Linear)):
	                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
	                if m.bias is not None:
	                    nn.init.constant_(m.bias, 0)
	            elif isinstance(m, nn.GroupNorm):
	                nn.init.constant_(m.weight, 1)
	                nn.init.constant_(m.bias, 0)

	    @staticmethod
	    def _run_attention(x, mask, t, transformer_blocks):
	        """Apply ``transformer_blocks`` to ``x`` (b, c, t) using ``mask`` (b, 1, t)."""
	        x = rearrange(x, "b c t -> b t c")
	        flat_mask = rearrange(mask, "b 1 t -> b t")
	        for transformer_block in transformer_blocks:
	            x = transformer_block(
	                hidden_states=x,
	                attention_mask=flat_mask,
	                timestep=t,
	            )
	        return rearrange(x, "b t c -> b c t")

	    def forward(self, x, mask, mu, t, spks=None, cond=None):
	        """Run the decoder.

	        ``x``, ``mu`` and (if given) ``spks`` are concatenated along the
	        channel axis, so their channel counts must add up to the
	        ``in_channels`` the model was built with.

	        Args:
	            x (torch.Tensor): shape (batch_size, channels, time)
	            mask (torch.Tensor): shape (batch_size, 1, time)
	            mu (torch.Tensor): shape (batch_size, channels, time); concatenated
	                with ``x``.
	            t (torch.Tensor): shape (batch_size)
	            spks (torch.Tensor, optional): shape (batch_size, condition_channels);
	                repeated along time and concatenated. Defaults to None.
	            cond (optional): accepted but not used by this class. Defaults to None.

	        Returns:
	            torch.Tensor: shape (batch_size, out_channels, time), multiplied
	            by ``mask``.
	        """
	        t = self.time_embeddings(t)
	        t = self.time_mlp(t)

	        x = pack([x, mu], "b * t")[0]

	        if spks is not None:
	            spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
	            x = pack([x, spks], "b * t")[0]

	        hiddens = []
	        masks = [mask]
	        for resnet, transformer_blocks, downsample in self.down_blocks:
	            mask_down = masks[-1]
	            x = resnet(x, mask_down, t)
	            x = self._run_attention(x, mask_down, t, transformer_blocks)
	            hiddens.append(x)  # Save hidden states for skip connections
	            x = downsample(x * mask_down)
	            masks.append(mask_down[:, :, ::2])

	        # The last down block does not change the resolution, so the mask
	        # appended for it is dropped.
	        masks = masks[:-1]
	        mask_mid = masks[-1]

	        for resnet, transformer_blocks in self.mid_blocks:
	            x = resnet(x, mask_mid, t)
	            x = self._run_attention(x, mask_mid, t, transformer_blocks)

	        for resnet, transformer_blocks, upsample in self.up_blocks:
	            mask_up = masks.pop()
	            skip = hiddens.pop()
	            # The transposed convolution doubles the length, which overshoots
	            # by one frame when the matching skip tensor has an odd length;
	            # crop so the two can be concatenated.
	            x = pack([x[:, :, : skip.shape[-1]], skip], "b * t")[0]
	            x = resnet(x, mask_up, t)
	            x = self._run_attention(x, mask_up, t, transformer_blocks)
	            x = upsample(x * mask_up)

	        x = self.final_block(x, mask_up)
	        output = self.final_proj(x * mask_up)

	        return output * mask


	class ConditionalDecoder(nn.Module):
	    """1D U-Net style decoder with time conditioning and an extra ``cond`` input.

	    This decoder requires an input with the same shape as the target. If the
	    text content is shorter or longer than the outputs, resample it before
	    feeding it to the decoder.

	    Every block is a ``ResnetBlock1D`` followed by ``n_blocks``
	    ``BasicTransformerBlock`` modules; down/up blocks additionally end with a
	    resampling layer (a plain kernel-3 convolution for the last block of each
	    path).

	    Args:
	        in_channels: channel count of the concatenated input seen by the
	            first resnet block; also used as the sinusoidal time-embedding
	            size, so it must be even.
	        out_channels: channel count produced by the final projection.
	        channels: output width of each down block; the up path mirrors it.
	        dropout: dropout rate forwarded to the transformer blocks.
	        attention_head_dim: per-head dimension of the transformer blocks.
	        n_blocks: number of transformer blocks after each resnet block.
	        num_mid_blocks: number of mid blocks.
	        num_heads: number of attention heads.
	        act_fn: activation name forwarded to ``BasicTransformerBlock``.
	    """

	    def __init__(
	        self,
	        in_channels,
	        out_channels,
	        channels=(256, 256),
	        dropout=0.05,
	        attention_head_dim=64,
	        n_blocks=1,
	        num_mid_blocks=2,
	        num_heads=4,
	        act_fn="snake",
	    ):
	        super().__init__()
	        channels = tuple(channels)
	        self.in_channels = in_channels
	        self.out_channels = out_channels

	        self.time_embeddings = SinusoidalPosEmb(in_channels)
	        time_embed_dim = channels[0] * 4
	        self.time_mlp = TimestepEmbedding(
	            in_channels=in_channels,
	            time_embed_dim=time_embed_dim,
	            act_fn="silu",
	        )
	        self.down_blocks = nn.ModuleList([])
	        self.mid_blocks = nn.ModuleList([])
	        self.up_blocks = nn.ModuleList([])

	        def make_stack(dim):
	            # `n_blocks` transformer blocks of width `dim`.
	            return nn.ModuleList(
	                [
	                    BasicTransformerBlock(
	                        dim=dim,
	                        num_attention_heads=num_heads,
	                        attention_head_dim=attention_head_dim,
	                        dropout=dropout,
	                        activation_fn=act_fn,
	                    )
	                    for _ in range(n_blocks)
	                ]
	            )

	        # ---- down path -------------------------------------------------
	        output_channel = in_channels
	        last_down = len(channels) - 1
	        for i, width in enumerate(channels):
	            input_channel = output_channel
	            output_channel = width
	            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
	            transformer_blocks = make_stack(output_channel)
	            if i == last_down:
	                # The last down block keeps the temporal resolution.
	                downsample = nn.Conv1d(output_channel, output_channel, 3, padding=1)
	            else:
	                downsample = Downsample1D(output_channel)
	            self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))

	        # ---- mid path --------------------------------------------------
	        for _ in range(num_mid_blocks):
	            input_channel = channels[-1]
	            # The original assigned `out_channels` here (a dead store); the
	            # intended target is `output_channel`, which already had this value.
	            output_channel = channels[-1]
	            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
	            transformer_blocks = make_stack(output_channel)
	            self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))

	        # ---- up path ---------------------------------------------------
	        channels = channels[::-1] + (channels[0],)
	        last_up = len(channels) - 2
	        for i, (skip_width, output_channel) in enumerate(zip(channels[:-1], channels[1:])):
	            input_channel = skip_width * 2  # doubled by the concatenated skip connection
	            resnet = ResnetBlock1D(
	                dim=input_channel,
	                dim_out=output_channel,
	                time_emb_dim=time_embed_dim,
	            )
	            transformer_blocks = make_stack(output_channel)
	            if i == last_up:
	                upsample = nn.Conv1d(output_channel, output_channel, 3, padding=1)
	            else:
	                upsample = Upsample1D(output_channel, use_conv_transpose=True)
	            self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))

	        self.final_block = Block1D(channels[-1], channels[-1])
	        self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
	        self.initialize_weights()

	    def initialize_weights(self):
	        """Kaiming-initialise Conv1d/Linear weights and reset GroupNorm to identity."""
	        for m in self.modules():
	            if isinstance(m, (nn.Conv1d, nn.Linear)):
	                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
	                if m.bias is not None:
	                    nn.init.constant_(m.bias, 0)
	            elif isinstance(m, nn.GroupNorm):
	                nn.init.constant_(m.weight, 1)
	                nn.init.constant_(m.bias, 0)

	    @staticmethod
	    def _run_attention(x, mask, t, transformer_blocks):
	        """Apply ``transformer_blocks`` to ``x`` (b, c, t) using ``mask`` (b, 1, t)."""
	        x = rearrange(x, "b c t -> b t c").contiguous()
	        # Outer product of the mask with itself: (b, t, 1) @ (b, 1, t) -> (b, t, t).
	        attn_mask = torch.matmul(mask.transpose(1, 2).contiguous(), mask)
	        for transformer_block in transformer_blocks:
	            x = transformer_block(
	                hidden_states=x,
	                attention_mask=attn_mask,
	                timestep=t,
	            )
	        return rearrange(x, "b t c -> b c t").contiguous()

	    def forward(self, x, mask, mu, t, spks=None, cond=None):
	        """Run the decoder.

	        ``x``, ``mu`` and (if given) ``spks`` and ``cond`` are concatenated
	        along the channel axis, so their channel counts must add up to the
	        ``in_channels`` the model was built with.

	        Args:
	            x (torch.Tensor): shape (batch_size, channels, time)
	            mask (torch.Tensor): shape (batch_size, 1, time)
	            mu (torch.Tensor): shape (batch_size, channels, time); concatenated
	                with ``x``.
	            t (torch.Tensor): shape (batch_size)
	            spks (torch.Tensor, optional): shape (batch_size, condition_channels);
	                repeated along time and concatenated. Defaults to None.
	            cond (torch.Tensor, optional): shape (batch_size, channels, time);
	                concatenated to the input when given. Defaults to None.

	        Returns:
	            torch.Tensor: shape (batch_size, out_channels, time), multiplied
	            by ``mask``.
	        """
	        # Cast the sinusoidal embedding back to the dtype of the incoming `t`.
	        t = self.time_embeddings(t).to(t.dtype)
	        t = self.time_mlp(t)

	        x = pack([x, mu], "b * t")[0]

	        if spks is not None:
	            spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
	            x = pack([x, spks], "b * t")[0]
	        if cond is not None:
	            x = pack([x, cond], "b * t")[0]

	        hiddens = []
	        masks = [mask]
	        for resnet, transformer_blocks, downsample in self.down_blocks:
	            mask_down = masks[-1]
	            x = resnet(x, mask_down, t)
	            x = self._run_attention(x, mask_down, t, transformer_blocks)
	            hiddens.append(x)  # Save hidden states for skip connections
	            x = downsample(x * mask_down)
	            masks.append(mask_down[:, :, ::2])

	        # The last down block does not change the resolution, so the mask
	        # appended for it is dropped.
	        masks = masks[:-1]
	        mask_mid = masks[-1]

	        for resnet, transformer_blocks in self.mid_blocks:
	            x = resnet(x, mask_mid, t)
	            x = self._run_attention(x, mask_mid, t, transformer_blocks)

	        for resnet, transformer_blocks, upsample in self.up_blocks:
	            mask_up = masks.pop()
	            skip = hiddens.pop()
	            # Crop to the skip length: upsampling can overshoot by one frame.
	            x = pack([x[:, :, : skip.shape[-1]], skip], "b * t")[0]
	            x = resnet(x, mask_up, t)
	            x = self._run_attention(x, mask_up, t, transformer_blocks)
	            x = upsample(x * mask_up)

	        x = self.final_block(x, mask_up)
	        output = self.final_proj(x * mask_up)
	        return output * mask