# MegaTTS3 / tts/modules/wavvae/decoder/hifigan_modules.py
# (HuggingFace page residue preserved as comments so the file parses:)
# ZiyueJiang's picture
# first commit for huggingface space
# 593f3bc
# Copyright 2025 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch.nn as nn
import torch.nn.functional as F
import torch
import torch.utils.data
from librosa.filters import mel as librosa_mel_fn
from torch.nn.utils import weight_norm, remove_weight_norm
from torch.nn import Conv1d
import numpy as np
def init_weights(m, mean=0.0, std=0.01):
    """Initialise the weights of any Conv-like module from N(mean, std).

    Modules whose class name does not contain "Conv" are left untouched;
    intended to be used with ``nn.Module.apply``.
    """
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)
def get_padding(kernel_size, dilation=1):
    """Padding that keeps a stride-1 dilated conv's output length equal to its input."""
    return (kernel_size - 1) * dilation // 2
class Upsample(nn.Module):
    """Upsample the time axis by rate ``r`` while halving the channel count.

    Two parallel paths are summed: nearest-neighbour interpolation followed
    by a reflection-padded conv, and a transposed conv.  The input first
    passes through ``x + sin(x)`` (a snake-like periodic activation).
    """

    def __init__(self, mult, r):
        super(Upsample, self).__init__()
        self.r = r
        self.upsample = nn.Sequential(
            nn.Upsample(mode="nearest", scale_factor=r),
            nn.LeakyReLU(0.2),
            nn.ReflectionPad1d(3),
            nn.utils.weight_norm(nn.Conv1d(mult, mult // 2, kernel_size=7, stride=1)),
        )
        # Transposed-conv path; the kernel is at least 10 taps wide so that
        # small rates still get overlapping contributions.
        r_kernel = max(r, 5)
        self.trans_upsample = nn.Sequential(
            nn.LeakyReLU(0.2),
            nn.utils.weight_norm(
                nn.ConvTranspose1d(
                    mult,
                    mult // 2,
                    kernel_size=r_kernel * 2,
                    stride=r,
                    padding=r_kernel - r // 2,
                    output_padding=r % 2,
                )
            ),
        )

    def forward(self, x):
        x = x + torch.sin(x)
        y_interp = self.upsample(x)
        y_trans = self.trans_upsample(x)
        return y_interp + y_trans
class Downsample(nn.Module):
    """Downsample the time axis by rate ``r`` while doubling the channel count.

    Single path: LeakyReLU followed by a weight-normalised strided conv
    (mirror of the transposed-conv path in ``Upsample``).
    """

    def __init__(self, mult, r):
        super(Downsample, self).__init__()
        self.r = r
        # Kernel is at least 10 taps wide, matching Upsample's choice.
        kernel = max(r, 5)
        self.trans_downsample = nn.Sequential(
            nn.LeakyReLU(0.2),
            nn.utils.weight_norm(
                nn.Conv1d(
                    mult,
                    mult * 2,
                    kernel_size=kernel * 2,
                    stride=r,
                    padding=kernel - r // 2,
                )
            ),
        )

    def forward(self, x):
        return self.trans_downsample(x)
def weights_init(m):
    """DCGAN-style init: Conv weights ~ N(0, 0.02); BatchNorm2d weight ~ N(1, 0.02), bias 0.

    All other module types are left untouched; use with ``nn.Module.apply``.
    """
    name = type(m).__name__
    if "Conv" in name:
        m.weight.data.normal_(0.0, 0.02)
    elif "BatchNorm2d" in name:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)
def weights_zero_init(m):
    """Zero-initialise Conv weights (and bias, when present).

    Applied to the final projection so the network starts by emitting
    silence; use with ``nn.Module.apply``.
    """
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.fill_(0.0)
        # Fix: conv layers built with bias=False have m.bias is None;
        # the unguarded fill_ raised AttributeError on them.
        if m.bias is not None:
            m.bias.data.fill_(0.0)
def WNConv1d(*args, **kwargs):
    """Build a Conv1d and wrap it in weight normalisation."""
    conv = nn.Conv1d(*args, **kwargs)
    return weight_norm(conv)
def WNConvTranspose1d(*args, **kwargs):
    """Build a ConvTranspose1d and wrap it in weight normalisation."""
    deconv = nn.ConvTranspose1d(*args, **kwargs)
    return weight_norm(deconv)
class Audio2Mel(nn.Module):
    """Waveform -> normalised log-mel spectrogram front-end.

    Produces a log10-magnitude mel spectrogram rescaled to [-4, 4]
    (assuming roughly a 115 dB dynamic range).

    Args:
        hop_length: STFT hop in samples.
        sampling_rate: audio sample rate in Hz.
        n_mel_channels: number of mel bins.
        mel_fmin: lowest mel filterbank frequency in Hz.
        mel_fmax: highest mel filterbank frequency in Hz (None = Nyquist).
        frame_size: analysis window length in seconds; n_fft is the next
            power of two >= the window length in samples.
        device: kept for interface compatibility (unused here; buffers
            follow the module's .to(device)).
    """

    def __init__(
        self,
        hop_length=300,
        sampling_rate=24000,
        n_mel_channels=80,
        mel_fmin=0.,
        mel_fmax=None,
        frame_size=0.05,
        device='cpu'
    ):
        super().__init__()
        ##############################################
        #                FFT Parameters              #
        ##############################################
        win_length = int(sampling_rate * frame_size)
        # n_fft: smallest power of two covering the analysis window.
        self.n_fft = int(2 ** np.ceil(np.log2(sampling_rate * frame_size)))
        window = torch.hann_window(win_length).float()
        # Fix: librosa >= 0.10 removed the positional signature of
        # librosa.filters.mel; keyword arguments work on old and new versions.
        mel_basis = librosa_mel_fn(
            sr=sampling_rate, n_fft=self.n_fft, n_mels=n_mel_channels,
            fmin=mel_fmin, fmax=mel_fmax
        )  # Mel filter (by librosa)
        mel_basis = torch.from_numpy(mel_basis).float()
        # Buffers are saved in state_dict and move with the module.
        self.register_buffer("mel_basis", mel_basis)
        self.register_buffer("window", window)
        self.hop_length = hop_length
        self.win_length = win_length
        self.sampling_rate = sampling_rate
        self.n_mel_channels = n_mel_channels

    def forward(self, audio):
        """audio: (B, 1, T) waveform -> (B, n_mel_channels, frames), values in [-4, 4]."""
        # Fix: return_complex is mandatory since torch 2.0; the old
        # real/imag-pair layout that `fft.unbind(-1)` relied on is gone.
        fft = torch.stft(
            audio.squeeze(1),
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            win_length=self.win_length,
            window=self.window,
            center=True,
            return_complex=True,
        )
        # Clamp before the sqrt so gradients stay finite at zero magnitude.
        magnitude = torch.sqrt(torch.clamp(fft.real ** 2 + fft.imag ** 2, min=1e-5))
        mel_output = torch.matmul(self.mel_basis, magnitude)
        # 20*log10(.) - 20: dB relative to full scale, floored at -120 dB.
        log_mel_spec = 20 * torch.log10(torch.clamp(mel_output, min=1e-5)) - 20
        # Map [-115, 0] dB -> [0, 1], then to [-4, 4], clamped.
        norm_mel = (log_mel_spec + 115.) / 115.
        mel_comp = torch.clamp(norm_mel * 8. - 4., -4., 4.)
        return mel_comp
class ResnetBlock(nn.Module):
    """Residual block: LReLU -> dilated 3-tap conv -> LReLU -> 1x1 conv,
    added to a 1x1-conv shortcut.  ``dim_in`` defaults to ``dim`` so the
    block is channel-preserving unless stated otherwise.
    """

    def __init__(self, dim, dilation=1, dim_in=None):
        super().__init__()
        dim_in = dim if dim_in is None else dim_in
        self.block = nn.Sequential(
            nn.LeakyReLU(0.2),
            nn.ReflectionPad1d(dilation),
            WNConv1d(dim_in, dim, kernel_size=3, dilation=dilation),
            nn.LeakyReLU(0.2),
            WNConv1d(dim, dim, kernel_size=1),
        )
        self.shortcut = WNConv1d(dim_in, dim, kernel_size=1)

    def forward(self, x):
        residual = self.block(x)
        return self.shortcut(x) + residual
'''
Follows the HiFi-GAN (https://arxiv.org/pdf/2010.05646.pdf) v2 architecture.
"Multi-scale" here mainly means different kernel sizes: three parallel convolution
branches, each using a different series of dilation sizes internally, interleaved
with ordinary (no-dilation) convolution layers.
'''
class ResBlockMRFV2(torch.nn.Module):
    """One multi-receptive-field residual block, after HiFi-GAN v2.

    Three sequential residual stages: stage i applies a conv with
    ``dilation[i]`` followed by an undilated conv of the same kernel size,
    each preceded by LeakyReLU(0.2), with an identity skip connection.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlockMRFV2, self).__init__()

        def _wn_conv(d):
            # "Same"-padded, weight-normalised conv at dilation d.
            return weight_norm(Conv1d(channels, channels, kernel_size, 1,
                                      dilation=d,
                                      padding=get_padding(kernel_size, d)))

        self.convs1 = nn.ModuleList([_wn_conv(d) for d in dilation])
        self.convs1.apply(init_weights)
        self.convs2 = nn.ModuleList([_wn_conv(1) for _ in dilation])
        self.convs2.apply(init_weights)

    def forward(self, x):
        for dilated, plain in zip(self.convs1, self.convs2):
            h = plain(F.leaky_relu(dilated(F.leaky_relu(x, 0.2)), 0.2))
            x = h + x
        return x

    def remove_weight_norm(self):
        # The bare name below resolves to the module-level
        # torch.nn.utils.remove_weight_norm import, not to this method.
        for conv in list(self.convs1) + list(self.convs2):
            remove_weight_norm(conv)
class ResBlockMRFV2Inter(torch.nn.Module):
    """Average of three parallel ResBlockMRFV2 branches with kernel sizes
    (kernel_size, 7, 11) — the HiFi-GAN multi-receptive-field fusion.

    Fix: ``kernel_size`` was previously accepted but silently ignored
    (the first branch always used the default 3); it is now forwarded to
    that branch.  The default value preserves the old behaviour.
    """

    def __init__(self, channels, kernel_size=3):
        super(ResBlockMRFV2Inter, self).__init__()
        self.block1 = ResBlockMRFV2(channels, kernel_size)
        self.block2 = ResBlockMRFV2(channels, 7)
        self.block3 = ResBlockMRFV2(channels, 11)

    def forward(self, x):
        # Mean of the three branch outputs.
        xs = self.block1(x)
        xs += self.block2(x)
        xs += self.block3(x)
        return xs / 3
class Generator(nn.Module):
    """HiFi-GAN-style vocoder generator: frame-level features -> waveform bands.

    Structure: reflection-padded input conv, then one ``Upsample`` stage plus a
    ``ResBlockMRFV2Inter`` MRF stack per entry of ``ratios``, then an output
    projection limited by Tanh (or, when ``args.use_tanh`` is false, a
    zero-initialised projection followed by a learned 1x1 conv).

    NOTE(review): ``n_residual_layers`` and ``device`` are accepted but never
    read in this constructor, ``step`` is unused in ``forward``, and
    ``ratios=[5, 5, 4, 3]`` is a mutable default argument (harmless here since
    it is only read) — confirm before relying on any of them.
    """

    def __init__(self, input_size_, ngf, n_residual_layers, num_band, args, ratios=[5, 5, 4, 3], onnx_export=False,
                 device='cpu'):
        super().__init__()
        self.hop_length = args.frame_shift
        self.args = args
        self.onnx_export = onnx_export
        # ------------- Define upsample layers ----------------
        # Channel multiplier: starts at 2**len(ratios) and is halved after
        # every upsample stage, ending at 1 (i.e. ngf channels).
        mult = int(2 ** len(ratios))
        model_up = []
        input_size = input_size_
        model_up += [
            nn.ReflectionPad1d(3),
            WNConv1d(input_size, mult * ngf, kernel_size=7, padding=0),
        ]
        # Upsample to raw audio scale
        for i, r in enumerate(ratios):
            # Upsample halves the channel count, so the MRF stack that
            # follows it runs at mult * ngf // 2 channels.
            model_up += [Upsample(mult * ngf, r)]
            model_up += [ResBlockMRFV2Inter(mult * ngf // 2)]
            mult //= 2
        model_up += [
            nn.LeakyReLU(0.2),
            nn.ReflectionPad1d(3),
            WNConv1d(ngf, num_band, kernel_size=7, padding=0),
            nn.Tanh(),
        ]
        if not args.use_tanh:
            # Replace the final Tanh with a learned 1x1 conv, and zero-init
            # the preceding projection (model_up[-2] is the WNConv1d above)
            # so the generator initially outputs silence.
            model_up[-1] = nn.Conv1d(num_band, num_band, 1)
            model_up[-2].apply(weights_zero_init)
        self.model_up = nn.Sequential(*model_up)
        self.apply(weights_init)

    def forward(self, mel, step=None):
        # mel input: (batch_size, seq_num, 80)
        if self.onnx_export:
            mel = mel.transpose(1, 2)
            # on onnx, for engineering, mel input: (batch_size, 80, seq_num)
        # Between Down and up
        x = mel
        # Upsample pipline
        # NOTE(review): cnt_after_upsample is counted but never used.
        cnt_after_upsample = 0
        for i, m in enumerate(self.model_up):
            x = m(x)
            if type(m) == Upsample:
                cnt_after_upsample += 1
        return x