# -*- coding: utf-8 -*-
# Copyright 2020 The TensorFlowTTS Team and Tomoki Hayashi (@kan-bayashi)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Parallel-wavegan Modules. Based on pytorch implementation (https://github.com/kan-bayashi/ParallelWaveGAN)"""
import tensorflow as tf
from tensorflow_tts.models import BaseModel
def get_initializer(initializer_seed=42):
"""Creates a `tf.initializers.he_normal` with the given seed.
Args:
initializer_seed: int, initializer seed.
Returns:
HeNormal initializer with seed = `initializer_seed`.
"""
return tf.keras.initializers.he_normal(seed=initializer_seed)
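# Note: `he_normal` draws weights from a truncated normal distribution with
# stddev = sqrt(2 / fan_in), a common choice for the ReLU and gated
# activations used in the modules below.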
class TFConv1d1x1(tf.keras.layers.Conv1D):
"""1x1 Conv1d with customized initialization."""
def __init__(self, filters, use_bias, padding, initializer_seed, **kwargs):
"""Initialize 1x1 Conv1d module."""
super().__init__(
filters=filters,
kernel_size=1,
strides=1,
padding=padding,
dilation_rate=1,
use_bias=use_bias,
kernel_initializer=get_initializer(initializer_seed),
**kwargs,
)
class TFConv1d(tf.keras.layers.Conv1D):
"""Conv1d with customized initialization."""
def __init__(self, *args, **kwargs):
"""Initialize Conv1d module."""
initializer_seed = kwargs.pop("initializer_seed", 42)
super().__init__(
*args, **kwargs, kernel_initializer=get_initializer(initializer_seed)
)
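# Both convolution wrappers above only customize the kernel initializer; an
# illustrative call (argument values are examples, not from a config):
#
#     conv = TFConv1d(filters=64, kernel_size=3, padding="same",
#                     dilation_rate=2, initializer_seed=42)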
class TFResidualBlock(tf.keras.layers.Layer):
"""Residual block module in WaveNet."""
def __init__(
self,
kernel_size=3,
residual_channels=64,
gate_channels=128,
skip_channels=64,
aux_channels=80,
dropout_rate=0.0,
dilation_rate=1,
use_bias=True,
use_causal_conv=False,
initializer_seed=42,
**kwargs,
):
"""Initialize ResidualBlock module.
Args:
kernel_size (int): Kernel size of dilation convolution layer.
residual_channels (int): Number of channels for residual connection.
skip_channels (int): Number of channels for skip connection.
aux_channels (int): Local conditioning channels i.e. auxiliary input dimension.
dropout_rate (float): Dropout probability.
dilation_rate (int): Dilation factor.
use_bias (bool): Whether to add bias parameter in convolution layers.
use_causal_conv (bool): Whether to use use_causal_conv or non-use_causal_conv convolution.
initializer_seed (int32): initializer seed.
"""
super().__init__(**kwargs)
self.dropout_rate = dropout_rate
# no future time stamps available
self.use_causal_conv = use_causal_conv
# dilation conv
self.conv = TFConv1d(
filters=gate_channels,
kernel_size=kernel_size,
padding="same" if self.use_causal_conv is False else "causal",
strides=1,
dilation_rate=dilation_rate,
use_bias=use_bias,
initializer_seed=initializer_seed,
)
        # local conditioning
if aux_channels > 0:
self.conv1x1_aux = TFConv1d1x1(
gate_channels,
use_bias=False,
padding="same",
initializer_seed=initializer_seed,
name="conv1x1_aux",
)
else:
self.conv1x1_aux = None
        # the conv output is split into two halves for the gated activation;
        # Keras infers input channel sizes, so no explicit split size is needed
self.conv1x1_out = TFConv1d1x1(
residual_channels,
use_bias=use_bias,
padding="same",
initializer_seed=initializer_seed,
name="conv1x1_out",
)
self.conv1x1_skip = TFConv1d1x1(
skip_channels,
use_bias=use_bias,
padding="same",
initializer_seed=initializer_seed,
name="conv1x1_skip",
)
self.dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
def call(self, x, c, training=False):
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, residual_channels, T).
c (Tensor): Local conditioning auxiliary tensor (B, aux_channels, T).
Returns:
Tensor: Output tensor for residual connection (B, T, residual_channels).
Tensor: Output tensor for skip connection (B, T, skip_channels).
"""
residual = x
x = self.dropout(x, training=training)
x = self.conv(x)
        # split into two parts for the gated activation
xa, xb = tf.split(x, 2, axis=-1)
# local conditioning
if c is not None:
assert self.conv1x1_aux is not None
c = self.conv1x1_aux(c)
ca, cb = tf.split(c, 2, axis=-1)
xa, xb = xa + ca, xb + cb
x = tf.nn.tanh(xa) * tf.nn.sigmoid(xb)
# for skip connection
s = self.conv1x1_skip(x)
# for residual connection
x = self.conv1x1_out(x)
x = (x + residual) * tf.math.sqrt(0.5)
return x, s
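# Illustrative usage of a single residual block (shapes assume the default
# hyper-parameters above, not a particular config): a hidden signal
# [B, T, residual_channels] and conditioning [B, T, aux_channels] yield a
# residual output and a skip output of the same length. The
# (x + residual) * sqrt(0.5) scaling keeps the activation variance roughly
# constant as blocks are stacked.
#
#     block = TFResidualBlock(dilation_rate=2)
#     x = tf.random.normal([2, 100, 64])       # [B, T, residual_channels]
#     c = tf.random.normal([2, 100, 80])       # [B, T, aux_channels]
#     res, skip = block(x, c, training=False)  # both [B, T, 64]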
class TFStretch1d(tf.keras.layers.Layer):
"""Stretch2d module."""
def __init__(self, x_scale, y_scale, method="nearest", **kwargs):
"""Initialize Stretch2d module.
Args:
x_scale (int): X scaling factor (Time axis in spectrogram).
y_scale (int): Y scaling factor (Frequency axis in spectrogram).
method (str): Interpolation method.
"""
super().__init__(**kwargs)
self.x_scale = x_scale
self.y_scale = y_scale
self.method = method
def call(self, x):
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, T, C, 1).
Returns:
Tensor: Interpolated tensor (B, T * x_scale, C * y_scale, 1)
"""
x_shape = tf.shape(x)
new_size = (x_shape[1] * self.x_scale, x_shape[2] * self.y_scale)
x = tf.image.resize(x, method=self.method, size=new_size)
return x
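# A minimal sketch of the stretch layer's effect (shapes are assumptions):
# with x_scale=4, y_scale=1, and method="nearest", an input of shape
# [2, 100, 80, 1] is resized to [2, 400, 80, 1], i.e. each mel frame is
# repeated 4 times along the time axis.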
class TFUpsampleNetWork(tf.keras.layers.Layer):
"""Upsampling network module."""
def __init__(
self,
output_channels,
upsample_scales,
nonlinear_activation=None,
nonlinear_activation_params={},
interpolate_mode="nearest",
freq_axis_kernel_size=1,
use_causal_conv=False,
**kwargs,
):
"""Initialize upsampling network module.
Args:
output_channels (int): output feature channels.
upsample_scales (list): List of upsampling scales.
nonlinear_activation (str): Activation function name.
nonlinear_activation_params (dict): Arguments for specified activation function.
interpolate_mode (str): Interpolation mode.
freq_axis_kernel_size (int): Kernel size in the direction of frequency axis.
"""
super().__init__(**kwargs)
self.use_causal_conv = use_causal_conv
self.up_layers = []
for scale in upsample_scales:
# interpolation layer
stretch = TFStretch1d(
scale, 1, interpolate_mode, name="stretch_._{}".format(scale)
) # ->> outputs: [B, T * scale, C * 1, 1]
self.up_layers += [stretch]
# conv layer
            assert (
                freq_axis_kernel_size - 1
            ) % 2 == 0, "Even-numbered frequency axis kernel sizes are not supported."
            kernel_size = scale * 2 + 1
            if self.use_causal_conv:
                # Keras Conv2D does not accept "causal" padding, so emulate it:
                # left-pad the time axis and center-pad the frequency axis,
                # then apply a "valid" convolution.
                pad = ((kernel_size - 1, 0), ((freq_axis_kernel_size - 1) // 2,) * 2)
                self.up_layers += [tf.keras.layers.ZeroPadding2D(padding=pad)]
            conv = tf.keras.layers.Conv2D(
                filters=1,
                kernel_size=(kernel_size, freq_axis_kernel_size),
                padding="valid" if self.use_causal_conv else "same",
                use_bias=False,
            )  # ->> outputs: [B, T * scale, C, 1]
            self.up_layers += [conv]
# nonlinear
if nonlinear_activation is not None:
nonlinear = getattr(tf.keras.layers, nonlinear_activation)(
**nonlinear_activation_params
)
self.up_layers += [nonlinear]
def call(self, c):
"""Calculate forward propagation.
Args:
            c (Tensor): Input tensor (B, T, C).
Returns:
Tensor: Upsampled tensor (B, T', C), where T' = T * prod(upsample_scales).
"""
c = tf.expand_dims(c, -1) # [B, T, C, 1]
for f in self.up_layers:
c = f(c)
return tf.squeeze(c, -1) # [B, T, C]
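# Note (assumption, following the usual ParallelWaveGAN setup): the product of
# `upsample_scales` should equal the feature hop size, e.g. [4, 4, 4, 4] for
# hop_size=256, so the upsampled conditioning aligns with the waveform length.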
class TFConvInUpsampleNetWork(tf.keras.layers.Layer):
"""Convolution + upsampling network module."""
def __init__(
self,
upsample_scales,
nonlinear_activation=None,
nonlinear_activation_params={},
interpolate_mode="nearest",
freq_axis_kernel_size=1,
aux_channels=80,
aux_context_window=0,
use_causal_conv=False,
initializer_seed=42,
**kwargs,
):
"""Initialize convolution + upsampling network module.
Args:
upsample_scales (list): List of upsampling scales.
nonlinear_activation (str): Activation function name.
nonlinear_activation_params (dict): Arguments for specified activation function.
mode (str): Interpolation mode.
freq_axis_kernel_size (int): Kernel size in the direction of frequency axis.
aux_channels (int): Number of channels of pre-convolutional layer.
aux_context_window (int): Context window size of the pre-convolutional layer.
use_causal_conv (bool): Whether to use causal structure.
"""
super().__init__(**kwargs)
self.aux_context_window = aux_context_window
self.use_causal_conv = use_causal_conv and aux_context_window > 0
# To capture wide-context information in conditional features
kernel_size = (
aux_context_window + 1 if use_causal_conv else 2 * aux_context_window + 1
)
self.conv_in = TFConv1d(
filters=aux_channels,
kernel_size=kernel_size,
padding="same",
use_bias=False,
initializer_seed=initializer_seed,
name="conv_in",
)
self.upsample = TFUpsampleNetWork(
output_channels=aux_channels,
upsample_scales=upsample_scales,
nonlinear_activation=nonlinear_activation,
nonlinear_activation_params=nonlinear_activation_params,
interpolate_mode=interpolate_mode,
freq_axis_kernel_size=freq_axis_kernel_size,
use_causal_conv=use_causal_conv,
name="upsample_network",
)
def call(self, c):
"""Calculate forward propagation.
Args:
c : Input tensor (B, T', C).
Returns:
Tensor: Upsampled tensor (B, T, C),
where T = (T' - aux_context_window * 2) * prod(upsample_scales).
Note:
The length of inputs considers the context window size.
"""
c_ = self.conv_in(c)
return self.upsample(c_)
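# A minimal usage sketch (the argument values are assumptions, not from a
# particular config):
#
#     net = TFConvInUpsampleNetWork(
#         upsample_scales=[4, 4, 4, 4], aux_context_window=2
#     )
#     c = tf.random.normal([2, 100, 80])   # [B, T', aux_channels]
#     c_up = net(c)                        # [B, 100 * 256, 80]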
class TFParallelWaveGANGenerator(BaseModel):
"""Parallel WaveGAN Generator module."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.out_channels = config.out_channels
self.aux_channels = config.aux_channels
self.n_layers = config.n_layers
self.stacks = config.stacks
self.kernel_size = config.kernel_size
self.upsample_params = config.upsample_params
# check the number of layers and stacks
assert self.n_layers % self.stacks == 0
n_layers_per_stack = self.n_layers // self.stacks
# define first convolution
self.first_conv = TFConv1d1x1(
filters=config.residual_channels,
use_bias=True,
padding="same",
initializer_seed=config.initializer_seed,
name="first_convolution",
)
# define conv + upsampling network
if config.upsample_conditional_features:
self.upsample_params.update({"use_causal_conv": config.use_causal_conv})
self.upsample_params.update(
{
"aux_channels": config.aux_channels,
"aux_context_window": config.aux_context_window,
}
)
self.upsample_net = TFConvInUpsampleNetWork(**self.upsample_params)
else:
self.upsample_net = None
# define residual blocks
self.conv_layers = []
for layer in range(self.n_layers):
dilation_rate = 2 ** (layer % n_layers_per_stack)
conv = TFResidualBlock(
kernel_size=config.kernel_size,
residual_channels=config.residual_channels,
gate_channels=config.gate_channels,
skip_channels=config.skip_channels,
aux_channels=config.aux_channels,
dilation_rate=dilation_rate,
dropout_rate=config.dropout_rate,
use_bias=config.use_bias,
use_causal_conv=config.use_causal_conv,
initializer_seed=config.initializer_seed,
name="residual_block_._{}".format(layer),
)
self.conv_layers += [conv]
# define output layers
self.last_conv_layers = [
tf.keras.layers.ReLU(),
TFConv1d1x1(
filters=config.skip_channels,
use_bias=config.use_bias,
padding="same",
initializer_seed=config.initializer_seed,
),
tf.keras.layers.ReLU(),
TFConv1d1x1(
filters=config.out_channels,
use_bias=True,
padding="same",
initializer_seed=config.initializer_seed,
),
tf.keras.layers.Activation("tanh"),
]
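        # A sketch of the receptive-field arithmetic (general formula, not a
        # config value): each block widens the receptive field by
        # (kernel_size - 1) * dilation_rate, with dilations cycling
        # 1, 2, 4, ... within each stack, so
        #     receptive_field = (kernel_size - 1) * sum(dilations) + 1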
def _build(self):
mels = tf.random.uniform(shape=[2, 20, 80], dtype=tf.float32)
        self(mels, training=True)
def call(self, mels, training=False, **kwargs):
"""Calculate forward propagation.
Args:
mels (Tensor): Local conditioning auxiliary features (B, T', C).
Returns:
Tensor: Output tensor (B, T, 1)
"""
        # perform upsampling; when no upsampling network is defined, the
        # auxiliary features are assumed to be at waveform resolution already
        if mels is not None and self.upsample_net is not None:
            c = self.upsample_net(mels)
        else:
            c = mels
        # encode random noise x into a hidden representation
        x = tf.expand_dims(tf.random.normal(shape=tf.shape(c)[0:2]), axis=2)
x = self.first_conv(x)
skips = 0
for f in self.conv_layers:
x, h = f(x, c, training=training)
skips += h
skips *= tf.math.sqrt(1.0 / len(self.conv_layers))
# apply final layers
x = skips
for f in self.last_conv_layers:
x = f(x)
return x
@tf.function(
experimental_relax_shapes=True,
input_signature=[
tf.TensorSpec(shape=[None, None, 80], dtype=tf.float32, name="mels"),
],
)
def inference(self, mels):
"""Calculate forward propagation.
Args:
c (Tensor): Local conditioning auxiliary features (B, T', C).
Returns:
Tensor: Output tensor (B, T, 1)
"""
        # perform upsampling; when no upsampling network is defined, the
        # auxiliary features are assumed to be at waveform resolution already
        if mels is not None and self.upsample_net is not None:
            c = self.upsample_net(mels)
        else:
            c = mels
        # encode random noise into a hidden representation
        x = tf.expand_dims(tf.random.normal(shape=tf.shape(c)[0:2]), axis=2)
x = self.first_conv(x)
skips = 0
for f in self.conv_layers:
x, h = f(x, c, training=False)
skips += h
skips *= tf.math.sqrt(1.0 / len(self.conv_layers))
# apply final layers
x = skips
for f in self.last_conv_layers:
x = f(x)
return x
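# Illustrative end-to-end usage (`config` is a hypothetical generator-config
# object in the TensorFlowTTS style; shapes assume hop_size=256):
#
#     generator = TFParallelWaveGANGenerator(config)
#     generator._build()                        # build weights on dummy mels
#     mels = tf.random.normal([1, 80, 80])      # [B, T', aux_channels]
#     audio = generator.inference(mels)         # [B, T' * 256, 1]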
class TFParallelWaveGANDiscriminator(BaseModel):
"""Parallel WaveGAN Discriminator module."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
        assert (config.kernel_size - 1) % 2 == 0, "Even-numbered kernel sizes are not supported."
assert config.dilation_factor > 0, "Dilation factor must be > 0."
self.conv_layers = []
for i in range(config.n_layers - 1):
if i == 0:
dilation_rate = 1
else:
dilation_rate = (
i if config.dilation_factor == 1 else config.dilation_factor ** i
)
self.conv_layers += [
TFConv1d(
filters=config.conv_channels,
kernel_size=config.kernel_size,
padding="same",
dilation_rate=dilation_rate,
use_bias=config.use_bias,
initializer_seed=config.initializer_seed,
)
]
self.conv_layers += [
getattr(tf.keras.layers, config.nonlinear_activation)(
**config.nonlinear_activation_params
)
]
self.conv_layers += [
TFConv1d(
filters=config.out_channels,
kernel_size=config.kernel_size,
padding="same",
use_bias=config.use_bias,
initializer_seed=config.initializer_seed,
)
]
if config.apply_sigmoid_at_last:
self.conv_layers += [
tf.keras.layers.Activation("sigmoid"),
]
def _build(self):
x = tf.random.uniform(shape=[2, 16000, 1])
self(x)
def call(self, x):
"""Calculate forward propagation.
Args:
x (Tensor): Input noise signal (B, T, 1).
Returns:
Tensor: Output tensor (B, T, 1)
"""
for f in self.conv_layers:
x = f(x)
return x
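# Illustrative usage (shapes are assumptions): the discriminator maps a
# waveform to a same-length sequence of per-sample scores.
#
#     discriminator = TFParallelWaveGANDiscriminator(config)
#     discriminator._build()
#     scores = discriminator(tf.random.uniform([2, 16000, 1]))  # [2, 16000, 1]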