|
import logging
import math

import torch
import torch.nn as nn
from torch.nn import functional as F

from modules.commons.espnet_positional_embedding import RelPositionalEncoding
from modules.commons.common_layers import SinusoidalPositionalEmbedding, Linear, EncSALayer, DecSALayer, BatchNorm1dTBC
from utils.hparams import hparams

DEFAULT_MAX_SOURCE_POSITIONS = 2000
DEFAULT_MAX_TARGET_POSITIONS = 2000
|
|
|
|
|
class TransformerEncoderLayer(nn.Module):
    def __init__(self, hidden_size, dropout, kernel_size=None, num_heads=2, norm='ln'):
        super().__init__()
        self.hidden_size = hidden_size
        self.dropout = dropout
        self.num_heads = num_heads
        self.op = EncSALayer(
            hidden_size, num_heads, dropout=dropout,
            attention_dropout=0.0, relu_dropout=dropout,
            kernel_size=kernel_size
            if kernel_size is not None else hparams['enc_ffn_kernel_size'],
            padding=hparams['ffn_padding'],
            norm=norm, act=hparams['ffn_act'])

    def forward(self, x, **kwargs):
        return self.op(x, **kwargs)
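

# A minimal usage sketch (hedged): shapes only. The hparams defaults set below
# ('SAME', 'gelu') are placeholders for this sketch, not values from a real
# project config; it also assumes `hparams` behaves as a plain dict.
def _example_transformer_encoder_layer():
    hparams.setdefault('ffn_padding', 'SAME')  # assumed key/value, sketch only
    hparams.setdefault('ffn_act', 'gelu')      # assumed key/value, sketch only
    layer = TransformerEncoderLayer(hidden_size=256, dropout=0.1, kernel_size=9)
    x = torch.randn(50, 4, 256)                # [T, B, C]: time-first, as in FFTBlocks below
    padding_mask = torch.zeros(4, 50).bool()   # [B, T]: True marks padded positions
    return layer(x, encoder_padding_mask=padding_mask)  # -> [T, B, C]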
|
|
|
|
|
|
|
|
|
|
|
class LayerNorm(torch.nn.LayerNorm):
    """Layer normalization module.
    :param int nout: output dim size
    :param int dim: dimension to be normalized
    """

    def __init__(self, nout, dim=-1, eps=1e-5):
        """Construct a LayerNorm object."""
        super(LayerNorm, self).__init__(nout, eps=eps)
        self.dim = dim

    def forward(self, x):
        """Apply layer normalization.
        :param torch.Tensor x: input tensor
        :return: layer normalized tensor
        :rtype torch.Tensor
        """
        if self.dim == -1:
            return super(LayerNorm, self).forward(x)
        return super(LayerNorm, self).forward(x.transpose(1, -1)).transpose(1, -1)
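

# A short sketch of why `dim` matters: with dim=1 this LayerNorm normalizes
# channel-first [B, C, T] tensors, as used inside the conv stacks below.
def _example_layer_norm():
    ln = LayerNorm(8, dim=1)
    x = torch.randn(2, 8, 50)  # [B, C, T] conv activations
    y = ln(x)                  # normalized over the channel dim, shape preserved
    return y.shape             # -> torch.Size([2, 8, 50])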
|
|
|
|
|
class DurationPredictor(torch.nn.Module):
    """Duration predictor module.
    This is a module of the duration predictor described in `FastSpeech: Fast, Robust and Controllable Text to Speech`_.
    The duration predictor predicts the duration of each input token in the log domain from the hidden embeddings of the encoder.
    .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
        https://arxiv.org/pdf/1905.09263.pdf
    Note:
        The calculation domain of the outputs differs between `forward` and `inference`: `forward`
        returns values in the log domain, while `inference` returns durations in the linear domain.
    """

    def __init__(self, idim, odims=1, n_layers=2, n_chans=384, kernel_size=3, dropout_rate=0.1, offset=1.0, padding='SAME'):
        """Initialize duration predictor module.
        Args:
            idim (int): Input dimension.
            odims (int, optional): Output dimension.
            n_layers (int, optional): Number of convolutional layers.
            n_chans (int, optional): Number of channels of convolutional layers.
            kernel_size (int, optional): Kernel size of convolutional layers.
            dropout_rate (float, optional): Dropout rate.
            offset (float, optional): Offset value to avoid nan in log domain.
        """
        super(DurationPredictor, self).__init__()
        self.offset = offset
        self.conv = torch.nn.ModuleList()
        self.kernel_size = kernel_size
        self.padding = padding
        for idx in range(n_layers):
            in_chans = idim if idx == 0 else n_chans
            self.conv += [torch.nn.Sequential(
                torch.nn.ConstantPad1d(((kernel_size - 1) // 2, (kernel_size - 1) // 2)
                                       if padding == 'SAME'
                                       else (kernel_size - 1, 0), 0),
                torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=0),
                torch.nn.ReLU(),
                LayerNorm(n_chans, dim=1),
                torch.nn.Dropout(dropout_rate)
            )]
        self.linear = torch.nn.Linear(n_chans, odims)

    def _forward(self, xs, x_masks=None, is_inference=False):
        xs = xs.transpose(1, -1)  # (B, idim, Tmax)
        for f in self.conv:
            xs = f(xs)  # (B, C, Tmax)
            if x_masks is not None:
                xs = xs * (1 - x_masks.float())[:, None, :]

        xs = self.linear(xs.transpose(1, -1))  # (B, Tmax, odims)
        if x_masks is not None:  # guard: unmasked calls would otherwise crash here
            xs = xs * (1 - x_masks.float())[:, :, None]
        if is_inference:
            return self.out2dur(xs), xs
        else:
            if hparams['dur_loss'] in ['mse']:
                xs = xs.squeeze(-1)  # (B, Tmax)
        return xs

    def out2dur(self, xs):
        if hparams['dur_loss'] in ['mse']:
            xs = xs.squeeze(-1)  # (B, Tmax)
            dur = torch.clamp(torch.round(xs.exp() - self.offset), min=0).long()
        elif hparams['dur_loss'] == 'mog':
            raise NotImplementedError
        elif hparams['dur_loss'] == 'crf':
            # Note: this branch expects a subclass to define `self.crf`.
            dur = torch.LongTensor(self.crf.decode(xs)).to(xs.device)
        return dur

    def forward(self, xs, x_masks=None):
        """Calculate forward propagation.
        Args:
            xs (Tensor): Batch of input sequences (B, Tmax, idim).
            x_masks (ByteTensor, optional): Batch of masks indicating padded part (B, Tmax).
        Returns:
            Tensor: Batch of predicted durations in log domain (B, Tmax).
        """
        return self._forward(xs, x_masks, False)

    def inference(self, xs, x_masks=None):
        """Inference duration.
        Args:
            xs (Tensor): Batch of input sequences (B, Tmax, idim).
            x_masks (ByteTensor, optional): Batch of masks indicating padded part (B, Tmax).
        Returns:
            LongTensor: Batch of predicted durations in linear domain (B, Tmax).
        """
        return self._forward(xs, x_masks, True)
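

# A hedged usage sketch contrasting the two calculation domains. It assumes
# hparams['dur_loss'] == 'mse'; the setdefault is for this sketch only.
def _example_duration_predictor():
    hparams.setdefault('dur_loss', 'mse')    # assumed value, sketch only
    dp = DurationPredictor(idim=256, n_chans=128)
    xs = torch.randn(4, 50, 256)             # (B, Tmax, idim) encoder outputs
    x_masks = torch.zeros(4, 50).bool()      # True marks padded positions
    log_dur = dp(xs, x_masks)                # training: log-domain, (B, Tmax)
    dur, _ = dp.inference(xs, x_masks)       # inference: rounded linear-domain LongTensor
    return log_dur.shape, dur.shape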
|
|
|
class SyntaDurationPredictor(torch.nn.Module):
    def __init__(self, idim, n_layers=2, n_chans=384, kernel_size=3, dropout_rate=0.1, offset=1.0):
        super(SyntaDurationPredictor, self).__init__()
        from modules.syntaspeech.syntactic_graph_encoder import GraphAuxEnc
        self.graph_encoder = GraphAuxEnc(in_dim=idim, hid_dim=idim, out_dim=idim)
        self.offset = offset
        self.conv = torch.nn.ModuleList()
        self.kernel_size = kernel_size
        for idx in range(n_layers):
            in_chans = idim if idx == 0 else n_chans
            self.conv += [torch.nn.Sequential(
                torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2),
                torch.nn.ReLU(),
                LayerNorm(n_chans, dim=1),
                torch.nn.Dropout(dropout_rate)
            )]
        self.linear = nn.Sequential(torch.nn.Linear(n_chans, 1), nn.Softplus())

    def forward(self, x, x_padding=None, ph2word=None, graph_lst=None, etypes_lst=None):
        x = x.transpose(1, -1)  # (B, idim, Tmax)
        assert ph2word is not None and graph_lst is not None and etypes_lst is not None
        x_graph = self.graph_encoder(graph_lst, x, ph2word, etypes_lst)
        x = x + x_graph * 1.

        for f in self.conv:
            x = f(x)  # (B, C, Tmax)
            if x_padding is not None:
                x = x * (1 - x_padding.float())[:, None, :]

        x = self.linear(x.transpose(1, -1))  # (B, Tmax, 1)
        if x_padding is not None:  # guard: unmasked calls would otherwise crash here
            x = x * (1 - x_padding.float())[:, :, None]
        x = x[..., 0]  # (B, Tmax)
        return x
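

# A hedged sketch that stubs out the graph branch (building real syntactic
# graphs for GraphAuxEnc is out of scope here), just to show the tensor
# contract: the graph encoder must return features in x's (B, idim, T) layout.
# Assumes modules.syntaspeech is importable; ph2word values are placeholders.
def _example_synta_duration_predictor():
    class _ZeroGraphEnc(nn.Module):
        def forward(self, graph_lst, x, ph2word, etypes_lst):
            return torch.zeros_like(x)  # stub: contributes nothing

    sdp = SyntaDurationPredictor(idim=256, n_chans=128)
    sdp.graph_encoder = _ZeroGraphEnc()
    x = torch.randn(4, 50, 256)                       # (B, T, idim)
    ph2word = torch.arange(1, 51)[None].repeat(4, 1)  # placeholder ph->word indices
    dur = sdp(x, ph2word=ph2word, graph_lst=[None], etypes_lst=[None])
    return dur.shape                                  # -> (4, 50), nonnegative via Softplus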
|
|
|
class LengthRegulator(torch.nn.Module):
    def __init__(self, pad_value=0.0):
        super(LengthRegulator, self).__init__()
        self.pad_value = pad_value

    def forward(self, dur, dur_padding=None, alpha=1.0):
        """
        Example (no batch dim version):
            1. dur = [2,2,3]
            2. token_idx = [[1],[2],[3]], dur_cumsum = [2,4,7], dur_cumsum_prev = [0,2,4]
            3. token_mask = [[1,1,0,0,0,0,0],
                             [0,0,1,1,0,0,0],
                             [0,0,0,0,1,1,1]]
            4. token_idx * token_mask = [[1,1,0,0,0,0,0],
                                         [0,0,2,2,0,0,0],
                                         [0,0,0,0,3,3,3]]
            5. (token_idx * token_mask).sum(0) = [1,1,2,2,3,3,3]

        :param dur: Batch of durations of each token (B, T_txt)
        :param dur_padding: Batch of padding of each token (B, T_txt)
        :param alpha: duration rescale coefficient
        :return:
            mel2ph (B, T_speech)
        """
        assert alpha > 0
        dur = torch.round(dur.float() * alpha).long()
        if dur_padding is not None:
            dur = dur * (1 - dur_padding.long())
        token_idx = torch.arange(1, dur.shape[1] + 1)[None, :, None].to(dur.device)
        dur_cumsum = torch.cumsum(dur, 1)
        dur_cumsum_prev = F.pad(dur_cumsum, [1, -1], mode='constant', value=0)

        pos_idx = torch.arange(dur.sum(-1).max())[None, None].to(dur.device)
        token_mask = (pos_idx >= dur_cumsum_prev[:, :, None]) & (pos_idx < dur_cumsum[:, :, None])
        mel2ph = (token_idx * token_mask.long()).sum(1)
        return mel2ph
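

# A runnable version of the worked example in the docstring above:
# durations [2, 2, 3] expand to mel2ph [1, 1, 2, 2, 3, 3, 3] (1-based token ids).
def _example_length_regulator():
    lr = LengthRegulator()
    dur = torch.LongTensor([[2, 2, 3]])  # (B=1, T_txt=3)
    mel2ph = lr(dur)                     # -> tensor([[1, 1, 2, 2, 3, 3, 3]])
    return mel2ph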
|
|
|
|
|
class PitchPredictor(torch.nn.Module):
    def __init__(self, idim, n_layers=5, n_chans=384, odim=2, kernel_size=5,
                 dropout_rate=0.1, padding='SAME'):
        """Initialize pitch predictor module.
        Args:
            idim (int): Input dimension.
            n_layers (int, optional): Number of convolutional layers.
            n_chans (int, optional): Number of channels of convolutional layers.
            odim (int, optional): Output dimension.
            kernel_size (int, optional): Kernel size of convolutional layers.
            dropout_rate (float, optional): Dropout rate.
        """
        super(PitchPredictor, self).__init__()
        self.conv = torch.nn.ModuleList()
        self.kernel_size = kernel_size
        self.padding = padding
        for idx in range(n_layers):
            in_chans = idim if idx == 0 else n_chans
            self.conv += [torch.nn.Sequential(
                torch.nn.ConstantPad1d(((kernel_size - 1) // 2, (kernel_size - 1) // 2)
                                       if padding == 'SAME'
                                       else (kernel_size - 1, 0), 0),
                torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=0),
                torch.nn.ReLU(),
                LayerNorm(n_chans, dim=1),
                torch.nn.Dropout(dropout_rate)
            )]
        self.linear = torch.nn.Linear(n_chans, odim)
        self.embed_positions = SinusoidalPositionalEmbedding(idim, 0, init_size=4096)
        self.pos_embed_alpha = nn.Parameter(torch.Tensor([1]))

    def forward(self, xs):
        """
        :param xs: [B, T, H]
        :return: [B, T, odim]
        """
        positions = self.pos_embed_alpha * self.embed_positions(xs[..., 0])
        xs = xs + positions
        xs = xs.transpose(1, -1)  # (B, idim, Tmax)
        for f in self.conv:
            xs = f(xs)  # (B, C, Tmax)

        xs = self.linear(xs.transpose(1, -1))  # (B, Tmax, odim)
        return xs
|
|
|
|
|
class EnergyPredictor(PitchPredictor):
    pass
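

# A hedged shape sketch for PitchPredictor/EnergyPredictor. The reading of
# odim=2 as (pitch value, UV flag) is an assumption, not fixed by this file.
def _example_pitch_predictor():
    pp = PitchPredictor(idim=256, n_layers=2, n_chans=128, odim=2)
    xs = torch.randn(4, 50, 256)  # [B, T, H] frame-level features
    out = pp(xs)                  # -> [B, T, odim] == (4, 50, 2)
    return out.shape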
|
|
|
|
|
def mel2ph_to_dur(mel2ph, T_txt, max_dur=None):
    """Invert a mel2ph alignment (B, T_speech) back to per-token durations (B, T_txt)."""
    B, _ = mel2ph.shape
    dur = mel2ph.new_zeros(B, T_txt + 1).scatter_add(1, mel2ph, torch.ones_like(mel2ph))
    dur = dur[:, 1:]  # drop index 0, which collects padded frames
    if max_dur is not None:
        dur = dur.clamp(max=max_dur)
    return dur
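

# A round-trip sketch: mel2ph_to_dur inverts LengthRegulator for non-padded
# inputs, recovering the original durations.
def _example_mel2ph_to_dur():
    dur = torch.LongTensor([[2, 2, 3]])
    mel2ph = LengthRegulator()(dur)             # [[1, 1, 2, 2, 3, 3, 3]]
    recovered = mel2ph_to_dur(mel2ph, T_txt=3)  # -> tensor([[2, 2, 3]])
    return recovered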
|
|
|
|
|
class FFTBlocks(nn.Module):
    def __init__(self, hidden_size, num_layers, ffn_kernel_size=9, dropout=None, num_heads=2,
                 use_pos_embed=True, use_last_norm=True, norm='ln', use_pos_embed_alpha=True):
        super().__init__()
        self.num_layers = num_layers
        embed_dim = self.hidden_size = hidden_size
        self.dropout = dropout if dropout is not None else hparams['dropout']
        self.use_pos_embed = use_pos_embed
        self.use_last_norm = use_last_norm
        if use_pos_embed:
            self.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS
            self.padding_idx = 0
            self.pos_embed_alpha = nn.Parameter(torch.Tensor([1])) if use_pos_embed_alpha else 1
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim, self.padding_idx, init_size=DEFAULT_MAX_TARGET_POSITIONS,
            )

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerEncoderLayer(self.hidden_size, self.dropout,
                                    kernel_size=ffn_kernel_size, num_heads=num_heads)
            for _ in range(self.num_layers)
        ])
        if self.use_last_norm:
            if norm == 'ln':
                self.layer_norm = nn.LayerNorm(embed_dim)
            elif norm == 'bn':
                self.layer_norm = BatchNorm1dTBC(embed_dim)
        else:
            self.layer_norm = None

    def forward(self, x, padding_mask=None, attn_mask=None, return_hiddens=False):
        """
        :param x: [B, T, C]
        :param padding_mask: [B, T]
        :return: [B, T, C] or [L, B, T, C]
        """
        padding_mask = x.abs().sum(-1).eq(0).data if padding_mask is None else padding_mask
        nonpadding_mask_TB = 1 - padding_mask.transpose(0, 1).float()[:, :, None]  # [T, B, 1]
        if self.use_pos_embed:
            positions = self.pos_embed_alpha * self.embed_positions(x[..., 0])
            x = x + positions
            x = F.dropout(x, p=self.dropout, training=self.training)

        # [B, T, C] -> [T, B, C]
        x = x.transpose(0, 1) * nonpadding_mask_TB
        hiddens = []
        for layer in self.layers:
            x = layer(x, encoder_padding_mask=padding_mask, attn_mask=attn_mask) * nonpadding_mask_TB
            hiddens.append(x)
        if self.use_last_norm:
            x = self.layer_norm(x) * nonpadding_mask_TB
        if return_hiddens:
            x = torch.stack(hiddens, 0)  # [L, T, B, C]
            x = x.transpose(1, 2)  # [L, B, T, C]
        else:
            x = x.transpose(0, 1)  # [B, T, C]
        return x
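

# A hedged end-to-end shape sketch for FFTBlocks. The hparams defaults are
# placeholders needed only because TransformerEncoderLayer reads them.
def _example_fft_blocks():
    hparams.setdefault('ffn_padding', 'SAME')  # assumed, sketch only
    hparams.setdefault('ffn_act', 'gelu')      # assumed, sketch only
    blocks = FFTBlocks(hidden_size=256, num_layers=4, dropout=0.1)
    x = torch.randn(4, 50, 256)                # [B, T, C]
    out = blocks(x)                            # -> [B, T, C]
    stacked = blocks(x, return_hiddens=True)   # -> [L, B, T, C] == (4, 4, 50, 256)
    return out.shape, stacked.shape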
|
|
|
|
|
class FastspeechEncoder(FFTBlocks):
    def __init__(self, embed_tokens, hidden_size=None, num_layers=None, kernel_size=None, num_heads=2):
        hidden_size = hparams['hidden_size'] if hidden_size is None else hidden_size
        kernel_size = hparams['enc_ffn_kernel_size'] if kernel_size is None else kernel_size
        num_layers = hparams['enc_layers'] if num_layers is None else num_layers  # encoder depth, not 'dec_layers'
        super().__init__(hidden_size, num_layers, kernel_size, num_heads=num_heads,
                         use_pos_embed=False)  # positions are added in forward_embedding instead
        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(hidden_size)
        self.padding_idx = 0
        if hparams.get('rel_pos'):
            self.embed_positions = RelPositionalEncoding(hidden_size, dropout_rate=0.0)
        else:
            self.embed_positions = SinusoidalPositionalEmbedding(
                hidden_size, self.padding_idx, init_size=DEFAULT_MAX_TARGET_POSITIONS,
            )

    def forward(self, txt_tokens):
        """
        :param txt_tokens: [B, T]
        :return: [B, T, C]
        """
        encoder_padding_mask = txt_tokens.eq(self.padding_idx).data
        x = self.forward_embedding(txt_tokens)  # [B, T, H]
        x = super(FastspeechEncoder, self).forward(x, encoder_padding_mask)
        return x

    def forward_embedding(self, txt_tokens):
        # embed tokens and positions
        x = self.embed_scale * self.embed_tokens(txt_tokens)
        if hparams['use_pos_embed']:
            if hparams.get('rel_pos'):
                x = self.embed_positions(x)
            else:
                positions = self.embed_positions(txt_tokens)
                x = x + positions
        x = F.dropout(x, p=self.dropout, training=self.training)
        return x
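

# A hedged construction sketch. The vocabulary size and all hparams values here
# are placeholders; real configs define hidden_size, enc_layers, dropout, etc.
def _example_fastspeech_encoder():
    for k, v in dict(dropout=0.1, ffn_padding='SAME', ffn_act='gelu',
                     use_pos_embed=True, rel_pos=False).items():
        hparams.setdefault(k, v)  # assumed values, sketch only
    embed_tokens = nn.Embedding(100, 256, padding_idx=0)
    enc = FastspeechEncoder(embed_tokens, hidden_size=256, num_layers=4, kernel_size=9)
    txt_tokens = torch.randint(1, 100, (4, 50))  # [B, T], id 0 is padding
    return enc(txt_tokens).shape                 # -> [B, T, C] == (4, 50, 256)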
|
|
|
|
|
class FastspeechDecoder(FFTBlocks):
    def __init__(self, hidden_size=None, num_layers=None, kernel_size=None, num_heads=None):
        num_heads = hparams['num_heads'] if num_heads is None else num_heads
        hidden_size = hparams['hidden_size'] if hidden_size is None else hidden_size
        kernel_size = hparams['dec_ffn_kernel_size'] if kernel_size is None else kernel_size
        num_layers = hparams['dec_layers'] if num_layers is None else num_layers
        super().__init__(hidden_size, num_layers, kernel_size, num_heads=num_heads)
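

# A hedged sketch: the decoder is FFTBlocks plus config lookups. Passing
# explicit args avoids most hparams reads; the setdefault values below are
# placeholders for the sketch only.
def _example_fastspeech_decoder():
    hparams.setdefault('dropout', 0.1)
    hparams.setdefault('ffn_padding', 'SAME')
    hparams.setdefault('ffn_act', 'gelu')
    dec = FastspeechDecoder(hidden_size=256, num_layers=4, kernel_size=9, num_heads=2)
    x = torch.randn(4, 200, 256)  # [B, T_mel, C] length-regulated features
    return dec(x).shape           # -> (4, 200, 256)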
|
|
|
|