Spaces:
Running
Running
# Copyright (c) 2023 Amphion. | |
# | |
# This source code is licensed under the MIT license found in the | |
# LICENSE file in the root directory of this source tree. | |
# This code is modified from https://github.com/sh-lee-prml/HierSpeechpp/blob/main/ttv_v1/styleencoder.py | |
from . import attentions | |
from torch import nn | |
import torch | |
from torch.nn import functional as F | |
class Mish(nn.Module): | |
def __init__(self): | |
super(Mish, self).__init__() | |
def forward(self, x): | |
return x * torch.tanh(F.softplus(x)) | |
class Conv1dGLU(nn.Module): | |
""" | |
Conv1d + GLU(Gated Linear Unit) with residual connection. | |
For GLU refer to https://arxiv.org/abs/1612.08083 paper. | |
""" | |
def __init__(self, in_channels, out_channels, kernel_size, dropout): | |
super(Conv1dGLU, self).__init__() | |
self.out_channels = out_channels | |
self.conv1 = nn.Conv1d( | |
in_channels, 2 * out_channels, kernel_size=kernel_size, padding=2 | |
) | |
self.dropout = nn.Dropout(dropout) | |
def forward(self, x): | |
residual = x | |
x = self.conv1(x) | |
x1, x2 = torch.split(x, split_size_or_sections=self.out_channels, dim=1) | |
x = x1 * torch.sigmoid(x2) | |
x = residual + self.dropout(x) | |
return x | |
class StyleEncoder(torch.nn.Module): | |
def __init__(self, in_dim=513, hidden_dim=128, out_dim=256): | |
super().__init__() | |
self.in_dim = in_dim # Linear 513 wav2vec 2.0 1024 | |
self.hidden_dim = hidden_dim | |
self.out_dim = out_dim | |
self.kernel_size = 5 | |
self.n_head = 2 | |
self.dropout = 0.1 | |
self.spectral = nn.Sequential( | |
nn.Conv1d(self.in_dim, self.hidden_dim, 1), | |
Mish(), | |
nn.Dropout(self.dropout), | |
nn.Conv1d(self.hidden_dim, self.hidden_dim, 1), | |
Mish(), | |
nn.Dropout(self.dropout), | |
) | |
self.temporal = nn.Sequential( | |
Conv1dGLU(self.hidden_dim, self.hidden_dim, self.kernel_size, self.dropout), | |
Conv1dGLU(self.hidden_dim, self.hidden_dim, self.kernel_size, self.dropout), | |
) | |
self.slf_attn = attentions.MultiHeadAttention( | |
self.hidden_dim, | |
self.hidden_dim, | |
self.n_head, | |
p_dropout=self.dropout, | |
proximal_bias=False, | |
proximal_init=True, | |
) | |
self.atten_drop = nn.Dropout(self.dropout) | |
self.fc = nn.Conv1d(self.hidden_dim, self.out_dim, 1) | |
def forward(self, x, mask=None): | |
# spectral | |
x = self.spectral(x) * mask | |
# temporal | |
x = self.temporal(x) * mask | |
# self-attention | |
attn_mask = mask.unsqueeze(2) * mask.unsqueeze(-1) | |
y = self.slf_attn(x, x, attn_mask=attn_mask) | |
x = x + self.atten_drop(y) | |
# fc | |
x = self.fc(x) | |
# temoral average pooling | |
w = self.temporal_avg_pool(x, mask=mask) | |
return w | |
def temporal_avg_pool(self, x, mask=None): | |
if mask is None: | |
out = torch.mean(x, dim=2) | |
else: | |
len_ = mask.sum(dim=2) | |
x = x.sum(dim=2) | |
out = torch.div(x, len_) | |
return out | |