Spaces:
Running
Running
File size: 3,175 Bytes
c968fc3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# This code is modified from https://github.com/sh-lee-prml/HierSpeechpp/blob/main/ttv_v1/styleencoder.py
from . import attentions
from torch import nn
import torch
from torch.nn import functional as F
class Mish(nn.Module):
    """Mish activation: ``x * tanh(softplus(x))``, applied element-wise."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return ``x`` scaled by ``tanh(softplus(x))``."""
        gate = torch.tanh(F.softplus(x))
        return x * gate
class Conv1dGLU(nn.Module):
    """
    Conv1d + GLU(Gated Linear Unit) with residual connection.
    For GLU refer to https://arxiv.org/abs/1612.08083 paper.

    The convolution produces ``2 * out_channels`` channels that are split
    into a value half and a gate half; ``value * sigmoid(gate)`` goes
    through dropout and is added back onto the input.

    Args:
        in_channels: Channels of the input. The residual addition needs
            this to be broadcast-compatible with ``out_channels``.
        out_channels: Channels of the gated output.
        kernel_size: Width of the convolution kernel. It must be odd for
            the output length to equal the input length (required by the
            residual addition).
        dropout: Dropout probability applied to the gated output.
    """

    def __init__(self, in_channels, out_channels, kernel_size, dropout):
        super(Conv1dGLU, self).__init__()
        self.out_channels = out_channels
        # nn.Conv1d also accepts a 1-tuple kernel size; read the width
        # from either form.
        if isinstance(kernel_size, (tuple, list)):
            width = kernel_size[0]
        else:
            width = kernel_size
        # "Same" padding for odd widths. The padding was previously fixed
        # at 2, which only preserved the length for kernel_size == 5 and
        # made the residual addition fail for every other width.
        self.conv1 = nn.Conv1d(
            in_channels,
            2 * out_channels,
            kernel_size=kernel_size,
            padding=(width - 1) // 2,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the gated convolution and add the result to ``x``.

        Args:
            x: Channels-first input for ``nn.Conv1d``.

        Returns:
            ``x + dropout(value * sigmoid(gate))``.
        """
        residual = x
        x = self.conv1(x)
        # First half of the channels is the value, second half the gate.
        x1, x2 = torch.split(x, split_size_or_sections=self.out_channels, dim=1)
        x = x1 * torch.sigmoid(x2)
        x = residual + self.dropout(x)
        return x
class StyleEncoder(torch.nn.Module):
    """Encode a frame-level feature sequence into one style vector.

    The input goes through pointwise ("spectral") convolutions, two
    residual gated ("temporal") convolutions, a self-attention layer with
    a residual connection, a pointwise projection to ``out_dim`` and a
    final average over the time axis.

    Args:
        in_dim: Number of input channels.
        hidden_dim: Channel width of the intermediate layers.
        out_dim: Number of channels of the returned style vector.
    """

    def __init__(self, in_dim=513, hidden_dim=128, out_dim=256):
        super().__init__()
        self.in_dim = in_dim  # original note: Linear 513, wav2vec 2.0 1024
        self.hidden_dim = hidden_dim
        self.out_dim = out_dim
        self.kernel_size = 5
        self.n_head = 2
        self.dropout = 0.1

        # Kernel-size-1 convolutions: each frame is transformed on its own.
        self.spectral = nn.Sequential(
            nn.Conv1d(self.in_dim, self.hidden_dim, 1),
            Mish(),
            nn.Dropout(self.dropout),
            nn.Conv1d(self.hidden_dim, self.hidden_dim, 1),
            Mish(),
            nn.Dropout(self.dropout),
        )
        # Gated convolutions that mix neighbouring frames.
        self.temporal = nn.Sequential(
            Conv1dGLU(self.hidden_dim, self.hidden_dim, self.kernel_size, self.dropout),
            Conv1dGLU(self.hidden_dim, self.hidden_dim, self.kernel_size, self.dropout),
        )
        self.slf_attn = attentions.MultiHeadAttention(
            self.hidden_dim,
            self.hidden_dim,
            self.n_head,
            p_dropout=self.dropout,
            proximal_bias=False,
            proximal_init=True,
        )
        self.atten_drop = nn.Dropout(self.dropout)
        self.fc = nn.Conv1d(self.hidden_dim, self.out_dim, 1)

    def forward(self, x, mask=None):
        """Compute the style vector for ``x``.

        Args:
            x: Channels-first input for ``nn.Conv1d``, i.e.
                ``(batch, in_dim, frames)``.
            mask: Optional frame mask multiplied onto the features;
                presumably ``(batch, 1, frames)`` with 1 for valid frames
                and 0 for padding -- TODO confirm against callers. When
                omitted, every frame is treated as valid.

        Returns:
            The projected features averaged over the time axis.
        """
        # The signature allows mask=None, but the masked stages below need
        # a tensor; an all-ones mask leaves the features unchanged.
        if mask is None:
            frame_mask = x.new_ones(x.size(0), 1, x.size(2))
        else:
            frame_mask = mask

        # spectral
        x = self.spectral(x) * frame_mask
        # temporal
        x = self.temporal(x) * frame_mask
        # self-attention: outer product of the frame mask with itself
        attn_mask = frame_mask.unsqueeze(2) * frame_mask.unsqueeze(-1)
        y = self.slf_attn(x, x, attn_mask=attn_mask)
        x = x + self.atten_drop(y)
        # fc
        x = self.fc(x)
        # temporal average pooling (plain mean when no mask was given)
        w = self.temporal_avg_pool(x, mask=mask)
        return w

    def temporal_avg_pool(self, x, mask=None):
        """Average ``x`` over dim 2.

        Args:
            x: Tensor whose dim 2 is the time axis.
            mask: Optional frame mask. When given, the sum of ``x`` over
                dim 2 is divided by ``mask.sum(dim=2)`` instead of by the
                full length.

        Returns:
            ``x`` with dim 2 reduced.
        """
        if mask is None:
            return torch.mean(x, dim=2)
        # NOTE(review): x is summed over every frame without applying the
        # mask first, so padded frames still contribute to the numerator,
        # and an all-zero mask row divides by zero. Kept as-is to preserve
        # existing numerical behaviour.
        valid_frames = mask.sum(dim=2)
        return torch.div(x.sum(dim=2), valid_frames)
|