dkounadis
/

artificial-styletts2

@@ -1,26 +1,29 @@
 import torch
 import torch.nn.functional as F
 import torch.nn as nn
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
-from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 import math
 import numpy as np
 LRELU_SLOPE = 0.1
 def get_padding(kernel_size, dilation=1):
     """Return int((kernel_size * dilation - dilation) / 2).

     The result is passed as the ``padding`` argument of the dilated
     convolutions built in this file.
     """
     effective_span = kernel_size * dilation - dilation
     return int(effective_span / 2)
-def _tile(x,
           length=None):
     # Repeat `x` along dim 2 and crop so the last dimension has exactly
     # `length` entries. `x` is addressed with three axes below, so it has
     # to be a 3-D tensor.
     # For positive `length` the repeat count int(length / x.shape[2]) + 1
     # always produces more than `length` entries before the crop.
     # NOTE(review): `length` defaults to None, but None is not handled -
     # the division below would raise, so callers must always pass it.
     x = x.repeat(1, 1, int(length / x.shape[2]) + 1)[:, :, :length]
     return x
 class AdaIN1d(nn.Module):
     # used by HiFiGan & ProsodyPredictor
     def __init__(self, style_dim, num_features):
         super().__init__()
         self.norm = nn.InstanceNorm1d(num_features, affine=False)
@@ -30,20 +33,15 @@ class AdaIN1d(nn.Module):
         # x = torch.Size([1, 512, 248])     same as output
         # s = torch.Size([1, 7, 1, 128])
         s = self.fc(s.transpose(1, 2)).transpose(1, 2)
         s = _tile(s, length=x.shape[2])
         gamma, beta = torch.chunk(s, chunks=2, dim=1)
         return (1+gamma) * self.norm(x) + beta
 class AdaINResBlock1(torch.nn.Module):
     def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), style_dim=64):
         super(AdaINResBlock1, self).__init__()
@@ -66,29 +64,30 @@ class AdaINResBlock1(torch.nn.Module):
                                padding=get_padding(kernel_size, 1)))
         ])
         # self.convs2.apply(init_weights)
         self.adain1 = nn.ModuleList([
             AdaIN1d(style_dim, channels),
             AdaIN1d(style_dim, channels),
             AdaIN1d(style_dim, channels),
         ])
         self.adain2 = nn.ModuleList([
             AdaIN1d(style_dim, channels),
             AdaIN1d(style_dim, channels),
             AdaIN1d(style_dim, channels),
         ])
-        self.alpha1 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs1))])
-        self.alpha2 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs2))])
     def forward(self, x, s):
         for c1, c2, n1, n2, a1, a2 in zip(self.convs1, self.convs2, self.adain1, self.adain2, self.alpha1, self.alpha2):
             xt = n1(x, s)  # THIS IS ADAIN - EXPECTS conv1d dims
             xt = xt + (1 / a1) * (torch.sin(a1 * xt) ** 2)  # Snake1D
             xt = c1(xt)
-            xt = n2(xt, s) # THIS IS ADAIN - EXPECTS conv1d dims
             xt = xt + (1 / a2) * (torch.sin(a2 * xt) ** 2)  # Snake1D
             xt = c2(xt)
             x = xt + x
@@ -99,13 +98,14 @@ class AdaINResBlock1(torch.nn.Module):
             remove_weight_norm(l)
         for l in self.convs2:
             remove_weight_norm(l)
 class SineGen(torch.nn.Module):
     def __init__(self,
-                 samp_rate=24000,
-                 upsample_scale=300,
-                 harmonic_num=8, # HARDCODED due to nn.Linear() of SourceModuleHnNSF
                  voiced_threshold=10):
         super(SineGen, self).__init__()
@@ -116,60 +116,66 @@ class SineGen(torch.nn.Module):
     def _f02sine(self, f0_values):
         # Turn per-sample frequency values into sine waveforms:
         #   1. normalise by the sampling rate and keep the fractional part,
         #   2. downsample by 1/upsample_scale with linear interpolation,
         #   3. accumulate along dim 1 to obtain a phase,
         #   4. upsample the phase back by upsample_scale and take sin().
         # self.sampling_rate / self.upsample_scale are set outside this method
         # (presumably from samp_rate / upsample_scale in __init__ - confirm).
         # --
-        # 134 HIFI
         # torch.Size([1, 145200, 9])
         # torch.Size([1, 145200, 9]) torch.Size([1, 145200, 9]) HIFi
-        rad_values = (f0_values / self.sampling_rate) % 1   # modulo of a negative value is non-negative: -21 % 10 = 9 because -3*10 + 9 = -21; NOTE f0_values may be signed
         # interpolate() works on the last axis, hence the transpose(1, 2) round trip
         rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
-                                                        scale_factor=1/self.upsample_scale,
-                                                        mode="linear").transpose(1, 2)
-        phase = torch.cumsum(rad_values, dim=1) * 1.84 * np.pi  # hand-tuned factor 1.84; 1.89 also sounds nice but has a woofer at punctuation
         # the phase is multiplied by upsample_scale before being stretched back
         phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
                                                 scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
         sines = torch.sin(phase)
         return sines
     def forward(self, f0):
         # Build the harmonic sine source from f0 and zero it where f0 is
         # not above self.voiced_threshold.
-        # f0 is already full length - [1, 142600, 1]
         # multiply f0 by the factors 1 .. harmonic_num + 1 held in a (1, 1, harmonic_num + 1) tensor
-        fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))  # [1, 145200, 9]
-        sine_waves = self._f02sine(fn) * .01 # fixed amplitude .01 (.007 also tried); very important effect, DEFAULT=0.1, very sensitive to speaker; check continuity between segments in audiobook
         # uv is a float mask: 1.0 where f0 > voiced_threshold, else 0.0
         uv = (f0 > self.voiced_threshold).type(torch.float32)
-        return sine_waves * uv #+ noise
 class SourceModuleHnNSF(torch.nn.Module):
     # Source module: generates sine waves with SineGen, merges the last axis
     # down to a single channel with a linear layer and applies tanh.
-    def __init__(self, harmonic_num=8):
         super(SourceModuleHnNSF, self).__init__()
         # SineGen() is built with its own defaults; `harmonic_num` given here
         # only sizes the linear layer below.
         # NOTE(review): the two must agree (SineGen's signature default is 8).
         self.l_sin_gen = SineGen()
-        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)  # harmonic=8 is hard fixed due to this nn.Linear()
         self.l_tanh = torch.nn.Tanh()
     def forward(self, x):
         # x is forwarded unchanged to SineGen (its f0 input)
         # print('   HNnSF', x.shape)  # why this is [1, 300, 1, 535800]
         sine_wavs = self.l_sin_gen(x)
-        sine_merge = self.l_tanh(self.l_linear(sine_wavs))  # This linear merges all 9 harmonics (weighted sum) into one channel
         return sine_merge
 class Generator(torch.nn.Module):
     def __init__(self,
                  style_dim,
-                 resblock_kernel_sizes,
-                 upsample_rates,
-                 upsample_initial_channel,
-                 resblock_dilation_sizes,
                  upsample_kernel_sizes):
         super(Generator, self).__init__()
         self.num_kernels = len(resblock_kernel_sizes)
@@ -182,48 +188,52 @@ class Generator(torch.nn.Module):
         for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
             c_cur = upsample_initial_channel // (2 ** (i + 1))
-            self.ups.append(weight_norm(ConvTranspose1d(upsample_initial_channel//(2**i),
-                         upsample_initial_channel//(2**(i+1)),
-                         k, u, padding=(u//2 + u%2), output_padding=u%2)))
-            if i + 1 < len(upsample_rates):  #
                 stride_f0 = np.prod(upsample_rates[i + 1:])
                 self.noise_convs.append(Conv1d(
                     1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2))
-                self.noise_res.append(AdaINResBlock1(c_cur, 7, [1,3,5], style_dim))
             else:
                 self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
-                self.noise_res.append(AdaINResBlock1(c_cur, 11, [1,3,5], style_dim))
         self.resblocks = nn.ModuleList()
         self.alphas = nn.ParameterList()
-        self.alphas.append(nn.Parameter(torch.ones(1, upsample_initial_channel, 1)))
         for i in range(len(self.ups)):
             ch = upsample_initial_channel//(2**(i+1))
             self.alphas.append(nn.Parameter(torch.ones(1, ch, 1)))
             for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
                 self.resblocks.append(AdaINResBlock1(ch, k, d, style_dim))
         self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
     def forward(self, x, s, f0):
         # x.shape=torch.Size([1, 512, 484]) s.shape=torch.Size([1, 1, 1, 128]) f0.shape=torch.Size([1, 484]) GENERAT 249
         f0 = self.f0_upsamp(f0).transpose(1, 2)
         # x.shape=torch.Size([1, 512, 484]) s.shape=torch.Size([1, 1, 1, 128]) f0.shape=torch.Size([1, 145200, 1]) GENERAT 253
-        har_source = self.m_source(f0)  # [1, 145400, 1] f0 enters already upsampled to full wav 24kHz length
         har_source = har_source.transpose(1, 2)
         for i in range(self.num_upsamples):
             x = x + (1 / self.alphas[i]) * (torch.sin(self.alphas[i] * x) ** 2)
             x_source = self.noise_convs[i](har_source)
             x_source = self.noise_res[i](x_source, s)
@@ -234,7 +244,7 @@ class Generator(torch.nn.Module):
             xs = None
             for j in range(self.num_kernels):
                 if xs is None:
                     xs = self.resblocks[i*self.num_kernels+j](x, s)
                 else:
@@ -255,11 +265,11 @@ class Generator(torch.nn.Module):
         remove_weight_norm(self.conv_pre)
         remove_weight_norm(self.conv_post)
 class AdainResBlk1d(nn.Module):
     # also used in ProsodyPredictor()
     def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
                  upsample='none', dropout_p=0.0):
         super().__init__()
@@ -267,20 +277,21 @@ class AdainResBlk1d(nn.Module):
         self.upsample_type = upsample
         self.upsample = UpSample1d(upsample)
         self.learned_sc = dim_in != dim_out
-        self._build_weights(dim_in, dim_out, style_dim)
         if upsample == 'none':
             self.pool = nn.Identity()
         else:
-            self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
     def _build_weights(self, dim_in, dim_out, style_dim):
         # Create the layers of the block: two weight-normalised kernel-3
         # convolutions (stride 1, padding 1) and one AdaIN1d per convolution
         # input width (dim_in for norm1, dim_out for norm2).
         self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
         self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
         self.norm1 = AdaIN1d(style_dim, dim_in)
         self.norm2 = AdaIN1d(style_dim, dim_out)
         # self.learned_sc must already be set by the caller; when true a
         # bias-free 1x1 convolution mapping dim_in -> dim_out is created
         # (presumably for the shortcut path - confirm in _shortcut).
         if self.learned_sc:
-            self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
     def _shortcut(self, x):
         x = self.upsample(x)
@@ -302,7 +313,8 @@ class AdainResBlk1d(nn.Module):
         out = self._residual(x, s)
         out = (out + self._shortcut(x)) / math.sqrt(2)
         return out
 class UpSample1d(nn.Module):
     def __init__(self, layer_type):
         super().__init__()
@@ -314,64 +326,62 @@ class UpSample1d(nn.Module):
         else:
             return F.interpolate(x, scale_factor=2, mode='nearest')
 class Decoder(nn.Module):
     # Decoder: merges `asr` features with strided F0 / N curves through a
     # stack of AdainResBlk1d blocks and hands the result to Generator.
     # NOTE(review): F0_channel and dim_out are accepted but not used in this
     # class; the list defaults are only forwarded to Generator, never mutated here.
-    def __init__(self, dim_in=512, F0_channel=512, style_dim=64, dim_out=80,
-                resblock_kernel_sizes = [3,7,11],
-                upsample_rates = [10,5,3,2],
-                upsample_initial_channel=512,
-                resblock_dilation_sizes=[[1,3,5], [1,3,5], [1,3,5]],
-                upsample_kernel_sizes=[20,10,6,4]):
         super().__init__()
         self.decode = nn.ModuleList()
         # dim_in + 2: asr channels plus one channel each for F0 and N
         self.encode = AdainResBlk1d(dim_in + 2, 1024, style_dim)
         # 1024 + 2 + 64: previous output + F0 + N + the 64-channel asr_res
         self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
         self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
         self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
         # upsample=True differs from 'none', so this is the block that ends
         # the re-concatenation in forward()
-        self.decode.append(AdainResBlk1d(1024 + 2 + 64, 512, style_dim, upsample=True))
-        self.F0_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))  # smooth
-        self.N_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
         # 512 input channels are hard-coded here, independent of dim_in
         self.asr_res = nn.Sequential(
             weight_norm(nn.Conv1d(512, 64, kernel_size=1)),
         )
-        self.generator = Generator(style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes)
     def forward(self, asr=None, F0_curve=None, N=None, s=None):
         # All four arguments are required in practice; the None defaults are
         # not handled. `s` is the style input given to every AdaIN block.
         # debug output of the input shapes on every call
-        print('p', asr.shape, F0_curve.shape, N.shape)
         # stride-2 convolutions shorten F0_curve / N before concatenation
         F0 = self.F0_conv(F0_curve)
         N = self.N_conv(N)
         # print(asr.shape, F0.shape, N.shape, 'TF')
         x = torch.cat([asr, F0, N], axis=1)
         x = self.encode(x, s)
         asr_res = self.asr_res(asr)
         # `res` stays True until a block whose upsample_type is not "none" has run
         res = True
         for block in self.decode:
             if res:
                 x = torch.cat([x, asr_res, F0, N], axis=1)
             x = block(x, s)
             if block.upsample_type != "none":
                 res = False
         # the generator receives the original (unstrided) F0_curve
         x = self.generator(x, s, F0_curve)
         return x

 import torch
 import torch.nn.functional as F
 import torch.nn as nn
+from torch.nn import Conv1d, ConvTranspose1d
+from torch.nn.utils import weight_norm, remove_weight_norm
 import math
 import numpy as np
 LRELU_SLOPE = 0.1
 def get_padding(kernel_size, dilation=1):
     """Return int((kernel_size * dilation - dilation) / 2).

     The result is passed as the ``padding`` argument of the dilated
     convolutions built in this file.
     """
     effective_span = kernel_size * dilation - dilation
     return int(effective_span / 2)
+def _tile(x,
           length=None):
     # Repeat `x` along dim 2 and crop so the last dimension has exactly
     # `length` entries. `x` is addressed with three axes below, so it has
     # to be a 3-D tensor.
     # For positive `length` the repeat count int(length / x.shape[2]) + 1
     # always produces more than `length` entries before the crop.
     # NOTE(review): `length` defaults to None, but None is not handled -
     # the division below would raise, so callers must always pass it.
     x = x.repeat(1, 1, int(length / x.shape[2]) + 1)[:, :, :length]
     return x
 class AdaIN1d(nn.Module):
     # used by HiFiGan & ProsodyPredictor
     def __init__(self, style_dim, num_features):
         super().__init__()
         self.norm = nn.InstanceNorm1d(num_features, affine=False)
         # x = torch.Size([1, 512, 248])     same as output
         # s = torch.Size([1, 7, 1, 128])
         s = self.fc(s.transpose(1, 2)).transpose(1, 2)
         s = _tile(s, length=x.shape[2])
         gamma, beta = torch.chunk(s, chunks=2, dim=1)
         return (1+gamma) * self.norm(x) + beta
 class AdaINResBlock1(torch.nn.Module):
     def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), style_dim=64):
         super(AdaINResBlock1, self).__init__()
                                padding=get_padding(kernel_size, 1)))
         ])
         # self.convs2.apply(init_weights)
         self.adain1 = nn.ModuleList([
             AdaIN1d(style_dim, channels),
             AdaIN1d(style_dim, channels),
             AdaIN1d(style_dim, channels),
         ])
         self.adain2 = nn.ModuleList([
             AdaIN1d(style_dim, channels),
             AdaIN1d(style_dim, channels),
             AdaIN1d(style_dim, channels),
         ])
+        self.alpha1 = nn.ParameterList(
+            [nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs1))])
+        self.alpha2 = nn.ParameterList(
+            [nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs2))])
     def forward(self, x, s):
         for c1, c2, n1, n2, a1, a2 in zip(self.convs1, self.convs2, self.adain1, self.adain2, self.alpha1, self.alpha2):
             xt = n1(x, s)  # THIS IS ADAIN - EXPECTS conv1d dims
             xt = xt + (1 / a1) * (torch.sin(a1 * xt) ** 2)  # Snake1D
             xt = c1(xt)
+            xt = n2(xt, s)  # THIS IS ADAIN - EXPECTS conv1d dims
             xt = xt + (1 / a2) * (torch.sin(a2 * xt) ** 2)  # Snake1D
             xt = c2(xt)
             x = xt + x
             remove_weight_norm(l)
         for l in self.convs2:
             remove_weight_norm(l)
 class SineGen(torch.nn.Module):
     def __init__(self,
+                 samp_rate=24000,
+                 upsample_scale=300,
+                 harmonic_num=8,  # HARDCODED due to nn.Linear() of SourceModuleHnNSF
                  voiced_threshold=10):
         super(SineGen, self).__init__()
     def _f02sine(self, f0_values):
         # Turn per-sample frequency values into sine waveforms:
         #   1. normalise by the sampling rate and keep the fractional part,
         #   2. downsample by 1/upsample_scale with linear interpolation,
         #   3. accumulate along dim 1 to obtain a phase,
         #   4. upsample the phase back by upsample_scale and take sin().
         # self.sampling_rate / self.upsample_scale are set outside this method
         # (presumably from samp_rate / upsample_scale in __init__ - confirm).
         # --
+        # 134 HIFI
         # torch.Size([1, 145200, 9])
         # torch.Size([1, 145200, 9]) torch.Size([1, 145200, 9]) HIFi
+        # modulo of a negative value is non-negative: -21 % 10 = 9 because -3*10 + 9 = -21; NOTE f0_values may be signed
+        rad_values = (f0_values / self.sampling_rate) % 1
         # interpolate() works on the last axis, hence the transpose(1, 2) round trip
         rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
+                                                     scale_factor=1/self.upsample_scale,
+                                                     mode="linear").transpose(1, 2)
+        # hand-tuned factor 1.84; 1.89 also sounds nice but has a woofer at punctuation
+        phase = torch.cumsum(rad_values, dim=1) * 1.84 * np.pi
         # the phase is multiplied by upsample_scale before being stretched back
         phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
                                                 scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
         sines = torch.sin(phase)
         return sines
     def forward(self, f0):
         # Build the harmonic sine source from f0 and zero it where f0 is
         # not above self.voiced_threshold.
+        # print('____________________________________\nF0 F0\n', f0.abs().mean(), f0.mean(), f0.max(), f0.min())  # male voices sound less muffed via higher scaler in sine_waves
+        # f0 is already full length - [1, 142600, 1]
+        amplif = .0104 if f0.abs().mean() < 100 else .009  # amplitude picked from mean |f0|: .0104 below 100, else .009 - voice sensitive
         # multiply f0 by the factors 1 .. harmonic_num + 1 held in a (1, 1, harmonic_num + 1) tensor
+        fn = torch.multiply(f0, torch.FloatTensor(
+            [[range(1, self.harmonic_num + 2)]]).to(f0.device))  # [1, 145200, 9]
+        # .007 was also tried; very important effect, DEFAULT=0.1, very sensitive to speaker - chosen heuristically
+        sine_waves = self._f02sine(fn) * amplif  # .009
         # uv is a float mask: 1.0 where f0 > voiced_threshold, else 0.0
         uv = (f0 > self.voiced_threshold).type(torch.float32)
+        return sine_waves * uv
 class SourceModuleHnNSF(torch.nn.Module):
     # Source module: generates sine waves with SineGen, merges the last axis
     # down to a single channel with a linear layer and applies tanh.
+    def __init__(self, harmonic_num=8):
         super(SourceModuleHnNSF, self).__init__()
         # SineGen() is built with its own defaults; `harmonic_num` given here
         # only sizes the linear layer below.
         # NOTE(review): the two must agree (SineGen's signature default is 8).
         self.l_sin_gen = SineGen()
+        # harmonic=8 is hard fixed due to this nn.Linear()
+        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
         self.l_tanh = torch.nn.Tanh()
     def forward(self, x):
         # x is forwarded unchanged to SineGen (its f0 input)
         # print('   HNnSF', x.shape)  # why this is [1, 300, 1, 535800]
         sine_wavs = self.l_sin_gen(x)
+        # This linear merges all 9 harmonics (weighted sum) into one channel
+        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
         return sine_merge
 class Generator(torch.nn.Module):
     def __init__(self,
                  style_dim,
+                 resblock_kernel_sizes,
+                 upsample_rates,
+                 upsample_initial_channel,
+                 resblock_dilation_sizes,
                  upsample_kernel_sizes):
         super(Generator, self).__init__()
         self.num_kernels = len(resblock_kernel_sizes)
         for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
             c_cur = upsample_initial_channel // (2 ** (i + 1))
+            self.ups.append(weight_norm(ConvTranspose1d(upsample_initial_channel//(2**i),
+                                                        upsample_initial_channel//(
+                                                            2**(i+1)),
+                                                        k, u, padding=(u//2 + u % 2), output_padding=u % 2)))
+            if i + 1 < len(upsample_rates):
                 stride_f0 = np.prod(upsample_rates[i + 1:])
                 self.noise_convs.append(Conv1d(
                     1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2))
+                self.noise_res.append(AdaINResBlock1(
+                    c_cur, 7, [1, 3, 5], style_dim))
             else:
                 self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
+                self.noise_res.append(AdaINResBlock1(
+                    c_cur, 11, [1, 3, 5], style_dim))
         self.resblocks = nn.ModuleList()
         self.alphas = nn.ParameterList()
+        self.alphas.append(nn.Parameter(
+            torch.ones(1, upsample_initial_channel, 1)))
         for i in range(len(self.ups)):
             ch = upsample_initial_channel//(2**(i+1))
             self.alphas.append(nn.Parameter(torch.ones(1, ch, 1)))
             for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
                 self.resblocks.append(AdaINResBlock1(ch, k, d, style_dim))
         self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
     def forward(self, x, s, f0):
         # x.shape=torch.Size([1, 512, 484]) s.shape=torch.Size([1, 1, 1, 128]) f0.shape=torch.Size([1, 484]) GENERAT 249
         f0 = self.f0_upsamp(f0).transpose(1, 2)
         # x.shape=torch.Size([1, 512, 484]) s.shape=torch.Size([1, 1, 1, 128]) f0.shape=torch.Size([1, 145200, 1]) GENERAT 253
+        # [1, 145400, 1] f0 enters already upsampled to full wav 24kHz length
+        har_source = self.m_source(f0)
         har_source = har_source.transpose(1, 2)
         for i in range(self.num_upsamples):
             x = x + (1 / self.alphas[i]) * (torch.sin(self.alphas[i] * x) ** 2)
             x_source = self.noise_convs[i](har_source)
             x_source = self.noise_res[i](x_source, s)
             xs = None
             for j in range(self.num_kernels):
                 if xs is None:
                     xs = self.resblocks[i*self.num_kernels+j](x, s)
                 else:
         remove_weight_norm(self.conv_pre)
         remove_weight_norm(self.conv_post)
 class AdainResBlk1d(nn.Module):
     # also used in ProsodyPredictor()
     def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
                  upsample='none', dropout_p=0.0):
         super().__init__()
         self.upsample_type = upsample
         self.upsample = UpSample1d(upsample)
         self.learned_sc = dim_in != dim_out
+        self._build_weights(dim_in, dim_out, style_dim)
         if upsample == 'none':
             self.pool = nn.Identity()
         else:
+            self.pool = weight_norm(nn.ConvTranspose1d(
+                dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
     def _build_weights(self, dim_in, dim_out, style_dim):
         # Create the layers of the block: two weight-normalised kernel-3
         # convolutions (stride 1, padding 1) and one AdaIN1d per convolution
         # input width (dim_in for norm1, dim_out for norm2).
         self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
         self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
         self.norm1 = AdaIN1d(style_dim, dim_in)
         self.norm2 = AdaIN1d(style_dim, dim_out)
         # self.learned_sc must already be set by the caller; when true a
         # bias-free 1x1 convolution mapping dim_in -> dim_out is created
         # (presumably for the shortcut path - confirm in _shortcut).
         if self.learned_sc:
+            self.conv1x1 = weight_norm(
+                nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
     def _shortcut(self, x):
         x = self.upsample(x)
         out = self._residual(x, s)
         out = (out + self._shortcut(x)) / math.sqrt(2)
         return out
 class UpSample1d(nn.Module):
     def __init__(self, layer_type):
         super().__init__()
         else:
             return F.interpolate(x, scale_factor=2, mode='nearest')
 class Decoder(nn.Module):
     # Decoder: merges `asr` features with strided F0 / N curves through a
     # stack of AdainResBlk1d blocks and hands the result to Generator.
     # NOTE(review): F0_channel and dim_out are accepted but not used in this
     # class; the list defaults are only forwarded to Generator, never mutated here.
+    def __init__(self, dim_in=512, F0_channel=512, style_dim=64, dim_out=80,
+                 resblock_kernel_sizes=[3, 7, 11],
+                 upsample_rates=[10, 5, 3, 2],
+                 upsample_initial_channel=512,
+                 resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+                 upsample_kernel_sizes=[20, 10, 6, 4]):
         super().__init__()
         self.decode = nn.ModuleList()
         # dim_in + 2: asr channels plus one channel each for F0 and N
         self.encode = AdainResBlk1d(dim_in + 2, 1024, style_dim)
         # 1024 + 2 + 64: previous output + F0 + N + the 64-channel asr_res
         self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
         self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
         self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
         # upsample=True differs from 'none', so this is the block that ends
         # the re-concatenation in forward()
+        self.decode.append(AdainResBlk1d(
+            1024 + 2 + 64, 512, style_dim, upsample=True))
+        self.F0_conv = weight_norm(
+            nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))  # smooth
+        self.N_conv = weight_norm(
+            nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
         # 512 input channels are hard-coded here, independent of dim_in
         self.asr_res = nn.Sequential(
             weight_norm(nn.Conv1d(512, 64, kernel_size=1)),
         )
+        self.generator = Generator(style_dim, resblock_kernel_sizes, upsample_rates,
+                                   upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes)
     def forward(self, asr=None, F0_curve=None, N=None, s=None):
         # All four arguments are required in practice; the None defaults are
         # not handled. `s` is the style input given to every AdaIN block.
+        # print('p', asr.shape, F0_curve.shape, N.shape)
         # stride-2 convolutions shorten F0_curve / N before concatenation
         F0 = self.F0_conv(F0_curve)
         N = self.N_conv(N)
         # print(asr.shape, F0.shape, N.shape, 'TF')
         x = torch.cat([asr, F0, N], axis=1)
         x = self.encode(x, s)
         asr_res = self.asr_res(asr)
         # `res` stays True until a block whose upsample_type is not "none" has run
         res = True
         for block in self.decode:
             if res:
                 x = torch.cat([x, asr_res, F0, N], axis=1)
             x = block(x, s)
             if block.upsample_type != "none":
                 res = False
         # the generator receives the original (unstrided) F0_curve
         x = self.generator(x, s, F0_curve)
         return x

msinference.py CHANGED Viewed

@@ -409,7 +409,7 @@ def foreign(text=None,   # split sentences here so we can prepend a txt for germ
             text = [sub_sent+' ' for sub_sent in textwrap.wrap(text, 200, break_long_words=0)]   # prepend txt snippet
                                                                                                  # assert that it chooses unique voice
         else:
-            text = [sub_sent+' ' for sub_sent in textwrap.wrap(text, 140, break_long_words=0)]  # allow longer non split text
                                                                                                  # for non deu MMS TTS lang.
     for _t in text:

             text = [sub_sent+' ' for sub_sent in textwrap.wrap(text, 200, break_long_words=0)]   # prepend txt snippet
                                                                                                  # assert that it chooses unique voice
         else:
+            text = [sub_sent+' ' for sub_sent in textwrap.wrap(text, 640, break_long_words=0)]  # allow longer non split text
                                                                                                  # for non deu MMS TTS lang.
     for _t in text:

tts.py CHANGED Viewed

@@ -85,7 +85,7 @@ def command_line_args():
         '--speed',
         help='speec of TTS (only used in Non English voices).',
         type=str,
-        default=1.24,
     )
     return parser

         '--speed',
         help='speec of TTS (only used in Non English voices).',
         type=str,
+        default=1.44,
     )
     return parser