HIFIGAN tune v 1.0
Browse files
- Modules/hifigan.py +104 -213
- Utils/ASR/__init__.py +0 -1
- Utils/ASR/config.yml +0 -29
- Utils/ASR/epoch_00080.pth +0 -3
- Utils/ASR/layers.py +0 -354
- Utils/ASR/models.py +0 -186
- models.py +19 -54
- msinference.py +30 -96
Modules/hifigan.py CHANGED

@@ -4,13 +4,18 @@ import torch.nn as nn
 from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 import math
-import random
 import numpy as np

+
+LRELU_SLOPE = 0.1
+
 def get_padding(kernel_size, dilation=1):
     return int((kernel_size*dilation - dilation)/2)

-
+def _tile(x,
+          length=None):
+    x = x.repeat(1, 1, int(length / x.shape[2]) + 1)[:, :, :length]
+    return x

 class AdaIN1d(nn.Module):

@@ -22,15 +27,22 @@ class AdaIN1d(nn.Module):
         self.fc = nn.Linear(style_dim, num_features*2)

     def forward(self, x, s):
-        s = self.fc(s)  # [bs, 1024, 130]
-        s = F.interpolate(s[:, :, 0, :].transpose(1,2), x.shape[2], mode='linear')  # different time-resolution than Dur
-        gamma, beta = torch.chunk(s, chunks=2, dim=1)  # channels vary in for loop
+
+        # x = torch.Size([1, 512, 248]) same as output
+        # s = torch.Size([1, 7, 1, 128])
+
+        s = self.fc(s.transpose(1, 2)).transpose(1, 2)
+
+        s = _tile(s, length=x.shape[2])
+
+        gamma, beta = torch.chunk(s, chunks=2, dim=1)
+        return (1+gamma) * self.norm(x) + beta

 class AdaINResBlock1(torch.nn.Module):
     def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), style_dim=64):

@@ -73,10 +85,10 @@ class AdaINResBlock1(torch.nn.Module):

     def forward(self, x, s):
         for c1, c2, n1, n2, a1, a2 in zip(self.convs1, self.convs2, self.adain1, self.adain2, self.alpha1, self.alpha2):
-            xt = n1(x, s)
+            xt = n1(x, s)  # THIS IS ADAIN - EXPECTS conv1d dims
             xt = xt + (1 / a1) * (torch.sin(a1 * xt) ** 2)  # Snake1D
             xt = c1(xt)
-            xt = n2(xt, s)
+            xt = n2(xt, s)  # THIS IS ADAIN - EXPECTS conv1d dims
             xt = xt + (1 / a2) * (torch.sin(a2 * xt) ** 2)  # Snake1D
             xt = c2(xt)
             x = xt + x

@@ -89,205 +101,80 @@ class AdaINResBlock1(torch.nn.Module):
             remove_weight_norm(l)

 class SineGen(torch.nn.Module):
-    sine_amp: amplitude of sine-wavefrom (default 0.1)
-    noise_std: std of Gaussian noise (default 0.003)
-    voiced_thoreshold: F0 threshold for U/V classification (default 0)
-    flag_for_pulse: this SinGen is used inside PulseGen (default False)
-    Note: when flag_for_pulse is True, the first time step of a voiced
-    segment is always sin(np.pi) or cos(0)
-    """
-
-    def __init__(self, samp_rate, upsample_scale, harmonic_num=0,
-                 sine_amp=0.1, noise_std=0.003,
-                 voiced_threshold=0,
-                 flag_for_pulse=False):
+
+    def __init__(self,
+                 samp_rate=24000,
+                 upsample_scale=300,
+                 harmonic_num=8,  # HARDCODED due to nn.Linear() of SourceModuleHnNSF
+                 voiced_threshold=10):
+
         super(SineGen, self).__init__()
-        self.sine_amp = sine_amp
-        self.noise_std = noise_std
         self.harmonic_num = harmonic_num
-        self.dim = self.harmonic_num + 1
         self.sampling_rate = samp_rate
         self.voiced_threshold = voiced_threshold
-        self.flag_for_pulse = flag_for_pulse
         self.upsample_scale = upsample_scale

-    def _f02uv(self, f0):
-        # generate uv signal
-        uv = (f0 > self.voiced_threshold).type(torch.float32)
-        return uv
-
     def _f02sine(self, f0_values):
-        rad_values = (f0_values / self.sampling_rate) % 1
-        rad_values
-        # # Buffer tmp_over_one_idx indicates the time step to add -1.
-        # # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi
-        # tmp_over_one = torch.cumsum(rad_values, 1) % 1
-        # tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
-        # cumsum_shift = torch.zeros_like(rad_values)
-        # cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
-
-        # phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
-        rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
-                                                     scale_factor=1/self.upsample_scale,
-                                                     mode="linear").transpose(1, 2)
-
-        # tmp_over_one = torch.cumsum(rad_values, 1) % 1
-        # tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
-        # cumsum_shift = torch.zeros_like(rad_values)
-        # cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
-
-        phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
-        phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
-                                                scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
-        sines = torch.sin(phase)
-
-        else:
-            # If necessary, make sure that the first time step of every
-            # voiced segments is sin(pi) or cos(0)
-            # This is used for pulse-train generation
-
-            # identify the last time step in unvoiced segments
-            uv = self._f02uv(f0_values)
-            uv_1 = torch.roll(uv, shifts=-1, dims=1)
-            uv_1[:, -1, :] = 1
-            u_loc = (uv < 1) * (uv_1 > 0)
-
-            # get the instantanouse phase
-            tmp_cumsum = torch.cumsum(rad_values, dim=1)
-            # different batch needs to be processed differently
-            for idx in range(f0_values.shape[0]):
-                temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
-                temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
-                # stores the accumulation of i.phase within
-                # each voiced segments
-                tmp_cumsum[idx, :, :] = 0
-                tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
-
-            # rad_values - tmp_cumsum: remove the accumulation of i.phase
-            # within the previous voiced segment.
-            i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
-
-            # get the sines
-            sines = torch.cos(i_phase * 2 * np.pi)
+        # --
+        # 134 HIFI
+        # torch.Size([1, 145200, 9])
+        # torch.Size([1, 145200, 9]) torch.Size([1, 145200, 9]) HIFi
+
+        rad_values = (f0_values / self.sampling_rate) % 1  # -21 % 10 = 9 as -3*10 + 9 = 21 NOTICE THAT LCM IS SIGNED HENCE not POSITIVE integer
+
+        # print('BEF', rad_values.shape)
+
+        rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
+                                                     scale_factor=1/self.upsample_scale,
+                                                     mode="linear").transpose(1, 2)
+        print('AFt', rad_values.shape)  # downsamples the phases to 1/300 and sums them to be 0,,1,100000,20000*2*pi
+        phase = torch.cumsum(rad_values, dim=1) * 1.84 * np.pi  # 1.89 sounds also nice has woofer at punctuation
+        phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
+                                                scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
+        sines = torch.sin(phase)
         return sines

     def forward(self, f0):
-        """ sine_tensor, uv = forward(f0)
-        input F0: tensor(batchsize=1, length, dim=1)
-        f0 for unvoiced steps should be 0
-        output sine_tensor: tensor(batchsize=1, length, dim)
-        output uv: tensor(batchsize=1, length, 1)
-        """
-        f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
-                             device=f0.device)
-        # fundamental component
-        fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
-
-        # generate sine waveforms
-        sine_waves = self._f02sine(fn) * self.sine_amp
-
-        # generate uv signal
-        # uv = torch.ones(f0.shape)
-        # uv = uv * (f0 > self.voiced_threshold)
-        uv = self._f02uv(f0)
-
-        # noise: for unvoiced should be similar to sine_amp
-        # std = self.sine_amp/3 -> max value ~ self.sine_amp
-        # .       for voiced regions is self.noise_std
-        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
-        noise = noise_amp * torch.randn_like(sine_waves)
-
-        # first: set the unvoiced part to 0 by uv
-        # then: additive noise
-        sine_waves = sine_waves * uv + noise
-        return sine_waves, uv, noise
-
-    harmonic_num: number of harmonic above F0 (default: 0)
-    sine_amp: amplitude of sine source signal (default: 0.1)
-    add_noise_std: std of additive Gaussian noise (default: 0.003)
-        note that amplitude of noise in unvoiced is decided
-        by sine_amp
-    voiced_threshold: threhold to set U/V given F0 (default: 0)
-    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
-    F0_sampled (batchsize, length, 1)
-    Sine_source (batchsize, length, 1)
-    noise_source (batchsize, length 1)
-    uv (batchsize, length, 1)
-    """
-
-    def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
-                 add_noise_std=0.003, voiced_threshod=0):
-        super(SourceModuleHnNSF, self).__init__()
-
-        self.l_sin_gen = SineGen(sampling_rate, upsample_scale, harmonic_num,
-                                 sine_amp, add_noise_std, voiced_threshod)
+        # f0 is already full length - [1, 142600, 1]
+
+        fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))  # [1, 145200, 9]
+
+        sine_waves = self._f02sine(fn) * .007  # very important effect DEFAULT=0.1 very sensitive to speaker
+
+        uv = (f0 > self.voiced_threshold).type(torch.float32)
+
+        return sine_waves * uv  #+ noise
+
+class SourceModuleHnNSF(torch.nn.Module):
+
+    def __init__(self, harmonic_num=8):
+
+        super(SourceModuleHnNSF, self).__init__()
+        self.l_sin_gen = SineGen()
+        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)  # harmonic=8 is hard fixed due to this nn.Linear()
         self.l_tanh = torch.nn.Tanh()

     def forward(self, x):
-        noise_source (batchsize, length 1)
-        """
-        # source for harmonic branch
-        with torch.no_grad():
-            sine_wavs, uv, _ = self.l_sin_gen(x)
-            sine_merge = self.l_tanh(self.l_linear(sine_wavs))
-
-        # source for noise branch, in the same shape as uv
-        noise = torch.randn_like(uv) * self.sine_amp / 3
-        return sine_merge, noise, uv
+        # print(' HNnSF', x.shape)  # why this is [1, 300, 1, 535800]
+        sine_wavs = self.l_sin_gen(x)
+        sine_merge = self.l_tanh(self.l_linear(sine_wavs))  # This linear sums all 9 harmonics
+        return sine_merge

 class Generator(torch.nn.Module):
-    def __init__(self,
+    def __init__(self,
+                 style_dim,
+                 resblock_kernel_sizes,
+                 upsample_rates,
+                 upsample_initial_channel,
+                 resblock_dilation_sizes,
+                 upsample_kernel_sizes):
         super(Generator, self).__init__()
         self.num_kernels = len(resblock_kernel_sizes)
         self.num_upsamples = len(upsample_rates)
-
-        self.m_source = SourceModuleHnNSF(
-            sampling_rate=24000,
-            upsample_scale=np.prod(upsample_rates),
-            harmonic_num=8, voiced_threshod=10)
-
+        self.m_source = SourceModuleHnNSF()
         self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
         self.noise_convs = nn.ModuleList()
         self.ups = nn.ModuleList()

@@ -304,10 +191,10 @@ class Generator(torch.nn.Module):
                 stride_f0 = np.prod(upsample_rates[i + 1:])
                 self.noise_convs.append(Conv1d(
                     1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2))
-                self.noise_res.append(
+                self.noise_res.append(AdaINResBlock1(c_cur, 7, [1,3,5], style_dim))
             else:
                 self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
-                self.noise_res.append(
+                self.noise_res.append(AdaINResBlock1(c_cur, 11, [1,3,5], style_dim))

         self.resblocks = nn.ModuleList()

@@ -319,28 +206,35 @@ class Generator(torch.nn.Module):
             self.alphas.append(nn.Parameter(torch.ones(1, ch, 1)))

         for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
-            self.resblocks.append(
+            self.resblocks.append(AdaINResBlock1(ch, k, d, style_dim))

         self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))

     def forward(self, x, s, f0):

-        har_source
-        har_source = har_source.transpose(1, 2)
+        # x.shape=torch.Size([1, 512, 484]) s.shape=torch.Size([1, 1, 1, 128]) f0.shape=torch.Size([1, 484]) GENERAT 249
+        f0 = self.f0_upsamp(f0).transpose(1, 2)
+        print(f'{x.shape=} {s.shape=} {f0.shape=} GENERAT 249 LALALALALA\n\n')
+        # x.shape=torch.Size([1, 512, 484]) s.shape=torch.Size([1, 1, 1, 128]) f0.shape=torch.Size([1, 145200, 1]) GENERAT 253
+
+        har_source = self.m_source(f0)  # [1, 145400, 1] f0 enters already upsampled to full wav 24kHz length
+
+        har_source = har_source.transpose(1, 2)
+
         for i in range(self.num_upsamples):
             x = x + (1 / self.alphas[i]) * (torch.sin(self.alphas[i] * x) ** 2)
             x_source = self.noise_convs[i](har_source)
             x_source = self.noise_res[i](x_source, s)
+
             x = self.ups[i](x)
+            print(x.min(), x.max(), x_source.min(), x_source.max())
             x = x + x_source
+
             xs = None
             for j in range(self.num_kernels):
                 if xs is None:
                     xs = self.resblocks[i*self.num_kernels+j](x, s)
                 else:

@@ -373,9 +267,7 @@ class AdainResBlk1d(nn.Module):
         self.upsample_type = upsample
         self.upsample = UpSample1d(upsample)
         self.learned_sc = dim_in != dim_out
-        self._build_weights(dim_in, dim_out, style_dim)
-        self.dropout = nn.Dropout(dropout_p)
-
+        self._build_weights(dim_in, dim_out, style_dim)
         if upsample == 'none':
             self.pool = nn.Identity()
         else:

@@ -400,10 +292,10 @@ class AdainResBlk1d(nn.Module):
         x = self.norm1(x, s)
         x = self.actv(x)
         x = self.pool(x)
-        x = self.conv1(
+        x = self.conv1(x)
         x = self.norm2(x, s)
         x = self.actv(x)
-        x = self.conv2(
+        x = self.conv2(x)
         return x

     def forward(self, x, s):

@@ -440,7 +332,7 @@ class Decoder(nn.Module):
         self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
         self.decode.append(AdainResBlk1d(1024 + 2 + 64, 512, style_dim, upsample=True))

-        self.F0_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
+        self.F0_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))  # smooth

         self.N_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))

@@ -453,21 +345,17 @@ class Decoder(nn.Module):

     def forward(self, asr=None, F0_curve=None, N=None, s=None):
-        if self.training:
-            downlist = [0, 3, 7]
-            F0_down = downlist[random.randint(0, 2)]
-            downlist = [0, 3, 7, 15]
-            N_down = downlist[random.randint(0, 3)]
-            if F0_down:
-                F0_curve = nn.functional.conv1d(F0_curve.unsqueeze(1), torch.ones(1, 1, F0_down).to('cuda'), padding=F0_down//2).squeeze(1) / F0_down
-            if N_down:
-                N = nn.functional.conv1d(N.unsqueeze(1), torch.ones(1, 1, N_down).to('cuda'), padding=N_down//2).squeeze(1) / N_down
+
+        print('p', asr.shape, F0_curve.shape, N.shape)
+        F0 = self.F0_conv(F0_curve)
+        N = self.N_conv(N)
+
+        print(asr.shape, F0.shape, N.shape, 'TF')

         x = torch.cat([asr, F0, N], axis=1)
+
         x = self.encode(x, s)

         asr_res = self.asr_res(asr)

@@ -475,7 +363,10 @@ class Decoder(nn.Module):
         res = True
         for block in self.decode:
             if res:
+
                 x = torch.cat([x, asr_res, F0, N], axis=1)
+
             x = block(x, s)
             if block.upsample_type != "none":
                 res = False

@@ -483,4 +374,4 @@ class Decoder(nn.Module):
         x = self.generator(x, s, F0_curve)
         return x
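The two ideas this commit leans on - tiling a per-utterance style code along time instead of interpolating it, and driving the generator with a single tanh-compressed harmonic source - can be sanity-checked in isolation. The sketch below is a minimal standalone reproduction, not the modules above: the channel sizes, the simple cumulative 2*pi phase (the commit additionally downsamples/upsamples the phase and uses a 1.84*pi factor), and the helper names are illustrative assumptions; only the 0.007 amplitude, the 9 partials and the voiced threshold of 10 are taken from the diff.

import torch
import torch.nn as nn

def tile_style(s, length):
    # repeat a [B, C, T_s] style code along time until it covers `length` frames (assumed helper)
    return s.repeat(1, 1, length // s.shape[2] + 1)[:, :, :length]

class TinyAdaIN1d(nn.Module):
    # minimal AdaIN: per-channel affine params predicted from the tiled style (sketch, not the repo class)
    def __init__(self, style_dim=128, num_features=64):
        super().__init__()
        self.norm = nn.InstanceNorm1d(num_features, affine=False)
        self.fc = nn.Linear(style_dim, num_features * 2)

    def forward(self, x, s):                              # x: [B, C, T], s: [B, style_dim, T_s]
        s = self.fc(s.transpose(1, 2)).transpose(1, 2)    # [B, 2C, T_s]
        s = tile_style(s, x.shape[2])                     # [B, 2C, T]
        gamma, beta = torch.chunk(s, 2, dim=1)
        return (1 + gamma) * self.norm(x) + beta

def harmonic_source(f0, sr=24000, harmonics=9, amp=0.007):
    # f0: [B, T, 1] already at audio rate; build `harmonics` sine partials and mask unvoiced frames
    k = torch.arange(1, harmonics + 1, dtype=f0.dtype, device=f0.device)
    phase = torch.cumsum((f0 * k / sr) % 1.0, dim=1) * 2 * torch.pi   # simplified phase accumulation
    sines = torch.sin(phase) * amp
    uv = (f0 > 10).float()                                            # voiced/unvoiced mask
    return sines * uv                                                 # [B, T, harmonics]

x = torch.randn(1, 64, 248)
s = torch.randn(1, 128, 7)
print(TinyAdaIN1d()(x, s).shape)                                      # torch.Size([1, 64, 248])
print(harmonic_source(torch.full((1, 2400, 1), 200.0)).shape)         # torch.Size([1, 2400, 9])

In the commit itself the nine partials are then collapsed to one excitation channel by the nn.Linear(harmonic_num + 1, 1) followed by tanh in SourceModuleHnNSF.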
Utils/ASR/__init__.py DELETED
@@ -1 +0,0 @@
Utils/ASR/config.yml DELETED
@@ -1,29 +0,0 @@
log_dir: "logs/20201006"
save_freq: 5
device: "cuda"
epochs: 180
batch_size: 64
pretrained_model: ""
train_data: "ASRDataset/train_list.txt"
val_data: "ASRDataset/val_list.txt"

dataset_params:
  data_augmentation: false

preprocess_parasm:
  sr: 24000
  spect_params:
    n_fft: 2048
    win_length: 1200
    hop_length: 300
  mel_params:
    n_mels: 80

model_params:
  input_dim: 80
  hidden_dim: 256
  n_token: 178
  token_embedding_dim: 512

optimizer_params:
  lr: 0.0005
Utils/ASR/epoch_00080.pth DELETED
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fedd55a1234b0c56e1e8b509c74edf3a5e2f27106a66038a4a946047a775bd6c
size 94552811
Utils/ASR/layers.py DELETED
@@ -1,354 +0,0 @@
import math
import torch
from torch import nn
from typing import Optional, Any
from torch import Tensor
import torch.nn.functional as F
import torchaudio
import torchaudio.functional as audio_F

import random
random.seed(0)


def _get_activation_fn(activ):
    if activ == 'relu':
        return nn.ReLU()
    elif activ == 'lrelu':
        return nn.LeakyReLU(0.2)
    elif activ == 'swish':
        return lambda x: x*torch.sigmoid(x)
    else:
        raise RuntimeError('Unexpected activ type %s, expected [relu, lrelu, swish]' % activ)

class LinearNorm(torch.nn.Module):
    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super(LinearNorm, self).__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

        torch.nn.init.xavier_uniform_(
            self.linear_layer.weight,
            gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, x):
        return self.linear_layer(x)


class ConvNorm(torch.nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
                 padding=None, dilation=1, bias=True, w_init_gain='linear', param=None):
        super(ConvNorm, self).__init__()
        if padding is None:
            assert(kernel_size % 2 == 1)
            padding = int(dilation * (kernel_size - 1) / 2)

        self.conv = torch.nn.Conv1d(in_channels, out_channels,
                                    kernel_size=kernel_size, stride=stride,
                                    padding=padding, dilation=dilation,
                                    bias=bias)

        torch.nn.init.xavier_uniform_(
            self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain, param=param))

    def forward(self, signal):
        conv_signal = self.conv(signal)
        return conv_signal

class CausualConv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=1, dilation=1, bias=True, w_init_gain='linear', param=None):
        super(CausualConv, self).__init__()
        if padding is None:
            assert(kernel_size % 2 == 1)
            padding = int(dilation * (kernel_size - 1) / 2) * 2
        else:
            self.padding = padding * 2
        self.conv = nn.Conv1d(in_channels, out_channels,
                              kernel_size=kernel_size, stride=stride,
                              padding=self.padding,
                              dilation=dilation,
                              bias=bias)

        torch.nn.init.xavier_uniform_(
            self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain, param=param))

    def forward(self, x):
        x = self.conv(x)
        x = x[:, :, :-self.padding]
        return x

class CausualBlock(nn.Module):
    def __init__(self, hidden_dim, n_conv=3, dropout_p=0.2, activ='lrelu'):
        super(CausualBlock, self).__init__()
        self.blocks = nn.ModuleList([
            self._get_conv(hidden_dim, dilation=3**i, activ=activ, dropout_p=dropout_p)
            for i in range(n_conv)])

    def forward(self, x):
        for block in self.blocks:
            res = x
            x = block(x)
            x += res
        return x

    def _get_conv(self, hidden_dim, dilation, activ='lrelu', dropout_p=0.2):
        layers = [
            CausualConv(hidden_dim, hidden_dim, kernel_size=3, padding=dilation, dilation=dilation),
            _get_activation_fn(activ),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(p=dropout_p),
            CausualConv(hidden_dim, hidden_dim, kernel_size=3, padding=1, dilation=1),
            _get_activation_fn(activ),
            nn.Dropout(p=dropout_p)
        ]
        return nn.Sequential(*layers)

class ConvBlock(nn.Module):
    def __init__(self, hidden_dim, n_conv=3, dropout_p=0.2, activ='relu'):
        super().__init__()
        self._n_groups = 8
        self.blocks = nn.ModuleList([
            self._get_conv(hidden_dim, dilation=3**i, activ=activ, dropout_p=dropout_p)
            for i in range(n_conv)])

    def forward(self, x):
        for block in self.blocks:
            res = x
            x = block(x)
            x += res
        return x

    def _get_conv(self, hidden_dim, dilation, activ='relu', dropout_p=0.2):
        layers = [
            ConvNorm(hidden_dim, hidden_dim, kernel_size=3, padding=dilation, dilation=dilation),
            _get_activation_fn(activ),
            nn.GroupNorm(num_groups=self._n_groups, num_channels=hidden_dim),
            nn.Dropout(p=dropout_p),
            ConvNorm(hidden_dim, hidden_dim, kernel_size=3, padding=1, dilation=1),
            _get_activation_fn(activ),
            nn.Dropout(p=dropout_p)
        ]
        return nn.Sequential(*layers)

class LocationLayer(nn.Module):
    def __init__(self, attention_n_filters, attention_kernel_size,
                 attention_dim):
        super(LocationLayer, self).__init__()
        padding = int((attention_kernel_size - 1) / 2)
        self.location_conv = ConvNorm(2, attention_n_filters,
                                      kernel_size=attention_kernel_size,
                                      padding=padding, bias=False, stride=1,
                                      dilation=1)
        self.location_dense = LinearNorm(attention_n_filters, attention_dim,
                                         bias=False, w_init_gain='tanh')

    def forward(self, attention_weights_cat):
        processed_attention = self.location_conv(attention_weights_cat)
        processed_attention = processed_attention.transpose(1, 2)
        processed_attention = self.location_dense(processed_attention)
        return processed_attention


class Attention(nn.Module):
    def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
                 attention_location_n_filters, attention_location_kernel_size):
        super(Attention, self).__init__()
        self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
                                      bias=False, w_init_gain='tanh')
        self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False,
                                       w_init_gain='tanh')
        self.v = LinearNorm(attention_dim, 1, bias=False)
        self.location_layer = LocationLayer(attention_location_n_filters,
                                            attention_location_kernel_size,
                                            attention_dim)
        self.score_mask_value = -float("inf")

    def get_alignment_energies(self, query, processed_memory,
                               attention_weights_cat):
        """
        PARAMS
        ------
        query: decoder output (batch, n_mel_channels * n_frames_per_step)
        processed_memory: processed encoder outputs (B, T_in, attention_dim)
        attention_weights_cat: cumulative and prev. att weights (B, 2, max_time)
        RETURNS
        -------
        alignment (batch, max_time)
        """

        processed_query = self.query_layer(query.unsqueeze(1))
        processed_attention_weights = self.location_layer(attention_weights_cat)
        energies = self.v(torch.tanh(
            processed_query + processed_attention_weights + processed_memory))

        energies = energies.squeeze(-1)
        return energies

    def forward(self, attention_hidden_state, memory, processed_memory,
                attention_weights_cat, mask):
        """
        PARAMS
        ------
        attention_hidden_state: attention rnn last output
        memory: encoder outputs
        processed_memory: processed encoder outputs
        attention_weights_cat: previous and cummulative attention weights
        mask: binary mask for padded data
        """
        alignment = self.get_alignment_energies(
            attention_hidden_state, processed_memory, attention_weights_cat)

        if mask is not None:
            alignment.data.masked_fill_(mask, self.score_mask_value)

        attention_weights = F.softmax(alignment, dim=1)
        attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
        attention_context = attention_context.squeeze(1)

        return attention_context, attention_weights


class ForwardAttentionV2(nn.Module):
    def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
                 attention_location_n_filters, attention_location_kernel_size):
        super(ForwardAttentionV2, self).__init__()
        self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
                                      bias=False, w_init_gain='tanh')
        self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False,
                                       w_init_gain='tanh')
        self.v = LinearNorm(attention_dim, 1, bias=False)
        self.location_layer = LocationLayer(attention_location_n_filters,
                                            attention_location_kernel_size,
                                            attention_dim)
        self.score_mask_value = -float(1e20)

    def get_alignment_energies(self, query, processed_memory,
                               attention_weights_cat):
        """
        PARAMS
        ------
        query: decoder output (batch, n_mel_channels * n_frames_per_step)
        processed_memory: processed encoder outputs (B, T_in, attention_dim)
        attention_weights_cat: prev. and cumulative att weights (B, 2, max_time)
        RETURNS
        -------
        alignment (batch, max_time)
        """

        processed_query = self.query_layer(query.unsqueeze(1))
        processed_attention_weights = self.location_layer(attention_weights_cat)
        energies = self.v(torch.tanh(
            processed_query + processed_attention_weights + processed_memory))

        energies = energies.squeeze(-1)
        return energies

    def forward(self, attention_hidden_state, memory, processed_memory,
                attention_weights_cat, mask, log_alpha):
        """
        PARAMS
        ------
        attention_hidden_state: attention rnn last output
        memory: encoder outputs
        processed_memory: processed encoder outputs
        attention_weights_cat: previous and cummulative attention weights
        mask: binary mask for padded data
        """
        log_energy = self.get_alignment_energies(
            attention_hidden_state, processed_memory, attention_weights_cat)

        #log_energy =

        if mask is not None:
            log_energy.data.masked_fill_(mask, self.score_mask_value)

        #attention_weights = F.softmax(alignment, dim=1)

        #content_score = log_energy.unsqueeze(1) #[B, MAX_TIME] -> [B, 1, MAX_TIME]
        #log_alpha = log_alpha.unsqueeze(2) #[B, MAX_TIME] -> [B, MAX_TIME, 1]

        #log_total_score = log_alpha + content_score

        #previous_attention_weights = attention_weights_cat[:,0,:]

        log_alpha_shift_padded = []
        max_time = log_energy.size(1)
        for sft in range(2):
            shifted = log_alpha[:,:max_time-sft]
            shift_padded = F.pad(shifted, (sft,0), 'constant', self.score_mask_value)
            log_alpha_shift_padded.append(shift_padded.unsqueeze(2))

        biased = torch.logsumexp(torch.cat(log_alpha_shift_padded,2), 2)

        log_alpha_new = biased + log_energy

        attention_weights = F.softmax(log_alpha_new, dim=1)

        attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
        attention_context = attention_context.squeeze(1)

        return attention_context, attention_weights, log_alpha_new


class PhaseShuffle2d(nn.Module):
    def __init__(self, n=2):
        super(PhaseShuffle2d, self).__init__()
        self.n = n
        self.random = random.Random(1)

    def forward(self, x, move=None):
        # x.size = (B, C, M, L)
        if move is None:
            move = self.random.randint(-self.n, self.n)

        if move == 0:
            return x
        else:
            left = x[:, :, :, :move]
            right = x[:, :, :, move:]
            shuffled = torch.cat([right, left], dim=3)
            return shuffled

class PhaseShuffle1d(nn.Module):
    def __init__(self, n=2):
        super(PhaseShuffle1d, self).__init__()
        self.n = n
        self.random = random.Random(1)

    def forward(self, x, move=None):
        # x.size = (B, C, M, L)
        if move is None:
            move = self.random.randint(-self.n, self.n)

        if move == 0:
            return x
        else:
            left = x[:, :, :move]
            right = x[:, :, move:]
            shuffled = torch.cat([right, left], dim=2)

            return shuffled

class MFCC(nn.Module):
    def __init__(self, n_mfcc=40, n_mels=80):
        super(MFCC, self).__init__()
        self.n_mfcc = n_mfcc
        self.n_mels = n_mels
        self.norm = 'ortho'
        dct_mat = audio_F.create_dct(self.n_mfcc, self.n_mels, self.norm)
        self.register_buffer('dct_mat', dct_mat)

    def forward(self, mel_specgram):
        if len(mel_specgram.shape) == 2:
            mel_specgram = mel_specgram.unsqueeze(0)
            unsqueezed = True
        else:
            unsqueezed = False
        # (channel, n_mels, time).tranpose(...) dot (n_mels, n_mfcc)
        # -> (channel, time, n_mfcc).tranpose(...)
        mfcc = torch.matmul(mel_specgram.transpose(1, 2), self.dct_mat).transpose(1, 2)

        # unpack batch
        if unsqueezed:
            mfcc = mfcc.squeeze(0)
        return mfcc
Utils/ASR/models.py DELETED
@@ -1,186 +0,0 @@
import math
import torch
from torch import nn
from torch.nn import TransformerEncoder
import torch.nn.functional as F
from .layers import MFCC, Attention, LinearNorm, ConvNorm, ConvBlock

class ASRCNN(nn.Module):
    def __init__(self,
                 input_dim=80,
                 hidden_dim=256,
                 n_token=35,
                 n_layers=6,
                 token_embedding_dim=256,

    ):
        super().__init__()
        self.n_token = n_token
        self.n_down = 1
        self.to_mfcc = MFCC()
        self.init_cnn = ConvNorm(input_dim//2, hidden_dim, kernel_size=7, padding=3, stride=2)
        self.cnns = nn.Sequential(
            *[nn.Sequential(
                ConvBlock(hidden_dim),
                nn.GroupNorm(num_groups=1, num_channels=hidden_dim)
            ) for n in range(n_layers)])
        self.projection = ConvNorm(hidden_dim, hidden_dim // 2)
        self.ctc_linear = nn.Sequential(
            LinearNorm(hidden_dim//2, hidden_dim),
            nn.ReLU(),
            LinearNorm(hidden_dim, n_token))
        self.asr_s2s = ASRS2S(
            embedding_dim=token_embedding_dim,
            hidden_dim=hidden_dim//2,
            n_token=n_token)

    def forward(self, x, src_key_padding_mask=None, text_input=None):
        x = self.to_mfcc(x)
        x = self.init_cnn(x)
        x = self.cnns(x)
        x = self.projection(x)
        x = x.transpose(1, 2)
        ctc_logit = self.ctc_linear(x)
        if text_input is not None:
            _, s2s_logit, s2s_attn = self.asr_s2s(x, src_key_padding_mask, text_input)
            return ctc_logit, s2s_logit, s2s_attn
        else:
            return ctc_logit

    def get_feature(self, x):
        x = self.to_mfcc(x.squeeze(1))
        x = self.init_cnn(x)
        x = self.cnns(x)
        x = self.projection(x)
        return x

    def length_to_mask(self, lengths):
        mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
        mask = torch.gt(mask+1, lengths.unsqueeze(1)).to(lengths.device)
        return mask

    def get_future_mask(self, out_length, unmask_future_steps=0):
        """
        Args:
            out_length (int): returned mask shape is (out_length, out_length).
            unmask_futre_steps (int): unmasking future step size.
        Return:
            mask (torch.BoolTensor): mask future timesteps mask[i, j] = True if i > j + unmask_future_steps else False
        """
        index_tensor = torch.arange(out_length).unsqueeze(0).expand(out_length, -1)
        mask = torch.gt(index_tensor, index_tensor.T + unmask_future_steps)
        return mask

class ASRS2S(nn.Module):
    def __init__(self,
                 embedding_dim=256,
                 hidden_dim=512,
                 n_location_filters=32,
                 location_kernel_size=63,
                 n_token=40):
        super(ASRS2S, self).__init__()
        self.embedding = nn.Embedding(n_token, embedding_dim)
        val_range = math.sqrt(6 / hidden_dim)
        self.embedding.weight.data.uniform_(-val_range, val_range)

        self.decoder_rnn_dim = hidden_dim
        self.project_to_n_symbols = nn.Linear(self.decoder_rnn_dim, n_token)
        self.attention_layer = Attention(
            self.decoder_rnn_dim,
            hidden_dim,
            hidden_dim,
            n_location_filters,
            location_kernel_size
        )
        self.decoder_rnn = nn.LSTMCell(self.decoder_rnn_dim + embedding_dim, self.decoder_rnn_dim)
        self.project_to_hidden = nn.Sequential(
            LinearNorm(self.decoder_rnn_dim * 2, hidden_dim),
            nn.Tanh())
        self.sos = 1
        self.eos = 2

    def initialize_decoder_states(self, memory, mask):
        """
        moemory.shape = (B, L, H) = (Batchsize, Maxtimestep, Hiddendim)
        """
        B, L, H = memory.shape
        self.decoder_hidden = torch.zeros((B, self.decoder_rnn_dim)).type_as(memory)
        self.decoder_cell = torch.zeros((B, self.decoder_rnn_dim)).type_as(memory)
        self.attention_weights = torch.zeros((B, L)).type_as(memory)
        self.attention_weights_cum = torch.zeros((B, L)).type_as(memory)
        self.attention_context = torch.zeros((B, H)).type_as(memory)
        self.memory = memory
        self.processed_memory = self.attention_layer.memory_layer(memory)
        self.mask = mask
        self.unk_index = 3
        self.random_mask = 0.1

    def forward(self, memory, memory_mask, text_input):
        """
        moemory.shape = (B, L, H) = (Batchsize, Maxtimestep, Hiddendim)
        moemory_mask.shape = (B, L, )
        texts_input.shape = (B, T)
        """
        self.initialize_decoder_states(memory, memory_mask)
        # text random mask
        random_mask = (torch.rand(text_input.shape) < self.random_mask).to(text_input.device)
        _text_input = text_input.clone()
        _text_input.masked_fill_(random_mask, self.unk_index)
        decoder_inputs = self.embedding(_text_input).transpose(0, 1)  # -> [T, B, channel]
        start_embedding = self.embedding(
            torch.LongTensor([self.sos]*decoder_inputs.size(1)).to(decoder_inputs.device))
        decoder_inputs = torch.cat((start_embedding.unsqueeze(0), decoder_inputs), dim=0)

        hidden_outputs, logit_outputs, alignments = [], [], []
        while len(hidden_outputs) < decoder_inputs.size(0):

            decoder_input = decoder_inputs[len(hidden_outputs)]
            hidden, logit, attention_weights = self.decode(decoder_input)
            hidden_outputs += [hidden]
            logit_outputs += [logit]
            alignments += [attention_weights]

        hidden_outputs, logit_outputs, alignments = \
            self.parse_decoder_outputs(
                hidden_outputs, logit_outputs, alignments)

        return hidden_outputs, logit_outputs, alignments


    def decode(self, decoder_input):

        cell_input = torch.cat((decoder_input, self.attention_context), -1)
        self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
            cell_input,
            (self.decoder_hidden, self.decoder_cell))

        attention_weights_cat = torch.cat(
            (self.attention_weights.unsqueeze(1),
             self.attention_weights_cum.unsqueeze(1)),dim=1)

        self.attention_context, self.attention_weights = self.attention_layer(
            self.decoder_hidden,
            self.memory,
            self.processed_memory,
            attention_weights_cat,
            self.mask)

        self.attention_weights_cum += self.attention_weights

        hidden_and_context = torch.cat((self.decoder_hidden, self.attention_context), -1)
        hidden = self.project_to_hidden(hidden_and_context)

        # dropout to increasing g
        logit = self.project_to_n_symbols(F.dropout(hidden, 0.5, self.training))

        return hidden, logit, self.attention_weights

    def parse_decoder_outputs(self, hidden, logit, alignments):

        # -> [B, T_out + 1, max_time]
        alignments = torch.stack(alignments).transpose(0,1)
        # [T_out + 1, B, n_symbols] -> [B, T_out + 1, n_symbols]
        logit = torch.stack(logit).transpose(0, 1).contiguous()
        hidden = torch.stack(hidden).transpose(0, 1).contiguous()

        return hidden, logit, alignments
models.py CHANGED

@@ -6,9 +6,9 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn.utils import weight_norm, spectral_norm
-from Utils.ASR.models import ASRCNN
+# from Utils.ASR.models import ASRCNN
 from Utils.JDC.model import JDCNet
-from Modules.hifigan import AdainResBlk1d
+from Modules.hifigan import _tile, AdainResBlk1d
 import yaml

@@ -257,20 +257,6 @@ class TextEncoder(nn.Module):
                                     x, batch_first=True)
         x = x.transpose(-1, -2)
         return x
-
-    # def inference(self, x):
-    #     x = self.embedding(x)
-    #     x = x.transpose(1, 2)
-    #     x = self.cnn(x)
-    #     x = x.transpose(1, 2)
-    #     self.lstm.flatten_parameters()
-    #     x, _ = self.lstm(x)
-    #     return x
-
-    # def length_to_mask(self, lengths):
-    #     mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
-    #     mask = torch.gt(mask+1, lengths.unsqueeze(1))
-    #     return mask

 class AdaLayerNorm(nn.Module):

@@ -318,25 +304,28 @@ class ProsodyPredictor(nn.Module):
         self.N_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)

     def F0Ntrain(self, x, s):
-        x,
-        F0 = x.transpose(-1, -2)
+        print(x.shape, s.shape, 'F)N T T T')
+        x, _ = self.shared(x.transpose(1, 2))  # [bs, time, ch] LSTM
+
+        x = x.transpose(1, 2)  # [bs, ch, time]
+
+        F0 = x

         for block in self.F0:
-            print(f'
+            print(f'LOOP {F0.shape=} {s.shape=}\n')
             # )N F0.shape=torch.Size([1, 512, 147]) s.shape=torch.Size([1, 128])
-
-            F0 = block(F0, s)
+            F0 = block(F0, s)  # This is an AdainResBlk1d expects conv1d dimensions
         F0 = self.F0_proj(F0)
-
-        N = x
+        print('____________________________2nd F0Ntra')
+        N = x
+
         for block in self.N:
             N = block(N, s)
         N = self.N_proj(N)

-        return F0
-
+        return F0, N
+
 class DurationEncoder(nn.Module):

     def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):

@@ -357,13 +346,13 @@ class DurationEncoder(nn.Module):
         self.sty_dim = sty_dim

     def forward(self, x, style, text_lengths):
-        style = style
-
-        style = F.interpolate(style, x.shape[2], mode='nearest')
+
+        # style = style[:, :, 0, :].transpose(2, 1)  # [bs, 128, 11]
+
+        style = _tile(style, length=x.shape[2])  # replicate style vector to duration of txt - F.interpolate or cyclic/tile

         x = torch.cat([x, style], axis=1)  # [bs, 640, 75]
-
+
         input_lengths = text_lengths.cpu().numpy()

         for block in self.lstms:

@@ -398,28 +387,4 @@ def load_F0_models(path):
     F0_model.load_state_dict(params)
     _ = F0_model.train()

-    return F0_model
-
-def load_ASR_models(ASR_MODEL_PATH, ASR_MODEL_CONFIG):
-    # load ASR model
-    def _load_config(path):
-        with open(path) as f:
-            config = yaml.safe_load(f)
-        model_config = config['model_params']
-        return model_config
-
-    def _load_model(model_config, model_path):
-        model = ASRCNN(**model_config)
-        params = torch.load(
-            model_path,
-            map_location='cpu',
-            weights_only=False
-        )['model']
-        model.load_state_dict(params)
-        return model
-
-    asr_model_config = _load_config(ASR_MODEL_CONFIG)
-    asr_model = _load_model(asr_model_config, ASR_MODEL_PATH)
-    _ = asr_model.train()
-
-    return asr_model
+    return F0_model
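For reference, the control flow that the reworked F0Ntrain and DurationEncoder assume can be mocked in a few lines. This is a sketch under assumptions: the layer sizes are placeholders, the AdaIN residual blocks that consume the style vector are omitted, and `_tile` stands in for the helper now imported from Modules.hifigan.

import torch
import torch.nn as nn

def _tile(x, length):
    # same idea as the repo helper: cyclically repeat a [B, C, T_s] code to T frames
    return x.repeat(1, 1, length // x.shape[2] + 1)[:, :, :length]

shared = nn.LSTM(512, 256, batch_first=True, bidirectional=True)  # stand-in for predictor.shared
f0_proj = nn.Conv1d(512, 1, 1)                                    # stand-in for F0_proj
n_proj = nn.Conv1d(512, 1, 1)                                     # stand-in for N_proj

def f0_n_train(x):
    # x: [B, 512, T] aligned prosody features; the real method also threads a style s through AdaIN blocks
    x, _ = shared(x.transpose(1, 2))   # LSTM wants [bs, time, ch]
    x = x.transpose(1, 2)              # back to [bs, ch, time] for the conv blocks
    return f0_proj(x), n_proj(x)       # the commit now returns both F0 and N

x = torch.randn(1, 512, 147)
style = torch.randn(1, 128, 11)
print(_tile(style, x.shape[2]).shape)  # [1, 128, 147]: style broadcast over the text duration
F0, N = f0_n_train(x)
print(F0.shape, N.shape)               # [1, 1, 147] [1, 1, 147]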
msinference.py CHANGED

@@ -7,14 +7,9 @@ import numpy as np
 import yaml
 import torchaudio
 import librosa
-from models import ProsodyPredictor, TextEncoder, StyleEncoder,
+from models import ProsodyPredictor, TextEncoder, StyleEncoder, load_F0_models
 from nltk.tokenize import word_tokenize

-torch.manual_seed(0)
-# torch.backends.cudnn.benchmark = False
-# torch.backends.cudnn.deterministic = True
-np.random.seed(0)
-
 # IPA Phonemizer: https://github.com/bootphon/phonemizer

 _pad = "$"

@@ -72,8 +67,11 @@ def compute_style(path):
     with torch.no_grad():
         ref_s = style_encoder(mel_tensor.unsqueeze(1))
         ref_p = predictor_encoder(mel_tensor.unsqueeze(1))  # [bs, 11, 1, 128]
-
-
+
+    s = torch.cat([ref_s, ref_p], dim=3)  # [bs, 11, 1, 256]
+
+    s = s[:, :, 0, :].transpose(1, 2)  # [1, 128, 11]
+    return s  # [1, 128, 11]

 device = 'cpu'
 if torch.cuda.is_available():

@@ -91,53 +89,14 @@ global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_
 args = yaml.safe_load(open(str('Utils/config.yml')))
 ASR_config = args['ASR_config']

-ASR_path = args['ASR_path']
-text_aligner = load_ASR_models(ASR_path, ASR_config).eval().to(device)
-
 F0_path = args['F0_path']
 pitch_extractor = load_F0_models(F0_path).eval().to(device)

 from Utils.PLBERT.util import load_plbert
-bert = load_plbert(args['PLBERT_dir']).eval().to(device)
-# model_params = recursive_munch(config['model_params'])
-# --
-# def build_model(args, text_aligner, pitch_extractor, bert):
-#     print(f'\n==============\n {args.decoder.type=}\n==============L584 models.py @ build_model()\n')
-# # ======================================
-# In [4]: args['model_params']
-# Out[4]:
-# {'decoder': {'resblock_dilation_sizes': [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
-#   'resblock_kernel_sizes': [3, 7, 11],
-#   'type': 'hifigan',
-#   'upsample_initial_channel': 512,
-#   'upsample_kernel_sizes': [20, 10, 6, 4],
-#   'upsample_rates': [10, 5, 3, 2]},
-#  'diffusion': {'dist': {'estimate_sigma_data': True,
-#    'mean': -3.0,
-#    'sigma_data': 0.19926648961191362,
-#    'std': 1.0},
-#   'embedding_mask_proba': 0.1,
-#   'transformer': {'head_features': 64,
-#    'multiplier': 2,
-#    'num_heads': 8,
-#    'num_layers': 3}},
-#  'dim_in': 64,
-#  'dropout': 0.2,
-#  'hidden_dim': 512,
-#  'max_conv_dim': 512,
-#  'max_dur': 50,
-#  'multispeaker': True,
-#  'n_layer': 3,
-#  'n_mels': 80,
-#  'n_token': 178,
-#  'slm': {'hidden': 768,
-#   'initial_channel': 64,
-#   'model': 'microsoft/wavlm-base-plus',
-#   'nlayers': 13,
-#   'sr': 16000},
-#  'style_dim': 128}
-# # ===============================================
 from Modules.hifigan import Decoder
+
+bert = load_plbert(args['PLBERT_dir']).eval().to(device)
+
 decoder = Decoder(dim_in=512,
                   style_dim=128,
                   dim_out=80,  # n_mels

@@ -166,12 +125,7 @@ predictor_encoder = StyleEncoder(dim_in=64,
                                  style_dim=128,
                                  max_conv_dim=512).eval().to(device)  # prosodic style encoder
 bert_encoder = torch.nn.Linear(bert.config.hidden_size, 512).eval().to(device)
-# --
-# model = build_model(model_params, text_aligner, pitch_extractor, plbert)
-# _ = [model[key].eval() for key in model]
-# _ = [model[key].to(device) for key in model]

-# params_whole = torch.load("Models/LibriTTS/epochs_2nd_00020.pth", map_location='cpu')
 # params_whole = torch.load('freevc2/yl4579_styletts2.pth' map_location='cpu')
 params_whole = torch.load(str(cached_path("hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth")), map_location='cpu')
 params = params_whole['net']

@@ -204,7 +158,6 @@ decoder.load_state_dict( _del_prefix(params['decoder']), strict=True)
 text_encoder.load_state_dict(_del_prefix(params['text_encoder']), strict=True)
 predictor_encoder.load_state_dict(_del_prefix(params['predictor_encoder']), strict=True)
 style_encoder.load_state_dict(_del_prefix(params['style_encoder']), strict=True)
-text_aligner.load_state_dict( _del_prefix(params['text_aligner']), strict=True)
 pitch_extractor.load_state_dict(_del_prefix(params['pitch_extractor']), strict=True)

 # def _shift(x):

@@ -236,40 +189,22 @@ def inference(text,
     with torch.no_grad():
         input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)

-
-        # print(text_mask.shape, '\n__\n', tokens, '\n__\n', text_mask.min(), text_mask.max())
-        # text_mask=is binary
-        # tokes = tensor([[ 0, 55, 157, 86, 125, 83, 55, 156, 57, 158, 123, 48, 83, 61,
-        #  157, 102, 61, 16, 138, 64, 16, 53, 156, 138, 54, 62, 131, 85,
-        #  123, 83, 54, 16, 50, 156, 86, 123, 102, 125, 102, 46, 147, 16,
-        #  62, 135, 16, 76, 158, 92, 55, 156, 86, 56, 62, 177, 46, 16,
-        #  50, 157, 43, 102, 58, 85, 55, 156, 51, 158, 46, 51, 158, 83,
-        #  16, 48, 76, 158, 123, 16, 72, 53, 61, 157, 86, 61, 83, 44,
-        #  156, 102, 54, 177, 125, 51, 16, 72, 56, 46, 16, 102, 112, 53,
-        #  54, 156, 63, 158, 147, 83, 56, 16, 4]], device='cuda:0')
-
-        t_en = text_encoder(tokens, input_lengths)
+        hidden_states = text_encoder(tokens, input_lengths)
+
         bert_dur = bert(tokens, attention_mask=None)
         d_en = bert_encoder(bert_dur).transpose(-1, -2)
-
-        # ON compute style we dont know yet the size to interpolate
-        # Perhaps we can interpolate ref_s here as now we know how many bert time-frames the text needs
-        # s = .74 * s # prosody / arousal & fading unvoiced syllabes [x0.7 - x1.2]
-
+        ref = ref_s[:, :128, :]  # [bs, 128, 11]
+        s = ref_s[:, 128:, :]
+        d = predictor.text_encoder(d_en, s, input_lengths)
+        d = d.transpose(1, 2)
+        # -------------------------------- pred_aln_trg = clones bert frames as duration

-        print(f'{d_en.shape=} {s.shape=} {input_lengths.shape=}')
         d = predictor.text_encoder(d_en,
                                    s,
                                    input_lengths)

         x, _ = predictor.lstm(d)
-
+
         duration = predictor.duration_proj(x)

         duration = torch.sigmoid(duration).sum(axis=-1)

@@ -281,24 +216,23 @@
         for i in range(pred_aln_trg.size(0)):
             pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
             c_frame += int(pred_dur[i].data)
-
-        # encode prosody
         en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))
-
-        asr_new = torch.zeros_like(en)
-        asr_new[:, :, 0] = en[:, :, 0]
-        asr_new[:, :, 1:] = en[:, :, 0:-1]
-        en = asr_new
-        print('_________________________________________F0_____________________________')
+
         F0_pred, N_pred = predictor.F0Ntrain(en, s)

-        asr = (
-        asr_new = torch.zeros_like(asr)
-        asr_new[:, :, 0] = asr[:, :, 0]
-        asr_new[:, :, 1:] = asr[:, :, 0:-1]
-        asr = asr_new
-        print('_________________________________________HiFI_____________________________')
+        asr = (hidden_states @ pred_aln_trg.unsqueeze(0).to(device))
+
+        # -- END DURATION
+
+        # [bs, 640, 198]
+
+        # replicated Huberrt frames for duration-of-each-frame to elast [bs, 640, 130] -> [bs, 640, 198]
+
+        # every Hubert frame can be cloned from 1 to ~12 times and appended to the final array
+
+        F0_pred, N_pred = predictor.F0Ntrain(en, s)

         x = decoder(asr=asr,
                     F0_curve=F0_pred,
                     N=N_pred,