Dionyssos committed
Commit 53c0776 · 1 Parent(s): c2687b7

voice sin amplify

Files changed (3):
  1. Modules/hifigan.py +122 -112
  2. msinference.py +1 -1
  3. tts.py +1 -1
Modules/hifigan.py CHANGED
@@ -1,26 +1,29 @@
 import torch
 import torch.nn.functional as F
 import torch.nn as nn
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
-from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+from torch.nn import Conv1d, ConvTranspose1d
+from torch.nn.utils import weight_norm, remove_weight_norm
 import math
 import numpy as np
 
 
 LRELU_SLOPE = 0.1
 
+
 def get_padding(kernel_size, dilation=1):
     return int((kernel_size*dilation - dilation)/2)
 
-def _tile(x,
+
+def _tile(x,
           length=None):
     x = x.repeat(1, 1, int(length / x.shape[2]) + 1)[:, :, :length]
     return x
 
+
 class AdaIN1d(nn.Module):
-
+
     # used by HiFiGan & ProsodyPredictor
-
+
     def __init__(self, style_dim, num_features):
         super().__init__()
         self.norm = nn.InstanceNorm1d(num_features, affine=False)
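
For orientation, `_tile` simply repeats a `[B, C, T]` tensor along the time axis and truncates it to `length`, so a style vector computed once can be broadcast across every frame. A minimal sketch of that behaviour (toy values, not part of the commit):

import torch

x = torch.arange(6.).reshape(1, 2, 3)                      # [B=1, C=2, T=3]
length = 7
tiled = x.repeat(1, 1, int(length / x.shape[2]) + 1)[:, :, :length]
print(tiled.shape)   # torch.Size([1, 2, 7])
print(tiled[0, 0])   # tensor([0., 1., 2., 0., 1., 2., 0.])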
@@ -30,20 +33,15 @@ class AdaIN1d(nn.Module):
 
         # x = torch.Size([1, 512, 248]) same as output
         # s = torch.Size([1, 7, 1, 128])
-
-
+
         s = self.fc(s.transpose(1, 2)).transpose(1, 2)
-
-
-
+
         s = _tile(s, length=x.shape[2])
-
+
         gamma, beta = torch.chunk(s, chunks=2, dim=1)
         return (1+gamma) * self.norm(x) + beta
 
 
-
-
 class AdaINResBlock1(torch.nn.Module):
     def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), style_dim=64):
         super(AdaINResBlock1, self).__init__()
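
The forward pass above is plain AdaIN: the style vector is projected to a per-channel gain and bias, tiled to the input length, and applied to the instance-normalized features. A self-contained sketch with made-up shapes (style_dim=8, num_features=4 are illustrative):

import torch
import torch.nn as nn

norm = nn.InstanceNorm1d(4, affine=False)
fc = nn.Linear(8, 4 * 2)                   # style -> stacked gamma/beta

x = torch.randn(1, 4, 10)                  # [B, C, T]
s = torch.randn(1, 1, 8)                   # a single style frame
h = fc(s).transpose(1, 2)                  # [1, 8, 1]
h = h.repeat(1, 1, x.shape[2])             # tile to T, as _tile does
gamma, beta = torch.chunk(h, chunks=2, dim=1)
y = (1 + gamma) * norm(x) + beta           # [1, 4, 10]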
@@ -66,29 +64,30 @@ class AdaINResBlock1(torch.nn.Module):
                          padding=get_padding(kernel_size, 1)))
         ])
         # self.convs2.apply(init_weights)
-
+
         self.adain1 = nn.ModuleList([
             AdaIN1d(style_dim, channels),
             AdaIN1d(style_dim, channels),
             AdaIN1d(style_dim, channels),
         ])
-
+
         self.adain2 = nn.ModuleList([
             AdaIN1d(style_dim, channels),
             AdaIN1d(style_dim, channels),
             AdaIN1d(style_dim, channels),
         ])
-
-        self.alpha1 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs1))])
-        self.alpha2 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs2))])
 
+        self.alpha1 = nn.ParameterList(
+            [nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs1))])
+        self.alpha2 = nn.ParameterList(
+            [nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs2))])
 
     def forward(self, x, s):
         for c1, c2, n1, n2, a1, a2 in zip(self.convs1, self.convs2, self.adain1, self.adain2, self.alpha1, self.alpha2):
             xt = n1(x, s)  # THIS IS ADAIN - EXPECTS conv1d dims
             xt = xt + (1 / a1) * (torch.sin(a1 * xt) ** 2)  # Snake1D
             xt = c1(xt)
-            xt = n2(xt, s)  # THIS IS ADAIN - EXPECTS conv1d dims
+            xt = n2(xt, s)  # THIS IS ADAIN - EXPECTS conv1d dims
             xt = xt + (1 / a2) * (torch.sin(a2 * xt) ** 2)  # Snake1D
             xt = c2(xt)
             x = xt + x
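
The two `Snake1D` lines implement the snake activation, x + (1/α)·sin²(α·x), with a learnable per-channel α (`alpha1`, `alpha2`) broadcast over batch and time. A standalone check with an illustrative α:

import torch

alpha = 0.5 * torch.ones(1, 3, 1)          # per-channel, like alpha1/alpha2
x = torch.linspace(-2., 2., 5).reshape(1, 1, 5).repeat(1, 3, 1)
y = x + (1 / alpha) * (torch.sin(alpha * x) ** 2)
# identity plus a periodic ripple; alpha sets the ripple frequency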
@@ -99,13 +98,14 @@ class AdaINResBlock1(torch.nn.Module):
         remove_weight_norm(l)
     for l in self.convs2:
         remove_weight_norm(l)
-
+
+
 class SineGen(torch.nn.Module):
 
     def __init__(self,
-                 samp_rate=24000,
-                 upsample_scale=300,
-                 harmonic_num=8,  # HARDCODED due to nn.Linear() of SourceModuleHnNSF
+                 samp_rate=24000,
+                 upsample_scale=300,
+                 harmonic_num=8,  # HARDCODED due to nn.Linear() of SourceModuleHnNSF
                  voiced_threshold=10):
 
         super(SineGen, self).__init__()
@@ -116,60 +116,66 @@ class SineGen(torch.nn.Module):
 
     def _f02sine(self, f0_values):
         # --
-        # 134 HIFI
+        # 134 HIFI
         # torch.Size([1, 145200, 9])
         # torch.Size([1, 145200, 9]) torch.Size([1, 145200, 9]) HIFi
-
-        rad_values = (f0_values / self.sampling_rate) % 1  # -21 % 10 = 9 as -3*10 + 9 = 21 NOTICE THAT LCM IS SIGNED HENCE not POSITIVE integer
-
-
-
-
-
+
+        # modulo of negative f0_values => -21 % 10 = 9 as -3*10 + 9 = 21 NOTICE THAT f0_values IS SIGNED
+        rad_values = (f0_values / self.sampling_rate) % 1
+
         rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
-                                                     scale_factor=1/self.upsample_scale,
-                                                     mode="linear").transpose(1, 2)
-
-        phase = torch.cumsum(rad_values, dim=1) * 1.84 * np.pi  # 1.89 sounds also nice has woofer at punctuation
+                                                     scale_factor=1/self.upsample_scale,
+                                                     mode="linear").transpose(1, 2)
+
+        # 1.89 sounds also nice has woofer at punctuation
+        phase = torch.cumsum(rad_values, dim=1) * 1.84 * np.pi
         phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
                                                 scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
         sines = torch.sin(phase)
         return sines
 
     def forward(self, f0):
-
-        # f0 is already full length - [1, 142600, 1]
-
-        fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))  # [1, 145200, 9]
-
-        sine_waves = self._f02sine(fn) * .01  # .007 # very important effect DEFAULT=0.1 very sensitive to speaker CHECK COnTINUITY FROM SEGMENTS IN AUDIOBOOK
+        # print('____________________________________\nF0 F0\n', f0.abs().mean(), f0.mean(), f0.max(), f0.min())  # male voices sound less muffed via higher scaler in sine_waves
+        # f0 is already full length - [1, 142600, 1]
+
+        amplif = .0104 if f0.abs().mean() < 100 else .009  # vary amplif based on f0.abs().mean() - voice sensitive
+
+        fn = torch.multiply(f0, torch.FloatTensor(
+            [[range(1, self.harmonic_num + 2)]]).to(f0.device))  # [1, 145200, 9]
 
+        # .007 # very important effect DEFAULT=0.1 very sensitive to speaker - heuristically
+        sine_waves = self._f02sine(fn) * amplif  # .009
+
         uv = (f0 > self.voiced_threshold).type(torch.float32)
-
-        return sine_waves * uv  #+ noise
+
+        return sine_waves * uv
+
 
 class SourceModuleHnNSF(torch.nn.Module):
 
-    def __init__(self, harmonic_num=8):
-
+    def __init__(self, harmonic_num=8):
+
         super(SourceModuleHnNSF, self).__init__()
         self.l_sin_gen = SineGen()
-        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)  # harmonic=8 is hard fixed due to this nn.Linear()
+        # harmonic=8 is hard fixed due to this nn.Linear()
+        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()
 
     def forward(self, x):
         # print(' HNnSF', x.shape)  # why this is [1, 300, 1, 535800]
         sine_wavs = self.l_sin_gen(x)
-        sine_merge = self.l_tanh(self.l_linear(sine_wavs))  # This linear sums all 9 harmonics
+        # This linear sums all 9 harmonics
+        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
         return sine_merge
 
+
 class Generator(torch.nn.Module):
     def __init__(self,
                  style_dim,
-                 resblock_kernel_sizes,
-                 upsample_rates,
-                 upsample_initial_channel,
-                 resblock_dilation_sizes,
+                 resblock_kernel_sizes,
+                 upsample_rates,
+                 upsample_initial_channel,
+                 resblock_dilation_sizes,
                  upsample_kernel_sizes):
         super(Generator, self).__init__()
         self.num_kernels = len(resblock_kernel_sizes)
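
This hunk carries the commit's headline change: the fixed sine amplitude `* .01` in `SineGen.forward` becomes f0-dependent, `.0104` when `f0.abs().mean() < 100` (low-pitched, typically male, voices, which otherwise sound muffled) and `.009` otherwise. A standalone sketch of the phase-accumulation synthesis with that heuristic; the steady 120 Hz f0 is a toy input, and the conventional 2π is used where the diff deliberately uses 1.84·π:

import torch

sampling_rate = 24000
f0 = torch.full((1, sampling_rate, 1), 120.)        # 1 s of steady 120 Hz

amplif = .0104 if f0.abs().mean() < 100 else .009   # -> .009 for this f0

rad_values = (f0 / sampling_rate) % 1               # phase increment per sample
phase = torch.cumsum(rad_values, dim=1) * 2 * torch.pi
sine_waves = torch.sin(phase) * amplif              # quiet harmonic source
uv = (f0 > 10).type(torch.float32)                  # voiced/unvoiced mask
out = sine_waves * uv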
@@ -182,48 +188,52 @@ class Generator(torch.nn.Module):
 
         for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
             c_cur = upsample_initial_channel // (2 ** (i + 1))
-
-            self.ups.append(weight_norm(ConvTranspose1d(upsample_initial_channel//(2**i),
-                                                        upsample_initial_channel//(2**(i+1)),
-                                                        k, u, padding=(u//2 + u%2), output_padding=u%2)))
-
-            if i + 1 < len(upsample_rates):  #
+
+            self.ups.append(weight_norm(ConvTranspose1d(upsample_initial_channel//(2**i),
+                                                        upsample_initial_channel//(
+                                                            2**(i+1)),
+                                                        k, u, padding=(u//2 + u % 2), output_padding=u % 2)))
+
+            if i + 1 < len(upsample_rates):
                 stride_f0 = np.prod(upsample_rates[i + 1:])
                 self.noise_convs.append(Conv1d(
                     1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2))
-                self.noise_res.append(AdaINResBlock1(c_cur, 7, [1,3,5], style_dim))
+                self.noise_res.append(AdaINResBlock1(
+                    c_cur, 7, [1, 3, 5], style_dim))
             else:
                 self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
-                self.noise_res.append(AdaINResBlock1(c_cur, 11, [1,3,5], style_dim))
-
+                self.noise_res.append(AdaINResBlock1(
+                    c_cur, 11, [1, 3, 5], style_dim))
+
         self.resblocks = nn.ModuleList()
-
+
         self.alphas = nn.ParameterList()
-        self.alphas.append(nn.Parameter(torch.ones(1, upsample_initial_channel, 1)))
-
+        self.alphas.append(nn.Parameter(
+            torch.ones(1, upsample_initial_channel, 1)))
+
         for i in range(len(self.ups)):
             ch = upsample_initial_channel//(2**(i+1))
             self.alphas.append(nn.Parameter(torch.ones(1, ch, 1)))
-
+
         for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
             self.resblocks.append(AdaINResBlock1(ch, k, d, style_dim))
 
         self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
 
-
     def forward(self, x, s, f0):
-
+
         # x.shape=torch.Size([1, 512, 484]) s.shape=torch.Size([1, 1, 1, 128]) f0.shape=torch.Size([1, 484]) GENERAT 249
         f0 = self.f0_upsamp(f0).transpose(1, 2)
-
+
         # x.shape=torch.Size([1, 512, 484]) s.shape=torch.Size([1, 1, 1, 128]) f0.shape=torch.Size([1, 145200, 1]) GENERAT 253
 
-        har_source = self.m_source(f0)  # [1, 145400, 1] f0 enters already upsampled to full wav 24kHz length
-
+        # [1, 145400, 1] f0 enters already upsampled to full wav 24kHz length
+        har_source = self.m_source(f0)
+
         har_source = har_source.transpose(1, 2)
 
         for i in range(self.num_upsamples):
-
+
             x = x + (1 / self.alphas[i]) * (torch.sin(self.alphas[i] * x) ** 2)
             x_source = self.noise_convs[i](har_source)
             x_source = self.noise_res[i](x_source, s)
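
The `stride_f0` arithmetic above keeps the full-rate harmonic source aligned with each upsampling stage: at stage i the source is downsampled by the product of the remaining upsample rates, and the final stage (the else branch) is already at full rate, so it uses a 1×1 conv. A quick check with the Decoder defaults from further down:

import numpy as np

upsample_rates = [10, 5, 3, 2]          # Decoder defaults
for i in range(len(upsample_rates)):
    if i + 1 < len(upsample_rates):
        stride_f0 = int(np.prod(upsample_rates[i + 1:]))
        print(i, stride_f0)             # 0 -> 30, 1 -> 6, 2 -> 2
    else:
        print(i, 'kernel_size=1 conv')  # last stage, no further downsampling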
@@ -234,7 +244,7 @@
 
             xs = None
             for j in range(self.num_kernels):
-
+
                 if xs is None:
                     xs = self.resblocks[i*self.num_kernels+j](x, s)
                 else:
@@ -255,11 +265,11 @@
         remove_weight_norm(self.conv_pre)
         remove_weight_norm(self.conv_post)
 
-
+
 class AdainResBlk1d(nn.Module):
-
+
     # also used in ProsodyPredictor()
-
+
     def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
                  upsample='none', dropout_p=0.0):
         super().__init__()
@@ -267,20 +277,21 @@ class AdainResBlk1d(nn.Module):
         self.upsample_type = upsample
         self.upsample = UpSample1d(upsample)
         self.learned_sc = dim_in != dim_out
-        self._build_weights(dim_in, dim_out, style_dim)
+        self._build_weights(dim_in, dim_out, style_dim)
         if upsample == 'none':
             self.pool = nn.Identity()
         else:
-            self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
-
-
+            self.pool = weight_norm(nn.ConvTranspose1d(
+                dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
+
     def _build_weights(self, dim_in, dim_out, style_dim):
         self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
         self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
         self.norm1 = AdaIN1d(style_dim, dim_in)
         self.norm2 = AdaIN1d(style_dim, dim_out)
         if self.learned_sc:
-            self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
+            self.conv1x1 = weight_norm(
+                nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
 
     def _shortcut(self, x):
         x = self.upsample(x)
@@ -302,7 +313,8 @@
         out = self._residual(x, s)
         out = (out + self._shortcut(x)) / math.sqrt(2)
         return out
-
+
+
 class UpSample1d(nn.Module):
     def __init__(self, layer_type):
         super().__init__()
@@ -314,64 +326,62 @@ class UpSample1d(nn.Module):
         else:
             return F.interpolate(x, scale_factor=2, mode='nearest')
 
+
 class Decoder(nn.Module):
-    def __init__(self, dim_in=512, F0_channel=512, style_dim=64, dim_out=80,
-                 resblock_kernel_sizes = [3,7,11],
-                 upsample_rates = [10,5,3,2],
-                 upsample_initial_channel=512,
-                 resblock_dilation_sizes=[[1,3,5], [1,3,5], [1,3,5]],
-                 upsample_kernel_sizes=[20,10,6,4]):
+    def __init__(self, dim_in=512, F0_channel=512, style_dim=64, dim_out=80,
+                 resblock_kernel_sizes=[3, 7, 11],
+                 upsample_rates=[10, 5, 3, 2],
+                 upsample_initial_channel=512,
+                 resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+                 upsample_kernel_sizes=[20, 10, 6, 4]):
         super().__init__()
-
+
         self.decode = nn.ModuleList()
-
+
         self.encode = AdainResBlk1d(dim_in + 2, 1024, style_dim)
-
+
         self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
         self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
         self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
-        self.decode.append(AdainResBlk1d(1024 + 2 + 64, 512, style_dim, upsample=True))
+        self.decode.append(AdainResBlk1d(
+            1024 + 2 + 64, 512, style_dim, upsample=True))
+
+        self.F0_conv = weight_norm(
+            nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))  # smooth
+
+        self.N_conv = weight_norm(
+            nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
 
-        self.F0_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))  # smooth
-
-        self.N_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
-
         self.asr_res = nn.Sequential(
             weight_norm(nn.Conv1d(512, 64, kernel_size=1)),
         )
-
-
-        self.generator = Generator(style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes)
 
-
+        self.generator = Generator(style_dim, resblock_kernel_sizes, upsample_rates,
+                                   upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes)
+
     def forward(self, asr=None, F0_curve=None, N=None, s=None):
-
-        print('p', asr.shape, F0_curve.shape, N.shape)
+
+        # print('p', asr.shape, F0_curve.shape, N.shape)
         F0 = self.F0_conv(F0_curve)
         N = self.N_conv(N)
-
-
+
         # print(asr.shape, F0.shape, N.shape, 'TF')
-
-
+
         x = torch.cat([asr, F0, N], axis=1)
-
+
         x = self.encode(x, s)
-
+
         asr_res = self.asr_res(asr)
-
+
         res = True
         for block in self.decode:
             if res:
-
-
+
                 x = torch.cat([x, asr_res, F0, N], axis=1)
-
+
             x = block(x, s)
             if block.upsample_type != "none":
                 res = False
-
+
         x = self.generator(x, s, F0_curve)
         return x
-
-
1
  import torch
2
  import torch.nn.functional as F
3
  import torch.nn as nn
4
+ from torch.nn import Conv1d, ConvTranspose1d
5
+ from torch.nn.utils import weight_norm, remove_weight_norm
6
  import math
7
  import numpy as np
8
 
9
 
10
  LRELU_SLOPE = 0.1
11
 
12
+
13
  def get_padding(kernel_size, dilation=1):
14
  return int((kernel_size*dilation - dilation)/2)
15
 
16
+
17
+ def _tile(x,
18
  length=None):
19
  x = x.repeat(1, 1, int(length / x.shape[2]) + 1)[:, :, :length]
20
  return x
21
 
22
+
23
  class AdaIN1d(nn.Module):
24
+
25
  # used by HiFiGan & ProsodyPredictor
26
+
27
  def __init__(self, style_dim, num_features):
28
  super().__init__()
29
  self.norm = nn.InstanceNorm1d(num_features, affine=False)
 
33
 
34
  # x = torch.Size([1, 512, 248]) same as output
35
  # s = torch.Size([1, 7, 1, 128])
36
+
 
37
  s = self.fc(s.transpose(1, 2)).transpose(1, 2)
38
+
 
 
39
  s = _tile(s, length=x.shape[2])
40
+
41
  gamma, beta = torch.chunk(s, chunks=2, dim=1)
42
  return (1+gamma) * self.norm(x) + beta
43
 
44
 
 
 
45
  class AdaINResBlock1(torch.nn.Module):
46
  def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), style_dim=64):
47
  super(AdaINResBlock1, self).__init__()
 
64
  padding=get_padding(kernel_size, 1)))
65
  ])
66
  # self.convs2.apply(init_weights)
67
+
68
  self.adain1 = nn.ModuleList([
69
  AdaIN1d(style_dim, channels),
70
  AdaIN1d(style_dim, channels),
71
  AdaIN1d(style_dim, channels),
72
  ])
73
+
74
  self.adain2 = nn.ModuleList([
75
  AdaIN1d(style_dim, channels),
76
  AdaIN1d(style_dim, channels),
77
  AdaIN1d(style_dim, channels),
78
  ])
 
 
 
79
 
80
+ self.alpha1 = nn.ParameterList(
81
+ [nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs1))])
82
+ self.alpha2 = nn.ParameterList(
83
+ [nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs2))])
84
 
85
  def forward(self, x, s):
86
  for c1, c2, n1, n2, a1, a2 in zip(self.convs1, self.convs2, self.adain1, self.adain2, self.alpha1, self.alpha2):
87
  xt = n1(x, s) # THIS IS ADAIN - EXPECTS conv1d dims
88
  xt = xt + (1 / a1) * (torch.sin(a1 * xt) ** 2) # Snake1D
89
  xt = c1(xt)
90
+ xt = n2(xt, s) # THIS IS ADAIN - EXPECTS conv1d dims
91
  xt = xt + (1 / a2) * (torch.sin(a2 * xt) ** 2) # Snake1D
92
  xt = c2(xt)
93
  x = xt + x
 
98
  remove_weight_norm(l)
99
  for l in self.convs2:
100
  remove_weight_norm(l)
101
+
102
+
103
  class SineGen(torch.nn.Module):
104
 
105
  def __init__(self,
106
+ samp_rate=24000,
107
+ upsample_scale=300,
108
+ harmonic_num=8, # HARDCODED due to nn.Linear() of SourceModuleHnNSF
109
  voiced_threshold=10):
110
 
111
  super(SineGen, self).__init__()
 
116
 
117
  def _f02sine(self, f0_values):
118
  # --
119
+ # 134 HIFI
120
  # torch.Size([1, 145200, 9])
121
  # torch.Size([1, 145200, 9]) torch.Size([1, 145200, 9]) HIFi
122
+
123
+ # modulo of negative f0_values => -21 % 10 = 9 as -3*10 + 9 = 21 NOTICE THAT f0_values IS SIGNED
124
+ rad_values = (f0_values / self.sampling_rate) % 1
125
+
 
 
 
126
  rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
127
+ scale_factor=1/self.upsample_scale,
128
+ mode="linear").transpose(1, 2)
129
+
130
+ # 1.89 sounds also nice has woofer at punctuation
131
+ phase = torch.cumsum(rad_values, dim=1) * 1.84 * np.pi
132
  phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
133
  scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
134
  sines = torch.sin(phase)
135
  return sines
136
 
137
  def forward(self, f0):
138
+ # print('____________________________________\nF0 F0\n', f0.abs().mean(), f0.mean(), f0.max(), f0.min()) # male voices sound less muffed via higher scaler in sine_waves
139
+ # f0 is already full length - [1, 142600, 1]
140
+
141
+ amplif = .0104 if f0.abs().mean() < 100 else .009 # vary amplif based on f0.abs().mean() - voice sensitive
142
+
143
+ fn = torch.multiply(f0, torch.FloatTensor(
144
+ [[range(1, self.harmonic_num + 2)]]).to(f0.device)) # [1, 145200, 9]
145
 
146
+ # .007 # very important effect DEFAULT=0.1 very sensitive to speaker - heuristically
147
+ sine_waves = self._f02sine(fn) * amplif # .009
 
 
 
148
 
149
  uv = (f0 > self.voiced_threshold).type(torch.float32)
150
+
151
+ return sine_waves * uv
152
+
153
 
154
  class SourceModuleHnNSF(torch.nn.Module):
155
 
156
+ def __init__(self, harmonic_num=8):
157
+
158
  super(SourceModuleHnNSF, self).__init__()
159
  self.l_sin_gen = SineGen()
160
+ # harmonic=8 is hard fixed due to this nn.Linear()
161
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
162
  self.l_tanh = torch.nn.Tanh()
163
 
164
  def forward(self, x):
165
  # print(' HNnSF', x.shape) # why this is [1, 300, 1, 535800]
166
  sine_wavs = self.l_sin_gen(x)
167
+ # This linear sums all 9 harmonics
168
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
169
  return sine_merge
170
 
171
+
172
  class Generator(torch.nn.Module):
173
  def __init__(self,
174
  style_dim,
175
+ resblock_kernel_sizes,
176
+ upsample_rates,
177
+ upsample_initial_channel,
178
+ resblock_dilation_sizes,
179
  upsample_kernel_sizes):
180
  super(Generator, self).__init__()
181
  self.num_kernels = len(resblock_kernel_sizes)
 
188
 
189
  for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
190
  c_cur = upsample_initial_channel // (2 ** (i + 1))
191
+
192
+ self.ups.append(weight_norm(ConvTranspose1d(upsample_initial_channel//(2**i),
193
+ upsample_initial_channel//(
194
+ 2**(i+1)),
195
+ k, u, padding=(u//2 + u % 2), output_padding=u % 2)))
196
+
197
+ if i + 1 < len(upsample_rates):
198
  stride_f0 = np.prod(upsample_rates[i + 1:])
199
  self.noise_convs.append(Conv1d(
200
  1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2))
201
+ self.noise_res.append(AdaINResBlock1(
202
+ c_cur, 7, [1, 3, 5], style_dim))
203
  else:
204
  self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
205
+ self.noise_res.append(AdaINResBlock1(
206
+ c_cur, 11, [1, 3, 5], style_dim))
207
+
208
  self.resblocks = nn.ModuleList()
209
+
210
  self.alphas = nn.ParameterList()
211
+ self.alphas.append(nn.Parameter(
212
+ torch.ones(1, upsample_initial_channel, 1)))
213
+
214
  for i in range(len(self.ups)):
215
  ch = upsample_initial_channel//(2**(i+1))
216
  self.alphas.append(nn.Parameter(torch.ones(1, ch, 1)))
217
+
218
  for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
219
  self.resblocks.append(AdaINResBlock1(ch, k, d, style_dim))
220
 
221
  self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
222
 
 
223
  def forward(self, x, s, f0):
224
+
225
  # x.shape=torch.Size([1, 512, 484]) s.shape=torch.Size([1, 1, 1, 128]) f0.shape=torch.Size([1, 484]) GENERAT 249
226
  f0 = self.f0_upsamp(f0).transpose(1, 2)
227
+
228
  # x.shape=torch.Size([1, 512, 484]) s.shape=torch.Size([1, 1, 1, 128]) f0.shape=torch.Size([1, 145200, 1]) GENERAT 253
229
 
230
+ # [1, 145400, 1] f0 enters already upsampled to full wav 24kHz length
231
+ har_source = self.m_source(f0)
232
+
233
  har_source = har_source.transpose(1, 2)
234
 
235
  for i in range(self.num_upsamples):
236
+
237
  x = x + (1 / self.alphas[i]) * (torch.sin(self.alphas[i] * x) ** 2)
238
  x_source = self.noise_convs[i](har_source)
239
  x_source = self.noise_res[i](x_source, s)
 
244
 
245
  xs = None
246
  for j in range(self.num_kernels):
247
+
248
  if xs is None:
249
  xs = self.resblocks[i*self.num_kernels+j](x, s)
250
  else:
 
265
  remove_weight_norm(self.conv_pre)
266
  remove_weight_norm(self.conv_post)
267
 
268
+
269
  class AdainResBlk1d(nn.Module):
270
+
271
  # also used in ProsodyPredictor()
272
+
273
  def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
274
  upsample='none', dropout_p=0.0):
275
  super().__init__()
 
277
  self.upsample_type = upsample
278
  self.upsample = UpSample1d(upsample)
279
  self.learned_sc = dim_in != dim_out
280
+ self._build_weights(dim_in, dim_out, style_dim)
281
  if upsample == 'none':
282
  self.pool = nn.Identity()
283
  else:
284
+ self.pool = weight_norm(nn.ConvTranspose1d(
285
+ dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
286
+
287
  def _build_weights(self, dim_in, dim_out, style_dim):
288
  self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
289
  self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
290
  self.norm1 = AdaIN1d(style_dim, dim_in)
291
  self.norm2 = AdaIN1d(style_dim, dim_out)
292
  if self.learned_sc:
293
+ self.conv1x1 = weight_norm(
294
+ nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
295
 
296
  def _shortcut(self, x):
297
  x = self.upsample(x)
 
313
  out = self._residual(x, s)
314
  out = (out + self._shortcut(x)) / math.sqrt(2)
315
  return out
316
+
317
+
318
  class UpSample1d(nn.Module):
319
  def __init__(self, layer_type):
320
  super().__init__()
 
326
  else:
327
  return F.interpolate(x, scale_factor=2, mode='nearest')
328
 
329
+
330
  class Decoder(nn.Module):
331
+ def __init__(self, dim_in=512, F0_channel=512, style_dim=64, dim_out=80,
332
+ resblock_kernel_sizes=[3, 7, 11],
333
+ upsample_rates=[10, 5, 3, 2],
334
+ upsample_initial_channel=512,
335
+ resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
336
+ upsample_kernel_sizes=[20, 10, 6, 4]):
337
  super().__init__()
338
+
339
  self.decode = nn.ModuleList()
340
+
341
  self.encode = AdainResBlk1d(dim_in + 2, 1024, style_dim)
342
+
343
  self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
344
  self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
345
  self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
346
+ self.decode.append(AdainResBlk1d(
347
+ 1024 + 2 + 64, 512, style_dim, upsample=True))
348
+
349
+ self.F0_conv = weight_norm(
350
+ nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1)) # smooth
351
+
352
+ self.N_conv = weight_norm(
353
+ nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
354
 
 
 
 
 
355
  self.asr_res = nn.Sequential(
356
  weight_norm(nn.Conv1d(512, 64, kernel_size=1)),
357
  )
 
 
 
358
 
359
+ self.generator = Generator(style_dim, resblock_kernel_sizes, upsample_rates,
360
+ upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes)
361
+
362
  def forward(self, asr=None, F0_curve=None, N=None, s=None):
363
+
364
+ # print('p', asr.shape, F0_curve.shape, N.shape)
365
  F0 = self.F0_conv(F0_curve)
366
  N = self.N_conv(N)
367
+
 
368
  # print(asr.shape, F0.shape, N.shape, 'TF')
369
+
 
370
  x = torch.cat([asr, F0, N], axis=1)
371
+
372
  x = self.encode(x, s)
373
+
374
  asr_res = self.asr_res(asr)
375
+
376
  res = True
377
  for block in self.decode:
378
  if res:
379
+
 
380
  x = torch.cat([x, asr_res, F0, N], axis=1)
381
+
382
  x = block(x, s)
383
  if block.upsample_type != "none":
384
  res = False
385
+
386
  x = self.generator(x, s, F0_curve)
387
  return x
 
 
msinference.py CHANGED
@@ -409,7 +409,7 @@ def foreign(text=None, # split sentences here so we can prepend a txt for germ
         text = [sub_sent+' ' for sub_sent in textwrap.wrap(text, 200, break_long_words=0)]  # prepend txt snippet
         # assert that it chooses unique voice
     else:
-        text = [sub_sent+' ' for sub_sent in textwrap.wrap(text, 140, break_long_words=0)]  # allow longer non split text
+        text = [sub_sent+' ' for sub_sent in textwrap.wrap(text, 640, break_long_words=0)]  # allow longer non split text
         # for non deu MMS TTS lang.
 
     for _t in text:
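
For scale: `textwrap.wrap` with `break_long_words=0` only splits at whitespace, so raising the width from 140 to 640 lets most non-German sentences reach the synthesizer as a single chunk instead of several (toy input below):

import textwrap

text = 'word ' * 60                                        # ~300 characters
print(len(textwrap.wrap(text, 140, break_long_words=0)))   # 3 chunks
print(len(textwrap.wrap(text, 640, break_long_words=0)))   # 1 chunk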
tts.py CHANGED
@@ -85,7 +85,7 @@ def command_line_args():
         '--speed',
         help='speec of TTS (only used in Non English voices).',
         type=str,
-        default=1.24,
+        default=1.44,
     )
     return parser
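
Worth noting when overriding this flag: `--speed` is declared `type=str`, so the 1.44 default stays a float, but any value passed on the command line arrives as a string and must be cast by the consumer. A minimal standalone reproduction mirroring the declaration above (hypothetical parser, not the repo's):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--speed', type=str, default=1.44)

print(type(parser.parse_args([]).speed))                   # <class 'float'>
print(type(parser.parse_args(['--speed', '1.1']).speed))   # <class 'str'>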