Dionyssos committed
Commit dd7320e · 1 Parent(s): bb2cd38

less periodicity

Modules/vits/attentions.py CHANGED
@@ -47,55 +47,7 @@ class Encoder(nn.Module):
         return x


-class Decoder(nn.Module):
-    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
-        super().__init__()
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.proximal_bias = proximal_bias
-        self.proximal_init = proximal_init
-
-        self.drop = nn.Dropout(p_dropout)
-        self.self_attn_layers = nn.ModuleList()
-        self.norm_layers_0 = nn.ModuleList()
-        self.encdec_attn_layers = nn.ModuleList()
-        self.norm_layers_1 = nn.ModuleList()
-        self.ffn_layers = nn.ModuleList()
-        self.norm_layers_2 = nn.ModuleList()
-        for i in range(self.n_layers):
-            self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
-            self.norm_layers_0.append(LayerNorm(hidden_channels))
-            self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
-            self.norm_layers_1.append(LayerNorm(hidden_channels))
-            self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
-            self.norm_layers_2.append(LayerNorm(hidden_channels))
-
-    def forward(self, x, x_mask, h, h_mask):
-        """
-        x: decoder input
-        h: encoder output
-        """
-        self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
-        encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
-        x = x * x_mask
-        for i in range(self.n_layers):
-            y = self.self_attn_layers[i](x, x, self_attn_mask)
-            y = self.drop(y)
-            x = self.norm_layers_0[i](x + y)
-
-            y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
-            y = self.drop(y)
-            x = self.norm_layers_1[i](x + y)
-
-            y = self.ffn_layers[i](x, x_mask)
-            y = self.drop(y)
-            x = self.norm_layers_2[i](x + y)
-        x = x * x_mask
-        return x


 class MultiHeadAttention(nn.Module):
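
The deleted `Decoder` was the only consumer of `commons.subsequent_mask` (removed from commons.py below), consistent with keeping only the non-autoregressive VITS inference path. For reference, a minimal sketch of the causal mask it built, using the implementation shown in the deleted commons.py code:

```python
import torch

def subsequent_mask(length):
    # lower-triangular mask: position i may attend only to positions <= i
    return torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)

print(subsequent_mask(3).shape)  # torch.Size([1, 1, 3, 3])
print(subsequent_mask(3)[0, 0])
# tensor([[1., 0., 0.],
#         [1., 1., 0.],
#         [1., 1., 1.]])
```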
Modules/vits/commons.py CHANGED
@@ -1,10 +1,8 @@
 import math
 import numpy as np
 import torch
-from torch import nn
 from torch.nn import functional as F

-
 def init_weights(m, mean=0.0, std=0.01):
     classname = m.__class__.__name__
     if classname.find("Conv") != -1:
@@ -15,13 +13,8 @@ def get_padding(kernel_size, dilation=1):
     return int((kernel_size*dilation - dilation)/2)


-def convert_pad_shape(pad_shape):
-    l = pad_shape[::-1]
-    pad_shape = [item for sublist in l for item in sublist]
-    return pad_shape
-
-
 def intersperse(lst, item):
+    # needed for inference
     result = [item] * (len(lst) * 2 + 1)
     result[1::2] = lst
     return result
@@ -40,63 +33,6 @@ def rand_gumbel(shape):
     return -torch.log(-torch.log(uniform_samples))


-def rand_gumbel_like(x):
-    g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
-    return g
-
-
-def slice_segments(x, ids_str, segment_size=4):
-    ret = torch.zeros_like(x[:, :, :segment_size])
-    for i in range(x.size(0)):
-        idx_str = ids_str[i]
-        idx_end = idx_str + segment_size
-        ret[i] = x[i, :, idx_str:idx_end]
-    return ret
-
-
-def rand_slice_segments(x, x_lengths=None, segment_size=4):
-    b, d, t = x.size()
-    if x_lengths is None:
-        x_lengths = t
-    ids_str_max = x_lengths - segment_size + 1
-    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
-    ret = slice_segments(x, ids_str, segment_size)
-    return ret, ids_str
-
-
-def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
-    position = torch.arange(length, dtype=torch.float)
-    num_timescales = channels // 2
-    log_timescale_increment = (
-        math.log(float(max_timescale) / float(min_timescale)) /
-        (num_timescales - 1))
-    inv_timescales = min_timescale * torch.exp(
-        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
-    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
-    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
-    signal = F.pad(signal, [0, 0, 0, channels % 2])
-    signal = signal.view(1, channels, length)
-    return signal
-
-
-def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
-    b, channels, length = x.size()
-    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
-    return x + signal.to(dtype=x.dtype, device=x.device)
-
-
-def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
-    b, channels, length = x.size()
-    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
-    return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
-
-
-def subsequent_mask(length):
-    mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
-    return mask
-
-
 @torch.jit.script
 def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
     n_channels_int = n_channels[0]
@@ -113,11 +49,6 @@ def convert_pad_shape(pad_shape):
     return pad_shape


-def shift_1d(x):
-    x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
-    return x
-
-
 def sequence_mask(length, max_length=None):
     if max_length is None:
         max_length = length.max()
@@ -140,22 +71,4 @@ def generate_path(duration, mask):
     path = path.view(b, t_x, t_y)
     path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
     path = path.unsqueeze(1).transpose(2,3) * mask
-    return path
-
-
-def clip_grad_value_(parameters, clip_value, norm_type=2):
-    if isinstance(parameters, torch.Tensor):
-        parameters = [parameters]
-    parameters = list(filter(lambda p: p.grad is not None, parameters))
-    norm_type = float(norm_type)
-    if clip_value is not None:
-        clip_value = float(clip_value)
-
-    total_norm = 0
-    for p in parameters:
-        param_norm = p.grad.data.norm(norm_type)
-        total_norm += param_norm.item() ** norm_type
-        if clip_value is not None:
-            p.grad.data.clamp_(min=-clip_value, max=clip_value)
-    total_norm = total_norm ** (1. / norm_type)
-    return total_norm
+    return path
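
The added `# needed for inference` comment flags `intersperse` as the one text-side helper the inference path still uses; in VITS it typically interleaves a blank token id between phoneme ids before the text encoder. A minimal usage sketch:

```python
# intersperse as kept in commons.py: `item` lands before, between, and after elements
def intersperse(lst, item):
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst
    return result

print(intersperse([5, 7, 9], 0))  # -> [0, 5, 0, 7, 0, 9, 0]
```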
Modules/vits/utils.py CHANGED
@@ -56,25 +56,6 @@ def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path)
                 'learning_rate': learning_rate}, checkpoint_path)


-def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
-    for k, v in scalars.items():
-        writer.add_scalar(k, v, global_step)
-    for k, v in histograms.items():
-        writer.add_histogram(k, v, global_step)
-    for k, v in images.items():
-        writer.add_image(k, v, global_step, dataformats='HWC')
-    for k, v in audios.items():
-        writer.add_audio(k, v, global_step, audio_sampling_rate)
-
-
-def latest_checkpoint_path(dir_path, regex="G_*.pth"):
-    f_list = glob.glob(os.path.join(dir_path, regex))
-    f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
-    x = f_list[-1]
-    print(x)
-    return x
-
-
 def plot_spectrogram_to_numpy(spectrogram):
     global MATPLOTLIB_FLAG
     if not MATPLOTLIB_FLAG:
@@ -190,42 +171,6 @@ def get_hparams_from_file(config_path):
     hparams =HParams(**config)
     return hparams

-
-def check_git_hash(model_dir):
-    source_dir = os.path.dirname(os.path.realpath(__file__))
-    if not os.path.exists(os.path.join(source_dir, ".git")):
-        logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format(
-            source_dir
-        ))
-        return
-
-    cur_hash = subprocess.getoutput("git rev-parse HEAD")
-
-    path = os.path.join(model_dir, "githash")
-    if os.path.exists(path):
-        saved_hash = open(path).read()
-        if saved_hash != cur_hash:
-            logger.warn("git hash values are different. {}(saved) != {}(current)".format(
-                saved_hash[:8], cur_hash[:8]))
-    else:
-        open(path, "w").write(cur_hash)
-
-
-def get_logger(model_dir, filename="train.log"):
-    global logger
-    logger = logging.getLogger(os.path.basename(model_dir))
-    logger.setLevel(logging.DEBUG)
-
-    formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
-    if not os.path.exists(model_dir):
-        os.makedirs(model_dir)
-    h = logging.FileHandler(os.path.join(model_dir, filename))
-    h.setLevel(logging.DEBUG)
-    h.setFormatter(formatter)
-    logger.addHandler(h)
-    return logger
-
-
 class HParams():
     def __init__(self, **kwargs):
         for k, v in kwargs.items():
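
After this cleanup, utils.py keeps only the inference-side pieces: checkpoint saving, spectrogram plotting, `get_hparams_from_file`, and `HParams`. A small usage sketch of `HParams`, assuming the usual VITS implementation in which nested dicts are wrapped recursively and exposed as attributes:

```python
config = {'data': {'sampling_rate': 22050}, 'model': {'hidden_channels': 192}}
hps = HParams(**config)
# nested dicts become nested HParams, so config values read as attributes:
print(hps.data.sampling_rate)     # 22050
print(hps.model.hidden_channels)  # 192
```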
api.py CHANGED
@@ -17,7 +17,7 @@ from flask_cors import CORS
 from moviepy.editor import *
 from audiocraft.builders import AudioGen
 CACHE_DIR = 'flask_cache/'
-NUM_SOUND_GENERATIONS = 1  # batch size to generate same text (same soundscape for long video)
+NUM_SOUND_GENERATIONS = 3  # batch size to generate same text (same soundscape for long video)

 sound_generator = AudioGen(duration=4.74, device='cuda:0').to('cuda:0').eval()

@@ -88,54 +88,51 @@ def overlay(x, soundscape=None):
         background = audresample.resample(
             background,
             original_rate=16000,  # sound_generator.sample_rate,
-            target_rate=24000)[0, :-25000]
-        # TODO discards last samples due to splash sound / polarity change / on long sounds ~ videos / NOT DROP FOR live_demo.py
-
-        # background /= np.abs(background).max() + 1e-7  Apply in sound_generator()
+            target_rate=24000)[0, :-250]  # last samples have splash sounds; discard the tail

         k = background.shape[0]

-        hop = int(.7 * k)  # only overlap 30%
+        hop = int(.99 * k)  # only overlap 1%
         n_repeat = len(x) // hop
         total = np.zeros(hop * (n_repeat + 2))  # add some extra pad space for last frame to fit

         m = np.ones(k)
         overlap = k - hop
         m[hop:] = np.linspace(1, 0, overlap)  # tril mask for avg sound in the interpolated hop
-        # m[:overlap] = np.linspace(0, 1, overlap)

         for j in range(n_repeat):
-            # total[j*k + hop:(j+1)*k + hop] += background
-            # total[j*k + hop:(j+1)*k + hop] = total[j*k + hop:(j+1)*k + hop] + m * background
-            # total[j * (k+hop):(j+1) * k + j*hop] = background
-            total[j*hop:j*hop + k] += m * background  # the total is already smoothly falling due to the previous mask; only the newly added signal needs to rise smoothly
+            total[j*hop:j*hop + k] += m * background  # the total is already smoothly fading due to the previous mask; only the newly added signal needs to rise smoothly
+        print((total < -1).sum(), (total > 1).sum(), 'OUT OF BOUNDS\n\n\n\n')

         # total = total.clip(-1, 1)  # if too many signals were added on top of each other
         # print(total[40000:70000].tolist())
         print(np.logical_and(total > .1, total < .9).sum(), total.shape, 'ev')

-        # less periodic - cloned sounds
+        # less periodic
+
         for _ in range(4):
-            background = _shift(background)
-        # print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
-        #       f'{np.abs(background.max())=}\n{x.shape=}')
-        total /= np.abs(total).max() + 1e-7  # amplify speech to full [-1,1]
-        x = .26 * x + .74 * total[:len(x)]
+            total = _shift(total)
+
+        # amplify sounds to full [-1, 1]
+        total /= np.abs(total).max() + 1e-7
+        x = .4 * x + .6 * total[:len(x)]

     else:
+
         print('sound_background = None')
+
     return x


 def tts_multi_sentence(precomputed_style_vector=None,
                        text=None,
                        voice=None,
@@ -258,6 +255,7 @@ def serve_wav():
     with open(args.text, 'r') as f:
         t = ''.join(f)
     t = re.sub(' +', ' ', t)  # delete spaces
+    # -- sub all punctuation with ' '
     text = split_into_sentences(t)  # split to short sentences (~100 phonemes max for OOM)

     # ====STYLE VECTOR====
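
The new `overlay` logic loops one short AudioGen clip under the whole speech track by overlap-add: each repeat is multiplied by a mask `m` that fades out over the `overlap` tail, and each new repeat starts on the faded tail of the previous one, so the seams stay continuous. A self-contained NumPy sketch of the technique; `speech`, `background`, and the body of `_shift` are placeholders (the repo's `_shift` is not shown in this diff, so the rolled-copy mix below is only a guess at how the "less periodicity" is achieved):

```python
import numpy as np

rng = np.random.default_rng(0)

def _shift(s):
    # ASSUMPTION: mixing the signal with a randomly rolled copy of itself is one
    # plausible way to blur the exact repetition period of a looped clip
    return .5 * (s + np.roll(s, rng.integers(len(s))))

speech = rng.uniform(-.1, .1, 24000 * 10)   # placeholder: 10 s of speech @ 24 kHz
background = rng.uniform(-1, 1, 24000 * 4)  # placeholder: ~4 s AudioGen clip @ 24 kHz

k = background.shape[0]
hop = int(.99 * k)                          # adjacent repeats overlap by 1% of the clip
n_repeat = len(speech) // hop
total = np.zeros(hop * (n_repeat + 2))      # extra pad space so the last repeat fits

m = np.ones(k)
overlap = k - hop
m[hop:] = np.linspace(1, 0, overlap)        # fade-out ramp over the overlapped tail

for j in range(n_repeat):
    # each repeat fades out over its tail; the next repeat lands on that faded
    # tail, so the summed bed stays continuous at the seams
    total[j*hop:j*hop + k] += m * background

for _ in range(4):
    total = _shift(total)                   # de-periodise the looped bed

total /= np.abs(total).max() + 1e-7         # normalise the bed to full [-1, 1]
mix = .4 * speech + .6 * total[:len(speech)]  # duck speech under the soundscape
```

Note the design shift in this commit: with `hop = .99 * k` the repeats barely overlap, so the de-periodisation now has to come from `_shift` applied to the summed `total` rather than from long crossfades of the raw `background`.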
demo.py CHANGED
@@ -4,9 +4,9 @@ import msinference


 def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
-              voice='af_ZA_google-nwu_1919',  # 'serbian', # 'en_US/vctk_low#p276', 'isl', 'abi',
-              speed=1.4,  # only for non-English
-              affect=True  # False = high clarity for the partially sighted
+              voice='af_ZA_google-nwu_1919',  # 'serbian', 'en_US/vctk_low#p276', 'isl',
+              speed=1.4,  # only for MMS TTS
+              affect=True  # False = higher-clarity sound for the partially sighted
               ):
     '''returns 24kHz np.array TTS
@@ -51,8 +51,8 @@ def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are

     else:

-        # don't split foreign sentences: avoids re-load of VITS & random speaker change issue
-        x = msinference.foreign(text=text,
+        # MMS TTS - list of sentences
+        x = msinference.foreign(text=[text],
                                 lang=voice,  # voice = 'romanian', 'serbian', 'hungarian'
                                 speed=speed)  # normalisation externally
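
The MMS path now receives `text=[text]`, i.e. a single-element list of sentences, instead of a raw string. A hedged usage sketch of the updated entry point (writing with `soundfile` is illustrative, not part of the repo):

```python
import numpy as np
import soundfile as sf  # illustrative dependency, used here only to save the result

wav = tts_entry(text='A quick brown fox jumps over the lazy dog.',
                voice='serbian',   # non-English voice string -> MMS TTS branch
                speed=1.4)         # per the comment above, speed applies only to MMS TTS
assert isinstance(wav, np.ndarray)
sf.write('tts_out.wav', wav, 24000)  # docstring: returns 24 kHz np.array
```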
live_demo.py CHANGED
@@ -31,9 +31,8 @@ while True:
     _str = input("\n\n\n\nDescribe Any Sound: \n\n\n\n")
     args.soundscape = _str

-    # extra duration for audiogen to sound cool!!!!
-    if len(_str) < 20:
-        _str += 'Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata few silence for audiogen to impress you.'
+    _str += 'Lorem ipsum dolor sit amet, consetetur elixir sed diam nonumy eirmod tempor invidunt labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Soutet clita kasd gubergren.'
+
     args.text = '_tmp.txt'  # input -> .txt (implementation thought for audiobooks in API)

     with open(args.text, 'w') as f: