Dionyssos committed
Commit dd7320e · 1 Parent(s): bb2cd38

less periodicity

Modules/vits/attentions.py CHANGED
@@ -47,55 +47,7 @@ class Encoder(nn.Module):
         return x


-class Decoder(nn.Module):
-    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
-        super().__init__()
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.proximal_bias = proximal_bias
-        self.proximal_init = proximal_init
-
-        self.drop = nn.Dropout(p_dropout)
-        self.self_attn_layers = nn.ModuleList()
-        self.norm_layers_0 = nn.ModuleList()
-        self.encdec_attn_layers = nn.ModuleList()
-        self.norm_layers_1 = nn.ModuleList()
-        self.ffn_layers = nn.ModuleList()
-        self.norm_layers_2 = nn.ModuleList()
-        for i in range(self.n_layers):
-            self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
-            self.norm_layers_0.append(LayerNorm(hidden_channels))
-            self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
-            self.norm_layers_1.append(LayerNorm(hidden_channels))
-            self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
-            self.norm_layers_2.append(LayerNorm(hidden_channels))
-
-    def forward(self, x, x_mask, h, h_mask):
-        """
-        x: decoder input
-        h: encoder output
-        """
-        self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
-        encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
-        x = x * x_mask
-        for i in range(self.n_layers):
-            y = self.self_attn_layers[i](x, x, self_attn_mask)
-            y = self.drop(y)
-            x = self.norm_layers_0[i](x + y)
-
-            y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
-            y = self.drop(y)
-            x = self.norm_layers_1[i](x + y)
-
-            y = self.ffn_layers[i](x, x_mask)
-            y = self.drop(y)
-            x = self.norm_layers_2[i](x + y)
-        x = x * x_mask
-        return x


 class MultiHeadAttention(nn.Module):
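
The deleted `Decoder` was the only consumer of `commons.subsequent_mask` (removed from commons.py below), consistent with keeping only the non-autoregressive VITS inference path. For reference, a minimal sketch of the causal mask it built, using the implementation shown in the deleted commons.py code:

```python
import torch

def subsequent_mask(length):
    # lower-triangular mask: position i may attend only to positions <= i
    return torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)

print(subsequent_mask(3).shape)  # torch.Size([1, 1, 3, 3])
print(subsequent_mask(3)[0, 0])
# tensor([[1., 0., 0.],
#         [1., 1., 0.],
#         [1., 1., 1.]])
```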
Modules/vits/commons.py CHANGED
@@ -1,10 +1,8 @@
 import math
 import numpy as np
 import torch
-from torch import nn
 from torch.nn import functional as F

-
 def init_weights(m, mean=0.0, std=0.01):
     classname = m.__class__.__name__
     if classname.find("Conv") != -1:
@@ -15,13 +13,8 @@ def get_padding(kernel_size, dilation=1):
     return int((kernel_size*dilation - dilation)/2)


-def convert_pad_shape(pad_shape):
-    l = pad_shape[::-1]
-    pad_shape = [item for sublist in l for item in sublist]
-    return pad_shape
-
-
 def intersperse(lst, item):
+    # needed for inference
     result = [item] * (len(lst) * 2 + 1)
     result[1::2] = lst
     return result
@@ -40,63 +33,6 @@ def rand_gumbel(shape):
     return -torch.log(-torch.log(uniform_samples))


-def rand_gumbel_like(x):
-    g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
-    return g
-
-
-def slice_segments(x, ids_str, segment_size=4):
-    ret = torch.zeros_like(x[:, :, :segment_size])
-    for i in range(x.size(0)):
-        idx_str = ids_str[i]
-        idx_end = idx_str + segment_size
-        ret[i] = x[i, :, idx_str:idx_end]
-    return ret
-
-
-def rand_slice_segments(x, x_lengths=None, segment_size=4):
-    b, d, t = x.size()
-    if x_lengths is None:
-        x_lengths = t
-    ids_str_max = x_lengths - segment_size + 1
-    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
-    ret = slice_segments(x, ids_str, segment_size)
-    return ret, ids_str
-
-
-def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
-    position = torch.arange(length, dtype=torch.float)
-    num_timescales = channels // 2
-    log_timescale_increment = (
-        math.log(float(max_timescale) / float(min_timescale)) /
-        (num_timescales - 1))
-    inv_timescales = min_timescale * torch.exp(
-        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
-    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
-    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
-    signal = F.pad(signal, [0, 0, 0, channels % 2])
-    signal = signal.view(1, channels, length)
-    return signal
-
-
-def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
-    b, channels, length = x.size()
-    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
-    return x + signal.to(dtype=x.dtype, device=x.device)
-
-
-def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
-    b, channels, length = x.size()
-    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
-    return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
-
-
-def subsequent_mask(length):
-    mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
-    return mask
-
-
 @torch.jit.script
 def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
     n_channels_int = n_channels[0]
@@ -113,11 +49,6 @@ def convert_pad_shape(pad_shape):
     return pad_shape


-def shift_1d(x):
-    x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
-    return x
-
-
 def sequence_mask(length, max_length=None):
     if max_length is None:
         max_length = length.max()
@@ -140,22 +71,4 @@ def generate_path(duration, mask):
     path = path.view(b, t_x, t_y)
     path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
     path = path.unsqueeze(1).transpose(2,3) * mask
-    return path
-
-
-def clip_grad_value_(parameters, clip_value, norm_type=2):
-    if isinstance(parameters, torch.Tensor):
-        parameters = [parameters]
-    parameters = list(filter(lambda p: p.grad is not None, parameters))
-    norm_type = float(norm_type)
-    if clip_value is not None:
-        clip_value = float(clip_value)
-
-    total_norm = 0
-    for p in parameters:
-        param_norm = p.grad.data.norm(norm_type)
-        total_norm += param_norm.item() ** norm_type
-        if clip_value is not None:
-            p.grad.data.clamp_(min=-clip_value, max=clip_value)
-    total_norm = total_norm ** (1. / norm_type)
-    return total_norm
+    return path
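
The added `# needed for inference` comment flags `intersperse` as the one text-side helper the inference path still uses; in VITS it typically interleaves a blank token id between phoneme ids before the text encoder. A minimal usage sketch:

```python
# intersperse as kept in commons.py: `item` lands before, between, and after elements
def intersperse(lst, item):
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst
    return result

print(intersperse([5, 7, 9], 0))  # -> [0, 5, 0, 7, 0, 9, 0]
```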
Modules/vits/utils.py CHANGED
@@ -56,25 +56,6 @@ def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path)
                 'learning_rate': learning_rate}, checkpoint_path)


-def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
-    for k, v in scalars.items():
-        writer.add_scalar(k, v, global_step)
-    for k, v in histograms.items():
-        writer.add_histogram(k, v, global_step)
-    for k, v in images.items():
-        writer.add_image(k, v, global_step, dataformats='HWC')
-    for k, v in audios.items():
-        writer.add_audio(k, v, global_step, audio_sampling_rate)
-
-
-def latest_checkpoint_path(dir_path, regex="G_*.pth"):
-    f_list = glob.glob(os.path.join(dir_path, regex))
-    f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
-    x = f_list[-1]
-    print(x)
-    return x
-
-
 def plot_spectrogram_to_numpy(spectrogram):
     global MATPLOTLIB_FLAG
     if not MATPLOTLIB_FLAG:
@@ -190,42 +171,6 @@ def get_hparams_from_file(config_path):
     hparams =HParams(**config)
     return hparams

-
-def check_git_hash(model_dir):
-    source_dir = os.path.dirname(os.path.realpath(__file__))
-    if not os.path.exists(os.path.join(source_dir, ".git")):
-        logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format(
-            source_dir
-        ))
-        return
-
-    cur_hash = subprocess.getoutput("git rev-parse HEAD")
-
-    path = os.path.join(model_dir, "githash")
-    if os.path.exists(path):
-        saved_hash = open(path).read()
-        if saved_hash != cur_hash:
-            logger.warn("git hash values are different. {}(saved) != {}(current)".format(
-                saved_hash[:8], cur_hash[:8]))
-    else:
-        open(path, "w").write(cur_hash)
-
-
-def get_logger(model_dir, filename="train.log"):
-    global logger
-    logger = logging.getLogger(os.path.basename(model_dir))
-    logger.setLevel(logging.DEBUG)
-
-    formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
-    if not os.path.exists(model_dir):
-        os.makedirs(model_dir)
-    h = logging.FileHandler(os.path.join(model_dir, filename))
-    h.setLevel(logging.DEBUG)
-    h.setFormatter(formatter)
-    logger.addHandler(h)
-    return logger
-
-
 class HParams():
     def __init__(self, **kwargs):
         for k, v in kwargs.items():
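
After this cleanup, utils.py keeps only the inference-side pieces: checkpoint saving, spectrogram plotting, `get_hparams_from_file`, and `HParams`. A small usage sketch of `HParams`, assuming the usual VITS implementation in which nested dicts are wrapped recursively and exposed as attributes:

```python
config = {'data': {'sampling_rate': 22050}, 'model': {'hidden_channels': 192}}
hps = HParams(**config)
# nested dicts become nested HParams, so config values read as attributes:
print(hps.data.sampling_rate)     # 22050
print(hps.model.hidden_channels)  # 192
```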
api.py CHANGED
@@ -17,7 +17,7 @@ from flask_cors import CORS
 from moviepy.editor import *
 from audiocraft.builders import AudioGen
 CACHE_DIR = 'flask_cache/'
-NUM_SOUND_GENERATIONS = 1  # batch size to generate same text (same soundscape for long video)
+NUM_SOUND_GENERATIONS = 3  # batch size to generate same text (same soundscape for long video)

 sound_generator = AudioGen(duration=4.74, device='cuda:0').to('cuda:0').eval()

@@ -88,54 +88,51 @@ def overlay(x, soundscape=None):
         background = audresample.resample(
             background,
             original_rate=16000,  # sound_generator.sample_rate,
-            target_rate=24000)[0, :-25000]
-        # TODO discards last samples due to splash sound / polarity change / on long sounds ~ videos / NOT DROP FOR live_demo.py
-
-        # background /= np.abs(background).max() + 1e-7  Apply in sound_generator()
+            target_rate=24000)[0, :-250]  # last samples have splash sounds; discard the tail

         k = background.shape[0]

-        hop = int(.7 * k)  # only overlap 30%
+        hop = int(.99 * k)  # only overlap 1%
         n_repeat = len(x) // hop
         total = np.zeros(hop * (n_repeat + 2))  # add some extra pad space for last frame to fit

         m = np.ones(k)
         overlap = k - hop
         m[hop:] = np.linspace(1, 0, overlap)  # tril mask for avg sound in the interpolated hop
-        # m[:overlap] = np.linspace(0, 1, overlap)

         for j in range(n_repeat):
-            # total[j*k + hop:(j+1)*k + hop] += background
-            # total[j*k + hop:(j+1)*k + hop] = total[j*k + hop:(j+1)*k + hop] + m * background
-            # total[j * (k+hop):(j+1) * k + j*hop] = background
-            total[j*hop:j*hop + k] += m * background  # the total is already smoothly falling due to the previous mask; only the newly added signal needs to rise smoothly
+            total[j*hop:j*hop + k] += m * background  # the total is already smoothly fading due to the previous mask; only the newly added signal needs to rise smoothly
+        print((total < -1).sum(), (total > 1).sum(), 'OUT OF BOUNDS\n\n\n\n')

         # total = total.clip(-1, 1)  # if too many signals were added on top of each other
         # print(total[40000:70000].tolist())
         print(np.logical_and(total > .1, total < .9).sum(), total.shape, 'ev')

-        # less periodic - cloned sounds
+        # less periodic
+
         for _ in range(4):
-            background = _shift(background)
-        # print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
-        #       f'{np.abs(background.max())=}\n{x.shape=}')
-        total /= np.abs(total).max() + 1e-7  # amplify speech to full [-1,1]
-        x = .26 * x + .74 * total[:len(x)]
+            total = _shift(total)
+
+        # amplify sounds to full [-1, 1]
+        total /= np.abs(total).max() + 1e-7
+        x = .4 * x + .6 * total[:len(x)]

     else:
+
         print('sound_background = None')
+
     return x


 def tts_multi_sentence(precomputed_style_vector=None,
                        text=None,
                        voice=None,
@@ -258,6 +255,7 @@ def serve_wav():
     with open(args.text, 'r') as f:
         t = ''.join(f)
     t = re.sub(' +', ' ', t)  # delete spaces
+    # -- sub all punctuation with ' '
     text = split_into_sentences(t)  # split to short sentences (~100 phonemes max for OOM)

     # ====STYLE VECTOR====
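
The new `overlay` logic loops one short AudioGen clip under the whole speech track by overlap-add: each repeat is multiplied by a mask `m` that fades out over the `overlap` tail, and each new repeat starts on the faded tail of the previous one, so the seams stay continuous. A self-contained NumPy sketch of the technique; `speech`, `background`, and the body of `_shift` are placeholders (the repo's `_shift` is not shown in this diff, so the rolled-copy mix below is only a guess at how the "less periodicity" is achieved):

```python
import numpy as np

rng = np.random.default_rng(0)

def _shift(s):
    # ASSUMPTION: mixing the signal with a randomly rolled copy of itself is one
    # plausible way to blur the exact repetition period of a looped clip
    return .5 * (s + np.roll(s, rng.integers(len(s))))

speech = rng.uniform(-.1, .1, 24000 * 10)   # placeholder: 10 s of speech @ 24 kHz
background = rng.uniform(-1, 1, 24000 * 4)  # placeholder: ~4 s AudioGen clip @ 24 kHz

k = background.shape[0]
hop = int(.99 * k)                          # adjacent repeats overlap by 1% of the clip
n_repeat = len(speech) // hop
total = np.zeros(hop * (n_repeat + 2))      # extra pad space so the last repeat fits

m = np.ones(k)
overlap = k - hop
m[hop:] = np.linspace(1, 0, overlap)        # fade-out ramp over the overlapped tail

for j in range(n_repeat):
    # each repeat fades out over its tail; the next repeat lands on that faded
    # tail, so the summed bed stays continuous at the seams
    total[j*hop:j*hop + k] += m * background

for _ in range(4):
    total = _shift(total)                   # de-periodise the looped bed

total /= np.abs(total).max() + 1e-7         # normalise the bed to full [-1, 1]
mix = .4 * speech + .6 * total[:len(speech)]  # duck speech under the soundscape
```

Note the design shift in this commit: with `hop = .99 * k` the repeats barely overlap, so the de-periodisation now has to come from `_shift` applied to the summed `total` rather than from long crossfades of the raw `background`.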
demo.py CHANGED
@@ -4,9 +4,9 @@ import msinference


 def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
-              voice='af_ZA_google-nwu_1919',  # 'serbian', # 'en_US/vctk_low#p276', 'isl', 'abi',
-              speed=1.4,  # only for non-English
-              affect=True  # False = high clarity for the partially sighted
+              voice='af_ZA_google-nwu_1919',  # 'serbian', 'en_US/vctk_low#p276', 'isl',
+              speed=1.4,  # only for MMS TTS
+              affect=True  # False = higher-clarity sound for the partially sighted
               ):
     '''returns 24kHz np.array TTS
@@ -51,8 +51,8 @@ def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are

     else:

-        # don't split foreign sentences: avoids re-load of VITS & random speaker change issue
-        x = msinference.foreign(text=text,
+        # MMS TTS - list of sentences
+        x = msinference.foreign(text=[text],
                                 lang=voice,  # voice = 'romanian', 'serbian', 'hungarian'
                                 speed=speed)  # normalisation externally
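
The MMS path now receives `text=[text]`, i.e. a single-element list of sentences, instead of a raw string. A hedged usage sketch of the updated entry point (writing with `soundfile` is illustrative, not part of the repo):

```python
import numpy as np
import soundfile as sf  # illustrative dependency, used here only to save the result

wav = tts_entry(text='A quick brown fox jumps over the lazy dog.',
                voice='serbian',   # non-English voice string -> MMS TTS branch
                speed=1.4)         # per the comment above, speed applies only to MMS TTS
assert isinstance(wav, np.ndarray)
sf.write('tts_out.wav', wav, 24000)  # docstring: returns 24 kHz np.array
```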
live_demo.py CHANGED
@@ -31,9 +31,8 @@ while True:
     _str = input("\n\n\n\nDescribe Any Sound: \n\n\n\n")
     args.soundscape = _str

-    # extra duration for audiogen to sound cool!!!!
-    if len(_str) < 20:
-        _str += 'Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata few silence for audiogen to impress you.'
+    _str += 'Lorem ipsum dolor sit amet, consetetur elixir sed diam nonumy eirmod tempor invidunt labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Soutet clita kasd gubergren.'
+
     args.text = '_tmp.txt'  # input -> .txt (implementation thought for audiobooks in API)

     with open(args.text, 'w') as f: