Cloning_Box

Running

App Files Files Community

Kremon96 committed 12 days ago

Commit

102d284

verified ·

1 Parent(s): df438fc

Delete vocoder

Browse files

Files changed (11) hide show

vocoder/LICENSE.txt +0 -22
vocoder/audio.py +0 -108
vocoder/display.py +0 -127
vocoder/distribution.py +0 -132
vocoder/gen_wavernn.py +0 -31
vocoder/hparams.py +0 -44
vocoder/inference.py +0 -64
vocoder/models/deepmind_version.py +0 -170
vocoder/models/fatchord_version.py +0 -434
vocoder/train.py +0 -118
vocoder/vocoder_dataset.py +0 -84

vocoder/LICENSE.txt DELETED Viewed

@@ -1,22 +0,0 @@
-MIT License
-Original work Copyright (c) 2019 fatchord (https://github.com/fatchord)
-Modified work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ)
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.

vocoder/audio.py DELETED Viewed

@@ -1,108 +0,0 @@
-import math
-import numpy as np
-import librosa
-import vocoder.hparams as hp
-from scipy.signal import lfilter
-import soundfile as sf
-def label_2_float(x, bits) :
-    """Map integer labels in [0, 2**bits - 1] linearly onto floats in [-1, 1]."""
-    top_label = 2**bits - 1.
-    return 2 * x / top_label - 1.
-def float_2_label(x, bits) :
-    """Map floats in [-1, 1] linearly onto the label range [0, 2**bits - 1] (not rounded).
-    `x` must expose `.max()` and `.clip()`; an AssertionError is raised if any |x| exceeds 1.
-    """
-    assert abs(x).max() <= 1.0
-    top_label = 2**bits - 1
-    scaled = (x + 1.) * top_label / 2
-    return scaled.clip(0, top_label)
-def load_wav(path) :
-    """Load the audio file at `path` through librosa with sr=hp.sample_rate and return the samples."""
-    samples, _ = librosa.load(str(path), sr=hp.sample_rate)
-    return samples
-def save_wav(x, path) :
-    """Write `x`, cast to float32, to `path` with soundfile at hp.sample_rate."""
-    as_float32 = x.astype(np.float32)
-    sf.write(path, as_float32, hp.sample_rate)
-def split_signal(x) :
-    """Offset `x` by 2**15 and split it into (coarse, fine) = (value // 256, value % 256)."""
-    coarse, fine = divmod(x + 2**15, 256)
-    return coarse, fine
-def combine_signal(coarse, fine) :
-    """Inverse of split_signal: rebuild the signed value from its coarse and fine parts."""
-    offset = 2**15
-    unsigned = coarse * 256 + fine
-    return unsigned - offset
-def encode_16bits(x) :
-    """Scale `x` by 2**15, clip to the int16 range and cast to np.int16."""
-    full_scale = 2**15
-    clipped = np.clip(x * full_scale, -full_scale, full_scale - 1)
-    return clipped.astype(np.int16)
-# Mel filterbank, built on first use by linear_to_mel() and reused afterwards.
-mel_basis = None
-def linear_to_mel(spectrogram):
-    """Project a linear spectrogram through the cached mel filterbank with np.dot."""
-    global mel_basis
-    basis = mel_basis
-    if basis is None:
-        basis = mel_basis = build_mel_basis()
-    return np.dot(basis, spectrogram)
-def build_mel_basis():
-    """Build the mel filterbank from the vocoder hparams (sample rate, FFT size, mel count, fmin)."""
-    # sr and n_fft are passed by keyword: recent librosa releases reject them as positional arguments,
-    # while older releases accept the keyword form as well.
-    return librosa.filters.mel(sr=hp.sample_rate, n_fft=hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)
-def normalize(S):
-    """Rescale dB values so that hp.min_level_db maps to 0 and 0 dB maps to 1, clipped to [0, 1]."""
-    floor_db = hp.min_level_db
-    rescaled = (S - floor_db) / -floor_db
-    return np.clip(rescaled, 0, 1)
-def denormalize(S):
-    """Clip `S` to [0, 1] and map it back onto the [hp.min_level_db, 0] dB range."""
-    unit = np.clip(S, 0, 1)
-    return (unit * -hp.min_level_db) + hp.min_level_db
-def amp_to_db(x):
-    """Convert amplitudes to decibels (20 * log10), flooring the input at 1e-5 first."""
-    floored = np.maximum(1e-5, x)
-    return 20 * np.log10(floored)
-def db_to_amp(x):
-    """Convert decibels back to amplitudes: 10 ** (x / 20)."""
-    exponent = x * 0.05
-    return np.power(10.0, exponent)
-def spectrogram(y):
-    """Normalised dB magnitude spectrogram of `y`, with hp.ref_level_db subtracted before normalising."""
-    magnitudes = np.abs(stft(y))
-    return normalize(amp_to_db(magnitudes) - hp.ref_level_db)
-def melspectrogram(y):
-    """Normalised dB mel spectrogram of `y` (no reference level is subtracted here)."""
-    magnitudes = np.abs(stft(y))
-    return normalize(amp_to_db(linear_to_mel(magnitudes)))
-def stft(y):
-    """Short-time Fourier transform of `y` using the FFT, hop and window sizes from hparams."""
-    stft_kwargs = dict(n_fft=hp.n_fft, hop_length=hp.hop_length, win_length=hp.win_length)
-    return librosa.stft(y=y, **stft_kwargs)
-def pre_emphasis(x):
-    """Apply the FIR filter y[n] = x[n] - hp.preemphasis * x[n-1]."""
-    numerator, denominator = [1, -hp.preemphasis], [1]
-    return lfilter(numerator, denominator, x)
-def de_emphasis(x):
-    """Apply the inverse (recursive) filter of pre_emphasis."""
-    numerator, denominator = [1], [1, -hp.preemphasis]
-    return lfilter(numerator, denominator, x)
-def encode_mu_law(x, mu) :
-    """Mu-law compress `x` (expected in [-1, 1]) and quantise it to labels in [0, mu - 1] (returned as floats)."""
-    peak = mu - 1
-    compressed = np.sign(x) * np.log(1 + peak * np.abs(x)) / np.log(1 + peak)
-    return np.floor((compressed + 1) / 2 * peak + 0.5)
-def decode_mu_law(y, mu, from_labels=True) :
-    """Mu-law expand `y` back to a linear signal.
-    When `from_labels` is True, `y` is first mapped from labels to floats with
-    label_2_float(y, log2(mu)); otherwise `y` is used as given.
-    """
-    signal = label_2_float(y, math.log2(mu)) if from_labels else y
-    peak = mu - 1
-    magnitude = (1 + peak) ** np.abs(signal) - 1
-    return np.sign(signal) / peak * magnitude

vocoder/display.py DELETED Viewed

@@ -1,127 +0,0 @@
-import time
-import numpy as np
-import sys
-def progbar(i, n, size=16):
-    """Return a `size`-character progress bar for step `i` of `n`.
-    Cells whose index is <= (i * size) // n are drawn filled, so the first cell is filled even at i == 0.
-    """
-    done = (i * size) // n
-    return ''.join('█' if cell <= done else '░' for cell in range(size))
-def stream(message) :
-    """Write `message` to stdout after a carriage return, so it overwrites the current console line.
-    If the console encoding cannot represent the message, it is written again with all
-    non-ASCII characters removed. Any other error propagates to the caller.
-    """
-    try:
-        sys.stdout.write("\r{%s}" % message)
-    except UnicodeEncodeError:
-        # Only encoding failures are retried; a bare except would also swallow KeyboardInterrupt.
-        message = ''.join(i for i in message if ord(i)<128)
-        sys.stdout.write("\r{%s}" % message)
-def simple_table(item_tuples) :
-    """Print a two-row ASCII table: one column per (heading, value) pair in `item_tuples`.
-    Within a column the shorter of heading/value is centred by padding it to the other's width.
-    Padding and border segments are sliced from fixed templates, so very wide columns are capped
-    at the template lengths.
-    """
-    border_pattern = '+---------------------------------------'
-    whitespace = '                                            '
-    headings, cells = [], []
-    for item in item_tuples :
-        heading, cell = str(item[0]), str(item[1])
-        filler = whitespace[:abs(len(heading) - len(cell))]
-        half = len(filler) // 2
-        left, right = filler[:half], filler[half:]
-        # Pad whichever text is shorter; on a tie the filler is empty and nothing changes.
-        if len(heading) < len(cell) :
-            heading = left + heading + right
-        else :
-            cell = left + cell + right
-        headings.append(heading)
-        cells.append(cell)
-    count = len(item_tuples)
-    head_parts = [f'| {text} ' for text in headings]
-    body_parts = [f'| {text} ' for text in cells]
-    border = ''.join(border_pattern[:len(part)] for part in head_parts)
-    head = ''.join(head_parts)
-    body = ''.join(body_parts)
-    if count > 0 :
-        # Close the right-hand edge of the table.
-        border, head, body = border + '+', head + '|', body + '|'
-    for line in (border, head, border, body, border, ' ') :
-        print(line)
-def time_since(started) :
-    """Format the time elapsed since `started` (a time.time() timestamp) as 'Xm Ys', or 'Xh Ym Zs' from one hour on."""
-    minutes, seconds = (int(part) for part in divmod(time.time() - started, 60))
-    if minutes < 60 :
-        return f'{minutes}m {seconds}s'
-    hours, minutes = divmod(minutes, 60)
-    return f'{hours}h {minutes}m {seconds}s'
-def save_attention(attn, path):
-    """Render the transpose of `attn` as an image and save it to '<path>.png'."""
-    import matplotlib.pyplot as plt
-    figure = plt.figure(figsize=(12, 6))
-    image_kwargs = dict(interpolation='nearest', aspect='auto')
-    plt.imshow(attn.T, **image_kwargs)
-    figure.savefig(f'{path}.png', bbox_inches='tight')
-    plt.close(figure)
-def save_spectrogram(M, path, length=None):
-    """Save `M`, flipped along axis 0, as an image at '<path>.png'.
-    If `length` is truthy, only the first `length` columns are drawn.
-    """
-    import matplotlib.pyplot as plt
-    image = np.flip(M, axis=0)
-    if length :
-        image = image[:, :length]
-    figure = plt.figure(figsize=(12, 6))
-    plt.imshow(image, interpolation='nearest', aspect='auto')
-    figure.savefig(f'{path}.png', bbox_inches='tight')
-    plt.close(figure)
-def plot(array):
-    """Plot `array` on a wide figure with grey, large-font axis labels and ticks (the figure is not shown or saved here)."""
-    import matplotlib.pyplot as plt
-    figure = plt.figure(figsize=(30, 5))
-    axes = figure.add_subplot(111)
-    for axis in (axes.xaxis, axes.yaxis):
-        axis.label.set_color('grey')
-        axis.label.set_fontsize(23)
-    for which in ('x', 'y'):
-        axes.tick_params(axis=which, colors='grey', labelsize=23)
-    plt.plot(array)
-def plot_spec(M):
-    """Display `M`, flipped along axis 0, as an image in a new figure."""
-    import matplotlib.pyplot as plt
-    flipped = np.flip(M, axis=0)
-    plt.figure(figsize=(18,4))
-    plt.imshow(flipped, interpolation='nearest', aspect='auto')
-    plt.show()

vocoder/distribution.py DELETED Viewed

@@ -1,132 +0,0 @@
-import numpy as np
-import torch
-import torch.nn.functional as F
-def log_sum_exp(x):
-    """Log-sum-exp over the last axis of `x`, computed stably by subtracting the per-row maximum first."""
-    last = len(x.size()) - 1
-    peak_kept, _ = torch.max(x, dim=last, keepdim=True)
-    peak, _ = torch.max(x, dim=last)
-    shifted_sum = torch.sum(torch.exp(x - peak_kept), dim=last)
-    return peak + torch.log(shifted_sum)
-# It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py
-def discretized_mix_logistic_loss(y_hat, y, num_classes=65536,
-                                  log_scale_min=None, reduce=True):
-    """
-    Negative log-likelihood of `y` under a discretized mixture of logistics parameterised by `y_hat`.
-    Args:
-        y_hat (Tensor): 3-D tensor whose last axis holds 3 * nr_mix values, laid out as
-            [mixture logits | means | log-scales] (see the slicing below).
-        y (Tensor): targets, expanded to the shape of the means (B x T x 1 per the comment below).
-            The +/-0.999 edge tests suggest values in [-1, 1] - TODO confirm against the caller.
-        num_classes (int): number of quantisation levels; sets the half bin width 1 / (num_classes - 1).
-        log_scale_min (float): lower clamp applied to the log-scales; defaults to log(1e-14).
-        reduce (bool): if True return the mean loss, otherwise the per-element loss with a
-            trailing axis of size 1.
-    Returns:
-        Tensor: the loss.
-    """
-    if log_scale_min is None:
-        log_scale_min = float(np.log(1e-14))
-    # Moved to channels-first only for the shape checks; the transpose below restores the input layout.
-    y_hat = y_hat.permute(0,2,1)
-    assert y_hat.dim() == 3
-    assert y_hat.size(1) % 3 == 0
-    nr_mix = y_hat.size(1) // 3
-    # (B x T x C)
-    y_hat = y_hat.transpose(1, 2)
-    # unpack parameters. (B, T, num_mixtures) x 3
-    logit_probs = y_hat[:, :, :nr_mix]
-    means = y_hat[:, :, nr_mix:2 * nr_mix]
-    log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min)
-    # B x T x 1 -> B x T x num_mixtures
-    y = y.expand_as(means)
-    centered_y = y - means
-    inv_stdv = torch.exp(-log_scales)
-    # Logistic CDF evaluated half a bin above and half a bin below the target
-    plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1))
-    cdf_plus = torch.sigmoid(plus_in)
-    min_in = inv_stdv * (centered_y - 1. / (num_classes - 1))
-    cdf_min = torch.sigmoid(min_in)
-    # log probability for the lower edge case (y < -0.999, see `cond` below)
-    # equivalent: torch.log(F.sigmoid(plus_in))
-    log_cdf_plus = plus_in - F.softplus(plus_in)
-    # log probability for the upper edge case (y > 0.999, see `inner_cond` below)
-    # equivalent: (1 - F.sigmoid(min_in)).log()
-    log_one_minus_cdf_min = -F.softplus(min_in)
-    # probability for all other cases
-    cdf_delta = cdf_plus - cdf_min
-    mid_in = inv_stdv * centered_y
-    # log density at the centre of the bin, used as a fallback when cdf_delta is
-    # too small (<= 1e-5) to take its log reliably
-    log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in)
-    # tf equivalent
-    """
-    log_probs = tf.where(x < -0.999, log_cdf_plus,
-                         tf.where(x > 0.999, log_one_minus_cdf_min,
-                                  tf.where(cdf_delta > 1e-5,
-                                           tf.log(tf.maximum(cdf_delta, 1e-12)),
-                                           log_pdf_mid - np.log(127.5))))
-    """
-    # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value
-    # for num_classes=65536 case? 1e-7? not sure..
-    # The nested selection is done with 0/1 float masks instead of torch.where
-    inner_inner_cond = (cdf_delta > 1e-5).float()
-    inner_inner_out = inner_inner_cond * \
-        torch.log(torch.clamp(cdf_delta, min=1e-12)) + \
-        (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2))
-    inner_cond = (y > 0.999).float()
-    inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond) * inner_inner_out
-    cond = (y < -0.999).float()
-    log_probs = cond * log_cdf_plus + (1. - cond) * inner_out
-    # Add the log mixture weights, then marginalise over the mixtures with log_sum_exp
-    log_probs = log_probs + F.log_softmax(logit_probs, -1)
-    if reduce:
-        return -torch.mean(log_sum_exp(log_probs))
-    else:
-        return -log_sum_exp(log_probs).unsqueeze(-1)
-def sample_from_discretized_mix_logistic(y, log_scale_min=None):
-    """
-    Draw one sample per time step from a discretized mixture of logistics.
-    Args:
-        y (Tensor): B x C x T, where C = 3 * nr_mix holds [mixture logits | means | log-scales]
-        log_scale_min (float): lower clamp for the log-scales; defaults to log(1e-14)
-    Returns:
-        Tensor: samples clamped to [-1, 1].
-    """
-    scale_floor = float(np.log(1e-14)) if log_scale_min is None else log_scale_min
-    assert y.size(1) % 3 == 0
-    nr_mix = y.size(1) // 3
-    # B x T x C
-    params = y.transpose(1, 2)
-    logit_probs = params[:, :, :nr_mix]
-    # Pick one mixture component per step: add Gumbel noise to the logits and take the argmax
-    noise = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5)
-    perturbed = logit_probs.data - torch.log(- torch.log(noise))
-    _, chosen = perturbed.max(dim=-1)
-    # (B, T) -> (B, T, nr_mix)
-    one_hot = to_one_hot(chosen, nr_mix)
-    # Keep only the parameters of the chosen component
-    means = torch.sum(params[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1)
-    raw_log_scales = torch.sum(params[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, dim=-1)
-    log_scales = torch.clamp(raw_log_scales, min=scale_floor)
-    # Inverse-CDF sampling from the logistic; the result is not rounded to a quantisation level
-    u = means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5)
-    sample = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1. - u))
-    return torch.clamp(sample, min=-1., max=1.)
-def to_one_hot(tensor, n, fill_with=1.):
-    """One-hot encode the integer index `tensor` along a new last axis of size `n`.
-    Returns a float32 tensor of shape tensor.size() + (n,), on the same device as `tensor`,
-    that is zero everywhere except for `fill_with` at the indexed positions.
-    """
-    # Allocate directly on the index tensor's device; a bare .cuda() would use the current
-    # default CUDA device, which may differ from the one holding `tensor`.
-    one_hot = torch.zeros(tensor.size() + (n,), dtype=torch.float32, device=tensor.device)
-    one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with)
-    return one_hot

vocoder/gen_wavernn.py DELETED Viewed

@@ -1,31 +0,0 @@
-from vocoder.models.fatchord_version import  WaveRNN
-from vocoder.audio import *
-def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path):
-    """
-    For the first `samples` items of `test_set`, save the ground-truth audio and the audio
-    generated by `model` from the matching mel as wav files under `save_path`.
-    File names carry the model step count in thousands and the 1-based item index.
-    `batched`, `target` and `overlap` are forwarded to model.generate().
-    """
-    step_k = model.get_step() // 1000
-    is_mol = hp.voc_mode == 'MOL'
-    for idx, (mel, labels) in enumerate(test_set, 1):
-        if idx > samples:
-            break
-        print('\n| Generating: %i/%i' % (idx, samples))
-        labels = labels[0].numpy()
-        bits = 16 if is_mol else hp.bits
-        # Turn the stored target back into a float waveform
-        if is_mol or not hp.mu_law :
-            reference = label_2_float(labels, bits)
-        else :
-            reference = decode_mu_law(labels, 2**bits, from_labels=True)
-        save_wav(reference, save_path.joinpath("%dk_steps_%d_target.wav" % (step_k, idx)))
-        if batched:
-            batch_str = "gen_batched_target%d_overlap%d" % (target, overlap)
-        else:
-            batch_str = "gen_not_batched"
-        save_str = save_path.joinpath("%dk_steps_%d_%s.wav" % (step_k, idx, batch_str))
-        generated = model.generate(mel, batched, target, overlap, hp.mu_law)
-        save_wav(generated, save_str)

vocoder/hparams.py DELETED Viewed

@@ -1,44 +0,0 @@
-from synthesizer.hparams import hparams as _syn_hp
-# Audio settings------------------------------------------------------------------------
-# Match the values of the synthesizer
-sample_rate = _syn_hp.sample_rate
-n_fft = _syn_hp.n_fft
-num_mels = _syn_hp.num_mels
-hop_length = _syn_hp.hop_size
-win_length = _syn_hp.win_size
-fmin = _syn_hp.fmin
-min_level_db = _syn_hp.min_level_db
-ref_level_db = _syn_hp.ref_level_db
-mel_max_abs_value = _syn_hp.max_abs_value
-preemphasis = _syn_hp.preemphasis
-apply_preemphasis = _syn_hp.preemphasize
-bits = 9                            # bit depth of signal
-mu_law = True                       # Recommended to suppress noise if using raw bits in hp.voc_mode
-                                    # below
-# WAVERNN / VOCODER --------------------------------------------------------------------------------
-voc_mode = 'RAW'                    # either 'RAW' (softmax on raw bits) or 'MOL' (sample from
-                                    # mixture of logistics)
-voc_upsample_factors = (5, 5, 8)    # NB - this needs to correctly factorise hop_length
-# Layer sizes of the vocoder network
-voc_rnn_dims = 512
-voc_fc_dims = 512
-voc_compute_dims = 128
-voc_res_out_dims = 128
-voc_res_blocks = 10
-# Training
-voc_batch_size = 100
-voc_lr = 1e-4
-voc_gen_at_checkpoint = 5           # number of samples to generate at each checkpoint
-voc_pad = 2                         # this will pad the input so that the resnet can 'see' wider
-                                    # than input length
-voc_seq_len = hop_length * 5        # must be a multiple of hop_length
-# Generating / Synthesizing
-voc_gen_batched = True              # very fast (realtime+) single utterance batched generation
-voc_target = 8000                   # target number of samples to be generated in each batch entry
-voc_overlap = 400                   # number of samples for crossfading between batches

vocoder/inference.py DELETED Viewed

@@ -1,64 +0,0 @@
-from vocoder.models.fatchord_version import WaveRNN
-from vocoder import hparams as hp
-import torch
-_model = None   # type: WaveRNN
-def load_model(weights_fpath, verbose=True):
-    """
-    Build the Wave-RNN described by the vocoder hparams, load its weights and keep it in the
-    module-level `_model` (in eval mode). The model is moved to CUDA when available.
-    :param weights_fpath: path to a checkpoint that stores the weights under the 'model_state' key
-    :param verbose: if True, print progress messages
-    """
-    global _model, _device
-    if verbose:
-        print("Building Wave-RNN")
-    model_kwargs = dict(
-        rnn_dims=hp.voc_rnn_dims,
-        fc_dims=hp.voc_fc_dims,
-        bits=hp.bits,
-        pad=hp.voc_pad,
-        upsample_factors=hp.voc_upsample_factors,
-        feat_dims=hp.num_mels,
-        compute_dims=hp.voc_compute_dims,
-        res_out_dims=hp.voc_res_out_dims,
-        res_blocks=hp.voc_res_blocks,
-        hop_length=hp.hop_length,
-        sample_rate=hp.sample_rate,
-        mode=hp.voc_mode,
-    )
-    _model = WaveRNN(**model_kwargs)
-    use_cuda = torch.cuda.is_available()
-    if use_cuda:
-        _model = _model.cuda()
-    _device = torch.device('cuda' if use_cuda else 'cpu')
-    if verbose:
-        print("Loading model weights at %s" % weights_fpath)
-    # The second positional argument of torch.load is map_location
-    state = torch.load(weights_fpath, _device)['model_state']
-    _model.load_state_dict(state)
-    _model.eval()
-def is_loaded():
-    """Return True when the module-level `_model` has been set (i.e. is no longer None)."""
-    return not (_model is None)
-def infer_waveform(mel, normalize=True,  batched=True, target=8000, overlap=800,
-                   progress_callback=None):
-    """
-    Infers the waveform of a mel spectrogram output by the synthesizer (the format must match
-    that of the synthesizer!)
-    :param mel: the mel spectrogram, as a numpy array (a leading batch axis is added here)
-    :param normalize: if True, divide the mel by hp.mel_max_abs_value first
-    :param batched: forwarded to the model's generate()
-    :param target: forwarded to the model's generate()
-    :param overlap: forwarded to the model's generate()
-    :param progress_callback: forwarded to the model's generate()
-    :return: whatever the model's generate() returns
-    """
-    if _model is None:
-        raise Exception("Please load Wave-RNN in memory before using it")
-    scaled = mel / hp.mel_max_abs_value if normalize else mel
-    batch = torch.from_numpy(scaled[None, ...])
-    return _model.generate(batch, batched, target, overlap, hp.mu_law, progress_callback)

vocoder/models/deepmind_version.py DELETED Viewed

@@ -1,170 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from utils.display import *
-from utils.dsp import *
-class WaveRNN(nn.Module) :
-    """
-    WaveRNN variant that predicts two categorical distributions per step: one for the coarse
-    and one for the fine part of each sample. Each half of the hidden state serves one part.
-    Temporary tensors are created on the device of the model's own weights, so the model runs
-    on CPU as well as on CUDA.
-    """
-    def __init__(self, hidden_size=896, quantisation=256) :
-        """
-        :param hidden_size: size of the full hidden state (split in two halves: coarse and fine)
-        :param quantisation: number of classes of each of the two output distributions
-        """
-        super(WaveRNN, self).__init__()
-        self.hidden_size = hidden_size
-        self.split_size = hidden_size // 2
-        # The main matmul
-        self.R = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False)
-        # Output fc layers
-        self.O1 = nn.Linear(self.split_size, self.split_size)
-        self.O2 = nn.Linear(self.split_size, quantisation)
-        self.O3 = nn.Linear(self.split_size, self.split_size)
-        self.O4 = nn.Linear(self.split_size, quantisation)
-        # Input fc layers
-        self.I_coarse = nn.Linear(2, 3 * self.split_size, bias=False)
-        self.I_fine = nn.Linear(3, 3 * self.split_size, bias=False)
-        # biases for the gates
-        self.bias_u = nn.Parameter(torch.zeros(self.hidden_size))
-        self.bias_r = nn.Parameter(torch.zeros(self.hidden_size))
-        self.bias_e = nn.Parameter(torch.zeros(self.hidden_size))
-        # display num params
-        self.num_params()
-    def forward(self, prev_y, prev_hidden, current_coarse) :
-        """
-        Run one teacher-forced step.
-        :param prev_y: previous (coarse, fine) inputs, 2 features per row
-        :param prev_hidden: previous hidden state, hidden_size features per row
-        :param current_coarse: current coarse input, 1 feature per row
-        :return: (coarse logits, fine logits, new hidden state)
-        """
-        # Main matmul - the projection is split 3 ways
-        R_hidden = self.R(prev_hidden)
-        R_u, R_r, R_e, = torch.split(R_hidden, self.hidden_size, dim=1)
-        # Project the prev input
-        coarse_input_proj = self.I_coarse(prev_y)
-        I_coarse_u, I_coarse_r, I_coarse_e = \
-            torch.split(coarse_input_proj, self.split_size, dim=1)
-        # Project the prev input and current coarse sample
-        fine_input = torch.cat([prev_y, current_coarse], dim=1)
-        fine_input_proj = self.I_fine(fine_input)
-        I_fine_u, I_fine_r, I_fine_e = \
-            torch.split(fine_input_proj, self.split_size, dim=1)
-        # concatenate for the gates
-        I_u = torch.cat([I_coarse_u, I_fine_u], dim=1)
-        I_r = torch.cat([I_coarse_r, I_fine_r], dim=1)
-        I_e = torch.cat([I_coarse_e, I_fine_e], dim=1)
-        # Compute all gates for coarse and fine
-        # (torch.sigmoid / torch.tanh replace the deprecated F.sigmoid / F.tanh)
-        u = torch.sigmoid(R_u + I_u + self.bias_u)
-        r = torch.sigmoid(R_r + I_r + self.bias_r)
-        e = torch.tanh(r * R_e + I_e + self.bias_e)
-        hidden = u * prev_hidden + (1. - u) * e
-        # Split the hidden state
-        hidden_coarse, hidden_fine = torch.split(hidden, self.split_size, dim=1)
-        # Compute outputs
-        out_coarse = self.O2(F.relu(self.O1(hidden_coarse)))
-        out_fine = self.O4(F.relu(self.O3(hidden_fine)))
-        return out_coarse, out_fine, hidden
-    def generate(self, seq_len):
-        """
-        Sample `seq_len` steps autoregressively, starting from zero inputs and a zero hidden state.
-        :return: (output, coarse, fine) where `output` is combine_signal(coarse, fine)
-        """
-        with torch.no_grad():
-            device = self.R.weight.device
-            # First split up the biases for the gates
-            b_coarse_u, b_fine_u = torch.split(self.bias_u, self.split_size)
-            b_coarse_r, b_fine_r = torch.split(self.bias_r, self.split_size)
-            b_coarse_e, b_fine_e = torch.split(self.bias_e, self.split_size)
-            # Lists for the two output seqs
-            c_outputs, f_outputs = [], []
-            # Some initial inputs, created on the model's device instead of a hard-coded .cuda()
-            out_coarse = torch.zeros(1, dtype=torch.long, device=device)
-            out_fine = torch.zeros(1, dtype=torch.long, device=device)
-            # We'll need a hidden state
-            hidden = self.init_hidden()
-            # Need a clock for display
-            start = time.time()
-            # Loop for generation
-            for i in range(seq_len) :
-                # Split into two hidden states
-                hidden_coarse, hidden_fine = \
-                    torch.split(hidden, self.split_size, dim=1)
-                # Scale and concat previous predictions
-                out_coarse = out_coarse.unsqueeze(0).float() / 127.5 - 1.
-                out_fine = out_fine.unsqueeze(0).float() / 127.5 - 1.
-                prev_outputs = torch.cat([out_coarse, out_fine], dim=1)
-                # Project input
-                coarse_input_proj = self.I_coarse(prev_outputs)
-                I_coarse_u, I_coarse_r, I_coarse_e = \
-                    torch.split(coarse_input_proj, self.split_size, dim=1)
-                # Project hidden state and split 6 ways
-                R_hidden = self.R(hidden)
-                R_coarse_u , R_fine_u, \
-                R_coarse_r, R_fine_r, \
-                R_coarse_e, R_fine_e = torch.split(R_hidden, self.split_size, dim=1)
-                # Compute the coarse gates
-                u = torch.sigmoid(R_coarse_u + I_coarse_u + b_coarse_u)
-                r = torch.sigmoid(R_coarse_r + I_coarse_r + b_coarse_r)
-                e = torch.tanh(r * R_coarse_e + I_coarse_e + b_coarse_e)
-                hidden_coarse = u * hidden_coarse + (1. - u) * e
-                # Compute the coarse output
-                out_coarse = self.O2(F.relu(self.O1(hidden_coarse)))
-                posterior = F.softmax(out_coarse, dim=1)
-                distrib = torch.distributions.Categorical(posterior)
-                out_coarse = distrib.sample()
-                c_outputs.append(out_coarse)
-                # Project the [prev outputs and predicted coarse sample]
-                coarse_pred = out_coarse.float() / 127.5 - 1.
-                fine_input = torch.cat([prev_outputs, coarse_pred.unsqueeze(0)], dim=1)
-                fine_input_proj = self.I_fine(fine_input)
-                I_fine_u, I_fine_r, I_fine_e = \
-                    torch.split(fine_input_proj, self.split_size, dim=1)
-                # Compute the fine gates
-                u = torch.sigmoid(R_fine_u + I_fine_u + b_fine_u)
-                r = torch.sigmoid(R_fine_r + I_fine_r + b_fine_r)
-                e = torch.tanh(r * R_fine_e + I_fine_e + b_fine_e)
-                hidden_fine = u * hidden_fine + (1. - u) * e
-                # Compute the fine output
-                out_fine = self.O4(F.relu(self.O3(hidden_fine)))
-                posterior = F.softmax(out_fine, dim=1)
-                distrib = torch.distributions.Categorical(posterior)
-                out_fine = distrib.sample()
-                f_outputs.append(out_fine)
-                # Put the hidden state back together
-                hidden = torch.cat([hidden_coarse, hidden_fine], dim=1)
-                # Display progress
-                # NOTE(review): stream comes from utils.display, which is not visible here; the
-                # two-argument call is kept as is - confirm it matches that function's signature.
-                speed = (i + 1) / (time.time() - start)
-                stream('Gen: %i/%i -- Speed: %i',  (i + 1, seq_len, speed))
-            coarse = torch.stack(c_outputs).squeeze(1).cpu().data.numpy()
-            fine = torch.stack(f_outputs).squeeze(1).cpu().data.numpy()
-            output = combine_signal(coarse, fine)
-        return output, coarse, fine
-    def init_hidden(self, batch_size=1) :
-        """Return a zero hidden state of shape (batch_size, hidden_size) on the model's device."""
-        return torch.zeros(batch_size, self.hidden_size, device=self.R.weight.device)
-    def num_params(self) :
-        """Print the number of trainable parameters, in millions."""
-        parameters = filter(lambda p: p.requires_grad, self.parameters())
-        parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
-        print('Trainable Parameters: %.3f million' % parameters)

vocoder/models/fatchord_version.py DELETED Viewed

@@ -1,434 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from vocoder.distribution import sample_from_discretized_mix_logistic
-from vocoder.display import *
-from vocoder.audio import *
-class ResBlock(nn.Module):
-    """Residual block: two bias-free 1x1 Conv1d layers, each followed by BatchNorm1d, with a ReLU in between."""
-    def __init__(self, dims):
-        super().__init__()
-        pointwise = dict(kernel_size=1, bias=False)
-        self.conv1 = nn.Conv1d(dims, dims, **pointwise)
-        self.conv2 = nn.Conv1d(dims, dims, **pointwise)
-        self.batch_norm1 = nn.BatchNorm1d(dims)
-        self.batch_norm2 = nn.BatchNorm1d(dims)
-    def forward(self, x):
-        """Return branch(x) + x, where the branch is conv1 -> bn1 -> relu -> conv2 -> bn2."""
-        hidden = F.relu(self.batch_norm1(self.conv1(x)))
-        branch = self.batch_norm2(self.conv2(hidden))
-        return branch + x
-class MelResNet(nn.Module):
-    """
-    Input convolution, a stack of `res_blocks` ResBlocks and a 1x1 output convolution.
-    The input convolution uses a kernel of 2 * pad + 1 with no padding, so the output is
-    2 * pad frames shorter than the input.
-    """
-    def __init__(self, res_blocks, in_dims, compute_dims, res_out_dims, pad):
-        super().__init__()
-        kernel = 2 * pad + 1
-        self.conv_in = nn.Conv1d(in_dims, compute_dims, kernel_size=kernel, bias=False)
-        self.batch_norm = nn.BatchNorm1d(compute_dims)
-        self.layers = nn.ModuleList([ResBlock(compute_dims) for _ in range(res_blocks)])
-        self.conv_out = nn.Conv1d(compute_dims, res_out_dims, kernel_size=1)
-    def forward(self, x):
-        """Apply conv_in -> batch norm -> relu, then every residual block, then conv_out."""
-        x = F.relu(self.batch_norm(self.conv_in(x)))
-        for block in self.layers:
-            x = block(x)
-        return self.conv_out(x)
-class Stretch2d(nn.Module):
-    """Nearest-neighbour upsampling of a 4-D tensor: every element is repeated y_scale times along axis 2 and x_scale times along axis 3."""
-    def __init__(self, x_scale, y_scale):
-        super().__init__()
-        self.x_scale, self.y_scale = x_scale, y_scale
-    def forward(self, x):
-        batch, channels, height, width = x.size()
-        # Add a unit axis after the height and after the width, tile along them, then merge them back
-        tiled = x[:, :, :, None, :, None].repeat(1, 1, 1, self.y_scale, 1, self.x_scale)
-        return tiled.view(batch, channels, height * self.y_scale, width * self.x_scale)
-class UpsampleNetwork(nn.Module):
-    """
-    Upsamples mel frames along time by the product of `upsample_scales`, and produces auxiliary
-    features from a MelResNet stretched by the same total factor.
-    Each upsampling stage is a Stretch2d followed by a bias-free Conv2d initialised to a
-    moving average over 2 * scale + 1 positions.
-    """
-    def __init__(self, feat_dims, upsample_scales, compute_dims,
-                 res_blocks, res_out_dims, pad):
-        super().__init__()
-        # np.prod gives the same value as np.cumproduct(...)[-1]; np.cumproduct is gone in NumPy 2.x
-        total_scale = int(np.prod(upsample_scales))
-        # Number of upsampled steps to trim on each side of the mel output (see forward)
-        self.indent = pad * total_scale
-        self.resnet = MelResNet(res_blocks, feat_dims, compute_dims, res_out_dims, pad)
-        self.resnet_stretch = Stretch2d(total_scale, 1)
-        self.up_layers = nn.ModuleList()
-        for scale in upsample_scales:
-            k_size = (1, scale * 2 + 1)
-            padding = (0, scale)
-            stretch = Stretch2d(scale, 1)
-            conv = nn.Conv2d(1, 1, kernel_size=k_size, padding=padding, bias=False)
-            conv.weight.data.fill_(1. / k_size[1])
-            self.up_layers.append(stretch)
-            self.up_layers.append(conv)
-    def forward(self, m):
-        """
-        :param m: mel input, passed as is to the Conv1d-based MelResNet
-        :return: (upsampled mels, upsampled auxiliary features), both with axes 1 and 2 swapped
-        """
-        aux = self.resnet(m).unsqueeze(1)
-        aux = self.resnet_stretch(aux)
-        aux = aux.squeeze(1)
-        m = m.unsqueeze(1)
-        for f in self.up_layers: m = f(m)
-        m = m.squeeze(1)
-        # With pad == 0 the indent is 0 and a [0:-0] slice would be empty, so only trim when needed
-        if self.indent > 0:
-            m = m[:, :, self.indent:-self.indent]
-        return m.transpose(1, 2), aux.transpose(1, 2)
-class WaveRNN(nn.Module):
-    def __init__(self, rnn_dims, fc_dims, bits, pad, upsample_factors,
-                 feat_dims, compute_dims, res_out_dims, res_blocks,
-                 hop_length, sample_rate, mode='RAW'):
-        """
-        :param rnn_dims: hidden size of both GRUs
-        :param fc_dims: width of the two fully connected layers before the output layer
-        :param bits: bit depth; the output layer has 2 ** bits classes in 'RAW' mode
-        :param pad: mel padding, forwarded to the UpsampleNetwork
-        :param upsample_factors: per-stage upsampling factors, forwarded to the UpsampleNetwork
-        :param feat_dims: number of mel channels
-        :param compute_dims: forwarded to the UpsampleNetwork
-        :param res_out_dims: size of the auxiliary features; split in four equal parts
-        :param res_blocks: forwarded to the UpsampleNetwork
-        :param hop_length: stored on the model
-        :param sample_rate: stored on the model
-        :param mode: 'RAW' or 'MOL' (30 output values); anything else raises RuntimeError
-        """
-        super().__init__()
-        self.mode = mode
-        self.pad = pad
-        if self.mode == 'RAW' :
-            self.n_classes = 2 ** bits
-        elif self.mode == 'MOL' :
-            self.n_classes = 30
-        else :
-            # Previously the exception was built but never raised, so a bad mode only
-            # surfaced later as a missing n_classes attribute.
-            raise RuntimeError("Unknown model mode value - ", self.mode)
-        self.rnn_dims = rnn_dims
-        self.aux_dims = res_out_dims // 4
-        self.hop_length = hop_length
-        self.sample_rate = sample_rate
-        self.upsample = UpsampleNetwork(feat_dims, upsample_factors, compute_dims, res_blocks, res_out_dims, pad)
-        self.I = nn.Linear(feat_dims + self.aux_dims + 1, rnn_dims)
-        self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True)
-        self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims, batch_first=True)
-        self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims)
-        self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims)
-        self.fc3 = nn.Linear(fc_dims, self.n_classes)
-        # Step counter stored as a non-trainable parameter
-        self.step = nn.Parameter(torch.zeros(1).long(), requires_grad=False)
-        self.num_params()
-    def forward(self, x, mels):
-        """
-        Teacher-forced pass over a whole sequence. Increments the step counter on every call.
-        :param x: input samples, one value per upsampled time step (a feature axis is added here)
-        :param mels: mel input for the upsampling network
-        :return: the output of the last linear layer, n_classes values per time step
-        """
-        self.step += 1
-        bsize = x.size(0)
-        # Create the initial hidden states on the input's device rather than guessing it from
-        # torch.cuda.is_available(), which breaks when a CPU model runs on a CUDA-capable host.
-        h1 = torch.zeros(1, bsize, self.rnn_dims, device=x.device)
-        h2 = torch.zeros(1, bsize, self.rnn_dims, device=x.device)
-        mels, aux = self.upsample(mels)
-        # Split the auxiliary features into four equal slices, one per consumer layer
-        aux_idx = [self.aux_dims * i for i in range(5)]
-        a1 = aux[:, :, aux_idx[0]:aux_idx[1]]
-        a2 = aux[:, :, aux_idx[1]:aux_idx[2]]
-        a3 = aux[:, :, aux_idx[2]:aux_idx[3]]
-        a4 = aux[:, :, aux_idx[3]:aux_idx[4]]
-        x = torch.cat([x.unsqueeze(-1), mels, a1], dim=2)
-        x = self.I(x)
-        res = x
-        x, _ = self.rnn1(x, h1)
-        x = x + res
-        res = x
-        x = torch.cat([x, a2], dim=2)
-        x, _ = self.rnn2(x, h2)
-        x = x + res
-        x = torch.cat([x, a3], dim=2)
-        x = F.relu(self.fc1(x))
-        x = torch.cat([x, a4], dim=2)
-        x = F.relu(self.fc2(x))
-        return self.fc3(x)
-    def generate(self, mels, batched, target, overlap, mu_law, progress_callback=None):
-        """
-        Autoregressively generate a waveform from mel spectrograms.
-        :param mels: mel tensor whose last axis is the frame axis; the output length is
-            (number of frames - 1) * hop_length samples
-        :param batched: if True, fold the upsampled sequence into overlapping segments that are
-            generated as one batch and cross-faded back together
-        :param target: forwarded to fold_with_overlap / xfade_and_unfold
-        :param overlap: forwarded to fold_with_overlap / xfade_and_unfold
-        :param mu_law: if True and the mode is 'RAW', mu-law decode the output
-        :param progress_callback: called every 100 steps as callback(i, seq_len, b_size, gen_rate);
-            defaults to self.gen_display
-        :return: the waveform as a float64 numpy array, with a linear fade-out at the end
-        NOTE: switches the module to eval mode while generating and always leaves it in
-        training mode afterwards.
-        """
-        mu_law = mu_law if self.mode == 'RAW' else False
-        progress_callback = progress_callback or self.gen_display
-        self.eval()
-        output = []
-        start = time.time()
-        # Step-wise GRU cells sharing the weights of the sequence-level GRUs
-        rnn1 = self.get_gru_cell(self.rnn1)
-        rnn2 = self.get_gru_cell(self.rnn2)
-        with torch.no_grad():
-            if torch.cuda.is_available():
-                mels = mels.cuda()
-            else:
-                mels = mels.cpu()
-            wave_len = (mels.size(-1) - 1) * self.hop_length
-            mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side='both')
-            mels, aux = self.upsample(mels.transpose(1, 2))
-            if batched:
-                mels = self.fold_with_overlap(mels, target, overlap)
-                aux = self.fold_with_overlap(aux, target, overlap)
-            b_size, seq_len, _ = mels.size()
-            if torch.cuda.is_available():
-                h1 = torch.zeros(b_size, self.rnn_dims).cuda()
-                h2 = torch.zeros(b_size, self.rnn_dims).cuda()
-                x = torch.zeros(b_size, 1).cuda()
-            else:
-                h1 = torch.zeros(b_size, self.rnn_dims).cpu()
-                h2 = torch.zeros(b_size, self.rnn_dims).cpu()
-                x = torch.zeros(b_size, 1).cpu()
-            d = self.aux_dims
-            aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)]
-            for i in range(seq_len):
-                m_t = mels[:, i, :]
-                a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split)
-                x = torch.cat([x, m_t, a1_t], dim=1)
-                x = self.I(x)
-                h1 = rnn1(x, h1)
-                x = x + h1
-                inp = torch.cat([x, a2_t], dim=1)
-                h2 = rnn2(inp, h2)
-                x = x + h2
-                x = torch.cat([x, a3_t], dim=1)
-                x = F.relu(self.fc1(x))
-                x = torch.cat([x, a4_t], dim=1)
-                x = F.relu(self.fc2(x))
-                logits = self.fc3(x)
-                if self.mode == 'MOL':
-                    sample = sample_from_discretized_mix_logistic(logits.unsqueeze(0).transpose(1, 2))
-                    output.append(sample.view(-1))
-                    if torch.cuda.is_available():
-                        # x = torch.FloatTensor([[sample]]).cuda()
-                        x = sample.transpose(0, 1).cuda()
-                    else:
-                        x = sample.transpose(0, 1)
-                elif self.mode == 'RAW' :
-                    posterior = F.softmax(logits, dim=1)
-                    distrib = torch.distributions.Categorical(posterior)
-                    # Map the sampled class index onto [-1, 1] before feeding it back in
-                    sample = 2 * distrib.sample().float() / (self.n_classes - 1.) - 1.
-                    output.append(sample)
-                    x = sample.unsqueeze(-1)
-                else:
-                    raise RuntimeError("Unknown model mode value - ", self.mode)
-                if i % 100 == 0:
-                    gen_rate = (i + 1) / (time.time() - start) * b_size / 1000
-                    progress_callback(i, seq_len, b_size, gen_rate)
-        output = torch.stack(output).transpose(0, 1)
-        output = output.cpu().numpy()
-        output = output.astype(np.float64)
-        if batched:
-            output = self.xfade_and_unfold(output, target, overlap)
-        else:
-            output = output[0]
-        if mu_law:
-            output = decode_mu_law(output, self.n_classes, False)
-        if hp.apply_preemphasis:
-            output = de_emphasis(output)
-        output = output[:wave_len]
-        # Fade-out at the end to avoid signal cutting out suddenly. The fade is capped at the
-        # output length: a fixed 20-hop ramp cannot be broadcast onto a shorter waveform.
-        fade_len = min(len(output), 20 * self.hop_length)
-        if fade_len > 0:
-            output[-fade_len:] *= np.linspace(1, 0, fade_len)
-        self.train()
-        return output
-    def gen_display(self, i, seq_len, b_size, gen_rate):
-        """Stream a one-line progress report for the generation loop.
-        Args:
-            i (int)          : index of the current generation step
-            seq_len (int)    : number of steps per batch entry
-            b_size (int)     : number of batch entries generated in parallel
-            gen_rate (float) : generation speed, shown with a "kHz" suffix
-        """
-        pbar = progbar(i, seq_len)
-        # Build and emit the message in one go; counts are scaled by the batch size
-        stream(f'| {pbar} {i*b_size}/{seq_len*b_size} | Batch Size: {b_size} | Gen Rate: {gen_rate:.1f}kHz | ')
-    def get_gru_cell(self, gru):
-        """Build an nn.GRUCell that shares the first-layer weights of `gru`.
-        Args:
-            gru (nn.GRU) : source layer providing input_size, hidden_size and
-                           the *_l0 weight and bias tensors
-        Return:
-            (nn.GRUCell) : cell whose parameter data is the GRU's layer-0 data
-        """
-        cell = nn.GRUCell(gru.input_size, gru.hidden_size)
-        # Rebind each cell parameter's storage to the matching layer-0 tensor
-        for name in ("weight_hh", "weight_ih", "bias_hh", "bias_ih"):
-            getattr(cell, name).data = getattr(gru, name + "_l0").data
-        return cell
-    def pad_tensor(self, x, pad, side='both'):
-        """Zero-pad a 3-D tensor along its second dimension.
-        Args:
-            x (tensor) : input whose size unpacks as (b, t, c)
-            pad (int)  : number of zero steps added on each padded side
-            side (str) : 'both', 'before' or 'after'
-        Return:
-            (tensor) : shape=(b, t + 2 * pad, c) for 'both', otherwise
-                       shape=(b, t + pad, c); allocated on the device of x
-        """
-        # NB - this is just a quick method i need right now
-        # i.e., it won't generalise to other shapes/dims
-        b, t, c = x.size()
-        total = t + 2 * pad if side == 'both' else t + pad
-        # Allocate next to the input instead of keying on CUDA availability, so
-        # a tensor kept on the CPU of a GPU machine is not moved behind the
-        # caller's back.
-        padded = torch.zeros(b, total, c, device=x.device)
-        if side == 'before' or side == 'both':
-            padded[:, pad:pad + t, :] = x
-        elif side == 'after':
-            padded[:, :t, :] = x
-        # Any other value of `side` leaves the buffer all zeros
-        return padded
-    def fold_with_overlap(self, x, target, overlap):
-        ''' Fold the tensor with overlap for quick batched inference.
-            Overlap will be used for crossfading in xfade_and_unfold()
-        Args:
-            x (tensor)    : Upsampled conditioning features.
-                            shape=(1, timesteps, features)
-            target (int)  : Target timesteps for each index of batch
-            overlap (int) : Timesteps for both xfade and rnn warmup
-        Return:
-            (tensor) : shape=(num_folds, target + 2 * overlap, features),
-                       allocated on the same device as x
-        Details:
-            x = [[h1, h2, ... hn]]
-            Where each h is a vector of conditioning features
-            Eg: target=2, overlap=1 with x.size(1)=10
-            folded = [[h1, h2, h3, h4],
-                      [h4, h5, h6, h7],
-                      [h7, h8, h9, h10]]
-        '''
-        _, total_len, features = x.size()
-        # Calculate variables needed
-        hop = target + overlap
-        fold_len = target + 2 * overlap
-        num_folds = (total_len - overlap) // hop
-        extended_len = num_folds * hop + overlap
-        remaining = total_len - extended_len
-        # Pad if some time steps poking out
-        if remaining != 0:
-            num_folds += 1
-            padding = fold_len - remaining
-            x = self.pad_tensor(x, padding, side='after')
-        # Read the device after the optional padding so the buffer always sits
-        # next to the tensor it is filled from, rather than on whatever device
-        # torch.cuda.is_available() suggests.
-        folded = torch.zeros(num_folds, fold_len, features, device=x.device)
-        # Get the values for the folded tensor
-        for i in range(num_folds):
-            start = i * hop
-            folded[i] = x[:, start:start + fold_len, :]
-        return folded
-    def xfade_and_unfold(self, y, target, overlap):
-        ''' Applies a crossfade and unfolds into a 1d array.
-        Args:
-            y (ndarray)   : Batched sequences of audio samples
-                            shape=(num_folds, target + 2 * overlap)
-                            dtype=np.float64
-                            NB: the overlap regions are scaled in place
-            target (int)  : Ignored; the value is re-derived from y.shape
-                            and overlap. Kept for interface compatibility.
-            overlap (int) : Timesteps for both xfade and rnn warmup
-        Return:
-            (ndarray) : audio samples in a 1d array
-                        shape=(total_len)
-                        dtype=np.float64
-        Details:
-            y = [[seq1],
-                 [seq2],
-                 [seq3]]
-            Apply a gain envelope at both ends of the sequences
-            y = [[seq1_in, seq1_target, seq1_out],
-                 [seq2_in, seq2_target, seq2_out],
-                 [seq3_in, seq3_target, seq3_out]]
-            Stagger and add up the groups of samples:
-            [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...]
-        '''
-        num_folds, length = y.shape
-        target = length - 2 * overlap
-        total_len = num_folds * (target + overlap) + overlap
-        # With no overlap there is nothing to crossfade. Skipping the envelope
-        # also avoids y[:, -0:], which selects the whole row and cannot be
-        # multiplied by an empty fade.
-        if overlap > 0:
-            # Need some silence for the rnn warmup
-            silence_len = overlap // 2
-            fade_len = overlap - silence_len
-            silence = np.zeros((silence_len), dtype=np.float64)
-            # Equal power crossfade
-            t = np.linspace(-1, 1, fade_len, dtype=np.float64)
-            fade_in = np.sqrt(0.5 * (1 + t))
-            fade_out = np.sqrt(0.5 * (1 - t))
-            # Concat the silence to the fades
-            fade_in = np.concatenate([silence, fade_in])
-            fade_out = np.concatenate([fade_out, silence])
-            # Apply the gain to the overlap samples
-            y[:, :overlap] *= fade_in
-            y[:, -overlap:] *= fade_out
-        unfolded = np.zeros((total_len), dtype=np.float64)
-        # Loop to add up all the samples
-        for i in range(num_folds):
-            start = i * (target + overlap)
-            end = start + target + 2 * overlap
-            unfolded[start:end] += y[i]
-        return unfolded
-    def get_step(self) :
-        """Return the value stored in `self.step` as a Python scalar."""
-        step_value = self.step.data
-        return step_value.item()
-    def checkpoint(self, model_dir, optimizer) :
-        """Save a backup in `model_dir`, named after the step count in thousands."""
-        fname = "checkpoint_%dk_steps.pt" % (self.get_step() // 1000)
-        self.save(model_dir.joinpath(fname), optimizer)
-    def log(self, path, msg) :
-        """Append `msg`, followed by a newline, to the file at `path`."""
-        with open(path, 'a') as log_file:
-            log_file.write(str(msg) + "\n")
-    def load(self, path, optimizer) :
-        """Restore model state, and optimizer state when present, from a checkpoint.
-        Args:
-            path      : checkpoint file, either a dict holding "model_state" and
-                        "optimizer_state" or a bare model state dict
-            optimizer : optimizer to restore; only used when the checkpoint
-                        contains "optimizer_state"
-        """
-        # Deserialize onto the CPU so a checkpoint written on a GPU machine can
-        # still be read on a CPU-only one; load_state_dict then copies the
-        # values into the tensors that already exist on the model's device.
-        # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
-        checkpoint = torch.load(path, map_location="cpu")
-        if "optimizer_state" in checkpoint:
-            self.load_state_dict(checkpoint["model_state"])
-            optimizer.load_state_dict(checkpoint["optimizer_state"])
-        else:
-            # Backwards compatibility
-            self.load_state_dict(checkpoint)
-    def save(self, path, optimizer) :
-        """Write the model and optimizer state dicts to `path` as one checkpoint."""
-        state = {}
-        state["model_state"] = self.state_dict()
-        state["optimizer_state"] = optimizer.state_dict()
-        torch.save(state, path)
-    def num_params(self, print_out=True):
-        """Count the trainable parameters of the model, in millions.
-        Args:
-            print_out (bool) : when True, also print the count
-        Return:
-            (float) : number of elements in parameters with requires_grad set,
-                      divided by one million
-        """
-        trainable = (p.numel() for p in self.parameters() if p.requires_grad)
-        parameters = sum(trainable) / 1_000_000
-        if print_out :
-            print('Trainable Parameters: %.3fM' % parameters)
-        # Returned so that print_out=False still yields a usable result
-        return parameters

vocoder/train.py DELETED Viewed

@@ -1,118 +0,0 @@
-import time
-from pathlib import Path
-import numpy as np
-import torch
-import torch.nn.functional as F
-from torch import optim
-from torch.utils.data import DataLoader
-import vocoder.hparams as hp
-from vocoder.display import stream, simple_table
-from vocoder.distribution import discretized_mix_logistic_loss
-from vocoder.gen_wavernn import gen_testset
-from vocoder.models.fatchord_version import WaveRNN
-from vocoder.vocoder_dataset import VocoderDataset, collate_vocoder
-def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_truth: bool, save_every: int,
-          backup_every: int, force_restart: bool):
-    """Train a WaveRNN vocoder, resuming from models_dir/run_id/vocoder.pt when it exists.
-    Args:
-        run_id        : name of the sub-directory of models_dir holding this run's files
-        syn_dir       : supplies the "audio" directory, plus "train.txt" and "mels"
-                        when ground_truth is set
-        voc_dir       : supplies "synthesized.txt" and "mels_gta" when ground_truth
-                        is not set
-        models_dir    : parent directory of the run directory
-        ground_truth  : selects which metadata file and mel directory are used
-        save_every    : overwrite vocoder.pt whenever the step count is a multiple
-                        of this value; 0 disables it
-        backup_every  : write a numbered checkpoint whenever the step count is a
-                        multiple of this value; 0 disables it
-        force_restart : ignore existing weights and start over; the fresh state is
-                        saved to vocoder.pt immediately
-    """
-    # Check to make sure the hop length is correctly factorised
-    assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length
-    # Instantiate the model
-    print("Initializing the model...")
-    model = WaveRNN(
-        rnn_dims=hp.voc_rnn_dims,
-        fc_dims=hp.voc_fc_dims,
-        bits=hp.bits,
-        pad=hp.voc_pad,
-        upsample_factors=hp.voc_upsample_factors,
-        feat_dims=hp.num_mels,
-        compute_dims=hp.voc_compute_dims,
-        res_out_dims=hp.voc_res_out_dims,
-        res_blocks=hp.voc_res_blocks,
-        hop_length=hp.hop_length,
-        sample_rate=hp.sample_rate,
-        mode=hp.voc_mode
-    )
-    if torch.cuda.is_available():
-        model = model.cuda()
-    # Initialize the optimizer
-    optimizer = optim.Adam(model.parameters())
-    for p in optimizer.param_groups:
-        p["lr"] = hp.voc_lr
-    # Cross entropy for the 'RAW' mode, discretized mixture-of-logistics loss otherwise
-    loss_func = F.cross_entropy if model.mode == "RAW" else discretized_mix_logistic_loss
-    # Load the weights
-    model_dir = models_dir / run_id
-    # NOTE(review): mkdir is called without parents=True, so models_dir must already exist
-    model_dir.mkdir(exist_ok=True)
-    weights_fpath = model_dir / "vocoder.pt"
-    if force_restart or not weights_fpath.exists():
-        print("\nStarting the training of WaveRNN from scratch\n")
-        model.save(weights_fpath, optimizer)
-    else:
-        print("\nLoading weights at %s" % weights_fpath)
-        model.load(weights_fpath, optimizer)
-        print("WaveRNN weights loaded from step %d" % model.step)
-    # Initialize the dataset
-    metadata_fpath = syn_dir.joinpath("train.txt") if ground_truth else \
-        voc_dir.joinpath("synthesized.txt")
-    mel_dir = syn_dir.joinpath("mels") if ground_truth else voc_dir.joinpath("mels_gta")
-    wav_dir = syn_dir.joinpath("audio")
-    dataset = VocoderDataset(metadata_fpath, mel_dir, wav_dir)
-    # The test loader draws single, uncollated items from the same dataset
-    test_loader = DataLoader(dataset, batch_size=1, shuffle=True)
-    # Begin the training
-    simple_table([('Batch size', hp.voc_batch_size),
-                  ('LR', hp.voc_lr),
-                  ('Sequence Len', hp.voc_seq_len)])
-    # Fixed schedule: epochs 1 through 349
-    for epoch in range(1, 350):
-        # A new shuffled loader is built at the start of every epoch
-        data_loader = DataLoader(dataset, hp.voc_batch_size, shuffle=True, num_workers=2, collate_fn=collate_vocoder)
-        start = time.time()
-        running_loss = 0.
-        for i, (x, y, m) in enumerate(data_loader, 1):
-            if torch.cuda.is_available():
-                x, m, y = x.cuda(), m.cuda(), y.cuda()
-            # Forward pass
-            y_hat = model(x, m)
-            if model.mode == 'RAW':
-                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
-            elif model.mode == 'MOL':
-                y = y.float()
-            # The target gets a trailing singleton axis in both modes
-            y = y.unsqueeze(-1)
-            # Backward pass
-            loss = loss_func(y_hat, y)
-            optimizer.zero_grad()
-            loss.backward()
-            optimizer.step()
-            running_loss += loss.item()
-            # Running statistics over the current epoch only
-            speed = i / (time.time() - start)
-            avg_loss = running_loss / i
-            # NOTE(review): the step counter is presumably advanced inside the
-            # model's forward pass - it is never incremented here; confirm
-            step = model.get_step()
-            k = step // 1000
-            if backup_every != 0 and step % backup_every == 0 :
-                model.checkpoint(model_dir, optimizer)
-            if save_every != 0 and step % save_every == 0 :
-                model.save(weights_fpath, optimizer)
-            msg = f"| Epoch: {epoch} ({i}/{len(data_loader)}) | " \
-                f"Loss: {avg_loss:.4f} | {speed:.1f} " \
-                f"steps/s | Step: {k}k | "
-            stream(msg)
-        # Generate test samples at the end of every epoch
-        gen_testset(model, test_loader, hp.voc_gen_at_checkpoint, hp.voc_gen_batched,
-                    hp.voc_target, hp.voc_overlap, model_dir)
-        print("")

vocoder/vocoder_dataset.py DELETED Viewed

@@ -1,84 +0,0 @@
-from torch.utils.data import Dataset
-from pathlib import Path
-from vocoder import audio
-import vocoder.hparams as hp
-import numpy as np
-import torch
-class VocoderDataset(Dataset):
-    """Dataset of (mel spectrogram, quantized waveform) pairs listed in a metadata file.
-    Each metadata line is split on "|": field 0 is the wav file name, field 1 the
-    mel file name and field 4 an integer; lines where that integer is 0 are skipped.
-    """
-    def __init__(self, metadata_fpath: Path, mel_dir: Path, wav_dir: Path):
-        """Collect the (mel path, wav path) pairs described by `metadata_fpath`."""
-        print("Using inputs from:\n\t%s\n\t%s\n\t%s" % (metadata_fpath, mel_dir, wav_dir))
-        with metadata_fpath.open("r") as metadata_file:
-            # Blank lines (e.g. a stray newline at the end of the file) carry no sample
-            metadata = [line.split("|") for line in metadata_file if line.strip()]
-        # A single pass keeps the mel and wav path of each sample paired by construction
-        self.samples_fpaths = [
-            (mel_dir.joinpath(x[1]), wav_dir.joinpath(x[0]))
-            for x in metadata if int(x[4])
-        ]
-        print("Found %d samples" % len(self.samples_fpaths))
-    def __getitem__(self, index):
-        """Return (mel as float32, quantized wav as int64) for the sample at `index`.
-        Raises:
-            RuntimeError : if hp.voc_mode is neither 'RAW' nor 'MOL'
-        """
-        mel_path, wav_path = self.samples_fpaths[index]
-        # Load the mel spectrogram and adjust its range to [-1, 1]
-        mel = np.load(mel_path).T.astype(np.float32) / hp.mel_max_abs_value
-        # Load the wav
-        wav = np.load(wav_path)
-        if hp.apply_preemphasis:
-            wav = audio.pre_emphasis(wav)
-        wav = np.clip(wav, -1, 1)
-        # Fix for missing padding   # TODO: settle on whether this is any useful
-        r_pad =  (len(wav) // hp.hop_length + 1) * hp.hop_length - len(wav)
-        wav = np.pad(wav, (0, r_pad), mode='constant')
-        # Trim the wav to exactly one hop per mel frame
-        assert len(wav) >= mel.shape[1] * hp.hop_length
-        wav = wav[:mel.shape[1] * hp.hop_length]
-        assert len(wav) % hp.hop_length == 0
-        # Quantize the wav
-        if hp.voc_mode == 'RAW':
-            if hp.mu_law:
-                quant = audio.encode_mu_law(wav, mu=2 ** hp.bits)
-            else:
-                quant = audio.float_2_label(wav, bits=hp.bits)
-        elif hp.voc_mode == 'MOL':
-            quant = audio.float_2_label(wav, bits=16)
-        else:
-            # Fail with a clear message instead of an unbound `quant` below
-            raise RuntimeError("Unknown model mode value - %s" % hp.voc_mode)
-        return mel.astype(np.float32), quant.astype(np.int64)
-    def __len__(self):
-        """Return the number of samples kept from the metadata file."""
-        return len(self.samples_fpaths)
-def collate_vocoder(batch):
-    """Cut a random training window out of every sample and batch the results.
-    Args:
-        batch : sequence of items where item[0] is a mel array sliced on its
-                last axis and item[1] is a 1-D array of quantized labels
-    Return:
-        (x, y, mels) : x holds the first hp.voc_seq_len labels of each window
-                       converted to floats, y the same window shifted by one
-                       step (converted to floats only in 'MOL' mode), and
-                       mels the matching mel windows as a float32 tensor
-    """
-    mel_win = hp.voc_seq_len // hp.hop_length + 2 * hp.voc_pad
-    # Frames that must stay free to the right of any window start
-    margin = mel_win + 2 * hp.voc_pad + 2
-    max_offsets = [item[0].shape[-1] - margin for item in batch]
-    mel_offsets = [np.random.randint(0, limit) for limit in max_offsets]
-    mel_windows, label_windows = [], []
-    for item, mel_start in zip(batch, mel_offsets):
-        # The label window starts hp.voc_pad frames into the mel window
-        sig_start = (mel_start + hp.voc_pad) * hp.hop_length
-        mel_windows.append(item[0][:, mel_start:mel_start + mel_win])
-        label_windows.append(item[1][sig_start:sig_start + hp.voc_seq_len + 1])
-    mels = torch.tensor(np.stack(mel_windows).astype(np.float32))
-    labels = torch.tensor(np.stack(label_windows).astype(np.int64)).long()
-    # Each window holds one label more than voc_seq_len so x and y can be offset by one
-    x = labels[:, :hp.voc_seq_len]
-    y = labels[:, 1:]
-    if hp.voc_mode == 'MOL':
-        x = audio.label_2_float(x.float(), 16)
-        y = audio.label_2_float(y.float(), 16)
-    else:
-        x = audio.label_2_float(x.float(), hp.bits)
-    return x, y, mels