Delete synthesizer
- synthesizer/LICENSE.txt +0 -24
- synthesizer/__init__.py +0 -1
- synthesizer/audio.py +0 -206
- synthesizer/hparams.py +0 -92
- synthesizer/inference.py +0 -165
- synthesizer/models/tacotron.py +0 -519
- synthesizer/preprocess.py +0 -258
- synthesizer/synthesize.py +0 -92
- synthesizer/synthesizer_dataset.py +0 -92
- synthesizer/train.py +0 -258
- synthesizer/utils/__init__.py +0 -45
- synthesizer/utils/_cmudict.py +0 -62
- synthesizer/utils/cleaners.py +0 -88
- synthesizer/utils/numbers.py +0 -69
- synthesizer/utils/plot.py +0 -82
- synthesizer/utils/symbols.py +0 -17
- synthesizer/utils/text.py +0 -75
synthesizer/LICENSE.txt
DELETED
@@ -1,24 +0,0 @@
-MIT License
-
-Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah)
-Original work Copyright (c) 2019 fatchord (https://github.com/fatchord)
-Modified work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ)
-Modified work Copyright (c) 2020 blue-fish (https://github.com/blue-fish)
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
synthesizer/__init__.py
DELETED
@@ -1 +0,0 @@
-#
synthesizer/audio.py
DELETED
@@ -1,206 +0,0 @@
-import librosa
-import librosa.filters
-import numpy as np
-from scipy import signal
-from scipy.io import wavfile
-import soundfile as sf
-
-
-def load_wav(path, sr):
-    return librosa.core.load(path, sr=sr)[0]
-
-def save_wav(wav, path, sr):
-    wav *= 32767 / max(0.01, np.max(np.abs(wav)))
-    #proposed by @dsmiller
-    wavfile.write(path, sr, wav.astype(np.int16))
-
-def save_wavenet_wav(wav, path, sr):
-    sf.write(path, wav.astype(np.float32), sr)
-
-def preemphasis(wav, k, preemphasize=True):
-    if preemphasize:
-        return signal.lfilter([1, -k], [1], wav)
-    return wav
-
-def inv_preemphasis(wav, k, inv_preemphasize=True):
-    if inv_preemphasize:
-        return signal.lfilter([1], [1, -k], wav)
-    return wav
-
-#From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py
-def start_and_end_indices(quantized, silence_threshold=2):
-    for start in range(quantized.size):
-        if abs(quantized[start] - 127) > silence_threshold:
-            break
-    for end in range(quantized.size - 1, 1, -1):
-        if abs(quantized[end] - 127) > silence_threshold:
-            break
-
-    assert abs(quantized[start] - 127) > silence_threshold
-    assert abs(quantized[end] - 127) > silence_threshold
-
-    return start, end
-
-def get_hop_size(hparams):
-    hop_size = hparams.hop_size
-    if hop_size is None:
-        assert hparams.frame_shift_ms is not None
-        hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
-    return hop_size
-
-def linearspectrogram(wav, hparams):
-    D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
-    S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db
-
-    if hparams.signal_normalization:
-        return _normalize(S, hparams)
-    return S
-
-def melspectrogram(wav, hparams):
-    D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
-    S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db
-
-    if hparams.signal_normalization:
-        return _normalize(S, hparams)
-    return S
-
-def inv_linear_spectrogram(linear_spectrogram, hparams):
-    """Converts linear spectrogram to waveform using librosa"""
-    if hparams.signal_normalization:
-        D = _denormalize(linear_spectrogram, hparams)
-    else:
-        D = linear_spectrogram
-
-    S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear
-
-    if hparams.use_lws:
-        processor = _lws_processor(hparams)
-        D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
-        y = processor.istft(D).astype(np.float32)
-        return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
-    else:
-        return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
-
-def inv_mel_spectrogram(mel_spectrogram, hparams):
-    """Converts mel spectrogram to waveform using librosa"""
-    if hparams.signal_normalization:
-        D = _denormalize(mel_spectrogram, hparams)
-    else:
-        D = mel_spectrogram
-
-    S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db), hparams)  # Convert back to linear
-
-    if hparams.use_lws:
-        processor = _lws_processor(hparams)
-        D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
-        y = processor.istft(D).astype(np.float32)
-        return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
-    else:
-        return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
-
-def _lws_processor(hparams):
-    import lws
-    return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech")
-
-def _griffin_lim(S, hparams):
-    """librosa implementation of Griffin-Lim
-    Based on https://github.com/librosa/librosa/issues/434
-    """
-    angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
-    S_complex = np.abs(S).astype(np.complex)
-    y = _istft(S_complex * angles, hparams)
-    for i in range(hparams.griffin_lim_iters):
-        angles = np.exp(1j * np.angle(_stft(y, hparams)))
-        y = _istft(S_complex * angles, hparams)
-    return y
-
-def _stft(y, hparams):
-    if hparams.use_lws:
-        return _lws_processor(hparams).stft(y).T
-    else:
-        return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size)
-
-def _istft(y, hparams):
-    return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size)
-
-##########################################################
-#Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!)
-def num_frames(length, fsize, fshift):
-    """Compute number of time frames of spectrogram
-    """
-    pad = (fsize - fshift)
-    if length % fshift == 0:
-        M = (length + pad * 2 - fsize) // fshift + 1
-    else:
-        M = (length + pad * 2 - fsize) // fshift + 2
-    return M
-
-
-def pad_lr(x, fsize, fshift):
-    """Compute left and right padding
-    """
-    M = num_frames(len(x), fsize, fshift)
-    pad = (fsize - fshift)
-    T = len(x) + 2 * pad
-    r = (M - 1) * fshift + fsize - T
-    return pad, pad + r
-##########################################################
-#Librosa correct padding
-def librosa_pad_lr(x, fsize, fshift):
-    return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0]
-
-# Conversions
-_mel_basis = None
-_inv_mel_basis = None
-
-def _linear_to_mel(spectogram, hparams):
-    global _mel_basis
-    if _mel_basis is None:
-        _mel_basis = _build_mel_basis(hparams)
-    return np.dot(_mel_basis, spectogram)
-
-def _mel_to_linear(mel_spectrogram, hparams):
-    global _inv_mel_basis
-    if _inv_mel_basis is None:
-        _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams))
-    return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram))
-
-def _build_mel_basis(hparams):
-    assert hparams.fmax <= hparams.sample_rate // 2
-    return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels,
-                               fmin=hparams.fmin, fmax=hparams.fmax)
-
-def _amp_to_db(x, hparams):
-    min_level = np.exp(hparams.min_level_db / 20 * np.log(10))
-    return 20 * np.log10(np.maximum(min_level, x))
-
-def _db_to_amp(x):
-    return np.power(10.0, (x) * 0.05)
-
-def _normalize(S, hparams):
-    if hparams.allow_clipping_in_normalization:
-        if hparams.symmetric_mels:
-            return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value,
-                           -hparams.max_abs_value, hparams.max_abs_value)
-        else:
-            return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value)
-
-    assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0
-    if hparams.symmetric_mels:
-        return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value
-    else:
-        return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db))
-
-def _denormalize(D, hparams):
-    if hparams.allow_clipping_in_normalization:
-        if hparams.symmetric_mels:
-            return (((np.clip(D, -hparams.max_abs_value,
-                              hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value))
-                    + hparams.min_level_db)
-        else:
-            return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
-
-    if hparams.symmetric_mels:
-        return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db)
-    else:
-        return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
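For reference, a minimal round trip through the deleted audio helpers (illustrative only, not part of this commit; the file paths are placeholders and hparams is the object defined in synthesizer/hparams.py below):

from synthesizer import audio
from synthesizer.hparams import hparams

# Load a 16 kHz waveform, compute the normalized mel spectrogram, then invert it with Griffin-Lim.
wav = audio.load_wav("sample.wav", sr=hparams.sample_rate)        # placeholder path
mel = audio.melspectrogram(wav, hparams)                          # shape (num_mels, frames), values clipped to [-4, 4]
rec = audio.inv_mel_spectrogram(mel, hparams)                     # Griffin-Lim reconstruction of the waveform
audio.save_wav(rec, "reconstructed.wav", sr=hparams.sample_rate)  # placeholder path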
synthesizer/hparams.py
DELETED
@@ -1,92 +0,0 @@
-import ast
-import pprint
-
-class HParams(object):
-    def __init__(self, **kwargs): self.__dict__.update(kwargs)
-    def __setitem__(self, key, value): setattr(self, key, value)
-    def __getitem__(self, key): return getattr(self, key)
-    def __repr__(self): return pprint.pformat(self.__dict__)
-
-    def parse(self, string):
-        # Overrides hparams from a comma-separated string of name=value pairs
-        if len(string) > 0:
-            overrides = [s.split("=") for s in string.split(",")]
-            keys, values = zip(*overrides)
-            keys = list(map(str.strip, keys))
-            values = list(map(str.strip, values))
-            for k in keys:
-                self.__dict__[k] = ast.literal_eval(values[keys.index(k)])
-        return self
-
-hparams = HParams(
-    ### Signal Processing (used in both synthesizer and vocoder)
-    sample_rate = 16000,
-    n_fft = 800,
-    num_mels = 80,
-    hop_size = 200,        # Tacotron uses 12.5 ms frame shift (set to sample_rate * 0.0125)
-    win_size = 800,        # Tacotron uses 50 ms frame length (set to sample_rate * 0.050)
-    fmin = 55,
-    min_level_db = -100,
-    ref_level_db = 20,
-    max_abs_value = 4.,    # Gradient explodes if too big, premature convergence if too small.
-    preemphasis = 0.97,    # Filter coefficient to use if preemphasize is True
-    preemphasize = True,
-
-    ### Tacotron Text-to-Speech (TTS)
-    tts_embed_dims = 512,  # Embedding dimension for the graphemes/phoneme inputs
-    tts_encoder_dims = 256,
-    tts_decoder_dims = 128,
-    tts_postnet_dims = 512,
-    tts_encoder_K = 5,
-    tts_lstm_dims = 1024,
-    tts_postnet_K = 5,
-    tts_num_highways = 4,
-    tts_dropout = 0.5,
-    tts_cleaner_names = ["english_cleaners"],
-    tts_stop_threshold = -3.4,  # Value below which audio generation ends.
-                                # For example, for a range of [-4, 4], this
-                                # will terminate the sequence at the first
-                                # frame that has all values < -3.4
-
-    ### Tacotron Training
-    tts_schedule = [(2, 1e-3, 20_000, 12),    # Progressive training schedule
-                    (2, 5e-4, 40_000, 12),    # (r, lr, step, batch_size)
-                    (2, 2e-4, 80_000, 12),    #
-                    (2, 1e-4, 160_000, 12),   # r = reduction factor (# of mel frames
-                    (2, 3e-5, 320_000, 12),   #     synthesized for each decoder iteration)
-                    (2, 1e-5, 640_000, 12)],  # lr = learning rate
-
-    tts_clip_grad_norm = 1.0,   # clips the gradient norm to prevent explosion - set to None if not needed
-    tts_eval_interval = 500,    # Number of steps between model evaluation (sample generation)
-                                # Set to -1 to generate after completing epoch, or 0 to disable
-
-    tts_eval_num_samples = 1,   # Makes this number of samples
-
-    ### Data Preprocessing
-    max_mel_frames = 900,
-    rescale = True,
-    rescaling_max = 0.9,
-    synthesis_batch_size = 16,  # For vocoder preprocessing and inference.
-
-    ### Mel Visualization and Griffin-Lim
-    signal_normalization = True,
-    power = 1.5,
-    griffin_lim_iters = 60,
-
-    ### Audio processing options
-    fmax = 7600,                             # Should not exceed (sample_rate // 2)
-    allow_clipping_in_normalization = True,  # Used when signal_normalization = True
-    clip_mels_length = True,                 # If true, discards samples exceeding max_mel_frames
-    use_lws = False,                         # "Fast spectrogram phase recovery using local weighted sums"
-    symmetric_mels = True,                   # Sets mel range to [-max_abs_value, max_abs_value] if True,
-                                             #               and [0, max_abs_value] if False
-    trim_silence = True,                     # Use with sample_rate of 16000 for best results
-
-    ### SV2TTS
-    speaker_embedding_size = 256,            # Dimension for the speaker embedding
-    silence_min_duration_split = 0.4,        # Duration in seconds of a silence for an utterance to be split
-    utterance_min_duration = 1.6,            # Duration in seconds below which utterances are discarded
-    )
-
-def hparams_debug_string():
-    return str(hparams)
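For reference, a short sketch of how HParams.parse was meant to override values from a comma-separated string (illustrative only, not part of this commit):

from synthesizer.hparams import hparams

# parse() splits "name=value" pairs on commas and applies ast.literal_eval to each value.
hparams.parse("sample_rate=22050, n_fft=1024, use_lws=True")
print(hparams.sample_rate)   # 22050
print(hparams["use_lws"])    # True; __getitem__ forwards to getattr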
synthesizer/inference.py
DELETED
@@ -1,165 +0,0 @@
-import torch
-from synthesizer import audio
-from synthesizer.hparams import hparams
-from synthesizer.models.tacotron import Tacotron
-from synthesizer.utils.symbols import symbols
-from synthesizer.utils.text import text_to_sequence
-from vocoder.display import simple_table
-from pathlib import Path
-from typing import Union, List
-import numpy as np
-import librosa
-
-
-class Synthesizer:
-    sample_rate = hparams.sample_rate
-    hparams = hparams
-
-    def __init__(self, model_fpath: Path, verbose=True):
-        """
-        The model isn't instantiated and loaded in memory until needed or until load() is called.
-
-        :param model_fpath: path to the trained model file
-        :param verbose: if False, prints less information when using the model
-        """
-        self.model_fpath = model_fpath
-        self.verbose = verbose
-
-        # Check for GPU
-        if torch.cuda.is_available():
-            self.device = torch.device("cuda")
-        else:
-            self.device = torch.device("cpu")
-        if self.verbose:
-            print("Synthesizer using device:", self.device)
-
-        # Tacotron model will be instantiated later on first use.
-        self._model = None
-
-    def is_loaded(self):
-        """
-        Whether the model is loaded in memory.
-        """
-        return self._model is not None
-
-    def load(self):
-        """
-        Instantiates and loads the model given the weights file that was passed in the constructor.
-        """
-        self._model = Tacotron(embed_dims=hparams.tts_embed_dims,
-                               num_chars=len(symbols),
-                               encoder_dims=hparams.tts_encoder_dims,
-                               decoder_dims=hparams.tts_decoder_dims,
-                               n_mels=hparams.num_mels,
-                               fft_bins=hparams.num_mels,
-                               postnet_dims=hparams.tts_postnet_dims,
-                               encoder_K=hparams.tts_encoder_K,
-                               lstm_dims=hparams.tts_lstm_dims,
-                               postnet_K=hparams.tts_postnet_K,
-                               num_highways=hparams.tts_num_highways,
-                               dropout=hparams.tts_dropout,
-                               stop_threshold=hparams.tts_stop_threshold,
-                               speaker_embedding_size=hparams.speaker_embedding_size).to(self.device)
-
-        self._model.load(self.model_fpath)
-        self._model.eval()
-
-        if self.verbose:
-            print("Loaded synthesizer \"%s\" trained to step %d" % (self.model_fpath.name, self._model.state_dict()["step"]))
-
-    def synthesize_spectrograms(self, texts: List[str],
-                                embeddings: Union[np.ndarray, List[np.ndarray]],
-                                return_alignments=False):
-        """
-        Synthesizes mel spectrograms from texts and speaker embeddings.
-
-        :param texts: a list of N text prompts to be synthesized
-        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
-        :param return_alignments: if True, a matrix representing the alignments between the
-        characters
-        and each decoder output step will be returned for each spectrogram
-        :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
-        sequence length of spectrogram i, and possibly the alignments.
-        """
-        # Load the model on the first request.
-        if not self.is_loaded():
-            self.load()
-
-        # Preprocess text inputs
-        inputs = [text_to_sequence(text.strip(), hparams.tts_cleaner_names) for text in texts]
-        if not isinstance(embeddings, list):
-            embeddings = [embeddings]
-
-        # Batch inputs
-        batched_inputs = [inputs[i:i+hparams.synthesis_batch_size]
-                          for i in range(0, len(inputs), hparams.synthesis_batch_size)]
-        batched_embeds = [embeddings[i:i+hparams.synthesis_batch_size]
-                          for i in range(0, len(embeddings), hparams.synthesis_batch_size)]
-
-        specs = []
-        for i, batch in enumerate(batched_inputs, 1):
-            if self.verbose:
-                print(f"\n| Generating {i}/{len(batched_inputs)}")
-
-            # Pad texts so they are all the same length
-            text_lens = [len(text) for text in batch]
-            max_text_len = max(text_lens)
-            chars = [pad1d(text, max_text_len) for text in batch]
-            chars = np.stack(chars)
-
-            # Stack speaker embeddings into 2D array for batch processing
-            speaker_embeds = np.stack(batched_embeds[i-1])
-
-            # Convert to tensor
-            chars = torch.tensor(chars).long().to(self.device)
-            speaker_embeddings = torch.tensor(speaker_embeds).float().to(self.device)
-
-            # Inference
-            _, mels, alignments = self._model.generate(chars, speaker_embeddings)
-            mels = mels.detach().cpu().numpy()
-            for m in mels:
-                # Trim silence from end of each spectrogram
-                while np.max(m[:, -1]) < hparams.tts_stop_threshold:
-                    m = m[:, :-1]
-                specs.append(m)
-
-        if self.verbose:
-            print("\n\nDone.\n")
-        return (specs, alignments) if return_alignments else specs
-
-    @staticmethod
-    def load_preprocess_wav(fpath):
-        """
-        Loads and preprocesses an audio file under the same conditions the audio files were used to
-        train the synthesizer.
-        """
-        wav = librosa.load(str(fpath), hparams.sample_rate)[0]
-        if hparams.rescale:
-            wav = wav / np.abs(wav).max() * hparams.rescaling_max
-        return wav
-
-    @staticmethod
-    def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
-        """
-        Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that
-        were fed to the synthesizer when training.
-        """
-        if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
-            wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
-        else:
-            wav = fpath_or_wav
-
-        mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
-        return mel_spectrogram
-
-    @staticmethod
-    def griffin_lim(mel):
-        """
-        Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built
-        with the same parameters present in hparams.py.
-        """
-        return audio.inv_mel_spectrogram(mel, hparams)
-
-
-def pad1d(x, max_len, pad_value=0):
-    return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad_value)
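For reference, the typical call pattern for the deleted Synthesizer class (illustrative only, not part of this commit; the checkpoint path is a placeholder and the zero vector merely stands in for a real 256-dimensional speaker embedding):

import numpy as np
from pathlib import Path
from synthesizer.inference import Synthesizer

synthesizer = Synthesizer(Path("synthesizer.pt"))   # placeholder checkpoint path; the model loads lazily
embed = np.zeros(256, dtype=np.float32)             # stand-in for a real speaker embedding (speaker_embedding_size)
specs = synthesizer.synthesize_spectrograms(["Hello world."], [embed])
wav = Synthesizer.griffin_lim(specs[0])             # invert the (80, frames) mel back to a waveform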
synthesizer/models/tacotron.py
DELETED
@@ -1,519 +0,0 @@
-import os
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from pathlib import Path
-from typing import Union
-
-
-class HighwayNetwork(nn.Module):
-    def __init__(self, size):
-        super().__init__()
-        self.W1 = nn.Linear(size, size)
-        self.W2 = nn.Linear(size, size)
-        self.W1.bias.data.fill_(0.)
-
-    def forward(self, x):
-        x1 = self.W1(x)
-        x2 = self.W2(x)
-        g = torch.sigmoid(x2)
-        y = g * F.relu(x1) + (1. - g) * x
-        return y
-
-
-class Encoder(nn.Module):
-    def __init__(self, embed_dims, num_chars, encoder_dims, K, num_highways, dropout):
-        super().__init__()
-        prenet_dims = (encoder_dims, encoder_dims)
-        cbhg_channels = encoder_dims
-        self.embedding = nn.Embedding(num_chars, embed_dims)
-        self.pre_net = PreNet(embed_dims, fc1_dims=prenet_dims[0], fc2_dims=prenet_dims[1],
-                              dropout=dropout)
-        self.cbhg = CBHG(K=K, in_channels=cbhg_channels, channels=cbhg_channels,
-                         proj_channels=[cbhg_channels, cbhg_channels],
-                         num_highways=num_highways)
-
-    def forward(self, x, speaker_embedding=None):
-        x = self.embedding(x)
-        x = self.pre_net(x)
-        x.transpose_(1, 2)
-        x = self.cbhg(x)
-        if speaker_embedding is not None:
-            x = self.add_speaker_embedding(x, speaker_embedding)
-        return x
-
-    def add_speaker_embedding(self, x, speaker_embedding):
-        # SV2TTS
-        # The input x is the encoder output and is a 3D tensor with size (batch_size, num_chars, tts_embed_dims)
-        # When training, speaker_embedding is also a 2D tensor with size (batch_size, speaker_embedding_size)
-        # (for inference, speaker_embedding is a 1D tensor with size (speaker_embedding_size))
-        # This concats the speaker embedding for each char in the encoder output
-
-        # Save the dimensions as human-readable names
-        batch_size = x.size()[0]
-        num_chars = x.size()[1]
-
-        if speaker_embedding.dim() == 1:
-            idx = 0
-        else:
-            idx = 1
-
-        # Start by making a copy of each speaker embedding to match the input text length
-        # The output of this has size (batch_size, num_chars * tts_embed_dims)
-        speaker_embedding_size = speaker_embedding.size()[idx]
-        e = speaker_embedding.repeat_interleave(num_chars, dim=idx)
-
-        # Reshape it and transpose
-        e = e.reshape(batch_size, speaker_embedding_size, num_chars)
-        e = e.transpose(1, 2)
-
-        # Concatenate the tiled speaker embedding with the encoder output
-        x = torch.cat((x, e), 2)
-        return x
-
-
-class BatchNormConv(nn.Module):
-    def __init__(self, in_channels, out_channels, kernel, relu=True):
-        super().__init__()
-        self.conv = nn.Conv1d(in_channels, out_channels, kernel, stride=1, padding=kernel // 2, bias=False)
-        self.bnorm = nn.BatchNorm1d(out_channels)
-        self.relu = relu
-
-    def forward(self, x):
-        x = self.conv(x)
-        x = F.relu(x) if self.relu is True else x
-        return self.bnorm(x)
-
-
-class CBHG(nn.Module):
-    def __init__(self, K, in_channels, channels, proj_channels, num_highways):
-        super().__init__()
-
-        # List of all rnns to call `flatten_parameters()` on
-        self._to_flatten = []
-
-        self.bank_kernels = [i for i in range(1, K + 1)]
-        self.conv1d_bank = nn.ModuleList()
-        for k in self.bank_kernels:
-            conv = BatchNormConv(in_channels, channels, k)
-            self.conv1d_bank.append(conv)
-
-        self.maxpool = nn.MaxPool1d(kernel_size=2, stride=1, padding=1)
-
-        self.conv_project1 = BatchNormConv(len(self.bank_kernels) * channels, proj_channels[0], 3)
-        self.conv_project2 = BatchNormConv(proj_channels[0], proj_channels[1], 3, relu=False)
-
-        # Fix the highway input if necessary
-        if proj_channels[-1] != channels:
-            self.highway_mismatch = True
-            self.pre_highway = nn.Linear(proj_channels[-1], channels, bias=False)
-        else:
-            self.highway_mismatch = False
-
-        self.highways = nn.ModuleList()
-        for i in range(num_highways):
-            hn = HighwayNetwork(channels)
-            self.highways.append(hn)
-
-        self.rnn = nn.GRU(channels, channels // 2, batch_first=True, bidirectional=True)
-        self._to_flatten.append(self.rnn)
-
-        # Avoid fragmentation of RNN parameters and associated warning
-        self._flatten_parameters()
-
-    def forward(self, x):
-        # Although we `_flatten_parameters()` on init, when using DataParallel
-        # the model gets replicated, making it no longer guaranteed that the
-        # weights are contiguous in GPU memory. Hence, we must call it again
-        self._flatten_parameters()
-
-        # Save these for later
-        residual = x
-        seq_len = x.size(-1)
-        conv_bank = []
-
-        # Convolution Bank
-        for conv in self.conv1d_bank:
-            c = conv(x) # Convolution
-            conv_bank.append(c[:, :, :seq_len])
-
-        # Stack along the channel axis
-        conv_bank = torch.cat(conv_bank, dim=1)
-
-        # dump the last padding to fit residual
-        x = self.maxpool(conv_bank)[:, :, :seq_len]
-
-        # Conv1d projections
-        x = self.conv_project1(x)
-        x = self.conv_project2(x)
-
-        # Residual Connect
-        x = x + residual
-
-        # Through the highways
-        x = x.transpose(1, 2)
-        if self.highway_mismatch is True:
-            x = self.pre_highway(x)
-        for h in self.highways: x = h(x)
-
-        # And then the RNN
-        x, _ = self.rnn(x)
-        return x
-
-    def _flatten_parameters(self):
-        """Calls `flatten_parameters` on all the rnns used by the WaveRNN. Used
-        to improve efficiency and avoid PyTorch yelling at us."""
-        [m.flatten_parameters() for m in self._to_flatten]
-
-class PreNet(nn.Module):
-    def __init__(self, in_dims, fc1_dims=256, fc2_dims=128, dropout=0.5):
-        super().__init__()
-        self.fc1 = nn.Linear(in_dims, fc1_dims)
-        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
-        self.p = dropout
-
-    def forward(self, x):
-        x = self.fc1(x)
-        x = F.relu(x)
-        x = F.dropout(x, self.p, training=True)
-        x = self.fc2(x)
-        x = F.relu(x)
-        x = F.dropout(x, self.p, training=True)
-        return x
-
-
-class Attention(nn.Module):
-    def __init__(self, attn_dims):
-        super().__init__()
-        self.W = nn.Linear(attn_dims, attn_dims, bias=False)
-        self.v = nn.Linear(attn_dims, 1, bias=False)
-
-    def forward(self, encoder_seq_proj, query, t):
-
-        # print(encoder_seq_proj.shape)
-        # Transform the query vector
-        query_proj = self.W(query).unsqueeze(1)
-
-        # Compute the scores
-        u = self.v(torch.tanh(encoder_seq_proj + query_proj))
-        scores = F.softmax(u, dim=1)
-
-        return scores.transpose(1, 2)
-
-
-class LSA(nn.Module):
-    def __init__(self, attn_dim, kernel_size=31, filters=32):
-        super().__init__()
-        self.conv = nn.Conv1d(1, filters, padding=(kernel_size - 1) // 2, kernel_size=kernel_size, bias=True)
-        self.L = nn.Linear(filters, attn_dim, bias=False)
-        self.W = nn.Linear(attn_dim, attn_dim, bias=True) # Include the attention bias in this term
-        self.v = nn.Linear(attn_dim, 1, bias=False)
-        self.cumulative = None
-        self.attention = None
-
-    def init_attention(self, encoder_seq_proj):
-        device = next(self.parameters()).device  # use same device as parameters
-        b, t, c = encoder_seq_proj.size()
-        self.cumulative = torch.zeros(b, t, device=device)
-        self.attention = torch.zeros(b, t, device=device)
-
-    def forward(self, encoder_seq_proj, query, t, chars):
-
-        if t == 0: self.init_attention(encoder_seq_proj)
-
-        processed_query = self.W(query).unsqueeze(1)
-
-        location = self.cumulative.unsqueeze(1)
-        processed_loc = self.L(self.conv(location).transpose(1, 2))
-
-        u = self.v(torch.tanh(processed_query + encoder_seq_proj + processed_loc))
-        u = u.squeeze(-1)
-
-        # Mask zero padding chars
-        u = u * (chars != 0).float()
-
-        # Smooth Attention
-        # scores = torch.sigmoid(u) / torch.sigmoid(u).sum(dim=1, keepdim=True)
-        scores = F.softmax(u, dim=1)
-        self.attention = scores
-        self.cumulative = self.cumulative + self.attention
-
-        return scores.unsqueeze(-1).transpose(1, 2)
-
-
-class Decoder(nn.Module):
-    # Class variable because its value doesn't change between classes
-    # yet ought to be scoped by class because its a property of a Decoder
-    max_r = 20
-    def __init__(self, n_mels, encoder_dims, decoder_dims, lstm_dims,
-                 dropout, speaker_embedding_size):
-        super().__init__()
-        self.register_buffer("r", torch.tensor(1, dtype=torch.int))
-        self.n_mels = n_mels
-        prenet_dims = (decoder_dims * 2, decoder_dims * 2)
-        self.prenet = PreNet(n_mels, fc1_dims=prenet_dims[0], fc2_dims=prenet_dims[1],
-                             dropout=dropout)
-        self.attn_net = LSA(decoder_dims)
-        self.attn_rnn = nn.GRUCell(encoder_dims + prenet_dims[1] + speaker_embedding_size, decoder_dims)
-        self.rnn_input = nn.Linear(encoder_dims + decoder_dims + speaker_embedding_size, lstm_dims)
-        self.res_rnn1 = nn.LSTMCell(lstm_dims, lstm_dims)
-        self.res_rnn2 = nn.LSTMCell(lstm_dims, lstm_dims)
-        self.mel_proj = nn.Linear(lstm_dims, n_mels * self.max_r, bias=False)
-        self.stop_proj = nn.Linear(encoder_dims + speaker_embedding_size + lstm_dims, 1)
-
-    def zoneout(self, prev, current, p=0.1):
-        device = next(self.parameters()).device  # Use same device as parameters
-        mask = torch.zeros(prev.size(), device=device).bernoulli_(p)
-        return prev * mask + current * (1 - mask)
-
-    def forward(self, encoder_seq, encoder_seq_proj, prenet_in,
-                hidden_states, cell_states, context_vec, t, chars):
-
-        # Need this for reshaping mels
-        batch_size = encoder_seq.size(0)
-
-        # Unpack the hidden and cell states
-        attn_hidden, rnn1_hidden, rnn2_hidden = hidden_states
-        rnn1_cell, rnn2_cell = cell_states
-
-        # PreNet for the Attention RNN
-        prenet_out = self.prenet(prenet_in)
-
-        # Compute the Attention RNN hidden state
-        attn_rnn_in = torch.cat([context_vec, prenet_out], dim=-1)
-        attn_hidden = self.attn_rnn(attn_rnn_in.squeeze(1), attn_hidden)
-
-        # Compute the attention scores
-        scores = self.attn_net(encoder_seq_proj, attn_hidden, t, chars)
-
-        # Dot product to create the context vector
-        context_vec = scores @ encoder_seq
-        context_vec = context_vec.squeeze(1)
-
-        # Concat Attention RNN output w. Context Vector & project
-        x = torch.cat([context_vec, attn_hidden], dim=1)
-        x = self.rnn_input(x)
-
-        # Compute first Residual RNN
-        rnn1_hidden_next, rnn1_cell = self.res_rnn1(x, (rnn1_hidden, rnn1_cell))
-        if self.training:
-            rnn1_hidden = self.zoneout(rnn1_hidden, rnn1_hidden_next)
-        else:
-            rnn1_hidden = rnn1_hidden_next
-        x = x + rnn1_hidden
-
-        # Compute second Residual RNN
-        rnn2_hidden_next, rnn2_cell = self.res_rnn2(x, (rnn2_hidden, rnn2_cell))
-        if self.training:
-            rnn2_hidden = self.zoneout(rnn2_hidden, rnn2_hidden_next)
-        else:
-            rnn2_hidden = rnn2_hidden_next
-        x = x + rnn2_hidden
-
-        # Project Mels
-        mels = self.mel_proj(x)
-        mels = mels.view(batch_size, self.n_mels, self.max_r)[:, :, :self.r]
-        hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)
-        cell_states = (rnn1_cell, rnn2_cell)
-
-        # Stop token prediction
-        s = torch.cat((x, context_vec), dim=1)
-        s = self.stop_proj(s)
-        stop_tokens = torch.sigmoid(s)
-
-        return mels, scores, hidden_states, cell_states, context_vec, stop_tokens
-
-
-class Tacotron(nn.Module):
-    def __init__(self, embed_dims, num_chars, encoder_dims, decoder_dims, n_mels,
-                 fft_bins, postnet_dims, encoder_K, lstm_dims, postnet_K, num_highways,
-                 dropout, stop_threshold, speaker_embedding_size):
-        super().__init__()
-        self.n_mels = n_mels
-        self.lstm_dims = lstm_dims
-        self.encoder_dims = encoder_dims
-        self.decoder_dims = decoder_dims
-        self.speaker_embedding_size = speaker_embedding_size
-        self.encoder = Encoder(embed_dims, num_chars, encoder_dims,
-                               encoder_K, num_highways, dropout)
-        self.encoder_proj = nn.Linear(encoder_dims + speaker_embedding_size, decoder_dims, bias=False)
-        self.decoder = Decoder(n_mels, encoder_dims, decoder_dims, lstm_dims,
-                               dropout, speaker_embedding_size)
-        self.postnet = CBHG(postnet_K, n_mels, postnet_dims,
-                            [postnet_dims, fft_bins], num_highways)
-        self.post_proj = nn.Linear(postnet_dims, fft_bins, bias=False)
-
-        self.init_model()
-        self.num_params()
-
-        self.register_buffer("step", torch.zeros(1, dtype=torch.long))
-        self.register_buffer("stop_threshold", torch.tensor(stop_threshold, dtype=torch.float32))
-
-    @property
-    def r(self):
-        return self.decoder.r.item()
-
-    @r.setter
-    def r(self, value):
-        self.decoder.r = self.decoder.r.new_tensor(value, requires_grad=False)
-
-    def forward(self, x, m, speaker_embedding):
-        device = next(self.parameters()).device  # use same device as parameters
-
-        self.step += 1
-        batch_size, _, steps = m.size()
-
-        # Initialise all hidden states and pack into tuple
-        attn_hidden = torch.zeros(batch_size, self.decoder_dims, device=device)
-        rnn1_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
-        rnn2_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
-        hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)
-
-        # Initialise all lstm cell states and pack into tuple
-        rnn1_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
-        rnn2_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
-        cell_states = (rnn1_cell, rnn2_cell)
-
-        # <GO> Frame for start of decoder loop
-        go_frame = torch.zeros(batch_size, self.n_mels, device=device)
-
-        # Need an initial context vector
-        context_vec = torch.zeros(batch_size, self.encoder_dims + self.speaker_embedding_size, device=device)
-
-        # SV2TTS: Run the encoder with the speaker embedding
-        # The projection avoids unnecessary matmuls in the decoder loop
-        encoder_seq = self.encoder(x, speaker_embedding)
-        encoder_seq_proj = self.encoder_proj(encoder_seq)
-
-        # Need a couple of lists for outputs
-        mel_outputs, attn_scores, stop_outputs = [], [], []
-
-        # Run the decoder loop
-        for t in range(0, steps, self.r):
-            prenet_in = m[:, :, t - 1] if t > 0 else go_frame
-            mel_frames, scores, hidden_states, cell_states, context_vec, stop_tokens = \
-                self.decoder(encoder_seq, encoder_seq_proj, prenet_in,
-                             hidden_states, cell_states, context_vec, t, x)
-            mel_outputs.append(mel_frames)
-            attn_scores.append(scores)
-            stop_outputs.extend([stop_tokens] * self.r)
-
-        # Concat the mel outputs into sequence
-        mel_outputs = torch.cat(mel_outputs, dim=2)
-
-        # Post-Process for Linear Spectrograms
-        postnet_out = self.postnet(mel_outputs)
-        linear = self.post_proj(postnet_out)
-        linear = linear.transpose(1, 2)
-
-        # For easy visualisation
-        attn_scores = torch.cat(attn_scores, 1)
-        # attn_scores = attn_scores.cpu().data.numpy()
-        stop_outputs = torch.cat(stop_outputs, 1)
-
-        return mel_outputs, linear, attn_scores, stop_outputs
-
-    def generate(self, x, speaker_embedding=None, steps=2000):
-        self.eval()
-        device = next(self.parameters()).device  # use same device as parameters
-
-        batch_size, _ = x.size()
-
-        # Need to initialise all hidden states and pack into tuple for tidyness
-        attn_hidden = torch.zeros(batch_size, self.decoder_dims, device=device)
-        rnn1_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
-        rnn2_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
-        hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)
-
-        # Need to initialise all lstm cell states and pack into tuple for tidyness
-        rnn1_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
-        rnn2_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
-        cell_states = (rnn1_cell, rnn2_cell)
-
-        # Need a <GO> Frame for start of decoder loop
-        go_frame = torch.zeros(batch_size, self.n_mels, device=device)
-
-        # Need an initial context vector
-        context_vec = torch.zeros(batch_size, self.encoder_dims + self.speaker_embedding_size, device=device)
-
-        # SV2TTS: Run the encoder with the speaker embedding
-        # The projection avoids unnecessary matmuls in the decoder loop
-        encoder_seq = self.encoder(x, speaker_embedding)
-        encoder_seq_proj = self.encoder_proj(encoder_seq)
-
-        # Need a couple of lists for outputs
-        mel_outputs, attn_scores, stop_outputs = [], [], []
-
-        # Run the decoder loop
-        for t in range(0, steps, self.r):
-            prenet_in = mel_outputs[-1][:, :, -1] if t > 0 else go_frame
-            mel_frames, scores, hidden_states, cell_states, context_vec, stop_tokens = \
-                self.decoder(encoder_seq, encoder_seq_proj, prenet_in,
-                             hidden_states, cell_states, context_vec, t, x)
-            mel_outputs.append(mel_frames)
-            attn_scores.append(scores)
-            stop_outputs.extend([stop_tokens] * self.r)
-            # Stop the loop when all stop tokens in batch exceed threshold
-            if (stop_tokens > 0.5).all() and t > 10: break
-
-        # Concat the mel outputs into sequence
-        mel_outputs = torch.cat(mel_outputs, dim=2)
-
-        # Post-Process for Linear Spectrograms
-        postnet_out = self.postnet(mel_outputs)
-        linear = self.post_proj(postnet_out)
-
-
-        linear = linear.transpose(1, 2)
-
-        # For easy visualisation
-        attn_scores = torch.cat(attn_scores, 1)
-        stop_outputs = torch.cat(stop_outputs, 1)
-
-        self.train()
-
-        return mel_outputs, linear, attn_scores
-
-    def init_model(self):
-        for p in self.parameters():
-            if p.dim() > 1: nn.init.xavier_uniform_(p)
-
-    def get_step(self):
-        return self.step.data.item()
-
-    def reset_step(self):
-        # assignment to parameters or buffers is overloaded, updates internal dict entry
-        self.step = self.step.data.new_tensor(1)
-
-    def log(self, path, msg):
-        with open(path, "a") as f:
-            print(msg, file=f)
-
-    def load(self, path, optimizer=None):
-        # Use device of model params as location for loaded state
-        device = next(self.parameters()).device
-        checkpoint = torch.load(str(path), map_location=device)
-        self.load_state_dict(checkpoint["model_state"])
-
-        if "optimizer_state" in checkpoint and optimizer is not None:
-            optimizer.load_state_dict(checkpoint["optimizer_state"])
-
-    def save(self, path, optimizer=None):
-        if optimizer is not None:
-            torch.save({
-                "model_state": self.state_dict(),
-                "optimizer_state": optimizer.state_dict(),
-            }, str(path))
-        else:
-            torch.save({
-                "model_state": self.state_dict(),
-            }, str(path))
-
-
-    def num_params(self, print_out=True):
-        parameters = filter(lambda p: p.requires_grad, self.parameters())
-        parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
-        if print_out:
-            print("Trainable Parameters: %.3fM" % parameters)
-        return parameters
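For reference, a sketch that instantiates the deleted Tacotron with the hyperparameters above and runs generate() on dummy inputs, mirroring what inference.py does (illustrative only, not part of this commit; the random character ids and zero speaker embedding are placeholders, so the output is untrained noise):

import torch
from synthesizer.hparams import hparams
from synthesizer.models.tacotron import Tacotron
from synthesizer.utils.symbols import symbols

model = Tacotron(embed_dims=hparams.tts_embed_dims, num_chars=len(symbols),
                 encoder_dims=hparams.tts_encoder_dims, decoder_dims=hparams.tts_decoder_dims,
                 n_mels=hparams.num_mels, fft_bins=hparams.num_mels,
                 postnet_dims=hparams.tts_postnet_dims, encoder_K=hparams.tts_encoder_K,
                 lstm_dims=hparams.tts_lstm_dims, postnet_K=hparams.tts_postnet_K,
                 num_highways=hparams.tts_num_highways, dropout=hparams.tts_dropout,
                 stop_threshold=hparams.tts_stop_threshold,
                 speaker_embedding_size=hparams.speaker_embedding_size)

chars = torch.randint(1, len(symbols), (1, 20))                     # one fake sequence of 20 character ids
speaker_embedding = torch.zeros(1, hparams.speaker_embedding_size)  # placeholder speaker embedding
mels, linear, attn = model.generate(chars, speaker_embedding, steps=200)
# mels and linear: (1, num_mels, T) since fft_bins is set to num_mels; attn: (1, decoder_steps, 20)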
synthesizer/preprocess.py
DELETED
@@ -1,258 +0,0 @@
-from multiprocessing.pool import Pool
-from synthesizer import audio
-from functools import partial
-from itertools import chain
-from encoder import inference as encoder
-from pathlib import Path
-from utils import logmmse
-from tqdm import tqdm
-import numpy as np
-import librosa
-
-
-def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int, skip_existing: bool, hparams,
-                       no_alignments: bool, datasets_name: str, subfolders: str):
-    # Gather the input directories
-    dataset_root = datasets_root.joinpath(datasets_name)
-    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in subfolders.split(",")]
-    print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
-    assert all(input_dir.exists() for input_dir in input_dirs)
-
-    # Create the output directories for each output file type
-    out_dir.joinpath("mels").mkdir(exist_ok=True)
-    out_dir.joinpath("audio").mkdir(exist_ok=True)
-
-    # Create a metadata file
-    metadata_fpath = out_dir.joinpath("train.txt")
-    metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")
-
-    # Preprocess the dataset
-    speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
-    func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing,
-                   hparams=hparams, no_alignments=no_alignments)
-    job = Pool(n_processes).imap(func, speaker_dirs)
-    for speaker_metadata in tqdm(job, datasets_name, len(speaker_dirs), unit="speakers"):
-        for metadatum in speaker_metadata:
-            metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
-    metadata_file.close()
-
-    # Verify the contents of the metadata file
-    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
-        metadata = [line.split("|") for line in metadata_file]
-        mel_frames = sum([int(m[4]) for m in metadata])
-        timesteps = sum([int(m[3]) for m in metadata])
-        sample_rate = hparams.sample_rate
-        hours = (timesteps / sample_rate) / 3600
-        print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
-              (len(metadata), mel_frames, timesteps, hours))
-        print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata))
-        print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
-        print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
-
-
-def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
-    metadata = []
-    for book_dir in speaker_dir.glob("*"):
-        if no_alignments:
-            # Gather the utterance audios and texts
-            # LibriTTS uses .wav but we will include extensions for compatibility with other datasets
-            extensions = ["*.wav", "*.flac", "*.mp3"]
-            for extension in extensions:
-                wav_fpaths = book_dir.glob(extension)
-
-                for wav_fpath in wav_fpaths:
-                    # Load the audio waveform
-                    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
-                    if hparams.rescale:
-                        wav = wav / np.abs(wav).max() * hparams.rescaling_max
-
-                    # Get the corresponding text
-                    # Check for .txt (for compatibility with other datasets)
-                    text_fpath = wav_fpath.with_suffix(".txt")
-                    if not text_fpath.exists():
-                        # Check for .normalized.txt (LibriTTS)
-                        text_fpath = wav_fpath.with_suffix(".normalized.txt")
-                        assert text_fpath.exists()
-                    with text_fpath.open("r") as text_file:
-                        text = "".join([line for line in text_file])
-                        text = text.replace("\"", "")
-                        text = text.strip()
-
-                    # Process the utterance
-                    metadata.append(process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
-                                                      skip_existing, hparams))
-        else:
-            # Process alignment file (LibriSpeech support)
-            # Gather the utterance audios and texts
-            try:
-                alignments_fpath = next(book_dir.glob("*.alignment.txt"))
-                with alignments_fpath.open("r") as alignments_file:
-                    alignments = [line.rstrip().split(" ") for line in alignments_file]
-            except StopIteration:
-                # A few alignment files will be missing
-                continue
-
-            # Iterate over each entry in the alignments file
-            for wav_fname, words, end_times in alignments:
-                wav_fpath = book_dir.joinpath(wav_fname + ".flac")
-                assert wav_fpath.exists()
-                words = words.replace("\"", "").split(",")
-                end_times = list(map(float, end_times.replace("\"", "").split(",")))
-
-                # Process each sub-utterance
-                wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams)
-                for i, (wav, text) in enumerate(zip(wavs, texts)):
-                    sub_basename = "%s_%02d" % (wav_fname, i)
-                    metadata.append(process_utterance(wav, text, out_dir, sub_basename,
-                                                      skip_existing, hparams))
-
-    return [m for m in metadata if m is not None]
-
-
-def split_on_silences(wav_fpath, words, end_times, hparams):
-    # Load the audio waveform
-    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
-    if hparams.rescale:
-        wav = wav / np.abs(wav).max() * hparams.rescaling_max
-
-    words = np.array(words)
-    start_times = np.array([0.0] + end_times[:-1])
-    end_times = np.array(end_times)
-    assert len(words) == len(end_times) == len(start_times)
-    assert words[0] == "" and words[-1] == ""
-
-    # Find pauses that are too long
-    mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
-    mask[0] = mask[-1] = True
-    breaks = np.where(mask)[0]
-
-    # Profile the noise from the silences and perform noise reduction on the waveform
-    silence_times = [[start_times[i], end_times[i]] for i in breaks]
-    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(np.int)
-    noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times])
-    if len(noisy_wav) > hparams.sample_rate * 0.02:
-        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
-        wav = logmmse.denoise(wav, profile, eta=0)
-
-    # Re-attach segments that are too short
-    segments = list(zip(breaks[:-1], breaks[1:]))
-    segment_durations = [start_times[end] - end_times[start] for start, end in segments]
-    i = 0
-    while i < len(segments) and len(segments) > 1:
-        if segment_durations[i] < hparams.utterance_min_duration:
-            # See if the segment can be re-attached with the right or the left segment
-            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
-            right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
-            joined_duration = segment_durations[i] + min(left_duration, right_duration)
-
-            # Do not re-attach if it causes the joined utterance to be too long
-            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
-                i += 1
-                continue
-
-            # Re-attach the segment with the neighbour of shortest duration
-            j = i - 1 if left_duration <= right_duration else i
-            segments[j] = (segments[j][0], segments[j + 1][1])
-            segment_durations[j] = joined_duration
-            del segments[j + 1], segment_durations[j + 1]
-        else:
-            i += 1
-
-    # Split the utterance
-    segment_times = [[end_times[start], start_times[end]] for start, end in segments]
-    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int)
-    wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
-    texts = [" ".join(words[start + 1:end]).replace("  ", " ") for start, end in segments]
-
-    # # DEBUG: play the audio segments (run with -n=1)
-    # import sounddevice as sd
-    # if len(wavs) > 1:
-    #     print("This sentence was split in %d segments:" % len(wavs))
-    # else:
-    #     print("There are no silences long enough for this sentence to be split:")
-    # for wav, text in zip(wavs, texts):
-    #     # Pad the waveform with 1 second of silence because sounddevice tends to cut them early
-    #     # when playing them. You shouldn't need to do that in your parsers.
-    #     wav = np.concatenate((wav, [0] * 16000))
-    #     print("\t%s" % text)
-    #     sd.play(wav, 16000, blocking=True)
-    #     print("")
-
-    return wavs, texts
-
-
-def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
-                      skip_existing: bool, hparams):
-    ## FOR REFERENCE:
-    # For you not to lose your head if you ever wish to change things here or implement your own
-    # synthesizer.
-    # - Both the audios and the mel spectrograms are saved as numpy arrays
-    # - There is no processing done to the audios that will be saved to disk beyond volume
-    #   normalization (in split_on_silences)
-    # - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
-    #   is why we re-apply it on the audio on the side of the vocoder.
-    # - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
-    #   without extra padding. This means that you won't have an exact relation between the length
-    #   of the wav and of the mel spectrogram. See the vocoder data loader.
-
-
-    # Skip existing utterances if needed
-    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
-    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
-    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
-        return None
-
-    # Trim silence
-    if hparams.trim_silence:
-        wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)
-
-    # Skip utterances that are too short
-    if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
-        return None
-
-    # Compute the mel spectrogram
-    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
-    mel_frames = mel_spectrogram.shape[1]
-
-    # Skip utterances that are too long
-    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
-        return None
-
-    # Write the spectrogram, embed and audio to disk
-    np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
-    np.save(wav_fpath, wav, allow_pickle=False)
-
-    # Return a tuple describing this training example
-    return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text
-
-
-def embed_utterance(fpaths, encoder_model_fpath):
-    if not encoder.is_loaded():
-        encoder.load_model(encoder_model_fpath)
-
-    # Compute the speaker embedding of the utterance
-    wav_fpath, embed_fpath = fpaths
-    wav = np.load(wav_fpath)
-    wav = encoder.preprocess_wav(wav)
-    embed = encoder.embed_utterance(wav)
-    np.save(embed_fpath, embed, allow_pickle=False)
-
-
-def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
-    wav_dir = synthesizer_root.joinpath("audio")
-    metadata_fpath = synthesizer_root.joinpath("train.txt")
-    assert wav_dir.exists() and metadata_fpath.exists()
-    embed_dir = synthesizer_root.joinpath("embeds")
-    embed_dir.mkdir(exist_ok=True)
-
-    # Gather the input wave filepath and the target output embed filepath
-
with metadata_fpath.open("r") as metadata_file:
|
250 |
-
metadata = [line.split("|") for line in metadata_file]
|
251 |
-
fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]
|
252 |
-
|
253 |
-
# TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
|
254 |
-
# Embed the utterances in separate threads
|
255 |
-
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
|
256 |
-
job = Pool(n_processes).imap(func, fpaths)
|
257 |
-
list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
|
258 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
synthesizer/synthesize.py
DELETED
@@ -1,92 +0,0 @@
import platform
from functools import partial
from pathlib import Path

import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from synthesizer.hparams import hparams_debug_string
from synthesizer.models.tacotron import Tacotron
from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer
from synthesizer.utils import data_parallel_workaround
from synthesizer.utils.symbols import symbols


def run_synthesis(in_dir: Path, out_dir: Path, syn_model_fpath: Path, hparams):
    # This generates ground truth-aligned mels for vocoder training
    synth_dir = out_dir / "mels_gta"
    synth_dir.mkdir(exist_ok=True, parents=True)
    print(hparams_debug_string())

    # Check for GPU
    if torch.cuda.is_available():
        device = torch.device("cuda")
        if hparams.synthesis_batch_size % torch.cuda.device_count() != 0:
            raise ValueError("`hparams.synthesis_batch_size` must be evenly divisible by n_gpus!")
    else:
        device = torch.device("cpu")
    print("Synthesizer using device:", device)

    # Instantiate Tacotron model
    model = Tacotron(embed_dims=hparams.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hparams.tts_encoder_dims,
                     decoder_dims=hparams.tts_decoder_dims,
                     n_mels=hparams.num_mels,
                     fft_bins=hparams.num_mels,
                     postnet_dims=hparams.tts_postnet_dims,
                     encoder_K=hparams.tts_encoder_K,
                     lstm_dims=hparams.tts_lstm_dims,
                     postnet_K=hparams.tts_postnet_K,
                     num_highways=hparams.tts_num_highways,
                     dropout=0.,  # Use zero dropout for gta mels
                     stop_threshold=hparams.tts_stop_threshold,
                     speaker_embedding_size=hparams.speaker_embedding_size).to(device)

    # Load the weights
    print("\nLoading weights at %s" % syn_model_fpath)
    model.load(syn_model_fpath)
    print("Tacotron weights loaded from step %d" % model.step)

    # Synthesize using same reduction factor as the model is currently trained
    r = np.int32(model.r)

    # Set model to eval mode (disable gradient and zoneout)
    model.eval()

    # Initialize the dataset
    metadata_fpath = in_dir.joinpath("train.txt")
    mel_dir = in_dir.joinpath("mels")
    embed_dir = in_dir.joinpath("embeds")

    dataset = SynthesizerDataset(metadata_fpath, mel_dir, embed_dir, hparams)
    collate_fn = partial(collate_synthesizer, r=r, hparams=hparams)
    data_loader = DataLoader(dataset, hparams.synthesis_batch_size, collate_fn=collate_fn, num_workers=2)

    # Generate GTA mels
    meta_out_fpath = out_dir / "synthesized.txt"
    with meta_out_fpath.open("w") as file:
        for i, (texts, mels, embeds, idx) in tqdm(enumerate(data_loader), total=len(data_loader)):
            texts, mels, embeds = texts.to(device), mels.to(device), embeds.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == "cuda" and torch.cuda.device_count() > 1:
                _, mels_out, _ = data_parallel_workaround(model, texts, mels, embeds)
            else:
                _, mels_out, _, _ = model(texts, mels, embeds)

            for j, k in enumerate(idx):
                # Note: outputs mel-spectrogram files and target ones have same names, just different folders
                mel_filename = Path(synth_dir).joinpath(dataset.metadata[k][1])
                mel_out = mels_out[j].detach().cpu().numpy().T

                # Use the length of the ground truth mel to remove padding from the generated mels
                mel_out = mel_out[:int(dataset.metadata[k][4])]

                # Write the spectrogram to disk
                np.save(mel_filename, mel_out, allow_pickle=False)

                # Write metadata into the synthesized file
                file.write("|".join(dataset.metadata[k]))
synthesizer/synthesizer_dataset.py
DELETED
@@ -1,92 +0,0 @@
import torch
from torch.utils.data import Dataset
import numpy as np
from pathlib import Path
from synthesizer.utils.text import text_to_sequence


class SynthesizerDataset(Dataset):
    def __init__(self, metadata_fpath: Path, mel_dir: Path, embed_dir: Path, hparams):
        print("Using inputs from:\n\t%s\n\t%s\n\t%s" % (metadata_fpath, mel_dir, embed_dir))

        with metadata_fpath.open("r") as metadata_file:
            metadata = [line.split("|") for line in metadata_file]

        mel_fnames = [x[1] for x in metadata if int(x[4])]
        mel_fpaths = [mel_dir.joinpath(fname) for fname in mel_fnames]
        embed_fnames = [x[2] for x in metadata if int(x[4])]
        embed_fpaths = [embed_dir.joinpath(fname) for fname in embed_fnames]
        self.samples_fpaths = list(zip(mel_fpaths, embed_fpaths))
        self.samples_texts = [x[5].strip() for x in metadata if int(x[4])]
        self.metadata = metadata
        self.hparams = hparams

        print("Found %d samples" % len(self.samples_fpaths))

    def __getitem__(self, index):
        # Sometimes index may be a list of 2 (not sure why this happens)
        # If that is the case, return a single item corresponding to first element in index
        if isinstance(index, list):
            index = index[0]

        mel_path, embed_path = self.samples_fpaths[index]
        mel = np.load(mel_path).T.astype(np.float32)

        # Load the embed
        embed = np.load(embed_path)

        # Get the text and clean it
        text = text_to_sequence(self.samples_texts[index], self.hparams.tts_cleaner_names)

        # Convert the list returned by text_to_sequence to a numpy array
        text = np.asarray(text).astype(np.int32)

        return text, mel.astype(np.float32), embed.astype(np.float32), index

    def __len__(self):
        return len(self.samples_fpaths)


def collate_synthesizer(batch, r, hparams):
    # Text
    x_lens = [len(x[0]) for x in batch]
    max_x_len = max(x_lens)

    chars = [pad1d(x[0], max_x_len) for x in batch]
    chars = np.stack(chars)

    # Mel spectrogram
    spec_lens = [x[1].shape[-1] for x in batch]
    max_spec_len = max(spec_lens) + 1
    if max_spec_len % r != 0:
        max_spec_len += r - max_spec_len % r

    # WaveRNN mel spectrograms are normalized to [0, 1] so zero padding adds silence
    # By default, SV2TTS uses symmetric mels, where -1*max_abs_value is silence.
    if hparams.symmetric_mels:
        mel_pad_value = -1 * hparams.max_abs_value
    else:
        mel_pad_value = 0

    mel = [pad2d(x[1], max_spec_len, pad_value=mel_pad_value) for x in batch]
    mel = np.stack(mel)

    # Speaker embedding (SV2TTS)
    embeds = np.array([x[2] for x in batch])

    # Index (for vocoder preprocessing)
    indices = [x[3] for x in batch]


    # Convert all to tensor
    chars = torch.tensor(chars).long()
    mel = torch.tensor(mel)
    embeds = torch.tensor(embeds)

    return chars, mel, embeds, indices

def pad1d(x, max_len, pad_value=0):
    return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad_value)

def pad2d(x, max_len, pad_value=0):
    return np.pad(x, ((0, 0), (0, max_len - x.shape[-1])), mode="constant", constant_values=pad_value)
synthesizer/train.py
DELETED
@@ -1,258 +0,0 @@
from datetime import datetime
from functools import partial
from pathlib import Path

import torch
import torch.nn.functional as F
from torch import optim
from torch.utils.data import DataLoader

from synthesizer import audio
from synthesizer.models.tacotron import Tacotron
from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer
from synthesizer.utils import ValueWindow, data_parallel_workaround
from synthesizer.utils.plot import plot_spectrogram
from synthesizer.utils.symbols import symbols
from synthesizer.utils.text import sequence_to_text
from vocoder.display import *


def np_now(x: torch.Tensor): return x.detach().cpu().numpy()


def time_string():
    return datetime.now().strftime("%Y-%m-%d %H:%M")


def train(run_id: str, syn_dir: Path, models_dir: Path, save_every: int, backup_every: int, force_restart: bool,
          hparams):
    models_dir.mkdir(exist_ok=True)

    model_dir = models_dir.joinpath(run_id)
    plot_dir = model_dir.joinpath("plots")
    wav_dir = model_dir.joinpath("wavs")
    mel_output_dir = model_dir.joinpath("mel-spectrograms")
    meta_folder = model_dir.joinpath("metas")
    model_dir.mkdir(exist_ok=True)
    plot_dir.mkdir(exist_ok=True)
    wav_dir.mkdir(exist_ok=True)
    mel_output_dir.mkdir(exist_ok=True)
    meta_folder.mkdir(exist_ok=True)

    weights_fpath = model_dir / f"synthesizer.pt"
    metadata_fpath = syn_dir.joinpath("train.txt")

    print("Checkpoint path: {}".format(weights_fpath))
    print("Loading training data from: {}".format(metadata_fpath))
    print("Using model: Tacotron")

    # Bookkeeping
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)

    # From WaveRNN/train_tacotron.py
    if torch.cuda.is_available():
        device = torch.device("cuda")

        for session in hparams.tts_schedule:
            _, _, _, batch_size = session
            if batch_size % torch.cuda.device_count() != 0:
                raise ValueError("`batch_size` must be evenly divisible by n_gpus!")
    else:
        device = torch.device("cpu")
    print("Using device:", device)

    # Instantiate Tacotron Model
    print("\nInitialising Tacotron Model...\n")
    model = Tacotron(embed_dims=hparams.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hparams.tts_encoder_dims,
                     decoder_dims=hparams.tts_decoder_dims,
                     n_mels=hparams.num_mels,
                     fft_bins=hparams.num_mels,
                     postnet_dims=hparams.tts_postnet_dims,
                     encoder_K=hparams.tts_encoder_K,
                     lstm_dims=hparams.tts_lstm_dims,
                     postnet_K=hparams.tts_postnet_K,
                     num_highways=hparams.tts_num_highways,
                     dropout=hparams.tts_dropout,
                     stop_threshold=hparams.tts_stop_threshold,
                     speaker_embedding_size=hparams.speaker_embedding_size).to(device)

    # Initialize the optimizer
    optimizer = optim.Adam(model.parameters())

    # Load the weights
    if force_restart or not weights_fpath.exists():
        print("\nStarting the training of Tacotron from scratch\n")
        model.save(weights_fpath)

        # Embeddings metadata
        char_embedding_fpath = meta_folder.joinpath("CharacterEmbeddings.tsv")
        with open(char_embedding_fpath, "w", encoding="utf-8") as f:
            for symbol in symbols:
                if symbol == " ":
                    symbol = "\\s"  # For visual purposes, swap space with \s

                f.write("{}\n".format(symbol))

    else:
        print("\nLoading weights at %s" % weights_fpath)
        model.load(weights_fpath, optimizer)
        print("Tacotron weights loaded from step %d" % model.step)

    # Initialize the dataset
    metadata_fpath = syn_dir.joinpath("train.txt")
    mel_dir = syn_dir.joinpath("mels")
    embed_dir = syn_dir.joinpath("embeds")
    dataset = SynthesizerDataset(metadata_fpath, mel_dir, embed_dir, hparams)

    for i, session in enumerate(hparams.tts_schedule):
        current_step = model.get_step()

        r, lr, max_step, batch_size = session

        training_steps = max_step - current_step

        # Do we need to change to the next session?
        if current_step >= max_step:
            # Are there no further sessions than the current one?
            if i == len(hparams.tts_schedule) - 1:
                # We have completed training. Save the model and exit
                model.save(weights_fpath, optimizer)
                break
            else:
                # There is a following session, go to it
                continue

        model.r = r

        # Begin the training
        simple_table([(f"Steps with r={r}", str(training_steps // 1000) + "k Steps"),
                      ("Batch Size", batch_size),
                      ("Learning Rate", lr),
                      ("Outputs/Step (r)", model.r)])

        for p in optimizer.param_groups:
            p["lr"] = lr

        collate_fn = partial(collate_synthesizer, r=r, hparams=hparams)
        data_loader = DataLoader(dataset, batch_size, shuffle=True, num_workers=2, collate_fn=collate_fn)

        total_iters = len(dataset)
        steps_per_epoch = np.ceil(total_iters / batch_size).astype(np.int32)
        epochs = np.ceil(training_steps / steps_per_epoch).astype(np.int32)

        for epoch in range(1, epochs+1):
            for i, (texts, mels, embeds, idx) in enumerate(data_loader, 1):
                start_time = time.time()

                # Generate stop tokens for training
                stop = torch.ones(mels.shape[0], mels.shape[2])
                for j, k in enumerate(idx):
                    stop[j, :int(dataset.metadata[k][4])-1] = 0

                texts = texts.to(device)
                mels = mels.to(device)
                embeds = embeds.to(device)
                stop = stop.to(device)

                # Forward pass
                # Parallelize model onto GPUS using workaround due to python bug
                if device.type == "cuda" and torch.cuda.device_count() > 1:
                    m1_hat, m2_hat, attention, stop_pred = data_parallel_workaround(model, texts, mels, embeds)
                else:
                    m1_hat, m2_hat, attention, stop_pred = model(texts, mels, embeds)

                # Backward pass
                m1_loss = F.mse_loss(m1_hat, mels) + F.l1_loss(m1_hat, mels)
                m2_loss = F.mse_loss(m2_hat, mels)
                stop_loss = F.binary_cross_entropy(stop_pred, stop)

                loss = m1_loss + m2_loss + stop_loss

                optimizer.zero_grad()
                loss.backward()

                if hparams.tts_clip_grad_norm is not None:
                    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hparams.tts_clip_grad_norm)
                    if np.isnan(grad_norm.cpu()):
                        print("grad_norm was NaN!")

                optimizer.step()

                time_window.append(time.time() - start_time)
                loss_window.append(loss.item())

                step = model.get_step()
                k = step // 1000

                msg = f"| Epoch: {epoch}/{epochs} ({i}/{steps_per_epoch}) | Loss: {loss_window.average:#.4} | " \
                      f"{1./time_window.average:#.2} steps/s | Step: {k}k | "
                stream(msg)

                # Backup or save model as appropriate
                if backup_every != 0 and step % backup_every == 0 :
                    backup_fpath = weights_fpath.parent / f"synthesizer_{k:06d}.pt"
                    model.save(backup_fpath, optimizer)

                if save_every != 0 and step % save_every == 0 :
                    # Must save latest optimizer state to ensure that resuming training
                    # doesn't produce artifacts
                    model.save(weights_fpath, optimizer)

                # Evaluate model to generate samples
                epoch_eval = hparams.tts_eval_interval == -1 and i == steps_per_epoch  # If epoch is done
                step_eval = hparams.tts_eval_interval > 0 and step % hparams.tts_eval_interval == 0  # Every N steps
                if epoch_eval or step_eval:
                    for sample_idx in range(hparams.tts_eval_num_samples):
                        # At most, generate samples equal to number in the batch
                        if sample_idx + 1 <= len(texts):
                            # Remove padding from mels using frame length in metadata
                            mel_length = int(dataset.metadata[idx[sample_idx]][4])
                            mel_prediction = np_now(m2_hat[sample_idx]).T[:mel_length]
                            target_spectrogram = np_now(mels[sample_idx]).T[:mel_length]
                            attention_len = mel_length // model.r

                            eval_model(attention=np_now(attention[sample_idx][:, :attention_len]),
                                       mel_prediction=mel_prediction,
                                       target_spectrogram=target_spectrogram,
                                       input_seq=np_now(texts[sample_idx]),
                                       step=step,
                                       plot_dir=plot_dir,
                                       mel_output_dir=mel_output_dir,
                                       wav_dir=wav_dir,
                                       sample_num=sample_idx + 1,
                                       loss=loss,
                                       hparams=hparams)

                # Break out of loop to update training schedule
                if step >= max_step:
                    break

            # Add line break after every epoch
            print("")


def eval_model(attention, mel_prediction, target_spectrogram, input_seq, step,
               plot_dir, mel_output_dir, wav_dir, sample_num, loss, hparams):
    # Save some results for evaluation
    attention_path = str(plot_dir.joinpath("attention_step_{}_sample_{}".format(step, sample_num)))
    save_attention(attention, attention_path)

    # save predicted mel spectrogram to disk (debug)
    mel_output_fpath = mel_output_dir.joinpath("mel-prediction-step-{}_sample_{}.npy".format(step, sample_num))
    np.save(str(mel_output_fpath), mel_prediction, allow_pickle=False)

    # save griffin lim inverted wav for debug (mel -> wav)
    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
    wav_fpath = wav_dir.joinpath("step-{}-wave-from-mel_sample_{}.wav".format(step, sample_num))
    audio.save_wav(wav, str(wav_fpath), sr=hparams.sample_rate)

    # save real and predicted mel-spectrogram plot to disk (control purposes)
    spec_fpath = plot_dir.joinpath("step-{}-mel-spectrogram_sample_{}.png".format(step, sample_num))
    title_str = "{}, {}, step={}, loss={:.5f}".format("Tacotron", time_string(), step, loss)
    plot_spectrogram(mel_prediction, str(spec_fpath), title=title_str,
                     target_spectrogram=target_spectrogram,
                     max_len=target_spectrogram.size // hparams.num_mels)
    print("Input at step {}: {}".format(step, sequence_to_text(input_seq)))
synthesizer/utils/__init__.py
DELETED
@@ -1,45 +0,0 @@
import torch


_output_ref = None
_replicas_ref = None

def data_parallel_workaround(model, *input):
    global _output_ref
    global _replicas_ref
    device_ids = list(range(torch.cuda.device_count()))
    output_device = device_ids[0]
    replicas = torch.nn.parallel.replicate(model, device_ids)
    # input.shape = (num_args, batch, ...)
    inputs = torch.nn.parallel.scatter(input, device_ids)
    # inputs.shape = (num_gpus, num_args, batch/num_gpus, ...)
    replicas = replicas[:len(inputs)]
    outputs = torch.nn.parallel.parallel_apply(replicas, inputs)
    y_hat = torch.nn.parallel.gather(outputs, output_device)
    _output_ref = outputs
    _replicas_ref = replicas
    return y_hat


class ValueWindow():
    def __init__(self, window_size=100):
        self._window_size = window_size
        self._values = []

    def append(self, x):
        self._values = self._values[-(self._window_size - 1):] + [x]

    @property
    def sum(self):
        return sum(self._values)

    @property
    def count(self):
        return len(self._values)

    @property
    def average(self):
        return self.sum / max(1, self.count)

    def reset(self):
        self._values = []
synthesizer/utils/_cmudict.py
DELETED
@@ -1,62 +0,0 @@
import re

valid_symbols = [
    "AA", "AA0", "AA1", "AA2", "AE", "AE0", "AE1", "AE2", "AH", "AH0", "AH1", "AH2",
    "AO", "AO0", "AO1", "AO2", "AW", "AW0", "AW1", "AW2", "AY", "AY0", "AY1", "AY2",
    "B", "CH", "D", "DH", "EH", "EH0", "EH1", "EH2", "ER", "ER0", "ER1", "ER2", "EY",
    "EY0", "EY1", "EY2", "F", "G", "HH", "IH", "IH0", "IH1", "IH2", "IY", "IY0", "IY1",
    "IY2", "JH", "K", "L", "M", "N", "NG", "OW", "OW0", "OW1", "OW2", "OY", "OY0",
    "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH", "UH0", "UH1", "UH2", "UW",
    "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH"
]

_valid_symbol_set = set(valid_symbols)


class CMUDict:
    """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict"""
    def __init__(self, file_or_path, keep_ambiguous=True):
        if isinstance(file_or_path, str):
            with open(file_or_path, encoding="latin-1") as f:
                entries = _parse_cmudict(f)
        else:
            entries = _parse_cmudict(file_or_path)
        if not keep_ambiguous:
            entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
        self._entries = entries


    def __len__(self):
        return len(self._entries)


    def lookup(self, word):
        """Returns list of ARPAbet pronunciations of the given word."""
        return self._entries.get(word.upper())



_alt_re = re.compile(r"\([0-9]+\)")


def _parse_cmudict(file):
    cmudict = {}
    for line in file:
        if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
            parts = line.split("  ")
            word = re.sub(_alt_re, "", parts[0])
            pronunciation = _get_pronunciation(parts[1])
            if pronunciation:
                if word in cmudict:
                    cmudict[word].append(pronunciation)
                else:
                    cmudict[word] = [pronunciation]
    return cmudict


def _get_pronunciation(s):
    parts = s.strip().split(" ")
    for part in parts:
        if part not in _valid_symbol_set:
            return None
    return " ".join(parts)
synthesizer/utils/cleaners.py
DELETED
@@ -1,88 +0,0 @@
"""
Cleaners are transformations that run over the input text at both training and eval time.

Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
  1. "english_cleaners" for English text
  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
     the symbols in symbols.py to match your data).
"""
import re
from unidecode import unidecode
from synthesizer.utils.numbers import normalize_numbers


# Regular expression matching whitespace:
_whitespace_re = re.compile(r"\s+")

# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) for x in [
    ("mrs", "misess"),
    ("mr", "mister"),
    ("dr", "doctor"),
    ("st", "saint"),
    ("co", "company"),
    ("jr", "junior"),
    ("maj", "major"),
    ("gen", "general"),
    ("drs", "doctors"),
    ("rev", "reverend"),
    ("lt", "lieutenant"),
    ("hon", "honorable"),
    ("sgt", "sergeant"),
    ("capt", "captain"),
    ("esq", "esquire"),
    ("ltd", "limited"),
    ("col", "colonel"),
    ("ft", "fort"),
]]


def expand_abbreviations(text):
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text


def expand_numbers(text):
    return normalize_numbers(text)


def lowercase(text):
    """lowercase input tokens."""
    return text.lower()


def collapse_whitespace(text):
    return re.sub(_whitespace_re, " ", text)


def convert_to_ascii(text):
    return unidecode(text)


def basic_cleaners(text):
    """Basic pipeline that lowercases and collapses whitespace without transliteration."""
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def transliteration_cleaners(text):
    """Pipeline for non-English text that transliterates to ASCII."""
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def english_cleaners(text):
    """Pipeline for English text, including number and abbreviation expansion."""
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = expand_numbers(text)
    text = expand_abbreviations(text)
    text = collapse_whitespace(text)
    return text
synthesizer/utils/numbers.py
DELETED
@@ -1,69 +0,0 @@
import re
import inflect


_inflect = inflect.engine()
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
_number_re = re.compile(r"[0-9]+")


def _remove_commas(m):
    return m.group(1).replace(",", "")


def _expand_decimal_point(m):
    return m.group(1).replace(".", " point ")


def _expand_dollars(m):
    match = m.group(1)
    parts = match.split(".")
    if len(parts) > 2:
        return match + " dollars"  # Unexpected format
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
    if dollars and cents:
        dollar_unit = "dollar" if dollars == 1 else "dollars"
        cent_unit = "cent" if cents == 1 else "cents"
        return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
    elif dollars:
        dollar_unit = "dollar" if dollars == 1 else "dollars"
        return "%s %s" % (dollars, dollar_unit)
    elif cents:
        cent_unit = "cent" if cents == 1 else "cents"
        return "%s %s" % (cents, cent_unit)
    else:
        return "zero dollars"


def _expand_ordinal(m):
    return _inflect.number_to_words(m.group(0))


def _expand_number(m):
    num = int(m.group(0))
    if num > 1000 and num < 3000:
        if num == 2000:
            return "two thousand"
        elif num > 2000 and num < 2010:
            return "two thousand " + _inflect.number_to_words(num % 100)
        elif num % 100 == 0:
            return _inflect.number_to_words(num // 100) + " hundred"
        else:
            return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
    else:
        return _inflect.number_to_words(num, andword="")


def normalize_numbers(text):
    text = re.sub(_comma_number_re, _remove_commas, text)
    text = re.sub(_pounds_re, r"\1 pounds", text)
    text = re.sub(_dollars_re, _expand_dollars, text)
    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
    text = re.sub(_ordinal_re, _expand_ordinal, text)
    text = re.sub(_number_re, _expand_number, text)
    return text
synthesizer/utils/plot.py
DELETED
@@ -1,82 +0,0 @@
import numpy as np


def split_title_line(title_text, max_words=5):
    """
    A function that splits any string based on specific character
    (returning it with the string), with maximum number of words on it
    """
    seq = title_text.split()
    return "\n".join([" ".join(seq[i:i + max_words]) for i in range(0, len(seq), max_words)])


def plot_alignment(alignment, path, title=None, split_title=False, max_len=None):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    if max_len is not None:
        alignment = alignment[:, :max_len]

    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111)

    im = ax.imshow(
        alignment,
        aspect="auto",
        origin="lower",
        interpolation="none")
    fig.colorbar(im, ax=ax)
    xlabel = "Decoder timestep"

    if split_title:
        title = split_title_line(title)

    plt.xlabel(xlabel)
    plt.title(title)
    plt.ylabel("Encoder timestep")
    plt.tight_layout()
    plt.savefig(path, format="png")
    plt.close()


def plot_spectrogram(pred_spectrogram, path, title=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    if max_len is not None:
        target_spectrogram = target_spectrogram[:max_len]
        pred_spectrogram = pred_spectrogram[:max_len]

    if split_title:
        title = split_title_line(title)

    fig = plt.figure(figsize=(10, 8))
    # Set common labels
    fig.text(0.5, 0.18, title, horizontalalignment="center", fontsize=16)

    #target spectrogram subplot
    if target_spectrogram is not None:
        ax1 = fig.add_subplot(311)
        ax2 = fig.add_subplot(312)

        if auto_aspect:
            im = ax1.imshow(np.rot90(target_spectrogram), aspect="auto", interpolation="none")
        else:
            im = ax1.imshow(np.rot90(target_spectrogram), interpolation="none")
        ax1.set_title("Target Mel-Spectrogram")
        fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax1)
        ax2.set_title("Predicted Mel-Spectrogram")
    else:
        ax2 = fig.add_subplot(211)

    if auto_aspect:
        im = ax2.imshow(np.rot90(pred_spectrogram), aspect="auto", interpolation="none")
    else:
        im = ax2.imshow(np.rot90(pred_spectrogram), interpolation="none")
    fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax2)

    plt.tight_layout()
    plt.savefig(path, format="png")
    plt.close()
synthesizer/utils/symbols.py
DELETED
@@ -1,17 +0,0 @@
"""
Defines the set of symbols used in text input to the model.

The default is a set of ASCII characters that works well for English or text that has been run
through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
"""
# from . import cmudict

_pad = "_"
_eos = "~"
_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'\"(),-.:;? "

# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
#_arpabet = ["@' + s for s in cmudict.valid_symbols]

# Export all symbols:
symbols = [_pad, _eos] + list(_characters) #+ _arpabet
synthesizer/utils/text.py
DELETED
@@ -1,75 +0,0 @@
from synthesizer.utils.symbols import symbols
from synthesizer.utils import cleaners
import re


# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}

# Regular expression matching text enclosed in curly braces:
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")


def text_to_sequence(text, cleaner_names):
    """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

    The text can optionally have ARPAbet sequences enclosed in curly braces embedded
    in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."

    Args:
      text: string to convert to a sequence
      cleaner_names: names of the cleaner functions to run the text through

    Returns:
      List of integers corresponding to the symbols in the text
    """
    sequence = []

    # Check for curly braces and treat their contents as ARPAbet:
    while len(text):
        m = _curly_re.match(text)
        if not m:
            sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
            break
        sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
        sequence += _arpabet_to_sequence(m.group(2))
        text = m.group(3)

    # Append EOS token
    sequence.append(_symbol_to_id["~"])
    return sequence


def sequence_to_text(sequence):
    """Converts a sequence of IDs back to a string"""
    result = ""
    for symbol_id in sequence:
        if symbol_id in _id_to_symbol:
            s = _id_to_symbol[symbol_id]
            # Enclose ARPAbet back in curly braces:
            if len(s) > 1 and s[0] == "@":
                s = "{%s}" % s[1:]
            result += s
    return result.replace("}{", " ")


def _clean_text(text, cleaner_names):
    for name in cleaner_names:
        cleaner = getattr(cleaners, name)
        if not cleaner:
            raise Exception("Unknown cleaner: %s" % name)
        text = cleaner(text)
    return text


def _symbols_to_sequence(symbols):
    return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]


def _arpabet_to_sequence(text):
    return _symbols_to_sequence(["@" + s for s in text.split()])


def _should_keep_symbol(s):
    return s in _symbol_to_id and s not in ("_", "~")