Kremon96 committed · verified
Commit fe6ebc3 · 1 Parent(s): cd1bb0a

Delete encoder

encoder/__init__.py DELETED
File without changes
encoder/audio.py DELETED
@@ -1,117 +0,0 @@
1
- from scipy.ndimage import binary_dilation
2
- from encoder.params_data import *
3
- from pathlib import Path
4
- from typing import Optional, Union
5
- from warnings import warn
6
- import numpy as np
7
- import librosa
8
- import struct
9
-
10
- try:
11
- import webrtcvad
12
- except ImportError:
13
- warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.")
14
- webrtcvad = None
15
-
16
- int16_max = (2 ** 15) - 1
17
-
18
-
19
- def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
20
- source_sr: Optional[int] = None,
21
- normalize: Optional[bool] = True,
22
- trim_silence: Optional[bool] = True):
23
- """
24
- Applies the preprocessing operations used in training the Speaker Encoder to a waveform
25
- either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
26
-
27
- :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
28
- just .wav), either the waveform as a numpy array of floats.
29
- :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
30
- preprocessing. After preprocessing, the waveform's sampling rate will match the data
31
- hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
32
- this argument will be ignored.
33
- """
34
- # Load the wav from disk if needed
35
- if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
36
- wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
37
- else:
38
- wav = fpath_or_wav
39
-
40
- # Resample the wav if needed
41
- if source_sr is not None and source_sr != sampling_rate:
42
- wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)
43
-
44
- # Apply the preprocessing: normalize volume and shorten long silences
45
- if normalize:
46
- wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
47
- if webrtcvad and trim_silence:
48
- wav = trim_long_silences(wav)
49
-
50
- return wav
51
-
52
-
53
- def wav_to_mel_spectrogram(wav):
54
- """
55
- Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
56
- Note: this not a log-mel spectrogram.
57
- """
58
- frames = librosa.feature.melspectrogram(
59
- y=wav,
60
- sr=sampling_rate,
61
- n_fft=int(sampling_rate * mel_window_length / 1000),
62
- hop_length=int(sampling_rate * mel_window_step / 1000),
63
- n_mels=mel_n_channels
64
- )
65
- return frames.astype(np.float32).T
66
-
67
-
68
- def trim_long_silences(wav):
69
- """
70
- Ensures that segments without voice in the waveform remain no longer than a
71
- threshold determined by the VAD parameters in params.py.
72
-
73
- :param wav: the raw waveform as a numpy array of floats
74
- :return: the same waveform with silences trimmed away (length <= original wav length)
75
- """
76
- # Compute the voice detection window size
77
- samples_per_window = (vad_window_length * sampling_rate) // 1000
78
-
79
- # Trim the end of the audio to have a multiple of the window size
80
- wav = wav[:len(wav) - (len(wav) % samples_per_window)]
81
-
82
- # Convert the float waveform to 16-bit mono PCM
83
- pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
84
-
85
- # Perform voice activation detection
86
- voice_flags = []
87
- vad = webrtcvad.Vad(mode=3)
88
- for window_start in range(0, len(wav), samples_per_window):
89
- window_end = window_start + samples_per_window
90
- voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
91
- sample_rate=sampling_rate))
92
- voice_flags = np.array(voice_flags)
93
-
94
- # Smooth the voice detection with a moving average
95
- def moving_average(array, width):
96
- array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
97
- ret = np.cumsum(array_padded, dtype=float)
98
- ret[width:] = ret[width:] - ret[:-width]
99
- return ret[width - 1:] / width
100
-
101
- audio_mask = moving_average(voice_flags, vad_moving_average_width)
102
- audio_mask = np.round(audio_mask).astype(bool)
103
-
104
- # Dilate the voiced regions
105
- audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
106
- audio_mask = np.repeat(audio_mask, samples_per_window)
107
-
108
- return wav[audio_mask]
109
-
110
-
111
- def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
112
- if increase_only and decrease_only:
113
- raise ValueError("Both increase only and decrease only are set")
114
- dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
115
- if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
116
- return wav
117
- return wav * (10 ** (dBFS_change / 20))
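
For reference, a minimal sketch of how encoder/audio.py was driven before its deletion: preprocess an audio file, then derive the mel frames the speaker encoder consumes. The sample path is a placeholder.

    from pathlib import Path
    from encoder import audio

    # Resample to 16 kHz, normalize volume and trim long silences, then compute mel frames.
    wav = audio.preprocess_wav(Path("samples/speaker1/utt1.flac"))   # hypothetical input file
    frames = audio.wav_to_mel_spectrogram(wav)                       # shape (n_frames, mel_n_channels)
    print(wav.shape, frames.shape)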
 
encoder/config.py DELETED
@@ -1,45 +0,0 @@
1
- librispeech_datasets = {
2
- "train": {
3
- "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"],
4
- "other": ["LibriSpeech/train-other-500"]
5
- },
6
- "test": {
7
- "clean": ["LibriSpeech/test-clean"],
8
- "other": ["LibriSpeech/test-other"]
9
- },
10
- "dev": {
11
- "clean": ["LibriSpeech/dev-clean"],
12
- "other": ["LibriSpeech/dev-other"]
13
- },
14
- }
15
- libritts_datasets = {
16
- "train": {
17
- "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"],
18
- "other": ["LibriTTS/train-other-500"]
19
- },
20
- "test": {
21
- "clean": ["LibriTTS/test-clean"],
22
- "other": ["LibriTTS/test-other"]
23
- },
24
- "dev": {
25
- "clean": ["LibriTTS/dev-clean"],
26
- "other": ["LibriTTS/dev-other"]
27
- },
28
- }
29
- voxceleb_datasets = {
30
- "voxceleb1" : {
31
- "train": ["VoxCeleb1/wav"],
32
- "test": ["VoxCeleb1/test_wav"]
33
- },
34
- "voxceleb2" : {
35
- "train": ["VoxCeleb2/dev/aac"],
36
- "test": ["VoxCeleb2/test_wav"]
37
- }
38
- }
39
-
40
- other_datasets = [
41
- "LJSpeech-1.1",
42
- "VCTK-Corpus/wav48",
43
- ]
44
-
45
- anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"]
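
These mappings are relative paths; a small sketch of resolving them against a datasets root (the root used here is an assumption):

    from pathlib import Path
    from encoder.config import librispeech_datasets

    datasets_root = Path("/data")                      # hypothetical folder holding LibriSpeech/, VoxCeleb1/, ...
    for rel in librispeech_datasets["train"]["other"]:
        dataset_dir = datasets_root.joinpath(rel)      # e.g. /data/LibriSpeech/train-other-500
        print(dataset_dir, dataset_dir.exists())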
 
encoder/data_objects/__init__.py DELETED
@@ -1,2 +0,0 @@
1
- from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
2
- from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader
 
 
 
encoder/data_objects/random_cycler.py DELETED
@@ -1,37 +0,0 @@
1
- import random
2
-
3
- class RandomCycler:
4
- """
5
- Creates an internal copy of a sequence and allows access to its items in a constrained random
6
- order. For a source sequence of n items and one or several consecutive queries of a total
7
- of m items, the following guarantees hold (one implies the other):
8
- - Each item will be returned between m // n and ((m - 1) // n) + 1 times.
9
- - Between two appearances of the same item, there may be at most 2 * (n - 1) other items.
10
- """
11
-
12
- def __init__(self, source):
13
- if len(source) == 0:
14
- raise Exception("Can't create RandomCycler from an empty collection")
15
- self.all_items = list(source)
16
- self.next_items = []
17
-
18
- def sample(self, count: int):
19
- shuffle = lambda l: random.sample(l, len(l))
20
-
21
- out = []
22
- while count > 0:
23
- if count >= len(self.all_items):
24
- out.extend(shuffle(list(self.all_items)))
25
- count -= len(self.all_items)
26
- continue
27
- n = min(count, len(self.next_items))
28
- out.extend(self.next_items[:n])
29
- count -= n
30
- self.next_items = self.next_items[n:]
31
- if len(self.next_items) == 0:
32
- self.next_items = shuffle(list(self.all_items))
33
- return out
34
-
35
- def __next__(self):
36
- return self.sample(1)[0]
37
-
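
A quick sketch of the guarantee stated in the docstring: sampling m = 8 items from a source of n = 4 yields each item exactly m // n = 2 times, in a shuffled order.

    from collections import Counter
    from encoder.data_objects.random_cycler import RandomCycler

    cycler = RandomCycler(["a", "b", "c", "d"])
    draws = cycler.sample(8)          # one query of m = 8 over n = 4 items
    print(draws, Counter(draws))      # every item appears exactly twice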
 
encoder/data_objects/speaker.py DELETED
@@ -1,40 +0,0 @@
1
- from encoder.data_objects.random_cycler import RandomCycler
2
- from encoder.data_objects.utterance import Utterance
3
- from pathlib import Path
4
-
5
- # Contains the set of utterances of a single speaker
6
- class Speaker:
7
- def __init__(self, root: Path):
8
- self.root = root
9
- self.name = root.name
10
- self.utterances = None
11
- self.utterance_cycler = None
12
-
13
- def _load_utterances(self):
14
- with self.root.joinpath("_sources.txt").open("r") as sources_file:
15
- sources = [l.split(",") for l in sources_file]
16
- sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources}
17
- self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()]
18
- self.utterance_cycler = RandomCycler(self.utterances)
19
-
20
- def random_partial(self, count, n_frames):
21
- """
22
- Samples a batch of <count> unique partial utterances from the disk in a way that all
23
- utterances come up at least once every two cycles and in a random order every time.
24
-
25
- :param count: The number of partial utterances to sample from the set of utterances from
26
- that speaker. Utterances are guaranteed not to be repeated if <count> is not larger than
27
- the number of utterances available.
28
- :param n_frames: The number of frames in the partial utterance.
29
- :return: A list of tuples (utterance, frames, range) where utterance is an Utterance,
30
- frames are the frames of the partial utterances and range is the range of the partial
31
- utterance with regard to the complete utterance.
32
- """
33
- if self.utterances is None:
34
- self._load_utterances()
35
-
36
- utterances = self.utterance_cycler.sample(count)
37
-
38
- a = [(u,) + u.random_partial(n_frames) for u in utterances]
39
-
40
- return a
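
Each preprocessed speaker directory holds mel-frame .npy files plus a _sources.txt whose lines read "<frames_fname>,<original audio path>". A sketch of sampling partial utterances from one such directory (the path is hypothetical):

    from pathlib import Path
    from encoder.data_objects.speaker import Speaker

    speaker = Speaker(Path("SV2TTS/encoder/LibriSpeech_train-other-500_25"))   # hypothetical preprocessed dir
    for utterance, frames, (start, end) in speaker.random_partial(count=4, n_frames=160):
        print(utterance.frames_fpath.name, frames.shape, (start, end))         # frames: (160, 40)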
 
encoder/data_objects/speaker_batch.py DELETED
@@ -1,13 +0,0 @@
1
- import numpy as np
2
- from typing import List
3
- from encoder.data_objects.speaker import Speaker
4
-
5
-
6
- class SpeakerBatch:
7
- def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):
8
- self.speakers = speakers
9
- self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers}
10
-
11
- # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with
12
- # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40)
13
- self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]])
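
A sketch confirming the shape arithmetic from the comment above (speaker directories are placeholders):

    from pathlib import Path
    from encoder.data_objects.speaker import Speaker
    from encoder.data_objects.speaker_batch import SpeakerBatch

    speakers = [Speaker(Path("SV2TTS/encoder") / name) for name in ("spk_a", "spk_b", "spk_c")]
    batch = SpeakerBatch(speakers, utterances_per_speaker=4, n_frames=160)
    print(batch.data.shape)    # (3 * 4, 160, 40) == (12, 160, 40)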
 
encoder/data_objects/speaker_verification_dataset.py DELETED
@@ -1,56 +0,0 @@
1
- from encoder.data_objects.random_cycler import RandomCycler
2
- from encoder.data_objects.speaker_batch import SpeakerBatch
3
- from encoder.data_objects.speaker import Speaker
4
- from encoder.params_data import partials_n_frames
5
- from torch.utils.data import Dataset, DataLoader
6
- from pathlib import Path
7
-
8
- # TODO: improve with a pool of speakers for data efficiency
9
-
10
- class SpeakerVerificationDataset(Dataset):
11
- def __init__(self, datasets_root: Path):
12
- self.root = datasets_root
13
- speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
14
- if len(speaker_dirs) == 0:
15
- raise Exception("No speakers found. Make sure you are pointing to the directory "
16
- "containing all preprocessed speaker directories.")
17
- self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs]
18
- self.speaker_cycler = RandomCycler(self.speakers)
19
-
20
- def __len__(self):
21
- return int(1e10)
22
-
23
- def __getitem__(self, index):
24
- return next(self.speaker_cycler)
25
-
26
- def get_logs(self):
27
- log_string = ""
28
- for log_fpath in self.root.glob("*.txt"):
29
- with log_fpath.open("r") as log_file:
30
- log_string += "".join(log_file.readlines())
31
- return log_string
32
-
33
-
34
- class SpeakerVerificationDataLoader(DataLoader):
35
- def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None,
36
- batch_sampler=None, num_workers=0, pin_memory=False, timeout=0,
37
- worker_init_fn=None):
38
- self.utterances_per_speaker = utterances_per_speaker
39
-
40
- super().__init__(
41
- dataset=dataset,
42
- batch_size=speakers_per_batch,
43
- shuffle=False,
44
- sampler=sampler,
45
- batch_sampler=batch_sampler,
46
- num_workers=num_workers,
47
- collate_fn=self.collate,
48
- pin_memory=pin_memory,
49
- drop_last=False,
50
- timeout=timeout,
51
- worker_init_fn=worker_init_fn
52
- )
53
-
54
- def collate(self, speakers):
55
- return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames)
56
-
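
Putting dataset and loader together, roughly as encoder/train.py below does (the preprocessed root is a placeholder):

    from pathlib import Path
    from encoder.data_objects import SpeakerVerificationDataset, SpeakerVerificationDataLoader

    dataset = SpeakerVerificationDataset(Path("SV2TTS/encoder"))   # hypothetical output of encoder/preprocess.py
    loader = SpeakerVerificationDataLoader(dataset, speakers_per_batch=64,
                                           utterances_per_speaker=10, num_workers=4)
    speaker_batch = next(iter(loader))
    print(speaker_batch.data.shape)    # (64 * 10, 160, 40)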
 
encoder/data_objects/utterance.py DELETED
@@ -1,26 +0,0 @@
1
- import numpy as np
2
-
3
-
4
- class Utterance:
5
- def __init__(self, frames_fpath, wave_fpath):
6
- self.frames_fpath = frames_fpath
7
- self.wave_fpath = wave_fpath
8
-
9
- def get_frames(self):
10
- return np.load(self.frames_fpath)
11
-
12
- def random_partial(self, n_frames):
13
- """
14
- Crops the frames into a partial utterance of n_frames
15
-
16
- :param n_frames: The number of frames of the partial utterance
17
- :return: the partial utterance frames and a tuple indicating the start and end of the
18
- partial utterance in the complete utterance.
19
- """
20
- frames = self.get_frames()
21
- if frames.shape[0] == n_frames:
22
- start = 0
23
- else:
24
- start = np.random.randint(0, frames.shape[0] - n_frames)
25
- end = start + n_frames
26
- return frames[start:end], (start, end)
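
A short sketch of the cropping behaviour, using a toy array in place of a saved mel spectrogram (file names are hypothetical):

    import numpy as np
    from encoder.data_objects.utterance import Utterance

    np.save("toy_frames.npy", np.zeros((300, 40), dtype=np.float32))    # stand-in 300-frame utterance
    utt = Utterance(frames_fpath="toy_frames.npy", wave_fpath="toy.wav")
    frames, (start, end) = utt.random_partial(n_frames=160)
    print(frames.shape, start, end)    # (160, 40), with 0 <= start < 140 and end = start + 160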
 
encoder/inference.py DELETED
@@ -1,178 +0,0 @@
1
- from encoder.params_data import *
2
- from encoder.model import SpeakerEncoder
3
- from encoder.audio import preprocess_wav # We want to expose this function from here
4
- from matplotlib import cm
5
- from encoder import audio
6
- from pathlib import Path
7
- import numpy as np
8
- import torch
9
-
10
- _model = None # type: SpeakerEncoder
11
- _device = None # type: torch.device
12
-
13
-
14
- def load_model(weights_fpath: Path, device=None):
15
- """
16
- Loads the model in memory. If this function is not explicitly called, it will be run on the
17
- first call to embed_frames() with the default weights file.
18
-
19
- :param weights_fpath: the path to saved model weights.
20
- :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The
21
- model will be loaded and will run on this device. Outputs will however always be on the cpu.
22
- If None, will default to your GPU if it's available, otherwise your CPU.
23
- """
24
- # TODO: I think the slow loading of the encoder might have something to do with the device it
25
- # was saved on. Worth investigating.
26
- global _model, _device
27
- if device is None:
28
- _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
29
- elif isinstance(device, str):
30
- _device = torch.device(device)
31
- _model = SpeakerEncoder(_device, torch.device("cpu"))
32
- checkpoint = torch.load(weights_fpath, _device)
33
- _model.load_state_dict(checkpoint["model_state"])
34
- _model.eval()
35
- print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
36
-
37
-
38
- def is_loaded():
39
- return _model is not None
40
-
41
-
42
- def embed_frames_batch(frames_batch):
43
- """
44
- Computes embeddings for a batch of mel spectrograms.
45
-
46
- :param frames_batch: a batch of mel spectrograms as a numpy array of float32 of shape
47
- (batch_size, n_frames, n_channels)
48
- :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size)
49
- """
50
- if _model is None:
51
- raise Exception("Model was not loaded. Call load_model() before inference.")
52
-
53
- frames = torch.from_numpy(frames_batch).to(_device)
54
- embed = _model.forward(frames).detach().cpu().numpy()
55
- return embed
56
-
57
-
58
- def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
59
- min_pad_coverage=0.75, overlap=0.5):
60
- """
61
- Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
62
- partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
63
- spectrogram slices are returned, so as to make each partial utterance waveform correspond to
64
- its spectrogram. This function assumes that the mel spectrogram parameters used are those
65
- defined in params_data.py.
66
-
67
- The returned ranges may be indexing further than the length of the waveform. It is
68
- recommended that you pad the waveform with zeros up to wave_slices[-1].stop.
69
-
70
- :param n_samples: the number of samples in the waveform
71
- :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
72
- utterance
73
- :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
74
- enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
75
- then the last partial utterance will be considered, as if we padded the audio. Otherwise,
76
- it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
77
- utterance, this parameter is ignored so that the function always returns at least 1 slice.
78
- :param overlap: by how much the partial utterance should overlap. If set to 0, the partial
79
- utterances are entirely disjoint.
80
- :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
81
- respectively the waveform and the mel spectrogram with these slices to obtain the partial
82
- utterances.
83
- """
84
- assert 0 <= overlap < 1
85
- assert 0 < min_pad_coverage <= 1
86
-
87
- samples_per_frame = int((sampling_rate * mel_window_step / 1000))
88
- n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
89
- frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
90
-
91
- # Compute the slices
92
- wav_slices, mel_slices = [], []
93
- steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
94
- for i in range(0, steps, frame_step):
95
- mel_range = np.array([i, i + partial_utterance_n_frames])
96
- wav_range = mel_range * samples_per_frame
97
- mel_slices.append(slice(*mel_range))
98
- wav_slices.append(slice(*wav_range))
99
-
100
- # Evaluate whether extra padding is warranted or not
101
- last_wav_range = wav_slices[-1]
102
- coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
103
- if coverage < min_pad_coverage and len(mel_slices) > 1:
104
- mel_slices = mel_slices[:-1]
105
- wav_slices = wav_slices[:-1]
106
-
107
- return wav_slices, mel_slices
108
-
109
-
110
- def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
111
- """
112
- Computes an embedding for a single utterance.
113
-
114
- # TODO: handle multiple wavs to benefit from batching on GPU
115
- :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
116
- :param using_partials: if True, then the utterance is split in partial utterances of
117
- <partial_utterance_n_frames> frames and the utterance embedding is computed from their
118
- normalized average. If False, the utterance is instead computed from feeding the entire
119
- spectrogram to the network.
120
- :param return_partials: if True, the partial embeddings will also be returned along with the
121
- wav slices that correspond to the partial embeddings.
122
- :param kwargs: additional arguments to compute_partial_slices()
123
- :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
124
- <return_partials> is True, the partial utterances as a numpy array of float32 of shape
125
- (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
126
- returned. If <using_partials> is simultaneously set to False, both these values will be None
127
- instead.
128
- """
129
- # Process the entire utterance if not using partials
130
- if not using_partials:
131
- frames = audio.wav_to_mel_spectrogram(wav)
132
- embed = embed_frames_batch(frames[None, ...])[0]
133
- if return_partials:
134
- return embed, None, None
135
- return embed
136
-
137
- # Compute where to split the utterance into partials and pad if necessary
138
- wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
139
- max_wave_length = wave_slices[-1].stop
140
- if max_wave_length >= len(wav):
141
- wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
142
-
143
- # Split the utterance into partials
144
- frames = audio.wav_to_mel_spectrogram(wav)
145
- frames_batch = np.array([frames[s] for s in mel_slices])
146
- partial_embeds = embed_frames_batch(frames_batch)
147
-
148
- # Compute the utterance embedding from the partial embeddings
149
- raw_embed = np.mean(partial_embeds, axis=0)
150
- embed = raw_embed / np.linalg.norm(raw_embed, 2)
151
-
152
- if return_partials:
153
- return embed, partial_embeds, wave_slices
154
- return embed
155
-
156
-
157
- def embed_speaker(wavs, **kwargs):
158
- raise NotImplementedError()
159
-
160
-
161
- def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
162
- import matplotlib.pyplot as plt
163
- if ax is None:
164
- ax = plt.gca()
165
-
166
- if shape is None:
167
- height = int(np.sqrt(len(embed)))
168
- shape = (height, -1)
169
- embed = embed.reshape(shape)
170
-
171
- cmap = cm.get_cmap()
172
- mappable = ax.imshow(embed, cmap=cmap)
173
- cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)
174
- sm = cm.ScalarMappable(cmap=cmap)
175
- sm.set_clim(*color_range)
176
-
177
- ax.set_xticks([]), ax.set_yticks([])
178
- ax.set_title(title)
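
End-to-end usage as it stood before this commit: load the weights, preprocess a file, embed it. The checkpoint path is a placeholder.

    from pathlib import Path
    import numpy as np
    from encoder import inference as encoder

    encoder.load_model(Path("saved_models/default/encoder.pt"))    # hypothetical checkpoint location
    wav = encoder.preprocess_wav("samples/speaker1/utt1.flac")     # re-exported from encoder.audio
    embed = encoder.embed_utterance(wav)
    print(embed.shape, np.linalg.norm(embed))                      # (256,), unit L2 norm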
 
encoder/model.py DELETED
@@ -1,135 +0,0 @@
1
- from encoder.params_model import *
2
- from encoder.params_data import *
3
- from scipy.interpolate import interp1d
4
- from sklearn.metrics import roc_curve
5
- from torch.nn.utils import clip_grad_norm_
6
- from scipy.optimize import brentq
7
- from torch import nn
8
- import numpy as np
9
- import torch
10
-
11
-
12
- class SpeakerEncoder(nn.Module):
13
- def __init__(self, device, loss_device):
14
- super().__init__()
15
- self.loss_device = loss_device
16
-
17
- # Network definition
18
- self.lstm = nn.LSTM(input_size=mel_n_channels,
19
- hidden_size=model_hidden_size,
20
- num_layers=model_num_layers,
21
- batch_first=True).to(device)
22
- self.linear = nn.Linear(in_features=model_hidden_size,
23
- out_features=model_embedding_size).to(device)
24
- self.relu = torch.nn.ReLU().to(device)
25
-
26
- # Cosine similarity scaling (with fixed initial parameter values)
27
- self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
28
- self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)
29
-
30
- # Loss
31
- self.loss_fn = nn.CrossEntropyLoss().to(loss_device)
32
-
33
- def do_gradient_ops(self):
34
- # Gradient scale
35
- self.similarity_weight.grad *= 0.01
36
- self.similarity_bias.grad *= 0.01
37
-
38
- # Gradient clipping
39
- clip_grad_norm_(self.parameters(), 3, norm_type=2)
40
-
41
- def forward(self, utterances, hidden_init=None):
42
- """
43
- Computes the embeddings of a batch of utterance spectrograms.
44
-
45
- :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
46
- (batch_size, n_frames, n_channels)
47
- :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
48
- batch_size, hidden_size). Will default to a tensor of zeros if None.
49
- :return: the embeddings as a tensor of shape (batch_size, embedding_size)
50
- """
51
- # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
52
- # and the final cell state.
53
- out, (hidden, cell) = self.lstm(utterances, hidden_init)
54
-
55
- # We take only the hidden state of the last layer
56
- embeds_raw = self.relu(self.linear(hidden[-1]))
57
-
58
- # L2-normalize it
59
- embeds = embeds_raw / (torch.norm(embeds_raw, dim=1, keepdim=True) + 1e-5)
60
-
61
- return embeds
62
-
63
- def similarity_matrix(self, embeds):
64
- """
65
- Computes the similarity matrix according to section 2.1 of GE2E.
66
-
67
- :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
68
- utterances_per_speaker, embedding_size)
69
- :return: the similarity matrix as a tensor of shape (speakers_per_batch,
70
- utterances_per_speaker, speakers_per_batch)
71
- """
72
- speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
73
-
74
- # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation
75
- centroids_incl = torch.mean(embeds, dim=1, keepdim=True)
76
- centroids_incl = centroids_incl.clone() / (torch.norm(centroids_incl, dim=2, keepdim=True) + 1e-5)
77
-
78
- # Exclusive centroids (1 per utterance)
79
- centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds)
80
- centroids_excl /= (utterances_per_speaker - 1)
81
- centroids_excl = centroids_excl.clone() / (torch.norm(centroids_excl, dim=2, keepdim=True) + 1e-5)
82
-
83
- # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot
84
- # product of these vectors (which is just an element-wise multiplication reduced by a sum).
85
- # We vectorize the computation for efficiency.
86
- sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker,
87
- speakers_per_batch).to(self.loss_device)
88
- mask_matrix = 1 - np.eye(speakers_per_batch, dtype=int)
89
- for j in range(speakers_per_batch):
90
- mask = np.where(mask_matrix[j])[0]
91
- sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2)
92
- sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1)
93
-
94
- ## Even more vectorized version (slower maybe because of transpose)
95
- # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker
96
- # ).to(self.loss_device)
97
- # eye = np.eye(speakers_per_batch, dtype=np.int)
98
- # mask = np.where(1 - eye)
99
- # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2)
100
- # mask = np.where(eye)
101
- # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2)
102
- # sim_matrix2 = sim_matrix2.transpose(1, 2)
103
-
104
- sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias
105
- return sim_matrix
106
-
107
- def loss(self, embeds):
108
- """
109
- Computes the softmax loss according to section 2.1 of GE2E.
110
-
111
- :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
112
- utterances_per_speaker, embedding_size)
113
- :return: the loss and the EER for this batch of embeddings.
114
- """
115
- speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
116
-
117
- # Loss
118
- sim_matrix = self.similarity_matrix(embeds)
119
- sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker,
120
- speakers_per_batch))
121
- ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker)
122
- target = torch.from_numpy(ground_truth).long().to(self.loss_device)
123
- loss = self.loss_fn(sim_matrix, target)
124
-
125
- # EER (not backpropagated)
126
- with torch.no_grad():
127
- inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=int)[0]
128
- labels = np.array([inv_argmax(i) for i in ground_truth])
129
- preds = sim_matrix.detach().cpu().numpy()
130
-
131
- # Snippet from https://yangcha.github.io/EER-ROC/
132
- fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
133
- eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
134
-
135
- return loss, eer
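
A sketch of the shapes involved in the GE2E similarity and loss, using random unit-norm embeddings in place of real LSTM outputs (4 speakers x 5 utterances is an arbitrary choice):

    import torch
    from encoder.model import SpeakerEncoder

    device = torch.device("cpu")
    model = SpeakerEncoder(device, loss_device=device)

    # Fake batch of embeddings: (speakers_per_batch, utterances_per_speaker, embedding_size).
    embeds = torch.nn.functional.normalize(torch.randn(4, 5, 256), dim=2)
    sim = model.similarity_matrix(embeds)    # (4, 5, 4): every utterance vs. every speaker centroid
    loss, eer = model.loss(embeds)
    print(sim.shape, float(loss), eer)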
 
encoder/params_data.py DELETED
@@ -1,29 +0,0 @@
1
-
2
- ## Mel-filterbank
3
- mel_window_length = 25 # In milliseconds
4
- mel_window_step = 10 # In milliseconds
5
- mel_n_channels = 40
6
-
7
-
8
- ## Audio
9
- sampling_rate = 16000
10
- # Number of spectrogram frames in a partial utterance
11
- partials_n_frames = 160 # 1600 ms
12
- # Number of spectrogram frames at inference
13
- inference_n_frames = 80 # 800 ms
14
-
15
-
16
- ## Voice Activation Detection
17
- # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
18
- # This sets the granularity of the VAD. Should not need to be changed.
19
- vad_window_length = 30 # In milliseconds
20
- # Number of frames to average together when performing the moving average smoothing.
21
- # The larger this value, the larger the VAD variations must be to not get smoothed out.
22
- vad_moving_average_width = 8
23
- # Maximum number of consecutive silent frames a segment can have.
24
- vad_max_silence_length = 6
25
-
26
-
27
- ## Audio volume normalization
28
- audio_norm_target_dBFS = -30
29
-
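
In samples, these settings work out as follows (plain arithmetic over the values above):

    sampling_rate = 16000
    mel_window_length, mel_window_step = 25, 10            # milliseconds
    n_fft = sampling_rate * mel_window_length // 1000      # 400 samples per analysis window
    hop_length = sampling_rate * mel_window_step // 1000   # 160 samples between frames
    partials_n_frames = 160
    print(n_fft, hop_length, partials_n_frames * hop_length / sampling_rate)   # 400 160 1.6 -> ~1.6 s per partial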
 
encoder/params_model.py DELETED
@@ -1,11 +0,0 @@
1
-
2
- ## Model parameters
3
- model_hidden_size = 256
4
- model_embedding_size = 256
5
- model_num_layers = 3
6
-
7
-
8
- ## Training parameters
9
- learning_rate_init = 1e-4
10
- speakers_per_batch = 64
11
- utterances_per_speaker = 10
 
encoder/preprocess.py DELETED
@@ -1,184 +0,0 @@
1
- from datetime import datetime
2
- from functools import partial
3
- from multiprocessing import Pool
4
- from pathlib import Path
5
-
6
- import numpy as np
7
- from tqdm import tqdm
8
-
9
- from encoder import audio
10
- from encoder.config import librispeech_datasets, anglophone_nationalites
11
- from encoder.params_data import *
12
-
13
-
14
- _AUDIO_EXTENSIONS = ("wav", "flac", "m4a", "mp3")
15
-
16
- class DatasetLog:
17
- """
18
- Registers metadata about the dataset in a text file.
19
- """
20
- def __init__(self, root, name):
21
- self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w")
22
- self.sample_data = dict()
23
-
24
- start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
25
- self.write_line("Creating dataset %s on %s" % (name, start_time))
26
- self.write_line("-----")
27
- self._log_params()
28
-
29
- def _log_params(self):
30
- from encoder import params_data
31
- self.write_line("Parameter values:")
32
- for param_name in (p for p in dir(params_data) if not p.startswith("__")):
33
- value = getattr(params_data, param_name)
34
- self.write_line("\t%s: %s" % (param_name, value))
35
- self.write_line("-----")
36
-
37
- def write_line(self, line):
38
- self.text_file.write("%s\n" % line)
39
-
40
- def add_sample(self, **kwargs):
41
- for param_name, value in kwargs.items():
42
- if not param_name in self.sample_data:
43
- self.sample_data[param_name] = []
44
- self.sample_data[param_name].append(value)
45
-
46
- def finalize(self):
47
- self.write_line("Statistics:")
48
- for param_name, values in self.sample_data.items():
49
- self.write_line("\t%s:" % param_name)
50
- self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values)))
51
- self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values)))
52
- self.write_line("-----")
53
- end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
54
- self.write_line("Finished on %s" % end_time)
55
- self.text_file.close()
56
-
57
-
58
- def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog):
59
- dataset_root = datasets_root.joinpath(dataset_name)
60
- if not dataset_root.exists():
61
- print("Couldn\'t find %s, skipping this dataset." % dataset_root)
62
- return None, None
63
- return dataset_root, DatasetLog(out_dir, dataset_name)
64
-
65
-
66
- def _preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, skip_existing: bool):
67
- # Give a name to the speaker that includes its dataset
68
- speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
69
-
70
- # Create an output directory with that name, as well as a txt file containing a
71
- # reference to each source file.
72
- speaker_out_dir = out_dir.joinpath(speaker_name)
73
- speaker_out_dir.mkdir(exist_ok=True)
74
- sources_fpath = speaker_out_dir.joinpath("_sources.txt")
75
-
76
- # There's a possibility that the preprocessing was interrupted earlier, check if
77
- # there already is a sources file.
78
- if sources_fpath.exists():
79
- try:
80
- with sources_fpath.open("r") as sources_file:
81
- existing_fnames = {line.split(",")[0] for line in sources_file}
82
- except Exception:
83
- existing_fnames = {}
84
- else:
85
- existing_fnames = {}
86
-
87
- # Gather all audio files for that speaker recursively
88
- sources_file = sources_fpath.open("a" if skip_existing else "w")
89
- audio_durs = []
90
- for extension in _AUDIO_EXTENSIONS:
91
- for in_fpath in speaker_dir.glob("**/*.%s" % extension):
92
- # Check if the target output file already exists
93
- out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
94
- out_fname = out_fname.replace(".%s" % extension, ".npy")
95
- if skip_existing and out_fname in existing_fnames:
96
- continue
97
-
98
- # Load and preprocess the waveform
99
- wav = audio.preprocess_wav(in_fpath)
100
- if len(wav) == 0:
101
- continue
102
-
103
- # Create the mel spectrogram, discard those that are too short
104
- frames = audio.wav_to_mel_spectrogram(wav)
105
- if len(frames) < partials_n_frames:
106
- continue
107
-
108
- out_fpath = speaker_out_dir.joinpath(out_fname)
109
- np.save(out_fpath, frames)
110
- sources_file.write("%s,%s\n" % (out_fname, in_fpath))
111
- audio_durs.append(len(wav) / sampling_rate)
112
-
113
- sources_file.close()
114
-
115
- return audio_durs
116
-
117
-
118
- def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger):
119
- print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))
120
-
121
- # Process the utterances for each speaker
122
- work_fn = partial(_preprocess_speaker, datasets_root=datasets_root, out_dir=out_dir, skip_existing=skip_existing)
123
- with Pool(4) as pool:
124
- tasks = pool.imap(work_fn, speaker_dirs)
125
- for sample_durs in tqdm(tasks, dataset_name, len(speaker_dirs), unit="speakers"):
126
- for sample_dur in sample_durs:
127
- logger.add_sample(duration=sample_dur)
128
-
129
- logger.finalize()
130
- print("Done preprocessing %s.\n" % dataset_name)
131
-
132
-
133
- def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False):
134
- for dataset_name in librispeech_datasets["train"]["other"]:
135
- # Initialize the preprocessing
136
- dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
137
- if not dataset_root:
138
- return
139
-
140
- # Preprocess all speakers
141
- speaker_dirs = list(dataset_root.glob("*"))
142
- _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger)
143
-
144
-
145
- def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False):
146
- # Initialize the preprocessing
147
- dataset_name = "VoxCeleb1"
148
- dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
149
- if not dataset_root:
150
- return
151
-
152
- # Get the contents of the meta file
153
- with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile:
154
- metadata = [line.split("\t") for line in metafile][1:]
155
-
156
- # Select the ID and the nationality, filter out non-anglophone speakers
157
- nationalities = {line[0]: line[3] for line in metadata}
158
- keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if
159
- nationality.lower() in anglophone_nationalites]
160
- print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." %
161
- (len(keep_speaker_ids), len(nationalities)))
162
-
163
- # Get the speaker directories for anglophone speakers only
164
- speaker_dirs = dataset_root.joinpath("wav").glob("*")
165
- speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if
166
- speaker_dir.name in keep_speaker_ids]
167
- print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." %
168
- (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs)))
169
-
170
- # Preprocess all speakers
171
- _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger)
172
-
173
-
174
- def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False):
175
- # Initialize the preprocessing
176
- dataset_name = "VoxCeleb2"
177
- dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
178
- if not dataset_root:
179
- return
180
-
181
- # Get the speaker directories
182
- # Preprocess all speakers
183
- speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*"))
184
- _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger)
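
These entry points were normally invoked through the repository's preprocessing script; a minimal sketch of calling them directly (all paths are placeholders):

    from pathlib import Path
    from encoder.preprocess import preprocess_librispeech, preprocess_voxceleb1

    datasets_root = Path("/data")                     # hypothetical folder containing LibriSpeech/, VoxCeleb1/, ...
    out_dir = datasets_root / "SV2TTS" / "encoder"    # hypothetical output location for the mel frames
    out_dir.mkdir(parents=True, exist_ok=True)

    preprocess_librispeech(datasets_root, out_dir, skip_existing=True)
    preprocess_voxceleb1(datasets_root, out_dir, skip_existing=True)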
 
encoder/train.py DELETED
@@ -1,125 +0,0 @@
1
- from pathlib import Path
2
-
3
- import torch
4
-
5
- from encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
6
- from encoder.model import SpeakerEncoder
7
- from encoder.params_model import *
8
- from encoder.visualizations import Visualizations
9
- from utils.profiler import Profiler
10
-
11
-
12
- def sync(device: torch.device):
13
- # For correct profiling (cuda operations are async)
14
- if device.type == "cuda":
15
- torch.cuda.synchronize(device)
16
-
17
-
18
- def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int,
19
- backup_every: int, vis_every: int, force_restart: bool, visdom_server: str,
20
- no_visdom: bool):
21
- # Create a dataset and a dataloader
22
- dataset = SpeakerVerificationDataset(clean_data_root)
23
- loader = SpeakerVerificationDataLoader(
24
- dataset,
25
- speakers_per_batch,
26
- utterances_per_speaker,
27
- num_workers=4,
28
- )
29
-
30
- # Setup the device on which to run the forward pass and the loss. These can be different,
31
- # because the forward pass is faster on the GPU whereas the loss is often (depending on your
32
- # hyperparameters) faster on the CPU.
33
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
34
- # FIXME: currently, the gradient is None if loss_device is cuda
35
- loss_device = torch.device("cpu")
36
-
37
- # Create the model and the optimizer
38
- model = SpeakerEncoder(device, loss_device)
39
- optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init)
40
- init_step = 1
41
-
42
- # Configure file path for the model
43
- model_dir = models_dir / run_id
44
- model_dir.mkdir(exist_ok=True, parents=True)
45
- state_fpath = model_dir / "encoder.pt"
46
-
47
- # Load any existing model
48
- if not force_restart:
49
- if state_fpath.exists():
50
- print("Found existing model \"%s\", loading it and resuming training." % run_id)
51
- checkpoint = torch.load(state_fpath)
52
- init_step = checkpoint["step"]
53
- model.load_state_dict(checkpoint["model_state"])
54
- optimizer.load_state_dict(checkpoint["optimizer_state"])
55
- optimizer.param_groups[0]["lr"] = learning_rate_init
56
- else:
57
- print("No model \"%s\" found, starting training from scratch." % run_id)
58
- else:
59
- print("Starting the training from scratch.")
60
- model.train()
61
-
62
- # Initialize the visualization environment
63
- vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom)
64
- vis.log_dataset(dataset)
65
- vis.log_params()
66
- device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
67
- vis.log_implementation({"Device": device_name})
68
-
69
- # Training loop
70
- profiler = Profiler(summarize_every=10, disabled=False)
71
- for step, speaker_batch in enumerate(loader, init_step):
72
- profiler.tick("Blocking, waiting for batch (threaded)")
73
-
74
- # Forward pass
75
- inputs = torch.from_numpy(speaker_batch.data).to(device)
76
- sync(device)
77
- profiler.tick("Data to %s" % device)
78
- embeds = model(inputs)
79
- sync(device)
80
- profiler.tick("Forward pass")
81
- embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device)
82
- loss, eer = model.loss(embeds_loss)
83
- sync(loss_device)
84
- profiler.tick("Loss")
85
-
86
- # Backward pass
87
- model.zero_grad()
88
- loss.backward()
89
- profiler.tick("Backward pass")
90
- model.do_gradient_ops()
91
- optimizer.step()
92
- profiler.tick("Parameter update")
93
-
94
- # Update visualizations
95
- # learning_rate = optimizer.param_groups[0]["lr"]
96
- vis.update(loss.item(), eer, step)
97
-
98
- # Draw projections and save them to the backup folder
99
- if umap_every != 0 and step % umap_every == 0:
100
- print("Drawing and saving projections (step %d)" % step)
101
- projection_fpath = model_dir / f"umap_{step:06d}.png"
102
- embeds = embeds.detach().cpu().numpy()
103
- vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath)
104
- vis.save()
105
-
106
- # Overwrite the latest version of the model
107
- if save_every != 0 and step % save_every == 0:
108
- print("Saving the model (step %d)" % step)
109
- torch.save({
110
- "step": step + 1,
111
- "model_state": model.state_dict(),
112
- "optimizer_state": optimizer.state_dict(),
113
- }, state_fpath)
114
-
115
- # Make a backup
116
- if backup_every != 0 and step % backup_every == 0:
117
- print("Making a backup (step %d)" % step)
118
- backup_fpath = model_dir / f"encoder_{step:06d}.bak"
119
- torch.save({
120
- "step": step + 1,
121
- "model_state": model.state_dict(),
122
- "optimizer_state": optimizer.state_dict(),
123
- }, backup_fpath)
124
-
125
- profiler.tick("Extras (visualizations, saving)")
 
encoder/visualizations.py DELETED
@@ -1,179 +0,0 @@
1
- from datetime import datetime
2
- from time import perf_counter as timer
3
-
4
- import numpy as np
5
- import umap
6
- import visdom
7
-
8
- from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
9
-
10
-
11
- colormap = np.array([
12
- [76, 255, 0],
13
- [0, 127, 70],
14
- [255, 0, 0],
15
- [255, 217, 38],
16
- [0, 135, 255],
17
- [165, 0, 165],
18
- [255, 167, 255],
19
- [0, 255, 255],
20
- [255, 96, 38],
21
- [142, 76, 0],
22
- [33, 0, 127],
23
- [0, 0, 0],
24
- [183, 183, 183],
25
- ], dtype=float) / 255
26
-
27
-
28
- class Visualizations:
29
- def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False):
30
- # Tracking data
31
- self.last_update_timestamp = timer()
32
- self.update_every = update_every
33
- self.step_times = []
34
- self.losses = []
35
- self.eers = []
36
- print("Updating the visualizations every %d steps." % update_every)
37
-
38
- # If visdom is disabled TODO: use a better paradigm for that
39
- self.disabled = disabled
40
- if self.disabled:
41
- return
42
-
43
- # Set the environment name
44
- now = str(datetime.now().strftime("%d-%m %Hh%M"))
45
- if env_name is None:
46
- self.env_name = now
47
- else:
48
- self.env_name = "%s (%s)" % (env_name, now)
49
-
50
- # Connect to visdom and open the corresponding window in the browser
51
- try:
52
- self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True)
53
- except ConnectionError:
54
- raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to "
55
- "start it.")
56
- # webbrowser.open("http://localhost:8097/env/" + self.env_name)
57
-
58
- # Create the windows
59
- self.loss_win = None
60
- self.eer_win = None
61
- # self.lr_win = None
62
- self.implementation_win = None
63
- self.projection_win = None
64
- self.implementation_string = ""
65
-
66
- def log_params(self):
67
- if self.disabled:
68
- return
69
- from encoder import params_data
70
- from encoder import params_model
71
- param_string = "<b>Model parameters</b>:<br>"
72
- for param_name in (p for p in dir(params_model) if not p.startswith("__")):
73
- value = getattr(params_model, param_name)
74
- param_string += "\t%s: %s<br>" % (param_name, value)
75
- param_string += "<b>Data parameters</b>:<br>"
76
- for param_name in (p for p in dir(params_data) if not p.startswith("__")):
77
- value = getattr(params_data, param_name)
78
- param_string += "\t%s: %s<br>" % (param_name, value)
79
- self.vis.text(param_string, opts={"title": "Parameters"})
80
-
81
- def log_dataset(self, dataset: SpeakerVerificationDataset):
82
- if self.disabled:
83
- return
84
- dataset_string = ""
85
- dataset_string += "<b>Speakers</b>: %s\n" % len(dataset.speakers)
86
- dataset_string += "\n" + dataset.get_logs()
87
- dataset_string = dataset_string.replace("\n", "<br>")
88
- self.vis.text(dataset_string, opts={"title": "Dataset"})
89
-
90
- def log_implementation(self, params):
91
- if self.disabled:
92
- return
93
- implementation_string = ""
94
- for param, value in params.items():
95
- implementation_string += "<b>%s</b>: %s\n" % (param, value)
96
- implementation_string = implementation_string.replace("\n", "<br>")
97
- self.implementation_string = implementation_string
98
- self.implementation_win = self.vis.text(
99
- implementation_string,
100
- opts={"title": "Training implementation"}
101
- )
102
-
103
- def update(self, loss, eer, step):
104
- # Update the tracking data
105
- now = timer()
106
- self.step_times.append(1000 * (now - self.last_update_timestamp))
107
- self.last_update_timestamp = now
108
- self.losses.append(loss)
109
- self.eers.append(eer)
110
- print(".", end="")
111
-
112
- # Update the plots every <update_every> steps
113
- if step % self.update_every != 0:
114
- return
115
- time_string = "Step time: mean: %5dms std: %5dms" % \
116
- (int(np.mean(self.step_times)), int(np.std(self.step_times)))
117
- print("\nStep %6d Loss: %.4f EER: %.4f %s" %
118
- (step, np.mean(self.losses), np.mean(self.eers), time_string))
119
- if not self.disabled:
120
- self.loss_win = self.vis.line(
121
- [np.mean(self.losses)],
122
- [step],
123
- win=self.loss_win,
124
- update="append" if self.loss_win else None,
125
- opts=dict(
126
- legend=["Avg. loss"],
127
- xlabel="Step",
128
- ylabel="Loss",
129
- title="Loss",
130
- )
131
- )
132
- self.eer_win = self.vis.line(
133
- [np.mean(self.eers)],
134
- [step],
135
- win=self.eer_win,
136
- update="append" if self.eer_win else None,
137
- opts=dict(
138
- legend=["Avg. EER"],
139
- xlabel="Step",
140
- ylabel="EER",
141
- title="Equal error rate"
142
- )
143
- )
144
- if self.implementation_win is not None:
145
- self.vis.text(
146
- self.implementation_string + ("<b>%s</b>" % time_string),
147
- win=self.implementation_win,
148
- opts={"title": "Training implementation"},
149
- )
150
-
151
- # Reset the tracking
152
- self.losses.clear()
153
- self.eers.clear()
154
- self.step_times.clear()
155
-
156
- def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, max_speakers=10):
157
- import matplotlib.pyplot as plt
158
-
159
- max_speakers = min(max_speakers, len(colormap))
160
- embeds = embeds[:max_speakers * utterances_per_speaker]
161
-
162
- n_speakers = len(embeds) // utterances_per_speaker
163
- ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker)
164
- colors = [colormap[i] for i in ground_truth]
165
-
166
- reducer = umap.UMAP()
167
- projected = reducer.fit_transform(embeds)
168
- plt.scatter(projected[:, 0], projected[:, 1], c=colors)
169
- plt.gca().set_aspect("equal", "datalim")
170
- plt.title("UMAP projection (step %d)" % step)
171
- if not self.disabled:
172
- self.projection_win = self.vis.matplot(plt, win=self.projection_win)
173
- if out_fpath is not None:
174
- plt.savefig(out_fpath)
175
- plt.clf()
176
-
177
- def save(self):
178
- if not self.disabled:
179
- self.vis.save([self.env_name])
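
draw_projections also works without a running visdom server when the class is constructed with disabled=True; a sketch with random embeddings standing in for model outputs (requires the umap-learn and matplotlib packages):

    import numpy as np
    from encoder.visualizations import Visualizations

    vis = Visualizations(env_name="offline", disabled=True)      # skips the visdom connection entirely
    embeds = np.random.rand(5 * 10, 256).astype(np.float32)      # 5 speakers x 10 utterances, 256-dim
    vis.draw_projections(embeds, utterances_per_speaker=10, step=0, out_fpath="umap_step0.png")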