TomCallan committed on
Commit aed64b5 · 1 Parent(s): eb11506

Upload 14 files
app.py ADDED
@@ -0,0 +1,14 @@
import gradio as gr

from speaker_recognition import app


def recognition(audio):
    do = app.speaker_recognition()
    return do.run_transform(audio)


demo = gr.Interface(fn=recognition, inputs=["audio"], outputs="text")

demo.launch()
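Note that `recognition()` above builds a fresh `speaker_recognition` object, and therefore reloads the model weights, on every request. A minimal sketch of an equivalent app that loads the model once at startup (same package layout assumed, behaviour otherwise unchanged):

    import gradio as gr

    from speaker_recognition import app

    # Load the model (and download weights.h5 if needed) once, at startup.
    recognizer = app.speaker_recognition()


    def recognition(audio):
        # Return the 512-d embedding produced for the uploaded audio.
        return recognizer.run_transform(audio)


    demo = gr.Interface(fn=recognition, inputs=["audio"], outputs="text")
    demo.launch()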
requirements.txt ADDED
@@ -0,0 +1,2 @@
gdown
numpy
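The two entries above do not cover everything the uploaded modules import. Judging only from the import statements in this commit, a fuller (unpinned, hypothetical) requirements list would look like:

    gradio
    gdown
    numpy
    tensorflow
    librosa
    python_speech_features
    tqdm
    dill
    click
    pandas
    natsort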
speaker_recognition/__init__.py ADDED
File without changes
speaker_recognition/__pycache__/audio.cpython-310.pyc ADDED
Binary file (4.64 kB)
speaker_recognition/app.py ADDED
@@ -0,0 +1,60 @@
import gdown
import random

import numpy as np

from speaker_recognition.audio import read_mfcc
from speaker_recognition.batcher import sample_from_mfcc
from speaker_recognition.constants import SAMPLE_RATE, NUM_FRAMES
from speaker_recognition.conv_models import DeepSpeakerModel
from speaker_recognition.test import batch_cosine_similarity


class speaker_recognition:
    def __init__(self):

        np.random.seed(123)
        random.seed(123)

        self.speakers = {}
        self.weights = ""
        self.by_name = True

        self.SAMPLE_RATE = SAMPLE_RATE
        self.NUM_FRAMES = NUM_FRAMES

        self.spin_up()

    def spin_up(self):
        # Download the pre-trained weights once, then load them into the model.
        if self.weights == "":
            output = "weights.h5"
            gdown.download("https://drive.google.com/uc?id=1F9NvdrarWZNktdX9KlRYWWHDwRkip_aP", output, quiet=False)
            self.weights = "weights.h5"

        self.model = DeepSpeakerModel()
        self.model.m.load_weights(self.weights, by_name=True)

    def create_speaker(self, data, id=""):
        id = id if id != "" else f"{len(self.speakers)}"
        self.speakers[id] = data
        return id

    def check_speakers(self, data, id="", threshold=0.5):
        # Compare the new embedding against every enrolled speaker and keep the best match.
        best_speaker = ""
        best_score = 0
        for speaker in self.speakers:
            score = batch_cosine_similarity(self.speakers[speaker], data)[0]
            if score > threshold and score > best_score:
                best_score = score
                best_speaker = speaker
        if best_score == 0:
            id = self.create_speaker(data, id)
            return f"created new speaker : {id}"

        return (best_speaker, best_score)

    def run_transform(self, audio, pcm=False):
        data = sample_from_mfcc(read_mfcc(audio, self.SAMPLE_RATE), self.NUM_FRAMES)
        data = self.model.m.predict(np.expand_dims(data, axis=0))
        return data
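For reference, a minimal enrolment/verification sketch using the class above (the wav filenames are hypothetical and this flow is not wired into the Gradio app):

    # Embed two clips, enrol the first, then verify the second against the enrolled set.
    rec = speaker_recognition()
    enrol = rec.run_transform("speaker1_sample1.wav")
    rec.create_speaker(enrol, id="alice")
    probe = rec.run_transform("speaker1_sample2.wav")
    print(rec.check_speakers(probe))  # -> ("alice", score) if cosine similarity > 0.5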
speaker_recognition/audio.py ADDED
@@ -0,0 +1,121 @@
import logging
import os
from collections import defaultdict
from pathlib import Path

import librosa
import numpy as np
from python_speech_features import fbank
from tqdm import tqdm

from speaker_recognition.constants import SAMPLE_RATE, NUM_FBANKS
from speaker_recognition.utils import find_files, ensures_dir

logger = logging.getLogger(__name__)


def read_mfcc(input_filename, sample_rate):
    audio = Audio.read(input_filename, sample_rate)
    energy = np.abs(audio)
    silence_threshold = np.percentile(energy, 95)
    offsets = np.where(energy > silence_threshold)[0]
    # left_blank_duration_ms = (1000.0 * offsets[0]) // self.sample_rate  # frame_id to duration (ms)
    # right_blank_duration_ms = (1000.0 * (len(audio) - offsets[-1])) // self.sample_rate
    # TODO: could use trim_silence() here or a better VAD.
    audio_voice_only = audio[offsets[0]:offsets[-1]]
    mfcc = mfcc_fbank(audio_voice_only, sample_rate)
    return mfcc


def extract_speaker_and_utterance_ids(filename: str):  # LIBRI.
    # 'audio/dev-other/116/288045/116-288045-0000.flac'
    speaker, _, basename = Path(filename).parts[-3:]
    utterance = os.path.splitext(basename.split('-', 1)[-1])[0]
    assert basename.split('-')[0] == speaker
    return speaker, utterance


class Audio:

    def __init__(self, cache_dir: str, audio_dir: str = None, sample_rate: int = SAMPLE_RATE, ext='flac'):
        self.ext = ext
        self.cache_dir = os.path.join(cache_dir, 'audio-fbanks')
        ensures_dir(self.cache_dir)
        if audio_dir is not None:
            self.build_cache(os.path.expanduser(audio_dir), sample_rate)
        self.speakers_to_utterances = defaultdict(dict)
        for cache_file in find_files(self.cache_dir, ext='npy'):
            # /path/to/speaker_utterance.npy
            speaker_id, utterance_id = Path(cache_file).stem.split('_')
            self.speakers_to_utterances[speaker_id][utterance_id] = cache_file

    @property
    def speaker_ids(self):
        return sorted(self.speakers_to_utterances)

    @staticmethod
    def trim_silence(audio, threshold):
        """Removes silence at the beginning and end of a sample."""
        # pylint: disable=E1121
        energy = librosa.feature.rms(audio)
        frames = np.nonzero(np.array(energy > threshold))
        indices = librosa.core.frames_to_samples(frames)[1]

        # Note: indices can be an empty array, if the whole audio was silence.
        audio_trim = audio[0:0]
        left_blank = audio[0:0]
        right_blank = audio[0:0]
        if indices.size:
            audio_trim = audio[indices[0]:indices[-1]]
            left_blank = audio[:indices[0]]  # slice before.
            right_blank = audio[indices[-1]:]  # slice after.
        return audio_trim, left_blank, right_blank

    @staticmethod
    def read(filename, sample_rate=SAMPLE_RATE):
        audio, sr = librosa.load(filename, sr=sample_rate, mono=True, dtype=np.float32)
        assert sr == sample_rate
        return audio

    def build_cache(self, audio_dir, sample_rate):
        logger.info(f'audio_dir: {audio_dir}.')
        logger.info(f'sample_rate: {sample_rate:,} hz.')
        audio_files = find_files(audio_dir, ext=self.ext)
        audio_files_count = len(audio_files)
        assert audio_files_count != 0, f'Could not find any {self.ext} files in {audio_dir}.'
        logger.info(f'Found {audio_files_count:,} files in {audio_dir}.')
        with tqdm(audio_files) as bar:
            for audio_filename in bar:
                bar.set_description(audio_filename)
                self.cache_audio_file(audio_filename, sample_rate)

    def cache_audio_file(self, input_filename, sample_rate):
        sp, utt = extract_speaker_and_utterance_ids(input_filename)
        cache_filename = os.path.join(self.cache_dir, f'{sp}_{utt}.npy')
        if not os.path.isfile(cache_filename):
            try:
                mfcc = read_mfcc(input_filename, sample_rate)
                np.save(cache_filename, mfcc)
            except librosa.util.exceptions.ParameterError as e:
                logger.error(e)


def pad_mfcc(mfcc, max_length):  # num_frames, nfilt=64.
    if len(mfcc) < max_length:
        mfcc = np.vstack((mfcc, np.tile(np.zeros(mfcc.shape[1]), (max_length - len(mfcc), 1))))
    return mfcc


def mfcc_fbank(signal: np.array, sample_rate: int):  # 1D signal array.
    # Returns MFCC with shape (num_frames, n_filters, 3).
    filter_banks, energies = fbank(signal, samplerate=sample_rate, nfilt=NUM_FBANKS)
    frames_features = normalize_frames(filter_banks)
    # delta_1 = delta(filter_banks, N=1)
    # delta_2 = delta(delta_1, N=1)
    # frames_features = np.transpose(np.stack([filter_banks, delta_1, delta_2]), (1, 2, 0))
    return np.array(frames_features, dtype=np.float32)  # Float32 precision is enough here.


def normalize_frames(m, epsilon=1e-12):
    return [(v - np.mean(v)) / max(np.std(v), epsilon) for v in m]
speaker_recognition/batcher.py ADDED
@@ -0,0 +1,505 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ from collections import deque, Counter
5
+ from random import choice
6
+ from time import time
7
+
8
+ import dill
9
+ import numpy as np
10
+ from tqdm import tqdm
11
+
12
+ from speaker_recognition.audio import pad_mfcc, Audio
13
+ from speaker_recognition.constants import NUM_FRAMES, NUM_FBANKS
14
+ from speaker_recognition.conv_models import DeepSpeakerModel
15
+ from speaker_recognition.utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ def extract_speaker(utt_file):
21
+ return utt_file.split('/')[-1].split('_')[0]
22
+
23
+
24
+ def sample_from_mfcc(mfcc, max_length):
25
+ if mfcc.shape[0] >= max_length:
26
+ r = choice(range(0, len(mfcc) - max_length + 1))
27
+ s = mfcc[r:r + max_length]
28
+ else:
29
+ s = pad_mfcc(mfcc, max_length)
30
+ return np.expand_dims(s, axis=-1)
31
+
32
+
33
+ def sample_from_mfcc_file(utterance_file, max_length):
34
+ mfcc = np.load(utterance_file)
35
+ return sample_from_mfcc(mfcc, max_length)
36
+
37
+
38
+ class KerasFormatConverter:
39
+
40
+ def __init__(self, working_dir, load_test_only=False):
41
+ self.working_dir = working_dir
42
+ self.output_dir = os.path.join(self.working_dir, 'keras-inputs')
43
+ ensures_dir(self.output_dir)
44
+ self.categorical_speakers = load_pickle(os.path.join(self.output_dir, 'categorical_speakers.pkl'))
45
+ if not load_test_only:
46
+ self.kx_train = load_npy(os.path.join(self.output_dir, 'kx_train.npy'))
47
+ self.ky_train = load_npy(os.path.join(self.output_dir, 'ky_train.npy'))
48
+ self.kx_test = load_npy(os.path.join(self.output_dir, 'kx_test.npy'))
49
+ self.ky_test = load_npy(os.path.join(self.output_dir, 'ky_test.npy'))
50
+ self.audio = Audio(cache_dir=self.working_dir, audio_dir=None)
51
+ if self.categorical_speakers is None:
52
+ self.categorical_speakers = SparseCategoricalSpeakers(self.audio.speaker_ids)
53
+
54
+ def persist_to_disk(self):
55
+ with open(os.path.join(self.output_dir, 'categorical_speakers.pkl'), 'wb') as w:
56
+ dill.dump(self.categorical_speakers, w)
57
+ np.save(os.path.join(self.output_dir, 'kx_train.npy'), self.kx_train)
58
+ np.save(os.path.join(self.output_dir, 'kx_test.npy'), self.kx_test)
59
+ np.save(os.path.join(self.output_dir, 'ky_train.npy'), self.ky_train)
60
+ np.save(os.path.join(self.output_dir, 'ky_test.npy'), self.ky_test)
61
+
62
+ def generate_per_phase(self, max_length=NUM_FRAMES, num_per_speaker=3000, is_test=False):
63
+ # train OR test.
64
+ num_speakers = len(self.audio.speaker_ids)
65
+ sp_to_utt = train_test_sp_to_utt(self.audio, is_test)
66
+
67
+ # 64 fbanks 1 channel(s).
68
+ # float32
69
+ kx = np.zeros((num_speakers * num_per_speaker, max_length, NUM_FBANKS, 1), dtype=np.float32)
70
+ ky = np.zeros((num_speakers * num_per_speaker, 1), dtype=np.float32)
71
+
72
+ desc = f'Converting to Keras format [{"test" if is_test else "train"}]'
73
+ for i, speaker_id in enumerate(tqdm(self.audio.speaker_ids, desc=desc)):
74
+ utterances_files = sp_to_utt[speaker_id]
75
+ for j, utterance_file in enumerate(np.random.choice(utterances_files, size=num_per_speaker, replace=True)):
76
+ self.load_into_mat(utterance_file, self.categorical_speakers, speaker_id, max_length, kx, ky,
77
+ i * num_per_speaker + j)
78
+ return kx, ky
79
+
80
+ def generate(self, max_length=NUM_FRAMES, counts_per_speaker=(3000, 500)):
81
+ kx_train, ky_train = self.generate_per_phase(max_length, counts_per_speaker[0], is_test=False)
82
+ kx_test, ky_test = self.generate_per_phase(max_length, counts_per_speaker[1], is_test=True)
83
+ logger.info(f'kx_train.shape = {kx_train.shape}')
84
+ logger.info(f'ky_train.shape = {ky_train.shape}')
85
+ logger.info(f'kx_test.shape = {kx_test.shape}')
86
+ logger.info(f'ky_test.shape = {ky_test.shape}')
87
+ self.kx_train, self.ky_train, self.kx_test, self.ky_test = kx_train, ky_train, kx_test, ky_test
88
+
89
+ @staticmethod
90
+ def load_into_mat(utterance_file, categorical_speakers, speaker_id, max_length, kx, ky, i):
91
+ kx[i] = sample_from_mfcc_file(utterance_file, max_length)
92
+ ky[i] = categorical_speakers.get_index(speaker_id)
93
+
94
+
95
+ class SparseCategoricalSpeakers:
96
+
97
+ def __init__(self, speakers_list):
98
+ self.speaker_ids = sorted(speakers_list)
99
+ assert len(set(self.speaker_ids)) == len(self.speaker_ids) # all unique.
100
+ self.map = dict(zip(self.speaker_ids, range(len(self.speaker_ids))))
101
+
102
+ def get_index(self, speaker_id):
103
+ return self.map[speaker_id]
104
+
105
+
106
+ class OneHotSpeakers:
107
+
108
+ def __init__(self, speakers_list):
109
+ # pylint: disable=E0611,E0401
110
+ from tensorflow.keras.utils import to_categorical
111
+ self.speaker_ids = sorted(speakers_list)
112
+ self.int_speaker_ids = list(range(len(self.speaker_ids)))
113
+ self.map_speakers_to_index = dict([(k, v) for (k, v) in zip(self.speaker_ids, self.int_speaker_ids)])
114
+ self.map_index_to_speakers = dict([(v, k) for (k, v) in zip(self.speaker_ids, self.int_speaker_ids)])
115
+ self.speaker_categories = to_categorical(self.int_speaker_ids, num_classes=len(self.speaker_ids))
116
+
117
+ def get_speaker_from_index(self, index):
118
+ return self.map_index_to_speakers[index]
119
+
120
+ def get_one_hot(self, speaker_id):
121
+ index = self.map_speakers_to_index[speaker_id]
122
+ return self.speaker_categories[index]
123
+
124
+
125
+ class LazyTripletBatcher:
126
+ def __init__(self, working_dir: str, max_length: int, model: DeepSpeakerModel):
127
+ self.working_dir = working_dir
128
+ self.audio = Audio(cache_dir=working_dir)
129
+ logger.info(f'Picking audio from {working_dir}.')
130
+ self.sp_to_utt_train = train_test_sp_to_utt(self.audio, is_test=False)
131
+ self.sp_to_utt_test = train_test_sp_to_utt(self.audio, is_test=True)
132
+ self.max_length = max_length
133
+ self.model = model
134
+ self.nb_per_speaker = 2
135
+ self.nb_speakers = 640
136
+ self.history_length = 4
137
+ self.history_every = 100 # batches.
138
+ self.total_history_length = self.nb_speakers * self.nb_per_speaker * self.history_length # 25,600
139
+ self.metadata_train_speakers = Counter()
140
+ self.metadata_output_file = os.path.join(self.working_dir, 'debug_batcher.json')
141
+
142
+ self.history_embeddings_train = deque(maxlen=self.total_history_length)
143
+ self.history_utterances_train = deque(maxlen=self.total_history_length)
144
+ self.history_model_inputs_train = deque(maxlen=self.total_history_length)
145
+
146
+ self.history_embeddings = None
147
+ self.history_utterances = None
148
+ self.history_model_inputs = None
149
+
150
+ self.batch_count = 0
151
+ for _ in tqdm(range(self.history_length), desc='Initializing the batcher'): # init history.
152
+ self.update_triplets_history()
153
+
154
+ def update_triplets_history(self):
155
+ model_inputs = []
156
+ speakers = list(self.audio.speakers_to_utterances.keys())
157
+ np.random.shuffle(speakers)
158
+ selected_speakers = speakers[: self.nb_speakers]
159
+ embeddings_utterances = []
160
+ for speaker_id in selected_speakers:
161
+ train_utterances = self.sp_to_utt_train[speaker_id]
162
+ for selected_utterance in np.random.choice(a=train_utterances, size=self.nb_per_speaker, replace=False):
163
+ mfcc = sample_from_mfcc_file(selected_utterance, self.max_length)
164
+ embeddings_utterances.append(selected_utterance)
165
+ model_inputs.append(mfcc)
166
+ embeddings = self.model.m.predict(np.array(model_inputs))
167
+ assert embeddings.shape[-1] == 512
168
+ embeddings = np.reshape(embeddings, (len(selected_speakers), self.nb_per_speaker, 512))
169
+ self.history_embeddings_train.extend(list(embeddings.reshape((-1, 512))))
170
+ self.history_utterances_train.extend(embeddings_utterances)
171
+ self.history_model_inputs_train.extend(model_inputs)
172
+
173
+ # reason: can't index a deque with a np.array.
174
+ self.history_embeddings = np.array(self.history_embeddings_train)
175
+ self.history_utterances = np.array(self.history_utterances_train)
176
+ self.history_model_inputs = np.array(self.history_model_inputs_train)
177
+
178
+ with open(self.metadata_output_file, 'w') as w:
179
+ json.dump(obj=dict(self.metadata_train_speakers), fp=w, indent=2)
180
+
181
+ def get_batch(self, batch_size, is_test=False):
182
+ return self.get_batch_test(batch_size) if is_test else self.get_random_batch(batch_size, is_test=False)
183
+
184
+ def get_batch_test(self, batch_size):
185
+ return self.get_random_batch(batch_size, is_test=True)
186
+
187
+ def get_random_batch(self, batch_size, is_test=False):
188
+ sp_to_utt = self.sp_to_utt_test if is_test else self.sp_to_utt_train
189
+ speakers = list(self.audio.speakers_to_utterances.keys())
190
+ anchor_speakers = np.random.choice(speakers, size=batch_size // 3, replace=False)
191
+
192
+ anchor_utterances = []
193
+ positive_utterances = []
194
+ negative_utterances = []
195
+ for anchor_speaker in anchor_speakers:
196
+ negative_speaker = np.random.choice(list(set(speakers) - {anchor_speaker}), size=1)[0]
197
+ assert negative_speaker != anchor_speaker
198
+ pos_utterances = np.random.choice(sp_to_utt[anchor_speaker], 2, replace=False)
199
+ neg_utterance = np.random.choice(sp_to_utt[negative_speaker], 1, replace=True)[0]
200
+ anchor_utterances.append(pos_utterances[0])
201
+ positive_utterances.append(pos_utterances[1])
202
+ negative_utterances.append(neg_utterance)
203
+
204
+ # anchor and positive should have difference utterances (but same speaker!).
205
+ anc_pos = np.array([positive_utterances, anchor_utterances])
206
+ assert np.all(anc_pos[0, :] != anc_pos[1, :])
207
+ assert np.all(np.array([extract_speaker(s) for s in anc_pos[0, :]]) == np.array(
208
+ [extract_speaker(s) for s in anc_pos[1, :]]))
209
+
210
+ pos_neg = np.array([positive_utterances, negative_utterances])
211
+ assert np.all(pos_neg[0, :] != pos_neg[1, :])
212
+ assert np.all(np.array([extract_speaker(s) for s in pos_neg[0, :]]) != np.array(
213
+ [extract_speaker(s) for s in pos_neg[1, :]]))
214
+
215
+ batch_x = np.vstack([
216
+ [sample_from_mfcc_file(u, self.max_length) for u in anchor_utterances],
217
+ [sample_from_mfcc_file(u, self.max_length) for u in positive_utterances],
218
+ [sample_from_mfcc_file(u, self.max_length) for u in negative_utterances]
219
+ ])
220
+
221
+ batch_y = np.zeros(shape=(len(batch_x), 1)) # dummy. sparse softmax needs something.
222
+ return batch_x, batch_y
223
+
224
+ def get_batch_train(self, batch_size):
225
+ from speaker_recognition.test import batch_cosine_similarity
226
+ # s1 = time()
227
+ self.batch_count += 1
228
+ if self.batch_count % self.history_every == 0:
229
+ self.update_triplets_history()
230
+
231
+ all_indexes = range(len(self.history_embeddings_train))
232
+ anchor_indexes = np.random.choice(a=all_indexes, size=batch_size // 3, replace=False)
233
+
234
+ # s2 = time()
235
+ similar_negative_indexes = []
236
+ dissimilar_positive_indexes = []
237
+ # could be made parallel.
238
+ for anchor_index in anchor_indexes:
239
+ # s21 = time()
240
+ anchor_embedding = self.history_embeddings[anchor_index]
241
+ anchor_speaker = extract_speaker(self.history_utterances[anchor_index])
242
+
243
+ # why self.nb_speakers // 2? just random. because it is fast. otherwise it's too much.
244
+ negative_indexes = [j for (j, a) in enumerate(self.history_utterances)
245
+ if extract_speaker(a) != anchor_speaker]
246
+ negative_indexes = np.random.choice(negative_indexes, size=self.nb_speakers // 2)
247
+
248
+ # s22 = time()
249
+
250
+ anchor_embedding_tile = [anchor_embedding] * len(negative_indexes)
251
+ anchor_cos = batch_cosine_similarity(anchor_embedding_tile, self.history_embeddings[negative_indexes])
252
+
253
+ # s23 = time()
254
+ similar_negative_index = negative_indexes[np.argsort(anchor_cos)[-1]] # [-1:]
255
+ similar_negative_indexes.append(similar_negative_index)
256
+
257
+ # s24 = time()
258
+ positive_indexes = [j for (j, a) in enumerate(self.history_utterances) if
259
+ extract_speaker(a) == anchor_speaker and j != anchor_index]
260
+ # s25 = time()
261
+ anchor_embedding_tile = [anchor_embedding] * len(positive_indexes)
262
+ # s26 = time()
263
+ anchor_cos = batch_cosine_similarity(anchor_embedding_tile, self.history_embeddings[positive_indexes])
264
+ dissimilar_positive_index = positive_indexes[np.argsort(anchor_cos)[0]] # [:1]
265
+ dissimilar_positive_indexes.append(dissimilar_positive_index)
266
+ # s27 = time()
267
+
268
+ # s3 = time()
269
+ batch_x = np.vstack([
270
+ self.history_model_inputs[anchor_indexes],
271
+ self.history_model_inputs[dissimilar_positive_indexes],
272
+ self.history_model_inputs[similar_negative_indexes]
273
+ ])
274
+
275
+ # s4 = time()
276
+
277
+ # for anchor, positive, negative in zip(history_utterances[anchor_indexes],
278
+ # history_utterances[dissimilar_positive_indexes],
279
+ # history_utterances[similar_negative_indexes]):
280
+ # print('anchor', os.path.basename(anchor),
281
+ # 'positive', os.path.basename(positive),
282
+ # 'negative', os.path.basename(negative))
283
+ # print('_' * 80)
284
+
285
+ # assert utterances as well positive != anchor.
286
+ anchor_speakers = [extract_speaker(a) for a in self.history_utterances[anchor_indexes]]
287
+ positive_speakers = [extract_speaker(a) for a in self.history_utterances[dissimilar_positive_indexes]]
288
+ negative_speakers = [extract_speaker(a) for a in self.history_utterances[similar_negative_indexes]]
289
+
290
+ assert len(anchor_indexes) == len(dissimilar_positive_indexes)
291
+ assert len(similar_negative_indexes) == len(dissimilar_positive_indexes)
292
+ assert list(self.history_utterances[dissimilar_positive_indexes]) != list(
293
+ self.history_utterances[anchor_indexes])
294
+ assert anchor_speakers == positive_speakers
295
+ assert negative_speakers != anchor_speakers
296
+
297
+ batch_y = np.zeros(shape=(len(batch_x), 1)) # dummy. sparse softmax needs something.
298
+
299
+ for a in anchor_speakers:
300
+ self.metadata_train_speakers[a] += 1
301
+ for a in positive_speakers:
302
+ self.metadata_train_speakers[a] += 1
303
+ for a in negative_speakers:
304
+ self.metadata_train_speakers[a] += 1
305
+
306
+ # s5 = time()
307
+ # print('1-2', s2 - s1)
308
+ # print('2-3', s3 - s2)
309
+ # print('3-4', s4 - s3)
310
+ # print('4-5', s5 - s4)
311
+ # print('21-22', (s22 - s21) * (batch_size // 3))
312
+ # print('22-23', (s23 - s22) * (batch_size // 3))
313
+ # print('23-24', (s24 - s23) * (batch_size // 3))
314
+ # print('24-25', (s25 - s24) * (batch_size // 3))
315
+ # print('25-26', (s26 - s25) * (batch_size // 3))
316
+ # print('26-27', (s27 - s26) * (batch_size // 3))
317
+
318
+ return batch_x, batch_y
319
+
320
+ def get_speaker_verification_data(self, anchor_speaker, num_different_speakers):
321
+ speakers = list(self.audio.speakers_to_utterances.keys())
322
+ anchor_utterances = []
323
+ positive_utterances = []
324
+ negative_utterances = []
325
+ negative_speakers = np.random.choice(list(set(speakers) - {anchor_speaker}), size=num_different_speakers)
326
+ assert [negative_speaker != anchor_speaker for negative_speaker in negative_speakers]
327
+ pos_utterances = np.random.choice(self.sp_to_utt_test[anchor_speaker], 2, replace=False)
328
+ neg_utterances = [np.random.choice(self.sp_to_utt_test[neg], 1, replace=True)[0] for neg in negative_speakers]
329
+ anchor_utterances.append(pos_utterances[0])
330
+ positive_utterances.append(pos_utterances[1])
331
+ negative_utterances.extend(neg_utterances)
332
+
333
+ # anchor and positive should have difference utterances (but same speaker!).
334
+ anc_pos = np.array([positive_utterances, anchor_utterances])
335
+ assert np.all(anc_pos[0, :] != anc_pos[1, :])
336
+ assert np.all(np.array([extract_speaker(s) for s in anc_pos[0, :]]) == np.array(
337
+ [extract_speaker(s) for s in anc_pos[1, :]]))
338
+
339
+ batch_x = np.vstack([
340
+ [sample_from_mfcc_file(u, self.max_length) for u in anchor_utterances],
341
+ [sample_from_mfcc_file(u, self.max_length) for u in positive_utterances],
342
+ [sample_from_mfcc_file(u, self.max_length) for u in negative_utterances]
343
+ ])
344
+
345
+ batch_y = np.zeros(shape=(len(batch_x), 1)) # dummy. sparse softmax needs something.
346
+ return batch_x, batch_y
347
+
348
+
349
+ class TripletBatcher:
350
+
351
+ def __init__(self, kx_train, ky_train, kx_test, ky_test):
352
+ self.kx_train = kx_train
353
+ self.ky_train = ky_train
354
+ self.kx_test = kx_test
355
+ self.ky_test = ky_test
356
+ speakers_list = sorted(set(ky_train.argmax(axis=1)))
357
+ num_different_speakers = len(speakers_list)
358
+ assert speakers_list == sorted(set(ky_test.argmax(axis=1))) # train speakers = test speakers.
359
+ assert speakers_list == list(range(num_different_speakers))
360
+ self.train_indices_per_speaker = {}
361
+ self.test_indices_per_speaker = {}
362
+
363
+ for speaker_id in speakers_list:
364
+ self.train_indices_per_speaker[speaker_id] = list(np.where(ky_train.argmax(axis=1) == speaker_id)[0])
365
+ self.test_indices_per_speaker[speaker_id] = list(np.where(ky_test.argmax(axis=1) == speaker_id)[0])
366
+
367
+ # check.
368
+ # print(sorted(sum([v for v in self.train_indices_per_speaker.values()], [])))
369
+ # print(range(len(ky_train)))
370
+ assert sorted(sum([v for v in self.train_indices_per_speaker.values()], [])) == sorted(range(len(ky_train)))
371
+ assert sorted(sum([v for v in self.test_indices_per_speaker.values()], [])) == sorted(range(len(ky_test)))
372
+ self.speakers_list = speakers_list
373
+
374
+ def select_speaker_data(self, speaker, n, is_test):
375
+ x = self.kx_test if is_test else self.kx_train
376
+ indices_per_speaker = self.test_indices_per_speaker if is_test else self.train_indices_per_speaker
377
+ indices = np.random.choice(indices_per_speaker[speaker], size=n)
378
+ return x[indices]
379
+
380
+ def get_batch(self, batch_size, is_test=False):
381
+ # y = self.ky_test if is_test else self.ky_train
382
+
383
+ two_different_speakers = np.random.choice(self.speakers_list, size=2, replace=False)
384
+ anchor_positive_speaker = two_different_speakers[0]
385
+ negative_speaker = two_different_speakers[1]
386
+ assert negative_speaker != anchor_positive_speaker
387
+
388
+ batch_x = np.vstack([
389
+ self.select_speaker_data(anchor_positive_speaker, batch_size // 3, is_test),
390
+ self.select_speaker_data(anchor_positive_speaker, batch_size // 3, is_test),
391
+ self.select_speaker_data(negative_speaker, batch_size // 3, is_test)
392
+ ])
393
+
394
+ batch_y = np.zeros(shape=(len(batch_x), len(self.speakers_list)))
395
+ return batch_x, batch_y
396
+
397
+
398
+ class TripletBatcherMiner(TripletBatcher):
399
+
400
+ def __init__(self, kx_train, ky_train, kx_test, ky_test, model: DeepSpeakerModel):
401
+ super().__init__(kx_train, ky_train, kx_test, ky_test)
402
+ self.model = model
403
+ self.num_evaluations_to_find_best_batch = 10
404
+
405
+ def get_batch(self, batch_size, is_test=False):
406
+ if is_test:
407
+ return super().get_batch(batch_size, is_test)
408
+ max_loss = 0
409
+ max_batch = None, None
410
+ for i in range(self.num_evaluations_to_find_best_batch):
411
+ bx, by = super().get_batch(batch_size, is_test=False) # only train here.
412
+ loss = self.model.m.evaluate(bx, by, batch_size=batch_size, verbose=0)
413
+ if loss > max_loss:
414
+ max_loss = loss
415
+ max_batch = bx, by
416
+ return max_batch
417
+
418
+
419
+ class TripletBatcherSelectHardNegatives(TripletBatcher):
420
+
421
+ def __init__(self, kx_train, ky_train, kx_test, ky_test, model: DeepSpeakerModel):
422
+ super().__init__(kx_train, ky_train, kx_test, ky_test)
423
+ self.model = model
424
+
425
+ def get_batch(self, batch_size, is_test=False, predict=None):
426
+ if predict is None:
427
+ predict = self.model.m.predict
428
+ from speaker_recognition.test import batch_cosine_similarity
429
+ num_triplets = batch_size // 3
430
+ inputs = []
431
+ k = 2 # do not change this.
432
+ for speaker in self.speakers_list:
433
+ inputs.append(self.select_speaker_data(speaker, n=k, is_test=is_test))
434
+ inputs = np.array(inputs) # num_speakers * [k, num_frames, num_fbanks, 1].
435
+ embeddings = predict(np.vstack(inputs))
436
+ assert embeddings.shape[-1] == 512
437
+ # (speaker, utterance, 512)
438
+ embeddings = np.reshape(embeddings, (len(self.speakers_list), k, 512))
439
+ cs = batch_cosine_similarity(embeddings[:, 0], embeddings[:, 1])
440
+ arg_sort = np.argsort(cs)
441
+ assert len(arg_sort) > num_triplets
442
+ anchor_speakers = arg_sort[0:num_triplets]
443
+
444
+ anchor_embeddings = embeddings[anchor_speakers, 0]
445
+ negative_speakers = sorted(set(self.speakers_list) - set(anchor_speakers))
446
+ negative_embeddings = embeddings[negative_speakers, 0]
447
+
448
+ selected_negative_speakers = []
449
+ for anchor_embedding in anchor_embeddings:
450
+ cs_negative = [batch_cosine_similarity([anchor_embedding], neg) for neg in negative_embeddings]
451
+ selected_negative_speakers.append(negative_speakers[int(np.argmax(cs_negative))])
452
+
453
+ # anchor with frame 0.
454
+ # positive with frame 1.
455
+ # negative with frame 0.
456
+ assert len(set(selected_negative_speakers).intersection(anchor_speakers)) == 0
457
+ negative = inputs[selected_negative_speakers, 0]
458
+ positive = inputs[anchor_speakers, 1]
459
+ anchor = inputs[anchor_speakers, 0]
460
+ batch_x = np.vstack([anchor, positive, negative])
461
+ batch_y = np.zeros(shape=(len(batch_x), len(self.speakers_list)))
462
+ return batch_x, batch_y
463
+
464
+
465
+ class TripletEvaluator:
466
+
467
+ def __init__(self, kx_test, ky_test):
468
+ self.kx_test = kx_test
469
+ self.ky_test = ky_test
470
+ speakers_list = sorted(set(ky_test.argmax(axis=1)))
471
+ num_different_speakers = len(speakers_list)
472
+ assert speakers_list == list(range(num_different_speakers))
473
+ self.test_indices_per_speaker = {}
474
+ for speaker_id in speakers_list:
475
+ self.test_indices_per_speaker[speaker_id] = list(np.where(ky_test.argmax(axis=1) == speaker_id)[0])
476
+ assert sorted(sum([v for v in self.test_indices_per_speaker.values()], [])) == sorted(range(len(ky_test)))
477
+ self.speakers_list = speakers_list
478
+
479
+ def _select_speaker_data(self, speaker):
480
+ indices = np.random.choice(self.test_indices_per_speaker[speaker], size=1)
481
+ return self.kx_test[indices]
482
+
483
+ def get_speaker_verification_data(self, positive_speaker, num_different_speakers):
484
+ all_negative_speakers = list(set(self.speakers_list) - {positive_speaker})
485
+ assert len(self.speakers_list) - 1 == len(all_negative_speakers)
486
+ negative_speakers = np.random.choice(all_negative_speakers, size=num_different_speakers, replace=False)
487
+ assert positive_speaker not in negative_speakers
488
+ anchor = self._select_speaker_data(positive_speaker)
489
+ positive = self._select_speaker_data(positive_speaker)
490
+ data = [anchor, positive]
491
+ data.extend([self._select_speaker_data(n) for n in negative_speakers])
492
+ return np.vstack(data)
493
+
494
+
495
+ if __name__ == '__main__':
496
+ np.random.seed(123)
497
+ ltb = LazyTripletBatcher(working_dir='/Users/premy/deep-speaker/',
498
+ max_length=NUM_FRAMES,
499
+ model=DeepSpeakerModel())
500
+ for i in range(1000):
501
+ print(i)
502
+ start = time()
503
+ ltb.get_batch_train(batch_size=9)
504
+ print(time() - start)
505
+ # ltb.get_batch(batch_size=96)
speaker_recognition/constants.py ADDED
@@ -0,0 +1,18 @@
# Constants.

SAMPLE_RATE = 16000  # not higher than that, otherwise we may have errors when computing the fbanks.

# Train/Test sets share the same speakers. They contain different utterances.
# 0.8 means 20% of the utterances of each speaker will be held out and placed in the test set.
TRAIN_TEST_RATIO = 0.8

CHECKPOINTS_SOFTMAX_DIR = 'checkpoints-softmax'

CHECKPOINTS_TRIPLET_DIR = 'checkpoints-triplets'

BATCH_SIZE = 32 * 3  # has to be a multiple of 3.

# Input to the model will be a 4D image: (batch_size, num_frames, num_fbanks, 3)
# Where the 3 channels are: FBANK, DIFF(FBANK), DIFF(DIFF(FBANK)).
NUM_FRAMES = 160  # 1 second ~ 100 frames with default params winlen=0.025, winstep=0.01
NUM_FBANKS = 64
speaker_recognition/conv_models.py ADDED
@@ -0,0 +1,296 @@
1
+ import logging
2
+ import os
3
+
4
+ import numpy as np
5
+ import tensorflow as tf
6
+ # pylint: disable=E0611,E0401
7
+ import tensorflow.keras.backend as K
8
+ # pylint: disable=E0611,E0401
9
+ from tensorflow.keras import layers, regularizers
10
+ # pylint: disable=E0611,E0401
11
+ from tensorflow.keras.layers import (
12
+ BatchNormalization,
13
+ Conv2D,
14
+ Dense,
15
+ Dropout,
16
+ Input,
17
+ Lambda,
18
+ Reshape,
19
+ )
20
+ # pylint: disable=E0611,E0401
21
+ from tensorflow.keras.models import Model
22
+ # pylint: disable=E0611,E0401
23
+ from tensorflow.keras.optimizers import Adam
24
+
25
+ from speaker_recognition.constants import NUM_FBANKS, SAMPLE_RATE, NUM_FRAMES
26
+ from speaker_recognition.triplet_loss import deep_speaker_loss
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ @tf.function
32
+ def tf_normalize(data, ndims, eps=0, adjusted=False):
33
+ data = tf.convert_to_tensor(data, name='data')
34
+
35
+ reduce_dims = [-i - 1 for i in range(ndims)]
36
+ # pylint: disable=E1123,E1120
37
+ data = tf.cast(data, dtype=tf.dtypes.float32)
38
+ data_num = tf.reduce_prod(data.shape[-ndims:])
39
+ data_mean = tf.reduce_mean(data, axis=reduce_dims, keepdims=True)
40
+
41
+ # Apply a minimum normalization that protects us against uniform images.
42
+ stddev = tf.math.reduce_std(data, axis=reduce_dims, keepdims=True)
43
+ adjusted_stddev = stddev
44
+ if adjusted:
45
+ min_stddev = tf.math.rsqrt(tf.cast(data_num, tf.dtypes.float32))
46
+ eps = tf.maximum(eps, min_stddev)
47
+ if eps > 0:
48
+ adjusted_stddev = tf.maximum(adjusted_stddev, eps)
49
+
50
+ return (data - data_mean) / adjusted_stddev
51
+
52
+
53
+ @tf.function
54
+ def tf_fbank(samples):
55
+ """
56
+ Compute Mel-filterbank energy features from an audio signal.
57
+ See python_speech_features.fbank
58
+ """
59
+ frame_length = int(0.025 * SAMPLE_RATE)
60
+ frame_step = int(0.01 * SAMPLE_RATE)
61
+ fft_length = 512
62
+ fft_bins = fft_length // 2 + 1
63
+
64
+ pre_emphasis = samples[:, 1:] - 0.97 * samples[:, :-1]
65
+
66
+ # Original implementation from python_speech_features
67
+ # frames = tf.expand_dims(sigproc.framesig(preemphasis[0], frame_length,
68
+ # frame_step, winfunc=lambda x: np.ones((x,))), 0)
69
+ # powspec = sigproc.powspec(frames, fft_length)
70
+
71
+ # Tensorflow impl #1, using manually-split frames and rfft
72
+ # spec = tf.abs(tf.signal.rfft(frames, [fft_length]))
73
+ # powspec = tf.square(spec) / fft_length
74
+
75
+ # Tensorflow impl #2, using stft to handle framing automatically
76
+ # (There is a one-off mismatch on the number of frames on the resulting tensor, but I guess this is ok)
77
+ spec = tf.abs(tf.signal.stft(pre_emphasis, frame_length, frame_step, fft_length, window_fn=tf.ones))
78
+ powspec = tf.square(spec) / fft_length
79
+
80
+ # Matrix to transform spectrum to mel-frequencies
81
+
82
+ # Original implementation from python_speech_features
83
+ # linear_to_mel_weight_matrix = get_filterbanks(NUM_FBANKS, fft_length,
84
+ # SAMPLE_RATE, 0, SAMPLE_RATE/2).astype(np.float32).T
85
+
86
+ linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
87
+ num_mel_bins=NUM_FBANKS,
88
+ num_spectrogram_bins=fft_bins,
89
+ sample_rate=SAMPLE_RATE,
90
+ lower_edge_hertz=0,
91
+ upper_edge_hertz=SAMPLE_RATE / 2,
92
+ )
93
+
94
+ feat = tf.matmul(powspec, linear_to_mel_weight_matrix)
95
+ # feat = tf.where(feat == 0, np.finfo(np.float32).eps, feat)
96
+ return feat
97
+
98
+
99
+ class DeepSpeakerModel:
100
+
101
+ # I thought it was 3 but maybe energy is added at a 4th dimension.
102
+ # would be better to have 4 dimensions:
103
+ # MFCC, DIFF(MFCC), DIFF(DIFF(MFCC)), ENERGIES (probably tiled across the frequency domain).
104
+ # this seems to help match the parameter counts.
105
+ def __init__(
106
+ self,
107
+ batch_input_shape=(None, NUM_FRAMES, NUM_FBANKS, 1),
108
+ include_softmax=False,
109
+ num_speakers_softmax=None,
110
+ pcm_input=False
111
+ ):
112
+ if pcm_input:
113
+ batch_input_shape = None
114
+ self.include_softmax = include_softmax
115
+ if self.include_softmax:
116
+ assert num_speakers_softmax > 0
117
+ self.clipped_relu_count = 0
118
+
119
+ # http://cs231n.github.io/convolutional-networks/
120
+ # conv weights
121
+ # #params = ks * ks * nb_filters * num_channels_input
122
+
123
+ # Conv128-s
124
+ # 5*5*128*128/2+128
125
+ # ks*ks*nb_filters*channels/strides+bias(=nb_filters)
126
+
127
+ # take 100 ms -> 4 frames.
128
+ # if signal is 3 seconds, then take 100ms per 100ms and average out this network.
129
+ # 8*8 = 64 features.
130
+
131
+ # used to share all the layers across the inputs
132
+
133
+ # num_frames = K.shape() - do it dynamically after.
134
+
135
+ if pcm_input:
136
+ batch_input_shape = batch_input_shape or (None, None) # Batch-size, num-samples
137
+ inputs = Input(batch_shape=batch_input_shape, name='raw_inputs')
138
+ x = inputs
139
+ x = Lambda(tf_fbank)(x)
140
+ x = Lambda(lambda x_: tf_normalize(x_, 1, 1e-12))(x)
141
+ x = Lambda(lambda x_: tf.expand_dims(x_, axis=-1))(x)
142
+ else:
143
+ batch_input_shape = batch_input_shape or (None, None, NUM_FBANKS, 1)
144
+ inputs = Input(batch_shape=batch_input_shape, name='input')
145
+ x = inputs
146
+
147
+ x = self.cnn_component(x)
148
+
149
+ x = Reshape((-1, 2048))(x)
150
+ # Temporal average layer. axis=1 is time.
151
+ x = Lambda(lambda y: K.mean(y, axis=1), name='average')(x)
152
+ if include_softmax:
153
+ logger.info('Including a Dropout layer to reduce overfitting.')
154
+ # used for softmax because the dataset we pre-train on might be too small. easy to overfit.
155
+ x = Dropout(0.5)(x)
156
+ x = Dense(512, name='affine')(x)
157
+ if include_softmax:
158
+ # Those weights are just when we train on softmax.
159
+ x = Dense(num_speakers_softmax, activation='softmax')(x)
160
+ else:
161
+ # Does not contain any weights.
162
+ x = Lambda(lambda y: K.l2_normalize(y, axis=1), name='ln')(x)
163
+ self.m = Model(inputs, x, name='ResCNN')
164
+
165
+ def keras_model(self):
166
+ return self.m
167
+
168
+ def get_weights(self):
169
+ w = self.m.get_weights()
170
+ if self.include_softmax:
171
+ w.pop() # last 2 are the W_softmax and b_softmax.
172
+ w.pop()
173
+ return w
174
+
175
+ def clipped_relu(self, inputs):
176
+ relu = Lambda(lambda y: K.minimum(K.maximum(y, 0), 20), name=f'clipped_relu_{self.clipped_relu_count}')(inputs)
177
+ self.clipped_relu_count += 1
178
+ return relu
179
+
180
+ def identity_block(self, input_tensor, kernel_size, filters, stage, block):
181
+ conv_name_base = f'res{stage}_{block}_branch'
182
+
183
+ x = Conv2D(filters,
184
+ kernel_size=kernel_size,
185
+ strides=1,
186
+ activation=None,
187
+ padding='same',
188
+ kernel_initializer='glorot_uniform',
189
+ kernel_regularizer=regularizers.l2(l=0.0001),
190
+ name=conv_name_base + '_2a')(input_tensor)
191
+ x = BatchNormalization(name=conv_name_base + '_2a_bn')(x)
192
+ x = self.clipped_relu(x)
193
+
194
+ x = Conv2D(
195
+ filters,
196
+ kernel_size=kernel_size,
197
+ strides=1,
198
+ activation=None,
199
+ padding='same',
200
+ kernel_initializer='glorot_uniform',
201
+ kernel_regularizer=regularizers.l2(l=0.0001),
202
+ name=conv_name_base + '_2b',
203
+ )(x)
204
+ x = BatchNormalization(name=conv_name_base + '_2b_bn')(x)
205
+
206
+ x = self.clipped_relu(x)
207
+
208
+ x = layers.add([x, input_tensor])
209
+ x = self.clipped_relu(x)
210
+ return x
211
+
212
+ def conv_and_res_block(self, inp, filters, stage):
213
+ conv_name = 'conv{}-s'.format(filters)
214
+ # TODO: why kernel_regularizer?
215
+ o = Conv2D(filters,
216
+ kernel_size=5,
217
+ strides=2,
218
+ activation=None,
219
+ padding='same',
220
+ kernel_initializer='glorot_uniform',
221
+ kernel_regularizer=regularizers.l2(l=0.0001), name=conv_name)(inp)
222
+ o = BatchNormalization(name=conv_name + '_bn')(o)
223
+ o = self.clipped_relu(o)
224
+ for i in range(3):
225
+ o = self.identity_block(o, kernel_size=3, filters=filters, stage=stage, block=i)
226
+ return o
227
+
228
+ def cnn_component(self, inp):
229
+ x = self.conv_and_res_block(inp, 64, stage=1)
230
+ x = self.conv_and_res_block(x, 128, stage=2)
231
+ x = self.conv_and_res_block(x, 256, stage=3)
232
+ x = self.conv_and_res_block(x, 512, stage=4)
233
+ return x
234
+
235
+ def set_weights(self, w):
236
+ for layer, layer_w in zip(self.m.layers, w):
237
+ layer.set_weights(layer_w)
238
+ logger.info(f'Setting weights for [{layer.name}]...')
239
+
240
+
241
+ def main():
242
+ # Looks correct to me.
243
+ # I have 37K but paper reports 41K. which is not too far.
244
+ dsm = DeepSpeakerModel()
245
+ dsm.m.summary()
246
+
247
+ # I suspect num frames to be 32.
248
+ # Then fbank=64, then total would be 32*64 = 2048.
249
+ # plot_model(dsm.m, to_file='model.png', dpi=300, show_shapes=True, expand_nested=True)
250
+
251
+
252
+ def _train():
253
+ # x = np.random.uniform(size=(6, 32, 64, 4)) # 6 is multiple of 3.
254
+ # y_softmax = np.random.uniform(size=(6, 100))
255
+ # dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=True, num_speakers_softmax=100)
256
+ # dsm.m.compile(optimizer=Adam(lr=0.01), loss='categorical_crossentropy')
257
+ # print(dsm.m.predict(x).shape)
258
+ # print(dsm.m.evaluate(x, y_softmax))
259
+ # w = dsm.get_weights()
260
+ dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=False)
261
+ # dsm.m.set_weights(w)
262
+ dsm.m.compile(optimizer=Adam(lr=0.01), loss=deep_speaker_loss)
263
+
264
+ # it works!!!!!!!!!!!!!!!!!!!!
265
+ # unit_batch_size = 20
266
+ # anchor = np.ones(shape=(unit_batch_size, 32, 64, 4))
267
+ # positive = np.array(anchor)
268
+ # negative = np.ones(shape=(unit_batch_size, 32, 64, 4)) * (-1)
269
+ # batch = np.vstack((anchor, positive, negative))
270
+ # x = batch
271
+ # y = np.zeros(shape=(len(batch), 512)) # not important.
272
+ # print('Starting to fit...')
273
+ # while True:
274
+ # print(dsm.m.train_on_batch(x, y))
275
+
276
+ # should not work... and it does not work!
277
+ unit_batch_size = 20
278
+ negative = np.ones(shape=(unit_batch_size, 32, 64, 4)) * (-1)
279
+ batch = np.vstack((negative, negative, negative))
280
+ x = batch
281
+ y = np.zeros(shape=(len(batch), 512)) # not important.
282
+ print('Starting to fit...')
283
+ while True:
284
+ print(dsm.m.train_on_batch(x, y))
285
+
286
+
287
+ def _test_checkpoint_compatibility():
288
+ dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=True, num_speakers_softmax=10)
289
+ dsm.m.save_weights('test.h5')
290
+ dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=False)
291
+ dsm.m.load_weights('test.h5', by_name=True)
292
+ os.remove('test.h5')
293
+
294
+
295
+ if __name__ == '__main__':
296
+ _test_checkpoint_compatibility()
speaker_recognition/eval_metrics.py ADDED
@@ -0,0 +1,84 @@
import numpy as np


def evaluate(sims, labels):
    # Calculate evaluation metrics
    thresholds = np.arange(0, 1.0, 0.001)
    fm, tpr, acc = calculate_roc(thresholds, sims, labels)
    eer = calculate_eer(thresholds, sims, labels)
    return fm, tpr, acc, eer


def calculate_roc(thresholds, sims, labels):
    nrof_thresholds = len(thresholds)

    tprs = np.zeros((nrof_thresholds))
    fprs = np.zeros((nrof_thresholds))
    acc_train = np.zeros((nrof_thresholds))
    precisions = np.zeros((nrof_thresholds))
    fms = np.zeros((nrof_thresholds))

    # Find the best threshold for the fold
    for threshold_idx, threshold in enumerate(thresholds):
        tprs[threshold_idx], fprs[threshold_idx], precisions[threshold_idx], fms[threshold_idx], acc_train[
            threshold_idx] = calculate_accuracy(threshold, sims, labels)

    bestindex = np.argmax(fms)
    bestfm = fms[bestindex]
    besttpr = tprs[bestindex]
    bestacc = acc_train[bestindex]

    return bestfm, besttpr, bestacc


def calculate_accuracy(threshold, sims, actual_issame):
    predict_issame = np.greater(sims, threshold)
    tp = np.sum(np.logical_and(predict_issame, actual_issame))
    fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
    tn = np.sum(np.logical_and(np.logical_not(predict_issame), np.logical_not(actual_issame)))
    fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame))

    tpr = 0 if (tp + fn == 0) else float(tp) / float(tp + fn)  # recall
    fpr = 0 if (fp + tn == 0) else float(fp) / float(fp + tn)
    precision = 0 if (tp + fp == 0) else float(tp) / float(tp + fp)
    fm = 2 * precision * tpr / (precision + tpr + 1e-12)
    acc = float(tp + tn) / (sims.size + 1e-12)
    return tpr, fpr, precision, fm, acc


def calculate_eer(thresholds, sims, labels):
    nrof_thresholds = len(thresholds)

    # Find the threshold where FRR and FAR are closest (equal error rate).
    far_train = np.zeros(nrof_thresholds)
    frr_train = np.zeros(nrof_thresholds)
    eer_index = 0
    eer_diff = 100000000
    for threshold_idx, threshold in enumerate(thresholds):
        frr_train[threshold_idx], far_train[threshold_idx] = calculate_val_far(threshold, sims, labels)
        if abs(frr_train[threshold_idx] - far_train[threshold_idx]) < eer_diff:
            eer_diff = abs(frr_train[threshold_idx] - far_train[threshold_idx])
            eer_index = threshold_idx

    frr, far = frr_train[eer_index], far_train[eer_index]

    eer = (frr + far) / 2

    return eer


def calculate_val_far(threshold, sims, actual_issame):
    predict_issame = np.greater(sims, threshold)
    true_accept = np.sum(np.logical_and(predict_issame, actual_issame))
    false_accept = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
    n_same = np.sum(actual_issame)
    n_diff = np.sum(np.logical_not(actual_issame))
    if n_diff == 0:
        n_diff = 1
    if n_same == 0:
        return 0, 0
    val = float(true_accept) / float(n_same)
    frr = 1 - val
    far = float(false_accept) / float(n_diff)
    return frr, far
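A toy sanity check of these metrics (the scores and labels below are hypothetical, not from the repo):

    import numpy as np

    sims = np.array([0.9, 0.8, 0.3, 0.6, 0.1, 0.2])  # cosine similarities of trial pairs
    labels = np.array([1, 1, 0, 1, 0, 0])            # 1 = same speaker, 0 = different speaker
    fm, tpr, acc, eer = evaluate(sims, labels)
    print(f'F-measure={fm:.2f} TPR={tpr:.2f} ACC={acc:.2f} EER={eer:.2f}')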
speaker_recognition/test.py ADDED
@@ -0,0 +1,69 @@
import logging

import numpy as np
from tqdm import tqdm

from speaker_recognition.audio import Audio
from speaker_recognition.batcher import LazyTripletBatcher
from speaker_recognition.constants import NUM_FBANKS, NUM_FRAMES, CHECKPOINTS_TRIPLET_DIR, BATCH_SIZE
from speaker_recognition.conv_models import DeepSpeakerModel
from speaker_recognition.eval_metrics import evaluate
from speaker_recognition.utils import load_best_checkpoint, enable_deterministic

logger = logging.getLogger(__name__)


def batch_cosine_similarity(x1, x2):
    # https://en.wikipedia.org/wiki/Cosine_similarity
    # 1 = equal direction ; -1 = opposite direction
    mul = np.multiply(x1, x2)
    s = np.sum(mul, axis=1)

    # l1 = np.sum(np.multiply(x1, x1), axis=1)
    # l2 = np.sum(np.multiply(x2, x2), axis=1)
    # as the embeddings have length 1, we don't need to divide by the norms (they are 1).
    return s


def eval_model(working_dir: str, model: DeepSpeakerModel):
    enable_deterministic()
    audio = Audio(working_dir)
    batcher = LazyTripletBatcher(working_dir, NUM_FRAMES, model)
    speakers_list = list(audio.speakers_to_utterances.keys())
    num_negative_speakers = 99
    num_speakers = len(speakers_list)
    y_pred = np.zeros(shape=(num_speakers, num_negative_speakers + 1))  # negatives + positive
    for i, positive_speaker in tqdm(enumerate(speakers_list), desc='test', total=num_speakers):
        # convention: id[0] is the anchor speaker, id[1] is positive, id[2:] are negative.
        input_data = batcher.get_speaker_verification_data(positive_speaker, num_negative_speakers)
        # batch size is not relevant. just making sure we don't push too much on the GPU.
        predictions = model.m.predict(input_data, batch_size=BATCH_SIZE)
        anchor_embedding = predictions[0]
        for j, other_than_anchor_embedding in enumerate(predictions[1:]):  # positive + negatives
            y_pred[i][j] = batch_cosine_similarity([anchor_embedding], [other_than_anchor_embedding])[0]
        # y_pred[i] = softmax(y_pred[i])
        # could apply softmax here.
    y_true = np.zeros_like(y_pred)  # positive is at index 0.
    y_true[:, 0] = 1.0
    print(np.matrix(y_true))
    print(np.matrix(y_pred))
    print(np.min(y_pred), np.max(y_pred))
    fm, tpr, acc, eer = evaluate(y_pred, y_true)
    return fm, tpr, acc, eer


def test(working_dir, checkpoint_file=None):
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    dsm = DeepSpeakerModel(batch_input_shape)
    if checkpoint_file is None:
        checkpoint_file = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR)
    if checkpoint_file is not None:
        logger.info(f'Found checkpoint [{checkpoint_file}]. Loading weights...')
        dsm.m.load_weights(checkpoint_file, by_name=True)
    else:
        logger.info(f'Could not find any checkpoint in {CHECKPOINTS_TRIPLET_DIR}.')
        exit(1)

    fm, tpr, acc, eer = eval_model(working_dir, model=dsm)
    logger.info(f'f-measure = {fm:.3f}, true positive rate = {tpr:.3f}, '
                f'accuracy = {acc:.3f}, equal error rate = {eer:.3f}')
speaker_recognition/train.py ADDED
@@ -0,0 +1,111 @@
import logging
import os

# pylint: disable=E0611,E0401
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
# pylint: disable=E0611,E0401
from tensorflow.keras.optimizers import SGD
from tqdm import tqdm

from speaker_recognition.batcher import KerasFormatConverter, LazyTripletBatcher
from speaker_recognition.constants import BATCH_SIZE, CHECKPOINTS_SOFTMAX_DIR, CHECKPOINTS_TRIPLET_DIR, NUM_FRAMES, NUM_FBANKS
from speaker_recognition.conv_models import DeepSpeakerModel
from speaker_recognition.triplet_loss import deep_speaker_loss
from speaker_recognition.utils import load_best_checkpoint, ensures_dir

logger = logging.getLogger(__name__)

# Otherwise it's just too much logging from Tensorflow...
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


def fit_model(dsm: DeepSpeakerModel, working_dir: str, max_length: int = NUM_FRAMES, batch_size=BATCH_SIZE):
    batcher = LazyTripletBatcher(working_dir, max_length, dsm)

    # build a small test set.
    test_batches = []
    for _ in tqdm(range(200), desc='Build test set'):
        test_batches.append(batcher.get_batch_test(batch_size))

    def test_generator():
        while True:
            for bb in test_batches:
                yield bb

    def train_generator():
        while True:
            yield batcher.get_random_batch(batch_size, is_test=False)

    checkpoint_name = dsm.m.name + '_checkpoint'
    checkpoint_filename = os.path.join(CHECKPOINTS_TRIPLET_DIR, checkpoint_name + '_{epoch}.h5')
    checkpoint = ModelCheckpoint(monitor='val_loss', filepath=checkpoint_filename, save_best_only=True)
    dsm.m.fit(x=train_generator(), y=None, steps_per_epoch=2000, shuffle=False,
              epochs=1000, validation_data=test_generator(), validation_steps=len(test_batches),
              callbacks=[checkpoint])


def fit_model_softmax(dsm: DeepSpeakerModel, kx_train, ky_train, kx_test, ky_test,
                      batch_size=BATCH_SIZE, max_epochs=1000, initial_epoch=0):
    checkpoint_name = dsm.m.name + '_checkpoint'
    checkpoint_filename = os.path.join(CHECKPOINTS_SOFTMAX_DIR, checkpoint_name + '_{epoch}.h5')
    checkpoint = ModelCheckpoint(monitor='val_accuracy', filepath=checkpoint_filename, save_best_only=True)

    # if the accuracy does not increase by 0.1% over 20 epochs, we stop the training.
    early_stopping = EarlyStopping(monitor='val_accuracy', min_delta=0.001, patience=20, verbose=1, mode='max')

    # if the accuracy does not increase over 10 epochs, we reduce the learning rate by half.
    reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=10, min_lr=0.0001, verbose=1)

    max_len_train = len(kx_train) - len(kx_train) % batch_size
    kx_train = kx_train[0:max_len_train]
    ky_train = ky_train[0:max_len_train]
    max_len_test = len(kx_test) - len(kx_test) % batch_size
    kx_test = kx_test[0:max_len_test]
    ky_test = ky_test[0:max_len_test]

    dsm.m.fit(x=kx_train,
              y=ky_train,
              batch_size=batch_size,
              epochs=initial_epoch + max_epochs,
              initial_epoch=initial_epoch,
              verbose=1,
              shuffle=True,
              validation_data=(kx_test, ky_test),
              callbacks=[early_stopping, reduce_lr, checkpoint])


def start_training(working_dir, pre_training_phase=True):
    ensures_dir(CHECKPOINTS_SOFTMAX_DIR)
    ensures_dir(CHECKPOINTS_TRIPLET_DIR)
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    if pre_training_phase:
        logger.info('Softmax pre-training.')
        kc = KerasFormatConverter(working_dir)
        num_speakers_softmax = len(kc.categorical_speakers.speaker_ids)
        dsm = DeepSpeakerModel(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax)
        dsm.m.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
        if pre_training_checkpoint is not None:
            initial_epoch = int(pre_training_checkpoint.split('/')[-1].split('.')[0].split('_')[-1])
            logger.info(f'Initial epoch is {initial_epoch}.')
            logger.info(f'Loading softmax checkpoint: {pre_training_checkpoint}.')
            dsm.m.load_weights(pre_training_checkpoint)  # latest one.
        else:
            initial_epoch = 0
        fit_model_softmax(dsm, kc.kx_train, kc.ky_train, kc.kx_test, kc.ky_test, initial_epoch=initial_epoch)
    else:
        logger.info('Training with the triplet loss.')
        dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False)
        triplet_checkpoint = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR)
        pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
        if triplet_checkpoint is not None:
            logger.info(f'Loading triplet checkpoint: {triplet_checkpoint}.')
            dsm.m.load_weights(triplet_checkpoint)
        elif pre_training_checkpoint is not None:
            logger.info(f'Loading pre-training checkpoint: {pre_training_checkpoint}.')
            # If `by_name` is True, weights are loaded into layers only if they share the
            # same name. This is useful for fine-tuning or transfer-learning models where
            # some of the layers have changed.
            dsm.m.load_weights(pre_training_checkpoint, by_name=True)
        dsm.m.compile(optimizer=SGD(), loss=deep_speaker_loss)
        fit_model(dsm, working_dir, NUM_FRAMES)
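For reference, a minimal driver sketch for the two phases above (the `__main__` wrapper and the working-directory path are illustrative; the directory is assumed to already contain the cached fbank `.npy` files and Keras-format inputs):

    import logging

    if __name__ == '__main__':
        logging.basicConfig(level=logging.INFO)
        # Phase 1: softmax pre-training, then phase 2: triplet fine-tuning.
        start_training('/path/to/working_dir', pre_training_phase=True)
        start_training('/path/to/working_dir', pre_training_phase=False)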
speaker_recognition/triplet_loss.py ADDED
@@ -0,0 +1,63 @@
# pylint: disable=E0611,E0401
import tensorflow.keras.backend as K

# ALPHA = 0.2  # used in FaceNet https://arxiv.org/pdf/1503.03832.pdf
ALPHA = 0.1  # used in Deep Speaker.


def batch_cosine_similarity(x1, x2):
    # https://en.wikipedia.org/wiki/Cosine_similarity
    # 1 = equal direction ; -1 = opposite direction
    dot = K.squeeze(K.batch_dot(x1, x2, axes=1), axis=1)
    # as the embeddings have length 1, we don't need to divide by the norms (they are 1).
    return dot


def deep_speaker_loss(y_true, y_pred, alpha=ALPHA):
    # y_true is not used. we respect this convention:
    # y_true.shape = (batch_size, embedding_size) [not used]
    # y_pred.shape = (batch_size, embedding_size)
    # EXAMPLE:
    # _____________________________________________________
    # ANCHOR 1 (512,)
    # ANCHOR 2 (512,)
    # POS EX 1 (512,)
    # POS EX 2 (512,)
    # NEG EX 1 (512,)
    # NEG EX 2 (512,)
    # _____________________________________________________
    split = K.shape(y_pred)[0] // 3

    anchor = y_pred[0:split]
    positive_ex = y_pred[split:2 * split]
    negative_ex = y_pred[2 * split:]

    # If the loss does not decrease below ALPHA then the model does not learn anything.
    # If anchor = positive = negative (the model always outputs the same vector),
    # then sap = san = 1 and loss = max(alpha, 0) = alpha.
    # On the contrary, if anchor = positive = [1] and negative = [-1],
    # then sap = 1 and san = -1, so loss = max(-1 - 1 + 0.1, 0) = max(-1.9, 0) = 0.
    sap = batch_cosine_similarity(anchor, positive_ex)
    san = batch_cosine_similarity(anchor, negative_ex)
    loss = K.maximum(san - sap + alpha, 0.0)
    total_loss = K.mean(loss)
    return total_loss


if __name__ == '__main__':
    import numpy as np

    print(deep_speaker_loss(alpha=0.1, y_true=0, y_pred=np.array([[0.9], [1.0], [-1.0]])))
    print(deep_speaker_loss(alpha=1, y_true=0, y_pred=np.array([[0.9], [1.0], [-1.0]])))
    print(deep_speaker_loss(alpha=2, y_true=0, y_pred=np.array([[0.9], [1.0], [-1.0]])))
    print('--------------')
    print(deep_speaker_loss(alpha=2, y_true=0, y_pred=np.array([[0.6], [1.0], [0.0]])))
    print(deep_speaker_loss(alpha=1, y_true=0, y_pred=np.array([[0.6], [1.0], [0.0]])))
    print(deep_speaker_loss(alpha=0.1, y_true=0, y_pred=np.array([[0.6], [1.0], [0.0]])))
    print(deep_speaker_loss(alpha=0.2, y_true=0, y_pred=np.array([[0.6], [1.0], [0.0]])))

    print('--------------')
    print(deep_speaker_loss(alpha=2, y_true=0, y_pred=np.array([[0.9], [1.0], [-1.0]])))
    print(deep_speaker_loss(alpha=1, y_true=0, y_pred=np.array([[0.9], [1.0], [-1.0]])))
    print(deep_speaker_loss(alpha=0.1, y_true=0, y_pred=np.array([[0.9], [1.0], [-1.0]])))
    print(deep_speaker_loss(alpha=0.2, y_true=0, y_pred=np.array([[0.9], [1.0], [-1.0]])))
speaker_recognition/utils.py ADDED
@@ -0,0 +1,120 @@
import logging
import os
import random
import shutil
from glob import glob

import click
import dill
import numpy as np
import pandas as pd
from natsort import natsorted

from speaker_recognition.constants import TRAIN_TEST_RATIO

logger = logging.getLogger(__name__)


def find_files(directory, ext='wav'):
    return sorted(glob(directory + f'/**/*.{ext}', recursive=True))


def init_pandas():
    pd.set_option('display.float_format', lambda x: '%.3f' % x)
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)


def create_new_empty_dir(directory: str):
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)


def ensure_dir_for_filename(filename: str):
    ensures_dir(os.path.dirname(filename))


def ensures_dir(directory: str):
    if len(directory) > 0 and not os.path.exists(directory):
        os.makedirs(directory)


class ClickType:

    @staticmethod
    def input_file(writable=False):
        return click.Path(exists=True, file_okay=True, dir_okay=False,
                          writable=writable, readable=True, resolve_path=True)

    @staticmethod
    def input_dir(writable=False):
        return click.Path(exists=True, file_okay=False, dir_okay=True,
                          writable=writable, readable=True, resolve_path=True)

    @staticmethod
    def output_file():
        return click.Path(exists=False, file_okay=True, dir_okay=False,
                          writable=True, readable=True, resolve_path=True)

    @staticmethod
    def output_dir():
        return click.Path(exists=False, file_okay=False, dir_okay=True,
                          writable=True, readable=True, resolve_path=True)


def parallel_function(f, sequence, num_threads=None):
    from multiprocessing import Pool
    pool = Pool(processes=num_threads)
    result = pool.map(f, sequence)
    cleaned = [x for x in result if x is not None]
    pool.close()
    pool.join()
    return cleaned


def load_best_checkpoint(checkpoint_dir):
    checkpoints = natsorted(glob(os.path.join(checkpoint_dir, '*.h5')))
    if len(checkpoints) != 0:
        return checkpoints[-1]
    return None


def delete_older_checkpoints(checkpoint_dir, max_to_keep=5):
    assert max_to_keep > 0
    checkpoints = natsorted(glob(os.path.join(checkpoint_dir, '*.h5')))
    checkpoints_to_keep = checkpoints[-max_to_keep:]
    for checkpoint in checkpoints:
        if checkpoint not in checkpoints_to_keep:
            os.remove(checkpoint)


def enable_deterministic():
    print('Deterministic mode enabled.')
    np.random.seed(123)
    random.seed(123)


def load_pickle(file):
    if not os.path.exists(file):
        return None
    logger.info(f'Loading PKL file: {file}.')
    with open(file, 'rb') as r:
        return dill.load(r)


def load_npy(file):
    if not os.path.exists(file):
        return None
    logger.info(f'Loading NPY file: {file}.')
    return np.load(file)


def train_test_sp_to_utt(audio, is_test):
    sp_to_utt = {}
    for speaker_id, utterances in audio.speakers_to_utterances.items():
        utterances_files = sorted(utterances.values())
        train_test_sep = int(len(utterances_files) * TRAIN_TEST_RATIO)
        sp_to_utt[speaker_id] = utterances_files[train_test_sep:] if is_test else utterances_files[:train_test_sep]
    return sp_to_utt