TomCallan committed on
Commit aed64b5 · 1 Parent(s): eb11506

Upload 14 files
app.py ADDED
@@ -0,0 +1,14 @@
import gradio as gr

from speaker_recognition import app


def recognition(audio):
    do = app.speaker_recognition()
    return do.run_transform(audio)


demo = gr.Interface(fn=recognition, inputs=["audio"], outputs="text")

demo.launch()
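Note that `recognition()` above builds a fresh `speaker_recognition` object, and therefore reloads the model weights, on every request. A minimal sketch of an equivalent app that loads the model once at startup (same package layout assumed, behaviour otherwise unchanged):

    import gradio as gr

    from speaker_recognition import app

    # Load the model (and download weights.h5 if needed) once, at startup.
    recognizer = app.speaker_recognition()


    def recognition(audio):
        # Return the 512-d embedding produced for the uploaded audio.
        return recognizer.run_transform(audio)


    demo = gr.Interface(fn=recognition, inputs=["audio"], outputs="text")
    demo.launch()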
requirements.txt ADDED
@@ -0,0 +1,2 @@
gdown
numpy
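The two entries above do not cover everything the uploaded modules import. Judging only from the import statements in this commit, a fuller (unpinned, hypothetical) requirements list would look like:

    gradio
    gdown
    numpy
    tensorflow
    librosa
    python_speech_features
    tqdm
    dill
    click
    pandas
    natsort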
speaker_recognition/__init__.py ADDED
File without changes
speaker_recognition/__pycache__/audio.cpython-310.pyc ADDED
Binary file (4.64 kB)
speaker_recognition/app.py ADDED
@@ -0,0 +1,60 @@
import gdown
import random

import numpy as np

from speaker_recognition.audio import read_mfcc
from speaker_recognition.batcher import sample_from_mfcc
from speaker_recognition.constants import SAMPLE_RATE, NUM_FRAMES
from speaker_recognition.conv_models import DeepSpeakerModel
from speaker_recognition.test import batch_cosine_similarity


class speaker_recognition:
    def __init__(self):

        np.random.seed(123)
        random.seed(123)

        self.speakers = {}
        self.weights = ""
        self.by_name = True

        self.SAMPLE_RATE = SAMPLE_RATE
        self.NUM_FRAMES = NUM_FRAMES

        self.spin_up()

    def spin_up(self):
        # Download the pre-trained weights once, then load them into the model.
        if self.weights == "":
            output = "weights.h5"
            gdown.download("https://drive.google.com/uc?id=1F9NvdrarWZNktdX9KlRYWWHDwRkip_aP", output, quiet=False)
            self.weights = "weights.h5"

        self.model = DeepSpeakerModel()
        self.model.m.load_weights(self.weights, by_name=True)

    def create_speaker(self, data, id=""):
        id = id if id != "" else f"{len(self.speakers)}"
        self.speakers[id] = data
        return id

    def check_speakers(self, data, id="", threshold=0.5):
        # Compare the new embedding against every enrolled speaker and keep the best match.
        best_speaker = ""
        best_score = 0
        for speaker in self.speakers:
            score = batch_cosine_similarity(self.speakers[speaker], data)[0]
            if score > threshold and score > best_score:
                best_score = score
                best_speaker = speaker
        if best_score == 0:
            id = self.create_speaker(data, id)
            return f"created new speaker : {id}"

        return (best_speaker, best_score)

    def run_transform(self, audio, pcm=False):
        data = sample_from_mfcc(read_mfcc(audio, self.SAMPLE_RATE), self.NUM_FRAMES)
        data = self.model.m.predict(np.expand_dims(data, axis=0))
        return data
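For reference, a minimal enrolment/verification sketch using the class above (the wav filenames are hypothetical and this flow is not wired into the Gradio app):

    # Embed two clips, enrol the first, then verify the second against the enrolled set.
    rec = speaker_recognition()
    enrol = rec.run_transform("speaker1_sample1.wav")
    rec.create_speaker(enrol, id="alice")
    probe = rec.run_transform("speaker1_sample2.wav")
    print(rec.check_speakers(probe))  # -> ("alice", score) if cosine similarity > 0.5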
speaker_recognition/audio.py ADDED
@@ -0,0 +1,121 @@
import logging
import os
from collections import defaultdict
from pathlib import Path

import librosa
import numpy as np
from python_speech_features import fbank
from tqdm import tqdm

from speaker_recognition.constants import SAMPLE_RATE, NUM_FBANKS
from speaker_recognition.utils import find_files, ensures_dir

logger = logging.getLogger(__name__)


def read_mfcc(input_filename, sample_rate):
    audio = Audio.read(input_filename, sample_rate)
    energy = np.abs(audio)
    silence_threshold = np.percentile(energy, 95)
    offsets = np.where(energy > silence_threshold)[0]
    # left_blank_duration_ms = (1000.0 * offsets[0]) // self.sample_rate  # frame_id to duration (ms)
    # right_blank_duration_ms = (1000.0 * (len(audio) - offsets[-1])) // self.sample_rate
    # TODO: could use trim_silence() here or a better VAD.
    audio_voice_only = audio[offsets[0]:offsets[-1]]
    mfcc = mfcc_fbank(audio_voice_only, sample_rate)
    return mfcc


def extract_speaker_and_utterance_ids(filename: str):  # LIBRI.
    # 'audio/dev-other/116/288045/116-288045-0000.flac'
    speaker, _, basename = Path(filename).parts[-3:]
    utterance = os.path.splitext(basename.split('-', 1)[-1])[0]
    assert basename.split('-')[0] == speaker
    return speaker, utterance


class Audio:

    def __init__(self, cache_dir: str, audio_dir: str = None, sample_rate: int = SAMPLE_RATE, ext='flac'):
        self.ext = ext
        self.cache_dir = os.path.join(cache_dir, 'audio-fbanks')
        ensures_dir(self.cache_dir)
        if audio_dir is not None:
            self.build_cache(os.path.expanduser(audio_dir), sample_rate)
        self.speakers_to_utterances = defaultdict(dict)
        for cache_file in find_files(self.cache_dir, ext='npy'):
            # /path/to/speaker_utterance.npy
            speaker_id, utterance_id = Path(cache_file).stem.split('_')
            self.speakers_to_utterances[speaker_id][utterance_id] = cache_file

    @property
    def speaker_ids(self):
        return sorted(self.speakers_to_utterances)

    @staticmethod
    def trim_silence(audio, threshold):
        """Removes silence at the beginning and end of a sample."""
        # pylint: disable=E1121
        energy = librosa.feature.rms(audio)
        frames = np.nonzero(np.array(energy > threshold))
        indices = librosa.core.frames_to_samples(frames)[1]

        # Note: indices can be an empty array, if the whole audio was silence.
        audio_trim = audio[0:0]
        left_blank = audio[0:0]
        right_blank = audio[0:0]
        if indices.size:
            audio_trim = audio[indices[0]:indices[-1]]
            left_blank = audio[:indices[0]]  # slice before.
            right_blank = audio[indices[-1]:]  # slice after.
        return audio_trim, left_blank, right_blank

    @staticmethod
    def read(filename, sample_rate=SAMPLE_RATE):
        audio, sr = librosa.load(filename, sr=sample_rate, mono=True, dtype=np.float32)
        assert sr == sample_rate
        return audio

    def build_cache(self, audio_dir, sample_rate):
        logger.info(f'audio_dir: {audio_dir}.')
        logger.info(f'sample_rate: {sample_rate:,} hz.')
        audio_files = find_files(audio_dir, ext=self.ext)
        audio_files_count = len(audio_files)
        assert audio_files_count != 0, f'Could not find any {self.ext} files in {audio_dir}.'
        logger.info(f'Found {audio_files_count:,} files in {audio_dir}.')
        with tqdm(audio_files) as bar:
            for audio_filename in bar:
                bar.set_description(audio_filename)
                self.cache_audio_file(audio_filename, sample_rate)

    def cache_audio_file(self, input_filename, sample_rate):
        sp, utt = extract_speaker_and_utterance_ids(input_filename)
        cache_filename = os.path.join(self.cache_dir, f'{sp}_{utt}.npy')
        if not os.path.isfile(cache_filename):
            try:
                mfcc = read_mfcc(input_filename, sample_rate)
                np.save(cache_filename, mfcc)
            except librosa.util.exceptions.ParameterError as e:
                logger.error(e)


def pad_mfcc(mfcc, max_length):  # num_frames, nfilt=64.
    if len(mfcc) < max_length:
        mfcc = np.vstack((mfcc, np.tile(np.zeros(mfcc.shape[1]), (max_length - len(mfcc), 1))))
    return mfcc


def mfcc_fbank(signal: np.array, sample_rate: int):  # 1D signal array.
    # Returns MFCC with shape (num_frames, n_filters, 3).
    filter_banks, energies = fbank(signal, samplerate=sample_rate, nfilt=NUM_FBANKS)
    frames_features = normalize_frames(filter_banks)
    # delta_1 = delta(filter_banks, N=1)
    # delta_2 = delta(delta_1, N=1)
    # frames_features = np.transpose(np.stack([filter_banks, delta_1, delta_2]), (1, 2, 0))
    return np.array(frames_features, dtype=np.float32)  # Float32 precision is enough here.


def normalize_frames(m, epsilon=1e-12):
    return [(v - np.mean(v)) / max(np.std(v), epsilon) for v in m]
speaker_recognition/batcher.py ADDED
@@ -0,0 +1,505 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ from collections import deque, Counter
5
+ from random import choice
6
+ from time import time
7
+
8
+ import dill
9
+ import numpy as np
10
+ from tqdm import tqdm
11
+
12
+ from speaker_recognition.audio import pad_mfcc, Audio
13
+ from speaker_recognition.constants import NUM_FRAMES, NUM_FBANKS
14
+ from speaker_recognition.conv_models import DeepSpeakerModel
15
+ from speaker_recognition.utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ def extract_speaker(utt_file):
21
+ return utt_file.split('/')[-1].split('_')[0]
22
+
23
+
24
+ def sample_from_mfcc(mfcc, max_length):
25
+ if mfcc.shape[0] >= max_length:
26
+ r = choice(range(0, len(mfcc) - max_length + 1))
27
+ s = mfcc[r:r + max_length]
28
+ else:
29
+ s = pad_mfcc(mfcc, max_length)
30
+ return np.expand_dims(s, axis=-1)
31
+
32
+
33
+ def sample_from_mfcc_file(utterance_file, max_length):
34
+ mfcc = np.load(utterance_file)
35
+ return sample_from_mfcc(mfcc, max_length)
36
+
37
+
38
+ class KerasFormatConverter:
39
+
40
+ def __init__(self, working_dir, load_test_only=False):
41
+ self.working_dir = working_dir
42
+ self.output_dir = os.path.join(self.working_dir, 'keras-inputs')
43
+ ensures_dir(self.output_dir)
44
+ self.categorical_speakers = load_pickle(os.path.join(self.output_dir, 'categorical_speakers.pkl'))
45
+ if not load_test_only:
46
+ self.kx_train = load_npy(os.path.join(self.output_dir, 'kx_train.npy'))
47
+ self.ky_train = load_npy(os.path.join(self.output_dir, 'ky_train.npy'))
48
+ self.kx_test = load_npy(os.path.join(self.output_dir, 'kx_test.npy'))
49
+ self.ky_test = load_npy(os.path.join(self.output_dir, 'ky_test.npy'))
50
+ self.audio = Audio(cache_dir=self.working_dir, audio_dir=None)
51
+ if self.categorical_speakers is None:
52
+ self.categorical_speakers = SparseCategoricalSpeakers(self.audio.speaker_ids)
53
+
54
+ def persist_to_disk(self):
55
+ with open(os.path.join(self.output_dir, 'categorical_speakers.pkl'), 'wb') as w:
56
+ dill.dump(self.categorical_speakers, w)
57
+ np.save(os.path.join(self.output_dir, 'kx_train.npy'), self.kx_train)
58
+ np.save(os.path.join(self.output_dir, 'kx_test.npy'), self.kx_test)
59
+ np.save(os.path.join(self.output_dir, 'ky_train.npy'), self.ky_train)
60
+ np.save(os.path.join(self.output_dir, 'ky_test.npy'), self.ky_test)
61
+
62
+ def generate_per_phase(self, max_length=NUM_FRAMES, num_per_speaker=3000, is_test=False):
63
+ # train OR test.
64
+ num_speakers = len(self.audio.speaker_ids)
65
+ sp_to_utt = train_test_sp_to_utt(self.audio, is_test)
66
+
67
+ # 64 fbanks 1 channel(s).
68
+ # float32
69
+ kx = np.zeros((num_speakers * num_per_speaker, max_length, NUM_FBANKS, 1), dtype=np.float32)
70
+ ky = np.zeros((num_speakers * num_per_speaker, 1), dtype=np.float32)
71
+
72
+ desc = f'Converting to Keras format [{"test" if is_test else "train"}]'
73
+ for i, speaker_id in enumerate(tqdm(self.audio.speaker_ids, desc=desc)):
74
+ utterances_files = sp_to_utt[speaker_id]
75
+ for j, utterance_file in enumerate(np.random.choice(utterances_files, size=num_per_speaker, replace=True)):
76
+ self.load_into_mat(utterance_file, self.categorical_speakers, speaker_id, max_length, kx, ky,
77
+ i * num_per_speaker + j)
78
+ return kx, ky
79
+
80
+ def generate(self, max_length=NUM_FRAMES, counts_per_speaker=(3000, 500)):
81
+ kx_train, ky_train = self.generate_per_phase(max_length, counts_per_speaker[0], is_test=False)
82
+ kx_test, ky_test = self.generate_per_phase(max_length, counts_per_speaker[1], is_test=True)
83
+ logger.info(f'kx_train.shape = {kx_train.shape}')
84
+ logger.info(f'ky_train.shape = {ky_train.shape}')
85
+ logger.info(f'kx_test.shape = {kx_test.shape}')
86
+ logger.info(f'ky_test.shape = {ky_test.shape}')
87
+ self.kx_train, self.ky_train, self.kx_test, self.ky_test = kx_train, ky_train, kx_test, ky_test
88
+
89
+ @staticmethod
90
+ def load_into_mat(utterance_file, categorical_speakers, speaker_id, max_length, kx, ky, i):
91
+ kx[i] = sample_from_mfcc_file(utterance_file, max_length)
92
+ ky[i] = categorical_speakers.get_index(speaker_id)
93
+
94
+
95
+ class SparseCategoricalSpeakers:
96
+
97
+ def __init__(self, speakers_list):
98
+ self.speaker_ids = sorted(speakers_list)
99
+ assert len(set(self.speaker_ids)) == len(self.speaker_ids) # all unique.
100
+ self.map = dict(zip(self.speaker_ids, range(len(self.speaker_ids))))
101
+
102
+ def get_index(self, speaker_id):
103
+ return self.map[speaker_id]
104
+
105
+
106
+ class OneHotSpeakers:
107
+
108
+ def __init__(self, speakers_list):
109
+ # pylint: disable=E0611,E0401
110
+ from tensorflow.keras.utils import to_categorical
111
+ self.speaker_ids = sorted(speakers_list)
112
+ self.int_speaker_ids = list(range(len(self.speaker_ids)))
113
+ self.map_speakers_to_index = dict([(k, v) for (k, v) in zip(self.speaker_ids, self.int_speaker_ids)])
114
+ self.map_index_to_speakers = dict([(v, k) for (k, v) in zip(self.speaker_ids, self.int_speaker_ids)])
115
+ self.speaker_categories = to_categorical(self.int_speaker_ids, num_classes=len(self.speaker_ids))
116
+
117
+ def get_speaker_from_index(self, index):
118
+ return self.map_index_to_speakers[index]
119
+
120
+ def get_one_hot(self, speaker_id):
121
+ index = self.map_speakers_to_index[speaker_id]
122
+ return self.speaker_categories[index]
123
+
124
+
125
+ class LazyTripletBatcher:
126
+ def __init__(self, working_dir: str, max_length: int, model: DeepSpeakerModel):
127
+ self.working_dir = working_dir
128
+ self.audio = Audio(cache_dir=working_dir)
129
+ logger.info(f'Picking audio from {working_dir}.')
130
+ self.sp_to_utt_train = train_test_sp_to_utt(self.audio, is_test=False)
131
+ self.sp_to_utt_test = train_test_sp_to_utt(self.audio, is_test=True)
132
+ self.max_length = max_length
133
+ self.model = model
134
+ self.nb_per_speaker = 2
135
+ self.nb_speakers = 640
136
+ self.history_length = 4
137
+ self.history_every = 100 # batches.
138
+ self.total_history_length = self.nb_speakers * self.nb_per_speaker * self.history_length # 25,600
139
+ self.metadata_train_speakers = Counter()
140
+ self.metadata_output_file = os.path.join(self.working_dir, 'debug_batcher.json')
141
+
142
+ self.history_embeddings_train = deque(maxlen=self.total_history_length)
143
+ self.history_utterances_train = deque(maxlen=self.total_history_length)
144
+ self.history_model_inputs_train = deque(maxlen=self.total_history_length)
145
+
146
+ self.history_embeddings = None
147
+ self.history_utterances = None
148
+ self.history_model_inputs = None
149
+
150
+ self.batch_count = 0
151
+ for _ in tqdm(range(self.history_length), desc='Initializing the batcher'): # init history.
152
+ self.update_triplets_history()
153
+
154
+ def update_triplets_history(self):
155
+ model_inputs = []
156
+ speakers = list(self.audio.speakers_to_utterances.keys())
157
+ np.random.shuffle(speakers)
158
+ selected_speakers = speakers[: self.nb_speakers]
159
+ embeddings_utterances = []
160
+ for speaker_id in selected_speakers:
161
+ train_utterances = self.sp_to_utt_train[speaker_id]
162
+ for selected_utterance in np.random.choice(a=train_utterances, size=self.nb_per_speaker, replace=False):
163
+ mfcc = sample_from_mfcc_file(selected_utterance, self.max_length)
164
+ embeddings_utterances.append(selected_utterance)
165
+ model_inputs.append(mfcc)
166
+ embeddings = self.model.m.predict(np.array(model_inputs))
167
+ assert embeddings.shape[-1] == 512
168
+ embeddings = np.reshape(embeddings, (len(selected_speakers), self.nb_per_speaker, 512))
169
+ self.history_embeddings_train.extend(list(embeddings.reshape((-1, 512))))
170
+ self.history_utterances_train.extend(embeddings_utterances)
171
+ self.history_model_inputs_train.extend(model_inputs)
172
+
173
+ # reason: can't index a deque with a np.array.
174
+ self.history_embeddings = np.array(self.history_embeddings_train)
175
+ self.history_utterances = np.array(self.history_utterances_train)
176
+ self.history_model_inputs = np.array(self.history_model_inputs_train)
177
+
178
+ with open(self.metadata_output_file, 'w') as w:
179
+ json.dump(obj=dict(self.metadata_train_speakers), fp=w, indent=2)
180
+
181
+ def get_batch(self, batch_size, is_test=False):
182
+ return self.get_batch_test(batch_size) if is_test else self.get_random_batch(batch_size, is_test=False)
183
+
184
+ def get_batch_test(self, batch_size):
185
+ return self.get_random_batch(batch_size, is_test=True)
186
+
187
+ def get_random_batch(self, batch_size, is_test=False):
188
+ sp_to_utt = self.sp_to_utt_test if is_test else self.sp_to_utt_train
189
+ speakers = list(self.audio.speakers_to_utterances.keys())
190
+ anchor_speakers = np.random.choice(speakers, size=batch_size // 3, replace=False)
191
+
192
+ anchor_utterances = []
193
+ positive_utterances = []
194
+ negative_utterances = []
195
+ for anchor_speaker in anchor_speakers:
196
+ negative_speaker = np.random.choice(list(set(speakers) - {anchor_speaker}), size=1)[0]
197
+ assert negative_speaker != anchor_speaker
198
+ pos_utterances = np.random.choice(sp_to_utt[anchor_speaker], 2, replace=False)
199
+ neg_utterance = np.random.choice(sp_to_utt[negative_speaker], 1, replace=True)[0]
200
+ anchor_utterances.append(pos_utterances[0])
201
+ positive_utterances.append(pos_utterances[1])
202
+ negative_utterances.append(neg_utterance)
203
+
204
+ # anchor and positive should have difference utterances (but same speaker!).
205
+ anc_pos = np.array([positive_utterances, anchor_utterances])
206
+ assert np.all(anc_pos[0, :] != anc_pos[1, :])
207
+ assert np.all(np.array([extract_speaker(s) for s in anc_pos[0, :]]) == np.array(
208
+ [extract_speaker(s) for s in anc_pos[1, :]]))
209
+
210
+ pos_neg = np.array([positive_utterances, negative_utterances])
211
+ assert np.all(pos_neg[0, :] != pos_neg[1, :])
212
+ assert np.all(np.array([extract_speaker(s) for s in pos_neg[0, :]]) != np.array(
213
+ [extract_speaker(s) for s in pos_neg[1, :]]))
214
+
215
+ batch_x = np.vstack([
216
+ [sample_from_mfcc_file(u, self.max_length) for u in anchor_utterances],
217
+ [sample_from_mfcc_file(u, self.max_length) for u in positive_utterances],
218
+ [sample_from_mfcc_file(u, self.max_length) for u in negative_utterances]
219
+ ])
220
+
221
+ batch_y = np.zeros(shape=(len(batch_x), 1)) # dummy. sparse softmax needs something.
222
+ return batch_x, batch_y
223
+
224
+ def get_batch_train(self, batch_size):
225
+ from speaker_recognition.test import batch_cosine_similarity
226
+ # s1 = time()
227
+ self.batch_count += 1
228
+ if self.batch_count % self.history_every == 0:
229
+ self.update_triplets_history()
230
+
231
+ all_indexes = range(len(self.history_embeddings_train))
232
+ anchor_indexes = np.random.choice(a=all_indexes, size=batch_size // 3, replace=False)
233
+
234
+ # s2 = time()
235
+ similar_negative_indexes = []
236
+ dissimilar_positive_indexes = []
237
+ # could be made parallel.
238
+ for anchor_index in anchor_indexes:
239
+ # s21 = time()
240
+ anchor_embedding = self.history_embeddings[anchor_index]
241
+ anchor_speaker = extract_speaker(self.history_utterances[anchor_index])
242
+
243
+ # why self.nb_speakers // 2? just random. because it is fast. otherwise it's too much.
244
+ negative_indexes = [j for (j, a) in enumerate(self.history_utterances)
245
+ if extract_speaker(a) != anchor_speaker]
246
+ negative_indexes = np.random.choice(negative_indexes, size=self.nb_speakers // 2)
247
+
248
+ # s22 = time()
249
+
250
+ anchor_embedding_tile = [anchor_embedding] * len(negative_indexes)
251
+ anchor_cos = batch_cosine_similarity(anchor_embedding_tile, self.history_embeddings[negative_indexes])
252
+
253
+ # s23 = time()
254
+ similar_negative_index = negative_indexes[np.argsort(anchor_cos)[-1]] # [-1:]
255
+ similar_negative_indexes.append(similar_negative_index)
256
+
257
+ # s24 = time()
258
+ positive_indexes = [j for (j, a) in enumerate(self.history_utterances) if
259
+ extract_speaker(a) == anchor_speaker and j != anchor_index]
260
+ # s25 = time()
261
+ anchor_embedding_tile = [anchor_embedding] * len(positive_indexes)
262
+ # s26 = time()
263
+ anchor_cos = batch_cosine_similarity(anchor_embedding_tile, self.history_embeddings[positive_indexes])
264
+ dissimilar_positive_index = positive_indexes[np.argsort(anchor_cos)[0]] # [:1]
265
+ dissimilar_positive_indexes.append(dissimilar_positive_index)
266
+ # s27 = time()
267
+
268
+ # s3 = time()
269
+ batch_x = np.vstack([
270
+ self.history_model_inputs[anchor_indexes],
271
+ self.history_model_inputs[dissimilar_positive_indexes],
272
+ self.history_model_inputs[similar_negative_indexes]
273
+ ])
274
+
275
+ # s4 = time()
276
+
277
+ # for anchor, positive, negative in zip(history_utterances[anchor_indexes],
278
+ # history_utterances[dissimilar_positive_indexes],
279
+ # history_utterances[similar_negative_indexes]):
280
+ # print('anchor', os.path.basename(anchor),
281
+ # 'positive', os.path.basename(positive),
282
+ # 'negative', os.path.basename(negative))
283
+ # print('_' * 80)
284
+
285
+ # assert utterances as well positive != anchor.
286
+ anchor_speakers = [extract_speaker(a) for a in self.history_utterances[anchor_indexes]]
287
+ positive_speakers = [extract_speaker(a) for a in self.history_utterances[dissimilar_positive_indexes]]
288
+ negative_speakers = [extract_speaker(a) for a in self.history_utterances[similar_negative_indexes]]
289
+
290
+ assert len(anchor_indexes) == len(dissimilar_positive_indexes)
291
+ assert len(similar_negative_indexes) == len(dissimilar_positive_indexes)
292
+ assert list(self.history_utterances[dissimilar_positive_indexes]) != list(
293
+ self.history_utterances[anchor_indexes])
294
+ assert anchor_speakers == positive_speakers
295
+ assert negative_speakers != anchor_speakers
296
+
297
+ batch_y = np.zeros(shape=(len(batch_x), 1)) # dummy. sparse softmax needs something.
298
+
299
+ for a in anchor_speakers:
300
+ self.metadata_train_speakers[a] += 1
301
+ for a in positive_speakers:
302
+ self.metadata_train_speakers[a] += 1
303
+ for a in negative_speakers:
304
+ self.metadata_train_speakers[a] += 1
305
+
306
+ # s5 = time()
307
+ # print('1-2', s2 - s1)
308
+ # print('2-3', s3 - s2)
309
+ # print('3-4', s4 - s3)
310
+ # print('4-5', s5 - s4)
311
+ # print('21-22', (s22 - s21) * (batch_size // 3))
312
+ # print('22-23', (s23 - s22) * (batch_size // 3))
313
+ # print('23-24', (s24 - s23) * (batch_size // 3))
314
+ # print('24-25', (s25 - s24) * (batch_size // 3))
315
+ # print('25-26', (s26 - s25) * (batch_size // 3))
316
+ # print('26-27', (s27 - s26) * (batch_size // 3))
317
+
318
+ return batch_x, batch_y
319
+
320
+ def get_speaker_verification_data(self, anchor_speaker, num_different_speakers):
321
+ speakers = list(self.audio.speakers_to_utterances.keys())
322
+ anchor_utterances = []
323
+ positive_utterances = []
324
+ negative_utterances = []
325
+ negative_speakers = np.random.choice(list(set(speakers) - {anchor_speaker}), size=num_different_speakers)
326
+ assert [negative_speaker != anchor_speaker for negative_speaker in negative_speakers]
327
+ pos_utterances = np.random.choice(self.sp_to_utt_test[anchor_speaker], 2, replace=False)
328
+ neg_utterances = [np.random.choice(self.sp_to_utt_test[neg], 1, replace=True)[0] for neg in negative_speakers]
329
+ anchor_utterances.append(pos_utterances[0])
330
+ positive_utterances.append(pos_utterances[1])
331
+ negative_utterances.extend(neg_utterances)
332
+
333
+ # anchor and positive should have difference utterances (but same speaker!).
334
+ anc_pos = np.array([positive_utterances, anchor_utterances])
335
+ assert np.all(anc_pos[0, :] != anc_pos[1, :])
336
+ assert np.all(np.array([extract_speaker(s) for s in anc_pos[0, :]]) == np.array(
337
+ [extract_speaker(s) for s in anc_pos[1, :]]))
338
+
339
+ batch_x = np.vstack([
340
+ [sample_from_mfcc_file(u, self.max_length) for u in anchor_utterances],
341
+ [sample_from_mfcc_file(u, self.max_length) for u in positive_utterances],
342
+ [sample_from_mfcc_file(u, self.max_length) for u in negative_utterances]
343
+ ])
344
+
345
+ batch_y = np.zeros(shape=(len(batch_x), 1)) # dummy. sparse softmax needs something.
346
+ return batch_x, batch_y
347
+
348
+
349
+ class TripletBatcher:
350
+
351
+ def __init__(self, kx_train, ky_train, kx_test, ky_test):
352
+ self.kx_train = kx_train
353
+ self.ky_train = ky_train
354
+ self.kx_test = kx_test
355
+ self.ky_test = ky_test
356
+ speakers_list = sorted(set(ky_train.argmax(axis=1)))
357
+ num_different_speakers = len(speakers_list)
358
+ assert speakers_list == sorted(set(ky_test.argmax(axis=1))) # train speakers = test speakers.
359
+ assert speakers_list == list(range(num_different_speakers))
360
+ self.train_indices_per_speaker = {}
361
+ self.test_indices_per_speaker = {}
362
+
363
+ for speaker_id in speakers_list:
364
+ self.train_indices_per_speaker[speaker_id] = list(np.where(ky_train.argmax(axis=1) == speaker_id)[0])
365
+ self.test_indices_per_speaker[speaker_id] = list(np.where(ky_test.argmax(axis=1) == speaker_id)[0])
366
+
367
+ # check.
368
+ # print(sorted(sum([v for v in self.train_indices_per_speaker.values()], [])))
369
+ # print(range(len(ky_train)))
370
+ assert sorted(sum([v for v in self.train_indices_per_speaker.values()], [])) == sorted(range(len(ky_train)))
371
+ assert sorted(sum([v for v in self.test_indices_per_speaker.values()], [])) == sorted(range(len(ky_test)))
372
+ self.speakers_list = speakers_list
373
+
374
+ def select_speaker_data(self, speaker, n, is_test):
375
+ x = self.kx_test if is_test else self.kx_train
376
+ indices_per_speaker = self.test_indices_per_speaker if is_test else self.train_indices_per_speaker
377
+ indices = np.random.choice(indices_per_speaker[speaker], size=n)
378
+ return x[indices]
379
+
380
+ def get_batch(self, batch_size, is_test=False):
381
+ # y = self.ky_test if is_test else self.ky_train
382
+
383
+ two_different_speakers = np.random.choice(self.speakers_list, size=2, replace=False)
384
+ anchor_positive_speaker = two_different_speakers[0]
385
+ negative_speaker = two_different_speakers[1]
386
+ assert negative_speaker != anchor_positive_speaker
387
+
388
+ batch_x = np.vstack([
389
+ self.select_speaker_data(anchor_positive_speaker, batch_size // 3, is_test),
390
+ self.select_speaker_data(anchor_positive_speaker, batch_size // 3, is_test),
391
+ self.select_speaker_data(negative_speaker, batch_size // 3, is_test)
392
+ ])
393
+
394
+ batch_y = np.zeros(shape=(len(batch_x), len(self.speakers_list)))
395
+ return batch_x, batch_y
396
+
397
+
398
+ class TripletBatcherMiner(TripletBatcher):
399
+
400
+ def __init__(self, kx_train, ky_train, kx_test, ky_test, model: DeepSpeakerModel):
401
+ super().__init__(kx_train, ky_train, kx_test, ky_test)
402
+ self.model = model
403
+ self.num_evaluations_to_find_best_batch = 10
404
+
405
+ def get_batch(self, batch_size, is_test=False):
406
+ if is_test:
407
+ return super().get_batch(batch_size, is_test)
408
+ max_loss = 0
409
+ max_batch = None, None
410
+ for i in range(self.num_evaluations_to_find_best_batch):
411
+ bx, by = super().get_batch(batch_size, is_test=False) # only train here.
412
+ loss = self.model.m.evaluate(bx, by, batch_size=batch_size, verbose=0)
413
+ if loss > max_loss:
414
+ max_loss = loss
415
+ max_batch = bx, by
416
+ return max_batch
417
+
418
+
419
+ class TripletBatcherSelectHardNegatives(TripletBatcher):
420
+
421
+ def __init__(self, kx_train, ky_train, kx_test, ky_test, model: DeepSpeakerModel):
422
+ super().__init__(kx_train, ky_train, kx_test, ky_test)
423
+ self.model = model
424
+
425
+ def get_batch(self, batch_size, is_test=False, predict=None):
426
+ if predict is None:
427
+ predict = self.model.m.predict
428
+ from speaker_recognition.test import batch_cosine_similarity
429
+ num_triplets = batch_size // 3
430
+ inputs = []
431
+ k = 2 # do not change this.
432
+ for speaker in self.speakers_list:
433
+ inputs.append(self.select_speaker_data(speaker, n=k, is_test=is_test))
434
+ inputs = np.array(inputs) # num_speakers * [k, num_frames, num_fbanks, 1].
435
+ embeddings = predict(np.vstack(inputs))
436
+ assert embeddings.shape[-1] == 512
437
+ # (speaker, utterance, 512)
438
+ embeddings = np.reshape(embeddings, (len(self.speakers_list), k, 512))
439
+ cs = batch_cosine_similarity(embeddings[:, 0], embeddings[:, 1])
440
+ arg_sort = np.argsort(cs)
441
+ assert len(arg_sort) > num_triplets
442
+ anchor_speakers = arg_sort[0:num_triplets]
443
+
444
+ anchor_embeddings = embeddings[anchor_speakers, 0]
445
+ negative_speakers = sorted(set(self.speakers_list) - set(anchor_speakers))
446
+ negative_embeddings = embeddings[negative_speakers, 0]
447
+
448
+ selected_negative_speakers = []
449
+ for anchor_embedding in anchor_embeddings:
450
+ cs_negative = [batch_cosine_similarity([anchor_embedding], neg) for neg in negative_embeddings]
451
+ selected_negative_speakers.append(negative_speakers[int(np.argmax(cs_negative))])
452
+
453
+ # anchor with frame 0.
454
+ # positive with frame 1.
455
+ # negative with frame 0.
456
+ assert len(set(selected_negative_speakers).intersection(anchor_speakers)) == 0
457
+ negative = inputs[selected_negative_speakers, 0]
458
+ positive = inputs[anchor_speakers, 1]
459
+ anchor = inputs[anchor_speakers, 0]
460
+ batch_x = np.vstack([anchor, positive, negative])
461
+ batch_y = np.zeros(shape=(len(batch_x), len(self.speakers_list)))
462
+ return batch_x, batch_y
463
+
464
+
465
+ class TripletEvaluator:
466
+
467
+ def __init__(self, kx_test, ky_test):
468
+ self.kx_test = kx_test
469
+ self.ky_test = ky_test
470
+ speakers_list = sorted(set(ky_test.argmax(axis=1)))
471
+ num_different_speakers = len(speakers_list)
472
+ assert speakers_list == list(range(num_different_speakers))
473
+ self.test_indices_per_speaker = {}
474
+ for speaker_id in speakers_list:
475
+ self.test_indices_per_speaker[speaker_id] = list(np.where(ky_test.argmax(axis=1) == speaker_id)[0])
476
+ assert sorted(sum([v for v in self.test_indices_per_speaker.values()], [])) == sorted(range(len(ky_test)))
477
+ self.speakers_list = speakers_list
478
+
479
+ def _select_speaker_data(self, speaker):
480
+ indices = np.random.choice(self.test_indices_per_speaker[speaker], size=1)
481
+ return self.kx_test[indices]
482
+
483
+ def get_speaker_verification_data(self, positive_speaker, num_different_speakers):
484
+ all_negative_speakers = list(set(self.speakers_list) - {positive_speaker})
485
+ assert len(self.speakers_list) - 1 == len(all_negative_speakers)
486
+ negative_speakers = np.random.choice(all_negative_speakers, size=num_different_speakers, replace=False)
487
+ assert positive_speaker not in negative_speakers
488
+ anchor = self._select_speaker_data(positive_speaker)
489
+ positive = self._select_speaker_data(positive_speaker)
490
+ data = [anchor, positive]
491
+ data.extend([self._select_speaker_data(n) for n in negative_speakers])
492
+ return np.vstack(data)
493
+
494
+
495
+ if __name__ == '__main__':
496
+ np.random.seed(123)
497
+ ltb = LazyTripletBatcher(working_dir='/Users/premy/deep-speaker/',
498
+ max_length=NUM_FRAMES,
499
+ model=DeepSpeakerModel())
500
+ for i in range(1000):
501
+ print(i)
502
+ start = time()
503
+ ltb.get_batch_train(batch_size=9)
504
+ print(time() - start)
505
+ # ltb.get_batch(batch_size=96)
speaker_recognition/constants.py ADDED
@@ -0,0 +1,18 @@
# Constants.

SAMPLE_RATE = 16000  # not higher than that, otherwise we may have errors when computing the fbanks.

# Train/Test sets share the same speakers. They contain different utterances.
# 0.8 means 20% of the utterances of each speaker will be held out and placed in the test set.
TRAIN_TEST_RATIO = 0.8

CHECKPOINTS_SOFTMAX_DIR = 'checkpoints-softmax'

CHECKPOINTS_TRIPLET_DIR = 'checkpoints-triplets'

BATCH_SIZE = 32 * 3  # has to be a multiple of 3.

# Input to the model will be a 4D image: (batch_size, num_frames, num_fbanks, 3)
# Where the 3 channels are: FBANK, DIFF(FBANK), DIFF(DIFF(FBANK)).
NUM_FRAMES = 160  # 1 second ~ 100 frames with default params winlen=0.025, winstep=0.01
NUM_FBANKS = 64
speaker_recognition/conv_models.py ADDED
@@ -0,0 +1,296 @@
1
+ import logging
2
+ import os
3
+
4
+ import numpy as np
5
+ import tensorflow as tf
6
+ # pylint: disable=E0611,E0401
7
+ import tensorflow.keras.backend as K
8
+ # pylint: disable=E0611,E0401
9
+ from tensorflow.keras import layers, regularizers
10
+ # pylint: disable=E0611,E0401
11
+ from tensorflow.keras.layers import (
12
+ BatchNormalization,
13
+ Conv2D,
14
+ Dense,
15
+ Dropout,
16
+ Input,
17
+ Lambda,
18
+ Reshape,
19
+ )
20
+ # pylint: disable=E0611,E0401
21
+ from tensorflow.keras.models import Model
22
+ # pylint: disable=E0611,E0401
23
+ from tensorflow.keras.optimizers import Adam
24
+
25
+ from speaker_recognition.constants import NUM_FBANKS, SAMPLE_RATE, NUM_FRAMES
26
+ from speaker_recognition.triplet_loss import deep_speaker_loss
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ @tf.function
32
+ def tf_normalize(data, ndims, eps=0, adjusted=False):
33
+ data = tf.convert_to_tensor(data, name='data')
34
+
35
+ reduce_dims = [-i - 1 for i in range(ndims)]
36
+ # pylint: disable=E1123,E1120
37
+ data = tf.cast(data, dtype=tf.dtypes.float32)
38
+ data_num = tf.reduce_prod(data.shape[-ndims:])
39
+ data_mean = tf.reduce_mean(data, axis=reduce_dims, keepdims=True)
40
+
41
+ # Apply a minimum normalization that protects us against uniform images.
42
+ stddev = tf.math.reduce_std(data, axis=reduce_dims, keepdims=True)
43
+ adjusted_stddev = stddev
44
+ if adjusted:
45
+ min_stddev = tf.math.rsqrt(tf.cast(data_num, tf.dtypes.float32))
46
+ eps = tf.maximum(eps, min_stddev)
47
+ if eps > 0:
48
+ adjusted_stddev = tf.maximum(adjusted_stddev, eps)
49
+
50
+ return (data - data_mean) / adjusted_stddev
51
+
52
+
53
+ @tf.function
54
+ def tf_fbank(samples):
55
+ """
56
+ Compute Mel-filterbank energy features from an audio signal.
57
+ See python_speech_features.fbank
58
+ """
59
+ frame_length = int(0.025 * SAMPLE_RATE)
60
+ frame_step = int(0.01 * SAMPLE_RATE)
61
+ fft_length = 512
62
+ fft_bins = fft_length // 2 + 1
63
+
64
+ pre_emphasis = samples[:, 1:] - 0.97 * samples[:, :-1]
65
+
66
+ # Original implementation from python_speech_features
67
+ # frames = tf.expand_dims(sigproc.framesig(preemphasis[0], frame_length,
68
+ # frame_step, winfunc=lambda x: np.ones((x,))), 0)
69
+ # powspec = sigproc.powspec(frames, fft_length)
70
+
71
+ # Tensorflow impl #1, using manually-split frames and rfft
72
+ # spec = tf.abs(tf.signal.rfft(frames, [fft_length]))
73
+ # powspec = tf.square(spec) / fft_length
74
+
75
+ # Tensorflow impl #2, using stft to handle framing automatically
76
+ # (There is a one-off mismatch on the number of frames on the resulting tensor, but I guess this is ok)
77
+ spec = tf.abs(tf.signal.stft(pre_emphasis, frame_length, frame_step, fft_length, window_fn=tf.ones))
78
+ powspec = tf.square(spec) / fft_length
79
+
80
+ # Matrix to transform spectrum to mel-frequencies
81
+
82
+ # Original implementation from python_speech_features
83
+ # linear_to_mel_weight_matrix = get_filterbanks(NUM_FBANKS, fft_length,
84
+ # SAMPLE_RATE, 0, SAMPLE_RATE/2).astype(np.float32).T
85
+
86
+ linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
87
+ num_mel_bins=NUM_FBANKS,
88
+ num_spectrogram_bins=fft_bins,
89
+ sample_rate=SAMPLE_RATE,
90
+ lower_edge_hertz=0,
91
+ upper_edge_hertz=SAMPLE_RATE / 2,
92
+ )
93
+
94
+ feat = tf.matmul(powspec, linear_to_mel_weight_matrix)
95
+ # feat = tf.where(feat == 0, np.finfo(np.float32).eps, feat)
96
+ return feat
97
+
98
+
99
+ class DeepSpeakerModel:
100
+
101
+ # I thought it was 3 but maybe energy is added at a 4th dimension.
102
+ # would be better to have 4 dimensions:
103
+ # MFCC, DIFF(MFCC), DIFF(DIFF(MFCC)), ENERGIES (probably tiled across the frequency domain).
104
+ # this seems to help match the parameter counts.
105
+ def __init__(
106
+ self,
107
+ batch_input_shape=(None, NUM_FRAMES, NUM_FBANKS, 1),
108
+ include_softmax=False,
109
+ num_speakers_softmax=None,
110
+ pcm_input=False
111
+ ):
112
+ if pcm_input:
113
+ batch_input_shape = None
114
+ self.include_softmax = include_softmax
115
+ if self.include_softmax:
116
+ assert num_speakers_softmax > 0
117
+ self.clipped_relu_count = 0
118
+
119
+ # http://cs231n.github.io/convolutional-networks/
120
+ # conv weights
121
+ # #params = ks * ks * nb_filters * num_channels_input
122
+
123
+ # Conv128-s
124
+ # 5*5*128*128/2+128
125
+ # ks*ks*nb_filters*channels/strides+bias(=nb_filters)
126
+
127
+ # take 100 ms -> 4 frames.
128
+ # if signal is 3 seconds, then take 100ms per 100ms and average out this network.
129
+ # 8*8 = 64 features.
130
+
131
+ # used to share all the layers across the inputs
132
+
133
+ # num_frames = K.shape() - do it dynamically after.
134
+
135
+ if pcm_input:
136
+ batch_input_shape = batch_input_shape or (None, None) # Batch-size, num-samples
137
+ inputs = Input(batch_shape=batch_input_shape, name='raw_inputs')
138
+ x = inputs
139
+ x = Lambda(tf_fbank)(x)
140
+ x = Lambda(lambda x_: tf_normalize(x_, 1, 1e-12))(x)
141
+ x = Lambda(lambda x_: tf.expand_dims(x_, axis=-1))(x)
142
+ else:
143
+ batch_input_shape = batch_input_shape or (None, None, NUM_FBANKS, 1)
144
+ inputs = Input(batch_shape=batch_input_shape, name='input')
145
+ x = inputs
146
+
147
+ x = self.cnn_component(x)
148
+
149
+ x = Reshape((-1, 2048))(x)
150
+ # Temporal average layer. axis=1 is time.
151
+ x = Lambda(lambda y: K.mean(y, axis=1), name='average')(x)
152
+ if include_softmax:
153
+ logger.info('Including a Dropout layer to reduce overfitting.')
154
+ # used for softmax because the dataset we pre-train on might be too small. easy to overfit.
155
+ x = Dropout(0.5)(x)
156
+ x = Dense(512, name='affine')(x)
157
+ if include_softmax:
158
+ # Those weights are just when we train on softmax.
159
+ x = Dense(num_speakers_softmax, activation='softmax')(x)
160
+ else:
161
+ # Does not contain any weights.
162
+ x = Lambda(lambda y: K.l2_normalize(y, axis=1), name='ln')(x)
163
+ self.m = Model(inputs, x, name='ResCNN')
164
+
165
+ def keras_model(self):
166
+ return self.m
167
+
168
+ def get_weights(self):
169
+ w = self.m.get_weights()
170
+ if self.include_softmax:
171
+ w.pop() # last 2 are the W_softmax and b_softmax.
172
+ w.pop()
173
+ return w
174
+
175
+ def clipped_relu(self, inputs):
176
+ relu = Lambda(lambda y: K.minimum(K.maximum(y, 0), 20), name=f'clipped_relu_{self.clipped_relu_count}')(inputs)
177
+ self.clipped_relu_count += 1
178
+ return relu
179
+
180
+ def identity_block(self, input_tensor, kernel_size, filters, stage, block):
181
+ conv_name_base = f'res{stage}_{block}_branch'
182
+
183
+ x = Conv2D(filters,
184
+ kernel_size=kernel_size,
185
+ strides=1,
186
+ activation=None,
187
+ padding='same',
188
+ kernel_initializer='glorot_uniform',
189
+ kernel_regularizer=regularizers.l2(l=0.0001),
190
+ name=conv_name_base + '_2a')(input_tensor)
191
+ x = BatchNormalization(name=conv_name_base + '_2a_bn')(x)
192
+ x = self.clipped_relu(x)
193
+
194
+ x = Conv2D(
195
+ filters,
196
+ kernel_size=kernel_size,
197
+ strides=1,
198
+ activation=None,
199
+ padding='same',
200
+ kernel_initializer='glorot_uniform',
201
+ kernel_regularizer=regularizers.l2(l=0.0001),
202
+ name=conv_name_base + '_2b',
203
+ )(x)
204
+ x = BatchNormalization(name=conv_name_base + '_2b_bn')(x)
205
+
206
+ x = self.clipped_relu(x)
207
+
208
+ x = layers.add([x, input_tensor])
209
+ x = self.clipped_relu(x)
210
+ return x
211
+
212
+ def conv_and_res_block(self, inp, filters, stage):
213
+ conv_name = 'conv{}-s'.format(filters)
214
+ # TODO: why kernel_regularizer?
215
+ o = Conv2D(filters,
216
+ kernel_size=5,
217
+ strides=2,
218
+ activation=None,
219
+ padding='same',
220
+ kernel_initializer='glorot_uniform',
221
+ kernel_regularizer=regularizers.l2(l=0.0001), name=conv_name)(inp)
222
+ o = BatchNormalization(name=conv_name + '_bn')(o)
223
+ o = self.clipped_relu(o)
224
+ for i in range(3):
225
+ o = self.identity_block(o, kernel_size=3, filters=filters, stage=stage, block=i)
226
+ return o
227
+
228
+ def cnn_component(self, inp):
229
+ x = self.conv_and_res_block(inp, 64, stage=1)
230
+ x = self.conv_and_res_block(x, 128, stage=2)
231
+ x = self.conv_and_res_block(x, 256, stage=3)
232
+ x = self.conv_and_res_block(x, 512, stage=4)
233
+ return x
234
+
235
+ def set_weights(self, w):
236
+ for layer, layer_w in zip(self.m.layers, w):
237
+ layer.set_weights(layer_w)
238
+ logger.info(f'Setting weights for [{layer.name}]...')
239
+
240
+
241
+ def main():
242
+ # Looks correct to me.
243
+ # I have 37K but paper reports 41K. which is not too far.
244
+ dsm = DeepSpeakerModel()
245
+ dsm.m.summary()
246
+
247
+ # I suspect num frames to be 32.
248
+ # Then fbank=64, then total would be 32*64 = 2048.
249
+ # plot_model(dsm.m, to_file='model.png', dpi=300, show_shapes=True, expand_nested=True)
250
+
251
+
252
+ def _train():
253
+ # x = np.random.uniform(size=(6, 32, 64, 4)) # 6 is multiple of 3.
254
+ # y_softmax = np.random.uniform(size=(6, 100))
255
+ # dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=True, num_speakers_softmax=100)
256
+ # dsm.m.compile(optimizer=Adam(lr=0.01), loss='categorical_crossentropy')
257
+ # print(dsm.m.predict(x).shape)
258
+ # print(dsm.m.evaluate(x, y_softmax))
259
+ # w = dsm.get_weights()
260
+ dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=False)
261
+ # dsm.m.set_weights(w)
262
+ dsm.m.compile(optimizer=Adam(lr=0.01), loss=deep_speaker_loss)
263
+
264
+ # it works!!!!!!!!!!!!!!!!!!!!
265
+ # unit_batch_size = 20
266
+ # anchor = np.ones(shape=(unit_batch_size, 32, 64, 4))
267
+ # positive = np.array(anchor)
268
+ # negative = np.ones(shape=(unit_batch_size, 32, 64, 4)) * (-1)
269
+ # batch = np.vstack((anchor, positive, negative))
270
+ # x = batch
271
+ # y = np.zeros(shape=(len(batch), 512)) # not important.
272
+ # print('Starting to fit...')
273
+ # while True:
274
+ # print(dsm.m.train_on_batch(x, y))
275
+
276
+ # should not work... and it does not work!
277
+ unit_batch_size = 20
278
+ negative = np.ones(shape=(unit_batch_size, 32, 64, 4)) * (-1)
279
+ batch = np.vstack((negative, negative, negative))
280
+ x = batch
281
+ y = np.zeros(shape=(len(batch), 512)) # not important.
282
+ print('Starting to fit...')
283
+ while True:
284
+ print(dsm.m.train_on_batch(x, y))
285
+
286
+
287
+ def _test_checkpoint_compatibility():
288
+ dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=True, num_speakers_softmax=10)
289
+ dsm.m.save_weights('test.h5')
290
+ dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=False)
291
+ dsm.m.load_weights('test.h5', by_name=True)
292
+ os.remove('test.h5')
293
+
294
+
295
+ if __name__ == '__main__':
296
+ _test_checkpoint_compatibility()
speaker_recognition/eval_metrics.py ADDED
@@ -0,0 +1,84 @@
import numpy as np


def evaluate(sims, labels):
    # Calculate evaluation metrics
    thresholds = np.arange(0, 1.0, 0.001)
    fm, tpr, acc = calculate_roc(thresholds, sims, labels)
    eer = calculate_eer(thresholds, sims, labels)
    return fm, tpr, acc, eer


def calculate_roc(thresholds, sims, labels):
    nrof_thresholds = len(thresholds)

    tprs = np.zeros((nrof_thresholds))
    fprs = np.zeros((nrof_thresholds))
    acc_train = np.zeros((nrof_thresholds))
    precisions = np.zeros((nrof_thresholds))
    fms = np.zeros((nrof_thresholds))

    # Find the best threshold for the fold
    for threshold_idx, threshold in enumerate(thresholds):
        tprs[threshold_idx], fprs[threshold_idx], precisions[threshold_idx], fms[threshold_idx], acc_train[
            threshold_idx] = calculate_accuracy(threshold, sims, labels)

    bestindex = np.argmax(fms)
    bestfm = fms[bestindex]
    besttpr = tprs[bestindex]
    bestacc = acc_train[bestindex]

    return bestfm, besttpr, bestacc


def calculate_accuracy(threshold, sims, actual_issame):
    predict_issame = np.greater(sims, threshold)
    tp = np.sum(np.logical_and(predict_issame, actual_issame))
    fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
    tn = np.sum(np.logical_and(np.logical_not(predict_issame), np.logical_not(actual_issame)))
    fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame))

    tpr = 0 if (tp + fn == 0) else float(tp) / float(tp + fn)  # recall
    fpr = 0 if (fp + tn == 0) else float(fp) / float(fp + tn)
    precision = 0 if (tp + fp == 0) else float(tp) / float(tp + fp)
    fm = 2 * precision * tpr / (precision + tpr + 1e-12)
    acc = float(tp + tn) / (sims.size + 1e-12)
    return tpr, fpr, precision, fm, acc


def calculate_eer(thresholds, sims, labels):
    nrof_thresholds = len(thresholds)

    # Find the threshold where FRR and FAR are closest (equal error rate).
    far_train = np.zeros(nrof_thresholds)
    frr_train = np.zeros(nrof_thresholds)
    eer_index = 0
    eer_diff = 100000000
    for threshold_idx, threshold in enumerate(thresholds):
        frr_train[threshold_idx], far_train[threshold_idx] = calculate_val_far(threshold, sims, labels)
        if abs(frr_train[threshold_idx] - far_train[threshold_idx]) < eer_diff:
            eer_diff = abs(frr_train[threshold_idx] - far_train[threshold_idx])
            eer_index = threshold_idx

    frr, far = frr_train[eer_index], far_train[eer_index]

    eer = (frr + far) / 2

    return eer


def calculate_val_far(threshold, sims, actual_issame):
    predict_issame = np.greater(sims, threshold)
    true_accept = np.sum(np.logical_and(predict_issame, actual_issame))
    false_accept = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
    n_same = np.sum(actual_issame)
    n_diff = np.sum(np.logical_not(actual_issame))
    if n_diff == 0:
        n_diff = 1
    if n_same == 0:
        return 0, 0
    val = float(true_accept) / float(n_same)
    frr = 1 - val
    far = float(false_accept) / float(n_diff)
    return frr, far
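A toy sanity check of these metrics (the scores and labels below are hypothetical, not from the repo):

    import numpy as np

    sims = np.array([0.9, 0.8, 0.3, 0.6, 0.1, 0.2])  # cosine similarities of trial pairs
    labels = np.array([1, 1, 0, 1, 0, 0])            # 1 = same speaker, 0 = different speaker
    fm, tpr, acc, eer = evaluate(sims, labels)
    print(f'F-measure={fm:.2f} TPR={tpr:.2f} ACC={acc:.2f} EER={eer:.2f}')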
speaker_recognition/test.py ADDED
@@ -0,0 +1,69 @@
import logging

import numpy as np
from tqdm import tqdm

from speaker_recognition.audio import Audio
from speaker_recognition.batcher import LazyTripletBatcher
from speaker_recognition.constants import NUM_FBANKS, NUM_FRAMES, CHECKPOINTS_TRIPLET_DIR, BATCH_SIZE
from speaker_recognition.conv_models import DeepSpeakerModel
from speaker_recognition.eval_metrics import evaluate
from speaker_recognition.utils import load_best_checkpoint, enable_deterministic

logger = logging.getLogger(__name__)


def batch_cosine_similarity(x1, x2):
    # https://en.wikipedia.org/wiki/Cosine_similarity
    # 1 = equal direction ; -1 = opposite direction
    mul = np.multiply(x1, x2)
    s = np.sum(mul, axis=1)

    # l1 = np.sum(np.multiply(x1, x1), axis=1)
    # l2 = np.sum(np.multiply(x2, x2), axis=1)
    # as the embeddings have length 1, we don't need to divide by the norms (they are 1).
    return s


def eval_model(working_dir: str, model: DeepSpeakerModel):
    enable_deterministic()
    audio = Audio(working_dir)
    batcher = LazyTripletBatcher(working_dir, NUM_FRAMES, model)
    speakers_list = list(audio.speakers_to_utterances.keys())
    num_negative_speakers = 99
    num_speakers = len(speakers_list)
    y_pred = np.zeros(shape=(num_speakers, num_negative_speakers + 1))  # negatives + positive
    for i, positive_speaker in tqdm(enumerate(speakers_list), desc='test', total=num_speakers):
        # convention: id[0] is the anchor speaker, id[1] is positive, id[2:] are negative.
        input_data = batcher.get_speaker_verification_data(positive_speaker, num_negative_speakers)
        # batch size is not relevant. just making sure we don't push too much on the GPU.
        predictions = model.m.predict(input_data, batch_size=BATCH_SIZE)
        anchor_embedding = predictions[0]
        for j, other_than_anchor_embedding in enumerate(predictions[1:]):  # positive + negatives
            y_pred[i][j] = batch_cosine_similarity([anchor_embedding], [other_than_anchor_embedding])[0]
        # y_pred[i] = softmax(y_pred[i])
        # could apply softmax here.
    y_true = np.zeros_like(y_pred)  # positive is at index 0.
    y_true[:, 0] = 1.0
    print(np.matrix(y_true))
    print(np.matrix(y_pred))
    print(np.min(y_pred), np.max(y_pred))
    fm, tpr, acc, eer = evaluate(y_pred, y_true)
    return fm, tpr, acc, eer


def test(working_dir, checkpoint_file=None):
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    dsm = DeepSpeakerModel(batch_input_shape)
    if checkpoint_file is None:
        checkpoint_file = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR)
    if checkpoint_file is not None:
        logger.info(f'Found checkpoint [{checkpoint_file}]. Loading weights...')
        dsm.m.load_weights(checkpoint_file, by_name=True)
    else:
        logger.info(f'Could not find any checkpoint in {CHECKPOINTS_TRIPLET_DIR}.')
        exit(1)

    fm, tpr, acc, eer = eval_model(working_dir, model=dsm)
    logger.info(f'f-measure = {fm:.3f}, true positive rate = {tpr:.3f}, '
                f'accuracy = {acc:.3f}, equal error rate = {eer:.3f}')
speaker_recognition/train.py ADDED
@@ -0,0 +1,111 @@
import logging
import os

# pylint: disable=E0611,E0401
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
# pylint: disable=E0611,E0401
from tensorflow.keras.optimizers import SGD
from tqdm import tqdm

from speaker_recognition.batcher import KerasFormatConverter, LazyTripletBatcher
from speaker_recognition.constants import BATCH_SIZE, CHECKPOINTS_SOFTMAX_DIR, CHECKPOINTS_TRIPLET_DIR, NUM_FRAMES, NUM_FBANKS
from speaker_recognition.conv_models import DeepSpeakerModel
from speaker_recognition.triplet_loss import deep_speaker_loss
from speaker_recognition.utils import load_best_checkpoint, ensures_dir

logger = logging.getLogger(__name__)

# Otherwise it's just too much logging from Tensorflow...
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


def fit_model(dsm: DeepSpeakerModel, working_dir: str, max_length: int = NUM_FRAMES, batch_size=BATCH_SIZE):
    batcher = LazyTripletBatcher(working_dir, max_length, dsm)

    # build a small test set.
    test_batches = []
    for _ in tqdm(range(200), desc='Build test set'):
        test_batches.append(batcher.get_batch_test(batch_size))

    def test_generator():
        while True:
            for bb in test_batches:
                yield bb

    def train_generator():
        while True:
            yield batcher.get_random_batch(batch_size, is_test=False)

    checkpoint_name = dsm.m.name + '_checkpoint'
    checkpoint_filename = os.path.join(CHECKPOINTS_TRIPLET_DIR, checkpoint_name + '_{epoch}.h5')
    checkpoint = ModelCheckpoint(monitor='val_loss', filepath=checkpoint_filename, save_best_only=True)
    dsm.m.fit(x=train_generator(), y=None, steps_per_epoch=2000, shuffle=False,
              epochs=1000, validation_data=test_generator(), validation_steps=len(test_batches),
              callbacks=[checkpoint])


def fit_model_softmax(dsm: DeepSpeakerModel, kx_train, ky_train, kx_test, ky_test,
                      batch_size=BATCH_SIZE, max_epochs=1000, initial_epoch=0):
    checkpoint_name = dsm.m.name + '_checkpoint'
    checkpoint_filename = os.path.join(CHECKPOINTS_SOFTMAX_DIR, checkpoint_name + '_{epoch}.h5')
    checkpoint = ModelCheckpoint(monitor='val_accuracy', filepath=checkpoint_filename, save_best_only=True)

    # if the accuracy does not increase by 0.1% over 20 epochs, we stop the training.
    early_stopping = EarlyStopping(monitor='val_accuracy', min_delta=0.001, patience=20, verbose=1, mode='max')

    # if the accuracy does not increase over 10 epochs, we reduce the learning rate by half.
    reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=10, min_lr=0.0001, verbose=1)

    max_len_train = len(kx_train) - len(kx_train) % batch_size
    kx_train = kx_train[0:max_len_train]
    ky_train = ky_train[0:max_len_train]
    max_len_test = len(kx_test) - len(kx_test) % batch_size
    kx_test = kx_test[0:max_len_test]
    ky_test = ky_test[0:max_len_test]

    dsm.m.fit(x=kx_train,
              y=ky_train,
              batch_size=batch_size,
              epochs=initial_epoch + max_epochs,
              initial_epoch=initial_epoch,
              verbose=1,
              shuffle=True,
              validation_data=(kx_test, ky_test),
              callbacks=[early_stopping, reduce_lr, checkpoint])


def start_training(working_dir, pre_training_phase=True):
    ensures_dir(CHECKPOINTS_SOFTMAX_DIR)
    ensures_dir(CHECKPOINTS_TRIPLET_DIR)
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    if pre_training_phase:
        logger.info('Softmax pre-training.')
        kc = KerasFormatConverter(working_dir)
        num_speakers_softmax = len(kc.categorical_speakers.speaker_ids)
        dsm = DeepSpeakerModel(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax)
        dsm.m.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
        if pre_training_checkpoint is not None:
            initial_epoch = int(pre_training_checkpoint.split('/')[-1].split('.')[0].split('_')[-1])
            logger.info(f'Initial epoch is {initial_epoch}.')
            logger.info(f'Loading softmax checkpoint: {pre_training_checkpoint}.')
            dsm.m.load_weights(pre_training_checkpoint)  # latest one.
        else:
            initial_epoch = 0
        fit_model_softmax(dsm, kc.kx_train, kc.ky_train, kc.kx_test, kc.ky_test, initial_epoch=initial_epoch)
    else:
        logger.info('Training with the triplet loss.')
        dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False)
        triplet_checkpoint = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR)
        pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
        if triplet_checkpoint is not None:
            logger.info(f'Loading triplet checkpoint: {triplet_checkpoint}.')
            dsm.m.load_weights(triplet_checkpoint)
        elif pre_training_checkpoint is not None:
            logger.info(f'Loading pre-training checkpoint: {pre_training_checkpoint}.')
            # If `by_name` is True, weights are loaded into layers only if they share the
            # same name. This is useful for fine-tuning or transfer-learning models where
            # some of the layers have changed.
            dsm.m.load_weights(pre_training_checkpoint, by_name=True)
        dsm.m.compile(optimizer=SGD(), loss=deep_speaker_loss)
        fit_model(dsm, working_dir, NUM_FRAMES)
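For reference, a minimal driver sketch for the two phases above (the `__main__` wrapper and the working-directory path are illustrative; the directory is assumed to already contain the cached fbank `.npy` files and Keras-format inputs):

    import logging

    if __name__ == '__main__':
        logging.basicConfig(level=logging.INFO)
        # Phase 1: softmax pre-training, then phase 2: triplet fine-tuning.
        start_training('/path/to/working_dir', pre_training_phase=True)
        start_training('/path/to/working_dir', pre_training_phase=False)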
speaker_recognition/triplet_loss.py ADDED
@@ -0,0 +1,63 @@
# pylint: disable=E0611,E0401
import tensorflow.keras.backend as K

# ALPHA = 0.2  # used in FaceNet https://arxiv.org/pdf/1503.03832.pdf
ALPHA = 0.1  # used in Deep Speaker.


def batch_cosine_similarity(x1, x2):
    # https://en.wikipedia.org/wiki/Cosine_similarity
    # 1 = equal direction ; -1 = opposite direction
    dot = K.squeeze(K.batch_dot(x1, x2, axes=1), axis=1)
    # as the embeddings have length 1, we don't need to divide by the norms (they are 1).
    return dot


def deep_speaker_loss(y_true, y_pred, alpha=ALPHA):
    # y_true is not used. we respect this convention:
    # y_true.shape = (batch_size, embedding_size) [not used]
    # y_pred.shape = (batch_size, embedding_size)
    # EXAMPLE:
    # _____________________________________________________
    # ANCHOR 1 (512,)
    # ANCHOR 2 (512,)
    # POS EX 1 (512,)
    # POS EX 2 (512,)
    # NEG EX 1 (512,)
    # NEG EX 2 (512,)
    # _____________________________________________________
    split = K.shape(y_pred)[0] // 3

    anchor = y_pred[0:split]
    positive_ex = y_pred[split:2 * split]
    negative_ex = y_pred[2 * split:]

    # If the loss does not decrease below ALPHA then the model does not learn anything.
    # If anchor = positive = negative (the model always outputs the same vector),
    # then sap = san = 1 and loss = max(alpha, 0) = alpha.
    # On the contrary, if anchor = positive = [1] and negative = [-1],
    # then sap = 1 and san = -1, so loss = max(-1 - 1 + 0.1, 0) = max(-1.9, 0) = 0.
    sap = batch_cosine_similarity(anchor, positive_ex)
    san = batch_cosine_similarity(anchor, negative_ex)
    loss = K.maximum(san - sap + alpha, 0.0)
    total_loss = K.mean(loss)
    return total_loss


if __name__ == '__main__':
    import numpy as np

    print(deep_speaker_loss(alpha=0.1, y_true=0, y_pred=np.array([[0.9], [1.0], [-1.0]])))
    print(deep_speaker_loss(alpha=1, y_true=0, y_pred=np.array([[0.9], [1.0], [-1.0]])))
    print(deep_speaker_loss(alpha=2, y_true=0, y_pred=np.array([[0.9], [1.0], [-1.0]])))
    print('--------------')
    print(deep_speaker_loss(alpha=2, y_true=0, y_pred=np.array([[0.6], [1.0], [0.0]])))
    print(deep_speaker_loss(alpha=1, y_true=0, y_pred=np.array([[0.6], [1.0], [0.0]])))
    print(deep_speaker_loss(alpha=0.1, y_true=0, y_pred=np.array([[0.6], [1.0], [0.0]])))
    print(deep_speaker_loss(alpha=0.2, y_true=0, y_pred=np.array([[0.6], [1.0], [0.0]])))

    print('--------------')
    print(deep_speaker_loss(alpha=2, y_true=0, y_pred=np.array([[0.9], [1.0], [-1.0]])))
    print(deep_speaker_loss(alpha=1, y_true=0, y_pred=np.array([[0.9], [1.0], [-1.0]])))
    print(deep_speaker_loss(alpha=0.1, y_true=0, y_pred=np.array([[0.9], [1.0], [-1.0]])))
    print(deep_speaker_loss(alpha=0.2, y_true=0, y_pred=np.array([[0.9], [1.0], [-1.0]])))
speaker_recognition/utils.py ADDED
@@ -0,0 +1,120 @@
import logging
import os
import random
import shutil
from glob import glob

import click
import dill
import numpy as np
import pandas as pd
from natsort import natsorted

from speaker_recognition.constants import TRAIN_TEST_RATIO

logger = logging.getLogger(__name__)


def find_files(directory, ext='wav'):
    return sorted(glob(directory + f'/**/*.{ext}', recursive=True))


def init_pandas():
    pd.set_option('display.float_format', lambda x: '%.3f' % x)
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)


def create_new_empty_dir(directory: str):
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)


def ensure_dir_for_filename(filename: str):
    ensures_dir(os.path.dirname(filename))


def ensures_dir(directory: str):
    if len(directory) > 0 and not os.path.exists(directory):
        os.makedirs(directory)


class ClickType:

    @staticmethod
    def input_file(writable=False):
        return click.Path(exists=True, file_okay=True, dir_okay=False,
                          writable=writable, readable=True, resolve_path=True)

    @staticmethod
    def input_dir(writable=False):
        return click.Path(exists=True, file_okay=False, dir_okay=True,
                          writable=writable, readable=True, resolve_path=True)

    @staticmethod
    def output_file():
        return click.Path(exists=False, file_okay=True, dir_okay=False,
                          writable=True, readable=True, resolve_path=True)

    @staticmethod
    def output_dir():
        return click.Path(exists=False, file_okay=False, dir_okay=True,
                          writable=True, readable=True, resolve_path=True)


def parallel_function(f, sequence, num_threads=None):
    from multiprocessing import Pool
    pool = Pool(processes=num_threads)
    result = pool.map(f, sequence)
    cleaned = [x for x in result if x is not None]
    pool.close()
    pool.join()
    return cleaned


def load_best_checkpoint(checkpoint_dir):
    checkpoints = natsorted(glob(os.path.join(checkpoint_dir, '*.h5')))
    if len(checkpoints) != 0:
        return checkpoints[-1]
    return None


def delete_older_checkpoints(checkpoint_dir, max_to_keep=5):
    assert max_to_keep > 0
    checkpoints = natsorted(glob(os.path.join(checkpoint_dir, '*.h5')))
    checkpoints_to_keep = checkpoints[-max_to_keep:]
    for checkpoint in checkpoints:
        if checkpoint not in checkpoints_to_keep:
            os.remove(checkpoint)


def enable_deterministic():
    print('Deterministic mode enabled.')
    np.random.seed(123)
    random.seed(123)


def load_pickle(file):
    if not os.path.exists(file):
        return None
    logger.info(f'Loading PKL file: {file}.')
    with open(file, 'rb') as r:
        return dill.load(r)


def load_npy(file):
    if not os.path.exists(file):
        return None
    logger.info(f'Loading NPY file: {file}.')
    return np.load(file)


def train_test_sp_to_utt(audio, is_test):
    sp_to_utt = {}
    for speaker_id, utterances in audio.speakers_to_utterances.items():
        utterances_files = sorted(utterances.values())
        train_test_sep = int(len(utterances_files) * TRAIN_TEST_RATIO)
        sp_to_utt[speaker_id] = utterances_files[train_test_sep:] if is_test else utterances_files[:train_test_sep]
    return sp_to_utt