Upload 14 files
- app.py +14 -0
- requirements.txt +2 -0
- speaker_recognition/__init__.py +0 -0
- speaker_recognition/__pycache__/audio.cpython-310.pyc +0 -0
- speaker_recognition/app.py +60 -0
- speaker_recognition/audio.py +121 -0
- speaker_recognition/batcher.py +505 -0
- speaker_recognition/constants.py +18 -0
- speaker_recognition/conv_models.py +296 -0
- speaker_recognition/eval_metrics.py +84 -0
- speaker_recognition/test.py +69 -0
- speaker_recognition/train.py +111 -0
- speaker_recognition/triplet_loss.py +63 -0
- speaker_recognition/utils.py +120 -0
app.py
ADDED
@@ -0,0 +1,14 @@
+import gradio as gr
+
+from speaker_recognition import app
+
+
+
+def recognition(audio):
+    do = app.speaker_recognition()
+    return do.run_transform(audio)
+
+demo = gr.Interface(fn=recognition, inputs=["audio"], outputs="text")
+
+demo.launch()
+
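A minimal usage sketch (not part of the uploaded files): run_transform() returns a (1, 512) NumPy embedding, while the interface above declares outputs="text", so the result may need to be rendered as a string. This assumes the weights load as in speaker_recognition/app.py; the helper name format_embedding is hypothetical.

import numpy as np

def format_embedding(embedding) -> str:
    # Summarize the (1, 512) L2-normalized embedding as a short string for the "text" output.
    vec = np.asarray(embedding).ravel()
    return f"dim={vec.size}, norm={np.linalg.norm(vec):.3f}, first5={np.round(vec[:5], 4).tolist()}"

# e.g. inside recognition(): return format_embedding(do.run_transform(audio))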
requirements.txt
ADDED
@@ -0,0 +1,2 @@
+gdown
+numpy
speaker_recognition/__init__.py
ADDED
File without changes
speaker_recognition/__pycache__/audio.cpython-310.pyc
ADDED
Binary file (4.64 kB)
speaker_recognition/app.py
ADDED
@@ -0,0 +1,60 @@
+import gdown
+import random
+
+import numpy as np
+
+from audio import read_mfcc
+from batcher import sample_from_mfcc
+from constants import SAMPLE_RATE, NUM_FRAMES
+from conv_models import DeepSpeakerModel
+from test import batch_cosine_similarity
+
+class speaker_recognition:
+    def __init__(self):
+
+        np.random.seed(123)
+        random.seed(123)
+
+        self.speakers = {}
+        self.weights = ""
+        self.by_name = True
+
+        self.SAMPLE_RATE = SAMPLE_RATE
+        self.NUM_FRAMES = NUM_FRAMES
+
+        self.spin_up()
+
+    def spin_up(self):
+        if self.weights == "":
+            output = "weights.h5"
+            gdown.download("https://drive.google.com/uc?id=1F9NvdrarWZNktdX9KlRYWWHDwRkip_aP", output, quiet=False)
+            self.weights = "weights.h5"
+
+        self.model = DeepSpeakerModel()
+        self.model.m.load_weights(self.weights, by_name=True)
+
+    def create_speaker(self, data, id=""):
+        id = id if id != "" else f"{len(self.speakers)}"
+        self.speakers[id] = data
+        return id
+
+    def check_speakers(self, data, id="", threshold=0.5):
+        us = ""
+        n = 0
+        for speaker in self.speakers:
+            k = batch_cosine_similarity(self.speakers[speaker], data)
+            if k > threshold:
+                if k > n:
+                    n = k
+                    us = speaker
+            else: pass
+        if n == 0:
+            id = self.create_speaker(data, id)
+            return f"created new speaker : {id}"
+
+        return (us, k[0])
+
+    def run_transform(self, audio, pcm=False):
+        data = sample_from_mfcc(read_mfcc(audio, self.SAMPLE_RATE), self.NUM_FRAMES)
+        data = self.model.m.predict(np.expand_dims(data, axis=0))
+        return data
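A minimal enrollment/verification sketch using the class above (not part of the uploaded files), assuming two recordings of the same speaker are available; the file names are placeholders.

do = speaker_recognition()                   # downloads weights.h5 via gdown on first use
emb_a = do.run_transform("alice_utt1.wav")   # (1, 512) embedding
do.create_speaker(emb_a, id="alice")         # enroll
emb_b = do.run_transform("alice_utt2.wav")
print(do.check_speakers(emb_b))              # ('alice', score) when cosine similarity > 0.5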
speaker_recognition/audio.py
ADDED
@@ -0,0 +1,121 @@
+import logging
+import os
+from collections import defaultdict
+from pathlib import Path
+
+import librosa
+import numpy as np
+from python_speech_features import fbank
+from tqdm import tqdm
+
+from deep_speaker.constants import SAMPLE_RATE, NUM_FBANKS
+from deep_speaker.utils import find_files, ensures_dir
+
+logger = logging.getLogger(__name__)
+
+
+def read_mfcc(input_filename, sample_rate):
+    audio = Audio.read(input_filename, sample_rate)
+    energy = np.abs(audio)
+    silence_threshold = np.percentile(energy, 95)
+    offsets = np.where(energy > silence_threshold)[0]
+    # left_blank_duration_ms = (1000.0 * offsets[0]) // self.sample_rate  # frame_id to duration (ms)
+    # right_blank_duration_ms = (1000.0 * (len(audio) - offsets[-1])) // self.sample_rate
+    # TODO: could use trim_silence() here or a better VAD.
+    audio_voice_only = audio[offsets[0]:offsets[-1]]
+    mfcc = mfcc_fbank(audio_voice_only, sample_rate)
+    return mfcc
+
+
+def extract_speaker_and_utterance_ids(filename: str):  # LIBRI.
+    # 'audio/dev-other/116/288045/116-288045-0000.flac'
+    speaker, _, basename = Path(filename).parts[-3:]
+    filename.split('-')
+    utterance = os.path.splitext(basename.split('-', 1)[-1])[0]
+    assert basename.split('-')[0] == speaker
+    return speaker, utterance
+
+
+class Audio:
+
+    def __init__(self, cache_dir: str, audio_dir: str = None, sample_rate: int = SAMPLE_RATE, ext='flac'):
+        self.ext = ext
+        self.cache_dir = os.path.join(cache_dir, 'audio-fbanks')
+        ensures_dir(self.cache_dir)
+        if audio_dir is not None:
+            self.build_cache(os.path.expanduser(audio_dir), sample_rate)
+        self.speakers_to_utterances = defaultdict(dict)
+        for cache_file in find_files(self.cache_dir, ext='npy'):
+            # /path/to/speaker_utterance.npy
+            speaker_id, utterance_id = Path(cache_file).stem.split('_')
+            self.speakers_to_utterances[speaker_id][utterance_id] = cache_file
+
+    @property
+    def speaker_ids(self):
+        return sorted(self.speakers_to_utterances)
+
+    @staticmethod
+    def trim_silence(audio, threshold):
+        """Removes silence at the beginning and end of a sample."""
+        # pylint: disable=E1121
+        energy = librosa.feature.rms(audio)
+        frames = np.nonzero(np.array(energy > threshold))
+        indices = librosa.core.frames_to_samples(frames)[1]
+
+        # Note: indices can be an empty array, if the whole audio was silence.
+        audio_trim = audio[0:0]
+        left_blank = audio[0:0]
+        right_blank = audio[0:0]
+        if indices.size:
+            audio_trim = audio[indices[0]:indices[-1]]
+            left_blank = audio[:indices[0]]  # slice before.
+            right_blank = audio[indices[-1]:]  # slice after.
+        return audio_trim, left_blank, right_blank
+
+    @staticmethod
+    def read(filename, sample_rate=SAMPLE_RATE):
+        audio, sr = librosa.load(filename, sr=sample_rate, mono=True, dtype=np.float32)
+        assert sr == sample_rate
+        return audio
+
+    def build_cache(self, audio_dir, sample_rate):
+        logger.info(f'audio_dir: {audio_dir}.')
+        logger.info(f'sample_rate: {sample_rate:,} hz.')
+        audio_files = find_files(audio_dir, ext=self.ext)
+        audio_files_count = len(audio_files)
+        assert audio_files_count != 0, f'Could not find any {self.ext} files in {audio_dir}.'
+        logger.info(f'Found {audio_files_count:,} files in {audio_dir}.')
+        with tqdm(audio_files) as bar:
+            for audio_filename in bar:
+                bar.set_description(audio_filename)
+                self.cache_audio_file(audio_filename, sample_rate)
+
+    def cache_audio_file(self, input_filename, sample_rate):
+        sp, utt = extract_speaker_and_utterance_ids(input_filename)
+        cache_filename = os.path.join(self.cache_dir, f'{sp}_{utt}.npy')
+        if not os.path.isfile(cache_filename):
+            try:
+                mfcc = read_mfcc(input_filename, sample_rate)
+                np.save(cache_filename, mfcc)
+            except librosa.util.exceptions.ParameterError as e:
+                logger.error(e)
+
+
+def pad_mfcc(mfcc, max_length):  # num_frames, nfilt=64.
+    if len(mfcc) < max_length:
+        mfcc = np.vstack((mfcc, np.tile(np.zeros(mfcc.shape[1]), (max_length - len(mfcc), 1))))
+    return mfcc
+
+
+def mfcc_fbank(signal: np.array, sample_rate: int):  # 1D signal array.
+    # Returns MFCC with shape (num_frames, n_filters, 3).
+    filter_banks, energies = fbank(signal, samplerate=sample_rate, nfilt=NUM_FBANKS)
+    frames_features = normalize_frames(filter_banks)
+    # delta_1 = delta(filter_banks, N=1)
+    # delta_2 = delta(delta_1, N=1)
+    # frames_features = np.transpose(np.stack([filter_banks, delta_1, delta_2]), (1, 2, 0))
+    return np.array(frames_features, dtype=np.float32)  # Float32 precision is enough here.
+
+
+def normalize_frames(m, epsilon=1e-12):
+    return [(v - np.mean(v)) / max(np.std(v), epsilon) for v in m]
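A minimal sketch of the feature pipeline defined above (not part of the uploaded files), assuming a 16 kHz mono recording and the flat import layout used by speaker_recognition/app.py (sample_from_mfcc comes from batcher.py below); the file name is a placeholder.

from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES

mfcc = read_mfcc("utterance.flac", SAMPLE_RATE)  # (num_frames, 64) per-frame-normalized filter banks
window = sample_from_mfcc(mfcc, NUM_FRAMES)      # (160, 64, 1): random crop if long enough, zero-padded otherwise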
speaker_recognition/batcher.py
ADDED
@@ -0,0 +1,505 @@
+import json
+import logging
+import os
+from collections import deque, Counter
+from random import choice
+from time import time
+
+import dill
+import numpy as np
+from tqdm import tqdm
+
+from deep_speaker.audio import pad_mfcc, Audio
+from deep_speaker.constants import NUM_FRAMES, NUM_FBANKS
+from deep_speaker.conv_models import DeepSpeakerModel
+from deep_speaker.utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt
+
+logger = logging.getLogger(__name__)
+
+
+def extract_speaker(utt_file):
+    return utt_file.split('/')[-1].split('_')[0]
+
+
+def sample_from_mfcc(mfcc, max_length):
+    if mfcc.shape[0] >= max_length:
+        r = choice(range(0, len(mfcc) - max_length + 1))
+        s = mfcc[r:r + max_length]
+    else:
+        s = pad_mfcc(mfcc, max_length)
+    return np.expand_dims(s, axis=-1)
+
+
+def sample_from_mfcc_file(utterance_file, max_length):
+    mfcc = np.load(utterance_file)
+    return sample_from_mfcc(mfcc, max_length)
+
+
+class KerasFormatConverter:
+
+    def __init__(self, working_dir, load_test_only=False):
+        self.working_dir = working_dir
+        self.output_dir = os.path.join(self.working_dir, 'keras-inputs')
+        ensures_dir(self.output_dir)
+        self.categorical_speakers = load_pickle(os.path.join(self.output_dir, 'categorical_speakers.pkl'))
+        if not load_test_only:
+            self.kx_train = load_npy(os.path.join(self.output_dir, 'kx_train.npy'))
+            self.ky_train = load_npy(os.path.join(self.output_dir, 'ky_train.npy'))
+        self.kx_test = load_npy(os.path.join(self.output_dir, 'kx_test.npy'))
+        self.ky_test = load_npy(os.path.join(self.output_dir, 'ky_test.npy'))
+        self.audio = Audio(cache_dir=self.working_dir, audio_dir=None)
+        if self.categorical_speakers is None:
+            self.categorical_speakers = SparseCategoricalSpeakers(self.audio.speaker_ids)
+
+    def persist_to_disk(self):
+        with open(os.path.join(self.output_dir, 'categorical_speakers.pkl'), 'wb') as w:
+            dill.dump(self.categorical_speakers, w)
+        np.save(os.path.join(self.output_dir, 'kx_train.npy'), self.kx_train)
+        np.save(os.path.join(self.output_dir, 'kx_test.npy'), self.kx_test)
+        np.save(os.path.join(self.output_dir, 'ky_train.npy'), self.ky_train)
+        np.save(os.path.join(self.output_dir, 'ky_test.npy'), self.ky_test)
+
+    def generate_per_phase(self, max_length=NUM_FRAMES, num_per_speaker=3000, is_test=False):
+        # train OR test.
+        num_speakers = len(self.audio.speaker_ids)
+        sp_to_utt = train_test_sp_to_utt(self.audio, is_test)
+
+        # 64 fbanks 1 channel(s).
+        # float32
+        kx = np.zeros((num_speakers * num_per_speaker, max_length, NUM_FBANKS, 1), dtype=np.float32)
+        ky = np.zeros((num_speakers * num_per_speaker, 1), dtype=np.float32)
+
+        desc = f'Converting to Keras format [{"test" if is_test else "train"}]'
+        for i, speaker_id in enumerate(tqdm(self.audio.speaker_ids, desc=desc)):
+            utterances_files = sp_to_utt[speaker_id]
+            for j, utterance_file in enumerate(np.random.choice(utterances_files, size=num_per_speaker, replace=True)):
+                self.load_into_mat(utterance_file, self.categorical_speakers, speaker_id, max_length, kx, ky,
+                                   i * num_per_speaker + j)
+        return kx, ky
+
+    def generate(self, max_length=NUM_FRAMES, counts_per_speaker=(3000, 500)):
+        kx_train, ky_train = self.generate_per_phase(max_length, counts_per_speaker[0], is_test=False)
+        kx_test, ky_test = self.generate_per_phase(max_length, counts_per_speaker[1], is_test=True)
+        logger.info(f'kx_train.shape = {kx_train.shape}')
+        logger.info(f'ky_train.shape = {ky_train.shape}')
+        logger.info(f'kx_test.shape = {kx_test.shape}')
+        logger.info(f'ky_test.shape = {ky_test.shape}')
+        self.kx_train, self.ky_train, self.kx_test, self.ky_test = kx_train, ky_train, kx_test, ky_test
+
+    @staticmethod
+    def load_into_mat(utterance_file, categorical_speakers, speaker_id, max_length, kx, ky, i):
+        kx[i] = sample_from_mfcc_file(utterance_file, max_length)
+        ky[i] = categorical_speakers.get_index(speaker_id)
+
+
+class SparseCategoricalSpeakers:
+
+    def __init__(self, speakers_list):
+        self.speaker_ids = sorted(speakers_list)
+        assert len(set(self.speaker_ids)) == len(self.speaker_ids)  # all unique.
+        self.map = dict(zip(self.speaker_ids, range(len(self.speaker_ids))))
+
+    def get_index(self, speaker_id):
+        return self.map[speaker_id]
+
+
+class OneHotSpeakers:
+
+    def __init__(self, speakers_list):
+        # pylint: disable=E0611,E0401
+        from tensorflow.keras.utils import to_categorical
+        self.speaker_ids = sorted(speakers_list)
+        self.int_speaker_ids = list(range(len(self.speaker_ids)))
+        self.map_speakers_to_index = dict([(k, v) for (k, v) in zip(self.speaker_ids, self.int_speaker_ids)])
+        self.map_index_to_speakers = dict([(v, k) for (k, v) in zip(self.speaker_ids, self.int_speaker_ids)])
+        self.speaker_categories = to_categorical(self.int_speaker_ids, num_classes=len(self.speaker_ids))
+
+    def get_speaker_from_index(self, index):
+        return self.map_index_to_speakers[index]
+
+    def get_one_hot(self, speaker_id):
+        index = self.map_speakers_to_index[speaker_id]
+        return self.speaker_categories[index]
+
+
+class LazyTripletBatcher:
+    def __init__(self, working_dir: str, max_length: int, model: DeepSpeakerModel):
+        self.working_dir = working_dir
+        self.audio = Audio(cache_dir=working_dir)
+        logger.info(f'Picking audio from {working_dir}.')
+        self.sp_to_utt_train = train_test_sp_to_utt(self.audio, is_test=False)
+        self.sp_to_utt_test = train_test_sp_to_utt(self.audio, is_test=True)
+        self.max_length = max_length
+        self.model = model
+        self.nb_per_speaker = 2
+        self.nb_speakers = 640
+        self.history_length = 4
+        self.history_every = 100  # batches.
+        self.total_history_length = self.nb_speakers * self.nb_per_speaker * self.history_length  # 25,600
+        self.metadata_train_speakers = Counter()
+        self.metadata_output_file = os.path.join(self.working_dir, 'debug_batcher.json')
+
+        self.history_embeddings_train = deque(maxlen=self.total_history_length)
+        self.history_utterances_train = deque(maxlen=self.total_history_length)
+        self.history_model_inputs_train = deque(maxlen=self.total_history_length)
+
+        self.history_embeddings = None
+        self.history_utterances = None
+        self.history_model_inputs = None
+
+        self.batch_count = 0
+        for _ in tqdm(range(self.history_length), desc='Initializing the batcher'):  # init history.
+            self.update_triplets_history()
+
+    def update_triplets_history(self):
+        model_inputs = []
+        speakers = list(self.audio.speakers_to_utterances.keys())
+        np.random.shuffle(speakers)
+        selected_speakers = speakers[: self.nb_speakers]
+        embeddings_utterances = []
+        for speaker_id in selected_speakers:
+            train_utterances = self.sp_to_utt_train[speaker_id]
+            for selected_utterance in np.random.choice(a=train_utterances, size=self.nb_per_speaker, replace=False):
+                mfcc = sample_from_mfcc_file(selected_utterance, self.max_length)
+                embeddings_utterances.append(selected_utterance)
+                model_inputs.append(mfcc)
+        embeddings = self.model.m.predict(np.array(model_inputs))
+        assert embeddings.shape[-1] == 512
+        embeddings = np.reshape(embeddings, (len(selected_speakers), self.nb_per_speaker, 512))
+        self.history_embeddings_train.extend(list(embeddings.reshape((-1, 512))))
+        self.history_utterances_train.extend(embeddings_utterances)
+        self.history_model_inputs_train.extend(model_inputs)
+
+        # reason: can't index a deque with a np.array.
+        self.history_embeddings = np.array(self.history_embeddings_train)
+        self.history_utterances = np.array(self.history_utterances_train)
+        self.history_model_inputs = np.array(self.history_model_inputs_train)
+
+        with open(self.metadata_output_file, 'w') as w:
+            json.dump(obj=dict(self.metadata_train_speakers), fp=w, indent=2)
+
+    def get_batch(self, batch_size, is_test=False):
+        return self.get_batch_test(batch_size) if is_test else self.get_random_batch(batch_size, is_test=False)
+
+    def get_batch_test(self, batch_size):
+        return self.get_random_batch(batch_size, is_test=True)
+
+    def get_random_batch(self, batch_size, is_test=False):
+        sp_to_utt = self.sp_to_utt_test if is_test else self.sp_to_utt_train
+        speakers = list(self.audio.speakers_to_utterances.keys())
+        anchor_speakers = np.random.choice(speakers, size=batch_size // 3, replace=False)
+
+        anchor_utterances = []
+        positive_utterances = []
+        negative_utterances = []
+        for anchor_speaker in anchor_speakers:
+            negative_speaker = np.random.choice(list(set(speakers) - {anchor_speaker}), size=1)[0]
+            assert negative_speaker != anchor_speaker
+            pos_utterances = np.random.choice(sp_to_utt[anchor_speaker], 2, replace=False)
+            neg_utterance = np.random.choice(sp_to_utt[negative_speaker], 1, replace=True)[0]
+            anchor_utterances.append(pos_utterances[0])
+            positive_utterances.append(pos_utterances[1])
+            negative_utterances.append(neg_utterance)
+
+        # anchor and positive should have different utterances (but same speaker!).
+        anc_pos = np.array([positive_utterances, anchor_utterances])
+        assert np.all(anc_pos[0, :] != anc_pos[1, :])
+        assert np.all(np.array([extract_speaker(s) for s in anc_pos[0, :]]) == np.array(
+            [extract_speaker(s) for s in anc_pos[1, :]]))
+
+        pos_neg = np.array([positive_utterances, negative_utterances])
+        assert np.all(pos_neg[0, :] != pos_neg[1, :])
+        assert np.all(np.array([extract_speaker(s) for s in pos_neg[0, :]]) != np.array(
+            [extract_speaker(s) for s in pos_neg[1, :]]))
+
+        batch_x = np.vstack([
+            [sample_from_mfcc_file(u, self.max_length) for u in anchor_utterances],
+            [sample_from_mfcc_file(u, self.max_length) for u in positive_utterances],
+            [sample_from_mfcc_file(u, self.max_length) for u in negative_utterances]
+        ])
+
+        batch_y = np.zeros(shape=(len(batch_x), 1))  # dummy. sparse softmax needs something.
+        return batch_x, batch_y
+
+    def get_batch_train(self, batch_size):
+        from deep_speaker.test import batch_cosine_similarity
+        # s1 = time()
+        self.batch_count += 1
+        if self.batch_count % self.history_every == 0:
+            self.update_triplets_history()
+
+        all_indexes = range(len(self.history_embeddings_train))
+        anchor_indexes = np.random.choice(a=all_indexes, size=batch_size // 3, replace=False)
+
+        # s2 = time()
+        similar_negative_indexes = []
+        dissimilar_positive_indexes = []
+        # could be made parallel.
+        for anchor_index in anchor_indexes:
+            # s21 = time()
+            anchor_embedding = self.history_embeddings[anchor_index]
+            anchor_speaker = extract_speaker(self.history_utterances[anchor_index])
+
+            # why self.nb_speakers // 2? just random. because it is fast. otherwise it's too much.
+            negative_indexes = [j for (j, a) in enumerate(self.history_utterances)
+                                if extract_speaker(a) != anchor_speaker]
+            negative_indexes = np.random.choice(negative_indexes, size=self.nb_speakers // 2)
+
+            # s22 = time()
+
+            anchor_embedding_tile = [anchor_embedding] * len(negative_indexes)
+            anchor_cos = batch_cosine_similarity(anchor_embedding_tile, self.history_embeddings[negative_indexes])
+
+            # s23 = time()
+            similar_negative_index = negative_indexes[np.argsort(anchor_cos)[-1]]  # [-1:]
+            similar_negative_indexes.append(similar_negative_index)
+
+            # s24 = time()
+            positive_indexes = [j for (j, a) in enumerate(self.history_utterances) if
+                                extract_speaker(a) == anchor_speaker and j != anchor_index]
+            # s25 = time()
+            anchor_embedding_tile = [anchor_embedding] * len(positive_indexes)
+            # s26 = time()
+            anchor_cos = batch_cosine_similarity(anchor_embedding_tile, self.history_embeddings[positive_indexes])
+            dissimilar_positive_index = positive_indexes[np.argsort(anchor_cos)[0]]  # [:1]
+            dissimilar_positive_indexes.append(dissimilar_positive_index)
+            # s27 = time()
+
+        # s3 = time()
+        batch_x = np.vstack([
+            self.history_model_inputs[anchor_indexes],
+            self.history_model_inputs[dissimilar_positive_indexes],
+            self.history_model_inputs[similar_negative_indexes]
+        ])
+
+        # s4 = time()
+
+        # for anchor, positive, negative in zip(history_utterances[anchor_indexes],
+        #                                       history_utterances[dissimilar_positive_indexes],
+        #                                       history_utterances[similar_negative_indexes]):
+        #     print('anchor', os.path.basename(anchor),
+        #           'positive', os.path.basename(positive),
+        #           'negative', os.path.basename(negative))
+        #     print('_' * 80)
+
+        # assert utterances as well positive != anchor.
+        anchor_speakers = [extract_speaker(a) for a in self.history_utterances[anchor_indexes]]
+        positive_speakers = [extract_speaker(a) for a in self.history_utterances[dissimilar_positive_indexes]]
+        negative_speakers = [extract_speaker(a) for a in self.history_utterances[similar_negative_indexes]]
+
+        assert len(anchor_indexes) == len(dissimilar_positive_indexes)
+        assert len(similar_negative_indexes) == len(dissimilar_positive_indexes)
+        assert list(self.history_utterances[dissimilar_positive_indexes]) != list(
+            self.history_utterances[anchor_indexes])
+        assert anchor_speakers == positive_speakers
+        assert negative_speakers != anchor_speakers
+
+        batch_y = np.zeros(shape=(len(batch_x), 1))  # dummy. sparse softmax needs something.
+
+        for a in anchor_speakers:
+            self.metadata_train_speakers[a] += 1
+        for a in positive_speakers:
+            self.metadata_train_speakers[a] += 1
+        for a in negative_speakers:
+            self.metadata_train_speakers[a] += 1
+
+        # s5 = time()
+        # print('1-2', s2 - s1)
+        # print('2-3', s3 - s2)
+        # print('3-4', s4 - s3)
+        # print('4-5', s5 - s4)
+        # print('21-22', (s22 - s21) * (batch_size // 3))
+        # print('22-23', (s23 - s22) * (batch_size // 3))
+        # print('23-24', (s24 - s23) * (batch_size // 3))
+        # print('24-25', (s25 - s24) * (batch_size // 3))
+        # print('25-26', (s26 - s25) * (batch_size // 3))
+        # print('26-27', (s27 - s26) * (batch_size // 3))
+
+        return batch_x, batch_y
+
+    def get_speaker_verification_data(self, anchor_speaker, num_different_speakers):
+        speakers = list(self.audio.speakers_to_utterances.keys())
+        anchor_utterances = []
+        positive_utterances = []
+        negative_utterances = []
+        negative_speakers = np.random.choice(list(set(speakers) - {anchor_speaker}), size=num_different_speakers)
+        assert [negative_speaker != anchor_speaker for negative_speaker in negative_speakers]
+        pos_utterances = np.random.choice(self.sp_to_utt_test[anchor_speaker], 2, replace=False)
+        neg_utterances = [np.random.choice(self.sp_to_utt_test[neg], 1, replace=True)[0] for neg in negative_speakers]
+        anchor_utterances.append(pos_utterances[0])
+        positive_utterances.append(pos_utterances[1])
+        negative_utterances.extend(neg_utterances)
+
+        # anchor and positive should have different utterances (but same speaker!).
+        anc_pos = np.array([positive_utterances, anchor_utterances])
+        assert np.all(anc_pos[0, :] != anc_pos[1, :])
+        assert np.all(np.array([extract_speaker(s) for s in anc_pos[0, :]]) == np.array(
+            [extract_speaker(s) for s in anc_pos[1, :]]))
+
+        batch_x = np.vstack([
+            [sample_from_mfcc_file(u, self.max_length) for u in anchor_utterances],
+            [sample_from_mfcc_file(u, self.max_length) for u in positive_utterances],
+            [sample_from_mfcc_file(u, self.max_length) for u in negative_utterances]
+        ])
+
+        batch_y = np.zeros(shape=(len(batch_x), 1))  # dummy. sparse softmax needs something.
+        return batch_x, batch_y
+
+
+class TripletBatcher:
+
+    def __init__(self, kx_train, ky_train, kx_test, ky_test):
+        self.kx_train = kx_train
+        self.ky_train = ky_train
+        self.kx_test = kx_test
+        self.ky_test = ky_test
+        speakers_list = sorted(set(ky_train.argmax(axis=1)))
+        num_different_speakers = len(speakers_list)
+        assert speakers_list == sorted(set(ky_test.argmax(axis=1)))  # train speakers = test speakers.
+        assert speakers_list == list(range(num_different_speakers))
+        self.train_indices_per_speaker = {}
+        self.test_indices_per_speaker = {}
+
+        for speaker_id in speakers_list:
+            self.train_indices_per_speaker[speaker_id] = list(np.where(ky_train.argmax(axis=1) == speaker_id)[0])
+            self.test_indices_per_speaker[speaker_id] = list(np.where(ky_test.argmax(axis=1) == speaker_id)[0])
+
+        # check.
+        # print(sorted(sum([v for v in self.train_indices_per_speaker.values()], [])))
+        # print(range(len(ky_train)))
+        assert sorted(sum([v for v in self.train_indices_per_speaker.values()], [])) == sorted(range(len(ky_train)))
+        assert sorted(sum([v for v in self.test_indices_per_speaker.values()], [])) == sorted(range(len(ky_test)))
+        self.speakers_list = speakers_list
+
+    def select_speaker_data(self, speaker, n, is_test):
+        x = self.kx_test if is_test else self.kx_train
+        indices_per_speaker = self.test_indices_per_speaker if is_test else self.train_indices_per_speaker
+        indices = np.random.choice(indices_per_speaker[speaker], size=n)
+        return x[indices]
+
+    def get_batch(self, batch_size, is_test=False):
+        # y = self.ky_test if is_test else self.ky_train
+
+        two_different_speakers = np.random.choice(self.speakers_list, size=2, replace=False)
+        anchor_positive_speaker = two_different_speakers[0]
+        negative_speaker = two_different_speakers[1]
+        assert negative_speaker != anchor_positive_speaker
+
+        batch_x = np.vstack([
+            self.select_speaker_data(anchor_positive_speaker, batch_size // 3, is_test),
+            self.select_speaker_data(anchor_positive_speaker, batch_size // 3, is_test),
+            self.select_speaker_data(negative_speaker, batch_size // 3, is_test)
+        ])
+
+        batch_y = np.zeros(shape=(len(batch_x), len(self.speakers_list)))
+        return batch_x, batch_y
+
+
+class TripletBatcherMiner(TripletBatcher):
+
+    def __init__(self, kx_train, ky_train, kx_test, ky_test, model: DeepSpeakerModel):
+        super().__init__(kx_train, ky_train, kx_test, ky_test)
+        self.model = model
+        self.num_evaluations_to_find_best_batch = 10
+
+    def get_batch(self, batch_size, is_test=False):
+        if is_test:
+            return super().get_batch(batch_size, is_test)
+        max_loss = 0
+        max_batch = None, None
+        for i in range(self.num_evaluations_to_find_best_batch):
+            bx, by = super().get_batch(batch_size, is_test=False)  # only train here.
+            loss = self.model.m.evaluate(bx, by, batch_size=batch_size, verbose=0)
+            if loss > max_loss:
+                max_loss = loss
+                max_batch = bx, by
+        return max_batch
+
+
+class TripletBatcherSelectHardNegatives(TripletBatcher):
+
+    def __init__(self, kx_train, ky_train, kx_test, ky_test, model: DeepSpeakerModel):
+        super().__init__(kx_train, ky_train, kx_test, ky_test)
+        self.model = model
+
+    def get_batch(self, batch_size, is_test=False, predict=None):
+        if predict is None:
+            predict = self.model.m.predict
+        from deep_speaker.test import batch_cosine_similarity
+        num_triplets = batch_size // 3
+        inputs = []
+        k = 2  # do not change this.
+        for speaker in self.speakers_list:
+            inputs.append(self.select_speaker_data(speaker, n=k, is_test=is_test))
+        inputs = np.array(inputs)  # num_speakers * [k, num_frames, num_fbanks, 1].
+        embeddings = predict(np.vstack(inputs))
+        assert embeddings.shape[-1] == 512
+        # (speaker, utterance, 512)
+        embeddings = np.reshape(embeddings, (len(self.speakers_list), k, 512))
+        cs = batch_cosine_similarity(embeddings[:, 0], embeddings[:, 1])
+        arg_sort = np.argsort(cs)
+        assert len(arg_sort) > num_triplets
+        anchor_speakers = arg_sort[0:num_triplets]
+
+        anchor_embeddings = embeddings[anchor_speakers, 0]
+        negative_speakers = sorted(set(self.speakers_list) - set(anchor_speakers))
+        negative_embeddings = embeddings[negative_speakers, 0]
+
+        selected_negative_speakers = []
+        for anchor_embedding in anchor_embeddings:
+            cs_negative = [batch_cosine_similarity([anchor_embedding], neg) for neg in negative_embeddings]
+            selected_negative_speakers.append(negative_speakers[int(np.argmax(cs_negative))])
+
+        # anchor with frame 0.
+        # positive with frame 1.
+        # negative with frame 0.
+        assert len(set(selected_negative_speakers).intersection(anchor_speakers)) == 0
+        negative = inputs[selected_negative_speakers, 0]
+        positive = inputs[anchor_speakers, 1]
+        anchor = inputs[anchor_speakers, 0]
+        batch_x = np.vstack([anchor, positive, negative])
+        batch_y = np.zeros(shape=(len(batch_x), len(self.speakers_list)))
+        return batch_x, batch_y
+
+
+class TripletEvaluator:
+
+    def __init__(self, kx_test, ky_test):
+        self.kx_test = kx_test
+        self.ky_test = ky_test
+        speakers_list = sorted(set(ky_test.argmax(axis=1)))
+        num_different_speakers = len(speakers_list)
+        assert speakers_list == list(range(num_different_speakers))
+        self.test_indices_per_speaker = {}
+        for speaker_id in speakers_list:
+            self.test_indices_per_speaker[speaker_id] = list(np.where(ky_test.argmax(axis=1) == speaker_id)[0])
+        assert sorted(sum([v for v in self.test_indices_per_speaker.values()], [])) == sorted(range(len(ky_test)))
+        self.speakers_list = speakers_list
+
+    def _select_speaker_data(self, speaker):
+        indices = np.random.choice(self.test_indices_per_speaker[speaker], size=1)
+        return self.kx_test[indices]
+
+    def get_speaker_verification_data(self, positive_speaker, num_different_speakers):
+        all_negative_speakers = list(set(self.speakers_list) - {positive_speaker})
+        assert len(self.speakers_list) - 1 == len(all_negative_speakers)
+        negative_speakers = np.random.choice(all_negative_speakers, size=num_different_speakers, replace=False)
+        assert positive_speaker not in negative_speakers
+        anchor = self._select_speaker_data(positive_speaker)
+        positive = self._select_speaker_data(positive_speaker)
+        data = [anchor, positive]
+        data.extend([self._select_speaker_data(n) for n in negative_speakers])
+        return np.vstack(data)
+
+
+if __name__ == '__main__':
+    np.random.seed(123)
+    ltb = LazyTripletBatcher(working_dir='/Users/premy/deep-speaker/',
+                             max_length=NUM_FRAMES,
+                             model=DeepSpeakerModel())
+    for i in range(1000):
+        print(i)
+        start = time()
+        ltb.get_batch_train(batch_size=9)
+        print(time() - start)
+    # ltb.get_batch(batch_size=96)
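A minimal sketch of the batch layout produced by TripletBatcher.get_batch() above (not part of the uploaded files), assuming kx_*/ky_* arrays generated by KerasFormatConverter and a batch size of 96.

batcher = TripletBatcher(kx_train, ky_train, kx_test, ky_test)
bx, by = batcher.get_batch(batch_size=96)
# bx.shape == (96, 160, 64, 1): rows 0-31 anchor, 32-63 positive (same speaker), 64-95 negative
# by is all zeros; deep_speaker_loss only relies on the anchor/positive/negative ordering of bx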
speaker_recognition/constants.py
ADDED
@@ -0,0 +1,18 @@
+# Constants.
+
+SAMPLE_RATE = 16000  # not higher than that otherwise we may have errors when computing the fbanks.
+
+# Train/Test sets share the same speakers. They contain different utterances.
+# 0.8 means 20% of the utterances of each speaker will be held out and placed in the test set.
+TRAIN_TEST_RATIO = 0.8
+
+CHECKPOINTS_SOFTMAX_DIR = 'checkpoints-softmax'
+
+CHECKPOINTS_TRIPLET_DIR = 'checkpoints-triplets'
+
+BATCH_SIZE = 32 * 3  # has to be a multiple of 3.
+
+# Input to the model will be a 4D image: (batch_size, num_frames, num_fbanks, 3)
+# Where the 3 channels are: FBANK, DIFF(FBANK), DIFF(DIFF(FBANK)).
+NUM_FRAMES = 160  # 1 second ~ 100 frames with default params winlen=0.025, winstep=0.01
+NUM_FBANKS = 64
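A small sketch (not part of the uploaded files) of how NUM_FRAMES relates to audio duration, assuming the python_speech_features defaults quoted in the comment above (winlen=0.025, winstep=0.01): at roughly 100 frames per second, 160 frames cover about 1.6 s of speech.

def approx_num_frames(duration_s, winlen=0.025, winstep=0.01):
    # One frame per winstep after the first full window.
    return 1 + round((duration_s - winlen) / winstep)

print(approx_num_frames(1.615))  # ~160, i.e. NUM_FRAMES above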
speaker_recognition/conv_models.py
ADDED
@@ -0,0 +1,296 @@
+import logging
+import os
+
+import numpy as np
+import tensorflow as tf
+# pylint: disable=E0611,E0401
+import tensorflow.keras.backend as K
+# pylint: disable=E0611,E0401
+from tensorflow.keras import layers, regularizers
+# pylint: disable=E0611,E0401
+from tensorflow.keras.layers import (
+    BatchNormalization,
+    Conv2D,
+    Dense,
+    Dropout,
+    Input,
+    Lambda,
+    Reshape,
+)
+# pylint: disable=E0611,E0401
+from tensorflow.keras.models import Model
+# pylint: disable=E0611,E0401
+from tensorflow.keras.optimizers import Adam
+
+from deep_speaker.constants import NUM_FBANKS, SAMPLE_RATE, NUM_FRAMES
+from deep_speaker.triplet_loss import deep_speaker_loss
+
+logger = logging.getLogger(__name__)
+
+
+@tf.function
+def tf_normalize(data, ndims, eps=0, adjusted=False):
+    data = tf.convert_to_tensor(data, name='data')
+
+    reduce_dims = [-i - 1 for i in range(ndims)]
+    # pylint: disable=E1123,E1120
+    data = tf.cast(data, dtype=tf.dtypes.float32)
+    data_num = tf.reduce_prod(data.shape[-ndims:])
+    data_mean = tf.reduce_mean(data, axis=reduce_dims, keepdims=True)
+
+    # Apply a minimum normalization that protects us against uniform images.
+    stddev = tf.math.reduce_std(data, axis=reduce_dims, keepdims=True)
+    adjusted_stddev = stddev
+    if adjusted:
+        min_stddev = tf.math.rsqrt(tf.cast(data_num, tf.dtypes.float32))
+        eps = tf.maximum(eps, min_stddev)
+    if eps > 0:
+        adjusted_stddev = tf.maximum(adjusted_stddev, eps)
+
+    return (data - data_mean) / adjusted_stddev
+
+
+@tf.function
+def tf_fbank(samples):
+    """
+    Compute Mel-filterbank energy features from an audio signal.
+    See python_speech_features.fbank
+    """
+    frame_length = int(0.025 * SAMPLE_RATE)
+    frame_step = int(0.01 * SAMPLE_RATE)
+    fft_length = 512
+    fft_bins = fft_length // 2 + 1
+
+    pre_emphasis = samples[:, 1:] - 0.97 * samples[:, :-1]
+
+    # Original implementation from python_speech_features
+    # frames = tf.expand_dims(sigproc.framesig(preemphasis[0], frame_length,
+    #                                          frame_step, winfunc=lambda x: np.ones((x,))), 0)
+    # powspec = sigproc.powspec(frames, fft_length)
+
+    # Tensorflow impl #1, using manually-split frames and rfft
+    # spec = tf.abs(tf.signal.rfft(frames, [fft_length]))
+    # powspec = tf.square(spec) / fft_length
+
+    # Tensorflow impl #2, using stft to handle framing automatically
+    # (There is a one-off mismatch on the number of frames on the resulting tensor, but I guess this is ok)
+    spec = tf.abs(tf.signal.stft(pre_emphasis, frame_length, frame_step, fft_length, window_fn=tf.ones))
+    powspec = tf.square(spec) / fft_length
+
+    # Matrix to transform spectrum to mel-frequencies
+
+    # Original implementation from python_speech_features
+    # linear_to_mel_weight_matrix = get_filterbanks(NUM_FBANKS, fft_length,
+    #                                               SAMPLE_RATE, 0, SAMPLE_RATE/2).astype(np.float32).T
+
+    linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
+        num_mel_bins=NUM_FBANKS,
+        num_spectrogram_bins=fft_bins,
+        sample_rate=SAMPLE_RATE,
+        lower_edge_hertz=0,
+        upper_edge_hertz=SAMPLE_RATE / 2,
+    )
+
+    feat = tf.matmul(powspec, linear_to_mel_weight_matrix)
+    # feat = tf.where(feat == 0, np.finfo(np.float32).eps, feat)
+    return feat
+
+
+class DeepSpeakerModel:
+
+    # I thought it was 3 but maybe energy is added at a 4th dimension.
+    # would be better to have 4 dimensions:
+    # MFCC, DIFF(MFCC), DIFF(DIFF(MFCC)), ENERGIES (probably tiled across the frequency domain).
+    # this seems to help match the parameter counts.
+    def __init__(
+            self,
+            batch_input_shape=(None, NUM_FRAMES, NUM_FBANKS, 1),
+            include_softmax=False,
+            num_speakers_softmax=None,
+            pcm_input=False
+    ):
+        if pcm_input:
+            batch_input_shape = None
+        self.include_softmax = include_softmax
+        if self.include_softmax:
+            assert num_speakers_softmax > 0
+        self.clipped_relu_count = 0
+
+        # http://cs231n.github.io/convolutional-networks/
+        # conv weights
+        # #params = ks * ks * nb_filters * num_channels_input
+
+        # Conv128-s
+        # 5*5*128*128/2+128
+        # ks*ks*nb_filters*channels/strides+bias(=nb_filters)
+
+        # take 100 ms -> 4 frames.
+        # if signal is 3 seconds, then take 100ms per 100ms and average out this network.
+        # 8*8 = 64 features.
+
+        # used to share all the layers across the inputs
+
+        # num_frames = K.shape() - do it dynamically after.
+
+        if pcm_input:
+            batch_input_shape = batch_input_shape or (None, None)  # Batch-size, num-samples
+            inputs = Input(batch_shape=batch_input_shape, name='raw_inputs')
+            x = inputs
+            x = Lambda(tf_fbank)(x)
+            x = Lambda(lambda x_: tf_normalize(x_, 1, 1e-12))(x)
+            x = Lambda(lambda x_: tf.expand_dims(x_, axis=-1))(x)
+        else:
+            batch_input_shape = batch_input_shape or (None, None, NUM_FBANKS, 1)
+            inputs = Input(batch_shape=batch_input_shape, name='input')
+            x = inputs
+
+        x = self.cnn_component(x)
+
+        x = Reshape((-1, 2048))(x)
+        # Temporal average layer. axis=1 is time.
+        x = Lambda(lambda y: K.mean(y, axis=1), name='average')(x)
+        if include_softmax:
+            logger.info('Including a Dropout layer to reduce overfitting.')
+            # used for softmax because the dataset we pre-train on might be too small. easy to overfit.
+            x = Dropout(0.5)(x)
+        x = Dense(512, name='affine')(x)
+        if include_softmax:
+            # Those weights are just when we train on softmax.
+            x = Dense(num_speakers_softmax, activation='softmax')(x)
+        else:
+            # Does not contain any weights.
+            x = Lambda(lambda y: K.l2_normalize(y, axis=1), name='ln')(x)
+        self.m = Model(inputs, x, name='ResCNN')
+
+    def keras_model(self):
+        return self.m
+
+    def get_weights(self):
+        w = self.m.get_weights()
+        if self.include_softmax:
+            w.pop()  # last 2 are the W_softmax and b_softmax.
+            w.pop()
+        return w
+
+    def clipped_relu(self, inputs):
+        relu = Lambda(lambda y: K.minimum(K.maximum(y, 0), 20), name=f'clipped_relu_{self.clipped_relu_count}')(inputs)
+        self.clipped_relu_count += 1
+        return relu
+
+    def identity_block(self, input_tensor, kernel_size, filters, stage, block):
+        conv_name_base = f'res{stage}_{block}_branch'
+
+        x = Conv2D(filters,
+                   kernel_size=kernel_size,
+                   strides=1,
+                   activation=None,
+                   padding='same',
+                   kernel_initializer='glorot_uniform',
+                   kernel_regularizer=regularizers.l2(l=0.0001),
+                   name=conv_name_base + '_2a')(input_tensor)
+        x = BatchNormalization(name=conv_name_base + '_2a_bn')(x)
+        x = self.clipped_relu(x)
+
+        x = Conv2D(
+            filters,
+            kernel_size=kernel_size,
+            strides=1,
+            activation=None,
+            padding='same',
+            kernel_initializer='glorot_uniform',
+            kernel_regularizer=regularizers.l2(l=0.0001),
+            name=conv_name_base + '_2b',
+        )(x)
+        x = BatchNormalization(name=conv_name_base + '_2b_bn')(x)
+
+        x = self.clipped_relu(x)
+
+        x = layers.add([x, input_tensor])
+        x = self.clipped_relu(x)
+        return x
+
+    def conv_and_res_block(self, inp, filters, stage):
+        conv_name = 'conv{}-s'.format(filters)
+        # TODO: why kernel_regularizer?
+        o = Conv2D(filters,
+                   kernel_size=5,
+                   strides=2,
+                   activation=None,
+                   padding='same',
+                   kernel_initializer='glorot_uniform',
+                   kernel_regularizer=regularizers.l2(l=0.0001), name=conv_name)(inp)
+        o = BatchNormalization(name=conv_name + '_bn')(o)
+        o = self.clipped_relu(o)
+        for i in range(3):
+            o = self.identity_block(o, kernel_size=3, filters=filters, stage=stage, block=i)
+        return o
+
+    def cnn_component(self, inp):
+        x = self.conv_and_res_block(inp, 64, stage=1)
+        x = self.conv_and_res_block(x, 128, stage=2)
+        x = self.conv_and_res_block(x, 256, stage=3)
+        x = self.conv_and_res_block(x, 512, stage=4)
+        return x
+
+    def set_weights(self, w):
+        for layer, layer_w in zip(self.m.layers, w):
+            layer.set_weights(layer_w)
+            logger.info(f'Setting weights for [{layer.name}]...')
+
+
+def main():
+    # Looks correct to me.
+    # I have 37K but paper reports 41K. which is not too far.
+    dsm = DeepSpeakerModel()
+    dsm.m.summary()
+
+    # I suspect num frames to be 32.
+    # Then fbank=64, then total would be 32*64 = 2048.
+    # plot_model(dsm.m, to_file='model.png', dpi=300, show_shapes=True, expand_nested=True)
+
+
+def _train():
+    # x = np.random.uniform(size=(6, 32, 64, 4))  # 6 is multiple of 3.
+    # y_softmax = np.random.uniform(size=(6, 100))
+    # dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=True, num_speakers_softmax=100)
+    # dsm.m.compile(optimizer=Adam(lr=0.01), loss='categorical_crossentropy')
+    # print(dsm.m.predict(x).shape)
+    # print(dsm.m.evaluate(x, y_softmax))
+    # w = dsm.get_weights()
+    dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=False)
+    # dsm.m.set_weights(w)
+    dsm.m.compile(optimizer=Adam(lr=0.01), loss=deep_speaker_loss)
+
+    # it works!!!!!!!!!!!!!!!!!!!!
+    # unit_batch_size = 20
+    # anchor = np.ones(shape=(unit_batch_size, 32, 64, 4))
+    # positive = np.array(anchor)
+    # negative = np.ones(shape=(unit_batch_size, 32, 64, 4)) * (-1)
+    # batch = np.vstack((anchor, positive, negative))
+    # x = batch
+    # y = np.zeros(shape=(len(batch), 512))  # not important.
+    # print('Starting to fit...')
+    # while True:
+    #     print(dsm.m.train_on_batch(x, y))
+
+    # should not work... and it does not work!
+    unit_batch_size = 20
+    negative = np.ones(shape=(unit_batch_size, 32, 64, 4)) * (-1)
+    batch = np.vstack((negative, negative, negative))
+    x = batch
+    y = np.zeros(shape=(len(batch), 512))  # not important.
+    print('Starting to fit...')
+    while True:
+        print(dsm.m.train_on_batch(x, y))
+
+
+def _test_checkpoint_compatibility():
+    dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=True, num_speakers_softmax=10)
+    dsm.m.save_weights('test.h5')
+    dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=False)
+    dsm.m.load_weights('test.h5', by_name=True)
+    os.remove('test.h5')
+
+
+if __name__ == '__main__':
+    _test_checkpoint_compatibility()
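A minimal sanity-check sketch (not part of the uploaded files) for the default configuration above (filter-bank input, no softmax head), assuming the flat import layout used by speaker_recognition/app.py.

import numpy as np
from conv_models import DeepSpeakerModel
from constants import NUM_FRAMES, NUM_FBANKS

model = DeepSpeakerModel()  # input (None, 160, 64, 1), output: 512-d embedding
x = np.random.uniform(size=(1, NUM_FRAMES, NUM_FBANKS, 1)).astype(np.float32)
emb = model.m.predict(x)
assert emb.shape == (1, 512)
assert np.isclose(np.linalg.norm(emb), 1.0, atol=1e-3)  # final Lambda layer L2-normalizes the embedding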
speaker_recognition/eval_metrics.py
ADDED
@@ -0,0 +1,84 @@
+import numpy as np
+
+
+def evaluate(sims, labels):
+    # Calculate evaluation metrics
+    thresholds = np.arange(0, 1.0, 0.001)
+    fm, tpr, acc = calculate_roc(thresholds, sims, labels)
+    eer = calculate_eer(thresholds, sims, labels)
+    return fm, tpr, acc, eer
+
+
+def calculate_roc(thresholds, sims, labels):
+    nrof_thresholds = len(thresholds)
+
+    tprs = np.zeros((nrof_thresholds))
+    fprs = np.zeros((nrof_thresholds))
+    acc_train = np.zeros((nrof_thresholds))
+    precisions = np.zeros((nrof_thresholds))
+    fms = np.zeros((nrof_thresholds))
+
+    # Find the best threshold for the fold
+
+    for threshold_idx, threshold in enumerate(thresholds):
+        tprs[threshold_idx], fprs[threshold_idx], precisions[threshold_idx], fms[threshold_idx], acc_train[
+            threshold_idx] = calculate_accuracy(threshold, sims, labels)
+
+    bestindex = np.argmax(fms)
+    bestfm = fms[bestindex]
+    besttpr = tprs[bestindex]
+    bestacc = acc_train[bestindex]
+
+    return bestfm, besttpr, bestacc
+
+
+def calculate_accuracy(threshold, sims, actual_issame):
+    predict_issame = np.greater(sims, threshold)
+    tp = np.sum(np.logical_and(predict_issame, actual_issame))
+    fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
+    tn = np.sum(np.logical_and(np.logical_not(predict_issame), np.logical_not(actual_issame)))
+    fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame))
+
+    tpr = 0 if (tp + fn == 0) else float(tp) / float(tp + fn)  # recall
+    fpr = 0 if (fp + tn == 0) else float(fp) / float(fp + tn)
+    precision = 0 if (tp + fp == 0) else float(tp) / float(tp + fp)
+    fm = 2 * precision * tpr / (precision + tpr + 1e-12)
+    acc = float(tp + tn) / (sims.size + 1e-12)
+    return tpr, fpr, precision, fm, acc
+
+
+def calculate_eer(thresholds, sims, labels):
+    nrof_thresholds = len(thresholds)
+
+    # Find the threshold that gives FAR = far_target
+    far_train = np.zeros(nrof_thresholds)
+    frr_train = np.zeros(nrof_thresholds)
+    eer_index = 0
+    eer_diff = 100000000
+    for threshold_idx, threshold in enumerate(thresholds):
+        frr_train[threshold_idx], far_train[threshold_idx] = calculate_val_far(threshold, sims, labels)
+        if abs(frr_train[threshold_idx] - far_train[threshold_idx]) < eer_diff:
+            eer_diff = abs(frr_train[threshold_idx] - far_train[threshold_idx])
+            eer_index = threshold_idx
+
+    frr, far = frr_train[eer_index], far_train[eer_index]
+
+    eer = (frr + far) / 2
+
+    return eer
+
+
+def calculate_val_far(threshold, sims, actual_issame):
+    predict_issame = np.greater(sims, threshold)
+    true_accept = np.sum(np.logical_and(predict_issame, actual_issame))
+    false_accept = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
+    n_same = np.sum(actual_issame)
+    n_diff = np.sum(np.logical_not(actual_issame))
+    if n_diff == 0:
+        n_diff = 1
+    if n_same == 0:
+        return 0, 0
+    val = float(true_accept) / float(n_same)
+    frr = 1 - val
+    far = float(false_accept) / float(n_diff)
+    return frr, far
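A toy sketch (not part of the uploaded files) of evaluate() above on two verification trials where the positive score (column 0) is always highest; perfect separation yields an f-measure of 1.0 and an EER of 0.0.

import numpy as np
from eval_metrics import evaluate

sims = np.array([[0.9, 0.2, 0.1],
                 [0.8, 0.4, 0.3]])  # one row per trial: positive first, then negatives
labels = np.zeros_like(sims)
labels[:, 0] = 1.0
fm, tpr, acc, eer = evaluate(sims, labels)
print(fm, tpr, acc, eer)             # 1.0, 1.0, ~1.0, 0.0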
speaker_recognition/test.py
ADDED
@@ -0,0 +1,69 @@
import logging

import numpy as np
from tqdm import tqdm

from deep_speaker.audio import Audio
from deep_speaker.batcher import LazyTripletBatcher
from deep_speaker.constants import NUM_FBANKS, NUM_FRAMES, CHECKPOINTS_TRIPLET_DIR, BATCH_SIZE
from deep_speaker.conv_models import DeepSpeakerModel
from deep_speaker.eval_metrics import evaluate
from deep_speaker.utils import load_best_checkpoint, enable_deterministic

logger = logging.getLogger(__name__)


def batch_cosine_similarity(x1, x2):
    # https://en.wikipedia.org/wiki/Cosine_similarity
    # 1 = same direction ; -1 = opposite direction
    mul = np.multiply(x1, x2)
    s = np.sum(mul, axis=1)

    # l1 = np.sum(np.multiply(x1, x1), axis=1)
    # l2 = np.sum(np.multiply(x2, x2), axis=1)
    # As the embeddings are L2-normalized (length 1), dividing by the norms is unnecessary.
    return s


def eval_model(working_dir: str, model: DeepSpeakerModel):
    enable_deterministic()
    audio = Audio(working_dir)
    batcher = LazyTripletBatcher(working_dir, NUM_FRAMES, model)
    speakers_list = list(audio.speakers_to_utterances.keys())
    num_negative_speakers = 99
    num_speakers = len(speakers_list)
    y_pred = np.zeros(shape=(num_speakers, num_negative_speakers + 1))  # negatives + positive
    for i, positive_speaker in tqdm(enumerate(speakers_list), desc='test', total=num_speakers):
        # Convention: index 0 is the anchor speaker, index 1 the positive, indices 2+ the negatives.
        input_data = batcher.get_speaker_verification_data(positive_speaker, num_negative_speakers)
        # The batch size is not relevant here; it only keeps us from pushing too much onto the GPU.
        predictions = model.m.predict(input_data, batch_size=BATCH_SIZE)
        anchor_embedding = predictions[0]
        for j, other_than_anchor_embedding in enumerate(predictions[1:]):  # positive + negatives
            y_pred[i][j] = batch_cosine_similarity([anchor_embedding], [other_than_anchor_embedding])[0]
        # y_pred[i] = softmax(y_pred[i])  # a softmax could be applied here.
    y_true = np.zeros_like(y_pred)  # the positive is at index 0.
    y_true[:, 0] = 1.0
    print(np.matrix(y_true))
    print(np.matrix(y_pred))
    print(np.min(y_pred), np.max(y_pred))
    fm, tpr, acc, eer = evaluate(y_pred, y_true)
    return fm, tpr, acc, eer


def test(working_dir, checkpoint_file=None):
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    dsm = DeepSpeakerModel(batch_input_shape)
    if checkpoint_file is None:
        checkpoint_file = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR)
    if checkpoint_file is not None:
        logger.info(f'Found checkpoint [{checkpoint_file}]. Loading weights...')
        dsm.m.load_weights(checkpoint_file, by_name=True)
    else:
        logger.info(f'Could not find any checkpoint in {CHECKPOINTS_TRIPLET_DIR}.')
        exit(1)

    fm, tpr, acc, eer = eval_model(working_dir, model=dsm)
    logger.info(f'f-measure = {fm:.3f}, true positive rate = {tpr:.3f}, '
                f'accuracy = {acc:.3f}, equal error rate = {eer:.3f}')
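As a quick illustration of the scoring convention above, a minimal sketch assuming batch_cosine_similarity from this file is in scope; the embedding values are made up stand-ins for the model's L2-normalized outputs, so the dot product is already the cosine similarity:

import numpy as np

# Hypothetical 3-dimensional unit-length embeddings standing in for the real 512-dim outputs.
anchor = np.array([[0.6, 0.8, 0.0]])
positive = np.array([[0.6, 0.79, 0.12]])
negative = np.array([[-0.9, 0.1, 0.42]])

print(batch_cosine_similarity(anchor, positive))  # ~0.99 -> likely the same speaker
print(batch_cosine_similarity(anchor, negative))  # ~-0.46 -> likely a different speaker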
speaker_recognition/train.py
ADDED
@@ -0,0 +1,111 @@
import logging
import os

# pylint: disable=E0611,E0401
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
# pylint: disable=E0611,E0401
from tensorflow.keras.optimizers import SGD
from tqdm import tqdm

from deep_speaker.batcher import KerasFormatConverter, LazyTripletBatcher
from deep_speaker.constants import BATCH_SIZE, CHECKPOINTS_SOFTMAX_DIR, CHECKPOINTS_TRIPLET_DIR, NUM_FRAMES, NUM_FBANKS
from deep_speaker.conv_models import DeepSpeakerModel
from deep_speaker.triplet_loss import deep_speaker_loss
from deep_speaker.utils import load_best_checkpoint, ensures_dir

logger = logging.getLogger(__name__)

# Otherwise it's just too much logging from Tensorflow...
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


def fit_model(dsm: DeepSpeakerModel, working_dir: str, max_length: int = NUM_FRAMES, batch_size=BATCH_SIZE):
    batcher = LazyTripletBatcher(working_dir, max_length, dsm)

    # build small test set.
    test_batches = []
    for _ in tqdm(range(200), desc='Build test set'):
        test_batches.append(batcher.get_batch_test(batch_size))

    def test_generator():
        while True:
            for bb in test_batches:
                yield bb

    def train_generator():
        while True:
            yield batcher.get_random_batch(batch_size, is_test=False)

    checkpoint_name = dsm.m.name + '_checkpoint'
    checkpoint_filename = os.path.join(CHECKPOINTS_TRIPLET_DIR, checkpoint_name + '_{epoch}.h5')
    checkpoint = ModelCheckpoint(monitor='val_loss', filepath=checkpoint_filename, save_best_only=True)
    dsm.m.fit(x=train_generator(), y=None, steps_per_epoch=2000, shuffle=False,
              epochs=1000, validation_data=test_generator(), validation_steps=len(test_batches),
              callbacks=[checkpoint])


def fit_model_softmax(dsm: DeepSpeakerModel, kx_train, ky_train, kx_test, ky_test,
                      batch_size=BATCH_SIZE, max_epochs=1000, initial_epoch=0):
    checkpoint_name = dsm.m.name + '_checkpoint'
    checkpoint_filename = os.path.join(CHECKPOINTS_SOFTMAX_DIR, checkpoint_name + '_{epoch}.h5')
    checkpoint = ModelCheckpoint(monitor='val_accuracy', filepath=checkpoint_filename, save_best_only=True)

    # if the accuracy does not increase by 0.1% over 20 epochs, we stop the training.
    early_stopping = EarlyStopping(monitor='val_accuracy', min_delta=0.001, patience=20, verbose=1, mode='max')

    # if the accuracy does not increase over 10 epochs, we reduce the learning rate by half.
    reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=10, min_lr=0.0001, verbose=1)

    # truncate both sets so their lengths are multiples of the batch size.
    max_len_train = len(kx_train) - len(kx_train) % batch_size
    kx_train = kx_train[0:max_len_train]
    ky_train = ky_train[0:max_len_train]
    max_len_test = len(kx_test) - len(kx_test) % batch_size
    kx_test = kx_test[0:max_len_test]
    ky_test = ky_test[0:max_len_test]

    dsm.m.fit(x=kx_train,
              y=ky_train,
              batch_size=batch_size,
              epochs=initial_epoch + max_epochs,
              initial_epoch=initial_epoch,
              verbose=1,
              shuffle=True,
              validation_data=(kx_test, ky_test),
              callbacks=[early_stopping, reduce_lr, checkpoint])


def start_training(working_dir, pre_training_phase=True):
    ensures_dir(CHECKPOINTS_SOFTMAX_DIR)
    ensures_dir(CHECKPOINTS_TRIPLET_DIR)
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    if pre_training_phase:
        logger.info('Softmax pre-training.')
        kc = KerasFormatConverter(working_dir)
        num_speakers_softmax = len(kc.categorical_speakers.speaker_ids)
        dsm = DeepSpeakerModel(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax)
        dsm.m.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
        if pre_training_checkpoint is not None:
            initial_epoch = int(pre_training_checkpoint.split('/')[-1].split('.')[0].split('_')[-1])
            logger.info(f'Initial epoch is {initial_epoch}.')
            logger.info(f'Loading softmax checkpoint: {pre_training_checkpoint}.')
            dsm.m.load_weights(pre_training_checkpoint)  # latest one.
        else:
            initial_epoch = 0
        fit_model_softmax(dsm, kc.kx_train, kc.ky_train, kc.kx_test, kc.ky_test, initial_epoch=initial_epoch)
    else:
        logger.info('Training with the triplet loss.')
        dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False)
        triplet_checkpoint = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR)
        pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
        if triplet_checkpoint is not None:
            logger.info(f'Loading triplet checkpoint: {triplet_checkpoint}.')
            dsm.m.load_weights(triplet_checkpoint)
        elif pre_training_checkpoint is not None:
            logger.info(f'Loading pre-training checkpoint: {pre_training_checkpoint}.')
            # If `by_name` is True, weights are loaded into layers only if they share the
            # same name. This is useful for fine-tuning or transfer-learning models where
            # some of the layers have changed.
            dsm.m.load_weights(pre_training_checkpoint, by_name=True)
        dsm.m.compile(optimizer=SGD(), loss=deep_speaker_loss)
        fit_model(dsm, working_dir, NUM_FRAMES)
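A minimal sketch of how the two training phases could be driven, assuming start_training from this file is in scope; the working-directory path is hypothetical and the directory is assumed to already contain the cached audio/MFCC data and Keras-format inputs produced by the batcher:

# Hypothetical driver script.
working_dir = '/path/to/working_dir'

# Phase 1: softmax pre-training to get a reasonable initialization.
start_training(working_dir, pre_training_phase=True)

# Phase 2: fine-tune the embedding network with the triplet loss,
# resuming from the best softmax checkpoint found on disk.
start_training(working_dir, pre_training_phase=False)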
speaker_recognition/triplet_loss.py
ADDED
@@ -0,0 +1,63 @@
# pylint: disable=E0611,E0401
import tensorflow.keras.backend as K

# ALPHA = 0.2  # used in FaceNet https://arxiv.org/pdf/1503.03832.pdf
ALPHA = 0.1  # used in Deep Speaker.


def batch_cosine_similarity(x1, x2):
    # https://en.wikipedia.org/wiki/Cosine_similarity
    # 1 = same direction ; -1 = opposite direction
    dot = K.squeeze(K.batch_dot(x1, x2, axes=1), axis=1)
    # As the embeddings have length 1, dividing by the norms (which are 1) is unnecessary.
    return dot


def deep_speaker_loss(y_true, y_pred, alpha=ALPHA):
    # y_true is not used. We respect this convention:
    # y_true.shape = (batch_size, embedding_size) [not used]
    # y_pred.shape = (batch_size, embedding_size)
    # EXAMPLE:
    # _____________________________________________________
    # ANCHOR 1 (512,)
    # ANCHOR 2 (512,)
    # POS EX 1 (512,)
    # POS EX 2 (512,)
    # NEG EX 1 (512,)
    # NEG EX 2 (512,)
    # _____________________________________________________
    split = K.shape(y_pred)[0] // 3

    anchor = y_pred[0:split]
    positive_ex = y_pred[split:2 * split]
    negative_ex = y_pred[2 * split:]

    # If the loss does not decrease below ALPHA, the model is not learning anything:
    # if anchor = positive = negative (the model always outputs the same vector),
    # then sap = san = 1 and loss = max(alpha, 0) = alpha.
    # Conversely, if anchor = positive = [1] and negative = [-1],
    # then sap = 1 and san = -1, so loss = max(-1 - 1 + 0.1, 0) = max(-1.9, 0) = 0.
    sap = batch_cosine_similarity(anchor, positive_ex)
    san = batch_cosine_similarity(anchor, negative_ex)
    loss = K.maximum(san - sap + alpha, 0.0)
    total_loss = K.mean(loss)
    return total_loss


if __name__ == '__main__':
    import numpy as np

    print(deep_speaker_loss(alpha=0.1, y_true=0, y_pred=np.array([[0.9], [1.0], [-1.0]])))
    print(deep_speaker_loss(alpha=1, y_true=0, y_pred=np.array([[0.9], [1.0], [-1.0]])))
    print(deep_speaker_loss(alpha=2, y_true=0, y_pred=np.array([[0.9], [1.0], [-1.0]])))
    print('--------------')
    print(deep_speaker_loss(alpha=2, y_true=0, y_pred=np.array([[0.6], [1.0], [0.0]])))
    print(deep_speaker_loss(alpha=1, y_true=0, y_pred=np.array([[0.6], [1.0], [0.0]])))
    print(deep_speaker_loss(alpha=0.1, y_true=0, y_pred=np.array([[0.6], [1.0], [0.0]])))
    print(deep_speaker_loss(alpha=0.2, y_true=0, y_pred=np.array([[0.6], [1.0], [0.0]])))

    print('--------------')
    print(deep_speaker_loss(alpha=2, y_true=0, y_pred=np.array([[0.9], [1.0], [-1.0]])))
    print(deep_speaker_loss(alpha=1, y_true=0, y_pred=np.array([[0.9], [1.0], [-1.0]])))
    print(deep_speaker_loss(alpha=0.1, y_true=0, y_pred=np.array([[0.9], [1.0], [-1.0]])))
    print(deep_speaker_loss(alpha=0.2, y_true=0, y_pred=np.array([[0.9], [1.0], [-1.0]])))
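To make the margin behaviour concrete, here is a small plain-NumPy check of the same formula, loss = mean(max(cos(a, n) - cos(a, p) + alpha, 0)), using the first triplet from the __main__ block above (anchor = 0.9, positive = 1.0, negative = -1.0):

import numpy as np

anchor, positive, negative = 0.9, 1.0, -1.0
sap = anchor * positive  # 0.9, anchor-positive similarity
san = anchor * negative  # -0.9, anchor-negative similarity

for alpha in (0.1, 1.0, 2.0):
    print(alpha, max(san - sap + alpha, 0.0))
# 0.1 -> 0.0, 1.0 -> 0.0, 2.0 -> 0.2: the loss only becomes positive once the
# margin alpha exceeds the gap between the positive and negative similarities.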
speaker_recognition/utils.py
ADDED
@@ -0,0 +1,120 @@
import logging
import os
import random
import shutil
from glob import glob

import click
import dill
import numpy as np
import pandas as pd
from natsort import natsorted

from deep_speaker.constants import TRAIN_TEST_RATIO

logger = logging.getLogger(__name__)


def find_files(directory, ext='wav'):
    return sorted(glob(directory + f'/**/*.{ext}', recursive=True))


def init_pandas():
    pd.set_option('display.float_format', lambda x: '%.3f' % x)
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)


def create_new_empty_dir(directory: str):
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)


def ensure_dir_for_filename(filename: str):
    ensures_dir(os.path.dirname(filename))


def ensures_dir(directory: str):
    if len(directory) > 0 and not os.path.exists(directory):
        os.makedirs(directory)


class ClickType:

    @staticmethod
    def input_file(writable=False):
        return click.Path(exists=True, file_okay=True, dir_okay=False,
                          writable=writable, readable=True, resolve_path=True)

    @staticmethod
    def input_dir(writable=False):
        return click.Path(exists=True, file_okay=False, dir_okay=True,
                          writable=writable, readable=True, resolve_path=True)

    @staticmethod
    def output_file():
        return click.Path(exists=False, file_okay=True, dir_okay=False,
                          writable=True, readable=True, resolve_path=True)

    @staticmethod
    def output_dir():
        return click.Path(exists=False, file_okay=False, dir_okay=True,
                          writable=True, readable=True, resolve_path=True)


def parallel_function(f, sequence, num_threads=None):
    from multiprocessing import Pool
    pool = Pool(processes=num_threads)
    result = pool.map(f, sequence)
    cleaned = [x for x in result if x is not None]
    pool.close()
    pool.join()
    return cleaned


def load_best_checkpoint(checkpoint_dir):
    checkpoints = natsorted(glob(os.path.join(checkpoint_dir, '*.h5')))
    if len(checkpoints) != 0:
        return checkpoints[-1]
    return None


def delete_older_checkpoints(checkpoint_dir, max_to_keep=5):
    assert max_to_keep > 0
    checkpoints = natsorted(glob(os.path.join(checkpoint_dir, '*.h5')))
    checkpoints_to_keep = checkpoints[-max_to_keep:]
    for checkpoint in checkpoints:
        if checkpoint not in checkpoints_to_keep:
            os.remove(checkpoint)


def enable_deterministic():
    print('Deterministic mode enabled.')
    np.random.seed(123)
    random.seed(123)


def load_pickle(file):
    if not os.path.exists(file):
        return None
    logger.info(f'Loading PKL file: {file}.')
    with open(file, 'rb') as r:
        return dill.load(r)


def load_npy(file):
    if not os.path.exists(file):
        return None
    logger.info(f'Loading NPY file: {file}.')
    return np.load(file)


def train_test_sp_to_utt(audio, is_test):
    sp_to_utt = {}
    for speaker_id, utterances in audio.speakers_to_utterances.items():
        utterances_files = sorted(utterances.values())
        train_test_sep = int(len(utterances_files) * TRAIN_TEST_RATIO)
        sp_to_utt[speaker_id] = utterances_files[train_test_sep:] if is_test else utterances_files[:train_test_sep]
    return sp_to_utt
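A minimal sketch of how the checkpoint helpers above could be combined for housekeeping during a training run, assuming they are in scope; the directory name is hypothetical:

# Hypothetical housekeeping step after each training epoch.
checkpoint_dir = 'checkpoints-triplets'

# Keep only the five most recent .h5 files, then resume from the newest one, if any.
delete_older_checkpoints(checkpoint_dir, max_to_keep=5)
best = load_best_checkpoint(checkpoint_dir)
if best is not None:
    print(f'Would resume from {best}')
else:
    print('No checkpoint found; starting from scratch.')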