import logging
import os
import numpy as np
import tensorflow as tf
# pylint: disable=E0611,E0401
import tensorflow.keras.backend as K
from tensorflow.keras import layers, regularizers
from tensorflow.keras.layers import (
    BatchNormalization,
    Conv2D,
    Dense,
    Dropout,
    Input,
    Lambda,
    Reshape,
)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
# pylint: enable=E0611,E0401
from deep_speaker.constants import NUM_FBANKS, SAMPLE_RATE, NUM_FRAMES
from deep_speaker.triplet_loss import deep_speaker_loss
logger = logging.getLogger(__name__)
@tf.function
def tf_normalize(data, ndims, eps=0, adjusted=False):
    """
    Standardize `data` to zero mean and unit standard deviation over its last
    `ndims` dimensions, similar to tf.image.per_image_standardization.
    """
    data = tf.convert_to_tensor(data, name='data')
reduce_dims = [-i - 1 for i in range(ndims)]
# pylint: disable=E1123,E1120
data = tf.cast(data, dtype=tf.dtypes.float32)
    data_num = tf.reduce_prod(tf.shape(data)[-ndims:])  # dynamic shape: works with unknown (None) dims
data_mean = tf.reduce_mean(data, axis=reduce_dims, keepdims=True)
    # Apply a minimum normalization that protects us against uniform inputs.
stddev = tf.math.reduce_std(data, axis=reduce_dims, keepdims=True)
adjusted_stddev = stddev
if adjusted:
min_stddev = tf.math.rsqrt(tf.cast(data_num, tf.dtypes.float32))
eps = tf.maximum(eps, min_stddev)
if eps > 0:
adjusted_stddev = tf.maximum(adjusted_stddev, eps)
return (data - data_mean) / adjusted_stddev
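# A minimal usage sketch (editor's addition, not part of the original
# pipeline): per-frame standardization of a batch of filterbank features.
def _example_normalize():
    feats = tf.random.uniform((2, 100, NUM_FBANKS))  # (batch, frames, fbanks)
    normed = tf_normalize(feats, ndims=1, eps=1e-12)
    # Each frame now has ~zero mean and ~unit stddev across its fbank bins.
    print(normed.shape)  # (2, 100, NUM_FBANKS)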
@tf.function
def tf_fbank(samples):
"""
Compute Mel-filterbank energy features from an audio signal.
See python_speech_features.fbank
"""
frame_length = int(0.025 * SAMPLE_RATE)
frame_step = int(0.01 * SAMPLE_RATE)
fft_length = 512
fft_bins = fft_length // 2 + 1
    pre_emphasis = samples[:, 1:] - 0.97 * samples[:, :-1]  # pre-emphasis filter, coefficient 0.97
# Original implementation from python_speech_features
# frames = tf.expand_dims(sigproc.framesig(preemphasis[0], frame_length,
# frame_step, winfunc=lambda x: np.ones((x,))), 0)
# powspec = sigproc.powspec(frames, fft_length)
# Tensorflow impl #1, using manually-split frames and rfft
# spec = tf.abs(tf.signal.rfft(frames, [fft_length]))
# powspec = tf.square(spec) / fft_length
# Tensorflow impl #2, using stft to handle framing automatically
    # (There is an off-by-one mismatch in the number of frames of the resulting tensor, but I guess this is ok.)
spec = tf.abs(tf.signal.stft(pre_emphasis, frame_length, frame_step, fft_length, window_fn=tf.ones))
powspec = tf.square(spec) / fft_length
# Matrix to transform spectrum to mel-frequencies
# Original implementation from python_speech_features
# linear_to_mel_weight_matrix = get_filterbanks(NUM_FBANKS, fft_length,
# SAMPLE_RATE, 0, SAMPLE_RATE/2).astype(np.float32).T
linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
num_mel_bins=NUM_FBANKS,
num_spectrogram_bins=fft_bins,
sample_rate=SAMPLE_RATE,
lower_edge_hertz=0,
upper_edge_hertz=SAMPLE_RATE / 2,
)
feat = tf.matmul(powspec, linear_to_mel_weight_matrix)
# feat = tf.where(feat == 0, np.finfo(np.float32).eps, feat)
return feat
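# A minimal usage sketch (editor's addition): fbank features from one second
# of random PCM. With a 10 ms frame step, roughly 100 frames come out, each
# holding NUM_FBANKS filterbank energies.
def _example_fbank():
    pcm = tf.random.uniform((1, SAMPLE_RATE), minval=-1.0, maxval=1.0)
    feats = tf_fbank(pcm)
    print(feats.shape)  # about (1, 98, NUM_FBANKS)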
class DeepSpeakerModel:
    # I thought the input had 3 channels, but maybe energy is added as a 4th.
    # It would be better to have 4 channels:
    # MFCC, DIFF(MFCC), DIFF(DIFF(MFCC)), ENERGIES (probably tiled across the frequency domain).
    # That seems to help match the parameter counts.
def __init__(
self,
batch_input_shape=(None, NUM_FRAMES, NUM_FBANKS, 1),
include_softmax=False,
num_speakers_softmax=None,
pcm_input=False
):
        if pcm_input:
            batch_input_shape = None  # the raw-PCM default shape is set below
self.include_softmax = include_softmax
if self.include_softmax:
assert num_speakers_softmax > 0
self.clipped_relu_count = 0
        # http://cs231n.github.io/convolutional-networks/
        # conv weights:
        # #params = ks * ks * num_channels_input * nb_filters + bias (= nb_filters)
        # e.g. Conv128-s with 128 input channels: 5*5*128*128 + 128
        # (strides change the output size, not the parameter count).
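        # Worked example (editor's note): the first conv64-s layer sees 1 input
        # channel, so it has 5*5*1*64 + 64 = 1,664 parameters; each 3x3
        # identity-block conv at 64 filters has 3*3*64*64 + 64 = 36,928.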
        # Take 100 ms -> 4 frames.
        # If the signal is 3 seconds long, run the network on successive 100 ms
        # windows and average out the outputs.
        # 8 * 8 = 64 features.
        # Used to share all the layers across the inputs.
        # num_frames = K.shape(...) - do it dynamically afterwards.
if pcm_input:
batch_input_shape = batch_input_shape or (None, None) # Batch-size, num-samples
inputs = Input(batch_shape=batch_input_shape, name='raw_inputs')
x = inputs
x = Lambda(tf_fbank)(x)
x = Lambda(lambda x_: tf_normalize(x_, 1, 1e-12))(x)
x = Lambda(lambda x_: tf.expand_dims(x_, axis=-1))(x)
else:
batch_input_shape = batch_input_shape or (None, None, NUM_FBANKS, 1)
inputs = Input(batch_shape=batch_input_shape, name='input')
x = inputs
x = self.cnn_component(x)
        # Four stride-2 conv stages reduce NUM_FBANKS=64 to 4 in frequency; with
        # 512 channels, each time step flattens to 4 * 512 = 2048 features.
        x = Reshape((-1, 2048))(x)
# Temporal average layer. axis=1 is time.
x = Lambda(lambda y: K.mean(y, axis=1), name='average')(x)
if include_softmax:
logger.info('Including a Dropout layer to reduce overfitting.')
            # Used with softmax because the dataset we pre-train on might be too small; it is easy to overfit.
x = Dropout(0.5)(x)
x = Dense(512, name='affine')(x)
if include_softmax:
            # These weights are only used when we train with the softmax head.
x = Dense(num_speakers_softmax, activation='softmax')(x)
else:
# Does not contain any weights.
x = Lambda(lambda y: K.l2_normalize(y, axis=1), name='ln')(x)
self.m = Model(inputs, x, name='ResCNN')
def keras_model(self):
return self.m
    def get_weights(self):
        w = self.m.get_weights()
        if self.include_softmax:
            # The last two entries are W_softmax and b_softmax; drop them.
            w.pop()
            w.pop()
        return w
    def clipped_relu(self, inputs):
        # Clipped ReLU from the Deep Speaker paper: min(max(x, 0), 20).
        relu = Lambda(lambda y: K.minimum(K.maximum(y, 0), 20),
                      name=f'clipped_relu_{self.clipped_relu_count}')(inputs)
        self.clipped_relu_count += 1
        return relu
    def identity_block(self, input_tensor, kernel_size, filters, stage, block):
        """Residual block: two convolutions plus an identity shortcut."""
        conv_name_base = f'res{stage}_{block}_branch'
        x = Conv2D(
            filters,
            kernel_size=kernel_size,
            strides=1,
            activation=None,
            padding='same',
            kernel_initializer='glorot_uniform',
            kernel_regularizer=regularizers.l2(l=0.0001),
            name=conv_name_base + '_2a',
        )(input_tensor)
x = BatchNormalization(name=conv_name_base + '_2a_bn')(x)
x = self.clipped_relu(x)
x = Conv2D(
filters,
kernel_size=kernel_size,
strides=1,
activation=None,
padding='same',
kernel_initializer='glorot_uniform',
kernel_regularizer=regularizers.l2(l=0.0001),
name=conv_name_base + '_2b',
)(x)
x = BatchNormalization(name=conv_name_base + '_2b_bn')(x)
x = self.clipped_relu(x)
x = layers.add([x, input_tensor])
x = self.clipped_relu(x)
return x
    def conv_and_res_block(self, inp, filters, stage):
        conv_name = f'conv{filters}-s'
        # TODO: why kernel_regularizer?
        o = Conv2D(
            filters,
            kernel_size=5,
            strides=2,
            activation=None,
            padding='same',
            kernel_initializer='glorot_uniform',
            kernel_regularizer=regularizers.l2(l=0.0001),
            name=conv_name,
        )(inp)
o = BatchNormalization(name=conv_name + '_bn')(o)
o = self.clipped_relu(o)
for i in range(3):
o = self.identity_block(o, kernel_size=3, filters=filters, stage=stage, block=i)
return o
def cnn_component(self, inp):
x = self.conv_and_res_block(inp, 64, stage=1)
x = self.conv_and_res_block(x, 128, stage=2)
x = self.conv_and_res_block(x, 256, stage=3)
x = self.conv_and_res_block(x, 512, stage=4)
return x
    def set_weights(self, w):
        # `w` is the flat array list from get_weights(); hand each layer the
        # number of arrays it owns (e.g. Conv2D: 2, BatchNormalization: 4).
        i = 0
        for layer in (l for l in self.m.layers if l.weights):
            logger.info(f'Setting weights for [{layer.name}]...')
            layer.set_weights(w[i:i + len(layer.weights)])
            i += len(layer.weights)
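# A usage sketch (editor's addition): build the default embedding model and
# run random filterbank features through it. The output is a single 512-dim,
# L2-normalized speaker embedding.
def _example_embedding():
    dsm = DeepSpeakerModel()
    fbanks = np.random.uniform(size=(1, NUM_FRAMES, NUM_FBANKS, 1)).astype(np.float32)
    print(dsm.m.predict(fbanks).shape)  # (1, 512)
    # With pcm_input=True, the same model instead accepts raw waveforms of
    # shape (batch, num_samples) and computes fbank features inside the graph.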
def main():
    # Looks correct to me.
    # I get 37K but the paper reports 41K, which is not too far off.
    dsm = DeepSpeakerModel()
    dsm.m.summary()
    # I suspect the number of frames to be 32.
    # Then with fbank = 64, the total would be 32 * 64 = 2048.
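    # Shape check (editor's note): four stride-2 stages divide the frequency
    # axis by 16, so 64 fbanks -> 4; with 512 channels that gives the
    # 4 * 512 = 2048 features per time step seen in Reshape((-1, 2048)).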
# plot_model(dsm.m, to_file='model.png', dpi=300, show_shapes=True, expand_nested=True)
def _train():
    # x = np.random.uniform(size=(6, 32, 64, 4))  # 6 is a multiple of 3.
# y_softmax = np.random.uniform(size=(6, 100))
# dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=True, num_speakers_softmax=100)
    # dsm.m.compile(optimizer=Adam(learning_rate=0.01), loss='categorical_crossentropy')
# print(dsm.m.predict(x).shape)
# print(dsm.m.evaluate(x, y_softmax))
# w = dsm.get_weights()
dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=False)
# dsm.m.set_weights(w)
    dsm.m.compile(optimizer=Adam(learning_rate=0.01), loss=deep_speaker_loss)
    # It works!
# unit_batch_size = 20
# anchor = np.ones(shape=(unit_batch_size, 32, 64, 4))
# positive = np.array(anchor)
# negative = np.ones(shape=(unit_batch_size, 32, 64, 4)) * (-1)
# batch = np.vstack((anchor, positive, negative))
# x = batch
# y = np.zeros(shape=(len(batch), 512)) # not important.
# print('Starting to fit...')
# while True:
# print(dsm.m.train_on_batch(x, y))
    # This should not work... and indeed it does not!
unit_batch_size = 20
negative = np.ones(shape=(unit_batch_size, 32, 64, 4)) * (-1)
batch = np.vstack((negative, negative, negative))
x = batch
y = np.zeros(shape=(len(batch), 512)) # not important.
print('Starting to fit...')
while True:
print(dsm.m.train_on_batch(x, y))
def _test_checkpoint_compatibility():
    # Save a softmax-headed model, then reload into an embedding-only model;
    # by_name=True matches the shared layers and skips the softmax head.
dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=True, num_speakers_softmax=10)
dsm.m.save_weights('test.h5')
dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=False)
dsm.m.load_weights('test.h5', by_name=True)
os.remove('test.h5')
if __name__ == '__main__':
_test_checkpoint_compatibility()