Spaces:

TomCallan
/

Big

Runtime error

Big

File size: 10,741 Bytes

aed64b5

import logging
import os

import numpy as np
import tensorflow as tf
# pylint: disable=E0611,E0401
import tensorflow.keras.backend as K
# pylint: disable=E0611,E0401
from tensorflow.keras import layers, regularizers
# pylint: disable=E0611,E0401
from tensorflow.keras.layers import (
    BatchNormalization,
    Conv2D,
    Dense,
    Dropout,
    Input,
    Lambda,
    Reshape,
)
# pylint: disable=E0611,E0401
from tensorflow.keras.models import Model
# pylint: disable=E0611,E0401
from tensorflow.keras.optimizers import Adam

from deep_speaker.constants import NUM_FBANKS, SAMPLE_RATE, NUM_FRAMES
from deep_speaker.triplet_loss import deep_speaker_loss

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


@tf.function
def tf_normalize(data, ndims, eps=0, adjusted=False):
    """Standardize ``data`` over its trailing ``ndims`` axes.

    The mean and standard deviation are reduced jointly over the last
    ``ndims`` axes (with ``keepdims=True``) and the function returns
    ``(data - mean) / stddev``, where the divisor may be floored as
    described below.

    Args:
        data: Tensor-like input; it is cast to ``float32``.
        ndims: Number of trailing axes to normalize over.
        eps: Lower bound for the standard deviation. It is only applied
            when it is greater than zero; with the default of ``0`` (and
            ``adjusted=False``) a zero standard deviation is divided by
            as-is.
        adjusted: When true, the lower bound is raised to at least
            ``1 / sqrt(N)``, where ``N`` is the number of elements in the
            trailing ``ndims`` axes. This protects against uniform input.

    Returns:
        A ``float32`` tensor with the same shape as ``data``.
    """
    data = tf.convert_to_tensor(data, name='data')

    reduce_dims = [-i - 1 for i in range(ndims)]
    # pylint: disable=E1123,E1120
    data = tf.cast(data, dtype=tf.dtypes.float32)
    data_mean = tf.reduce_mean(data, axis=reduce_dims, keepdims=True)

    # Apply a minimum normalization that protects us against uniform images.
    stddev = tf.math.reduce_std(data, axis=reduce_dims, keepdims=True)
    adjusted_stddev = stddev
    if adjusted:
        # The element count is only needed here. It is taken from the dynamic
        # shape because a static shape with unknown trailing dimensions
        # cannot be converted to a tensor.
        data_num = tf.reduce_prod(tf.shape(data)[-ndims:])
        min_stddev = tf.math.rsqrt(tf.cast(data_num, tf.dtypes.float32))
        eps = tf.maximum(eps, min_stddev)
    if eps > 0:
        adjusted_stddev = tf.maximum(adjusted_stddev, eps)

    return (data - data_mean) / adjusted_stddev


@tf.function
def tf_fbank(samples):
    """
    Mel-filterbank energies of raw audio, computed entirely with TensorFlow ops.

    Modelled on python_speech_features.fbank: pre-emphasis (0.97), 25 ms
    frames every 10 ms with a rectangular window, 512-point FFT power
    spectrum, then projection onto NUM_FBANKS mel bands covering
    0 .. SAMPLE_RATE / 2.

    ``samples`` is sliced along its second axis, so it is presumably shaped
    (batch, num_samples) -- confirm against the caller.
    """
    fft_length = 512
    window_size = int(0.025 * SAMPLE_RATE)
    hop_size = int(0.01 * SAMPLE_RATE)

    # First-order pre-emphasis; the result is one sample shorter than the input.
    emphasized = samples[:, 1:] - 0.97 * samples[:, :-1]

    # The STFT handles the framing. Compared with the python_speech_features
    # framing (sigproc.framesig + sigproc.powspec) there is a one-off mismatch
    # in the number of frames, which the original author judged acceptable.
    # window_fn=tf.ones gives the same rectangular window as the reference.
    magnitudes = tf.abs(
        tf.signal.stft(
            emphasized,
            frame_length=window_size,
            frame_step=hop_size,
            fft_length=fft_length,
            window_fn=tf.ones,
        )
    )
    power_spectrum = tf.square(magnitudes) / fft_length

    # Replaces python_speech_features.get_filterbanks(NUM_FBANKS, fft_length,
    # SAMPLE_RATE, 0, SAMPLE_RATE / 2).T from the reference implementation.
    mel_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=NUM_FBANKS,
        num_spectrogram_bins=fft_length // 2 + 1,
        sample_rate=SAMPLE_RATE,
        lower_edge_hertz=0,
        upper_edge_hertz=SAMPLE_RATE / 2,
    )

    # Zero energies are returned as-is; the eps substitution is disabled:
    # feat = tf.where(feat == 0, np.finfo(np.float32).eps, feat)
    return tf.matmul(power_spectrum, mel_matrix)


class DeepSpeakerModel:
    """ResCNN speaker-embedding network built with the Keras functional API.

    The graph is: optional raw-audio front end (filterbanks + normalization)
    -> four strided-convolution / residual stages -> average over time ->
    512-unit affine layer -> either a softmax classifier or an L2-normalized
    embedding. The assembled Keras ``Model`` is stored in ``self.m``.
    """

    # Original author's notes on the input channels:
    # I thought it was 3 but maybe energy is added at a 4th dimension.
    # would be better to have 4 dimensions:
    # MFCC, DIFF(MFCC), DIFF(DIFF(MFCC)), ENERGIES (probably tiled across the frequency domain).
    # this seems to help match the parameter counts.
    def __init__(
            self,
            batch_input_shape=(None, NUM_FRAMES, NUM_FBANKS, 1),
            include_softmax=False,
            num_speakers_softmax=None,
            pcm_input=False
    ):
        """Assemble the network.

        Args:
            batch_input_shape: Batch shape of the feature input. A falsy
                value falls back to ``(None, None, NUM_FBANKS, 1)``. Ignored
                when ``pcm_input`` is true.
            include_softmax: Add dropout before the affine layer and a
                softmax classification layer after it, instead of the
                L2-normalization output.
            num_speakers_softmax: Number of softmax classes; must be a
                positive number when ``include_softmax`` is true.
            pcm_input: Accept raw audio of batch shape ``(None, None)`` and
                compute normalized filterbank features inside the model.

        Raises:
            ValueError: If ``include_softmax`` is true and
                ``num_speakers_softmax`` is missing or not positive.
        """
        self.include_softmax = include_softmax
        # Explicit check instead of `assert`, which is stripped under `-O`
        # and fails with an opaque TypeError when the value is None.
        if self.include_softmax and (num_speakers_softmax is None or num_speakers_softmax <= 0):
            raise ValueError(
                'num_speakers_softmax must be a positive integer when include_softmax is True, '
                f'got {num_speakers_softmax!r}.'
            )
        # Counter used to give every clipped-ReLU Lambda layer a unique name.
        self.clipped_relu_count = 0

        # http://cs231n.github.io/convolutional-networks/
        # conv weights
        # #params = ks * ks * nb_filters * num_channels_input

        # Conv128-s
        # 5*5*128*128/2+128
        # ks*ks*nb_filters*channels/strides+bias(=nb_filters)

        # take 100 ms -> 4 frames.
        # if signal is 3 seconds, then take 100ms per 100ms and average out this network.
        # 8*8 = 64 features.

        # used to share all the layers across the inputs

        # num_frames = K.shape() - do it dynamically after.

        if pcm_input:
            # Raw audio: (batch-size, num-samples). Any batch_input_shape
            # supplied by the caller is deliberately not used here.
            inputs = Input(batch_shape=(None, None), name='raw_inputs')
            x = Lambda(tf_fbank)(inputs)
            # Normalize over the last (filterbank) axis with a tiny stddev floor.
            x = Lambda(lambda x_: tf_normalize(x_, 1, 1e-12))(x)
            # Append the trailing channel axis expected by Conv2D.
            x = Lambda(lambda x_: tf.expand_dims(x_, axis=-1))(x)
        else:
            batch_input_shape = batch_input_shape or (None, None, NUM_FBANKS, 1)
            inputs = Input(batch_shape=batch_input_shape, name='input')
            x = inputs

        x = self.cnn_component(x)

        # Flatten frequency and channels into 2048 features per time step.
        # Presumably 4 frequency bins x 512 channels for a 64-filterbank
        # input (four stride-2 stages) -- confirm against NUM_FBANKS.
        x = Reshape((-1, 2048))(x)
        # Temporal average layer. axis=1 is time.
        x = Lambda(lambda y: K.mean(y, axis=1), name='average')(x)
        if include_softmax:
            logger.info('Including a Dropout layer to reduce overfitting.')
            # used for softmax because the dataset we pre-train on might be too small. easy to overfit.
            x = Dropout(0.5)(x)
        x = Dense(512, name='affine')(x)
        if include_softmax:
            # Those weights are just when we train on softmax.
            x = Dense(num_speakers_softmax, activation='softmax')(x)
        else:
            # Does not contain any weights.
            x = Lambda(lambda y: K.l2_normalize(y, axis=1), name='ln')(x)
        self.m = Model(inputs, x, name='ResCNN')

    def keras_model(self):
        """Return the underlying Keras ``Model``."""
        return self.m

    def get_weights(self):
        """Return the model weights as a flat list, without the softmax layer.

        When the model was built with ``include_softmax=True`` the last two
        arrays (softmax kernel and bias) are dropped.
        """
        w = self.m.get_weights()
        if self.include_softmax:
            del w[-2:]  # last 2 are the W_softmax and b_softmax.
        return w

    def set_weights(self, w):
        """Assign weights to the layers of the model.

        Two layouts are accepted:

        * a per-layer sequence, where item ``i`` is the list of arrays for
          ``self.m.layers[i]`` (the layout this method always accepted);
        * a flat sequence of arrays as returned by ``get_weights()``. The
          arrays are handed out to the layers in order, each layer taking as
          many arrays as it owns weights. Assignment stops at the first layer
          that cannot be filled completely (e.g. a softmax layer whose
          weights were stripped by ``get_weights()``); surplus arrays are
          ignored.

        Args:
            w: Weights in one of the two layouts above.
        """
        weights = list(w)

        if all(isinstance(item, (list, tuple)) for item in weights):
            # Per-layer layout: pair layers and weight lists positionally.
            for layer, layer_w in zip(self.m.layers, weights):
                layer.set_weights(layer_w)
                logger.info('Setting weights for [%s]...', layer.name)
            return

        # Flat layout: the previous zip-based code handed a single array to
        # each layer, which fails as soon as a layer owns no weights.
        position = 0
        for layer in self.m.layers:
            count = len(layer.weights)
            if count == 0:
                continue
            layer_w = weights[position:position + count]
            if len(layer_w) < count:
                break
            layer.set_weights(layer_w)
            logger.info('Setting weights for [%s]...', layer.name)
            position += count

    @staticmethod
    def _conv2d(filters, kernel_size, strides, name):
        """Build the Conv2D layer configuration shared by every convolution in the network."""
        return Conv2D(
            filters,
            kernel_size=kernel_size,
            strides=strides,
            activation=None,
            padding='same',
            kernel_initializer='glorot_uniform',
            # Factor passed positionally: the keyword was renamed from `l`
            # to `l2` across Keras releases.
            # TODO: why kernel_regularizer?
            kernel_regularizer=regularizers.l2(0.0001),
            name=name,
        )

    def clipped_relu(self, inputs):
        """Apply ``min(max(x, 0), 20)`` through a uniquely named Lambda layer."""
        relu = Lambda(lambda y: K.minimum(K.maximum(y, 0), 20), name=f'clipped_relu_{self.clipped_relu_count}')(inputs)
        self.clipped_relu_count += 1
        return relu

    def identity_block(self, input_tensor, kernel_size, filters, stage, block):
        """Residual block: two conv + batch-norm + clipped-ReLU layers plus a skip connection.

        Note that a clipped ReLU is applied both before and after the
        residual addition. ``stage`` and ``block`` only affect layer names.
        """
        conv_name_base = f'res{stage}_{block}_branch'

        x = self._conv2d(filters, kernel_size, 1, conv_name_base + '_2a')(input_tensor)
        x = BatchNormalization(name=conv_name_base + '_2a_bn')(x)
        x = self.clipped_relu(x)

        x = self._conv2d(filters, kernel_size, 1, conv_name_base + '_2b')(x)
        x = BatchNormalization(name=conv_name_base + '_2b_bn')(x)
        x = self.clipped_relu(x)

        x = layers.add([x, input_tensor])
        x = self.clipped_relu(x)
        return x

    def conv_and_res_block(self, inp, filters, stage):
        """One stage: a 5x5 stride-2 convolution followed by three identity blocks."""
        conv_name = f'conv{filters}-s'
        o = self._conv2d(filters, 5, 2, conv_name)(inp)
        o = BatchNormalization(name=conv_name + '_bn')(o)
        o = self.clipped_relu(o)
        for i in range(3):
            o = self.identity_block(o, kernel_size=3, filters=filters, stage=stage, block=i)
        return o

    def cnn_component(self, inp):
        """Stack the four stages with 64, 128, 256 and 512 filters."""
        x = inp
        for stage, filters in enumerate((64, 128, 256, 512), start=1):
            x = self.conv_and_res_block(x, filters, stage=stage)
        return x


def main():
    """Build the model with its default arguments and print the Keras layer summary."""
    # Author's note on the parameter count: looks correct to me.
    # I have 37K but paper reports 41K. which is not too far.
    model = DeepSpeakerModel().keras_model()
    model.summary()

    # I suspect num frames to be 32.
    # Then fbank=64, then total would be 32*64 = 2048.
    # plot_model(model, to_file='model.png', dpi=300, show_shapes=True, expand_nested=True)


def _train():
    """Manual sanity check of the triplet loss on a degenerate batch.

    Anchor, positive and negative are all the same constant tensor, so the
    loss is not expected to improve. The function loops forever, printing the
    loss of each training step; interrupt it by hand.
    """
    # x = np.random.uniform(size=(6, 32, 64, 4))  # 6 is multiple of 3.
    # y_softmax = np.random.uniform(size=(6, 100))
    # dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=True, num_speakers_softmax=100)
    # dsm.m.compile(optimizer=Adam(learning_rate=0.01), loss='categorical_crossentropy')
    # print(dsm.m.predict(x).shape)
    # print(dsm.m.evaluate(x, y_softmax))
    # w = dsm.get_weights()
    dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=False)
    # dsm.m.set_weights(w)
    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases no longer accept.
    dsm.m.compile(optimizer=Adam(learning_rate=0.01), loss=deep_speaker_loss)

    # it works!!!!!!!!!!!!!!!!!!!!
    # unit_batch_size = 20
    # anchor = np.ones(shape=(unit_batch_size, 32, 64, 4))
    # positive = np.array(anchor)
    # negative = np.ones(shape=(unit_batch_size, 32, 64, 4)) * (-1)
    # batch = np.vstack((anchor, positive, negative))
    # x = batch
    # y = np.zeros(shape=(len(batch), 512))  # not important.
    # print('Starting to fit...')
    # while True:
    #     print(dsm.m.train_on_batch(x, y))

    # should not work... and it does not work!
    unit_batch_size = 20
    negative = np.ones(shape=(unit_batch_size, 32, 64, 4)) * (-1)
    # The batch is three stacked copies of the same constant block.
    batch = np.vstack((negative, negative, negative))
    x = batch
    y = np.zeros(shape=(len(batch), 512))  # not important.
    print('Starting to fit...')
    while True:
        print(dsm.m.train_on_batch(x, y))


def _test_checkpoint_compatibility():
    """Check that weights saved from a softmax model load by name into an embedding model.

    The checkpoint is written inside a temporary directory, so an existing
    ``test.h5`` in the working directory is never overwritten or deleted, and
    the file is removed even when saving or loading raises.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Keep the `.h5` suffix: it selects the HDF5 format that by-name loading relies on.
        checkpoint = os.path.join(tmp_dir, 'test.h5')
        dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=True, num_speakers_softmax=10)
        dsm.m.save_weights(checkpoint)
        dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=False)
        dsm.m.load_weights(checkpoint, by_name=True)


# When executed as a script, run only the checkpoint-compatibility check;
# main() and _train() are not invoked here.
if __name__ == '__main__':
    _test_checkpoint_compatibility()