Spaces:

TomCallan
/

Big

Runtime error

App Files Files Community

Big / speaker_recognition /triplet_loss.py

TomCallan

Upload 14 files

aed64b5 over 2 years ago

raw

history blame contribute delete

2.78 kB

	# pylint: disable=E0611,E0401
	import tensorflow.keras.backend as K

	# Default margin for deep_speaker_loss (its `alpha` parameter).
	# FaceNet (https://arxiv.org/pdf/1503.03832.pdf) uses 0.2 instead.
	ALPHA = 0.1 # used in Deep Speaker.


	def batch_cosine_similarity(x1, x2):
	    """Return the row-wise dot product of two batches of vectors.

	    No division by the vector norms is performed, so the result equals the
	    cosine similarity only under the assumption that every row of ``x1``
	    and ``x2`` already has length 1. Under that assumption 1 means equal
	    direction and -1 means opposite direction
	    (https://en.wikipedia.org/wiki/Cosine_similarity).
	    """
	    # The batched dot product along axis 1 leaves an axis of size 1 at
	    # position 1; squeeze it so one scalar per row is returned.
	    per_row_dot = K.batch_dot(x1, x2, axes=1)
	    return K.squeeze(per_row_dot, axis=1)


	def deep_speaker_loss(y_true, y_pred, alpha=ALPHA):
	    """Triplet loss computed from a batch laid out as anchors/positives/negatives.

	    ``y_true`` is ignored; it is accepted only to respect the
	    ``loss(y_true, y_pred)`` calling convention.

	    ``y_pred`` is cut along its first axis into three consecutive parts,
	    e.g. for a batch of six embeddings:

	        row 0: ANCHOR 1     row 2: POS EX 1     row 4: NEG EX 1
	        row 1: ANCHOR 2     row 3: POS EX 2     row 5: NEG EX 2

	    The first two parts each hold ``batch_size // 3`` rows; the third part
	    is everything that remains.

	    Per triplet the loss is ``max(san - sap + alpha, 0)`` where ``sap`` and
	    ``san`` are the anchor/positive and anchor/negative similarities, and
	    the returned value is the mean over all triplets.

	    Sanity checks on the formula:
	      * anchor == positive == negative (model always outputs the same
	        vector) gives sap == san, hence loss == max(alpha, 0) == alpha.
	        A loss that never drops below alpha means nothing is being learned.
	      * anchor == positive == [1] and negative == [-1] gives sap == 1 and
	        san == -1, hence loss == max(-1 - 1 + 0.1, 0) == 0 for alpha == 0.1.
	    """
	    third = K.shape(y_pred)[0] // 3

	    anchors = y_pred[:third]
	    positives = y_pred[third:2 * third]
	    # Open-ended on purpose: the last part takes every remaining row.
	    negatives = y_pred[2 * third:]

	    sim_anchor_positive = batch_cosine_similarity(anchors, positives)
	    sim_anchor_negative = batch_cosine_similarity(anchors, negatives)

	    # Hinge on the margin: only triplets whose negative is not at least
	    # `alpha` less similar than the positive contribute to the loss.
	    margin_violation = sim_anchor_negative - sim_anchor_positive + alpha
	    return K.mean(K.maximum(margin_violation, 0.0))


	if __name__ == '__main__':
	    import numpy as np

	    # Manual smoke run: print the loss of single (anchor, positive, negative)
	    # triplets of 1-d embeddings for several margins.
	    far_negative = [[0.9], [1.0], [-1.0]]
	    near_negative = [[0.6], [1.0], [0.0]]

	    # Each group is (triplet rows, margins to try); groups are printed in
	    # order and separated by a dashed line.
	    demo_groups = (
	        (far_negative, (0.1, 1, 2)),
	        (near_negative, (2, 1, 0.1, 0.2)),
	        (far_negative, (2, 1, 0.1, 0.2)),
	    )

	    for group_index, (triplet_rows, margins) in enumerate(demo_groups):
	        if group_index > 0:
	            print('--------------')
	        for margin in margins:
	            print(deep_speaker_loss(alpha=margin, y_true=0, y_pred=np.array(triplet_rows)))