import numpy as np
import tensorflow as tf
import gradio as gr
from huggingface_hub import from_pretrained_keras
# Speaker labels exactly as they appear in the dataset folders
# (original spellings preserved, since the model was trained on them)
classes_names = ['Benjamin_Netanyau', 'Jens_Stoltenberg', 'Julia_Gillard', 'Magaret_Tarcher', 'Nelson_Mandela']
# The sampling rate used by all of the audio samples.
# Since each sample is 1 second long, this is also the length
# of every audio waveform the model sees.
SAMPLING_RATE = 16000
# During training (see the linked notebook), noise was mixed in as
#   noisy_sample = sample + noise * prop * scale
# where prop = sample_amplitude / noise_amplitude. The example clips
# shipped with this Space are already noisy, so no mixing happens here.
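# A minimal sketch of that mixing rule, assuming `sample` and `noise` are
# 1-D float tensors of equal length (illustrative only; this Space never
# adds noise itself):
# def add_noise(sample, noise, scale=0.5):
#     prop = tf.math.reduce_max(sample) / tf.math.reduce_max(noise)
#     return sample + noise * prop * scale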
# Load the pretrained speaker-recognition model from the Hugging Face Hub
model = from_pretrained_keras("keras-io/speaker-recognition")
def path_to_audio(path):
    """Reads and decodes a 1-second, single-channel WAV file."""
    audio = tf.io.read_file(path)
    audio, _ = tf.audio.decode_wav(audio, desired_channels=1, desired_samples=SAMPLING_RATE)
    return audio
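# Quick check (not executed by the app): for a 1-second mono clip such as
# the bundled "audios/260.wav", this returns a tensor of shape
# (SAMPLING_RATE, 1), i.e. (16000, 1):
# wav = path_to_audio("audios/260.wav")
# assert wav.shape == (SAMPLING_RATE, 1)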
def audio_to_fft(audio):
    # Since tf.signal.fft applies the FFT on the innermost dimension,
    # we need to squeeze the channel dimension and then expand it again
    # after the FFT
    audio = tf.squeeze(audio, axis=-1)
    fft = tf.signal.fft(
        tf.cast(tf.complex(real=audio, imag=tf.zeros_like(audio)), tf.complex64)
    )
    fft = tf.expand_dims(fft, axis=-1)
    # Return the absolute value of the first half of the FFT,
    # which represents the positive frequencies
    return tf.math.abs(fft[:, : (audio.shape[1] // 2), :])
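# Shape sketch (not executed by the app): only the positive-frequency half
# of the spectrum is kept, so (batch, SAMPLING_RATE, 1) maps to
# (batch, SAMPLING_RATE // 2, 1). For example:
# dummy = tf.zeros((1, SAMPLING_RATE, 1))
# assert audio_to_fft(dummy).shape == (1, SAMPLING_RATE // 2, 1)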
def predict(actual_audio_path, actual_label):
    # actual_label is only echoed in the UI; the prediction uses the audio alone
    waveform = path_to_audio(actual_audio_path)
    # Add a batch dimension
    actual_audio = tf.expand_dims(waveform, axis=0)
    # Get the signal FFT
    ffts = audio_to_fft(actual_audio)
    # Predict the speaker class
    y_pred = model.predict(ffts)
    y_pred = np.argmax(y_pred, axis=-1)
    return classes_names[y_pred[0]], actual_audio_path
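# Example call outside Gradio, using one of the bundled example clips
# (a sanity-check sketch; not executed by the app):
# speaker, path = predict("audios/260.wav", "Benjamin_Netanyau")
# print(speaker)  # one of classes_names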
# The app takes one audio sample to be recognised, plus the actual speaker's name
inputs = [gr.inputs.Audio(source="upload", type="filepath", label="Take audio sample"),
          gr.inputs.Textbox(label="Actual Speaker")]
# The app outputs the predicted speaker name and plays back the input audio
outputs = [gr.outputs.Textbox(label="Predicted Speaker"), gr.outputs.Audio(label="Corresponding Audio")]
# it's good practice to pass examples, description and a title to guide users
examples = [['audios/260.wav', 'Benjamin_Netanyau'],
['audios/611.wav', 'Jens_Stoltenberg'],
['audios/65.wav', 'Julia_Gillard'],
['audios/1083.wav', 'Magaret_Tarcher'],
['audios/605.wav', 'Nelson_Mandela']]
title = "Speaker Recognition"
description = "Select one of the noisy audio samples from the examples to check whether the model recognises the speaker correctly even in the presence of noise."
gr.Interface(fn=predict, inputs=inputs, outputs=outputs, examples=examples, allow_flagging=False, analytics_enabled=False,
             title=title, description=description,
             article="Space By: <u><a href='https://github.com/robotjellyzone'><b>Kavya Bisht</b></a></u> \n Based on <a href='https://keras.io/examples/audio/speaker_recognition_using_cnn/'><b>this notebook</b></a>").launch(enable_queue=True, debug=True)