import numpy as np
import tensorflow as tf
import gradio as gr
from huggingface_hub import from_pretrained_keras

# Class labels in the order used when the model was trained
# (names are kept exactly as spelled in the original dataset folders).
classes_names = ['Benjamin_Netanyau', 'Jens_Stoltenberg', 'Julia_Gillard', 'Magaret_Tarcher', 'Nelson_Mandela']

# The sampling rate used in all of the audio samples.
# Since every sample is 1 second long, this is also the length
# of each decoded waveform.
SAMPLING_RATE = 16000

# Noise was mixed into the example clips when they were generated:
#   noisy_sample = sample + noise * prop * scale
#   where prop = sample_amplitude / noise_amplitude

# Load the pretrained speaker-recognition model from the Hugging Face Hub.
model = from_pretrained_keras("keras-io/speaker-recognition")


def path_to_audio(path):
    """Reads and decodes a mono audio file at SAMPLING_RATE."""
    audio = tf.io.read_file(path)
    audio, _ = tf.audio.decode_wav(audio, 1, SAMPLING_RATE)
    return audio


def audio_to_fft(audio):
    # Since tf.signal.fft applies the FFT on the innermost dimension,
    # we need to squeeze the channel dimension and then expand it again
    # after the FFT.
    audio = tf.squeeze(audio, axis=-1)
    fft = tf.signal.fft(
        tf.cast(tf.complex(real=audio, imag=tf.zeros_like(audio)), tf.complex64)
    )
    fft = tf.expand_dims(fft, axis=-1)
    # Return the absolute value of the first half of the FFT, which
    # represents the positive frequencies (8000 bins for a 1-second clip
    # at 16 kHz - the input size the model expects).
    return tf.math.abs(fft[:, : (audio.shape[1] // 2), :])


def predict(actual_audio_path, actual_label):
    # `actual_label` is only collected so the user can compare it with the
    # prediction; it is not fed to the model.
    actual_audio = tf.expand_dims(path_to_audio(actual_audio_path), axis=0)
    # Get the signal FFT
    ffts = audio_to_fft(actual_audio)
    # Predict the speaker
    y_pred = model.predict(ffts)
    y_pred = np.argmax(y_pred, axis=-1)
    return classes_names[y_pred[0]], actual_audio_path


# The app takes one audio sample to be recognised plus the actual speaker's name.
inputs = [gr.inputs.Audio(source="upload", type="filepath", label="Take audio sample"),
          gr.inputs.Textbox(label="Actual Speaker")]

# The app outputs the predicted speaker and plays back the corresponding audio.
outputs = [gr.outputs.Textbox(label="Predicted Speaker"),
           gr.outputs.Audio(label="Corresponding Audio")]

# It's good practice to pass examples, a description and a title to guide users.
examples = [['audios/260.wav', 'Benjamin_Netanyau'],
            ['audios/611.wav', 'Jens_Stoltenberg'],
            ['audios/65.wav', 'Julia_Gillard'],
            ['audios/1083.wav', 'Magaret_Tarcher'],
            ['audios/605.wav', 'Nelson_Mandela']]

title = "Speaker Recognition"
description = "Select one of the noisy audio samples from the examples to check whether the model recognises the speaker correctly even in the presence of noise."
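# Minimal local sanity check (a sketch, not part of the Gradio app itself).
# It assumes the bundled example clip 'audios/260.wav' from the examples list
# above is available next to this script; adjust the path if it is not.
if __name__ == "__main__":
    sample_path = 'audios/260.wav'  # assumed example file
    predicted_speaker, _ = predict(sample_path, 'Benjamin_Netanyau')
    print("Predicted speaker:", predicted_speaker)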
gr.Interface(fn=predict,
             inputs=inputs,
             outputs=outputs,
             examples=examples,
             allow_flagging=False,
             analytics_enabled=False,
             title=title,
             description=description,
             article="Space By: Kavya Bisht \n Based on this notebook").launch(enable_queue=True, debug=True)