import numpy as np
import tensorflow as tf
import gradio as gr
from huggingface_hub import from_pretrained_keras

# Class labels in the order used when the model was trained
# (names are kept exactly as spelled in the original dataset folders).
classes_names = ['Benjamin_Netanyau', 'Jens_Stoltenberg', 'Julia_Gillard', 'Magaret_Tarcher', 'Nelson_Mandela']

# The sampling rate used in all of the audio samples.
# Since every sample is 1 second long, this is also the length
# of each decoded waveform.
SAMPLING_RATE = 16000

# Noise was mixed into the example clips when they were generated:
#   noisy_sample = sample + noise * prop * scale
#   where prop = sample_amplitude / noise_amplitude

# Load the pretrained speaker-recognition model from the Hugging Face Hub.
model = from_pretrained_keras("keras-io/speaker-recognition")


def path_to_audio(path):
    """Reads and decodes a mono audio file at SAMPLING_RATE."""
    audio = tf.io.read_file(path)
    audio, _ = tf.audio.decode_wav(audio, 1, SAMPLING_RATE)
    return audio


def audio_to_fft(audio):
    # Since tf.signal.fft applies the FFT on the innermost dimension,
    # we need to squeeze the channel dimension and then expand it again
    # after the FFT.
    audio = tf.squeeze(audio, axis=-1)
    fft = tf.signal.fft(
        tf.cast(tf.complex(real=audio, imag=tf.zeros_like(audio)), tf.complex64)
    )
    fft = tf.expand_dims(fft, axis=-1)
    # Return the absolute value of the first half of the FFT, which
    # represents the positive frequencies (8000 bins for a 1-second clip
    # at 16 kHz - the input size the model expects).
    return tf.math.abs(fft[:, : (audio.shape[1] // 2), :])


def predict(actual_audio_path, actual_label):
    # `actual_label` is only collected so the user can compare it with the
    # prediction; it is not fed to the model.
    actual_audio = tf.expand_dims(path_to_audio(actual_audio_path), axis=0)
    # Get the signal FFT
    ffts = audio_to_fft(actual_audio)
    # Predict the speaker
    y_pred = model.predict(ffts)
    y_pred = np.argmax(y_pred, axis=-1)
    return classes_names[y_pred[0]], actual_audio_path


# The app takes one audio sample to be recognised plus the actual speaker's name.
inputs = [gr.inputs.Audio(source="upload", type="filepath", label="Take audio sample"),
          gr.inputs.Textbox(label="Actual Speaker")]

# The app outputs the predicted speaker and plays back the corresponding audio.
outputs = [gr.outputs.Textbox(label="Predicted Speaker"),
           gr.outputs.Audio(label="Corresponding Audio")]

# It's good practice to pass examples, a description and a title to guide users.
examples = [['audios/260.wav', 'Benjamin_Netanyau'],
            ['audios/611.wav', 'Jens_Stoltenberg'],
            ['audios/65.wav', 'Julia_Gillard'],
            ['audios/1083.wav', 'Magaret_Tarcher'],
            ['audios/605.wav', 'Nelson_Mandela']]

title = "Speaker Recognition"
description = "Select one of the noisy audio samples from the examples to check whether the model recognises the speaker correctly even in the presence of noise."
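# Minimal local sanity check (a sketch, not part of the Gradio app itself).
# It assumes the bundled example clip 'audios/260.wav' from the examples list
# above is available next to this script; adjust the path if it is not.
if __name__ == "__main__":
    sample_path = 'audios/260.wav'  # assumed example file
    predicted_speaker, _ = predict(sample_path, 'Benjamin_Netanyau')
    print("Predicted speaker:", predicted_speaker)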
gr.Interface(fn=predict,
             inputs=inputs,
             outputs=outputs,
             examples=examples,
             allow_flagging=False,
             analytics_enabled=False,
             title=title,
             description=description,
             article="Space By: Kavya Bisht \n Based on this notebook").launch(enable_queue=True, debug=True)