import os
import numpy as np
import librosa
import librosa.display
import pickle
import tensorflow as tf
import gradio as gr
import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend
import matplotlib.pyplot as plt
from io import BytesIO
from PIL import Image
import warnings

# Suppress warnings and logs
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

# Load model and label encoder
model = tf.keras.models.load_model("ann_new_emotion_recognition_model.h5", compile=False)
with open("new_label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)


def extract_features(audio, sr, max_len=40):
    # Extract MFCCs
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=20)
    mfccs = np.mean(mfccs.T, axis=0)

    # Extract Chroma
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
    chroma = np.mean(chroma.T, axis=0)

    # Extract Spectral Contrast
    contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
    contrast = np.mean(contrast.T, axis=0)

    # Extract Zero-Crossing Rate
    zcr = librosa.feature.zero_crossing_rate(y=audio)
    zcr = np.mean(zcr.T, axis=0)

    # Extract Spectral Centroid
    centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)
    centroid = np.mean(centroid.T, axis=0)

    # Extract Spectral Rolloff
    rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr, roll_percent=0.85)
    rolloff = np.mean(rolloff.T, axis=0)

    # Extract RMS Energy
    rms = librosa.feature.rms(y=audio)
    rms = np.mean(rms.T, axis=0)

    features = np.concatenate([mfccs, chroma, contrast, zcr, centroid, rolloff, rms])

    # Pad or trim to fixed length
    if len(features) < max_len:
        features = np.pad(features, (0, max_len - len(features)), mode='constant')
    else:
        features = features[:max_len]

    return features
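# Feature-size note (a sketch, assuming librosa's defaults of 12 chroma bins and
# 7 spectral-contrast rows): the concatenated vector above has
# 20 + 12 + 7 + 1 + 1 + 1 + 1 = 43 values, so trimming to max_len=40 silently
# drops the spectral-centroid, rolloff, and RMS means. A quick shape check, for
# illustration only (the dummy signal and sample rate are arbitrary):
#
#     dummy_sr = 22050
#     dummy_audio = np.random.randn(dummy_sr).astype(np.float32)  # ~1 s of noise
#     assert extract_features(dummy_audio, dummy_sr).shape == (40,)
#
# max_len must stay consistent with the input size the model was trained on.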
def create_mel_spectrogram(audio, sr):
    """Create mel spectrogram plot"""
    plt.figure(figsize=(10, 4))
    S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128, fmax=8000)
    S_dB = librosa.power_to_db(S, ref=np.max)
    librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel Spectrogram')
    plt.tight_layout()

    # Save to BytesIO and convert to PIL Image
    buf = BytesIO()
    plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
    buf.seek(0)
    img = Image.open(buf)
    plt.close()
    return img


def create_polar_plot(emotion_probabilities):
    """Create polar plot of emotion probabilities"""
    emotions = list(emotion_probabilities.keys())
    probabilities = [prob * 100 for prob in emotion_probabilities.values()]  # Convert to percentages

    # Prepare data for polar plot
    angles = np.linspace(0, 2 * np.pi, len(emotions), endpoint=False).tolist()
    angles += angles[:1]  # Complete the circle
    probabilities += probabilities[:1]  # Complete the circle

    # Create polar plot
    fig, ax = plt.subplots(figsize=(4, 4), subplot_kw=dict(projection='polar'))
    ax.fill(angles, probabilities, color='skyblue', alpha=0.4)
    ax.plot(angles, probabilities, color='blue', linewidth=2, marker='o')

    # Customize the plot
    ax.set_yticks([20, 40, 60, 80, 100])
    ax.set_yticklabels(["20%", "40%", "60%", "80%", "100%"], color="gray", fontsize=10)
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(emotions, fontsize=12, color="darkblue")
    ax.set_ylim(0, 100)
    ax.set_title("Emotion Probabilities", va='bottom', fontsize=14, color="darkblue", pad=20)
    plt.tight_layout()

    # Save to BytesIO and convert to PIL Image
    buf = BytesIO()
    plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
    buf.seek(0)
    img = Image.open(buf)
    plt.close()
    return img


def create_waveform_plot(audio, sr):
    """Create waveform plot"""
    plt.figure(figsize=(12, 4))
    librosa.display.waveshow(audio, sr=sr)
    plt.title('Audio Waveform')
    plt.xlabel('Time (seconds)')
    plt.ylabel('Amplitude')
    plt.tight_layout()

    # Save to BytesIO and convert to PIL Image
    buf = BytesIO()
    plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
    buf.seek(0)
    img = Image.open(buf)
    plt.close()
    return img
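# The three plotting helpers above end with the same "figure -> PNG buffer -> PIL
# image" steps. A hypothetical refactor (sketch only, not wired into the code
# below) could centralise that conversion:
#
#     def fig_to_pil(fig):
#         buf = BytesIO()
#         fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
#         buf.seek(0)
#         img = Image.open(buf)
#         plt.close(fig)
#         return img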
def predict_emotion(audio_file):
    try:
        # Guard against empty input (e.g. the audio component being cleared)
        if audio_file is None:
            return "No audio provided.", {}, None, None, None

        # Load audio file at its native sampling rate
        audio_np, sr = librosa.load(audio_file, sr=None, res_type='kaiser_fast')

        # Extract features
        features = extract_features(audio_np, sr)
        features = np.expand_dims(features, axis=0)

        # Make prediction
        predictions = model.predict(features, verbose=0)
        predicted_class = np.argmax(predictions[0])
        predicted_emotion = label_encoder.inverse_transform([predicted_class])[0]

        # Per-class probabilities in the 0-1 range
        emotion_probabilities = {
            label_encoder.inverse_transform([i])[0]: round(float(pred), 4)
            for i, pred in enumerate(predictions[0])
        }

        # Create visualizations
        mel_spec_plot = create_mel_spectrogram(audio_np, sr)
        polar_plot = create_polar_plot(emotion_probabilities)
        waveform_plot = create_waveform_plot(audio_np, sr)

        # Keep values in 0-1; gr.Label renders these confidences as percentages
        emotion_probabilities_percent = {
            emotion: round(prob, 2) for emotion, prob in emotion_probabilities.items()
        }

        return (
            predicted_emotion,
            emotion_probabilities_percent,
            mel_spec_plot,
            polar_plot,
            waveform_plot
        )
    except Exception as e:
        error_msg = f"Error processing audio: {str(e)}"
        return error_msg, {}, None, None, None


# Create Gradio interface
with gr.Blocks(title="🎤 Emotion Recognition from Audio", theme=gr.themes.Soft()) as iface:
    gr.Markdown(
        """
        # 🎤 Emotion Recognition from Audio
        Upload or record an audio file to analyze the emotional content and view detailed visualizations.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                label="Upload or Record Audio",
                type="filepath",
                sources=["upload", "microphone"]
            )
            predict_btn = gr.Button("🔍 Analyze Emotion", variant="primary", size="lg")

        with gr.Column(scale=1):
            predicted_emotion = gr.Text(label="🎯 Predicted Emotion", interactive=False)
            emotion_probs = gr.Label(label="📊 Emotion Probabilities (%)", num_top_classes=10)

    with gr.Row():
        with gr.Column():
            waveform_plot = gr.Image(label="🌊 Audio Waveform", type="pil")
        with gr.Column():
            mel_spec_plot = gr.Image(label="🎵 Mel Spectrogram", type="pil")

    with gr.Row():
        polar_plot = gr.Image(label="🎯 Emotion Probability Radar", type="pil")

    # Set up the prediction function
    predict_btn.click(
        fn=predict_emotion,
        inputs=[audio_input],
        outputs=[predicted_emotion, emotion_probs, mel_spec_plot, polar_plot, waveform_plot]
    )

    # Also allow automatic prediction when audio is uploaded
    audio_input.change(
        fn=predict_emotion,
        inputs=[audio_input],
        outputs=[predicted_emotion, emotion_probs, mel_spec_plot, polar_plot, waveform_plot]
    )

    gr.Markdown(
        """
        ### 📝 How it works:
        1. **Upload** an audio file or **record** directly using your microphone
        2. The system extracts audio features (MFCCs, Chroma, Spectral features, etc.)
        3. A trained neural network predicts the emotion
        4. View the results with detailed visualizations:
           - **Waveform**: Shows the audio signal over time
           - **Mel Spectrogram**: Visual representation of the audio's frequency content
           - **Radar Chart**: Probability distribution across all emotion categories

        ### 🎭 Supported Emotions:
        Depending on your model training, this may include emotions like:
        Happy, Sad, Angry, Fear, Disgust, Surprise, Neutral, and others.
        """
    )

# Launch the interface
if __name__ == "__main__":
    iface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )
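# Smoke test (hypothetical path "sample.wav"; replace with a real clip): the
# prediction function can be exercised directly, without launching the server,
# from a Python shell or another script:
#
#     emotion, probs, mel_img, polar_img, wave_img = predict_emotion("sample.wav")
#     print(emotion, probs)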