import os
import numpy as np
import librosa
import librosa.display
import pickle
import tensorflow as tf
import gradio as gr
import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend
import matplotlib.pyplot as plt
from io import BytesIO
from PIL import Image
import warnings

# Suppress warnings and logs
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

# Load model and label encoder
model = tf.keras.models.load_model("ann_new_emotion_recognition_model.h5", compile=False)
with open("new_label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)


def extract_features(audio, sr, max_len=40):
    # Extract MFCCs
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=20)
    mfccs = np.mean(mfccs.T, axis=0)

    # Extract Chroma
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
    chroma = np.mean(chroma.T, axis=0)

    # Extract Spectral Contrast
    contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
    contrast = np.mean(contrast.T, axis=0)

    # Extract Zero-Crossing Rate
    zcr = librosa.feature.zero_crossing_rate(y=audio)
    zcr = np.mean(zcr.T, axis=0)

    # Extract Spectral Centroid
    centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)
    centroid = np.mean(centroid.T, axis=0)

    # Extract Spectral Rolloff
    rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr, roll_percent=0.85)
    rolloff = np.mean(rolloff.T, axis=0)

    # Extract RMS Energy
    rms = librosa.feature.rms(y=audio)
    rms = np.mean(rms.T, axis=0)

    features = np.concatenate([mfccs, chroma, contrast, zcr, centroid, rolloff, rms])

    # Pad or trim to fixed length
    if len(features) < max_len:
        features = np.pad(features, (0, max_len - len(features)), mode='constant')
    else:
        features = features[:max_len]

    return features
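# Feature-size note (a sketch, assuming librosa's defaults of 12 chroma bins and
# 7 spectral-contrast rows): the concatenated vector above has
# 20 + 12 + 7 + 1 + 1 + 1 + 1 = 43 values, so trimming to max_len=40 silently
# drops the spectral-centroid, rolloff, and RMS means. A quick shape check, for
# illustration only (the dummy signal and sample rate are arbitrary):
#
#     dummy_sr = 22050
#     dummy_audio = np.random.randn(dummy_sr).astype(np.float32)  # ~1 s of noise
#     assert extract_features(dummy_audio, dummy_sr).shape == (40,)
#
# max_len must stay consistent with the input size the model was trained on.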
def create_mel_spectrogram(audio, sr):
    """Create mel spectrogram plot"""
    plt.figure(figsize=(10, 4))
    S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128, fmax=8000)
    S_dB = librosa.power_to_db(S, ref=np.max)
    librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel Spectrogram')
    plt.tight_layout()

    # Save to BytesIO and convert to PIL Image
    buf = BytesIO()
    plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
    buf.seek(0)
    img = Image.open(buf)
    plt.close()
    return img


def create_polar_plot(emotion_probabilities):
    """Create polar plot of emotion probabilities"""
    emotions = list(emotion_probabilities.keys())
    probabilities = [prob * 100 for prob in emotion_probabilities.values()]  # Convert to percentages

    # Prepare data for polar plot
    angles = np.linspace(0, 2 * np.pi, len(emotions), endpoint=False).tolist()
    angles += angles[:1]  # Complete the circle
    probabilities += probabilities[:1]  # Complete the circle

    # Create polar plot
    fig, ax = plt.subplots(figsize=(4, 4), subplot_kw=dict(projection='polar'))
    ax.fill(angles, probabilities, color='skyblue', alpha=0.4)
    ax.plot(angles, probabilities, color='blue', linewidth=2, marker='o')

    # Customize the plot
    ax.set_yticks([20, 40, 60, 80, 100])
    ax.set_yticklabels(["20%", "40%", "60%", "80%", "100%"], color="gray", fontsize=10)
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(emotions, fontsize=12, color="darkblue")
    ax.set_ylim(0, 100)
    ax.set_title("Emotion Probabilities", va='bottom', fontsize=14, color="darkblue", pad=20)
    plt.tight_layout()

    # Save to BytesIO and convert to PIL Image
    buf = BytesIO()
    plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
    buf.seek(0)
    img = Image.open(buf)
    plt.close()
    return img


def create_waveform_plot(audio, sr):
    """Create waveform plot"""
    plt.figure(figsize=(12, 4))
    librosa.display.waveshow(audio, sr=sr)
    plt.title('Audio Waveform')
    plt.xlabel('Time (seconds)')
    plt.ylabel('Amplitude')
    plt.tight_layout()

    # Save to BytesIO and convert to PIL Image
    buf = BytesIO()
    plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
    buf.seek(0)
    img = Image.open(buf)
    plt.close()
    return img
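# The three plotting helpers above end with the same "figure -> PNG buffer -> PIL
# image" steps. A hypothetical refactor (sketch only, not wired into the code
# below) could centralise that conversion:
#
#     def fig_to_pil(fig):
#         buf = BytesIO()
#         fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
#         buf.seek(0)
#         img = Image.open(buf)
#         plt.close(fig)
#         return img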
def predict_emotion(audio_file):
    try:
        # Guard against empty input (e.g. the audio component being cleared)
        if audio_file is None:
            return "No audio provided.", {}, None, None, None

        # Load audio file at its native sampling rate
        audio_np, sr = librosa.load(audio_file, sr=None, res_type='kaiser_fast')

        # Extract features
        features = extract_features(audio_np, sr)
        features = np.expand_dims(features, axis=0)

        # Make prediction
        predictions = model.predict(features, verbose=0)
        predicted_class = np.argmax(predictions[0])
        predicted_emotion = label_encoder.inverse_transform([predicted_class])[0]

        # Per-class probabilities in the 0-1 range
        emotion_probabilities = {
            label_encoder.inverse_transform([i])[0]: round(float(pred), 4)
            for i, pred in enumerate(predictions[0])
        }

        # Create visualizations
        mel_spec_plot = create_mel_spectrogram(audio_np, sr)
        polar_plot = create_polar_plot(emotion_probabilities)
        waveform_plot = create_waveform_plot(audio_np, sr)

        # Keep values in 0-1; gr.Label renders these confidences as percentages
        emotion_probabilities_percent = {
            emotion: round(prob, 2) for emotion, prob in emotion_probabilities.items()
        }

        return (
            predicted_emotion,
            emotion_probabilities_percent,
            mel_spec_plot,
            polar_plot,
            waveform_plot
        )
    except Exception as e:
        error_msg = f"Error processing audio: {str(e)}"
        return error_msg, {}, None, None, None


# Create Gradio interface
with gr.Blocks(title="🎤 Emotion Recognition from Audio", theme=gr.themes.Soft()) as iface:
    gr.Markdown(
        """
        # 🎤 Emotion Recognition from Audio
        Upload or record an audio file to analyze the emotional content and view detailed visualizations.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                label="Upload or Record Audio",
                type="filepath",
                sources=["upload", "microphone"]
            )
            predict_btn = gr.Button("🔍 Analyze Emotion", variant="primary", size="lg")

        with gr.Column(scale=1):
            predicted_emotion = gr.Text(label="🎯 Predicted Emotion", interactive=False)
            emotion_probs = gr.Label(label="📊 Emotion Probabilities (%)", num_top_classes=10)

    with gr.Row():
        with gr.Column():
            waveform_plot = gr.Image(label="🌊 Audio Waveform", type="pil")
        with gr.Column():
            mel_spec_plot = gr.Image(label="🎵 Mel Spectrogram", type="pil")

    with gr.Row():
        polar_plot = gr.Image(label="🎯 Emotion Probability Radar", type="pil")

    # Set up the prediction function
    predict_btn.click(
        fn=predict_emotion,
        inputs=[audio_input],
        outputs=[predicted_emotion, emotion_probs, mel_spec_plot, polar_plot, waveform_plot]
    )

    # Also allow automatic prediction when audio is uploaded
    audio_input.change(
        fn=predict_emotion,
        inputs=[audio_input],
        outputs=[predicted_emotion, emotion_probs, mel_spec_plot, polar_plot, waveform_plot]
    )

    gr.Markdown(
        """
        ### 📝 How it works:
        1. **Upload** an audio file or **record** directly using your microphone
        2. The system extracts audio features (MFCCs, Chroma, Spectral features, etc.)
        3. A trained neural network predicts the emotion
        4. View the results with detailed visualizations:
           - **Waveform**: Shows the audio signal over time
           - **Mel Spectrogram**: Visual representation of the audio's frequency content
           - **Radar Chart**: Probability distribution across all emotion categories

        ### 🎭 Supported Emotions:
        Depending on your model training, this may include emotions like:
        Happy, Sad, Angry, Fear, Disgust, Surprise, Neutral, and others.
        """
    )

# Launch the interface
if __name__ == "__main__":
    iface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )
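# Smoke test (hypothetical path "sample.wav"; replace with a real clip): the
# prediction function can be exercised directly, without launching the server,
# from a Python shell or another script:
#
#     emotion, probs, mel_img, polar_img, wave_img = predict_emotion("sample.wav")
#     print(emotion, probs)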