manikanta2026 committed
Commit · 8287fdb
Parent(s): 3f0edb4
changes3

Browse files:
- app.py +182 -27
- requirements.txt +30 -5
app.py CHANGED
@@ -1,12 +1,20 @@
 import os
 import numpy as np
 import librosa
+import librosa.display
 import pickle
 import tensorflow as tf
 import gradio as gr
+import matplotlib.pyplot as plt
+import matplotlib
+matplotlib.use('Agg')  # Use non-interactive backend
+from io import BytesIO
+import warnings
 
-#
-
+# Suppress warnings and logs
+warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
 
 # Load model and label encoder
 model = tf.keras.models.load_model("ann_new_emotion_recognition_model.h5", compile=False)
@@ -14,56 +22,203 @@ with open("new_label_encoder.pkl", "rb") as f:
     label_encoder = pickle.load(f)
 
 def extract_features(audio, sr, max_len=40):
+    # Extract MFCCs
     mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=20)
     mfccs = np.mean(mfccs.T, axis=0)
+
+    # Extract Chroma
     chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
     chroma = np.mean(chroma.T, axis=0)
+
+    # Extract Spectral Contrast
     contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
     contrast = np.mean(contrast.T, axis=0)
+
+    # Extract Zero-Crossing Rate
     zcr = librosa.feature.zero_crossing_rate(y=audio)
     zcr = np.mean(zcr.T, axis=0)
+
+    # Extract Spectral Centroid
     centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)
     centroid = np.mean(centroid.T, axis=0)
-
+
+    # Extract Spectral Rolloff
+    rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr, roll_percent=0.85)
     rolloff = np.mean(rolloff.T, axis=0)
+
+    # Extract RMS Energy
     rms = librosa.feature.rms(y=audio)
     rms = np.mean(rms.T, axis=0)
 
     features = np.concatenate([mfccs, chroma, contrast, zcr, centroid, rolloff, rms])
+
+    # Pad or trim to fixed length
     if len(features) < max_len:
         features = np.pad(features, (0, max_len - len(features)), mode='constant')
     else:
         features = features[:max_len]
     return features
 
+def create_mel_spectrogram(audio, sr):
+    """Create mel spectrogram plot"""
+    plt.figure(figsize=(10, 4))
+    S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128, fmax=8000)
+    S_dB = librosa.power_to_db(S, ref=np.max)
+    librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel')
+    plt.colorbar(format='%+2.0f dB')
+    plt.title('Mel Spectrogram')
+    plt.tight_layout()
+
+    # Save to BytesIO and return the plot
+    buf = BytesIO()
+    plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
+    buf.seek(0)
+    plt.close()
+    return buf
+
+def create_polar_plot(emotion_probabilities):
+    """Create polar plot of emotion probabilities"""
+    emotions = list(emotion_probabilities.keys())
+    probabilities = [prob * 100 for prob in emotion_probabilities.values()]  # Convert to percentages
+
+    # Prepare data for polar plot
+    angles = np.linspace(0, 2 * np.pi, len(emotions), endpoint=False).tolist()
+    angles += angles[:1]  # Complete the circle
+    probabilities += probabilities[:1]  # Complete the circle
+
+    # Create polar plot
+    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(projection='polar'))
+    ax.fill(angles, probabilities, color='skyblue', alpha=0.4)
+    ax.plot(angles, probabilities, color='blue', linewidth=2, marker='o')
+
+    # Customize the plot
+    ax.set_yticks([20, 40, 60, 80, 100])
+    ax.set_yticklabels(["20%", "40%", "60%", "80%", "100%"], color="gray", fontsize=10)
+    ax.set_xticks(angles[:-1])
+    ax.set_xticklabels(emotions, fontsize=12, color="darkblue")
+    ax.set_ylim(0, 100)
+
+    ax.set_title("Emotion Probabilities", va='bottom', fontsize=14, color="darkblue", pad=20)
+    plt.tight_layout()
+
+    # Save to BytesIO and return the plot
+    buf = BytesIO()
+    plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
+    buf.seek(0)
+    plt.close()
+    return buf
+
+def create_waveform_plot(audio, sr):
+    """Create waveform plot"""
+    plt.figure(figsize=(12, 4))
+    librosa.display.waveshow(audio, sr=sr)
+    plt.title('Audio Waveform')
+    plt.xlabel('Time (seconds)')
+    plt.ylabel('Amplitude')
+    plt.tight_layout()
+
+    # Save to BytesIO and return the plot
+    buf = BytesIO()
+    plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
+    buf.seek(0)
+    plt.close()
+    return buf
+
 def predict_emotion(audio_file):
-
-
-
+    try:
+        # Load audio file
+        audio_np, sr = librosa.load(audio_file, sr=None, res_type='kaiser_fast')
+
+        # Extract features
+        features = extract_features(audio_np, sr)
+        features = np.expand_dims(features, axis=0)
 
-
-
-
+        # Make prediction
+        predictions = model.predict(features, verbose=0)
+        predicted_class = np.argmax(predictions[0])
+        predicted_emotion = label_encoder.inverse_transform([predicted_class])[0]
 
-
-
-
-
-
+        # Calculate emotion probabilities (as percentages for display)
+        emotion_probabilities = {
+            label_encoder.inverse_transform([i])[0]: round(float(pred), 4)
+            for i, pred in enumerate(predictions[0])
+        }
+
+        # Create visualizations
+        mel_spec_plot = create_mel_spectrogram(audio_np, sr)
+        polar_plot = create_polar_plot(emotion_probabilities)
+        waveform_plot = create_waveform_plot(audio_np, sr)
+
+        # Convert probabilities to percentages for better display
+        emotion_probabilities_percent = {
+            emotion: round(prob * 100, 2)
+            for emotion, prob in emotion_probabilities.items()
+        }
 
-
+        return (
+            predicted_emotion,
+            emotion_probabilities_percent,
+            mel_spec_plot,
+            polar_plot,
+            waveform_plot
+        )
+
+    except Exception as e:
+        error_msg = f"Error processing audio: {str(e)}"
+        return error_msg, {}, None, None, None
 
+# Create Gradio interface
+with gr.Blocks(title="🎤 Emotion Recognition from Audio", theme=gr.themes.Soft()) as iface:
+    gr.Markdown(
+        """
+        # 🎤 Emotion Recognition from Audio
+        Upload or record an audio file to analyze the emotional content and view detailed visualizations.
+        """
+    )
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            audio_input = gr.Audio(
+                label="Upload or Record Audio",
+                type="filepath",
+                sources=["upload", "microphone"]
+            )
+
+            predict_btn = gr.Button("🔍 Analyze Emotion", variant="primary", size="lg")
+
+        with gr.Column(scale=1):
+            predicted_emotion = gr.Text(label="🎯 Predicted Emotion", interactive=False)
+            emotion_probs = gr.Label(label="📊 Emotion Probabilities (%)", num_top_classes=10)
+
+    with gr.Row():
+        with gr.Column():
+            waveform_plot = gr.Image(label="📈 Audio Waveform", type="pil")
+        with gr.Column():
+            mel_spec_plot = gr.Image(label="🎵 Mel Spectrogram", type="pil")
+
+    with gr.Row():
+        polar_plot = gr.Image(label="🎯 Emotion Probability Radar", type="pil")
+
+    # Set up the prediction function
+    predict_btn.click(
+        fn=predict_emotion,
+        inputs=[audio_input],
+        outputs=[predicted_emotion, emotion_probs, mel_spec_plot, polar_plot, waveform_plot]
+    )
+
+    # Also allow automatic prediction when audio is uploaded
+    audio_input.change(
+        fn=predict_emotion,
+        inputs=[audio_input],
+        outputs=[predicted_emotion, emotion_probs, mel_spec_plot, polar_plot, waveform_plot]
+    )
 
-# Gradio interface
-iface = gr.Interface(
-    fn=predict_emotion,
-    inputs=gr.Audio(type="filepath"),
-    outputs=[
-        gr.Text(label="Predicted Emotion"),
-        gr.Label(label="Emotion Probabilities")
-    ],
-    title="🎤 Emotion Recognition from Audio",
-    description="Upload or record audio to identify the emotion being expressed."
-)
 
-
+# Launch the interface
+if __name__ == "__main__":
+    iface.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_error=True
+    )
requirements.txt CHANGED
@@ -1,5 +1,30 @@
-
-
-
-
-
+
+# Core ML and Audio Processing
+tensorflow>=2.10.0,<2.16.0
+librosa>=0.10.0
+numpy>=1.21.0,<1.25.0
+scikit-learn>=1.1.0
+
+# Audio file handling
+soundfile>=0.12.0
+audioread>=3.0.0
+
+# Visualization
+matplotlib>=3.5.0
+seaborn>=0.11.0
+
+# Web Interface
+gradio>=4.0.0
+
+# Data handling
+pandas>=1.5.0
+pickle5>=0.0.12
+
+# Optional audio codecs (recommended for broader format support)
+ffmpeg-python>=0.2.0
+
+# System utilities
+packaging>=21.0
+
+# For better performance (optional but recommended)
+numba>=0.56.0
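
Reviewer note (not part of the commit): pickle5 backports pickle protocol 5 to Python versions before 3.8, but gradio>=4.0.0 already requires Python 3.8+, where the standard-library pickle provides protocol 5, so that pin is likely redundant. A minimal sketch of the check, assuming a Python 3.8+ runtime:

    import pickle
    import sys

    # On Python >= 3.8 the stdlib already speaks pickle protocol 5,
    # so the pickle5 backport pinned above should be redundant.
    assert sys.version_info >= (3, 8)
    assert pickle.HIGHEST_PROTOCOL >= 5

    # app.py loads the encoder with the stdlib module either way:
    with open("new_label_encoder.pkl", "rb") as f:
        label_encoder = pickle.load(f)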