import os
import numpy as np
import librosa
import librosa.display
import pickle
import tensorflow as tf
import gradio as gr
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg') # Use non-interactive backend
from io import BytesIO
from PIL import Image
import warnings
# Suppress warnings and logs
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
# Load model and label encoder
model = tf.keras.models.load_model("ann_new_emotion_recognition_model.h5", compile=False)
with open("new_label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)
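# Note: the .h5 model and the pickled LabelEncoder are loaded by relative path,
# so both files must be present in the working directory the app is launched from.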
def extract_features(audio, sr, max_len=40):
    # Extract MFCCs
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=20)
    mfccs = np.mean(mfccs.T, axis=0)
    # Extract Chroma
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
    chroma = np.mean(chroma.T, axis=0)
    # Extract Spectral Contrast
    contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
    contrast = np.mean(contrast.T, axis=0)
    # Extract Zero-Crossing Rate
    zcr = librosa.feature.zero_crossing_rate(y=audio)
    zcr = np.mean(zcr.T, axis=0)
    # Extract Spectral Centroid
    centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)
    centroid = np.mean(centroid.T, axis=0)
    # Extract Spectral Rolloff
    rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr, roll_percent=0.85)
    rolloff = np.mean(rolloff.T, axis=0)
    # Extract RMS Energy
    rms = librosa.feature.rms(y=audio)
    rms = np.mean(rms.T, axis=0)
    features = np.concatenate([mfccs, chroma, contrast, zcr, centroid, rolloff, rms])
    # Pad or trim to fixed length
    if len(features) < max_len:
        features = np.pad(features, (0, max_len - len(features)), mode='constant')
    else:
        features = features[:max_len]
    return features
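# Note on dimensionality: with librosa defaults, the concatenated vector above has
# 20 (MFCC) + 12 (chroma) + 7 (spectral contrast) + 1 (ZCR) + 1 (centroid)
# + 1 (rolloff) + 1 (RMS) = 43 values, which the pad/trim step cuts to max_len=40,
# presumably matching the ANN's expected input size.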
def create_mel_spectrogram(audio, sr):
"""Create mel spectrogram plot"""
plt.figure(figsize=(10, 4))
S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128, fmax=8000)
S_dB = librosa.power_to_db(S, ref=np.max)
librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel Spectrogram')
plt.tight_layout()
# Save to BytesIO and convert to PIL Image
buf = BytesIO()
plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
buf.seek(0)
img = Image.open(buf)
plt.close()
return img
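# Caveat: fmax=8000 assumes the input sample rate is at least 16 kHz; for
# lower-rate recordings librosa will warn about empty mel filters above the
# Nyquist frequency.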
def create_polar_plot(emotion_probabilities):
"""Create polar plot of emotion probabilities"""
emotions = list(emotion_probabilities.keys())
probabilities = [prob * 100 for prob in emotion_probabilities.values()] # Convert to percentages
# Prepare data for polar plot
angles = np.linspace(0, 2 * np.pi, len(emotions), endpoint=False).tolist()
angles += angles[:1] # Complete the circle
probabilities += probabilities[:1] # Complete the circle
# Create polar plot
fig, ax = plt.subplots(figsize=(4, 4), subplot_kw=dict(projection='polar'))
ax.fill(angles, probabilities, color='skyblue', alpha=0.4)
ax.plot(angles, probabilities, color='blue', linewidth=2, marker='o')
# Customize the plot
ax.set_yticks([20, 40, 60, 80, 100])
ax.set_yticklabels(["20%", "40%", "60%", "80%", "100%"], color="gray", fontsize=10)
ax.set_xticks(angles[:-1])
ax.set_xticklabels(emotions, fontsize=12, color="darkblue")
ax.set_ylim(0, 100)
ax.set_title("Emotion Probabilities", va='bottom', fontsize=14, color="darkblue", pad=20)
plt.tight_layout()
# Save to BytesIO and convert to PIL Image
buf = BytesIO()
plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
buf.seek(0)
img = Image.open(buf)
plt.close()
return img
def create_waveform_plot(audio, sr):
"""Create waveform plot"""
plt.figure(figsize=(12, 4))
librosa.display.waveshow(audio, sr=sr)
plt.title('Audio Waveform')
plt.xlabel('Time (seconds)')
plt.ylabel('Amplitude')
plt.tight_layout()
# Save to BytesIO and convert to PIL Image
buf = BytesIO()
plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
buf.seek(0)
img = Image.open(buf)
plt.close()
return img
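# The three plotting helpers above share the same figure-to-PIL conversion.
# A small helper along these lines (a sketch, not wired into the code above)
# could remove that duplication:
def _figure_to_pil(dpi=150):
    """Render the current Matplotlib figure to a PIL Image and close it."""
    buf = BytesIO()
    plt.savefig(buf, format='png', dpi=dpi, bbox_inches='tight')
    plt.close()
    buf.seek(0)
    return Image.open(buf)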
def predict_emotion(audio_file):
    try:
        # Load audio file (sr=None keeps the original sample rate)
        audio_np, sr = librosa.load(audio_file, sr=None, res_type='kaiser_fast')
        # Extract features
        features = extract_features(audio_np, sr)
        features = np.expand_dims(features, axis=0)
        # Make prediction
        predictions = model.predict(features, verbose=0)
        predicted_class = np.argmax(predictions[0])
        predicted_emotion = label_encoder.inverse_transform([predicted_class])[0]
        # Map each class label to its predicted probability (0-1)
        emotion_probabilities = {
            label_encoder.inverse_transform([i])[0]: round(float(pred), 4)
            for i, pred in enumerate(predictions[0])
        }
        # Create visualizations
        mel_spec_plot = create_mel_spectrogram(audio_np, sr)
        polar_plot = create_polar_plot(emotion_probabilities)
        waveform_plot = create_waveform_plot(audio_np, sr)
        # Round probabilities for display; gr.Label renders 0-1 confidences as percentages
        emotion_probabilities_percent = {
            emotion: round(prob, 2)
            for emotion, prob in emotion_probabilities.items()
        }
        return (
            predicted_emotion,
            emotion_probabilities_percent,
            mel_spec_plot,
            polar_plot,
            waveform_plot
        )
    except Exception as e:
        error_msg = f"Error processing audio: {str(e)}"
        return error_msg, {}, None, None, None
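# The five return values above must stay in the same order as the `outputs`
# lists passed to the click/change handlers below.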
# Create Gradio interface
with gr.Blocks(title="🎤 Emotion Recognition from Audio", theme=gr.themes.Soft()) as iface:
    gr.Markdown(
        """
        # 🎤 Emotion Recognition from Audio
        Upload or record an audio file to analyze its emotional content and view detailed visualizations.
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                label="Upload or Record Audio",
                type="filepath",
                sources=["upload", "microphone"]
            )
            predict_btn = gr.Button("🔍 Analyze Emotion", variant="primary", size="lg")
        with gr.Column(scale=1):
            predicted_emotion = gr.Text(label="🎯 Predicted Emotion", interactive=False)
            emotion_probs = gr.Label(label="📊 Emotion Probabilities (%)", num_top_classes=10)
    with gr.Row():
        with gr.Column():
            waveform_plot = gr.Image(label="🌊 Audio Waveform", type="pil")
        with gr.Column():
            mel_spec_plot = gr.Image(label="🎵 Mel Spectrogram", type="pil")
    with gr.Row():
        polar_plot = gr.Image(label="🎯 Emotion Probability Radar", type="pil")
    # Set up the prediction function
    predict_btn.click(
        fn=predict_emotion,
        inputs=[audio_input],
        outputs=[predicted_emotion, emotion_probs, mel_spec_plot, polar_plot, waveform_plot]
    )
    # Also run the prediction automatically when audio is uploaded or recorded
    audio_input.change(
        fn=predict_emotion,
        inputs=[audio_input],
        outputs=[predicted_emotion, emotion_probs, mel_spec_plot, polar_plot, waveform_plot]
    )
    gr.Markdown(
        """
        ### 📝 How it works:
        1. **Upload** an audio file or **record** directly with your microphone
        2. The system extracts audio features (MFCCs, chroma, spectral features, etc.)
        3. A trained neural network predicts the emotion
        4. View the results with detailed visualizations:
           - **Waveform**: the audio signal over time
           - **Mel Spectrogram**: a visual representation of the audio's frequency content
           - **Radar Chart**: the probability distribution across all emotion categories
        ### 🎭 Supported Emotions:
        Depending on how the model was trained, these may include Happy, Sad, Angry, Fear, Disgust, Surprise, and Neutral, among others.
        """
    )
# Launch the interface
if __name__ == "__main__":
    iface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )
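# To try it locally (assuming this file is saved as app.py):
#   python app.py
# then open http://localhost:7860 in a browser.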