"""Gradio app for speech emotion recognition.

Extracts audio features with librosa, classifies them with a pre-trained
Keras ANN, and visualizes the prediction with matplotlib plots.
"""

import os
import pickle
import warnings
from io import BytesIO

# TensorFlow reads these at import time, so set them before importing it.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

import numpy as np
import librosa
import librosa.display
import matplotlib
matplotlib.use('Agg')  # non-interactive backend; select it before importing pyplot
import matplotlib.pyplot as plt
import tensorflow as tf
import gradio as gr
from PIL import Image

# Silence sklearn UserWarnings (e.g. version mismatches when unpickling the encoder)
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

# Load the trained classifier and the label encoder fitted during training
model = tf.keras.models.load_model("ann_new_emotion_recognition_model.h5", compile=False)
with open("new_label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)
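

# extract_features() summarizes a clip as a fixed-length vector: 20 MFCC means,
# 12 chroma bins, 7 spectral-contrast bands, plus zero-crossing rate, spectral
# centroid, roll-off and RMS energy, padded or truncated to `max_len` values
# before being fed to the model.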
def extract_features(audio, sr, max_len=40):
    # MFCCs: timbral envelope (mean over time of 20 coefficients)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=20)
    mfccs = np.mean(mfccs.T, axis=0)

    # Chroma: energy per pitch class (12 values)
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
    chroma = np.mean(chroma.T, axis=0)

    # Spectral contrast: peak-to-valley energy per frequency band (7 values)
    contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
    contrast = np.mean(contrast.T, axis=0)

    # Zero-crossing rate, spectral centroid, roll-off and RMS energy (1 value each)
    zcr = librosa.feature.zero_crossing_rate(y=audio)
    zcr = np.mean(zcr.T, axis=0)

    centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)
    centroid = np.mean(centroid.T, axis=0)

    rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr, roll_percent=0.85)
    rolloff = np.mean(rolloff.T, axis=0)

    rms = librosa.feature.rms(y=audio)
    rms = np.mean(rms.T, axis=0)

    features = np.concatenate([mfccs, chroma, contrast, zcr, centroid, rolloff, rms])

    # Pad or truncate to the fixed length expected by the classifier
    if len(features) < max_len:
        features = np.pad(features, (0, max_len - len(features)), mode='constant')
    else:
        features = features[:max_len]
    return features
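

# Visualization helpers: each renders a matplotlib figure to an in-memory PNG
# buffer and returns it as a PIL image for display in the Gradio UI.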
def create_mel_spectrogram(audio, sr):
    """Create mel spectrogram plot"""
    plt.figure(figsize=(10, 4))
    S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128, fmax=8000)
    S_dB = librosa.power_to_db(S, ref=np.max)
    librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel Spectrogram')
    plt.tight_layout()

    buf = BytesIO()
    plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
    buf.seek(0)
    img = Image.open(buf)
    plt.close()
    return img


def create_polar_plot(emotion_probabilities):
    """Create polar plot of emotion probabilities"""
    emotions = list(emotion_probabilities.keys())
    probabilities = [prob * 100 for prob in emotion_probabilities.values()]

    # Evenly space the emotions around the circle and repeat the first point
    # so the radar polygon closes on itself
    angles = np.linspace(0, 2 * np.pi, len(emotions), endpoint=False).tolist()
    angles += angles[:1]
    probabilities += probabilities[:1]

    fig, ax = plt.subplots(figsize=(4, 4), subplot_kw=dict(projection='polar'))
    ax.fill(angles, probabilities, color='skyblue', alpha=0.4)
    ax.plot(angles, probabilities, color='blue', linewidth=2, marker='o')

    ax.set_yticks([20, 40, 60, 80, 100])
    ax.set_yticklabels(["20%", "40%", "60%", "80%", "100%"], color="gray", fontsize=10)
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(emotions, fontsize=12, color="darkblue")
    ax.set_ylim(0, 100)
    ax.set_title("Emotion Probabilities", va='bottom', fontsize=14, color="darkblue", pad=20)
    plt.tight_layout()

    buf = BytesIO()
    plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
    buf.seek(0)
    img = Image.open(buf)
    plt.close()
    return img


def create_waveform_plot(audio, sr):
    """Create waveform plot"""
    plt.figure(figsize=(12, 4))
    librosa.display.waveshow(audio, sr=sr)
    plt.title('Audio Waveform')
    plt.xlabel('Time (seconds)')
    plt.ylabel('Amplitude')
    plt.tight_layout()

    buf = BytesIO()
    plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
    buf.seek(0)
    img = Image.open(buf)
    plt.close()
    return img
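

# Inference pipeline: load the uploaded/recorded audio, extract the feature
# vector, run the classifier, and build the plots shown in the interface.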
def predict_emotion(audio_file):
    try:
        # Load at the file's native sampling rate
        audio_np, sr = librosa.load(audio_file, sr=None, res_type='kaiser_fast')

        features = extract_features(audio_np, sr)
        features = np.expand_dims(features, axis=0)  # add batch dimension

        predictions = model.predict(features, verbose=0)
        predicted_class = np.argmax(predictions[0])
        predicted_emotion = label_encoder.inverse_transform([predicted_class])[0]

        # Map every class index back to its emotion label
        emotion_probabilities = {
            label_encoder.inverse_transform([i])[0]: round(float(pred), 4)
            for i, pred in enumerate(predictions[0])
        }

        mel_spec_plot = create_mel_spectrogram(audio_np, sr)
        polar_plot = create_polar_plot(emotion_probabilities)
        waveform_plot = create_waveform_plot(audio_np, sr)

        # gr.Label shows a dict of 0-1 confidences as percentages
        emotion_probabilities_display = {
            emotion: round(prob, 2)
            for emotion, prob in emotion_probabilities.items()
        }

        return (
            predicted_emotion,
            emotion_probabilities_display,
            mel_spec_plot,
            polar_plot,
            waveform_plot
        )

    except Exception as e:
        error_msg = f"Error processing audio: {str(e)}"
        return error_msg, {}, None, None, None
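

# Gradio interface: prediction runs when the button is clicked and also
# whenever the audio input changes (new upload or recording).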
with gr.Blocks(title="🎤 Emotion Recognition from Audio", theme=gr.themes.Soft()) as iface:
    gr.Markdown(
        """
        # 🎤 Emotion Recognition from Audio
        Upload or record an audio file to analyze the emotional content and view detailed visualizations.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                label="Upload or Record Audio",
                type="filepath",
                sources=["upload", "microphone"]
            )

            predict_btn = gr.Button("Analyze Emotion", variant="primary", size="lg")

        with gr.Column(scale=1):
            predicted_emotion = gr.Text(label="🎯 Predicted Emotion", interactive=False)
            emotion_probs = gr.Label(label="Emotion Probabilities (%)", num_top_classes=10)

    with gr.Row():
        with gr.Column():
            waveform_plot = gr.Image(label="Audio Waveform", type="pil")
        with gr.Column():
            mel_spec_plot = gr.Image(label="🎵 Mel Spectrogram", type="pil")

    with gr.Row():
        polar_plot = gr.Image(label="🎯 Emotion Probability Radar", type="pil")

    predict_btn.click(
        fn=predict_emotion,
        inputs=[audio_input],
        outputs=[predicted_emotion, emotion_probs, mel_spec_plot, polar_plot, waveform_plot]
    )

    audio_input.change(
        fn=predict_emotion,
        inputs=[audio_input],
        outputs=[predicted_emotion, emotion_probs, mel_spec_plot, polar_plot, waveform_plot]
    )

    gr.Markdown(
        """
        ### How it works:
        1. **Upload** an audio file or **record** directly using your microphone
        2. The system extracts audio features (MFCCs, Chroma, Spectral features, etc.)
        3. A trained neural network predicts the emotion
        4. View the results with detailed visualizations:
            - **Waveform**: Shows the audio signal over time
            - **Mel Spectrogram**: Visual representation of the audio's frequency content
            - **Radar Chart**: Probability distribution across all emotion categories

        ### Supported Emotions:
        Depending on your model training, this may include emotions such as Happy, Sad, Angry, Fear, Disgust, Surprise, Neutral, and others.
        """
    )


if __name__ == "__main__":
    # Listen on all interfaces (useful in containers) and serve on port 7860
    iface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )