manikanta2026 committed
Commit · 8287fdb
Parent(s): 3f0edb4
changes3

Browse files:
- app.py +182 -27
- requirements.txt +30 -5
app.py CHANGED
@@ -1,12 +1,20 @@
 import os
 import numpy as np
 import librosa
+import librosa.display
 import pickle
 import tensorflow as tf
 import gradio as gr
+import matplotlib.pyplot as plt
+import matplotlib
+matplotlib.use('Agg')  # Use non-interactive backend
+from io import BytesIO
+import warnings
 
-#
-
+# Suppress warnings and logs
+warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
 
 # Load model and label encoder
 model = tf.keras.models.load_model("ann_new_emotion_recognition_model.h5", compile=False)
@@ -14,56 +22,203 @@ with open("new_label_encoder.pkl", "rb") as f:
     label_encoder = pickle.load(f)
 
 def extract_features(audio, sr, max_len=40):
+    # Extract MFCCs
     mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=20)
     mfccs = np.mean(mfccs.T, axis=0)
+
+    # Extract Chroma
     chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
     chroma = np.mean(chroma.T, axis=0)
+
+    # Extract Spectral Contrast
     contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
     contrast = np.mean(contrast.T, axis=0)
+
+    # Extract Zero-Crossing Rate
     zcr = librosa.feature.zero_crossing_rate(y=audio)
     zcr = np.mean(zcr.T, axis=0)
+
+    # Extract Spectral Centroid
     centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)
     centroid = np.mean(centroid.T, axis=0)
-
+
+    # Extract Spectral Rolloff
+    rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr, roll_percent=0.85)
     rolloff = np.mean(rolloff.T, axis=0)
+
+    # Extract RMS Energy
     rms = librosa.feature.rms(y=audio)
     rms = np.mean(rms.T, axis=0)
 
     features = np.concatenate([mfccs, chroma, contrast, zcr, centroid, rolloff, rms])
+
+    # Pad or trim to fixed length
     if len(features) < max_len:
         features = np.pad(features, (0, max_len - len(features)), mode='constant')
     else:
         features = features[:max_len]
     return features
 
+def create_mel_spectrogram(audio, sr):
+    """Create mel spectrogram plot"""
+    plt.figure(figsize=(10, 4))
+    S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128, fmax=8000)
+    S_dB = librosa.power_to_db(S, ref=np.max)
+    librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel')
+    plt.colorbar(format='%+2.0f dB')
+    plt.title('Mel Spectrogram')
+    plt.tight_layout()
+
+    # Save to BytesIO and return the plot
+    buf = BytesIO()
+    plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
+    buf.seek(0)
+    plt.close()
+    return buf
+
+def create_polar_plot(emotion_probabilities):
+    """Create polar plot of emotion probabilities"""
+    emotions = list(emotion_probabilities.keys())
+    probabilities = [prob * 100 for prob in emotion_probabilities.values()]  # Convert to percentages
+
+    # Prepare data for polar plot
+    angles = np.linspace(0, 2 * np.pi, len(emotions), endpoint=False).tolist()
+    angles += angles[:1]  # Complete the circle
+    probabilities += probabilities[:1]  # Complete the circle
+
+    # Create polar plot
+    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(projection='polar'))
+    ax.fill(angles, probabilities, color='skyblue', alpha=0.4)
+    ax.plot(angles, probabilities, color='blue', linewidth=2, marker='o')
+
+    # Customize the plot
+    ax.set_yticks([20, 40, 60, 80, 100])
+    ax.set_yticklabels(["20%", "40%", "60%", "80%", "100%"], color="gray", fontsize=10)
+    ax.set_xticks(angles[:-1])
+    ax.set_xticklabels(emotions, fontsize=12, color="darkblue")
+    ax.set_ylim(0, 100)
+
+    ax.set_title("Emotion Probabilities", va='bottom', fontsize=14, color="darkblue", pad=20)
+    plt.tight_layout()
+
+    # Save to BytesIO and return the plot
+    buf = BytesIO()
+    plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
+    buf.seek(0)
+    plt.close()
+    return buf
+
+def create_waveform_plot(audio, sr):
+    """Create waveform plot"""
+    plt.figure(figsize=(12, 4))
+    librosa.display.waveshow(audio, sr=sr)
+    plt.title('Audio Waveform')
+    plt.xlabel('Time (seconds)')
+    plt.ylabel('Amplitude')
+    plt.tight_layout()
+
+    # Save to BytesIO and return the plot
+    buf = BytesIO()
+    plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
+    buf.seek(0)
+    plt.close()
+    return buf
+
 def predict_emotion(audio_file):
-
-
-
+    try:
+        # Load audio file
+        audio_np, sr = librosa.load(audio_file, sr=None, res_type='kaiser_fast')
+
+        # Extract features
+        features = extract_features(audio_np, sr)
+        features = np.expand_dims(features, axis=0)
 
-
-
-
+        # Make prediction
+        predictions = model.predict(features, verbose=0)
+        predicted_class = np.argmax(predictions[0])
+        predicted_emotion = label_encoder.inverse_transform([predicted_class])[0]
 
-
-
-
-
-
+        # Calculate emotion probabilities (as percentages for display)
+        emotion_probabilities = {
+            label_encoder.inverse_transform([i])[0]: round(float(pred), 4)
+            for i, pred in enumerate(predictions[0])
+        }
+
+        # Create visualizations
+        mel_spec_plot = create_mel_spectrogram(audio_np, sr)
+        polar_plot = create_polar_plot(emotion_probabilities)
+        waveform_plot = create_waveform_plot(audio_np, sr)
+
+        # Convert probabilities to percentages for better display
+        emotion_probabilities_percent = {
+            emotion: round(prob * 100, 2)
+            for emotion, prob in emotion_probabilities.items()
+        }
 
-
+        return (
+            predicted_emotion,
+            emotion_probabilities_percent,
+            mel_spec_plot,
+            polar_plot,
+            waveform_plot
+        )
+
+    except Exception as e:
+        error_msg = f"Error processing audio: {str(e)}"
+        return error_msg, {}, None, None, None
 
+# Create Gradio interface
+with gr.Blocks(title="🎤 Emotion Recognition from Audio", theme=gr.themes.Soft()) as iface:
+    gr.Markdown(
+        """
+        # 🎤 Emotion Recognition from Audio
+        Upload or record an audio file to analyze the emotional content and view detailed visualizations.
+        """
+    )
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            audio_input = gr.Audio(
+                label="Upload or Record Audio",
+                type="filepath",
+                sources=["upload", "microphone"]
+            )
+
+            predict_btn = gr.Button("🔍 Analyze Emotion", variant="primary", size="lg")
+
+        with gr.Column(scale=1):
+            predicted_emotion = gr.Text(label="🎯 Predicted Emotion", interactive=False)
+            emotion_probs = gr.Label(label="📊 Emotion Probabilities (%)", num_top_classes=10)
+
+    with gr.Row():
+        with gr.Column():
+            waveform_plot = gr.Image(label="📈 Audio Waveform", type="pil")
+        with gr.Column():
+            mel_spec_plot = gr.Image(label="🎵 Mel Spectrogram", type="pil")
+
+    with gr.Row():
+        polar_plot = gr.Image(label="🎯 Emotion Probability Radar", type="pil")
+
+    # Set up the prediction function
+    predict_btn.click(
+        fn=predict_emotion,
+        inputs=[audio_input],
+        outputs=[predicted_emotion, emotion_probs, mel_spec_plot, polar_plot, waveform_plot]
+    )
+
+    # Also allow automatic prediction when audio is uploaded
+    audio_input.change(
+        fn=predict_emotion,
+        inputs=[audio_input],
+        outputs=[predicted_emotion, emotion_probs, mel_spec_plot, polar_plot, waveform_plot]
+    )
 
-# Gradio interface
-iface = gr.Interface(
-    fn=predict_emotion,
-    inputs=gr.Audio(type="filepath"),
-    outputs=[
-        gr.Text(label="Predicted Emotion"),
-        gr.Label(label="Emotion Probabilities")
-    ],
-    title="🎤 Emotion Recognition from Audio",
-    description="Upload or record audio to identify the emotion being expressed."
-)
 
-
+# Launch the interface
+if __name__ == "__main__":
+    iface.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_error=True
+    )
requirements.txt CHANGED
@@ -1,5 +1,30 @@
-
-
-
-
-
+
+# Core ML and Audio Processing
+tensorflow>=2.10.0,<2.16.0
+librosa>=0.10.0
+numpy>=1.21.0,<1.25.0
+scikit-learn>=1.1.0
+
+# Audio file handling
+soundfile>=0.12.0
+audioread>=3.0.0
+
+# Visualization
+matplotlib>=3.5.0
+seaborn>=0.11.0
+
+# Web Interface
+gradio>=4.0.0
+
+# Data handling
+pandas>=1.5.0
+pickle5>=0.0.12
+
+# Optional audio codecs (recommended for broader format support)
+ffmpeg-python>=0.2.0
+
+# System utilities
+packaging>=21.0
+
+# For better performance (optional but recommended)
+numba>=0.56.0
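
Reviewer note (not part of the commit): pickle5 backports pickle protocol 5 to Python versions before 3.8, but gradio>=4.0.0 already requires Python 3.8+, where the standard-library pickle provides protocol 5, so that pin is likely redundant. A minimal sketch of the check, assuming a Python 3.8+ runtime:

    import pickle
    import sys

    # On Python >= 3.8 the stdlib already speaks pickle protocol 5,
    # so the pickle5 backport pinned above should be redundant.
    assert sys.version_info >= (3, 8)
    assert pickle.HIGHEST_PROTOCOL >= 5

    # app.py loads the encoder with the stdlib module either way:
    with open("new_label_encoder.pkl", "rb") as f:
        label_encoder = pickle.load(f)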