manikanta2026 committed
Commit 8287fdb · 1 Parent(s): 3f0edb4
Files changed (2)
  1. app.py +182 -27
  2. requirements.txt +30 -5
app.py CHANGED
@@ -1,12 +1,20 @@
import os
import numpy as np
import librosa
+ import librosa.display
import pickle
import tensorflow as tf
import gradio as gr
+ import matplotlib.pyplot as plt
+ import matplotlib
+ matplotlib.use('Agg') # Use non-interactive backend
+ from io import BytesIO
+ import warnings

- # Optional: Suppress TensorFlow logging for cleaner output
- os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+ # Suppress warnings and logs
+ warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+ os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

# Load model and label encoder
model = tf.keras.models.load_model("ann_new_emotion_recognition_model.h5", compile=False)
@@ -14,56 +22,203 @@ with open("new_label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)

def extract_features(audio, sr, max_len=40):
+     # Extract MFCCs
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=20)
    mfccs = np.mean(mfccs.T, axis=0)
+
+     # Extract Chroma
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
    chroma = np.mean(chroma.T, axis=0)
+
+     # Extract Spectral Contrast
    contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
    contrast = np.mean(contrast.T, axis=0)
+
+     # Extract Zero-Crossing Rate
    zcr = librosa.feature.zero_crossing_rate(y=audio)
    zcr = np.mean(zcr.T, axis=0)
+
+     # Extract Spectral Centroid
    centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)
    centroid = np.mean(centroid.T, axis=0)
-     rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)
+
+     # Extract Spectral Rolloff
+     rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr, roll_percent=0.85)
    rolloff = np.mean(rolloff.T, axis=0)
+
+     # Extract RMS Energy
    rms = librosa.feature.rms(y=audio)
    rms = np.mean(rms.T, axis=0)

    features = np.concatenate([mfccs, chroma, contrast, zcr, centroid, rolloff, rms])
+
+     # Pad or trim to fixed length
    if len(features) < max_len:
        features = np.pad(features, (0, max_len - len(features)), mode='constant')
    else:
        features = features[:max_len]
    return features

+ def create_mel_spectrogram(audio, sr):
+     """Create mel spectrogram plot"""
+     plt.figure(figsize=(10, 4))
+     S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128, fmax=8000)
+     S_dB = librosa.power_to_db(S, ref=np.max)
+     librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel')
+     plt.colorbar(format='%+2.0f dB')
+     plt.title('Mel Spectrogram')
+     plt.tight_layout()
+
+     # Save to BytesIO and return the plot
+     buf = BytesIO()
+     plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
+     buf.seek(0)
+     plt.close()
+     return buf
+
+ def create_polar_plot(emotion_probabilities):
+     """Create polar plot of emotion probabilities"""
+     emotions = list(emotion_probabilities.keys())
+     probabilities = [prob * 100 for prob in emotion_probabilities.values()] # Convert to percentages
+
+     # Prepare data for polar plot
+     angles = np.linspace(0, 2 * np.pi, len(emotions), endpoint=False).tolist()
+     angles += angles[:1] # Complete the circle
+     probabilities += probabilities[:1] # Complete the circle
+
+     # Create polar plot
+     fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(projection='polar'))
+     ax.fill(angles, probabilities, color='skyblue', alpha=0.4)
+     ax.plot(angles, probabilities, color='blue', linewidth=2, marker='o')
+
+     # Customize the plot
+     ax.set_yticks([20, 40, 60, 80, 100])
+     ax.set_yticklabels(["20%", "40%", "60%", "80%", "100%"], color="gray", fontsize=10)
+     ax.set_xticks(angles[:-1])
+     ax.set_xticklabels(emotions, fontsize=12, color="darkblue")
+     ax.set_ylim(0, 100)
+
+     ax.set_title("Emotion Probabilities", va='bottom', fontsize=14, color="darkblue", pad=20)
+     plt.tight_layout()
+
+     # Save to BytesIO and return the plot
+     buf = BytesIO()
+     plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
+     buf.seek(0)
+     plt.close()
+     return buf
+
+ def create_waveform_plot(audio, sr):
+     """Create waveform plot"""
+     plt.figure(figsize=(12, 4))
+     librosa.display.waveshow(audio, sr=sr)
+     plt.title('Audio Waveform')
+     plt.xlabel('Time (seconds)')
+     plt.ylabel('Amplitude')
+     plt.tight_layout()
+
+     # Save to BytesIO and return the plot
+     buf = BytesIO()
+     plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
+     buf.seek(0)
+     plt.close()
+     return buf
+
def predict_emotion(audio_file):
-     audio_np, sr = librosa.load(audio_file, sr=None)
-     features = extract_features(audio_np, sr)
-     features = np.expand_dims(features, axis=0)
+     try:
+         # Load audio file
+         audio_np, sr = librosa.load(audio_file, sr=None, res_type='kaiser_fast')
+
+         # Extract features
+         features = extract_features(audio_np, sr)
+         features = np.expand_dims(features, axis=0)

-     predictions = model.predict(features, verbose=0)
-     predicted_class = np.argmax(predictions[0])
-     predicted_emotion = label_encoder.inverse_transform([predicted_class])[0]
+         # Make prediction
+         predictions = model.predict(features, verbose=0)
+         predicted_class = np.argmax(predictions[0])
+         predicted_emotion = label_encoder.inverse_transform([predicted_class])[0]

-     # Output confidences as floats (0-1), rounded to 4 decimals
-     emotion_probabilities = {
-         label_encoder.inverse_transform([i])[0]: round(float(pred), 4)
-         for i, pred in enumerate(predictions[0])
-     }
+         # Calculate emotion probabilities (as percentages for display)
+         emotion_probabilities = {
+             label_encoder.inverse_transform([i])[0]: round(float(pred), 4)
+             for i, pred in enumerate(predictions[0])
+         }
+
+         # Create visualizations
+         mel_spec_plot = create_mel_spectrogram(audio_np, sr)
+         polar_plot = create_polar_plot(emotion_probabilities)
+         waveform_plot = create_waveform_plot(audio_np, sr)
+
+         # Convert probabilities to percentages for better display
+         emotion_probabilities_percent = {
+             emotion: round(prob * 100, 2)
+             for emotion, prob in emotion_probabilities.items()
+         }

-     return predicted_emotion, emotion_probabilities
+         return (
+             predicted_emotion,
+             emotion_probabilities_percent,
+             mel_spec_plot,
+             polar_plot,
+             waveform_plot
+         )
+
+     except Exception as e:
+         error_msg = f"Error processing audio: {str(e)}"
+         return error_msg, {}, None, None, None

+ # Create Gradio interface
+ with gr.Blocks(title="🎀 Emotion Recognition from Audio", theme=gr.themes.Soft()) as iface:
+     gr.Markdown(
+         """
+         # 🎀 Emotion Recognition from Audio
+         Upload or record an audio file to analyze the emotional content and view detailed visualizations.
+         """
+     )
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             audio_input = gr.Audio(
+                 label="Upload or Record Audio",
+                 type="filepath",
+                 sources=["upload", "microphone"]
+             )
+
+             predict_btn = gr.Button("🔍 Analyze Emotion", variant="primary", size="lg")
+
+         with gr.Column(scale=1):
+             predicted_emotion = gr.Text(label="🎯 Predicted Emotion", interactive=False)
+             emotion_probs = gr.Label(label="📊 Emotion Probabilities (%)", num_top_classes=10)
+
+     with gr.Row():
+         with gr.Column():
+             waveform_plot = gr.Image(label="🌊 Audio Waveform", type="pil")
+         with gr.Column():
+             mel_spec_plot = gr.Image(label="🎡 Mel Spectrogram", type="pil")
+
+     with gr.Row():
+         polar_plot = gr.Image(label="🎯 Emotion Probability Radar", type="pil")
+
+     # Set up the prediction function
+     predict_btn.click(
+         fn=predict_emotion,
+         inputs=[audio_input],
+         outputs=[predicted_emotion, emotion_probs, mel_spec_plot, polar_plot, waveform_plot]
+     )
+
+     # Also allow automatic prediction when audio is uploaded
+     audio_input.change(
+         fn=predict_emotion,
+         inputs=[audio_input],
+         outputs=[predicted_emotion, emotion_probs, mel_spec_plot, polar_plot, waveform_plot]
+     )

- # Gradio interface
- iface = gr.Interface(
-     fn=predict_emotion,
-     inputs=gr.Audio(type="filepath"),
-     outputs=[
-         gr.Text(label="Predicted Emotion"),
-         gr.Label(label="Emotion Probabilities")
-     ],
-     title="🎀 Emotion Recognition from Audio",
-     description="Upload or record audio to identify the emotion being expressed."
- )

- iface.launch()
+ # Launch the interface
+ if __name__ == "__main__":
+     iface.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False,
+         show_error=True
+     )
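A note on the max_len=40 pad-or-trim step that is kept unchanged in extract_features: under librosa's defaults (12 chroma bins, and 6 contrast bands plus one overall band from spectral_contrast), the concatenated feature vector comes out to 43 values, so the trim branch is the one that normally fires and the last three values are dropped. A minimal sketch of that arithmetic, assuming those defaults:

# Per-block sizes under librosa defaults (assumed; the diff only pins n_mfcc=20 explicitly)
feature_dims = {
    "mfcc": 20,      # n_mfcc=20 passed in the call above
    "chroma": 12,    # chroma_stft default n_chroma=12
    "contrast": 7,   # spectral_contrast default n_bands=6 -> 6 bands + 1 overall
    "zcr": 1, "centroid": 1, "rolloff": 1, "rms": 1,
}
total = sum(feature_dims.values())       # 43
max_len = 40
print(total, "->", min(total, max_len))  # 43 -> 40: truncated to max_len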
requirements.txt CHANGED
@@ -1,5 +1,30 @@
- tensorflow
- librosa
- gradio
- numpy
- scikit-learn
+
+ # Core ML and Audio Processing
+ tensorflow>=2.10.0,<2.16.0
+ librosa>=0.10.0
+ numpy>=1.21.0,<1.25.0
+ scikit-learn>=1.1.0
+
+ # Audio file handling
+ soundfile>=0.12.0
+ audioread>=3.0.0
+
+ # Visualization
+ matplotlib>=3.5.0
+ seaborn>=0.11.0
+
+ # Web Interface
+ gradio>=4.0.0
+
+ # Data handling
+ pandas>=1.5.0
+ pickle5>=0.0.12
+
+ # Optional audio codecs (recommended for broader format support)
+ ffmpeg-python>=0.2.0
+
+ # System utilities
+ packaging>=21.0
+
+ # For better performance (optional but recommended)
+ numba>=0.56.0
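Since launch() now only runs under the __main__ guard, the updated predict_emotion can be exercised directly for a quick local check. This is a sketch only: it assumes ann_new_emotion_recognition_model.h5 and new_label_encoder.pkl sit next to app.py (they are loaded at import time but are not part of this commit), and sample.wav is a placeholder path.

# Hypothetical smoke test; importing app loads the model but no longer auto-launches Gradio.
from app import predict_emotion

emotion, probs_percent, mel_png, radar_png, wave_png = predict_emotion("sample.wav")
print(emotion)        # one of the labels stored in new_label_encoder.pkl
print(probs_percent)  # {label: percentage rounded to 2 decimals}
# mel_png, radar_png and wave_png are in-memory PNGs (io.BytesIO) rendered by matplotlib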