# app.py
import gradio as gr
import librosa
import numpy as np
import os
import tempfile
from collections import Counter
from speechbrain.inference.interfaces import foreign_class
import matplotlib.pyplot as plt
import librosa.display
import soundfile as sf  # used for writing temporary WAV/PNG artifacts
# Try to import noisereduce (if not available, noise reduction will be skipped)
try:
    import noisereduce as nr
    NOISEREDUCE_AVAILABLE = True
except ImportError:
    NOISEREDUCE_AVAILABLE = False
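
# The app assumes these packages are available in the environment (e.g. via the
# Space's requirements.txt): gradio, librosa, numpy, soundfile, matplotlib,
# speechbrain (which pulls in torch/torchaudio), and optionally noisereduce.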
# Mapping from emotion labels to emojis
emotion_to_emoji = {
    "angry": "😠",
    "happy": "😊",
    "sad": "😢",
    "neutral": "😐",
    "excited": "😃",
    "fear": "😨",
    "disgust": "🤢",
    "surprise": "😲",
    # The IEMOCAP checkpoint typically emits abbreviated labels; map those as well.
    "ang": "😠",
    "hap": "😊",
    "neu": "😐",
}
def add_emoji_to_label(label):
    """Return the label with its emoji; handles labels returned as a one-element list."""
    if isinstance(label, (list, tuple)):
        label = label[0]
    emoji = emotion_to_emoji.get(str(label).lower(), "")
    return f"{str(label).capitalize()} {emoji}"
# Load the pre-trained SpeechBrain classifier (Emotion Recognition with wav2vec2 on IEMOCAP)
classifier = foreign_class(
    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
    run_opts={"device": "cpu"},  # change to {"device": "cuda"} if a GPU is available
)
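
# For reference, classify_file returns a 4-tuple, which is why the calls below
# unpack four values and keep only the text label. A minimal sketch (hypothetical
# file path; exact return types depend on the SpeechBrain custom interface):
#   out_prob, score, index, text_lab = classifier.classify_file("sample.wav")
#   print(text_lab)  # predicted emotion label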
def preprocess_audio(audio_file, apply_noise_reduction=False):
    """
    Load and preprocess the audio file:
      - Convert to 16 kHz mono.
      - Optionally apply noise reduction.
      - Peak-normalize the audio.
    The processed audio is written to a temporary WAV file whose path is returned;
    the caller is responsible for deleting it.
    """
    y, sr = librosa.load(audio_file, sr=16000, mono=True)
    if apply_noise_reduction and NOISEREDUCE_AVAILABLE:
        y = nr.reduce_noise(y=y, sr=sr)
    if np.max(np.abs(y)) > 0:
        y = y / np.max(np.abs(y))
    temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    temp_file.close()
    sf.write(temp_file.name, y, sr)
    return temp_file.name
def ensemble_prediction(audio_file, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
    """
    For long audio files, split the file into overlapping segments, predict the emotion
    for each segment, and return the majority-voted label.
    """
    y, sr = librosa.load(audio_file, sr=16000, mono=True)
    total_duration = librosa.get_duration(y=y, sr=sr)
    # Short clips do not need segmentation; classify them in one pass.
    if total_duration <= segment_duration:
        temp_file = preprocess_audio(audio_file, apply_noise_reduction)
        _, _, _, label = classifier.classify_file(temp_file)
        os.remove(temp_file)
        return label
    # Slide a window of segment_duration seconds; keep the step positive even if
    # the requested overlap equals or exceeds the segment duration.
    step = max(segment_duration - overlap, 0.5)
    segments = []
    for start in np.arange(0, total_duration - segment_duration + 0.001, step):
        start_sample = int(start * sr)
        end_sample = int((start + segment_duration) * sr)
        segment_audio = y[start_sample:end_sample]
        temp_seg = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        temp_seg.close()
        sf.write(temp_seg.name, segment_audio, sr)
        segments.append(temp_seg.name)
    # Classify each segment and majority-vote the results.
    predictions = []
    for seg in segments:
        temp_file = preprocess_audio(seg, apply_noise_reduction)
        _, _, _, label = classifier.classify_file(temp_file)
        # classify_file may return the text label wrapped in a list; unwrap it so
        # the labels are hashable for voting.
        if isinstance(label, (list, tuple)):
            label = label[0]
        predictions.append(label)
        os.remove(temp_file)
        os.remove(seg)
    vote = Counter(predictions)
    most_common = vote.most_common(1)[0][0]
    return most_common
def predict_emotion(audio_file, use_ensemble=False, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
    """
    Main prediction function.
      - Uses ensemble prediction if enabled.
      - Otherwise, processes the entire audio in one pass.
    Returns the predicted emotion with an emoji.
    """
    try:
        if use_ensemble:
            label = ensemble_prediction(audio_file, apply_noise_reduction, segment_duration, overlap)
        else:
            temp_file = preprocess_audio(audio_file, apply_noise_reduction)
            _, _, _, label = classifier.classify_file(temp_file)
            os.remove(temp_file)
        return add_emoji_to_label(label)
    except Exception as e:
        return f"Error processing file: {str(e)}"
def plot_waveform(audio_file):
    """
    Generate a waveform plot for the given audio file and return the path to a PNG image.
    """
    y, sr = librosa.load(audio_file, sr=16000, mono=True)
    plt.figure(figsize=(10, 3))
    librosa.display.waveshow(y, sr=sr)
    plt.title("Waveform")
    plt.tight_layout()
    # Save the figure to a temporary PNG file that the Gradio Image component can display.
    temp_img = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
    temp_img.close()
    plt.savefig(temp_img.name, format="png")
    plt.close()
    return temp_img.name
def predict_and_plot(audio_file, use_ensemble, apply_noise_reduction, segment_duration, overlap):
    """
    Predict the emotion and generate the waveform plot.
    Returns a tuple: (emotion label with emoji, waveform image).
    """
    emotion = predict_emotion(audio_file, use_ensemble, apply_noise_reduction, segment_duration, overlap)
    waveform = plot_waveform(audio_file)
    return emotion, waveform
# Build the enhanced UI using Gradio Blocks
with gr.Blocks(css=".gradio-container {background-color: #f7f7f7; font-family: Arial;}") as demo:
    gr.Markdown("<h1 style='text-align: center;'>Enhanced Emotion Recognition 🎤</h1>")
    gr.Markdown(
        "Upload an audio file and the model will predict the emotion using a wav2vec2 model fine-tuned on IEMOCAP data. "
        "The prediction is accompanied by an emoji, and you can also view the audio's waveform. "
        "Use the options below to adjust ensemble prediction and noise reduction settings."
    )
    with gr.Tabs():
        with gr.TabItem("Emotion Recognition"):
            with gr.Row():
                audio_input = gr.Audio(type="filepath", label="Upload Audio", source="upload")
                use_ensemble = gr.Checkbox(label="Use Ensemble Prediction (for long audio)", value=False)
                apply_noise_reduction = gr.Checkbox(label="Apply Noise Reduction", value=False)
            with gr.Row():
                segment_duration = gr.Slider(minimum=1.0, maximum=10.0, step=0.5, value=3.0, label="Segment Duration (s)")
                overlap = gr.Slider(minimum=0.0, maximum=5.0, step=0.5, value=1.0, label="Segment Overlap (s)")
            predict_button = gr.Button("Predict Emotion")
            result_text = gr.Textbox(label="Predicted Emotion")
            # plot_waveform returns a PNG file path, so the image component expects a filepath.
            waveform_image = gr.Image(label="Audio Waveform", type="filepath")
            predict_button.click(
                predict_and_plot,
                inputs=[audio_input, use_ensemble, apply_noise_reduction, segment_duration, overlap],
                outputs=[result_text, waveform_image],
            )
with gr.TabItem("About"):
gr.Markdown("""
**Enhanced Emotion Recognition App**
- **Model:** SpeechBrain's wav2vec2 model fine-tuned on IEMOCAP for emotion recognition.
- **Features:**
- Ensemble Prediction for long audio files.
- Optional Noise Reduction.
- Visualization of the audio waveform.
- Emoji representation of the predicted emotion.
**Credits:**
- [SpeechBrain](https://speechbrain.github.io)
- [Gradio](https://gradio.app)
""")

if __name__ == "__main__":
    demo.launch()
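
# Note: by default demo.launch() serves the app locally at http://127.0.0.1:7860;
# on Hugging Face Spaces the Space runtime starts the app automatically.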