Update app.py
app.py CHANGED
@@ -3,6 +3,7 @@ import numpy as np
 import torch
 from transformers import pipeline
 import librosa
+import soundfile as sf
 
 class EmotionRecognizer:
     def __init__(self):
@@ -11,100 +12,113 @@ class EmotionRecognizer:
             model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
             device=0 if torch.cuda.is_available() else -1
         )
-        self.
+        self.target_sr = 16000  # Target sample rate for the model
+        self.max_duration = 10  # Max audio duration in seconds
 
-    def process_audio(self,
+    def process_audio(self, audio_path):
         try:
-            #
-
-
-            # Convert stereo to mono if necessary
-            if len(audio_data.shape) > 1:
-                audio_data = np.mean(audio_data, axis=1)
-
-            # Convert to float32 and normalize
-            audio_data = audio_data.astype(np.float32)
-            if np.max(np.abs(audio_data)) > 1.0:
-                audio_data = audio_data / np.max(np.abs(audio_data))
+            # Load audio file using soundfile (works better in Hugging Face Spaces)
+            audio, orig_sr = sf.read(audio_path)
 
+            # Convert stereo to mono if needed
+            if len(audio.shape) > 1:
+                audio = np.mean(audio, axis=1)
+
             # Resample if necessary
-            if
-
-                    y=
-                    orig_sr=
-                    target_sr=self.
+            if orig_sr != self.target_sr:
+                audio = librosa.resample(
+                    y=audio.astype(np.float32),
+                    orig_sr=orig_sr,
+                    target_sr=self.target_sr
                 )
+            else:
+                audio = audio.astype(np.float32)
+
+            # Normalize audio
+            audio = librosa.util.normalize(audio)
+
+            # Trim/pad audio to max duration
+            max_samples = self.max_duration * self.target_sr
+            if len(audio) > max_samples:
+                audio = audio[:max_samples]
+            else:
+                audio = np.pad(audio, (0, max(0, max_samples - len(audio))))
 
-            #
-
-
-
-            elif len(audio_data) > 10 * self.sample_rate:
-                # Take first 10 seconds if audio is too long
-                audio_data = audio_data[:10 * self.sample_rate]
+            # Run classification
+            results = self.classifier(
+                {"array": audio, "sampling_rate": self.target_sr}
+            )
 
-            #
-
+            # Format output
+            labels = [res["label"] for res in results]
+            scores = [res["score"] * 100 for res in results]
 
-
-
-
-                for pred in result
+            text_output = "\n".join([
+                f"{label}: {score:.2f}%"
+                for label, score in zip(labels, scores)
             ])
 
-            # Prepare plot data
             plot_data = {
-                "labels":
-                "values":
+                "labels": labels,
+                "values": scores
             }
 
-            return
+            return text_output, plot_data
 
         except Exception as e:
-
-
+            error_msg = f"Error processing audio: {str(e)}"
+            print(error_msg)
+            return error_msg, None
 
 def create_interface():
     recognizer = EmotionRecognizer()
 
-
-
-
-        return recognizer.process_audio(audio)
-
-    with gr.Blocks() as interface:
-        gr.Markdown("# Audio Emotion Recognition")
-        gr.Markdown("Record or upload audio to analyze the emotional content. The model works best with clear speech in English.")
+    with gr.Blocks(title="Audio Emotion Recognition") as interface:
+        gr.Markdown("# 🎙️ Audio Emotion Recognition")
+        gr.Markdown("Record or upload audio (English speech, 3-10 seconds)")
 
         with gr.Row():
             with gr.Column():
                 audio_input = gr.Audio(
-                    label="Upload or Record Audio",
-                    type="numpy",
                     sources=["microphone", "upload"],
+                    type="filepath",
+                    label="Input Audio",
+                    waveform_options={"waveform_progress_color": "#FF0066"}
                 )
-
-
-
+                submit_btn = gr.Button("Analyze", variant="primary")
+
             with gr.Column():
-
-                    label="Results",
-
+                text_output = gr.Textbox(
+                    label="Emotion Analysis Results",
+                    interactive=False
                 )
-
-
-
-
+                plot_output = gr.BarPlot(
+                    label="Confidence Scores",
+                    x="labels",
+                    y="values",
+                    color="labels",
+                    height=300
                 )
 
-
-            fn=
-            inputs=
-            outputs=[
+        submit_btn.click(
+            fn=recognizer.process_audio,
+            inputs=audio_input,
+            outputs=[text_output, plot_output]
+        )
+
+        gr.Examples(
+            examples=[
+                "https://huggingface.co/spaces/echalabres/emotion-recognition/raw/main/example_angry.wav",
+                "https://huggingface.co/spaces/echalabres/emotion-recognition/raw/main/example_happy.wav"
+            ],
+            inputs=audio_input,
+            outputs=[text_output, plot_output],
+            fn=recognizer.process_audio,
+            cache_examples=True
        )
 
     return interface
 
 if __name__ == "__main__":
     demo = create_interface()
-    demo.launch(
+    demo.launch()
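A quick way to sanity-check the new file-path based process_audio() outside the Gradio UI is a local smoke test. The sketch below is not part of the commit: it assumes the Space's dependencies (transformers, torch, librosa, soundfile, gradio, plus pandas for inspecting the plot data) are installed, that it is run from the repo root so app.py is importable, and that "sample.wav" is a short speech clip you supply yourself (the filename is hypothetical).

# Local smoke test for the updated EmotionRecognizer.process_audio().
# Assumptions (not in the commit): run from the repo root, the model is
# downloaded on first use, and "sample.wav" is a short speech clip you provide.
import pandas as pd

from app import EmotionRecognizer

recognizer = EmotionRecognizer()
text_output, plot_data = recognizer.process_audio("sample.wav")

print(text_output)  # one "label: score%" line per emotion returned by the pipeline

# plot_data is a dict of lists: {"labels": [...], "values": [...]}.
# Depending on the installed Gradio version, gr.BarPlot may expect tabular
# (DataFrame-like) data rather than a plain dict, so this conversion is a
# useful check of what the bar plot would actually receive.
if plot_data is not None:
    print(pd.DataFrame(plot_data))

If the bar plot in the Space renders empty while the textbox works, returning pd.DataFrame(plot_data) from process_audio instead of the raw dict is worth trying; likewise, since the update adds "import soundfile as sf", the soundfile package presumably needs to be listed in the Space's requirements.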