Boltz79 committed (verified)
Commit ddf32d8 · 1 Parent(s): 45e49c6

Update app.py

Files changed (1): app.py +76 -62
app.py CHANGED
@@ -3,6 +3,7 @@ import numpy as np
 import torch
 from transformers import pipeline
 import librosa
+import soundfile as sf
 
 class EmotionRecognizer:
     def __init__(self):
@@ -11,100 +12,113 @@ class EmotionRecognizer:
             model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
             device=0 if torch.cuda.is_available() else -1
         )
-        self.sample_rate = 16000
+        self.target_sr = 16000  # Target sample rate for the model
+        self.max_duration = 10  # Max audio duration in seconds
 
-    def process_audio(self, audio_input):
+    def process_audio(self, audio_path):
         try:
-            # Extract audio data and sample rate from gradio input
-            sample_rate, audio_data = audio_input
-
-            # Convert stereo to mono if necessary
-            if len(audio_data.shape) > 1:
-                audio_data = np.mean(audio_data, axis=1)
-
-            # Convert to float32 and normalize
-            audio_data = audio_data.astype(np.float32)
-            if np.max(np.abs(audio_data)) > 1.0:
-                audio_data = audio_data / np.max(np.abs(audio_data))
+            # Load audio file using soundfile (works better in Hugging Face Spaces)
+            audio, orig_sr = sf.read(audio_path)
+
+            # Convert stereo to mono if needed
+            if len(audio.shape) > 1:
+                audio = np.mean(audio, axis=1)
 
             # Resample if necessary
-            if sample_rate != self.sample_rate:
-                audio_data = librosa.resample(
-                    y=audio_data,
-                    orig_sr=sample_rate,
-                    target_sr=self.sample_rate
+            if orig_sr != self.target_sr:
+                audio = librosa.resample(
+                    y=audio.astype(np.float32),
+                    orig_sr=orig_sr,
+                    target_sr=self.target_sr
                 )
+            else:
+                audio = audio.astype(np.float32)
+
+            # Normalize audio
+            audio = librosa.util.normalize(audio)
 
-            # Ensure the audio isn't too short
-            if len(audio_data) < self.sample_rate:
-                # Pad audio if it's too short
-                audio_data = np.pad(audio_data, (0, self.sample_rate - len(audio_data)))
-            elif len(audio_data) > 10 * self.sample_rate:
-                # Take first 10 seconds if audio is too long
-                audio_data = audio_data[:10 * self.sample_rate]
+            # Trim/pad audio to max duration
+            max_samples = self.max_duration * self.target_sr
+            if len(audio) > max_samples:
+                audio = audio[:max_samples]
+            else:
+                audio = np.pad(audio, (0, max(0, max_samples - len(audio))))
 
-            # Make prediction
-            result = self.classifier({"array": audio_data, "sampling_rate": self.sample_rate})
+            # Run classification
+            results = self.classifier(
+                {"array": audio, "sampling_rate": self.target_sr}
+            )
 
-            # Format results
-            emotions_text = "\n".join([
-                f"{pred['label']}: {pred['score']*100:.2f}%"
-                for pred in result
+            # Format output
+            labels = [res["label"] for res in results]
+            scores = [res["score"] * 100 for res in results]
+
+            text_output = "\n".join([
+                f"{label}: {score:.2f}%"
+                for label, score in zip(labels, scores)
             ])
 
-            # Prepare plot data
             plot_data = {
-                "labels": [pred['label'] for pred in result],
-                "values": [pred['score'] * 100 for pred in result]
+                "labels": labels,
+                "values": scores
             }
 
-            return emotions_text, plot_data
+            return text_output, plot_data
 
         except Exception as e:
-            print(f"Error details: {str(e)}")
-            return f"Error processing audio: {str(e)}", None
+            error_msg = f"Error processing audio: {str(e)}"
+            print(error_msg)
+            return error_msg, None
 
 def create_interface():
     recognizer = EmotionRecognizer()
 
-    def process_audio_file(audio):
-        if audio is None:
-            return "Please provide an audio input.", None
-        return recognizer.process_audio(audio)
-
-    with gr.Blocks() as interface:
-        gr.Markdown("# Audio Emotion Recognition")
-        gr.Markdown("Record or upload audio to analyze the emotional content. The model works best with clear speech in English.")
+    with gr.Blocks(title="Audio Emotion Recognition") as interface:
+        gr.Markdown("# 🎙️ Audio Emotion Recognition")
+        gr.Markdown("Record or upload audio (English speech, 3-10 seconds)")
 
         with gr.Row():
             with gr.Column():
                 audio_input = gr.Audio(
-                    label="Upload or Record Audio",
-                    type="numpy",
                     sources=["microphone", "upload"],
+                    type="filepath",
+                    label="Input Audio",
+                    waveform_options={"waveform_progress_color": "#FF0066"}
                 )
-                analyze_btn = gr.Button("Analyze Emotion")
-                gr.Markdown("Note: Audio will be automatically converted to mono and resampled if needed.")
-
+                submit_btn = gr.Button("Analyze", variant="primary")
+
             with gr.Column():
-                output_text = gr.Textbox(
-                    label="Results",
-                    lines=5
+                text_output = gr.Textbox(
+                    label="Emotion Analysis Results",
+                    interactive=False
                 )
-                output_plot = gr.BarPlot(
-                    title="Emotion Confidence Scores",
-                    x_title="Emotions",
-                    y_title="Confidence (%)"
+                plot_output = gr.BarPlot(
+                    label="Confidence Scores",
+                    x="labels",
+                    y="values",
+                    color="labels",
+                    height=300
                 )
 
-        analyze_btn.click(
-            fn=process_audio_file,
-            inputs=[audio_input],
-            outputs=[output_text, output_plot]
+        submit_btn.click(
+            fn=recognizer.process_audio,
+            inputs=audio_input,
+            outputs=[text_output, plot_output]
+        )
+
+        gr.Examples(
+            examples=[
+                "https://huggingface.co/spaces/echalabres/emotion-recognition/raw/main/example_angry.wav",
+                "https://huggingface.co/spaces/echalabres/emotion-recognition/raw/main/example_happy.wav"
+            ],
+            inputs=audio_input,
+            outputs=[text_output, plot_output],
+            fn=recognizer.process_audio,
+            cache_examples=True
         )
 
     return interface
 
 if __name__ == "__main__":
     demo = create_interface()
-    demo.launch(share=True)
+    demo.launch()
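
This commit switches gr.Audio from type="numpy" to type="filepath", so process_audio now receives a path string instead of a (sample_rate, array) tuple. Below is a minimal smoke test of that flow outside the Gradio UI, offered as a sketch: the file name and the synthetic tone are illustrative stand-ins, not part of the commit.

# Sketch: exercise the filepath-based process_audio directly.
# Assumes app.py is importable and its dependencies (torch, transformers,
# librosa, soundfile) are installed; the model downloads on first use.
import numpy as np
import soundfile as sf

from app import EmotionRecognizer

sr = 16000
tone = 0.1 * np.sin(2 * np.pi * 440.0 * np.arange(sr) / sr)  # 1 s, 440 Hz
sf.write("test_tone.wav", tone.astype(np.float32), sr)

recognizer = EmotionRecognizer()
text, plot_data = recognizer.process_audio("test_tone.wav")
print(text)       # one "label: score%" line per emotion
print(plot_data)  # {"labels": [...], "values": [...]}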
 
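One caveat with the new plot wiring: recent Gradio releases expect the value of gr.BarPlot to be a pandas DataFrame whose columns match the x and y settings, so the plain dict returned by process_audio may not render on every Gradio version. A possible adaptation, assuming pandas is available (it ships as a Gradio dependency); the helper name is hypothetical, not from the commit.

# Sketch: package the BarPlot payload as a DataFrame whose columns match
# x="labels" and y="values" in the gr.BarPlot above.
import pandas as pd

def to_barplot_frame(labels, scores):
    # Hypothetical helper: returns a frame gr.BarPlot can plot directly.
    return pd.DataFrame({"labels": labels, "values": scores})

print(to_barplot_frame(["happy", "angry"], [72.5, 27.5]))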