Boltz79 committed · verified
Commit ba147ac · 1 parent: ddf32d8

Update app.py

Files changed (1):
  1. app.py  +8 -41
app.py CHANGED
@@ -12,19 +12,15 @@ class EmotionRecognizer:
             model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
             device=0 if torch.cuda.is_available() else -1
         )
-        self.target_sr = 16000  # Target sample rate for the model
-        self.max_duration = 10  # Max audio duration in seconds
+        self.target_sr = 16000
+        self.max_duration = 10
 
     def process_audio(self, audio_path):
         try:
-            # Load audio file using soundfile (works better in Hugging Face Spaces)
             audio, orig_sr = sf.read(audio_path)
-
-            # Convert stereo to mono if needed
             if len(audio.shape) > 1:
                 audio = np.mean(audio, axis=1)
 
-            # Resample if necessary
             if orig_sr != self.target_sr:
                 audio = librosa.resample(
                     y=audio.astype(np.float32),
@@ -34,64 +30,46 @@ class EmotionRecognizer:
             else:
                 audio = audio.astype(np.float32)
 
-            # Normalize audio
             audio = librosa.util.normalize(audio)
-
-            # Trim/pad audio to max duration
             max_samples = self.max_duration * self.target_sr
             if len(audio) > max_samples:
                 audio = audio[:max_samples]
             else:
                 audio = np.pad(audio, (0, max(0, max_samples - len(audio))))
 
-            # Run classification
             results = self.classifier(
                 {"array": audio, "sampling_rate": self.target_sr}
             )
 
-            # Format output
             labels = [res["label"] for res in results]
             scores = [res["score"] * 100 for res in results]
 
-            text_output = "\n".join([
-                f"{label}: {score:.2f}%"
-                for label, score in zip(labels, scores)
-            ])
-
-            plot_data = {
-                "labels": labels,
-                "values": scores
-            }
+            text_output = "\n".join([f"{label}: {score:.2f}%" for label, score in zip(labels, scores)])
+            plot_data = {"labels": labels, "values": scores}
 
             return text_output, plot_data
 
         except Exception as e:
-            error_msg = f"Error processing audio: {str(e)}"
-            print(error_msg)
-            return error_msg, None
+            return f"Error processing audio: {str(e)}", None
 
 def create_interface():
     recognizer = EmotionRecognizer()
 
     with gr.Blocks(title="Audio Emotion Recognition") as interface:
         gr.Markdown("# 🎙️ Audio Emotion Recognition")
-        gr.Markdown("Record or upload audio (English speech, 3-10 seconds)")
+        gr.Markdown("Record or upload English speech (3-10 seconds)")
 
         with gr.Row():
             with gr.Column():
                 audio_input = gr.Audio(
                     sources=["microphone", "upload"],
                     type="filepath",
-                    label="Input Audio",
-                    waveform_options={"waveform_progress_color": "#FF0066"}
+                    label="Input Audio"
                 )
                 submit_btn = gr.Button("Analyze", variant="primary")
 
             with gr.Column():
-                text_output = gr.Textbox(
-                    label="Emotion Analysis Results",
-                    interactive=False
-                )
+                text_output = gr.Textbox(label="Results", interactive=False)
                 plot_output = gr.BarPlot(
                     label="Confidence Scores",
                     x="labels",
@@ -105,17 +83,6 @@ def create_interface():
             inputs=audio_input,
             outputs=[text_output, plot_output]
         )
-
-        gr.Examples(
-            examples=[
-                "https://huggingface.co/spaces/echalabres/emotion-recognition/raw/main/example_angry.wav",
-                "https://huggingface.co/spaces/echalabres/emotion-recognition/raw/main/example_happy.wav"
-            ],
-            inputs=audio_input,
-            outputs=[text_output, plot_output],
-            fn=recognizer.process_audio,
-            cache_examples=True
-        )
 
     return interface
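For anyone who wants to sanity-check the simplified process_audio path outside the Gradio app, the sketch below mirrors the preprocessing and output formatting kept by this commit. It is a minimal, hypothetical example and not part of the commit: "sample.wav" is a placeholder file name, and the module-level constants stand in for the class attributes.

# Minimal sketch (not part of this commit) mirroring the kept pipeline:
# load -> mono -> resample to 16 kHz -> normalize -> trim/pad to 10 s -> classify.
# "sample.wav" is a placeholder path used purely for illustration.
import numpy as np
import soundfile as sf
import librosa
import torch
from transformers import pipeline

TARGET_SR = 16000      # sample rate expected by the wav2vec2 model
MAX_DURATION = 10      # seconds; clips are trimmed or zero-padded to this length

classifier = pipeline(
    "audio-classification",
    model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
    device=0 if torch.cuda.is_available() else -1,
)

audio, orig_sr = sf.read("sample.wav")
if audio.ndim > 1:                       # stereo -> mono
    audio = audio.mean(axis=1)
if orig_sr != TARGET_SR:                 # resample only if needed
    audio = librosa.resample(y=audio.astype(np.float32),
                             orig_sr=orig_sr, target_sr=TARGET_SR)
audio = librosa.util.normalize(audio.astype(np.float32))

max_samples = MAX_DURATION * TARGET_SR   # enforce a fixed 10-second window
audio = audio[:max_samples] if len(audio) > max_samples else np.pad(
    audio, (0, max_samples - len(audio)))

results = classifier({"array": audio, "sampling_rate": TARGET_SR})
print("\n".join(f"{r['label']}: {r['score'] * 100:.2f}%" for r in results))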
 
 
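One hedged follow-up note, an observation rather than a change in this commit: depending on the Gradio version, gr.BarPlot may expect a pandas DataFrame rather than the plain dict that process_audio now returns, so converting plot_data before returning it is a cheap way to avoid version-specific surprises. The y field of the BarPlot is not visible in the shown hunks, so the "values" column below simply follows the dict keys used in process_audio; the label and score values are illustrative.

# Hedged sketch only: wraps the plot payload in a DataFrame whose column
# names follow the dict keys used in process_audio ("labels"/"values").
import pandas as pd

labels = ["angry", "happy", "neutral"]   # illustrative values, not real output
scores = [72.4, 18.1, 9.5]
plot_data = pd.DataFrame({"labels": labels, "values": scores})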