Boltz79 committed
Commit cc50c45 · verified · 1 Parent(s): 4667629

Update app.py

Files changed (1):
  1. app.py +11 -30
app.py CHANGED
@@ -1,4 +1,3 @@
-# app.py
 import gradio as gr
 import librosa
 import numpy as np
@@ -44,13 +43,7 @@ classifier = foreign_class(
 )
 
 def preprocess_audio(audio_file, apply_noise_reduction=False):
-    """
-    Load and preprocess the audio file:
-    - Convert to 16kHz mono.
-    - Optionally apply noise reduction.
-    - Normalize the audio.
-    Saves the processed audio to a temporary file and returns its path.
-    """
+    """Load and preprocess the audio file: convert to 16kHz mono, optionally apply noise reduction, and normalize."""
     y, sr = librosa.load(audio_file, sr=16000, mono=True)
     if apply_noise_reduction and NOISEREDUCE_AVAILABLE:
         y = nr.reduce_noise(y=y, sr=sr)
@@ -62,19 +55,15 @@ def preprocess_audio(audio_file, apply_noise_reduction=False):
     return temp_file.name
 
 def ensemble_prediction(audio_file, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
-    """
-    For longer audio files, split into overlapping segments, predict each segment,
-    and return the majority-voted emotion label.
-    """
+    """Split longer audio files into overlapping segments, predict each segment, and return the majority-voted emotion label."""
     y, sr = librosa.load(audio_file, sr=16000, mono=True)
     total_duration = librosa.get_duration(y=y, sr=sr)
 
-    # If the audio is short, process it directly
     if total_duration <= segment_duration:
         temp_file = preprocess_audio(audio_file, apply_noise_reduction)
         _, _, _, label = classifier.classify_file(temp_file)
         os.remove(temp_file)
-        return label
+        return label[0]
 
     step = segment_duration - overlap
     segments = []
@@ -91,7 +80,7 @@ def ensemble_prediction(audio_file, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
     for seg in segments:
         temp_file = preprocess_audio(seg, apply_noise_reduction)
         _, _, _, label = classifier.classify_file(temp_file)
-        predictions.append(label)
+        predictions.append(label[0])  # Extract the predicted emotion
         os.remove(temp_file)
         os.remove(seg)
 
@@ -100,6 +89,7 @@ def ensemble_prediction(audio_file, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
     return most_common
 
 def predict_emotion(audio_file, use_ensemble=False, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
+    """Predict emotion from an audio file and return the emotion with an emoji."""
     try:
         if use_ensemble:
             label = ensemble_prediction(audio_file, apply_noise_reduction, segment_duration, overlap)
@@ -109,19 +99,16 @@ def predict_emotion(audio_file, use_ensemble=False, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
             os.remove(temp_file)
 
             if isinstance(result, tuple) and len(result) > 3:
-                label = result[3][0]  # Extract the predicted label (e.g., 'hap')
+                label = result[3][0]  # Extract the predicted emotion label
             else:
                 label = str(result)  # Convert to string if unexpected format
 
-        return add_emoji_to_label(label.lower())  # Ensure lowercase for consistency
+        return add_emoji_to_label(label.lower())  # Format and add an emoji
     except Exception as e:
         return f"Error processing file: {str(e)}"
 
-
 def plot_waveform(audio_file):
-    """
-    Generate and return a waveform plot image (as a PIL Image) for the given audio file.
-    """
+    """Generate and return a waveform plot image (as a PIL Image) for the given audio file."""
     y, sr = librosa.load(audio_file, sr=16000, mono=True)
     plt.figure(figsize=(10, 3))
     librosa.display.waveshow(y, sr=sr)
@@ -130,15 +117,10 @@ def plot_waveform(audio_file):
     plt.savefig(buf, format="png")
     plt.close()
     buf.seek(0)
-    # Convert buffer to PIL Image
-    image = Image.open(buf)
-    return image
+    return Image.open(buf)
 
 def predict_and_plot(audio_file, use_ensemble, apply_noise_reduction, segment_duration, overlap):
-    """
-    Run emotion prediction and generate a waveform plot.
-    Returns a tuple: (emotion label with emoji, waveform image as a PIL Image).
-    """
+    """Run emotion prediction and generate a waveform plot."""
    emotion = predict_emotion(audio_file, use_ensemble, apply_noise_reduction, segment_duration, overlap)
     waveform = plot_waveform(audio_file)
     return emotion, waveform
@@ -147,7 +129,7 @@ def predict_and_plot(audio_file, use_ensemble, apply_noise_reduction, segment_duration, overlap):
 with gr.Blocks(css=".gradio-container {background-color: #f7f7f7; font-family: Arial;}") as demo:
     gr.Markdown("<h1 style='text-align: center;'>Enhanced Emotion Recognition 😊</h1>")
     gr.Markdown(
-        "Upload an audio file and the model will predict the emotion using a wav2vec2 model fine-tuned on IEMOCAP data. "
+        "Upload an audio file, and the model will predict the emotion using a wav2vec2 model fine-tuned on IEMOCAP data. "
         "The prediction is accompanied by an emoji, and you can also view the audio's waveform. "
         "Use the options below to adjust ensemble prediction and noise reduction settings."
     )
@@ -163,7 +145,6 @@ with gr.Blocks(css=".gradio-container {background-color: #f7f7f7; font-family: Arial;}") as demo:
     overlap = gr.Slider(minimum=0.0, maximum=5.0, step=0.5, value=1.0, label="Segment Overlap (s)")
     predict_button = gr.Button("Predict Emotion")
     result_text = gr.Textbox(label="Predicted Emotion")
-    # Set type to "pil" since we are returning a PIL Image
     waveform_image = gr.Image(label="Audio Waveform", type="pil")
 
     predict_button.click(
 
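Note on the `label[0]` indexing above: SpeechBrain's `classify_file` returns a four-tuple `(out_prob, score, index, text_lab)`, and `text_lab` is a list of label strings rather than a bare string, so the predicted emotion has to be indexed out before the downstream `label.lower()` call (a list would raise AttributeError there). A minimal sketch of that unpacking, assuming the standard SpeechBrain interface for the wav2vec2 IEMOCAP classifier the app describes; the `foreign_class` arguments and file name are assumptions, since the diff only shows `classifier = foreign_class(`:

    from speechbrain.inference.interfaces import foreign_class  # speechbrain.pretrained in older releases

    # Assumed model: the wav2vec2 classifier fine-tuned on IEMOCAP mentioned in the app text.
    classifier = foreign_class(
        source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
        pymodule_file="custom_interface.py",
        classname="CustomEncoderWav2vec2Classifier",
    )

    # classify_file returns (out_prob, score, index, text_lab);
    # text_lab is a list such as ["hap"], so index it to get the plain string.
    out_prob, score, index, text_lab = classifier.classify_file("sample.wav")
    print(text_lab[0].lower())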
 
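The ensemble path ends by returning `most_common`, the majority vote over the per-segment predictions; the vote itself falls outside the visible hunks. One plausible implementation, assuming `collections.Counter` (not shown in the diff):

    from collections import Counter

    # Hypothetical majority vote over the labels collected in the segment loop.
    predictions = ["hap", "neu", "hap"]
    most_common = Counter(predictions).most_common(1)[0][0]
    print(most_common)  # -> "hap"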
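Finally, `plot_waveform` now returns the figure without touching disk: the plot is written to an in-memory PNG buffer and opened as a PIL Image, which is exactly what `gr.Image(type="pil")` consumes. A self-contained sketch of the same round-trip, with plain `matplotlib` standing in for `librosa.display` and a synthetic tone standing in for decoded audio:

    import io

    import matplotlib
    matplotlib.use("Agg")  # headless backend, as on a hosted Space
    import matplotlib.pyplot as plt
    import numpy as np
    from PIL import Image

    def waveform_to_pil(y, sr):
        """Render samples y at rate sr to an in-memory PNG and return a PIL Image."""
        plt.figure(figsize=(10, 3))
        plt.plot(np.arange(len(y)) / sr, y)
        plt.xlabel("Time (s)")
        buf = io.BytesIO()
        plt.savefig(buf, format="png")
        plt.close()
        buf.seek(0)
        return Image.open(buf)  # accepted directly by gr.Image(type="pil")

    # One second of a 440 Hz tone at 16 kHz as a stand-in for a real file.
    image = waveform_to_pil(np.sin(2 * np.pi * 440 * np.linspace(0, 1, 16000)), 16000)
    print(image.size)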