amroa committed on
Commit
36dbf7a
·
1 Parent(s): 8c4ff63
Files changed (1) hide show
  1. app.py +60 -12
app.py CHANGED
@@ -145,16 +145,33 @@ def preprocess_for_inference(audio_arr, sr):
145
  return results
146
 
147
  DESCRIPTION = """
148
- # Bird audio classification using SOTA Voice of Jungle Technology. \n
149
  # Introduction
150
 
151
- It is estimated that 50% of the global economy is threatened by biodiversity loss. As such, efforts have been concerted into estimating bird biodiversity, as birds are a top indicator of biodiversity in the region. One of these efforts is
152
  finding the bird species in a region using bird species audio classification.
153
- Prediction on left table shows prediction on the type of noise (class), while the right predictions are the species of bird. If class prediction does not output bird, then consequently the species prediction is not confident.
 
 
 
 
 
154
  """
155
 
156
 
157
  css = """
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  .number-input {
159
  height: 100%;
160
  padding-bottom: 60px; /* Adjust the value as needed for more or less space */
@@ -209,15 +226,42 @@ class Seafoam(Base):
209
 
210
  seafoam = Seafoam()
211
 
212
- ## logo: <img src="https://i.ibb.co/vcG9kr0/vojlogo.jpg" alt="vojlogo" border="0">
213
- ## cactus: <img src="https://i.ibb.co/3sW2mJN/spur.jpg" alt="spur" border="0">
214
- with gr.Blocks(theme=seafoam, css = css) as demo:
215
- #img_src = '<img src="https://i.ibb.co/3sW2mJN/spur.jpg" alt="spur" border="0" width = "100%">'
216
- #gr.Markdown(f"{img_src}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
 
218
- #gr.Markdown(f"# Team Voice of Jungle {img_src} more text")
 
219
  gr.Markdown(DESCRIPTION)
220
-
221
  with gr.Row():
222
  with gr.Column(elem_classes="column-container"):
223
  start_time_input = gr.Number(label="Start Time", value=0, elem_classes="number-input full-height")
@@ -236,7 +280,6 @@ with gr.Blocks(theme=seafoam, css = css) as demo:
236
 
237
  gr.Examples(
238
  examples=[
239
- ["312_Cissopis_leverinia_1.wav", 0, 5],
240
  ["1094_Pionus_fuscus_2.wav", 0, 10],
241
  ],
242
  inputs=[audio_input, start_time_input, end_time_input]
@@ -244,4 +287,9 @@ with gr.Blocks(theme=seafoam, css = css) as demo:
244
 
245
  gr.Button("Predict").click(predict, [audio_input, start_time_input, end_time_input], [raw_class_output, species_output, waveform_output, spectrogram_output])
246
 
247
- demo.launch(share = True)
 
 
 
 
 
 
145
  return results
146
 
147
  DESCRIPTION = """
 
148
  # Introduction
149
 
150
+ It is estimated that 50% of the global economy is threatened by biodiversity loss [2]. As such, intensive, concerted efforts have gone into estimating bird biodiversity, as birds are a top indicator of biodiversity in the region. One of these efforts is
151
  finding the bird species in a region using bird species audio classification.
152
+
153
+ # Solution
154
+
155
+ To tackle this problem, we propose VOJ. It first preprocesses an audio signal using a bandpass filter (1–8 kHz) and then downsamples it to 16 kHz. Afterwards, we input the signal into AudioMAE (Audio Masked AutoEncoder by Meta [1]), which extracts relevant features even in the presence of corruptions to the signal spectrogram.
156
+ The AudioMAE is also trained on 527 types of audio that comprise bird, silence, environmental noise, and other types. The purpose of this initial inference stage is to provide an initial sense of the audio. If the AudioMAE outputs silence, we can expect low species prediction confidence, or if the output is insect, it may not be worth labelling.
157
+ Next, we train BirdAST, which has an Audio Spectrogram Transformer (AST) as its backbone, followed by an attention pooling layer and a dense layer. We also train EfficientNet-B0 on the mel spectrogram, and finally, we train a model using Wav2Vec pretrained on 50 bird species [3].
158
  """
159
 
160
 
161
  css = """
162
+ #gradio-animation {
163
+ font-size: 2em;
164
+ font-weight: bold;
165
+ text-align: center;
166
+ margin-bottom: 20px;
167
+ }
168
+
169
+ .logo-container img {
170
+ width: 14%; /* Adjust width as necessary */
171
+ display: block;
172
+ margin: auto;
173
+ }
174
+
175
  .number-input {
176
  height: 100%;
177
  padding-bottom: 60px; /* Adjust the value as needed for more or less space */
 
226
 
227
  seafoam = Seafoam()
228
 
229
+
230
+ js = """
231
+ function createGradioAnimation() {
232
+ var container = document.getElementById('gradio-animation');
233
+ var text = 'Voice of Jungle';
234
+ for (var i = 0; i < text.length; i++) {
235
+ (function(i){
236
+ setTimeout(function(){
237
+ var letter = document.createElement('span');
238
+ letter.style.opacity = '0';
239
+ letter.style.transition = 'opacity 0.5s';
240
+ letter.innerText = text[i];
241
+ container.appendChild(letter);
242
+ setTimeout(function() {
243
+ letter.style.opacity = '1';
244
+ }, 50);
245
+ }, i * 250);
246
+ })(i);
247
+ }
248
+ }
249
+ """
250
+
251
+ REFERENCES = """
252
+ References
253
+
254
+ [1] Huang, P.-Y., Xu, H., Li, J., Baevski, A., Auli, M., Galuba, W., Metze, F., & Feichtenhofer, C. (2022). Masked Autoencoders that Listen. In NeurIPS.
255
+
256
+ [2] Torkington, S. (2023, February 7). 50% of the global economy is under threat from biodiversity loss. World Economic Forum. Retrieved from https://www.weforum.org/agenda/2023/02/biodiversity-nature-loss-cop15/.
257
+
258
+ [3] https://www.kaggle.com/code/dima806/bird-species-by-sound-detection
259
+ """
260
+ with gr.Blocks(theme = seafoam, css = css, js = js) as demo:
261
 
262
+ gr.Markdown('<div class="logo-container"><img src="https://i.ibb.co/vcG9kr0/vojlogo.jpg" width="50px" alt="vojlogo"></div>')
263
+ gr.Markdown('<div id="gradio-animation"></div>')
264
  gr.Markdown(DESCRIPTION)
 
265
  with gr.Row():
266
  with gr.Column(elem_classes="column-container"):
267
  start_time_input = gr.Number(label="Start Time", value=0, elem_classes="number-input full-height")
 
280
 
281
  gr.Examples(
282
  examples=[
 
283
  ["1094_Pionus_fuscus_2.wav", 0, 10],
284
  ],
285
  inputs=[audio_input, start_time_input, end_time_input]
 
287
 
288
  gr.Button("Predict").click(predict, [audio_input, start_time_input, end_time_input], [raw_class_output, species_output, waveform_output, spectrogram_output])
289
 
290
+ gr.Markdown(REFERENCES)
291
+
292
+ demo.launch(share = True)
293
+
294
+ ## logo: <img src="https://i.ibb.co/vcG9kr0/vojlogo.jpg" alt="vojlogo" border="0">
295
+ ## cactus: <img src="https://i.ibb.co/3sW2mJN/spur.jpg" alt="spur" border="0">