dennisvdang committed
Commit e47e5af · 1 Parent(s): 1e7df51

Script fixes
Files changed (1)
  1. app.py +29 -110
app.py CHANGED
@@ -1,35 +1,29 @@
  import os
- os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress TensorFlow logs (must be set before importing TensorFlow)
- import tensorflow as tf
- tf.get_logger().setLevel('ERROR')  # Suppress TensorFlow ERROR logs
- import warnings
- warnings.filterwarnings("ignore")  # Suppress all warnings
-
- import argparse
- from functools import reduce
- from typing import List, Tuple
  import shutil
- import librosa
  import numpy as np
- from matplotlib import pyplot as plt
+ import librosa
+ import tensorflow as tf
+ import streamlit as st
+ from pytube import YouTube
  from pydub import AudioSegment
  from pydub.silence import detect_nonsilent
- from pytube import YouTube
+ from functools import reduce
  from sklearn.preprocessing import StandardScaler
- import shutil
- import streamlit as st
+ from matplotlib import pyplot as plt
  import tempfile

-
  # Constants
  SR = 12000
  HOP_LENGTH = 128
  MAX_FRAMES = 300
  MAX_METERS = 201
  N_FEATURES = 15
- MODEL_PATH = "models/CRNN/best_model_V3.h5"
+ MODEL_PATH = "models/CRNN/chorus_detection_crnn.h5"
  AUDIO_TEMP_PATH = "output/temp"

+ # Suppress TensorFlow logs
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+ tf.get_logger().setLevel('ERROR')

  def extract_audio(url):
      try:
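For scale, the constants above imply that one analysis frame covers HOP_LENGTH / SR seconds, roughly 10.7 ms, so a padded measure of MAX_FRAMES frames spans about 3.2 s. A quick, illustrative check (editor sketch, not part of the commit):

    SR = 12000          # sample rate used by app.py
    HOP_LENGTH = 128    # samples between successive frames
    MAX_FRAMES = 300    # frames kept per measure segment

    frame_seconds = HOP_LENGTH / SR                # ~0.0107 s per frame
    segment_seconds = MAX_FRAMES * frame_seconds   # ~3.2 s per padded measure
    print(f"{frame_seconds * 1000:.1f} ms/frame, {segment_seconds:.1f} s/segment")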
@@ -54,7 +48,6 @@ def extract_audio(url):


  def strip_silence(audio_path):
-     """Removes silent parts from an audio file."""
      sound = AudioSegment.from_file(audio_path)
      nonsilent_ranges = detect_nonsilent(
          sound, min_silence_len=500, silence_thresh=-50)
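The hunk above shows only the start of strip_silence; the rest of the body is outside this diff. As a hedged sketch, detect_nonsilent returns [start_ms, end_ms] pairs, which a trimming routine could use along these lines (illustrative, not the committed implementation):

    from pydub import AudioSegment
    from pydub.silence import detect_nonsilent

    def trim_silence_sketch(audio_path: str) -> None:
        # Load the file and locate the non-silent [start_ms, end_ms] spans.
        sound = AudioSegment.from_file(audio_path)
        nonsilent_ranges = detect_nonsilent(
            sound, min_silence_len=500, silence_thresh=-50)
        if not nonsilent_ranges:
            return  # nothing but silence; leave the file untouched
        # Keep audio from the first non-silent start to the last non-silent end.
        start, end = nonsilent_ranges[0][0], nonsilent_ranges[-1][1]
        sound[start:end].export(audio_path, format="mp3")  # format is assumed here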
@@ -64,8 +57,6 @@ def strip_silence(audio_path):


  class AudioFeature:
-     """Class for extracting and processing audio features."""
-
      def __init__(self, audio_path, sr=SR, hop_length=HOP_LENGTH):
          self.audio_path = audio_path
          self.beats = None
@@ -116,7 +107,6 @@ class AudioFeature:
          return self.key, self.mode

      def calculate_ki_chroma(self, waveform: np.ndarray, sr: int, hop_length: int) -> np.ndarray:
-         """Calculate a normalized, key-invariant chromagram for the given audio waveform."""
          chromagram = librosa.feature.chroma_cqt(
              y=waveform, sr=sr, hop_length=hop_length, bins_per_octave=24)
          chromagram = (chromagram - chromagram.min()) / \
@@ -129,7 +119,6 @@ class AudioFeature:
          return librosa.util.normalize(np.roll(chromagram, shift_amount, axis=0), axis=1)

      def extract_features(self):
-         """Extract various audio features from the loaded audio."""
          self.y, self.sr = librosa.load(self.audio_path, sr=self.sr)
          self.y_harm, self.y_perc = librosa.effects.hpss(self.y)
          self.spectrogram, _ = librosa.magphase(
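calculate_ki_chroma min-max normalizes a CQT chromagram and, per the hunk above, rolls it with np.roll so the estimated key lands on a fixed bin. A toy illustration of why rolling yields key invariance (hypothetical values, not from the commit):

    import numpy as np

    # 12 pitch classes x 4 frames; pretend the song's tonic is E (bin 4).
    chromagram = np.zeros((12, 4))
    chromagram[4, :] = 1.0           # energy concentrated on the tonic

    estimated_key = 4                # e.g. from correlating against key profiles
    shift_amount = -estimated_key    # roll the tonic down to bin 0
    key_invariant = np.roll(chromagram, shift_amount, axis=0)

    assert key_invariant[0].sum() == 4.0  # tonic energy now sits in bin 0 for any key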
@@ -172,7 +161,6 @@ class AudioFeature:
          self.n_frames = len(self.combined_features)

      def create_meter_grid(self):
-         """Create a grid based on the meter of the song, using tempo and beats."""
          self.tempo, self.beats = librosa.beat.beat_track(
              onset_envelope=self.onset_env, sr=self.sr, hop_length=self.hop_length)
          self.tempo = self.tempo * 2 if self.tempo < 70 else self.tempo / \
@@ -181,12 +169,7 @@ class AudioFeature:
          return self.meter_grid

      def _create_meter_grid(self) -> np.ndarray:
-         """
-         Helper function to create a meter grid for the song, extrapolating both forwards and backwards from an anchor frame.
-
-         Returns:
-         - np.ndarray: The meter grid.
-         """
+         """Helper function to create a meter grid for the song, extrapolating both forwards and backwards from an anchor frame."""
          seconds_per_beat = 60 / self.tempo
          beat_interval = int(librosa.time_to_frames(
              seconds_per_beat, sr=self.sr, hop_length=self.hop_length))
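_create_meter_grid converts the beat period into a frame count with librosa.time_to_frames. A small worked example of that conversion, reusing the module's constants and an assumed tempo (editor sketch):

    import librosa

    SR, HOP_LENGTH = 12000, 128
    tempo = 120.0                          # assumed tempo in BPM
    seconds_per_beat = 60 / tempo          # 0.5 s per beat
    beat_interval = int(librosa.time_to_frames(
        seconds_per_beat, sr=SR, hop_length=HOP_LENGTH))
    print(beat_interval)                   # 46 frames between beats at 120 BPM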
@@ -228,17 +211,7 @@ class AudioFeature:
          return meter_grid


- def segment_data_meters(data: np.ndarray, meter_grid: List[int]) -> List[np.ndarray]:
-     """
-     Divide song data into segments based on measure grid frames.
-
-     Parameters:
-     - data (np.ndarray): The song data to be segmented.
-     - meter_grid (List[int]): The grid indicating the start of each measure.
-
-     Returns:
-     - List[np.ndarray]: A list of song data segments.
-     """
+ def segment_data_meters(data: np.ndarray, meter_grid):
      meter_segments = [data[s:e]
                        for s, e in zip(meter_grid[:-1], meter_grid[1:])]
      meter_segments = [segment.astype(np.float32) for segment in meter_segments]
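segment_data_meters pairs consecutive meter-grid frames with zip to slice the feature matrix into per-measure chunks. A toy run of the same idiom (illustrative values only):

    import numpy as np

    data = np.arange(20).reshape(10, 2)     # 10 frames x 2 features
    meter_grid = [0, 3, 7, 10]              # frame indices where measures start

    segments = [data[s:e] for s, e in zip(meter_grid[:-1], meter_grid[1:])]
    print([seg.shape for seg in segments])  # [(3, 2), (4, 2), (3, 2)]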
@@ -246,31 +219,14 @@ def segment_data_meters(data: np.ndarray, meter_grid: List[int]) -> List[np.ndarray]:


  def positional_encoding(position: int, d_model: int) -> np.ndarray:
-     """
-     Generate a positional encoding for a given position and model dimension.
-
-     Parameters:
-     - position (int): The position for which to generate the encoding.
-     - d_model (int): The dimension of the model.
-
-     Returns:
-     - np.ndarray: The positional encoding.
-     """
+     """Generate a positional encoding for a given position and model dimension."""
      angle_rads = np.arange(position)[:, np.newaxis] / np.power(
          10000, (2 * (np.arange(d_model)[np.newaxis, :] // 2)) / np.float32(d_model))
      return np.concatenate([np.sin(angle_rads[:, 0::2]), np.cos(angle_rads[:, 1::2])], axis=-1)


  def apply_hierarchical_positional_encoding(segments: List[np.ndarray]) -> List[np.ndarray]:
-     """
-     Apply positional encoding at the meter and frame levels to a list of segments.
-
-     Parameters:
-     - segments (List[np.ndarray]): The list of segments to encode.
-
-     Returns:
-     - List[np.ndarray]: The list of segments with applied positional encoding.
-     """
+     """Apply positional encoding at the meter and frame levels to a list of segments."""
      n_features = segments[0].shape[1]
      measure_level_encodings = positional_encoding(len(segments), n_features)
      return [
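positional_encoding builds the standard sinusoidal table, except that all sine columns come before all cosine columns instead of being interleaved. Reproducing the committed formula and checking its output shape (illustrative dimensions):

    import numpy as np

    def positional_encoding(position: int, d_model: int) -> np.ndarray:
        # Same formula as in app.py: sines of the even-index angle columns
        # followed by cosines of the odd-index angle columns.
        angle_rads = np.arange(position)[:, np.newaxis] / np.power(
            10000, (2 * (np.arange(d_model)[np.newaxis, :] // 2)) / np.float32(d_model))
        return np.concatenate(
            [np.sin(angle_rads[:, 0::2]), np.cos(angle_rads[:, 1::2])], axis=-1)

    enc = positional_encoding(position=201, d_model=15)  # e.g. MAX_METERS x N_FEATURES
    print(enc.shape)   # (201, 15): one encoding row per measure
    print(enc[0, :3])  # [0. 0. 0.]: the first row's sine columns are sin(0)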
@@ -280,19 +236,7 @@ def apply_hierarchical_positional_encoding(segments: List[np.ndarray]) -> List[np.ndarray]:
      ]


- def pad_song(encoded_segments: List[np.ndarray], max_frames: int = MAX_FRAMES, max_meters: int = MAX_METERS, n_features: int = N_FEATURES) -> np.ndarray:
-     """
-     Pad or truncate the encoded segments to have the specified max_frames and max_meters dimensions.
-
-     Parameters:
-     - encoded_segments (List[np.ndarray]): The encoded segments to pad or truncate.
-     - max_frames (int): The maximum number of frames per segment.
-     - max_meters (int): The maximum number of meters.
-     - n_features (int): The number of features per frame.
-
-     Returns:
-     - np.ndarray: The padded or truncated song.
-     """
+ def pad_song(encoded_segments, max_frames: int = MAX_FRAMES, max_meters: int = MAX_METERS, n_features: int = N_FEATURES) -> np.ndarray:
      padded_meters = [
          np.pad(meter[:max_frames], ((0, max(0, max_frames -
                                               meter.shape[0])), (0, 0)), 'constant', constant_values=0)
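pad_song clamps each measure to MAX_FRAMES rows and zero-pads shorter ones; the rest of the function (outside this hunk) presumably does the same at the measure level up to MAX_METERS. A small demonstration of the per-measure step shown above (illustrative shapes):

    import numpy as np

    MAX_FRAMES, N_FEATURES = 300, 15
    meter = np.ones((120, N_FEATURES))   # a measure with only 120 frames

    padded = np.pad(meter[:MAX_FRAMES],
                    ((0, max(0, MAX_FRAMES - meter.shape[0])), (0, 0)),
                    'constant', constant_values=0)
    print(padded.shape)   # (300, 15): 120 original rows plus 180 zero rows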
@@ -305,18 +249,7 @@ def pad_song(encoded_segments: List[np.ndarray], max_frames: int = MAX_FRAMES, max_meters: int = MAX_METERS, n_features: int = N_FEATURES) -> np.ndarray:


  def process_audio(audio_path, trim_silence=True, sr=SR, hop_length=HOP_LENGTH):
-     """
-     Process an audio file, extracting features and applying positional encoding.
-
-     Parameters:
-     - audio_path (str): The path to the audio file.
-     - trim_silence (bool): Whether to trim silence from the audio.
-     - sr (int): The sample rate to use when loading the audio.
-     - hop_length (int): The hop length to use for feature extraction.
-
-     Returns:
-     - Tuple[np.ndarray, AudioFeature]: The processed audio and its features.
-     """
+     """Process an audio file, extracting features and applying positional encoding."""
      if trim_silence:
          strip_silence(audio_path)

@@ -353,12 +286,6 @@ def smooth_predictions(data: np.ndarray) -> np.ndarray:

      This function applies a smoothing algorithm to correct isolated zeros and ones in a sequence
      of binary predictions. It also removes isolated sequences of 1s that are shorter than 5.
-
-     Parameters:
-     - data (np.ndarray): Array of binary predictions.
-
-     Returns:
-     - np.ndarray: Smoothed array of binary predictions.
      """
      if not isinstance(data, np.ndarray):
          data = np.array(data)
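The body of smooth_predictions is outside this diff; going only by its docstring, the behavior could be sketched as below. This is a hedged re-implementation of the described rules, not the committed code:

    import numpy as np

    def smooth_predictions_sketch(data: np.ndarray) -> np.ndarray:
        smoothed = np.asarray(data).copy()
        # Flip isolated values: a 0 between two 1s becomes 1, and vice versa.
        for i in range(1, len(smoothed) - 1):
            if smoothed[i - 1] == smoothed[i + 1] != smoothed[i]:
                smoothed[i] = smoothed[i - 1]
        # Zero out runs of 1s shorter than 5.
        run_start = None
        for i, value in enumerate(np.append(smoothed, 0)):  # sentinel 0 closes a trailing run
            if value == 1 and run_start is None:
                run_start = i
            elif value == 0 and run_start is not None:
                if i - run_start < 5:
                    smoothed[run_start:i] = 0
                run_start = None
        return smoothed

    print(smooth_predictions_sketch(np.array([0, 1, 0, 0, 1, 1, 1, 1, 1, 0])))
    # [0 0 0 0 1 1 1 1 1 0]: the isolated 1 is dropped, the 5-long run survives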
@@ -392,19 +319,6 @@ def smooth_predictions(data: np.ndarray) -> np.ndarray:
      return smoothed_data

  def make_predictions(model, processed_audio, audio_features, url, video_name):
-     """
-     Generate predictions from the model and process them to binary and smoothed predictions.
-
-     Parameters:
-     - model: The loaded model for making predictions.
-     - processed_audio: The audio data that has been processed for prediction.
-     - audio_features: Audio features object containing necessary metadata like meter grid.
-     - url (str): YouTube URL of the audio file.
-     - video_name (str): Name of the video.
-
-     Returns:
-     - np.ndarray: The smoothed binary predictions.
-     """
      predictions = model.predict(processed_audio)[0]
      binary_predictions = np.round(
          predictions[:(len(audio_features.meter_grid) - 1)]).flatten()
@@ -431,7 +345,6 @@ def make_predictions(model, processed_audio, audio_features, url, video_name):


  def plot_meter_lines(ax: plt.Axes, meter_grid_times: np.ndarray) -> None:
-     """Draw meter grid lines on the plot."""
      for time in meter_grid_times:
          ax.axvline(x=time, color='grey', linestyle='--',
                     linewidth=1, alpha=0.6)
@@ -482,18 +395,24 @@ def plot_predictions(audio_features, predictions):


  def main():
-     st.title("Chorus Finder")
+     st.title("Chorus Detection")
      st.write("Upload a YouTube URL to find the chorus in the song.")
      url = st.text_input("YouTube URL")
      if st.button("Find Chorus"):
          if url:
-             audio_file, video_title, temp_dir = extract_audio(url)
+             with st.spinner('Extracting audio...'):
+                 audio_file, video_title, temp_dir = extract_audio(url)
              if audio_file:
-                 strip_silence(audio_file)
-                 processed_audio, audio_features = process_audio(audio_path=audio_file)
-                 model = load_model()
-                 smoothed_predictions = make_predictions(model, processed_audio, audio_features, url, video_title)
-                 plot_predictions(audio_features=audio_features, predictions=smoothed_predictions)
+                 with st.spinner('Stripping silence...'):
+                     strip_silence(audio_file)
+                 with st.spinner('Processing audio...'):
+                     processed_audio, audio_features = process_audio(audio_path=audio_file)
+                 with st.spinner('Loading model...'):
+                     model = load_model(MODEL_PATH)
+                 with st.spinner('Making predictions...'):
+                     smoothed_predictions = make_predictions(model, processed_audio, audio_features, url, video_title)
+                 with st.spinner('Plotting predictions...'):
+                     plot_predictions(audio_features, smoothed_predictions)
                  shutil.rmtree(temp_dir)
          else:
              st.error("Please enter a valid YouTube URL")
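The reworked main() wraps each blocking stage in st.spinner and loads the model from MODEL_PATH through a load_model helper defined outside this diff. A minimal, self-contained sketch of the spinner pattern (hypothetical stage names, not the committed code):

    import time
    import streamlit as st

    def run_stage(label: str, seconds: float) -> None:
        # st.spinner shows `label` until the enclosed block finishes.
        with st.spinner(label):
            time.sleep(seconds)  # stand-in for extract_audio / process_audio / model.predict

    st.title("Chorus Detection")
    if st.button("Find Chorus"):
        for label in ("Extracting audio...", "Processing audio...", "Making predictions..."):
            run_stage(label, 1.0)
        st.success("Done")

Like app.py itself, a script of this shape would be launched with streamlit run rather than executed directly with python.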