Commit: e47e5af · Script fixes
Parent(s): 1e7df51

app.py CHANGED
@@ -1,35 +1,29 @@
 import os
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress TensorFlow logs (must be set before importing TensorFlow)
-import tensorflow as tf
-tf.get_logger().setLevel('ERROR')  # Suppress TensorFlow ERROR logs
-import warnings
-warnings.filterwarnings("ignore")  # Suppress all warnings
-
-import argparse
-from functools import reduce
-from typing import List, Tuple
 import shutil
-import librosa
 import numpy as np
-
+import librosa
+import tensorflow as tf
+import streamlit as st
+from pytube import YouTube
 from pydub import AudioSegment
 from pydub.silence import detect_nonsilent
-from
+from functools import reduce
 from sklearn.preprocessing import StandardScaler
-import
-import streamlit as st
+from matplotlib import pyplot as plt
 import tempfile
 
-
 # Constants
 SR = 12000
 HOP_LENGTH = 128
 MAX_FRAMES = 300
 MAX_METERS = 201
 N_FEATURES = 15
-MODEL_PATH = "models/CRNN/
+MODEL_PATH = "models/CRNN/chorus_detection_crnn.h5"
 AUDIO_TEMP_PATH = "output/temp"
 
+# Suppress TensorFlow logs
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+tf.get_logger().setLevel('ERROR')
 
 def extract_audio(url):
     try:
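The inline comment removed above is worth keeping in mind: TF_CPP_MIN_LOG_LEVEL only suppresses TensorFlow's C++ startup messages when it is set before TensorFlow is first imported. A minimal ordering sketch (illustrative only, not part of this commit):

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # must be set before importing TensorFlow
import tensorflow as tf
tf.get_logger().setLevel('ERROR')  # the Python-side logger can be configured afterwards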
@@ -54,7 +48,6 @@ def extract_audio(url):
 
 
 def strip_silence(audio_path):
-    """Removes silent parts from an audio file."""
     sound = AudioSegment.from_file(audio_path)
     nonsilent_ranges = detect_nonsilent(
         sound, min_silence_len=500, silence_thresh=-50)
@@ -64,8 +57,6 @@ def strip_silence(audio_path):
 
 
 class AudioFeature:
-    """Class for extracting and processing audio features."""
-
     def __init__(self, audio_path, sr=SR, hop_length=HOP_LENGTH):
         self.audio_path = audio_path
         self.beats = None
@@ -116,7 +107,6 @@ class AudioFeature:
         return self.key, self.mode
 
     def calculate_ki_chroma(self, waveform: np.ndarray, sr: int, hop_length: int) -> np.ndarray:
-        """Calculate a normalized, key-invariant chromagram for the given audio waveform."""
         chromagram = librosa.feature.chroma_cqt(
             y=waveform, sr=sr, hop_length=hop_length, bins_per_octave=24)
         chromagram = (chromagram - chromagram.min()) / \
@@ -129,7 +119,6 @@ class AudioFeature:
         return librosa.util.normalize(np.roll(chromagram, shift_amount, axis=0), axis=1)
 
     def extract_features(self):
-        """Extract various audio features from the loaded audio."""
         self.y, self.sr = librosa.load(self.audio_path, sr=self.sr)
         self.y_harm, self.y_perc = librosa.effects.hpss(self.y)
         self.spectrogram, _ = librosa.magphase(
@@ -172,7 +161,6 @@ class AudioFeature:
         self.n_frames = len(self.combined_features)
 
     def create_meter_grid(self):
-        """Create a grid based on the meter of the song, using tempo and beats."""
         self.tempo, self.beats = librosa.beat.beat_track(
             onset_envelope=self.onset_env, sr=self.sr, hop_length=self.hop_length)
         self.tempo = self.tempo * 2 if self.tempo < 70 else self.tempo / \
@@ -181,12 +169,7 @@ class AudioFeature:
         return self.meter_grid
 
     def _create_meter_grid(self) -> np.ndarray:
-        """
-        Helper function to create a meter grid for the song, extrapolating both forwards and backwards from an anchor frame.
-
-        Returns:
-        - np.ndarray: The meter grid.
-        """
+        """Helper function to create a meter grid for the song, extrapolating both forwards and backwards from an anchor frame."""
         seconds_per_beat = 60 / self.tempo
         beat_interval = int(librosa.time_to_frames(
             seconds_per_beat, sr=self.sr, hop_length=self.hop_length))
@@ -228,17 +211,7 @@ class AudioFeature:
         return meter_grid
 
 
-def segment_data_meters(data: np.ndarray, meter_grid: List[int]) -> List[np.ndarray]:
-    """
-    Divide song data into segments based on measure grid frames.
-
-    Parameters:
-    - data (np.ndarray): The song data to be segmented.
-    - meter_grid (List[int]): The grid indicating the start of each measure.
-
-    Returns:
-    - List[np.ndarray]: A list of song data segments.
-    """
+def segment_data_meters(data: np.ndarray, meter_grid):
     meter_segments = [data[s:e]
                       for s, e in zip(meter_grid[:-1], meter_grid[1:])]
     meter_segments = [segment.astype(np.float32) for segment in meter_segments]
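A toy usage sketch for segment_data_meters as defined above (hypothetical data; meter_grid holds measure-boundary frame indices, and each consecutive pair delimits one segment):

import numpy as np

data = np.zeros((1000, 15), dtype=np.float32)   # toy feature matrix (frames x features)
meter_grid = [0, 300, 650, 1000]                # toy measure-boundary frames
segments = segment_data_meters(data, meter_grid)
# -> 3 segments with shapes (300, 15), (350, 15) and (350, 15)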
@@ -246,31 +219,14 @@ def segment_data_meters(data: np.ndarray, meter_grid: List[int]) -> List[np.ndarray]:
 
 
 def positional_encoding(position: int, d_model: int) -> np.ndarray:
-    """
-    Generate a positional encoding for a given position and model dimension.
-
-    Parameters:
-    - position (int): The position for which to generate the encoding.
-    - d_model (int): The dimension of the model.
-
-    Returns:
-    - np.ndarray: The positional encoding.
-    """
+    """Generate a positional encoding for a given position and model dimension."""
     angle_rads = np.arange(position)[:, np.newaxis] / np.power(
         10000, (2 * (np.arange(d_model)[np.newaxis, :] // 2)) / np.float32(d_model))
     return np.concatenate([np.sin(angle_rads[:, 0::2]), np.cos(angle_rads[:, 1::2])], axis=-1)
 
 
 def apply_hierarchical_positional_encoding(segments: List[np.ndarray]) -> List[np.ndarray]:
-    """
-    Apply positional encoding at the meter and frame levels to a list of segments.
-
-    Parameters:
-    - segments (List[np.ndarray]): The list of segments to encode.
-
-    Returns:
-    - List[np.ndarray]: The list of segments with applied positional encoding.
-    """
+    """Apply positional encoding at the meter and frame levels to a list of segments."""
     n_features = segments[0].shape[1]
     measure_level_encodings = positional_encoding(len(segments), n_features)
     return [
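A usage sketch for positional_encoding as defined above: it builds a transformer-style sinusoidal encoding, one row per position, with the sin components concatenated ahead of the cos components along the feature axis (uses the constants defined earlier in this file):

enc = positional_encoding(MAX_FRAMES, N_FEATURES)
assert enc.shape == (300, 15)   # one 15-dimensional encoding per frame position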
@@ -280,19 +236,7 @@ def apply_hierarchical_positional_encoding(segments: List[np.ndarray]) -> List[np.ndarray]:
     ]
 
 
-def pad_song(encoded_segments: List[np.ndarray], max_frames: int = MAX_FRAMES, max_meters: int = MAX_METERS, n_features: int = N_FEATURES) -> np.ndarray:
-    """
-    Pad or truncate the encoded segments to have the specified max_frames and max_meters dimensions.
-
-    Parameters:
-    - encoded_segments (List[np.ndarray]): The encoded segments to pad or truncate.
-    - max_frames (int): The maximum number of frames per segment.
-    - max_meters (int): The maximum number of meters.
-    - n_features (int): The number of features per frame.
-
-    Returns:
-    - np.ndarray: The padded or truncated song.
-    """
+def pad_song(encoded_segments, max_frames: int = MAX_FRAMES, max_meters: int = MAX_METERS, n_features: int = N_FEATURES) -> np.ndarray:
     padded_meters = [
         np.pad(meter[:max_frames], ((0, max(0, max_frames -
                meter.shape[0])), (0, 0)), 'constant', constant_values=0)
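The np.pad call above pads each meter to max_frames rows and truncates longer ones. A standalone sketch of that pattern with toy data (illustrative, not the repository's code):

import numpy as np

MAX_FRAMES = 300
meter = np.ones((120, 15), dtype=np.float32)    # toy meter segment: 120 frames, 15 features
padded = np.pad(meter[:MAX_FRAMES],
                ((0, max(0, MAX_FRAMES - meter.shape[0])), (0, 0)),
                'constant', constant_values=0)
assert padded.shape == (300, 15)                # short meters are zero-padded, long ones truncated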
@@ -305,18 +249,7 @@ def pad_song(encoded_segments: List[np.ndarray], max_frames: int = MAX_FRAMES, max_meters: int = MAX_METERS, n_features: int = N_FEATURES) -> np.ndarray:
 
 
 def process_audio(audio_path, trim_silence=True, sr=SR, hop_length=HOP_LENGTH):
-    """
-    Process an audio file, extracting features and applying positional encoding.
-
-    Parameters:
-    - audio_path (str): The path to the audio file.
-    - trim_silence (bool): Whether to trim silence from the audio.
-    - sr (int): The sample rate to use when loading the audio.
-    - hop_length (int): The hop length to use for feature extraction.
-
-    Returns:
-    - Tuple[np.ndarray, AudioFeature]: The processed audio and its features.
-    """
+    """Process an audio file, extracting features and applying positional encoding."""
     if trim_silence:
         strip_silence(audio_path)
 
@@ -353,12 +286,6 @@ def smooth_predictions(data: np.ndarray) -> np.ndarray:
 
     This function applies a smoothing algorithm to correct isolated zeros and ones in a sequence
    of binary predictions. It also removes isolated sequences of 1s that are shorter than 5.
-
-    Parameters:
-    - data (np.ndarray): Array of binary predictions.
-
-    Returns:
-    - np.ndarray: Smoothed array of binary predictions.
     """
     if not isinstance(data, np.ndarray):
         data = np.array(data)
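For reference, a sketch of the smoothing rule described in the docstring above (flip isolated values, then drop runs of 1s shorter than 5). This illustrates the stated rule; it is not the repository's implementation:

import numpy as np

def smooth_sketch(data):
    data = np.asarray(data).astype(int)
    for i in range(1, len(data) - 1):
        if data[i] != data[i - 1] and data[i] != data[i + 1]:
            data[i] = data[i - 1]                # correct an isolated 0 or 1
    run_start = None
    for i, v in enumerate(np.append(data, 0)):   # sentinel 0 closes a trailing run
        if v == 1 and run_start is None:
            run_start = i
        elif v == 0 and run_start is not None:
            if i - run_start < 5:
                data[run_start:i] = 0            # remove a run of 1s shorter than 5
            run_start = None
    return data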
@@ -392,19 +319,6 @@ def smooth_predictions(data: np.ndarray) -> np.ndarray:
     return smoothed_data
 
 def make_predictions(model, processed_audio, audio_features, url, video_name):
-    """
-    Generate predictions from the model and process them to binary and smoothed predictions.
-
-    Parameters:
-    - model: The loaded model for making predictions.
-    - processed_audio: The audio data that has been processed for prediction.
-    - audio_features: Audio features object containing necessary metadata like meter grid.
-    - url (str): YouTube URL of the audio file.
-    - video_name (str): Name of the video.
-
-    Returns:
-    - np.ndarray: The smoothed binary predictions.
-    """
     predictions = model.predict(processed_audio)[0]
     binary_predictions = np.round(
         predictions[:(len(audio_features.meter_grid) - 1)]).flatten()
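A toy illustration of the thresholding step above: the model emits one probability per meter, and only the first len(meter_grid) - 1 values (one per complete measure) are rounded to a binary chorus / not-chorus label:

import numpy as np

predictions = np.array([[0.1], [0.7], [0.9], [0.4], [0.2]])  # hypothetical model output
n_meters = 4 - 1                                             # e.g. a meter grid with 4 boundaries
binary = np.round(predictions[:n_meters]).flatten()          # -> array([0., 1., 1.])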
@@ -431,7 +345,6 @@ def make_predictions(model, processed_audio, audio_features, url, video_name):
 
 
 def plot_meter_lines(ax: plt.Axes, meter_grid_times: np.ndarray) -> None:
-    """Draw meter grid lines on the plot."""
     for time in meter_grid_times:
         ax.axvline(x=time, color='grey', linestyle='--',
                    linewidth=1, alpha=0.6)
@@ -482,18 +395,24 @@ def plot_predictions(audio_features, predictions):
 
 
 def main():
-    st.title("Chorus
+    st.title("Chorus Detection")
     st.write("Upload a YouTube URL to find the chorus in the song.")
     url = st.text_input("YouTube URL")
     if st.button("Find Chorus"):
         if url:
-
+            with st.spinner('Extracting audio...'):
+                audio_file, video_title, temp_dir = extract_audio(url)
             if audio_file:
-
-
-
-
-
+                with st.spinner('Stripping silence...'):
+                    strip_silence(audio_file)
+                with st.spinner('Processing audio...'):
+                    processed_audio, audio_features = process_audio(audio_path=audio_file)
+                with st.spinner('Loading model...'):
+                    model = load_model(MODEL_PATH)
+                with st.spinner('Making predictions...'):
+                    smoothed_predictions = make_predictions(model, processed_audio, audio_features, url, video_title)
+                with st.spinner('Plotting predictions...'):
+                    plot_predictions(audio_features, smoothed_predictions)
                 shutil.rmtree(temp_dir)
         else:
            st.error("Please enter a valid YouTube URL")
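main() above calls load_model(MODEL_PATH), whose import lies outside the hunks shown. A minimal sketch, assuming the standard Keras loader is what is meant for the .h5 file at MODEL_PATH:

import tensorflow as tf

MODEL_PATH = "models/CRNN/chorus_detection_crnn.h5"
model = tf.keras.models.load_model(MODEL_PATH)   # assumed loader; the actual import is not shown in this diff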