Commit: e47e5af · Script fixes
Parent(s): 1e7df51

app.py CHANGED
@@ -1,35 +1,29 @@
 import os
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress TensorFlow logs (must be set before importing TensorFlow)
-import tensorflow as tf
-tf.get_logger().setLevel('ERROR')  # Suppress TensorFlow ERROR logs
-import warnings
-warnings.filterwarnings("ignore")  # Suppress all warnings
-
-import argparse
-from functools import reduce
-from typing import List, Tuple
 import shutil
-import librosa
 import numpy as np
-
+import librosa
+import tensorflow as tf
+import streamlit as st
+from pytube import YouTube
 from pydub import AudioSegment
 from pydub.silence import detect_nonsilent
-from
+from functools import reduce
 from sklearn.preprocessing import StandardScaler
-import
-import streamlit as st
+from matplotlib import pyplot as plt
 import tempfile
 
-
 # Constants
 SR = 12000
 HOP_LENGTH = 128
 MAX_FRAMES = 300
 MAX_METERS = 201
 N_FEATURES = 15
-MODEL_PATH = "models/CRNN/
+MODEL_PATH = "models/CRNN/chorus_detection_crnn.h5"
 AUDIO_TEMP_PATH = "output/temp"
 
+# Suppress TensorFlow logs
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+tf.get_logger().setLevel('ERROR')
 
 def extract_audio(url):
     try:
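The inline comment removed above is worth keeping in mind: TF_CPP_MIN_LOG_LEVEL only suppresses TensorFlow's C++ startup messages when it is set before TensorFlow is first imported. A minimal ordering sketch (illustrative only, not part of this commit):

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # must be set before importing TensorFlow
import tensorflow as tf
tf.get_logger().setLevel('ERROR')  # the Python-side logger can be configured afterwards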
@@ -54,7 +48,6 @@ def extract_audio(url):
 
 
 def strip_silence(audio_path):
-    """Removes silent parts from an audio file."""
     sound = AudioSegment.from_file(audio_path)
     nonsilent_ranges = detect_nonsilent(
         sound, min_silence_len=500, silence_thresh=-50)
@@ -64,8 +57,6 @@ def strip_silence(audio_path):
 
 
 class AudioFeature:
-    """Class for extracting and processing audio features."""
-
     def __init__(self, audio_path, sr=SR, hop_length=HOP_LENGTH):
         self.audio_path = audio_path
         self.beats = None
@@ -116,7 +107,6 @@ class AudioFeature:
         return self.key, self.mode
 
     def calculate_ki_chroma(self, waveform: np.ndarray, sr: int, hop_length: int) -> np.ndarray:
-        """Calculate a normalized, key-invariant chromagram for the given audio waveform."""
         chromagram = librosa.feature.chroma_cqt(
             y=waveform, sr=sr, hop_length=hop_length, bins_per_octave=24)
         chromagram = (chromagram - chromagram.min()) / \
@@ -129,7 +119,6 @@ class AudioFeature:
         return librosa.util.normalize(np.roll(chromagram, shift_amount, axis=0), axis=1)
 
     def extract_features(self):
-        """Extract various audio features from the loaded audio."""
         self.y, self.sr = librosa.load(self.audio_path, sr=self.sr)
         self.y_harm, self.y_perc = librosa.effects.hpss(self.y)
         self.spectrogram, _ = librosa.magphase(
@@ -172,7 +161,6 @@ class AudioFeature:
         self.n_frames = len(self.combined_features)
 
     def create_meter_grid(self):
-        """Create a grid based on the meter of the song, using tempo and beats."""
         self.tempo, self.beats = librosa.beat.beat_track(
             onset_envelope=self.onset_env, sr=self.sr, hop_length=self.hop_length)
         self.tempo = self.tempo * 2 if self.tempo < 70 else self.tempo / \
@@ -181,12 +169,7 @@ class AudioFeature:
         return self.meter_grid
 
     def _create_meter_grid(self) -> np.ndarray:
-        """
-        Helper function to create a meter grid for the song, extrapolating both forwards and backwards from an anchor frame.
-
-        Returns:
-        - np.ndarray: The meter grid.
-        """
+        """Helper function to create a meter grid for the song, extrapolating both forwards and backwards from an anchor frame."""
         seconds_per_beat = 60 / self.tempo
         beat_interval = int(librosa.time_to_frames(
             seconds_per_beat, sr=self.sr, hop_length=self.hop_length))
@@ -228,17 +211,7 @@ class AudioFeature:
         return meter_grid
 
 
-def segment_data_meters(data: np.ndarray, meter_grid: List[int]) -> List[np.ndarray]:
-    """
-    Divide song data into segments based on measure grid frames.
-
-    Parameters:
-    - data (np.ndarray): The song data to be segmented.
-    - meter_grid (List[int]): The grid indicating the start of each measure.
-
-    Returns:
-    - List[np.ndarray]: A list of song data segments.
-    """
+def segment_data_meters(data: np.ndarray, meter_grid):
     meter_segments = [data[s:e]
                       for s, e in zip(meter_grid[:-1], meter_grid[1:])]
     meter_segments = [segment.astype(np.float32) for segment in meter_segments]
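A toy usage sketch for segment_data_meters as defined above (hypothetical data; meter_grid holds measure-boundary frame indices, and each consecutive pair delimits one segment):

import numpy as np

data = np.zeros((1000, 15), dtype=np.float32)   # toy feature matrix (frames x features)
meter_grid = [0, 300, 650, 1000]                # toy measure-boundary frames
segments = segment_data_meters(data, meter_grid)
# -> 3 segments with shapes (300, 15), (350, 15) and (350, 15)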
@@ -246,31 +219,14 @@ def segment_data_meters(data: np.ndarray, meter_grid: List[int]) -> List[np.ndarray]:
 
 
 def positional_encoding(position: int, d_model: int) -> np.ndarray:
-    """
-    Generate a positional encoding for a given position and model dimension.
-
-    Parameters:
-    - position (int): The position for which to generate the encoding.
-    - d_model (int): The dimension of the model.
-
-    Returns:
-    - np.ndarray: The positional encoding.
-    """
+    """Generate a positional encoding for a given position and model dimension."""
     angle_rads = np.arange(position)[:, np.newaxis] / np.power(
         10000, (2 * (np.arange(d_model)[np.newaxis, :] // 2)) / np.float32(d_model))
     return np.concatenate([np.sin(angle_rads[:, 0::2]), np.cos(angle_rads[:, 1::2])], axis=-1)
 
 
 def apply_hierarchical_positional_encoding(segments: List[np.ndarray]) -> List[np.ndarray]:
-    """
-    Apply positional encoding at the meter and frame levels to a list of segments.
-
-    Parameters:
-    - segments (List[np.ndarray]): The list of segments to encode.
-
-    Returns:
-    - List[np.ndarray]: The list of segments with applied positional encoding.
-    """
+    """Apply positional encoding at the meter and frame levels to a list of segments."""
     n_features = segments[0].shape[1]
     measure_level_encodings = positional_encoding(len(segments), n_features)
     return [
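A usage sketch for positional_encoding as defined above: it builds a transformer-style sinusoidal encoding, one row per position, with the sin components concatenated ahead of the cos components along the feature axis (uses the constants defined earlier in this file):

enc = positional_encoding(MAX_FRAMES, N_FEATURES)
assert enc.shape == (300, 15)   # one 15-dimensional encoding per frame position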
@@ -280,19 +236,7 @@ def apply_hierarchical_positional_encoding(segments: List[np.ndarray]) -> List[np.ndarray]:
     ]
 
 
-def pad_song(encoded_segments: List[np.ndarray], max_frames: int = MAX_FRAMES, max_meters: int = MAX_METERS, n_features: int = N_FEATURES) -> np.ndarray:
-    """
-    Pad or truncate the encoded segments to have the specified max_frames and max_meters dimensions.
-
-    Parameters:
-    - encoded_segments (List[np.ndarray]): The encoded segments to pad or truncate.
-    - max_frames (int): The maximum number of frames per segment.
-    - max_meters (int): The maximum number of meters.
-    - n_features (int): The number of features per frame.
-
-    Returns:
-    - np.ndarray: The padded or truncated song.
-    """
+def pad_song(encoded_segments, max_frames: int = MAX_FRAMES, max_meters: int = MAX_METERS, n_features: int = N_FEATURES) -> np.ndarray:
     padded_meters = [
         np.pad(meter[:max_frames], ((0, max(0, max_frames -
                meter.shape[0])), (0, 0)), 'constant', constant_values=0)
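The np.pad call above pads each meter to max_frames rows and truncates longer ones. A standalone sketch of that pattern with toy data (illustrative, not the repository's code):

import numpy as np

MAX_FRAMES = 300
meter = np.ones((120, 15), dtype=np.float32)    # toy meter segment: 120 frames, 15 features
padded = np.pad(meter[:MAX_FRAMES],
                ((0, max(0, MAX_FRAMES - meter.shape[0])), (0, 0)),
                'constant', constant_values=0)
assert padded.shape == (300, 15)                # short meters are zero-padded, long ones truncated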
@@ -305,18 +249,7 @@ def pad_song(encoded_segments: List[np.ndarray], max_frames: int = MAX_FRAMES, max_meters: int = MAX_METERS, n_features: int = N_FEATURES) -> np.ndarray:
 
 
 def process_audio(audio_path, trim_silence=True, sr=SR, hop_length=HOP_LENGTH):
-    """
-    Process an audio file, extracting features and applying positional encoding.
-
-    Parameters:
-    - audio_path (str): The path to the audio file.
-    - trim_silence (bool): Whether to trim silence from the audio.
-    - sr (int): The sample rate to use when loading the audio.
-    - hop_length (int): The hop length to use for feature extraction.
-
-    Returns:
-    - Tuple[np.ndarray, AudioFeature]: The processed audio and its features.
-    """
+    """Process an audio file, extracting features and applying positional encoding."""
     if trim_silence:
         strip_silence(audio_path)
 
@@ -353,12 +286,6 @@ def smooth_predictions(data: np.ndarray) -> np.ndarray:
 
     This function applies a smoothing algorithm to correct isolated zeros and ones in a sequence
    of binary predictions. It also removes isolated sequences of 1s that are shorter than 5.
-
-    Parameters:
-    - data (np.ndarray): Array of binary predictions.
-
-    Returns:
-    - np.ndarray: Smoothed array of binary predictions.
     """
     if not isinstance(data, np.ndarray):
         data = np.array(data)
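For reference, a sketch of the smoothing rule described in the docstring above (flip isolated values, then drop runs of 1s shorter than 5). This illustrates the stated rule; it is not the repository's implementation:

import numpy as np

def smooth_sketch(data):
    data = np.asarray(data).astype(int)
    for i in range(1, len(data) - 1):
        if data[i] != data[i - 1] and data[i] != data[i + 1]:
            data[i] = data[i - 1]                # correct an isolated 0 or 1
    run_start = None
    for i, v in enumerate(np.append(data, 0)):   # sentinel 0 closes a trailing run
        if v == 1 and run_start is None:
            run_start = i
        elif v == 0 and run_start is not None:
            if i - run_start < 5:
                data[run_start:i] = 0            # remove a run of 1s shorter than 5
            run_start = None
    return data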
@@ -392,19 +319,6 @@ def smooth_predictions(data: np.ndarray) -> np.ndarray:
     return smoothed_data
 
 def make_predictions(model, processed_audio, audio_features, url, video_name):
-    """
-    Generate predictions from the model and process them to binary and smoothed predictions.
-
-    Parameters:
-    - model: The loaded model for making predictions.
-    - processed_audio: The audio data that has been processed for prediction.
-    - audio_features: Audio features object containing necessary metadata like meter grid.
-    - url (str): YouTube URL of the audio file.
-    - video_name (str): Name of the video.
-
-    Returns:
-    - np.ndarray: The smoothed binary predictions.
-    """
     predictions = model.predict(processed_audio)[0]
     binary_predictions = np.round(
         predictions[:(len(audio_features.meter_grid) - 1)]).flatten()
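A toy illustration of the thresholding step above: the model emits one probability per meter, and only the first len(meter_grid) - 1 values (one per complete measure) are rounded to a binary chorus / not-chorus label:

import numpy as np

predictions = np.array([[0.1], [0.7], [0.9], [0.4], [0.2]])  # hypothetical model output
n_meters = 4 - 1                                             # e.g. a meter grid with 4 boundaries
binary = np.round(predictions[:n_meters]).flatten()          # -> array([0., 1., 1.])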
@@ -431,7 +345,6 @@ def make_predictions(model, processed_audio, audio_features, url, video_name):
 
 
 def plot_meter_lines(ax: plt.Axes, meter_grid_times: np.ndarray) -> None:
-    """Draw meter grid lines on the plot."""
     for time in meter_grid_times:
         ax.axvline(x=time, color='grey', linestyle='--',
                    linewidth=1, alpha=0.6)
@@ -482,18 +395,24 @@ def plot_predictions(audio_features, predictions):
 
 
 def main():
-    st.title("Chorus
+    st.title("Chorus Detection")
     st.write("Upload a YouTube URL to find the chorus in the song.")
     url = st.text_input("YouTube URL")
     if st.button("Find Chorus"):
         if url:
-
+            with st.spinner('Extracting audio...'):
+                audio_file, video_title, temp_dir = extract_audio(url)
             if audio_file:
-
-
-
-
-
+                with st.spinner('Stripping silence...'):
+                    strip_silence(audio_file)
+                with st.spinner('Processing audio...'):
+                    processed_audio, audio_features = process_audio(audio_path=audio_file)
+                with st.spinner('Loading model...'):
+                    model = load_model(MODEL_PATH)
+                with st.spinner('Making predictions...'):
+                    smoothed_predictions = make_predictions(model, processed_audio, audio_features, url, video_title)
+                with st.spinner('Plotting predictions...'):
+                    plot_predictions(audio_features, smoothed_predictions)
                 shutil.rmtree(temp_dir)
         else:
            st.error("Please enter a valid YouTube URL")
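main() above calls load_model(MODEL_PATH), whose import lies outside the hunks shown. A minimal sketch, assuming the standard Keras loader is what is meant for the .h5 file at MODEL_PATH:

import tensorflow as tf

MODEL_PATH = "models/CRNN/chorus_detection_crnn.h5"
model = tf.keras.models.load_model(MODEL_PATH)   # assumed loader; the actual import is not shown in this diff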