import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress TensorFlow logs (must be set before importing TensorFlow)
import tensorflow as tf
tf.get_logger().setLevel('ERROR')  # Suppress TensorFlow ERROR logs
import warnings
warnings.filterwarnings("ignore")  # Suppress all warnings

import argparse
from functools import reduce
from typing import List, Tuple
import shutil
import librosa
import numpy as np
from matplotlib import pyplot as plt
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
from pytube import YouTube
from sklearn.preprocessing import StandardScaler
import streamlit as st


# Constants
SR = 12000                                    # Analysis sampling rate (Hz)
HOP_LENGTH = 128                              # Hop length (samples) for STFT/feature extraction
MAX_FRAMES = 300                              # Maximum number of feature frames fed to the model
MAX_METERS = 201
N_FEATURES = 15
MODEL_PATH = "models/CRNN/best_model_V3.h5"   # Pre-trained CRNN chorus-detection model
AUDIO_TEMP_PATH = "output/temp"               # Temporary directory for downloaded audio

def extract_audio(url, output_path=AUDIO_TEMP_PATH):
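    """Download the audio-only stream of a YouTube video with pytube.

    The downloaded file is given an .mp3 extension (the stream is renamed, not
    transcoded; pydub/ffmpeg can still decode it). Returns (audio_file_path,
    video_title), or (None, None) if no audio stream is found or an error occurs.
    """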
    try:
        yt = YouTube(url)
        video_title = yt.title
        audio_stream = yt.streams.filter(only_audio=True).first()
        if audio_stream:
            os.makedirs(output_path, exist_ok=True)
            out_file = audio_stream.download(output_path)
            base, _ = os.path.splitext(out_file)
            audio_file = base + '.mp3'
            if os.path.exists(audio_file):
                os.remove(audio_file)
            os.rename(out_file, audio_file)
            return audio_file, video_title
        else:
            st.error("No audio stream found")
            return None, None
    except Exception as e:
        st.error(f"An error occurred: {e}")
        return None, None

def strip_silence(audio_path):
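    """Concatenate the non-silent ranges (silence = at least 500 ms below -50 dBFS)
    and overwrite the source file in place as mp3."""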
    sound = AudioSegment.from_file(audio_path)
    nonsilent_ranges = detect_nonsilent(sound, min_silence_len=500, silence_thresh=-50)
    stripped = reduce(lambda acc, val: acc + sound[val[0]:val[1]], nonsilent_ranges, AudioSegment.empty())
    stripped.export(audio_path, format='mp3')

class AudioFeature:
    def __init__(self, audio_path, sr=SR, hop_length=HOP_LENGTH):
        self.audio_path = audio_path
        self.sr = sr
        self.hop_length = hop_length
        self.y = None
        self.y_harm, self.y_perc = None, None
        self.spectrogram = None
        self.rms = None
        self.melspectrogram = None
        self.mel_acts = None
        self.chromagram = None
        self.chroma_acts = None
        self.onset_env = None
        self.tempogram = None
        self.tempogram_acts = None
        self.mfccs = None
        self.mfcc_acts = None
        self.combined_features = None
        self.n_frames = None
        self.tempo = None
        self.beats = None
        self.meter_grid = None
        self.key, self.mode = None, None

    def detect_key(self, chroma_vals):
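        """Estimate the global key and mode by correlating the summed chroma vector
        with the Krumhansl-Kessler major and minor key profiles at all 12 rotations."""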
        note_names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
        major_profile = np.array([6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88])
        minor_profile = np.array([6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17])
        major_profile /= np.linalg.norm(major_profile)
        minor_profile /= np.linalg.norm(minor_profile)

        major_correlations = [np.corrcoef(chroma_vals, np.roll(major_profile, i))[0, 1] for i in range(12)]
        minor_correlations = [np.corrcoef(chroma_vals, np.roll(minor_profile, i))[0, 1] for i in range(12)]

        max_major_idx = np.argmax(major_correlations)
        max_minor_idx = np.argmax(minor_correlations)

        self.mode = 'major' if major_correlations[max_major_idx] > minor_correlations[max_minor_idx] else 'minor'
        self.key = note_names[max_major_idx if self.mode == 'major' else max_minor_idx]
        return self.key, self.mode

    def calculate_ki_chroma(self, waveform, sr, hop_length):
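        """Build a key-invariant chromagram: compute a min-max-normalised CQT chromagram,
        detect the key, then rotate it so the tonic maps to C (major) or A (minor)."""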
        chromagram = librosa.feature.chroma_cqt(y=waveform, sr=sr, hop_length=hop_length, bins_per_octave=24)
        chromagram = (chromagram - chromagram.min()) / (chromagram.max() - chromagram.min())
        chroma_vals = np.sum(chromagram, axis=1)
        key, mode = self.detect_key(chroma_vals)
        key_idx = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'].index(key)
        shift_amount = -key_idx if mode == 'major' else -(key_idx + 3) % 12
        return librosa.util.normalize(np.roll(chromagram, shift_amount, axis=0), axis=1)
    
    def extract_features(self):
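        """Load the audio and compute the frame-level feature set: RMS energy and
        decomposition activations of the mel spectrogram, key-invariant chromagram,
        tempogram and MFCCs, plus tempo, beat positions, a meter grid and the
        estimated key/mode."""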
        self.y, self.sr = librosa.load(self.audio_path, sr=self.sr)
        self.y_harm, self.y_perc = librosa.effects.hpss(self.y)
        self.spectrogram, _ = librosa.magphase(librosa.stft(self.y, hop_length=self.hop_length))
        self.rms = librosa.feature.rms(S=self.spectrogram, hop_length=self.hop_length).astype(np.float32)
        self.melspectrogram = librosa.feature.melspectrogram(y=self.y, sr=self.sr, n_mels=128, hop_length=self.hop_length).astype(np.float32)
        self.mel_acts = librosa.decompose.decompose(self.melspectrogram, n_components=3, sort=True)[1].astype(np.float32)
        self.chromagram = self.calculate_ki_chroma(self.y_harm, self.sr, self.hop_length).astype(np.float32)
        self.chroma_acts = librosa.decompose.decompose(self.chromagram, n_components=4, sort=True)[1].astype(np.float32)
        self.onset_env = librosa.onset.onset_strength(y=self.y_perc, sr=self.sr, hop_length=self.hop_length)
        self.tempogram = librosa.feature.tempogram(onset_envelope=self.onset_env, sr=self.sr, hop_length=self.hop_length)
        # Clip outliers at the tempogram's own 99th percentile (the original clipped against
        # self.tempogram before it had been assigned).
        self.tempogram = np.clip(self.tempogram, 0, np.percentile(self.tempogram, 99)).astype(np.float32)
        self.tempogram_acts = librosa.decompose.decompose(self.tempogram, n_components=3, sort=True)[1].astype(np.float32)
        self.mfccs = librosa.feature.mfcc(y=self.y, sr=self.sr, n_mfcc=13, hop_length=self.hop_length).astype(np.float32)
        self.mfcc_acts = librosa.decompose.decompose(self.mfccs, n_components=3, sort=True)[1].astype(np.float32)
        self.combined_features = np.vstack([self.rms, self.mel_acts, self.chroma_acts, self.tempogram_acts, self.mfcc_acts])
        self.n_frames = self.combined_features.shape[1]
        self.tempo, self.beats = librosa.beat.beat_track(y=self.y_perc, sr=self.sr, hop_length=self.hop_length)
        self.meter_grid = librosa.util.fix_frames(librosa.util.frame(self.beats, frame_length=MAX_METERS, hop_length=1), x_min=0, x_max=self.n_frames)
        self.key, self.mode = self.detect_key(np.sum(self.chromagram, axis=1))

    def get_features(self):
        self.extract_features()
        return self.combined_features, self.n_frames, self.tempo, self.beats, self.meter_grid, self.key, self.mode

def load_model(model_path=MODEL_PATH):
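    """Load the pre-trained Keras model used for chorus detection."""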
    return tf.keras.models.load_model(model_path)

def predict_chorus(audio_features, model):
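    """Run the model on the extracted features: truncate to MAX_FRAMES frames,
    standardise, and return the predicted chorus probabilities."""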
    features, n_frames, tempo, beats, meter_grid, key, mode = audio_features.get_features()
    features = features[:, :MAX_FRAMES]
    features = np.expand_dims(features, axis=0)
    scaler = StandardScaler()
    features = scaler.fit_transform(features.reshape(-1, features.shape[-1])).reshape(features.shape)
    predictions = model.predict(features)
    return predictions

def plot_predictions(predictions, title):
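    """Plot the per-frame chorus probability curve and render it in Streamlit."""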
    plt.figure(figsize=(10, 4))
    plt.plot(predictions[0], label='Chorus Probability')
    plt.title(title)
    plt.xlabel('Frame')
    plt.ylabel('Probability')
    plt.legend()
    st.pyplot(plt)

def main():
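    """Streamlit entry point: download the audio for a YouTube URL, strip silence,
    extract features, predict the chorus and plot the result."""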
    st.title("Chorus Finder")
    st.write("Upload a YouTube URL to find the chorus in the song.")
    url = st.text_input("YouTube URL")
    if st.button("Find Chorus"):
        if url:
            audio_file, video_title = extract_audio(url)
            if audio_file:
                strip_silence(audio_file)
                audio_features = AudioFeature(audio_file)
                model = load_model()
                predictions = predict_chorus(audio_features, model)
                plot_predictions(predictions, video_title)
                shutil.rmtree(AUDIO_TEMP_PATH)
        else:
            st.error("Please enter a valid YouTube URL")

if __name__ == "__main__":
    main()