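"""Chorus Finder: a Streamlit app that estimates where the chorus occurs in a song.

Given a YouTube URL, the app downloads the audio with pytube, strips silence with
pydub, extracts spectral and rhythmic features with librosa, and feeds them to a
pre-trained CRNN model that outputs per-frame chorus probabilities, which are then
plotted in the UI.
"""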
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress TensorFlow logs (must be set before importing TensorFlow)

import tensorflow as tf
tf.get_logger().setLevel('ERROR')  # Suppress TensorFlow ERROR logs

import warnings
warnings.filterwarnings("ignore")  # Suppress all warnings

import argparse
from functools import reduce
from typing import List, Tuple
import shutil

import librosa
import numpy as np
from matplotlib import pyplot as plt
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
from pytube import YouTube
from sklearn.preprocessing import StandardScaler
import streamlit as st

# Constants
SR = 12000
HOP_LENGTH = 128
MAX_FRAMES = 300
MAX_METERS = 201
N_FEATURES = 15
MODEL_PATH = "models/CRNN/best_model_V3.h5"
AUDIO_TEMP_PATH = "output/temp"
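# Note: at SR = 12000 and HOP_LENGTH = 128, each feature frame covers 128/12000 s
# (about 10.7 ms), i.e. roughly 93.75 frames per second of audio.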


def extract_audio(url, output_path=AUDIO_TEMP_PATH):
    """Download the audio stream of a YouTube video and return (file path, video title)."""
    try:
        yt = YouTube(url)
        video_title = yt.title
        audio_stream = yt.streams.filter(only_audio=True).first()
        if audio_stream:
            os.makedirs(output_path, exist_ok=True)
            out_file = audio_stream.download(output_path)
            # Rename the downloaded stream to .mp3 so downstream code sees a consistent
            # path; pydub/ffmpeg detects the actual codec from the file contents.
            base, _ = os.path.splitext(out_file)
            audio_file = base + '.mp3'
            if os.path.exists(audio_file):
                os.remove(audio_file)
            os.rename(out_file, audio_file)
            return audio_file, video_title
        else:
            st.error("No audio stream found")
            return None, None
    except Exception as e:
        st.error(f"An error occurred: {e}")
        return None, None
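
# Example (hypothetical URL, outside the Streamlit flow):
#   audio_file, title = extract_audio("https://www.youtube.com/watch?v=<video-id>")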


def strip_silence(audio_path):
    """Remove silent stretches from the file in place, keeping only non-silent ranges."""
    sound = AudioSegment.from_file(audio_path)
    nonsilent_ranges = detect_nonsilent(sound, min_silence_len=500, silence_thresh=-50)
    stripped = reduce(lambda acc, val: acc + sound[val[0]:val[1]], nonsilent_ranges, AudioSegment.empty())
    stripped.export(audio_path, format='mp3')


class AudioFeature:
    """Extracts and holds the audio features used as input to the chorus model."""

    def __init__(self, audio_path, sr=SR, hop_length=HOP_LENGTH):
        self.audio_path = audio_path
        self.sr = sr
        self.hop_length = hop_length
        self.y = None
        self.y_harm, self.y_perc = None, None
        self.spectrogram = None
        self.rms = None
        self.melspectrogram = None
        self.mel_acts = None
        self.chromagram = None
        self.chroma_acts = None
        self.onset_env = None
        self.tempogram = None
        self.tempogram_acts = None
        self.mfccs = None
        self.mfcc_acts = None
        self.combined_features = None
        self.n_frames = None
        self.tempo = None
        self.beats = None
        self.meter_grid = None
        self.key, self.mode = None, None

    def detect_key(self, chroma_vals):
        """Estimate key and mode by correlating chroma totals with Krumhansl-Kessler key profiles."""
        note_names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
        major_profile = np.array([6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88])
        minor_profile = np.array([6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17])
        major_profile /= np.linalg.norm(major_profile)
        minor_profile /= np.linalg.norm(minor_profile)
        major_correlations = [np.corrcoef(chroma_vals, np.roll(major_profile, i))[0, 1] for i in range(12)]
        minor_correlations = [np.corrcoef(chroma_vals, np.roll(minor_profile, i))[0, 1] for i in range(12)]
        max_major_idx = np.argmax(major_correlations)
        max_minor_idx = np.argmax(minor_correlations)
        self.mode = 'major' if major_correlations[max_major_idx] > minor_correlations[max_minor_idx] else 'minor'
        self.key = note_names[max_major_idx if self.mode == 'major' else max_minor_idx]
        return self.key, self.mode

    def calculate_ki_chroma(self, waveform, sr, hop_length):
        """Compute a key-invariant chromagram: detect the key, then roll the chroma bins
        so the tonic of a major key lands on C and the tonic of a minor key on A."""
        chromagram = librosa.feature.chroma_cqt(y=waveform, sr=sr, hop_length=hop_length, bins_per_octave=24)
        chromagram = (chromagram - chromagram.min()) / (chromagram.max() - chromagram.min())
        chroma_vals = np.sum(chromagram, axis=1)
        key, mode = self.detect_key(chroma_vals)
        key_idx = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'].index(key)
        shift_amount = -key_idx if mode == 'major' else -(key_idx + 3) % 12
        return librosa.util.normalize(np.roll(chromagram, shift_amount, axis=0), axis=1)

    def extract_features(self):
        self.y, self.sr = librosa.load(self.audio_path, sr=self.sr)
        self.y_harm, self.y_perc = librosa.effects.hpss(self.y)
        self.spectrogram, _ = librosa.magphase(librosa.stft(self.y, hop_length=self.hop_length))
        self.rms = librosa.feature.rms(S=self.spectrogram, hop_length=self.hop_length).astype(np.float32)
        self.melspectrogram = librosa.feature.melspectrogram(y=self.y, sr=self.sr, n_mels=128, hop_length=self.hop_length).astype(np.float32)
        self.mel_acts = librosa.decompose.decompose(self.melspectrogram, n_components=3, sort=True)[1].astype(np.float32)
        self.chromagram = self.calculate_ki_chroma(self.y_harm, self.sr, self.hop_length).astype(np.float32)
        self.chroma_acts = librosa.decompose.decompose(self.chromagram, n_components=4, sort=True)[1].astype(np.float32)
        self.onset_env = librosa.onset.onset_strength(y=self.y_perc, sr=self.sr, hop_length=self.hop_length)
        # Compute the tempogram, then clip outliers at its 99th percentile.
        self.tempogram = librosa.feature.tempogram(onset_envelope=self.onset_env, sr=self.sr, hop_length=self.hop_length)
        self.tempogram = np.clip(self.tempogram, 0, np.percentile(self.tempogram, 99)).astype(np.float32)
        self.tempogram_acts = librosa.decompose.decompose(self.tempogram, n_components=3, sort=True)[1].astype(np.float32)
        self.mfccs = librosa.feature.mfcc(y=self.y, sr=self.sr, n_mfcc=13, hop_length=self.hop_length).astype(np.float32)
        self.mfcc_acts = librosa.decompose.decompose(self.mfccs, n_components=3, sort=True)[1].astype(np.float32)
        self.combined_features = np.vstack([self.rms, self.mel_acts, self.chroma_acts, self.tempogram_acts, self.mfcc_acts])
        self.n_frames = self.combined_features.shape[1]
        self.tempo, self.beats = librosa.beat.beat_track(y=self.y_perc, sr=self.sr, hop_length=self.hop_length)
        self.meter_grid = librosa.util.fix_frames(librosa.util.frame(self.beats, frame_length=MAX_METERS, hop_length=1), x_min=0, x_max=self.n_frames)
        self.key, self.mode = self.detect_key(np.sum(self.chromagram, axis=1))

    def get_features(self):
        self.extract_features()
        return self.combined_features, self.n_frames, self.tempo, self.beats, self.meter_grid, self.key, self.mode
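
# Example (hypothetical local path, outside the Streamlit flow): the feature extractor
# can be exercised on its own, e.g.
#   af = AudioFeature("output/temp/example.mp3")
#   features, n_frames, tempo, beats, meter_grid, key, mode = af.get_features()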


def load_model(model_path=MODEL_PATH):
    return tf.keras.models.load_model(model_path)


def predict_chorus(audio_features, model):
    features, n_frames, tempo, beats, meter_grid, key, mode = audio_features.get_features()
    # Clip or zero-pad to MAX_FRAMES so the input always has a fixed number of frames.
    features = features[:, :MAX_FRAMES]
    if features.shape[1] < MAX_FRAMES:
        features = np.pad(features, ((0, 0), (0, MAX_FRAMES - features.shape[1])))
    features = np.expand_dims(features, axis=0)
    # Standardize the features before inference.
    scaler = StandardScaler()
    features = scaler.fit_transform(features.reshape(-1, features.shape[-1])).reshape(features.shape)
    predictions = model.predict(features)
    return predictions


def plot_predictions(predictions, title):
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(predictions[0], label='Chorus Probability')
    ax.set_title(title)
    ax.set_xlabel('Frame')
    ax.set_ylabel('Probability')
    ax.legend()
    st.pyplot(fig)


def main():
    st.title("Chorus Finder")
    st.write("Paste a YouTube URL to find the chorus in the song.")
    url = st.text_input("YouTube URL")
    if st.button("Find Chorus"):
        if url:
            audio_file, video_title = extract_audio(url)
            if audio_file:
                strip_silence(audio_file)
                audio_features = AudioFeature(audio_file)
                model = load_model()
                predictions = predict_chorus(audio_features, model)
                plot_predictions(predictions, video_title)
                shutil.rmtree(AUDIO_TEMP_PATH)
        else:
            st.error("Please enter a valid YouTube URL")


if __name__ == "__main__":
    main()
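
# To launch the app locally (assuming this file is saved as app.py):
#   streamlit run app.py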