import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Suppress TensorFlow logs (must be set before importing TensorFlow)
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # Suppress TensorFlow ERROR logs
import warnings
warnings.filterwarnings("ignore") # Suppress all warnings
import argparse
from functools import reduce
from typing import List, Tuple
import shutil
import librosa
import numpy as np
from matplotlib import pyplot as plt
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
from pytube import YouTube
from sklearn.preprocessing import StandardScaler
import streamlit as st
# Constants
SR = 12000
HOP_LENGTH = 128
MAX_FRAMES = 300
MAX_METERS = 201
N_FEATURES = 15
MODEL_PATH = "models/CRNN/best_model_V3.h5"
AUDIO_TEMP_PATH = "output/temp"
def extract_audio(url, output_path=AUDIO_TEMP_PATH):
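    """Download the audio stream of a YouTube video and return (audio file path, video title)."""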
    try:
        yt = YouTube(url)
        video_title = yt.title
        audio_stream = yt.streams.filter(only_audio=True).first()
        if audio_stream:
            os.makedirs(output_path, exist_ok=True)
            out_file = audio_stream.download(output_path)
            base, _ = os.path.splitext(out_file)
            audio_file = base + '.mp3'
            if os.path.exists(audio_file):
                os.remove(audio_file)
            os.rename(out_file, audio_file)
            return audio_file, video_title
        else:
            st.error("No audio stream found")
            return None, None
    except Exception as e:
        st.error(f"An error occurred: {e}")
        return None, None

def strip_silence(audio_path):
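    """Strip silent stretches (>= 500 ms below -50 dBFS) and overwrite the audio file in place."""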
    sound = AudioSegment.from_file(audio_path)
    nonsilent_ranges = detect_nonsilent(sound, min_silence_len=500, silence_thresh=-50)
    stripped = reduce(lambda acc, val: acc + sound[val[0]:val[1]], nonsilent_ranges, AudioSegment.empty())
    stripped.export(audio_path, format='mp3')

class AudioFeature:
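    """Computes and bundles the audio features that are fed to the chorus-detection model."""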
    def __init__(self, audio_path, sr=SR, hop_length=HOP_LENGTH):
        self.audio_path = audio_path
        self.sr = sr
        self.hop_length = hop_length
        self.y = None
        self.y_harm, self.y_perc = None, None
        self.spectrogram = None
        self.rms = None
        self.melspectrogram = None
        self.mel_acts = None
        self.chromagram = None
        self.chroma_acts = None
        self.onset_env = None
        self.tempogram = None
        self.tempogram_acts = None
        self.mfccs = None
        self.mfcc_acts = None
        self.combined_features = None
        self.n_frames = None
        self.tempo = None
        self.beats = None
        self.meter_grid = None
        self.key, self.mode = None, None

    def detect_key(self, chroma_vals):
        """Estimate key and mode by correlating chroma energy with Krumhansl-Schmuckler-style key profiles."""
        note_names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
        major_profile = np.array([6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88])
        minor_profile = np.array([6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17])
        major_profile /= np.linalg.norm(major_profile)
        minor_profile /= np.linalg.norm(minor_profile)
        # Correlate the chroma distribution with all 12 rotations of each profile
        major_correlations = [np.corrcoef(chroma_vals, np.roll(major_profile, i))[0, 1] for i in range(12)]
        minor_correlations = [np.corrcoef(chroma_vals, np.roll(minor_profile, i))[0, 1] for i in range(12)]
        max_major_idx = np.argmax(major_correlations)
        max_minor_idx = np.argmax(minor_correlations)
        self.mode = 'major' if major_correlations[max_major_idx] > minor_correlations[max_minor_idx] else 'minor'
        self.key = note_names[max_major_idx if self.mode == 'major' else max_minor_idx]
        return self.key, self.mode

    def calculate_ki_chroma(self, waveform, sr, hop_length):
        """Compute a key-invariant chromagram: estimate the key and mode, then rotate the chroma bins accordingly."""
        chromagram = librosa.feature.chroma_cqt(y=waveform, sr=sr, hop_length=hop_length, bins_per_octave=24)
        chromagram = (chromagram - chromagram.min()) / (chromagram.max() - chromagram.min())
        chroma_vals = np.sum(chromagram, axis=1)
        key, mode = self.detect_key(chroma_vals)
        key_idx = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'].index(key)
        shift_amount = -key_idx if mode == 'major' else -(key_idx + 3) % 12
        return librosa.util.normalize(np.roll(chromagram, shift_amount, axis=0), axis=1)

    def extract_features(self):
        """Load the audio and compute the feature matrices, beat grid, and key used by the model."""
        self.y, self.sr = librosa.load(self.audio_path, sr=self.sr)
        self.y_harm, self.y_perc = librosa.effects.hpss(self.y)
        self.spectrogram, _ = librosa.magphase(librosa.stft(self.y, hop_length=self.hop_length))
        self.rms = librosa.feature.rms(S=self.spectrogram, hop_length=self.hop_length).astype(np.float32)
        self.melspectrogram = librosa.feature.melspectrogram(y=self.y, sr=self.sr, n_mels=128, hop_length=self.hop_length).astype(np.float32)
        self.mel_acts = librosa.decompose.decompose(self.melspectrogram, n_components=3, sort=True)[1].astype(np.float32)
        self.chromagram = self.calculate_ki_chroma(self.y_harm, self.sr, self.hop_length).astype(np.float32)
        self.chroma_acts = librosa.decompose.decompose(self.chromagram, n_components=4, sort=True)[1].astype(np.float32)
        self.onset_env = librosa.onset.onset_strength(y=self.y_perc, sr=self.sr, hop_length=self.hop_length)
        # Clip tempogram outliers at its 99th percentile
        tempogram = librosa.feature.tempogram(onset_envelope=self.onset_env, sr=self.sr, hop_length=self.hop_length)
        self.tempogram = np.clip(tempogram, 0, np.percentile(tempogram, 99)).astype(np.float32)
        self.tempogram_acts = librosa.decompose.decompose(self.tempogram, n_components=3, sort=True)[1].astype(np.float32)
        self.mfccs = librosa.feature.mfcc(y=self.y, sr=self.sr, n_mfcc=13, hop_length=self.hop_length).astype(np.float32)
        self.mfcc_acts = librosa.decompose.decompose(self.mfccs, n_components=3, sort=True)[1].astype(np.float32)
        self.combined_features = np.vstack([self.rms, self.mel_acts, self.chroma_acts, self.tempogram_acts, self.mfcc_acts])
        self.n_frames = self.combined_features.shape[1]
        self.tempo, self.beats = librosa.beat.beat_track(y=self.y_perc, sr=self.sr, hop_length=self.hop_length)
        self.meter_grid = librosa.util.fix_frames(librosa.util.frame(self.beats, frame_length=MAX_METERS, hop_length=1), x_min=0, x_max=self.n_frames)
        self.key, self.mode = self.detect_key(np.sum(self.chromagram, axis=1))

    def get_features(self):
        self.extract_features()
        return self.combined_features, self.n_frames, self.tempo, self.beats, self.meter_grid, self.key, self.mode

def load_model(model_path=MODEL_PATH):
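    """Load the trained CRNN model from disk."""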
    return tf.keras.models.load_model(model_path)

def predict_chorus(audio_features, model):
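    """Extract and standardize features for a song, then return the model's chorus predictions."""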
    features, n_frames, tempo, beats, meter_grid, key, mode = audio_features.get_features()
    features = features[:, :MAX_FRAMES]
    features = np.expand_dims(features, axis=0)
    scaler = StandardScaler()
    features = scaler.fit_transform(features.reshape(-1, features.shape[-1])).reshape(features.shape)
    predictions = model.predict(features)
    return predictions

def plot_predictions(predictions, title):
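    """Render the per-frame chorus probability curve in the Streamlit app."""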
    plt.figure(figsize=(10, 4))
    plt.plot(predictions[0], label='Chorus Probability')
    plt.title(title)
    plt.xlabel('Frame')
    plt.ylabel('Probability')
    plt.legend()
    st.pyplot(plt)

def main():
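    """Streamlit entry point: take a YouTube URL, run chorus detection, and plot the result."""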
    st.title("Chorus Finder")
    st.write("Enter a YouTube URL to find the chorus in the song.")
    url = st.text_input("YouTube URL")
    if st.button("Find Chorus"):
        if url:
            audio_file, video_title = extract_audio(url)
            if audio_file:
                strip_silence(audio_file)
                audio_features = AudioFeature(audio_file)
                model = load_model()
                predictions = predict_chorus(audio_features, model)
                plot_predictions(predictions, video_title)
                shutil.rmtree(AUDIO_TEMP_PATH)
        else:
            st.error("Please enter a valid YouTube URL")

if __name__ == "__main__":
    main()