import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress TensorFlow logs (must be set before importing TensorFlow)
import tensorflow as tf
tf.get_logger().setLevel('ERROR')  # Suppress TensorFlow ERROR logs
import warnings
warnings.filterwarnings("ignore")  # Suppress all warnings

import argparse
from functools import reduce
from typing import List, Tuple
import shutil
import librosa
import numpy as np
from matplotlib import pyplot as plt
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
from pytube import YouTube
from sklearn.preprocessing import StandardScaler
import streamlit as st


# Constants
SR = 12000                                    # Analysis sampling rate (Hz)
HOP_LENGTH = 128                              # Hop length (samples) for STFT/feature extraction
MAX_FRAMES = 300                              # Maximum number of feature frames fed to the model
MAX_METERS = 201
N_FEATURES = 15
MODEL_PATH = "models/CRNN/best_model_V3.h5"   # Pre-trained CRNN chorus-detection model
AUDIO_TEMP_PATH = "output/temp"               # Temporary directory for downloaded audio

def extract_audio(url, output_path=AUDIO_TEMP_PATH):
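    """Download the audio-only stream of a YouTube video with pytube.

    The downloaded file is given an .mp3 extension (the stream is renamed, not
    transcoded; pydub/ffmpeg can still decode it). Returns (audio_file_path,
    video_title), or (None, None) if no audio stream is found or an error occurs.
    """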
    try:
        yt = YouTube(url)
        video_title = yt.title
        audio_stream = yt.streams.filter(only_audio=True).first()
        if audio_stream:
            os.makedirs(output_path, exist_ok=True)
            out_file = audio_stream.download(output_path)
            base, _ = os.path.splitext(out_file)
            audio_file = base + '.mp3'
            if os.path.exists(audio_file):
                os.remove(audio_file)
            os.rename(out_file, audio_file)
            return audio_file, video_title
        else:
            st.error("No audio stream found")
            return None, None
    except Exception as e:
        st.error(f"An error occurred: {e}")
        return None, None

def strip_silence(audio_path):
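    """Concatenate the non-silent ranges (silence = at least 500 ms below -50 dBFS)
    and overwrite the source file in place as mp3."""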
    sound = AudioSegment.from_file(audio_path)
    nonsilent_ranges = detect_nonsilent(sound, min_silence_len=500, silence_thresh=-50)
    stripped = reduce(lambda acc, val: acc + sound[val[0]:val[1]], nonsilent_ranges, AudioSegment.empty())
    stripped.export(audio_path, format='mp3')

class AudioFeature:
    def __init__(self, audio_path, sr=SR, hop_length=HOP_LENGTH):
        self.audio_path = audio_path
        self.sr = sr
        self.hop_length = hop_length
        self.y = None
        self.y_harm, self.y_perc = None, None
        self.spectrogram = None
        self.rms = None
        self.melspectrogram = None
        self.mel_acts = None
        self.chromagram = None
        self.chroma_acts = None
        self.onset_env = None
        self.tempogram = None
        self.tempogram_acts = None
        self.mfccs = None
        self.mfcc_acts = None
        self.combined_features = None
        self.n_frames = None
        self.tempo = None
        self.beats = None
        self.meter_grid = None
        self.key, self.mode = None, None

    def detect_key(self, chroma_vals):
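        """Estimate the global key and mode by correlating the summed chroma vector
        with the Krumhansl-Kessler major and minor key profiles at all 12 rotations."""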
        note_names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
        major_profile = np.array([6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88])
        minor_profile = np.array([6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17])
        major_profile /= np.linalg.norm(major_profile)
        minor_profile /= np.linalg.norm(minor_profile)

        major_correlations = [np.corrcoef(chroma_vals, np.roll(major_profile, i))[0, 1] for i in range(12)]
        minor_correlations = [np.corrcoef(chroma_vals, np.roll(minor_profile, i))[0, 1] for i in range(12)]

        max_major_idx = np.argmax(major_correlations)
        max_minor_idx = np.argmax(minor_correlations)

        self.mode = 'major' if major_correlations[max_major_idx] > minor_correlations[max_minor_idx] else 'minor'
        self.key = note_names[max_major_idx if self.mode == 'major' else max_minor_idx]
        return self.key, self.mode

    def calculate_ki_chroma(self, waveform, sr, hop_length):
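        """Build a key-invariant chromagram: compute a min-max-normalised CQT chromagram,
        detect the key, then rotate it so the tonic maps to C (major) or A (minor)."""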
        chromagram = librosa.feature.chroma_cqt(y=waveform, sr=sr, hop_length=hop_length, bins_per_octave=24)
        chromagram = (chromagram - chromagram.min()) / (chromagram.max() - chromagram.min())
        chroma_vals = np.sum(chromagram, axis=1)
        key, mode = self.detect_key(chroma_vals)
        key_idx = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'].index(key)
        shift_amount = -key_idx if mode == 'major' else -(key_idx + 3) % 12
        return librosa.util.normalize(np.roll(chromagram, shift_amount, axis=0), axis=1)
    
    def extract_features(self):
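        """Load the audio and compute the frame-level feature set: RMS energy and
        decomposition activations of the mel spectrogram, key-invariant chromagram,
        tempogram and MFCCs, plus tempo, beat positions, a meter grid and the
        estimated key/mode."""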
        self.y, self.sr = librosa.load(self.audio_path, sr=self.sr)
        self.y_harm, self.y_perc = librosa.effects.hpss(self.y)
        self.spectrogram, _ = librosa.magphase(librosa.stft(self.y, hop_length=self.hop_length))
        self.rms = librosa.feature.rms(S=self.spectrogram, hop_length=self.hop_length).astype(np.float32)
        self.melspectrogram = librosa.feature.melspectrogram(y=self.y, sr=self.sr, n_mels=128, hop_length=self.hop_length).astype(np.float32)
        self.mel_acts = librosa.decompose.decompose(self.melspectrogram, n_components=3, sort=True)[1].astype(np.float32)
        self.chromagram = self.calculate_ki_chroma(self.y_harm, self.sr, self.hop_length).astype(np.float32)
        self.chroma_acts = librosa.decompose.decompose(self.chromagram, n_components=4, sort=True)[1].astype(np.float32)
        self.onset_env = librosa.onset.onset_strength(y=self.y_perc, sr=self.sr, hop_length=self.hop_length)
        self.tempogram = librosa.feature.tempogram(onset_envelope=self.onset_env, sr=self.sr, hop_length=self.hop_length)
        # Clip outliers at the tempogram's own 99th percentile (the original clipped against
        # self.tempogram before it had been assigned).
        self.tempogram = np.clip(self.tempogram, 0, np.percentile(self.tempogram, 99)).astype(np.float32)
        self.tempogram_acts = librosa.decompose.decompose(self.tempogram, n_components=3, sort=True)[1].astype(np.float32)
        self.mfccs = librosa.feature.mfcc(y=self.y, sr=self.sr, n_mfcc=13, hop_length=self.hop_length).astype(np.float32)
        self.mfcc_acts = librosa.decompose.decompose(self.mfccs, n_components=3, sort=True)[1].astype(np.float32)
        self.combined_features = np.vstack([self.rms, self.mel_acts, self.chroma_acts, self.tempogram_acts, self.mfcc_acts])
        self.n_frames = self.combined_features.shape[1]
        self.tempo, self.beats = librosa.beat.beat_track(y=self.y_perc, sr=self.sr, hop_length=self.hop_length)
        self.meter_grid = librosa.util.fix_frames(librosa.util.frame(self.beats, frame_length=MAX_METERS, hop_length=1), x_min=0, x_max=self.n_frames)
        self.key, self.mode = self.detect_key(np.sum(self.chromagram, axis=1))

    def get_features(self):
        self.extract_features()
        return self.combined_features, self.n_frames, self.tempo, self.beats, self.meter_grid, self.key, self.mode

def load_model(model_path=MODEL_PATH):
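    """Load the pre-trained Keras model used for chorus detection."""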
    return tf.keras.models.load_model(model_path)

def predict_chorus(audio_features, model):
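    """Run the model on the extracted features: truncate to MAX_FRAMES frames,
    standardise, and return the predicted chorus probabilities."""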
    features, n_frames, tempo, beats, meter_grid, key, mode = audio_features.get_features()
    features = features[:, :MAX_FRAMES]
    features = np.expand_dims(features, axis=0)
    scaler = StandardScaler()
    features = scaler.fit_transform(features.reshape(-1, features.shape[-1])).reshape(features.shape)
    predictions = model.predict(features)
    return predictions

def plot_predictions(predictions, title):
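    """Plot the per-frame chorus probability curve and render it in Streamlit."""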
    plt.figure(figsize=(10, 4))
    plt.plot(predictions[0], label='Chorus Probability')
    plt.title(title)
    plt.xlabel('Frame')
    plt.ylabel('Probability')
    plt.legend()
    st.pyplot(plt)

def main():
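    """Streamlit entry point: download the audio for a YouTube URL, strip silence,
    extract features, predict the chorus and plot the result."""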
    st.title("Chorus Finder")
    st.write("Upload a YouTube URL to find the chorus in the song.")
    url = st.text_input("YouTube URL")
    if st.button("Find Chorus"):
        if url:
            audio_file, video_title = extract_audio(url)
            if audio_file:
                strip_silence(audio_file)
                audio_features = AudioFeature(audio_file)
                model = load_model()
                predictions = predict_chorus(audio_features, model)
                plot_predictions(predictions, video_title)
                shutil.rmtree(AUDIO_TEMP_PATH)
        else:
            st.error("Please enter a valid YouTube URL")

if __name__ == "__main__":
    main()