Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files- app.py +128 -0
- functions.py +138 -0
- requirements.txt +1 -0
app.py
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import keras
|
3 |
+
import librosa
|
4 |
+
import hopsworks
|
5 |
+
import os
|
6 |
+
import numpy as np
|
7 |
+
import shutil
|
8 |
+
from functions import log_mel_spectrogram, split_spectrogram, load_audio_file, image_transformer, save_spectrogram_as_png
|
9 |
+
from datasets import load_dataset
|
10 |
+
|
11 |
+
def empty_string():
    """Return an empty string.

    Used as a click handler whose output is the answer box, so the previous
    answer is blanked when the submit button is pressed.

    Returns:
        str: Always ``""``.
    """
    blank = ""
    return blank
|
13 |
+
|
14 |
+
def create_image_folder(folder):
    """Create the directory ``folder`` unless it already exists.

    Args:
        folder (str): Path of the directory to create. Missing parent
            directories are created as well.

    Raises:
        OSError: If the directory cannot be created, e.g. because the path
            exists but is not a directory, or permissions are missing.
    """
    # exist_ok=True makes a repeated call a no-op while still surfacing real
    # failures, which the previous bare ``except`` silently discarded.
    os.makedirs(folder, exist_ok=True)
    return
|
20 |
+
|
21 |
+
def delete_folder(folder):
    """Delete ``folder`` and everything inside it.

    A folder that does not exist is treated as already deleted.

    Args:
        folder (str): Path of the directory to remove.

    Raises:
        OSError: If the directory exists but cannot be removed (any error
            other than the folder being absent).
    """
    try:
        shutil.rmtree(folder)
    except FileNotFoundError:
        # Nothing to clean up; only this specific case is ignored so that
        # genuine failures are no longer hidden by a bare ``except``.
        pass
    return
|
27 |
+
|
28 |
+
|
29 |
+
def create_dataset(image_folder):
    """Turn the images stored in ``image_folder`` into a TensorFlow dataset.

    The folder is loaded with ``datasets.load_dataset``, its "train" split is
    taken, every image is converted to mode "L" through ``image_transformer``
    and the "image" column is exposed as a TensorFlow dataset with batch
    size 1.

    Args:
        image_folder (str): Directory handed to ``load_dataset``.

    Returns:
        The object returned by ``Dataset.to_tf_dataset``.
    """
    all_splits = load_dataset(image_folder, split=None)
    images = all_splits["train"]
    print(images)

    converted = images.map(
        image_transformer,
        batched=True,
        fn_kwargs={"mode": "L"},
    )
    return converted.to_tf_dataset(batch_size=1, columns="image")
|
37 |
+
|
38 |
+
|
39 |
+
def majority_vote(raw_predictions):
    """Return the genre name that most rows of ``raw_predictions`` vote for.

    Each row is reduced to the index of its largest value; the index that
    occurs most often is looked up in the module-level ``label_decoding``.
    On a tie the smallest index wins, because ``np.unique`` returns sorted
    labels and ``argmax`` picks the first maximum.

    Args:
        raw_predictions: Two-dimensional array-like; the argmax is taken
            along axis 1.

    Returns:
        The ``label_decoding`` entry of the winning index.
    """
    per_row_label = np.argmax(raw_predictions, axis=1)
    candidates, votes = np.unique(per_row_label, return_counts=True)
    return label_decoding[candidates[np.argmax(votes)]]
|
45 |
+
|
46 |
+
def predict(audio):
    """Classify the genre of an uploaded audio file.

    The audio is loaded, converted to a log-mel spectrogram, cut into
    fixed-width pieces, each piece is written as a PNG into ``folder``, and
    the model's per-image predictions are combined by majority vote.

    Args:
        audio (str | None): Path of the uploaded file as delivered by the
            audio component; falsy when nothing was uploaded.

    Returns:
        str: A sentence naming the predicted genre, or an error message when
        the audio could not be loaded or is too short to yield a single
        spectrogram piece.
    """
    load_error = "Error when loading audio. Did you submit a file?"
    if not audio:
        return load_error

    try:
        audio_array = load_audio_file(audio, sample_rate, res_type, duration)
    except Exception:
        # Boundary with user-supplied files: any decoding problem is reported
        # to the user instead of crashing the handler. Unlike a bare
        # ``except`` this lets KeyboardInterrupt/SystemExit through.
        return load_error

    spectrogram = log_mel_spectrogram(audio_array, sample_rate, nfft, hop_length, window)
    spec_splits = split_spectrogram(spectrogram, output_shape)
    if not spec_splits:
        # Without at least one full-width piece there is nothing to feed the
        # model; building a dataset from an empty folder would fail.
        return "The submitted audio is too short to classify. Please upload a longer file."

    # Start from an empty folder so images left over from an earlier request
    # are not classified together with this one.
    delete_folder(folder)
    create_image_folder(folder)
    for idx, split in enumerate(spec_splits):
        save_path = os.path.join(folder, f"{idx+1}_spec.png")
        save_spectrogram_as_png(split, save_path, sample_rate, nfft, hop_length)

    image_dataset = create_dataset(folder)

    raw_preds = model.predict(image_dataset, verbose=0)

    genre_pred = majority_vote(raw_preds)
    return f"The genre of the submitted audio is {genre_pred}!"
|
64 |
+
|
65 |
+
# Audio loading settings handed to load_audio_file.
sample_rate = 22050
res_type = "kaiser_fast"
duration = 0  # load_audio_file only pads/trims when this is greater than 0

# Spectrogram settings handed to log_mel_spectrogram / split_spectrogram.
nfft = 2048
hop_length = 512
window = "hann"
output_shape = (128, 256)

# Directory that receives the spectrogram PNGs of the current request.
folder = "images"

# Maps the class index chosen by majority_vote to its genre name.
label_decoding = dict(enumerate((
    "Electronic",
    "Experimental",
    "Folk",
    "Hip-Hop",
    "Instrumental",
    "International",
    "Pop",
    "Rock",
)))
|
82 |
+
|
83 |
+
# File name of the serialized Keras model (a relative path).
model_path = "best_model.keras"
# The model is loaded once when the module is imported; predict() reuses
# this module-level instance for every request.
model = keras.models.load_model(model_path)


# Disabled alternative: fetch the model from the Hopsworks model registry
# instead of a local file. It is kept as a bare string literal, so none of
# the statements below are executed.
"""
model_version = 1
project = hopsworks.login()
mr = project.get_model_registry()
model = mr.get_model("cnn_genre_classifier", version=model_version)

model_dir = model.download()

model = keras.models.load_model(model_dir)
"""
|
97 |
+
|
98 |
+
|
99 |
+
|
100 |
+
# User interface: an intro text, an audio upload next to a read-only answer
# box, and a submit button that triggers the prediction.
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown(
            """
# Music Genre Classifier

Hello!

This is a prototype for a genre classification service, where you can upload an audio file,
and the model will predict which genre it belongs to!

The model has been trained to predict 8 top-level genres, that each encompasses a multitude of sub-genres.

Upload your favorite song and give it a try!
"""
        )
    with gr.Row():
        with gr.Column():
            audio = gr.Audio(sources="upload", type="filepath", label="Upload your song here", format="mp3")
        with gr.Column():
            answer_box = gr.Text(label="Answer appears here", interactive=False)
    with gr.Row():
        submit_audio = gr.Button("Submit audio for prediction")

    # Blank the previous answer as soon as the button is pressed.
    submit_audio.click(fn=empty_string, outputs=answer_box)
    prediction_event = submit_audio.click(fn=predict, inputs=audio, outputs=answer_box, trigger_mode="once")
    # Clean up only after the prediction has finished. delete_folder needs
    # the folder path, which a listener without inputs never supplied, and a
    # separate click listener is not sequenced after predict.
    prediction_event.then(fn=lambda: delete_folder(folder))


demo.launch(share=True)
|
functions.py
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import librosa
|
2 |
+
import numpy as np
|
3 |
+
import os
|
4 |
+
import gc
|
5 |
+
import matplotlib
|
6 |
+
import matplotlib.pyplot as plt
|
7 |
+
|
8 |
+
def load_audio_file(path, sample_rate=22050, resampling_type="kaiser_fast", duration=30):
    """Load an audio file as a mono numpy array using the librosa library.

    Args:
        path (str): Path to audio file.
        sample_rate (int, optional): Sample rate to resample audio file to.
            None uses the file's original sample rate. Defaults to 22050.
        resampling_type (str, optional): Method to use for resampling.
            Defaults to "kaiser_fast".
        duration (int, optional): Length in seconds to pad/shorten the audio
            to. 0 returns the original audio length. Defaults to 30.

    Returns:
        numpy.ndarray: Audio samples.
    """
    # librosa returns the sample rate it actually used, which is the only
    # reliable value when ``sample_rate`` is None.
    audio_array, loaded_rate = librosa.load(path, sr=sample_rate, mono=True, res_type=resampling_type)
    if duration > 0:
        # Pass the requested duration through; it was previously fixed at 30
        # regardless of the argument.
        audio_array = pad_audio(audio_array, loaded_rate, duration)
    return audio_array
|
27 |
+
|
28 |
+
def add_noise(audio_array, std):
    """Return ``audio_array`` with zero-mean Gaussian noise added.

    Args:
        audio_array (numpy.ndarray): Signal to perturb; it is not modified.
        std (float): Standard deviation of the noise.

    Returns:
        numpy.ndarray: New array of the same shape as ``audio_array``.
    """
    return audio_array + np.random.normal(loc=0, scale=std, size=audio_array.shape)
|
31 |
+
|
32 |
+
def pad_audio(audio_array, sample_rate=22050, duration=30):
    """Pad or cut ``audio_array`` to exactly ``duration`` seconds.

    Shorter audio is zero-padded on both sides (the extra sample goes to the
    end when the amount is odd); longer audio keeps only its beginning.

    Args:
        audio_array (numpy.ndarray): One-dimensional audio samples.
        sample_rate (int, optional): Samples per second. Defaults to 22050.
        duration (int, optional): Target length in seconds. Defaults to 30.

    Returns:
        numpy.ndarray: Array with ``int(duration * sample_rate)`` samples.
    """
    # int() keeps the value usable as a slice bound and pad width even when
    # duration or sample_rate is a float.
    duration_samples = int(duration * sample_rate)
    audio_len = audio_array.size

    if audio_len < duration_samples:
        missing = duration_samples - audio_len
        pad_left = missing // 2
        # Padding both sides with missing // 2 left the result one sample
        # short whenever ``missing`` was odd.
        audio_array = np.pad(audio_array, (pad_left, missing - pad_left))
    elif audio_len > duration_samples:
        audio_array = audio_array[:duration_samples]
    return audio_array
|
43 |
+
|
44 |
+
def log_mel_spectrogram(audio_array, sr=22050, nfft=2048, hop_length=512, window="hann"):
    """Compute a mel spectrogram of ``audio_array`` and convert it to decibels.

    The decibel conversion uses the spectrogram's own maximum as reference.

    Args:
        audio_array (numpy.ndarray): Audio samples.
        sr (int, optional): Sample rate of the audio. Defaults to 22050.
        nfft (int, optional): FFT size; also used as the window length.
            Defaults to 2048.
        hop_length (int, optional): Hop length between frames. Defaults to 512.
        window (str, optional): Window function. Defaults to "hann".

    Returns:
        The array returned by ``librosa.power_to_db``.
    """
    mel_settings = dict(
        y=audio_array,
        sr=sr,
        n_fft=nfft,
        hop_length=hop_length,
        win_length=nfft,
        window=window,
    )
    mel_power = librosa.feature.melspectrogram(**mel_settings)
    return librosa.power_to_db(mel_power, ref=np.max)
|
50 |
+
|
51 |
+
def split_spectrogram(spectrogram, output_shape=(128, 256)):
    """Split a spectrogram into equal-width chunks along the column axis.

    Only full-width chunks are returned; trailing columns that do not fill a
    whole chunk are dropped. Only ``output_shape[1]`` (the chunk width) is
    used; the row count of the spectrogram is left unchanged.

    Args:
        spectrogram (numpy.ndarray): Two-dimensional array.
        output_shape (tuple, optional): Shape of each chunk. Defaults to
            (128, 256).

    Returns:
        list: Chunks in column order; empty when the spectrogram has fewer
        columns than the chunk width.

    Raises:
        ValueError: If the chunk width is not positive (the previous loop
            never terminated in that case).
    """
    width = output_shape[1]
    if width <= 0:
        raise ValueError(f"output_shape[1] must be positive, got {width}")

    n_cols = spectrogram.shape[1]
    # Last valid start is n_cols - width, hence the +1 on the exclusive stop.
    return [
        spectrogram[:, start:start + width]
        for start in range(0, n_cols - width + 1, width)
    ]
|
60 |
+
|
61 |
+
def save_spectrogram_as_png(spectrogram, save_path, sample_rate=22050, nfft=2048, hop_length=512):
    """Render a spectrogram without axes or margins and save it as an image.

    The figure size in inches is the spectrogram shape divided by 100.
    NOTE(review): presumably this yields one pixel per spectrogram cell at
    matplotlib's default dpi - confirm the dpi in use.

    Args:
        spectrogram (numpy.ndarray): Two-dimensional spectrogram to draw.
        save_path (str): Destination file path.
        sample_rate (int, optional): Sample rate passed to ``specshow``.
            Defaults to 22050.
        nfft (int, optional): FFT size passed to ``specshow``. Defaults to 2048.
        hop_length (int, optional): Hop length passed to ``specshow``.
            Defaults to 512.
    """
    # The file only does ``import librosa``; importing the submodule here
    # makes ``librosa.display`` available regardless of whether the installed
    # librosa version loads it implicitly.
    import librosa.display

    shape = spectrogram.shape
    fig, ax = plt.subplots(1, 1, figsize=(shape[1]/100, shape[0]/100))
    try:
        fig.subplots_adjust(top=1.0, bottom=0, right=1.0, left=0, hspace=0, wspace=0)
        ax.set_axis_off()
        librosa.display.specshow(data=spectrogram, sr=sample_rate, n_fft=nfft, hop_length=hop_length, ax=ax)
        # Save this figure explicitly rather than whichever one is current.
        fig.savefig(save_path, bbox_inches=None, pad_inches=0)
    finally:
        # Always release the figure, also when drawing or saving fails, so
        # figures do not pile up across many calls.
        plt.close(fig)
    return
|
70 |
+
|
71 |
+
def extract_features(df, audio_dir, save_path,
                     sr=22050, rs_type="kaiser_fast",
                     output_shape=(128, 256), duration=30,
                     nfft=2048, hop_length=512, window="hann", checkpoint_id=0):
    """Load audio files, compute log-mel spectrograms and save them as PNGs.

    For every row of ``df`` the file ``<audio_dir>/<first 3 digits>/<id>.mp3``
    (id zero-padded to 6 digits) is loaded, converted to a log-mel
    spectrogram, split into chunks and each chunk is written to
    ``<save_path>/<genre>/<id>_<n>.png``.

    Args:
        df (pandas.DataFrame): Indexed by track id, with a "genre_top"
            column. Should only contain samples from one data split
            (train/val/test).
        audio_dir (str): Directory containing all audio files.
        save_path (str): Directory where spectrograms will be saved.
        sr (int, optional): Sampling rate for all loaded audio files.
        rs_type (str, optional): Resampling method used when loading.
        output_shape (tuple, optional): Shape of each spectrogram chunk.
        duration (int, optional): Length in seconds every audio file is cut
            or padded to.
        nfft (int, optional): Number of samples for every FFT window.
        hop_length (int, optional): Hop length to use for the STFT.
        window (str, optional): Window function to use for the STFT.
        checkpoint_id (int, optional): Track id to resume from. Defaults to 0
            (process everything).
    """
    # Non-interactive backend; figures are only written to disk.
    matplotlib.use("Agg")

    if int(checkpoint_id) > 0:
        df = df.loc[checkpoint_id:]

    # Due to some weird memory leak, garbage collection is manually performed
    # every 10% of progress. The interval is clamped to at least 1 because a
    # zero step (fewer than 10 rows) previously raised ValueError.
    gc_interval = max(1, int(len(df) * 0.1))

    for position, (track_id, genre) in enumerate(zip(df.index, df["genre_top"])):
        id_string = str(track_id).rjust(6, "0")
        filename = id_string + ".mp3"
        folder_name = filename[:3]
        file_path = os.path.join(audio_dir, folder_name, filename)

        print(id_string, end=" ")
        audio = load_audio_file(file_path, sr, rs_type, duration)

        spectrogram = log_mel_spectrogram(audio, sr=sr, nfft=nfft, hop_length=hop_length, window=window)
        spec_splits = split_spectrogram(spectrogram, output_shape)

        # Make sure the per-genre output directory exists before saving.
        genre_dir = os.path.join(save_path, genre)
        os.makedirs(genre_dir, exist_ok=True)

        for idx, split in enumerate(spec_splits):
            image_name = id_string + "_" + str(idx+1) + ".png"
            image_path = os.path.join(genre_dir, image_name)
            save_spectrogram_as_png(split, image_path, sr, nfft, hop_length)

        # Position-based check replaces a per-track membership scan of an
        # array of checkpoint ids.
        if position % gc_interval == 0:
            gc.collect()
    return
|
123 |
+
|
124 |
+
|
125 |
+
def image_transformer(dataset, mode):
    """Convert every image under the "image" key to another mode.

    The generated PNGs are usually RGBA. This function can convert them to
    RGB or grayscale, among others. The mapping is updated in place and also
    returned.

    Args:
        dataset: Mapping whose "image" entry is an iterable of objects with
            a ``convert(mode)`` method.
        mode (str): Mode to convert images to. Ex: "RGB", "L" for grayscale.

    Returns:
        The same ``dataset`` object, with "image" replaced by a list of the
        converted images.
    """
    converted = []
    for picture in dataset["image"]:
        converted.append(picture.convert(mode))
    dataset["image"] = converted
    return dataset
|
requirements.txt
CHANGED
@@ -2,5 +2,6 @@ gradio
|
|
2 |
librosa
|
3 |
numpy
|
4 |
hopsworks
|
|
|
5 |
tensorflow==2.15.0
|
6 |
keras==3.0.2
|
|
|
2 |
librosa
|
3 |
numpy
|
4 |
hopsworks
|
5 |
+
datasets
|
6 |
tensorflow==2.15.0
|
7 |
keras==3.0.2
|