DurreSudoku committed on
Commit
e2eef75
·
verified ·
1 Parent(s): fcfac87

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +128 -0
  2. functions.py +138 -0
  3. requirements.txt +1 -0
app.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import keras
3
+ import librosa
4
+ import hopsworks
5
+ import os
6
+ import numpy as np
7
+ import shutil
8
+ from functions import log_mel_spectrogram, split_spectrogram, load_audio_file, image_transformer, save_spectrogram_as_png
9
+ from datasets import load_dataset
10
+
11
def empty_string():
    """Return an empty string.

    Wired to the submit button so the answer box is blanked when a new
    prediction is requested.
    """
    return str()
13
+
14
def create_image_folder(folder):
    """Make sure the directory ``folder`` exists.

    Args:
        folder (str): Path of the directory to create. Missing parent
            directories are created as well.

    Raises:
        OSError: If the directory cannot be created (for example because of
            missing permissions, or because ``folder`` exists as a file).
    """
    # exist_ok=True makes the call idempotent without hiding real failures;
    # the previous bare ``except`` silently swallowed every error.
    os.makedirs(folder, exist_ok=True)
20
+
21
def delete_folder(folder):
    """Remove the directory ``folder`` together with everything inside it.

    Cleanup is best-effort: a folder that does not exist is ignored, and any
    other filesystem error is reported on stdout instead of being raised.

    Args:
        folder (str): Path of the directory to delete.
    """
    try:
        shutil.rmtree(folder)
    except FileNotFoundError:
        # Nothing to clean up.
        pass
    except OSError as err:
        # Keep the caller alive, but do not hide the problem completely.
        print(f"Could not delete folder {folder!r}: {err}")
27
+
28
+
29
def create_dataset(image_folder):
    """Build a TensorFlow dataset from the images stored in ``image_folder``.

    The folder is loaded with Hugging Face ``load_dataset``, the "train" split
    is selected, every image is converted to mode "L" through
    ``image_transformer`` and the "image" column is exposed as a TensorFlow
    dataset with a batch size of 1.

    Args:
        image_folder (str): Directory containing the spectrogram images.

    Returns:
        The TensorFlow dataset produced by ``Dataset.to_tf_dataset``.
    """
    all_splits = load_dataset(image_folder, split=None)
    train_images = all_splits["train"]
    print(train_images)

    converted_images = train_images.map(
        image_transformer,
        batched=True,
        fn_kwargs={"mode": "L"},
    )
    return converted_images.to_tf_dataset(batch_size=1, columns="image")
37
+
38
+
39
def majority_vote(raw_predictions):
    """Pick the genre predicted most often across all spectrogram splits.

    Args:
        raw_predictions: 2-D array of model outputs; the arg-max along axis 1
            is taken as the predicted label of each row.

    Returns:
        str: Name of the winning label, looked up in ``label_decoding``.
        On a tie the lowest label index wins, because ``np.unique`` returns
        sorted labels and ``argmax`` returns the first maximum.
    """
    votes = np.argmax(raw_predictions, axis=1)
    candidates, tallies = np.unique(votes, return_counts=True)
    top_label = candidates[tallies.argmax()]
    return label_decoding[top_label]
45
+
46
def predict(audio):
    """Predict the genre of an uploaded audio file.

    The audio is converted to a log-mel spectrogram, cut into fixed-width
    splits, each split is written as a PNG into ``folder``, the model
    classifies every image and the final answer is a majority vote.
    All settings (``sample_rate``, ``nfft``, ``output_shape``, ``folder`` ...)
    and the ``model`` are read from module level.

    Args:
        audio (str | None): Path of the uploaded audio file, or ``None`` when
            nothing was submitted.

    Returns:
        str: Message for the answer box, either the predicted genre or an
        explanation of why no prediction could be made.
    """
    load_error = "Error when loading audio. Did you submit a file?"
    if audio is None:
        return load_error
    try:
        audio_array = load_audio_file(audio, sample_rate, res_type, duration)
    except Exception:
        # Unreadable or unsupported file: report it to the user instead of
        # crashing. ``Exception`` (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        return load_error

    spectrogram = log_mel_spectrogram(audio_array, sample_rate, nfft, hop_length, window)
    spec_splits = split_spectrogram(spectrogram, output_shape)
    if not spec_splits:
        # Audio shorter than a single split produces no images; loading an
        # empty image folder would otherwise fail further down.
        return "The submitted audio is too short to classify. Please upload a longer file."

    # Start from an empty folder so images left over from an earlier request
    # cannot take part in this vote.
    delete_folder(folder)
    create_image_folder(folder)
    try:
        for idx, split in enumerate(spec_splits, start=1):
            save_path = os.path.join(folder, f"{idx}_spec.png")
            save_spectrogram_as_png(split, save_path, sample_rate, nfft, hop_length)

        image_dataset = create_dataset(folder)
        raw_preds = model.predict(image_dataset, verbose=0)
    finally:
        # Always remove the temporary images, even if prediction failed.
        delete_folder(folder)

    genre_pred = majority_vote(raw_preds)
    return f"The genre of the submitted audio is {genre_pred}!"
64
+
65
# --- Audio loading / spectrogram settings used by predict() ---
sample_rate = 22050          # sample rate every upload is resampled to
res_type = "kaiser_fast"     # resampling method handed to librosa
nfft = 2048                  # samples per FFT window
hop_length = 512             # hop length of the STFT
window = "hann"              # STFT window function
output_shape = (128, 256)    # shape of each spectrogram split
duration = 0                 # 0 keeps the original audio length (no pad/cut)
folder = "images"            # directory the spectrogram PNGs are written to

# Maps the model's output index to a genre name.
label_decoding = dict(enumerate([
    "Electronic",
    "Experimental",
    "Folk",
    "Hip-Hop",
    "Instrumental",
    "International",
    "Pop",
    "Rock",
]))

# The classifier is loaded once at import time from a local file.
model_path = "best_model.keras"
model = keras.models.load_model(model_path)


# Disabled alternative: download the model from the Hopsworks model registry.
# Kept as an inert string literal so it is never executed.
"""
model_version = 1
project = hopsworks.login()
mr = project.get_model_registry()
model = mr.get_model("cnn_genre_classifier", version=model_version)

model_dir = model.download()

model = keras.models.load_model(model_dir)
"""
97
+
98
+
99
+
100
# Gradio front end: an upload widget, a read-only answer box and a submit
# button that triggers the prediction pipeline.
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown(
            """
            # Music Genre Classifier

            Hello!

            This is a prototype for a genre classification service, where you can upload an audio file,
            and the model will predict which genre it belongs to!

            The model has been trained to predict 8 top-level genres, that each encompasses a multitude of sub-genres.

            Upload your favorite song and give it a try!
            """
        )
    with gr.Row():
        with gr.Column():
            audio = gr.Audio(sources="upload", type="filepath", label="Upload your song here", format="mp3")
        with gr.Column():
            answer_box = gr.Text(label="Answer appears here", interactive=False)
    with gr.Row():
        submit_audio = gr.Button("Submit audio for prediction")

    # Run the three steps strictly one after another. Separate ``.click``
    # registrations have no guaranteed order, so the answer could be blanked
    # after it was written or the images deleted while still being read.
    # ``delete_folder`` needs its argument, which a listener without inputs
    # would not supply, hence the lambda.
    submit_audio.click(
        fn=empty_string, outputs=answer_box, trigger_mode="once"
    ).then(
        fn=predict, inputs=audio, outputs=answer_box
    ).then(
        fn=lambda: delete_folder(folder)
    )


demo.launch(share=True)
functions.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import numpy as np
3
+ import os
4
+ import gc
5
+ import matplotlib
6
+ import matplotlib.pyplot as plt
7
+
8
def load_audio_file(path, sample_rate=22050, resampling_type="kaiser_fast", duration=30):
    """Load an audio file as a mono numpy array using the librosa library.

    Args:
        path (str): Path to audio file.
        sample_rate (int, optional): Sample rate to resample audio file to.
            None uses the file's original sample rate. Defaults to 22050.
        resampling_type (str, optional): Method to use for resampling.
            Defaults to "kaiser_fast".
        duration (int, optional): Length in seconds to pad/shorten the audio
            to. 0 returns the original audio length. Defaults to 30.

    Returns:
        numpy.array: Audio file as numpy array.
    """
    # Load an audio file with librosa. Resamples the file to a specified sample rate.
    audio_array, loaded_rate = librosa.load(path, sr=sample_rate, mono=True, res_type=resampling_type)
    if duration > 0:
        # Use the rate librosa actually returned (it differs from the argument
        # when sample_rate is None) and honour the requested duration instead
        # of a hard-coded 30 seconds.
        audio_array = pad_audio(audio_array, int(loaded_rate), duration)
    return audio_array
27
+
28
def add_noise(audio_array, std):
    """Return the signal with zero-mean Gaussian noise added.

    Args:
        audio_array (numpy.array): Input signal.
        std (float): Standard deviation of the noise.

    Returns:
        numpy.array: ``audio_array`` plus noise of the same shape.
    """
    return audio_array + np.random.normal(loc=0, scale=std, size=audio_array.shape)
31
+
32
def pad_audio(audio_array, sample_rate=22050, duration=30):
    """Pad or truncate an audio signal to exactly ``duration`` seconds.

    Args:
        audio_array (numpy.array): 1-D audio signal.
        sample_rate (int, optional): Samples per second of ``audio_array``.
            Defaults to 22050.
        duration (int, optional): Target length in seconds. Defaults to 30.

    Returns:
        numpy.array: Signal of length ``duration * sample_rate``. Shorter
        inputs are zero-padded on both sides, longer inputs are cut at the end.
    """
    target_len = int(round(duration * sample_rate))
    audio_len = audio_array.size

    if audio_len < target_len:
        missing = target_len - audio_len
        pad_left = missing // 2
        # Put the extra sample on the right when ``missing`` is odd; padding
        # both sides with ``missing // 2`` would leave the result one short.
        audio_array = np.pad(audio_array, (pad_left, missing - pad_left))
    elif audio_len > target_len:
        audio_array = audio_array[:target_len]
    return audio_array
43
+
44
def log_mel_spectrogram(audio_array, sr=22050, nfft=2048, hop_length=512, window="hann"):
    """Compute the log-scaled mel spectrogram of an audio signal.

    Args:
        audio_array (numpy.array): Audio signal.
        sr (int, optional): Sample rate of the signal. Defaults to 22050.
        nfft (int, optional): FFT size, also used as the window length.
            Defaults to 2048.
        hop_length (int, optional): Hop length of the STFT. Defaults to 512.
        window (str, optional): Window function. Defaults to "hann".

    Returns:
        numpy.array: Mel power spectrogram converted to decibels, relative to
        its maximum value.
    """
    mel_power = librosa.feature.melspectrogram(
        y=audio_array,
        sr=sr,
        n_fft=nfft,
        hop_length=hop_length,
        win_length=nfft,
        window=window,
    )
    return librosa.power_to_db(mel_power, ref=np.max)
50
+
51
def split_spectrogram(spectrogram, output_shape=(128, 256)):
    """Split a spectrogram into equal-width chunks along the column axis.

    Only ``output_shape[1]`` (the chunk width) is used. Trailing columns that
    do not fill a complete chunk are dropped.

    Args:
        spectrogram (numpy.array): 2-D spectrogram.
        output_shape (tuple, optional): Shape of each split; the second entry
            is the number of columns per chunk. Defaults to (128, 256).

    Returns:
        list: Consecutive, non-overlapping column slices of ``spectrogram``.
        Empty when the spectrogram is narrower than one chunk.

    Raises:
        ValueError: If the chunk width is not positive (the loop-based
            version never terminated in that case).
    """
    chunk_width = output_shape[1]
    if chunk_width <= 0:
        raise ValueError(f"Chunk width must be positive, got {chunk_width}.")

    total_cols = spectrogram.shape[1]
    # Last valid start is total_cols - chunk_width, hence the + 1 on the stop.
    return [
        spectrogram[:, start:start + chunk_width]
        for start in range(0, total_cols - chunk_width + 1, chunk_width)
    ]
60
+
61
def save_spectrogram_as_png(spectrogram, save_path, sample_rate=22050, nfft=2048, hop_length=512):
    """Render a spectrogram without axes or margins and save it as an image.

    The figure size is the spectrogram shape divided by 100 (in inches), so at
    matplotlib's default of 100 dpi the image has one pixel per spectrogram
    cell.

    Args:
        spectrogram (numpy.array): 2-D spectrogram to draw.
        save_path (str): File path the image is written to.
        sample_rate (int, optional): Sample rate passed to ``specshow``.
            Defaults to 22050.
        nfft (int, optional): FFT size passed to ``specshow``. Defaults to 2048.
        hop_length (int, optional): Hop length passed to ``specshow``.
            Defaults to 512.
    """
    # Explicit submodule import: librosa releases that do not load
    # ``librosa.display`` on attribute access would otherwise fail here.
    import librosa.display

    n_rows, n_cols = spectrogram.shape[0], spectrogram.shape[1]
    fig, ax = plt.subplots(1, 1, figsize=(n_cols / 100, n_rows / 100))
    try:
        fig.subplots_adjust(top=1.0, bottom=0, right=1.0, left=0, hspace=0, wspace=0)
        ax.set_axis_off()
        librosa.display.specshow(data=spectrogram, sr=sample_rate, n_fft=nfft, hop_length=hop_length, ax=ax)
        # Save through the figure object rather than pyplot's "current figure".
        fig.savefig(save_path, bbox_inches=None, pad_inches=0)
    finally:
        # Always release the figure, even when drawing or saving fails, so
        # repeated calls do not accumulate open figures.
        plt.close(fig)
    return
70
+
71
def extract_features(df, audio_dir, save_path,
                     sr=22050, rs_type="kaiser_fast",
                     output_shape=(128, 256), duration=30,
                     nfft=2048, hop_length=512, window="hann", checkpoint_id=0):
    """
    Loads audio files, computes log-mel-spectrogram and saves it as png.

    Audio files are expected at ``audio_dir/<first 3 digits>/<6-digit id>.mp3``
    and images are written to ``save_path/<genre>/<id>_<split number>.png``.

    Args:
        df (DataFrame): DataFrame indexed by track id with a "genre_top" column. Should only contain samples from specific data split (train/val/test).
        audio_dir (str): Directory containing all audio files.
        save_path (str): Path to where spectrograms will be saved.
        sr (int): Sampling rate to set for all loaded audio files.
        rs_type (str): Resampling method used when loading audio file to specific sampling rate.
        output_shape (tuple): Shape of each spectrogram split.
        duration (int): Set to standardize length of all audio files. Longer or shorter will be cut or padded respectively.
        nfft (int): Number of samples for every fft window.
        hop_length (int): Hop length to use for STFT.
        window (str): Window function to use for STFT.
        checkpoint_id (int, optional): Write the id of a track to start from there. Defaults to 0.
    """
    # Non-interactive backend: figures are only written to disk.
    matplotlib.use("Agg")

    if int(checkpoint_id) > 0:
        df = df.loc[checkpoint_id:]

    # to_numpy() instead of ravel(), which newer pandas releases deprecate.
    id_list = df.index.to_numpy()
    genre_list = df["genre_top"].to_numpy()

    # Due to some weird memory leak, garbage collection is manually performed every 10% of progress.
    # The interval is clamped to 1: with fewer than 10 tracks it would be 0
    # and a slice step of 0 raises ValueError.
    gc_interval = max(1, len(id_list) // 10)
    # A set gives O(1) membership tests inside the loop.
    gc_checkpoints = set(id_list[::gc_interval])

    for track_id, genre in zip(id_list, genre_list):
        id_string = str(track_id).rjust(6, "0")
        filename = id_string + ".mp3"
        folder_name = filename[:3]
        file_path = os.path.join(audio_dir, folder_name, filename)

        print(id_string, end=" ")
        audio = load_audio_file(file_path, sr, rs_type, duration)

        spectrogram = log_mel_spectrogram(audio, sr=sr, nfft=nfft, hop_length=hop_length, window=window)
        spec_splits = split_spectrogram(spectrogram, output_shape)

        # Make sure the per-genre output directory exists before saving.
        genre_dir = os.path.join(save_path, genre)
        os.makedirs(genre_dir, exist_ok=True)

        for idx, split in enumerate(spec_splits, start=1):
            image_name = id_string + "_" + str(idx) + ".png"
            image_path = os.path.join(genre_dir, image_name)
            save_spectrogram_as_png(split, image_path, sr, nfft, hop_length)

        if track_id in gc_checkpoints:
            gc.collect()
    return
123
+
124
+
125
def image_transformer(dataset, mode):
    """
    Convert the images of a Huggingface Dataset batch to a different mode.
    The generated PNGs are usually RGBA. This function can convert them to RGB, grayscale among others.

    Args:
        dataset (object): Batch whose "image" entry is a sequence of images
            offering a ``convert`` method.
        mode (str): String specifying mode to convert images to. Ex: "RGB", "L" for grayscale.

    Returns:
        object: The same ``dataset`` object, with "image" replaced by the
        converted images.
    """
    converted = []
    for picture in dataset["image"]:
        converted.append(picture.convert(mode))
    dataset["image"] = converted
    return dataset
requirements.txt CHANGED
@@ -2,5 +2,6 @@ gradio
2
  librosa
3
  numpy
4
  hopsworks
 
5
  tensorflow==2.15.0
6
  keras==3.0.2
 
2
  librosa
3
  numpy
4
  hopsworks
5
+ datasets
6
  tensorflow==2.15.0
7
  keras==3.0.2