Spaces:

DurreSudoku
/

Genre_Classifier

Sleeping

App Files Files Community

DurreSudoku commited on Jan 12, 2024

Commit

e2eef75

verified ·

1 Parent(s): fcfac87

Upload 3 files

Browse files

Files changed (3) hide show

app.py +128 -0
functions.py +138 -0
requirements.txt +1 -0

app.py ADDED Viewed

	@@ -0,0 +1,128 @@

+import gradio as gr
+import keras
+import librosa
+import hopsworks
+import os
+import numpy as np
+import shutil
+from functions import log_mel_spectrogram, split_spectrogram, load_audio_file, image_transformer, save_spectrogram_as_png
+from datasets import load_dataset
+def empty_string():
+    return ""
+def create_image_folder(folder):
+    try:
+        os.mkdir(folder)
+    except:
+        FileExistsError()
+    return
+def delete_folder(folder):
+    try:
+        shutil.rmtree(folder)
+    except:
+        FileNotFoundError()
+    return
+def create_dataset(image_folder):
+    image_dataset = load_dataset(image_folder, split=None)["train"]
+    print(image_dataset)
+    image_dataset = image_dataset.map(image_transformer, batched=True, fn_kwargs={"mode": "L"})
+    image_dataset_tf = image_dataset.to_tf_dataset(batch_size=1, columns="image")
+    return image_dataset_tf
+def majority_vote(raw_predictions):
+    label_predictions = np.argmax(raw_predictions, axis=1)
+    labels, count = np.unique(label_predictions, return_counts=True)
+    winner = labels[np.argmax(count)]
+    return label_decoding[winner]
+def predict(audio):
+    create_image_folder(folder)
+    try:
+        audio_array = load_audio_file(audio, sample_rate, res_type, duration)
+    except:
+        return "Error when loading audio. Did you submit a file?"
+    spectrogram = log_mel_spectrogram(audio_array, sample_rate, nfft, hop_length, window)
+    spec_splits = split_spectrogram(spectrogram, output_shape)
+    for idx, split in enumerate(spec_splits):
+        save_path = os.path.join(folder, f"{idx+1}_spec.png")
+        save_spectrogram_as_png(split, save_path, sample_rate, nfft, hop_length)
+    image_dataset = create_dataset(folder)
+    raw_preds = model.predict(image_dataset, verbose=0)
+    genre_pred = majority_vote(raw_preds)
+    return f"The genre of the submitted audio is {genre_pred}!"
+sample_rate = 22050
+res_type = "kaiser_fast"
+nfft = 2048
+hop_length = 512
+window = "hann"
+output_shape = (128, 256)
+duration = 0
+folder = "images"
+label_decoding = {0: "Electronic",
+                  1: "Experimental",
+                  2: "Folk",
+                  3: "Hip-Hop",
+                  4: "Instrumental",
+                  5: "International",
+                  6: "Pop",
+                  7: "Rock"}
+model_path = "best_model.keras"
+model = keras.models.load_model(model_path)
+"""
+model_version = 1
+project = hopsworks.login()
+mr = project.get_model_registry()
+model = mr.get_model("cnn_genre_classifier", version=model_version)
+model_dir = model.download()
+model = keras.models.load_model(model_dir)
+"""
+with gr.Blocks() as demo:
+    with gr.Row():
+        gr.Markdown(
+    """
+    # Music Genre Classifier
+    Hello!
+    This is a prototype for a genre classification service, where you can upload an audio file,
+    and the model will predict which genre it belongs to!
+    The model has been trained to predict 8 top-level genres, that each encompasses a multitude of sub-genres.
+    Upload your favorite song and give it a try!
+    """
+        )
+    with gr.Row():
+        with gr.Column():
+            audio = gr.Audio(sources="upload", type="filepath", label="Upload your song here", format="mp3")
+        with gr.Column():
+            answer_box = gr.Text(label="Answer appears here", interactive=False)
+    with gr.Row():
+        submit_audio = gr.Button("Submit audio for prediction")
+        submit_audio.click(fn=empty_string, outputs=answer_box)
+        submit_audio.click(fn=predict, inputs=audio, outputs=answer_box, trigger_mode="once")
+        submit_audio.click(fn=delete_folder)
+demo.launch(share=True)

functions.py ADDED Viewed

	@@ -0,0 +1,138 @@

+import librosa
+import numpy as np
+import os
+import gc
+import matplotlib
+import matplotlib.pyplot as plt
+def load_audio_file(path, sample_rate=22050, resampling_type="kaiser_fast", duration=30):
+    """Load an audio file as a numpy string using Librosa library.
+    Args:
+        path (str): Path to audio file.
+        sample_rate (int, optional): Sample rate to resample audio file to.
+            "None" uses the file's original sample rate. Defaults to 44100.
+        resampling_type (str, optional): Method to use for resampling. Defaults to "kaiser_fast".
+        duration (int, optional): Length to pad/shorten audio files to.
+            0 returns original audio length. Defaults to 30.
+    Returns:
+        numpy.array: Audio file as numpy array.
+    """
+    # Load an audio file with librosa. Resamples the file to a specified sample rate.
+    audio_array, _ = librosa.load(path, sr=sample_rate, mono=True, res_type=resampling_type)
+    if duration > 0:
+        audio_array = pad_audio(audio_array, sample_rate, 30)
+    return audio_array
+def add_noise(audio_array, std):
+    noise = np.random.normal(0, std, audio_array.shape)
+    return audio_array + noise
+def pad_audio(audio_array, sample_rate=22050, duration=30):
+    # If audio array is shorter than 30s*sample rate -> pad
+    # If audio array is longer than 30s*sample rate -> shorten
+    duration_samples = duration * sample_rate
+    audio_len = audio_array.size
+    if audio_len < duration_samples:
+        audio_array = np.pad(audio_array, (duration_samples - audio_len)//2)
+    elif audio_len > duration_samples:
+        audio_array = audio_array[:duration_samples]
+    return audio_array
+def log_mel_spectrogram(audio_array, sr=22050, nfft=2048, hop_length=512, window="hann"):
+    S = librosa.feature.melspectrogram(y=audio_array, sr=sr, n_fft=nfft,
+                                       hop_length=hop_length, win_length=nfft,
+                                       window=window)
+    S_db = librosa.power_to_db(S, ref=np.max)
+    return S_db
+def split_spectrogram(spectrogram, output_shape=(128, 256)):
+    # Split spectrogram into equal chunks along the column axis.
+    splits = []
+    col_idx = 0
+    while col_idx + output_shape[1] <= spectrogram.shape[1]:
+        spec_split = spectrogram[:, col_idx:col_idx+output_shape[1]]
+        splits.append(spec_split)
+        col_idx += output_shape[1]
+    return splits
+def save_spectrogram_as_png(spectrogram, save_path, sample_rate=22050, nfft=2048, hop_length=512):
+    shape = spectrogram.shape
+    fig, ax = plt.subplots(1, 1, figsize=(shape[1]/100, shape[0]/100))
+    fig.subplots_adjust(top=1.0, bottom=0, right=1.0, left=0, hspace=0, wspace=0)
+    ax.set_axis_off()
+    librosa.display.specshow(data=spectrogram, sr=sample_rate, n_fft=nfft, hop_length=hop_length, ax=ax)
+    plt.savefig(save_path, bbox_inches=None, pad_inches=0)
+    plt.close(fig)
+    return
+def extract_features(df, audio_dir, save_path,
+                     sr=22050, rs_type="kaiser_fast",
+                     output_shape=(128,256), duration=30
+                     , nfft=2048, hop_length=512, window="hann", checkpoint_id=0):
+    """
+    Loads audio files, computes log-mel-spectrogram and saves it as png.
+    Args:
+        df (_type_): DataFrame containing ids and genres. Should only contain samples from specific data split (train/val/test).
+        audio_dir (_type_): Directory containing all audio files.
+        save_path (_type_): Path to where spectrograms will be saved.
+        sr (_type_): Sampling rate to set for all loaded audio files.
+        rs_type (_type_): Resampling method used when loading audio file to specific sampling rate.
+        output_shape (_type_): Shape of each spectrogram split.
+        duration (_type_): Set to standardize length of all audio files. Longer or shorter will be cut or padded respectively.
+        nfft (_type_): Number of samples for every fft window.
+        hop_length (_type_): Hop length to use for STFT.
+        window (_type_): Window function to use for STFT.
+        checkpoint_id (int, optional): Write the id of a track to start from there. Defaults to 0.
+    """
+    matplotlib.use("Agg")
+    if int(checkpoint_id) > 0:
+        df = df.loc[checkpoint_id:]
+    id_list = df.index.ravel()
+    genre_list = df["genre_top"].ravel()
+    # Due to some weird memory leak, garbage collection is manually performed every 10% of progress.
+    gc_interval = int(len(id_list) * 0.1)
+    gc_checkpoints = id_list[::gc_interval]
+    for id, genre in zip(id_list, genre_list):
+        id_string = str(id).rjust(6, "0")
+        filename = id_string + ".mp3"
+        folder_name = filename[:3]
+        file_path = os.path.join(audio_dir, folder_name, filename)
+        print(id_string, end=" ")
+        audio = load_audio_file(file_path, sr, rs_type, duration)
+        spectrogram = log_mel_spectrogram(audio, sr=sr, nfft=nfft, hop_length=hop_length, window=window)
+        spec_splits = split_spectrogram(spectrogram, output_shape)
+        for idx, split in enumerate(spec_splits):
+            image_name = id_string + "_" + str(idx+1) +".png"
+            image_path = os.path.join(save_path, genre, image_name)
+            save_spectrogram_as_png(split, image_path, sr, nfft, hop_length)
+        if id in gc_checkpoints:
+            gc.collect()
+    return
+def image_transformer(dataset, mode):
+    """
+    Convert images from Huggingface Dataset object to different mode.
+    The generated PNGs are usually RGBA. This function can convert them to RGB, grayscale among others.
+    Args:
+        dataset (object): Huggingface Dataset object
+        mode (str): String specifying mode to convert images to. Ex: "RGB", "L" for grayscale.
+    Returns:
+        object: Huggingface Dataset
+    """
+    dataset["image"] = [image.convert(mode) for image in dataset["image"]]
+    return dataset

requirements.txt CHANGED Viewed

@@ -2,5 +2,6 @@ gradio
 librosa
 numpy
 hopsworks
 tensorflow==2.15.0
 keras==3.0.2

 librosa
 numpy
 hopsworks
+datasets
 tensorflow==2.15.0
 keras==3.0.2