import gradio as gr
from transformers import Wav2Vec2FeatureExtractor
from transformers import AutoModel
import torch
from torch import nn
import torchaudio
import torchaudio.transforms as T

# input components adapted from https://huggingface.co/spaces/thealphhamerc/audio-to-text/blob/main/app.py

inputs = [
    gr.components.Audio(type="filepath", label="Add music audio file"),
    gr.components.Audio(source="microphone", type="filepath", label="Or record with microphone"),
]
outputs = [gr.components.Textbox()]
# outputs = [gr.components.Textbox(), transcription_df]
title = "Output the tags of a (music) audio clip"
description = "An example of using MERT-v0-public (a 95M-parameter model) for music tagging."
article = ""
audio_examples = [
    ["input/example-1.wav"],
    ["input/example-2.wav"],
]

# Load the model
model = AutoModel.from_pretrained("m-a-p/MERT-v0-public", trust_remote_code=True)
# loading the corresponding preprocessor config
processor = Wav2Vec2FeatureExtractor.from_pretrained("m-a-p/MERT-v0-public",trust_remote_code=True)


def convert_audio(inputs, microphone):
    # prefer the microphone recording when one is provided
    if microphone is not None:
        inputs = microphone
    if inputs is None:
        return "Please upload or record an audio clip first."

    waveform, sample_rate = torchaudio.load(inputs)

    # mix multi-channel audio down to a single mono waveform
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0)
    else:
        waveform = waveform.squeeze(0)

    resample_rate = processor.sampling_rate

    # make sure the sample rate matches what the feature extractor expects
    if resample_rate != sample_rate:
        print(f'resampling from {sample_rate} Hz to {resample_rate} Hz')
        resampler = T.Resample(sample_rate, resample_rate)
        waveform = resampler(waveform)

    inputs = processor(waveform, sampling_rate=resample_rate, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # inspect the output shape: there are 13 layers of representations
    # (the embedding layer plus 12 transformer layers); each layer performs
    # differently on different downstream tasks, so choose the layer(s) empirically
    all_layer_hidden_states = torch.stack(outputs.hidden_states).squeeze()
    # print(all_layer_hidden_states.shape) # [13 layer, Time steps, 768 feature_dim]
    return str(all_layer_hidden_states.shape)
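
# --- Not part of the original Space: a minimal sketch of how the 13 layers could be
# pooled into a single clip-level feature for a downstream tagging head. The
# time-mean plus learnable Conv1d aggregation follows the usage example on the MERT
# model card; the helper name `pool_layers` is ours, and the aggregator below is
# randomly initialised (in practice it would be trained jointly with the head).
def pool_layers(all_layer_hidden_states: torch.Tensor) -> torch.Tensor:
    # [13, time, 768] -> [13, 768]: average each layer's representation over time
    time_reduced = all_layer_hidden_states.mean(-2)
    # learnable weighted combination of the 13 layers -> [768]
    aggregator = nn.Conv1d(in_channels=13, out_channels=1, kernel_size=1)
    return aggregator(time_reduced.unsqueeze(0)).squeeze()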


# iface = gr.Interface(fn=convert_audio, inputs="audio", outputs="text")
# iface.launch()
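
# --- Sketch only (not in the original Space): the demo currently returns the
# representation shape rather than actual tags. A real tagger would apply a small
# trained head to the pooled feature; `predict_tag_logits` and its default
# `num_tags` are hypothetical placeholders, and the head is untrained here, so its
# outputs are meaningless until fine-tuned.
def predict_tag_logits(all_layer_hidden_states: torch.Tensor, num_tags: int = 50) -> torch.Tensor:
    # untrained linear head over the pooled [768] MERT feature -> [num_tags] logits
    tag_head = nn.Linear(768, num_tags)
    return tag_head(pool_layers(all_layer_hidden_states))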

audio_chunked = gr.Interface(
    fn=convert_audio,
    inputs=inputs,
    outputs=outputs,
    allow_flagging="never",
    title=title,
    description=description,
    article=article,
    examples=audio_examples,
)


demo = gr.Blocks()
with demo:
    gr.TabbedInterface([audio_chunked], ["Audio File"])
# demo.queue(concurrency_count=1, max_size=5)
demo.launch(show_api=False)