import gradio as gr
from transformers import Wav2Vec2FeatureExtractor
from transformers import AutoModel
import torch
import torchaudio
import torchaudio.transforms as T

# input cr: https://huggingface.co/spaces/thealphhamerc/audio-to-text/blob/main/app.py

inputs = [
    gr.components.Audio(type="filepath", label="Add music audio file"),
    gr.components.Audio(source="microphone", type="filepath", label="Or record with the microphone"),
]
outputs = [gr.components.Textbox()]
# outputs = [gr.components.Textbox(), transcription_df]
title = "Output the tags of a (music) audio file"
description = "An example of using MERT-v0-public (95M) to conduct music tagging."
article = ""
audio_examples = [
    ["input/example-1.wav"],
    ["input/example-2.wav"],
]

# Load the model
model = AutoModel.from_pretrained("m-a-p/MERT-v0-public", trust_remote_code=True)
# Load the corresponding preprocessor config
processor = Wav2Vec2FeatureExtractor.from_pretrained("m-a-p/MERT-v0-public", trust_remote_code=True)


def convert_audio(inputs, microphone):
    # Prefer the microphone recording when one is provided
    if microphone is not None:
        inputs = microphone
    waveform, sample_rate = torchaudio.load(inputs)

    # Down-mix to mono: the feature extractor expects a 1-D waveform
    waveform = waveform.mean(dim=0)

    # Make sure the sample rate matches what the model expects
    resample_rate = processor.sampling_rate
    if resample_rate != sample_rate:
        print(f'setting rate from {sample_rate} to {resample_rate}')
        resampler = T.Resample(sample_rate, resample_rate)
        waveform = resampler(waveform)

    inputs = processor(waveform, sampling_rate=resample_rate, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # Take a look at the output shape: there are 13 layers of representation
    # (embedding output + 12 transformer layers). Each layer performs differently
    # on different downstream tasks, so the layer should be chosen empirically
    # (see the commented pooling sketch at the bottom of this file).
    all_layer_hidden_states = torch.stack(outputs.hidden_states).squeeze()
    # print(all_layer_hidden_states.shape)  # [13 layers, time steps, 768 feature_dim]
    return str(all_layer_hidden_states.shape)


# iface = gr.Interface(fn=convert_audio, inputs="audio", outputs="text")
# iface.launch()

audio_chunked = gr.Interface(
    fn=convert_audio,
    inputs=inputs,
    outputs=outputs,
    allow_flagging="never",
    title=title,
    description=description,
    article=article,
    examples=audio_examples,
)

demo = gr.Blocks()
with demo:
    gr.TabbedInterface([audio_chunked], ["Audio File"])
# demo.queue(concurrency_count=1, max_size=5)
demo.launch(show_api=False)
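# Possible extension (not wired into the Gradio UI above): a minimal sketch of how the
# per-layer hidden states could be pooled over the time axis to obtain one fixed-size
# embedding per layer, following the pattern shown on the MERT model card. It assumes
# `all_layer_hidden_states` is the [13, time steps, 768] tensor computed inside
# `convert_audio`; the layer index below is purely illustrative and should be chosen
# empirically for the downstream task.
#
#     time_reduced_hidden_states = all_layer_hidden_states.mean(dim=-2)  # [13, 768]
#     clip_embedding = time_reduced_hidden_states[6]                     # [768], hypothetical layer choice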