# Hugging Face Space — Pop2Piano demo: transcribe pop audio to piano MIDI
# and render it with selectable SoundFont instruments.
# Fix: stripped the " | |" table-extraction artifacts that made every line
# syntactically invalid, and grouped imports stdlib-first per convention.
from base64 import b64encode
from os import listdir as os_listdir
from shutil import unpack_archive

import gradio as gr
from librosa import load as librosa_load
from numpy import abs as np_abs, int16 as np_int16, max as np_max
from pydub import AudioSegment
from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor

# One-time setup: load the Pop2Piano model/processor on CPU and unpack the
# bundled SoundFont archive shipped with the Space.
model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano").to("cpu")
processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano")
unpack_archive("soundfonts.zip", "soundfonts")
# Dropdown choices: soundfont file names with the ".sf2" suffix removed.
soundfonts = [name.removesuffix(".sf2") for name in os_listdir("soundfonts")]
def librosa_to_audiosegment(y, sr):
    """Convert a mono float waveform into a 16-bit mono pydub AudioSegment.

    Args:
        y: 1-D numpy array of audio samples (any float/int scale).
        sr: sample rate in Hz for the resulting segment.

    Returns:
        AudioSegment with the signal peak-normalized to the int16 range
        (silent input is passed through unscaled).
    """
    # Fix: compute the peak once instead of twice (original evaluated
    # np_max(np_abs(y)) in both the condition and the normalization).
    peak = np_max(np_abs(y))
    if peak > 0:
        # Epsilon keeps the scaled value strictly below 32768 so the
        # int16 cast cannot overflow at the exact peak sample.
        y = y / (peak + 1e-8) * 32767
    return AudioSegment(y.astype(np_int16).tobytes(), frame_rate=sr, sample_width=2, channels=1)
def inference(file_upload, composer, sf2_files, volume=-16):
    """Transcribe a pop track to piano MIDI and render it over the original.

    Args:
        file_upload: filesystem path of the uploaded audio file.
        composer: integer (1-21) selecting the Pop2Piano composer token.
        sf2_files: soundfont names (without ".sf2") chosen in the dropdown.
        volume: gain in dB applied to the original track before overlaying
            the rendered instruments (default -16 dB ducks the original).

    Returns:
        Tuple of (midi_path, mp3_path, html_viewer) for the Gradio outputs.
    """
    sf2_paths = ["soundfonts/" + name + ".sf2" for name in sf2_files]
    audio_data, audio_sr = librosa_load(file_upload, sr=None)
    inputs = processor(audio=audio_data, sampling_rate=audio_sr, return_tensors="pt").to("cpu")
    midi = processor.batch_decode(
        token_ids=model.generate(input_features=inputs["input_features"], composer="composer" + str(composer)),
        feature_extractor_output=inputs,
    )["pretty_midi_objects"][0]
    # Fix: close the MIDI file handle after writing (original leaked it).
    with open("output.mid", "wb") as midi_file:
        midi.write(midi_file)
    final_mix = librosa_to_audiosegment(audio_data, audio_sr).apply_gain(volume)
    for sf2_path in sf2_paths:
        sf_audio = midi.fluidsynth(fs=44100, sf2_path=sf2_path)
        # librosa_to_audiosegment peak-normalizes internally, so fluidsynth's
        # float output can be passed directly (the original performed a
        # redundant second int16 normalization here with the same result).
        segment = librosa_to_audiosegment(sf_audio, 44100)
        # Pad with silence or trim so each rendering matches the mix length.
        if len(segment) < len(final_mix):
            segment = segment.append(AudioSegment.silent(duration=len(final_mix) - len(segment)))
        elif len(segment) > len(final_mix):
            segment = segment[:len(final_mix)]
        final_mix = final_mix.overlay(segment)
    final_mix.export("output.mp3", format="mp3")
    # Fix: both file handles are closed (original left them open); the MIDI
    # bytes are base64-embedded into the bundled HTML visualizer template.
    with open("midi_viz.html") as template, open("output.mid", "rb") as midi_bytes:
        viz = template.read().replace("{midi_data}", b64encode(midi_bytes.read()).decode("utf-8"))
    return "output.mid", "output.mp3", f'<div style="display: flex; justify-content: center; align-items: center;"><iframe style="width: 100%; height: 500px; overflow:hidden" srcdoc=\'{viz}\'></iframe></div>'
# Wire the demo UI: audio upload + composer number + soundfont multi-select
# in; MIDI file, rendered audio, and an embedded HTML viewer out.
demo_inputs = [
    gr.Audio(sources="upload", type="filepath", label="Audio"),
    gr.Number(1, minimum=1, maximum=21, label="Composer"),
    gr.Dropdown(soundfonts, multiselect=True, label="Instrument"),
]
demo_outputs = [
    gr.File(label="MIDI"),
    gr.Audio(label="Instrument Audio"),
    gr.HTML(),
]
gr.Interface(inference, demo_inputs, demo_outputs).launch()