"""Gradio app: transcribe a pop song to piano MIDI with Pop2Piano, then
re-render the MIDI through user-selected SoundFont instruments and mix the
renderings over the (attenuated) original recording."""

from base64 import b64encode
from os import listdir
from shutil import unpack_archive

import gradio as gr
import numpy as np
from librosa import load as librosa_load
from pydub import AudioSegment
from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor

# Model and processor are loaded once at startup; inference runs on CPU.
model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano").to("cpu")
processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano")

# Unpack the bundled SoundFonts; the dropdown shows basenames without ".sf2".
unpack_archive("soundfonts.zip", "soundfonts")
soundfonts = [name.removesuffix(".sf2") for name in listdir("soundfonts")]


def librosa_to_audiosegment(y, sr):
    """Convert a mono float waveform (librosa convention) to a 16-bit pydub AudioSegment.

    The signal is peak-normalized to the full int16 range; all-zero (silent)
    input is left untouched to avoid a division by zero.

    Parameters
    ----------
    y : np.ndarray
        Mono waveform samples.
    sr : int
        Sample rate in Hz.
    """
    peak = np.max(np.abs(y))  # hoisted: original evaluated this expression twice
    if peak > 0:
        y = y / (peak + 1e-8) * 32767
    return AudioSegment(
        y.astype(np.int16).tobytes(), frame_rate=sr, sample_width=2, channels=1
    )


def inference(file_upload, composer, sf2_files, volume=-16):
    """Transcribe `file_upload` to piano MIDI and overlay SoundFont renderings.

    Parameters
    ----------
    file_upload : str
        Path to the uploaded audio file (gr.Audio with type="filepath").
    composer : int | float
        Composer style index 1-21; gr.Number may deliver a float, so it is
        cast to int before building the "composerN" token.
    sf2_files : list[str] | None
        Selected soundfont basenames (without ".sf2"); None/empty tolerated.
    volume : int
        Gain in dB applied to the original audio before overlaying.

    Returns
    -------
    tuple[str, str, str]
        (MIDI file path, mixed MP3 path, HTML download link for the MIDI).
    """
    # BUG FIX: a multiselect Dropdown yields None when nothing is selected;
    # the original comprehension would raise TypeError on it.
    sf2_paths = [f"soundfonts/{name}.sf2" for name in (sf2_files or [])]

    audio_data, audio_sr = librosa_load(file_upload, sr=None)
    inputs = processor(audio=audio_data, sampling_rate=audio_sr, return_tensors="pt").to("cpu")
    # BUG FIX: gr.Number returns a float by default, so str(composer) could
    # produce "composer1.0" — cast to int to form a valid composer token.
    token_ids = model.generate(
        input_features=inputs["input_features"],
        composer="composer" + str(int(composer)),
    )
    midi = processor.batch_decode(
        token_ids=token_ids, feature_extractor_output=inputs
    )["pretty_midi_objects"][0]
    # BUG FIX: the original passed an open file object it never closed;
    # pretty_midi's write() accepts a path directly.
    midi.write("output.mid")

    final_mix = librosa_to_audiosegment(audio_data, audio_sr).apply_gain(volume)
    for sf2_path in sf2_paths:
        # fluidsynth returns a float waveform; librosa_to_audiosegment already
        # peak-normalizes it, so the original's manual int16 pre-conversion
        # (a redundant second normalization pass) is dropped.
        rendered = librosa_to_audiosegment(
            midi.fluidsynth(fs=44100, sf2_path=sf2_path), 44100
        )
        # Pad with silence or trim so the rendering matches the mix length.
        if len(rendered) < len(final_mix):
            # BUG FIX: append() defaults to a 100 ms crossfade, which shortens
            # the result (and raises when the pad is < 100 ms); disable it.
            rendered = rendered.append(
                AudioSegment.silent(duration=len(final_mix) - len(rendered)),
                crossfade=0,
            )
        elif len(rendered) > len(final_mix):
            rendered = rendered[: len(final_mix)]
        final_mix = final_mix.overlay(rendered)

    final_mix.export("output.mp3", format="mp3")

    # NOTE(review): the original return f-string was truncated to a bare
    # newline in this copy of the file; b64encode is imported but otherwise
    # unused, so it presumably built a data-URI download link for the MIDI.
    # Reconstructed below — confirm against the intended markup.
    with open("output.mid", "rb") as midi_file:
        midi_b64 = b64encode(midi_file.read()).decode()
    download_link = (
        f'<a href="data:audio/midi;base64,{midi_b64}" '
        f'download="output.mid">Download MIDI</a>'
    )
    return "output.mid", "output.mp3", download_link


gr.Interface(
    inference,
    [
        gr.Audio(sources="upload", type="filepath", label="Audio"),
        # precision=0 makes gr.Number deliver an int, matching the cast above.
        gr.Number(1, minimum=1, maximum=21, precision=0, label="Composer"),
        gr.Dropdown(soundfonts, multiselect=True, label="Instrument"),
    ],
    [
        gr.File(label="MIDI"),
        gr.Audio(label="Instrument Audio"),
        gr.HTML(),
    ],
).launch()