import os  # file and directory handling
import torch  # PyTorch deep learning framework
import librosa  # audio loading and processing
import binascii  # hex encoding for random file names
import warnings  # warning messages
import midi2audio  # MIDI-to-WAV rendering via FluidSynth
import numpy as np  # numerical arrays
import pytube as pt  # YouTube audio downloading
import gradio as gr  # interactive web UI
import soundfile as sf  # writing audio files
from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor  # Pop2Piano model and processor

yt_video_dir = "./yt_dir"  # directory for downloaded YouTube audio
outputs_dir = "./midi_wav_outputs"  # directory for generated MIDI and WAV files
os.makedirs(outputs_dir, exist_ok=True)  # create the output directory (no error if it already exists)
os.makedirs(yt_video_dir, exist_ok=True)  # create the YouTube download directory (no error if it already exists)

device = "cuda" if torch.cuda.is_available() else "cpu"  # use the GPU when CUDA is available, otherwise the CPU
model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano").to(device)  # load the pretrained Pop2Piano model
processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano")  # load the pretrained Pop2Piano processor
composers = list(model.generation_config.composer_to_feature_token.keys())  # available composer (arranger) presets
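# Note (environment assumption): midi2audio renders MIDI by shelling out to the
# FluidSynth command-line tool, so the `fluidsynth` binary and a General MIDI
# soundfont need to be available on the host; FluidSynth().midi_to_audio() falls
# back to midi2audio's default soundfont path unless one is passed explicitly.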
def get_audio_from_yt_video(yt_link):
    try:
        yt = pt.YouTube(yt_link)  # create a YouTube object for the link
        t = yt.streams.filter(only_audio=True)  # keep only the audio streams
        filename = os.path.join(yt_video_dir, binascii.hexlify(os.urandom(8)).decode() + ".mp4")  # random file name
        t[0].download(filename=filename)  # download the first audio-only stream
    except Exception:
        warnings.warn(f"Video Not Found at {yt_link}")  # warn when the download fails
        filename = None
    return filename, filename  # same path for the audio preview and the inference input
def inference(file_uploaded, composer):
    waveform, sr = librosa.load(file_uploaded, sr=None)  # load the audio and its native sampling rate
    inputs = processor(audio=waveform, sampling_rate=sr, return_tensors="pt").to(device)  # preprocess the input audio
    model_output = model.generate(input_features=inputs["input_features"], composer=composer)  # generate piano-cover token ids
    tokenizer_output = processor.batch_decode(token_ids=model_output.to("cpu"), feature_extractor_output=inputs.to("cpu"))["pretty_midi_objects"]  # decode the token ids into PrettyMIDI objects
    return prepare_output_file(tokenizer_output, sr)  # write and return the output files
def prepare_output_file(tokenizer_output, sr):
    output_file_name = "output_" + binascii.hexlify(os.urandom(8)).decode()  # random output file name
    midi_output = os.path.join(outputs_dir, output_file_name + ".mid")  # path of the MIDI output file
    tokenizer_output[0].write(midi_output)  # write the MIDI file
    wav_output = midi_output.replace(".mid", ".wav")  # path of the WAV output file
    midi2audio.FluidSynth().midi_to_audio(midi_output, wav_output)  # render the MIDI file to WAV
    return wav_output, wav_output, midi_output  # WAV path (player and download) and MIDI path
def get_stereo(pop_path, midi, pop_scale=0.5):
    pop_y, sr = librosa.load(pop_path, sr=None)  # load the original pop track at its native rate
    midi_y, _ = librosa.load(midi.name, sr=sr)  # load the rendered piano-cover WAV, resampled to the pop track's rate
    if len(pop_y) > len(midi_y):
        midi_y = np.pad(midi_y, (0, len(pop_y) - len(midi_y)))  # pad the cover to the pop track's length
    elif len(pop_y) < len(midi_y):
        pop_y = np.pad(pop_y, (0, len(midi_y) - len(pop_y)))  # pad the pop track to the cover's length
    stereo = np.stack((midi_y, pop_y * pop_scale))  # piano cover on one channel, scaled pop track on the other
    # write the mix into the outputs directory under a fresh name so the uploaded file is never overwritten
    stereo_mix_path = os.path.join(outputs_dir, "output_stereo_mix_" + binascii.hexlify(os.urandom(8)).decode() + ".wav")
    sf.write(file=stereo_mix_path, data=stereo.T, samplerate=sr, format="wav")  # write the stereo mix file
    return stereo_mix_path, stereo_mix_path  # stereo mix path (player and download)
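# Usage sketch (commented out; the Gradio UI below is what actually drives these
# functions). Assuming "song.mp3" is a local pop recording:
#
#   wav_path, _, midi_path = inference("song.mp3", composer="composer1")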
block = gr.Blocks(theme="Taithrah/Deep")  # create the Gradio Blocks app

with block:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 800px; margin: 0 auto;">
          <div
            style="
              display: inline-flex;
              align-items: center;
              gap: 0.8rem;
              font-size: 1.75rem;
            "
          >
            <h1 style="font-weight: 900; margin-bottom: 12px;">
              ๐ŸŽน Pop2Piano : Piano Cover Song Generator ๐ŸŽน
            </h1>
          </div>
          <p style="margin-bottom: 10px; font-size: 90%">
            A demo for Pop2Piano: Pop Audio-based Piano Cover Generation. <br>
            Select the composer (arranger), upload a pop audio file or enter a YouTube link, and then click Generate.
          </p>
        </div>
        """
    )
    with gr.Group():
        with gr.Row(equal_height=True):
            with gr.Column():
                file_uploaded = gr.Audio(label="Upload audio", type="filepath")
            with gr.Column():
                with gr.Row():
                    yt_link = gr.Textbox(label="Enter a YouTube link.", autofocus=True, lines=3)
                    yt_btn = gr.Button("Download the audio from the YouTube link.", size="lg")
                yt_audio_path = gr.Audio(label="Audio extracted from the YouTube video", interactive=False)
                yt_btn.click(get_audio_from_yt_video, inputs=[yt_link], outputs=[yt_audio_path, file_uploaded])

    with gr.Group():
        with gr.Column():
            composer = gr.Dropdown(label="Select an arranger", choices=composers, value="composer1")
            generate_btn = gr.Button("Create my own piano cover ๐ŸŽน ๐ŸŽต")
        with gr.Row(equal_height=True):  # .style() is gone in recent Gradio releases; pass equal_height directly
            wav_output2 = gr.File(label="Download my piano cover (.wav)")
            wav_output1 = gr.Audio(label="Listen to my piano cover")
            midi_output = gr.File(label="Download the generated MIDI file (.mid)")
        generate_btn.click(
            inference, inputs=[file_uploaded, composer], outputs=[wav_output1, wav_output2, midi_output]
        )
    with gr.Group():
        gr.HTML(
            """
            <div> <h3> <center> Blend the original audio with the MIDI into a stereo mix. </h3> </div>
            """
        )
        pop_scale = gr.Slider(
            0, 1, value=0.5, label="Adjust the balance between the original track and the MIDI.", info="1.0 = original track, 0.0 = .mid", interactive=True
        )  # no trailing comma here: the slider component itself is passed to the click event below
        stereo_btn = gr.Button("Get Stereo Mix")
        with gr.Row():
            stereo_mix1 = gr.Audio(label="Listen to the stereo mix")
            stereo_mix2 = gr.File(label="Download the stereo mix")
        stereo_btn.click(get_stereo, inputs=[file_uploaded, wav_output2, pop_scale], outputs=[stereo_mix1, stereo_mix2])
    gr.HTML(
        """
        <div class="footer">
            <center>The design for this Space is taken from DataBassist
        </div>
        """
    )
    gr.HTML(
        """
        <div class="footer">
            <center><p><a href="http://sweetcocoa.github.io/pop2piano_samples" style="text-decoration: underline;" target="_blank">Project Page</a>
            <center><a href="https://huggingface.co/docs/transformers/main/model_doc/pop2piano" style="text-decoration: underline;" target="_blank">HuggingFace Model Docs</a>
            <center><a href="https://github.com/sweetcocoa/pop2piano" style="text-decoration: underline;" target="_blank">Github</a>
            </p>
        </div>
        """
    )

block.launch(debug=False)  # start the Gradio app