# Hugging Face Spaces demo app (hosted on a T4 GPU instance).
import argparse
import binascii
import copy
import json
import os
import shutil
import warnings

import gradio as gr
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pytube as pt
import soundfile as sf
import torch
import tqdm
from pytube.exceptions import VideoUnavailable

from inference.style_transfer import *
# Working directory for all downloaded / intermediate audio files.
yt_video_dir = "./yt_dir/0"
os.makedirs(yt_video_dir, exist_ok=True)
def get_audio_from_yt_video_input(yt_link: str, start_point_in_second=0, duration_in_second=30):
    """Download the audio track of a YouTube video to serve as the input mix.

    Args:
        yt_link: URL of the YouTube video to download.
        start_point_in_second: offset (seconds) where the trimmed clip starts.
        duration_in_second: length (seconds) of the trimmed clip.

    Returns:
        A (path, path) tuple — the same file path twice, so Gradio can update
        both the preview player and the uploaded-file component; (None, None)
        when the video could not be downloaded.
    """
    try:
        yt = pt.YouTube(yt_link)
        t = yt.streams.filter(only_audio=True)
        filename_in = os.path.join(yt_video_dir, "input.wav")
        t[0].download(filename=filename_in)
    except VideoUnavailable as e:
        warnings.warn(f"Video Not Found at {yt_link} ({e})")
        # Bug fix: the original fell through and called trim_audio with
        # target_file_path=None, crashing instead of reporting the failure.
        return None, None
    # trim audio length - due to computation time on HuggingFace environment
    trim_audio(target_file_path=filename_in, start_point_in_second=start_point_in_second, duration_in_second=duration_in_second)
    return filename_in, filename_in
def get_audio_from_yt_video_ref(yt_link: str, start_point_in_second=0, duration_in_second=30):
    """Download the audio track of a YouTube video to serve as the style reference.

    Args:
        yt_link: URL of the YouTube video to download.
        start_point_in_second: offset (seconds) where the trimmed clip starts.
        duration_in_second: length (seconds) of the trimmed clip.

    Returns:
        A (path, path) tuple — the same file path twice, so Gradio can update
        both the preview player and the uploaded-file component; (None, None)
        when the video could not be downloaded.
    """
    try:
        yt = pt.YouTube(yt_link)
        t = yt.streams.filter(only_audio=True)
        filename_ref = os.path.join(yt_video_dir, "reference.wav")
        t[0].download(filename=filename_ref)
    except VideoUnavailable as e:
        warnings.warn(f"Video Not Found at {yt_link} ({e})")
        # Bug fix: the original fell through and called trim_audio with
        # target_file_path=None, crashing instead of reporting the failure.
        return None, None
    # trim audio length - due to computation time on HuggingFace environment
    trim_audio(target_file_path=filename_ref, start_point_in_second=start_point_in_second, duration_in_second=duration_in_second)
    return filename_ref, filename_ref
def inference(file_uploaded_in, file_uploaded_ref):
    """Perform music mixing style transfer on the uploaded audio files.

    Args:
        file_uploaded_in: path to the input mix to be restyled.
        file_uploaded_ref: path to the reference mix whose style is copied.

    Returns:
        Path to the mixing-style-transferred output wav, as produced by
        Mixing_Style_Transfer_Inference.inference.
    """
    # Clear out previously separated results. ignore_errors preserves the
    # best-effort behavior of the former `os.system("rm -r ...")`.
    shutil.rmtree(os.path.join(yt_video_dir, "separated"), ignore_errors=True)
    # Move uploads to canonical names. shutil.move is safe with paths that
    # contain spaces or shell metacharacters, unlike the former unquoted
    # os.system("mv ...") calls; the existence checks keep the old
    # "mv prints an error and we carry on" semantics.
    if file_uploaded_in and os.path.exists(file_uploaded_in):
        shutil.move(file_uploaded_in, os.path.join(yt_video_dir, "input.wav"))
    if file_uploaded_ref and os.path.exists(file_uploaded_ref):
        shutil.move(file_uploaded_ref, os.path.join(yt_video_dir, "reference.wav"))
    # Perform music mixing style transfer
    args = set_up()
    inference_style_transfer = Mixing_Style_Transfer_Inference(args)
    # NOTE(review): the pre-move paths are passed here, exactly as in the
    # original code — presumably the model reads from yt_video_dir rather
    # than these arguments; confirm against inference.style_transfer.
    output_wav_path = inference_style_transfer.inference(file_uploaded_in, file_uploaded_ref)
    return output_wav_path
# ---------------------------------------------------------------------------
# Gradio UI: two source panels (upload / YouTube) for the input and reference
# tracks, plus a button that runs the mixing style transfer.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    # Page title.
    gr.HTML(
        """
        <div style="text-align: center; max-width: 700px; margin: 0 auto;">
        <div
        style="
        display: inline-flex;
        align-items: center;
        gap: 0.8rem;
        font-size: 1.75rem;
        "
        >
        <h1 style="font-weight: 900; margin-bottom: 7px;">
        Music Mixing Style Transfer
        </h1>
        </div>
        """
    )
    # Paper / project links.
    gr.Markdown(
        """
        This page is a Hugging Face interactive demo of the paper ["Music Mixing Style Transfer: A Contrastive Learning Approach to Disentangle Audio Effects"](https://huggingface.co/papers/2211.02247) (ICASSP 2023).
        - [project page](https://jhtonykoo.github.io/MixingStyleTransfer/)
        - [GitHub](https://github.com/jhtonyKoo/music_mixing_style_transfer)
        - [supplementary](https://pale-cicada-946.notion.site/Music-Mixing-Style-Transfer-A-Contrastive-Learning-Approach-to-Disentangle-Audio-Effects-Supplemen-e6eccd9a431a4a8fa4fdd5adb2d3f219)
        """
    )

    with gr.Group():
        with gr.Column():
            # --- Input track: direct upload or YouTube download. ---
            with gr.Blocks():
                with gr.Tab("Input Music"):
                    file_uploaded_in = gr.Audio(label="Input track (mix) to be mixing style transferred", type="filepath")
                with gr.Tab("YouTube url"):
                    with gr.Row():
                        yt_link_in = gr.Textbox(
                            label="Enter YouTube Link of the Video", autofocus=True, lines=3
                        )
                        yt_in_start_sec = gr.Number(
                            value=0,
                            label="starting point of the song (in seconds)"
                        )
                        yt_in_duration_sec = gr.Number(
                            value=30,
                            label="duration of the song (in seconds)"
                        )
                    yt_btn_in = gr.Button("Download Audio from YouTube Link", size="lg")
                    yt_audio_path_in = gr.Audio(
                        label="Input Audio Extracted from the YouTube Video", interactive=False
                    )
                    # Downloading also populates the upload widget so the
                    # inference button can read from a single component.
                    yt_btn_in.click(
                        get_audio_from_yt_video_input,
                        inputs=[yt_link_in, yt_in_start_sec, yt_in_duration_sec],
                        outputs=[yt_audio_path_in, file_uploaded_in],
                    )

            # --- Reference track: direct upload or YouTube download. ---
            with gr.Blocks():
                with gr.Tab("Reference Music"):
                    file_uploaded_ref = gr.Audio(label="Reference track (mix) to copy mixing style", type="filepath")
                with gr.Tab("YouTube url"):
                    with gr.Row():
                        yt_link_ref = gr.Textbox(
                            label="Enter YouTube Link of the Video", autofocus=True, lines=3
                        )
                        yt_ref_start_sec = gr.Number(
                            value=0,
                            label="starting point of the song (in seconds)"
                        )
                        yt_ref_duration_sec = gr.Number(
                            value=30,
                            label="duration of the song (in seconds)"
                        )
                    yt_btn_ref = gr.Button("Download Audio from YouTube Link", size="lg")
                    yt_audio_path_ref = gr.Audio(
                        label="Reference Audio Extracted from the YouTube Video", interactive=False
                    )
                    yt_btn_ref.click(
                        get_audio_from_yt_video_ref,
                        inputs=[yt_link_ref, yt_ref_start_sec, yt_ref_duration_sec],
                        outputs=[yt_audio_path_ref, file_uploaded_ref],
                    )

    # --- Inference panel. ---
    with gr.Group():
        gr.HTML(
            """
            <div> <h3> <center> Mixing Style Transfer. Perform stem-wise audio-effects style conversion by first source separating the input mix. The inference computation time takes longer as the input samples' duration. so plz be patient... </h3> </div>
            """
        )
        with gr.Column():
            inference_btn = gr.Button("Run Mixing Style Transfer")
            with gr.Row():
                output_mix = gr.Audio(label="mixing style transferred music track")
            inference_btn.click(
                inference,
                inputs=[file_uploaded_in, file_uploaded_ref],
                outputs=[output_mix],
            )

if __name__ == "__main__":
    demo.launch(debug=True)