Spaces:

jhtonyKoo
/

music_mixing_style_transfer

Running on T4

App Files Files Community

music_mixing_style_transfer / app.py

jhtonyKoo

Update app.py

36393fb verified 5 months ago

raw

history blame contribute delete

7.39 kB

	import os
	import binascii
	import warnings

	import json
	import argparse
	import copy

	import numpy as np
	import matplotlib.pyplot as plt
	import torch
	import tqdm
	import librosa
	import soundfile as sf
	import gradio as gr
	import pytube as pt

	from pytube.exceptions import VideoUnavailable

	from inference.style_transfer import *


	# Scratch directory shared by the whole app: downloaded clips and the
	# input/reference WAV files written before inference all live here.
	yt_video_dir = "./yt_dir/0"
	os.makedirs(yt_video_dir, exist_ok=True)


	def _fetch_trimmed_yt_audio(yt_link, filename, start_point_in_second, duration_in_second):
	    """Download the first audio-only stream of a YouTube video and trim it.

	    Args:
	        yt_link: URL of the YouTube video.
	        filename: File name to store the audio under, inside ``yt_video_dir``.
	        start_point_in_second: Start offset forwarded to ``trim_audio``.
	        duration_in_second: Clip length forwarded to ``trim_audio``.

	    Returns:
	        Path of the downloaded and trimmed file, or ``None`` when the video
	        is unavailable or exposes no audio-only stream.
	    """
	    target_path = os.path.join(yt_video_dir, filename)
	    try:
	        audio_streams = pt.YouTube(yt_link).streams.filter(only_audio=True)
	        first_stream = audio_streams.first()
	        if first_stream is None:
	            warnings.warn(f"No audio stream found at {yt_link}")
	            return None
	        # NOTE(review): the stream is saved under the given name without any
	        # transcoding here; presumably trim_audio rewrites it as real WAV
	        # data - confirm.
	        first_stream.download(filename=target_path)
	    except VideoUnavailable as e:
	        warnings.warn(f"Video Not Found at {yt_link} ({e})")
	        # Bail out here: there is no file to trim, and handing None to
	        # trim_audio would only fail further down.
	        return None

	    # trim audio length - due to computation time on HuggingFace environment
	    trim_audio(
	        target_file_path=target_path,
	        start_point_in_second=start_point_in_second,
	        duration_in_second=duration_in_second,
	    )
	    return target_path


	def get_audio_from_yt_video_input(yt_link: str, start_point_in_second=0, duration_in_second=30):
	    """Fetch the input track from YouTube as ``input.wav``.

	    Args:
	        yt_link: URL of the YouTube video.
	        start_point_in_second: Where the kept excerpt starts.
	        duration_in_second: Length of the kept excerpt.

	    Returns:
	        The file path twice (one value per Gradio output component), or
	        ``(None, None)`` when nothing could be downloaded.
	    """
	    filename_in = _fetch_trimmed_yt_audio(
	        yt_link, "input.wav", start_point_in_second, duration_in_second
	    )
	    return filename_in, filename_in


	def get_audio_from_yt_video_ref(yt_link: str, start_point_in_second=0, duration_in_second=30):
	    """Fetch the reference track from YouTube as ``reference.wav``.

	    Args:
	        yt_link: URL of the YouTube video.
	        start_point_in_second: Where the kept excerpt starts.
	        duration_in_second: Length of the kept excerpt.

	    Returns:
	        The file path twice (one value per Gradio output component), or
	        ``(None, None)`` when nothing could be downloaded.
	    """
	    filename_ref = _fetch_trimmed_yt_audio(
	        yt_link, "reference.wav", start_point_in_second, duration_in_second
	    )
	    return filename_ref, filename_ref

	def inference(file_uploaded_in, file_uploaded_ref):
	    """Run mixing style transfer from the reference track onto the input track.

	    Both tracks are written to ``yt_video_dir`` as WAV files, which are then
	    passed by path to ``Mixing_Style_Transfer_Inference``.

	    Args:
	        file_uploaded_in: ``(sample_rate, samples)`` tuple for the input mix.
	        file_uploaded_ref: ``(sample_rate, samples)`` tuple for the reference mix.

	    Returns:
	        ``(44100, samples)`` tuple holding the transposed output mix.

	    Raises:
	        gr.Error: If either track is missing.
	    """
	    # Local import keeps this block self-contained; the module-level imports
	    # of the file do not include shutil.
	    import shutil

	    # An empty audio widget arrives as None; report that to the user instead
	    # of failing on tuple unpacking below.
	    if file_uploaded_in is None or file_uploaded_ref is None:
	        raise gr.Error("Please provide both an input track and a reference track.")

	    # clear out previously separated results (a missing directory is fine)
	    shutil.rmtree(os.path.join(yt_video_dir, "separated"), ignore_errors=True)

	    input_path = f"{yt_video_dir}/input.wav"
	    reference_path = f"{yt_video_dir}/reference.wav"
	    uploads = ((input_path, file_uploaded_in), (reference_path, file_uploaded_ref))
	    for wav_path, (sample_rate, data) in uploads:
	        sf.write(wav_path, data, sample_rate)

	    # Perform music mixing style transfer
	    args = set_up()
	    inference_style_transfer = Mixing_Style_Transfer_Inference(args)
	    _, fin_data_out_mix = inference_style_transfer.inference(input_path, reference_path)

	    # NOTE(review): the output rate is hard-coded; this assumes the model
	    # always renders at 44.1 kHz - confirm against the inference config.
	    return (44100, fin_data_out_mix.transpose())



	# Gradio UI: a title banner, an input-track picker, a reference-track picker,
	# and a button that runs `inference` on the two selected tracks.
	with gr.Blocks() as demo:
	# Title banner.
	# NOTE(review): the markup below opens two <div> elements but closes only
	# one; browsers tolerate it, but confirm whether that is intended.
	gr.HTML(
	"""
	<div style="text-align: center; max-width: 700px; margin: 0 auto;">
	<div
	style="
	display: inline-flex;
	align-items: center;
	gap: 0.8rem;
	font-size: 1.75rem;
	"
	>
	<h1 style="font-weight: 900; margin-bottom: 7px;">
	Music Mixing Style Transfer
	</h1>
	</div>
	"""
	)
	# Links to the paper, project page, source code and supplementary material.
	gr.Markdown(
	"""
	This page is a Hugging Face interactive demo of the paper ["Music Mixing Style Transfer: A Contrastive Learning Approach to Disentangle Audio Effects"](https://huggingface.co/papers/2211.02247) (ICASSP 2023).
	- [project page](https://jhtonykoo.github.io/MixingStyleTransfer/)
	- [GitHub](https://github.com/jhtonyKoo/music_mixing_style_transfer)
	- [supplementary](https://pale-cicada-946.notion.site/Music-Mixing-Style-Transfer-A-Contrastive-Learning-Approach-to-Disentangle-Audio-Effects-Supplemen-e6eccd9a431a4a8fa4fdd5adb2d3f219)
	"""
	)
	with gr.Group():
	with gr.Column():
	with gr.Blocks():
	# Input track: either uploaded directly or fetched from YouTube.
	with gr.Tab("Input Music"):
	file_uploaded_in = gr.Audio(label="Input track (mix) to be mixing style transferred")
	with gr.Tab("YouTube url"):
	with gr.Row():
	yt_link_in = gr.Textbox(
	label="Enter YouTube Link of the Video", autofocus=True, lines=3
	)
	yt_in_start_sec = gr.Number(
	value=0,
	label="starting point of the song (in seconds)"
	)
	yt_in_duration_sec = gr.Number(
	value=30,
	label="duration of the song (in seconds)"
	)
	yt_btn_in = gr.Button("Download Audio from YouTube Link", size="lg")
	yt_audio_path_in = gr.Audio(
	label="Input Audio Extracted from the YouTube Video", interactive=False
	)
	# The downloaded clip fills both the read-only preview player and the
	# upload widget; the upload widget is what `inference` reads from.
	yt_btn_in.click(
	get_audio_from_yt_video_input,
	inputs=[yt_link_in, yt_in_start_sec, yt_in_duration_sec],
	outputs=[yt_audio_path_in, file_uploaded_in],
	)
	with gr.Blocks():
	# Reference track: same two options as the input track.
	with gr.Tab("Reference Music"):
	file_uploaded_ref = gr.Audio(label="Reference track (mix) to copy mixing style")
	with gr.Tab("YouTube url"):
	with gr.Row():
	yt_link_ref = gr.Textbox(
	label="Enter YouTube Link of the Video", autofocus=True, lines=3
	)
	yt_ref_start_sec = gr.Number(
	value=0,
	label="starting point of the song (in seconds)"
	)
	yt_ref_duration_sec = gr.Number(
	value=30,
	label="duration of the song (in seconds)"
	)
	yt_btn_ref = gr.Button("Download Audio from YouTube Link", size="lg")
	yt_audio_path_ref = gr.Audio(
	label="Reference Audio Extracted from the YouTube Video", interactive=False
	)
	# Same wiring as the input side: preview player plus upload widget.
	yt_btn_ref.click(
	get_audio_from_yt_video_ref,
	inputs=[yt_link_ref, yt_ref_start_sec, yt_ref_duration_sec],
	outputs=[yt_audio_path_ref, file_uploaded_ref],
	)

	# Run section: short description, trigger button and output player.
	with gr.Group():
	gr.HTML(
	"""
	<div> <h3> <center> Mixing Style Transfer. Perform stem-wise audio-effects style conversion by first source separating the input mix. The inference computation time takes longer as the input samples' duration. so plz be patient... </h3> </div>
	"""
	)
	with gr.Column():
	inference_btn = gr.Button("Run Mixing Style Transfer")
	with gr.Row():
	output_mix = gr.Audio(label="mixing style transferred music track", type='numpy')
	# `inference` returns a (sample_rate, samples) tuple, which is the form
	# the type='numpy' output player above is configured for.
	inference_btn.click(
	inference,
	inputs=[file_uploaded_in, file_uploaded_ref],
	outputs=[output_mix],
	)



	# Start the web app only when this file is executed as a script.
	if __name__ == "__main__":
	demo.launch(debug=True)