# Music Sorcery - Gradio demo app for music mixing style transfer.

import os
import binascii
import warnings

import json
import argparse
import copy

import numpy as np
import matplotlib.pyplot as plt
import torch
import tqdm
import librosa
import librosa.display
import soundfile as sf
import gradio as gr
import pytube as pt

from pytube.exceptions import VideoUnavailable

from inference.style_transfer import *


# Working directory shared by the download helpers and the inference step.
# Created at import time so later writes into it do not fail.
yt_video_dir = "./yt_dir/0"
os.makedirs(yt_video_dir, exist_ok=True)


def _download_yt_audio(yt_link, filename, start_point_in_second, duration_in_second):
    """Download the first audio-only stream of a YouTube video and trim it.

    Args:
        yt_link: URL of the YouTube video.
        filename: File name to store the download under, inside ``yt_video_dir``.
        start_point_in_second: Forwarded to ``trim_audio``.
        duration_in_second: Forwarded to ``trim_audio``.

    Returns:
        The path of the downloaded file, or ``None`` when the video is
        unavailable (a warning is emitted in that case).
    """
    # NOTE(review): the stream is stored under the given name (e.g. "*.wav")
    # whatever its actual container is - confirm trim_audio copes with that.
    target_path = os.path.join(yt_video_dir, filename)
    try:
        audio_streams = pt.YouTube(yt_link).streams.filter(only_audio=True)
        audio_streams[0].download(filename=target_path)
    except VideoUnavailable as e:
        warnings.warn(f"Video Not Found at {yt_link} ({e})")
        # Nothing was downloaded, so there is nothing to trim; bail out
        # instead of handing a None path to trim_audio.
        return None

    # trim audio length - due to computation time on HuggingFace environment
    trim_audio(
        target_file_path=target_path,
        start_point_in_second=start_point_in_second,
        duration_in_second=duration_in_second,
    )
    return target_path


def get_audio_from_yt_video_input(yt_link: str, start_point_in_second=0, duration_in_second=30):
    """Fetch the *input* track from YouTube.

    Returns the resulting file path twice (one value per Gradio output
    component wired to this callback), or ``(None, None)`` when the video
    is unavailable.
    """
    filename_in = _download_yt_audio(
        yt_link, "input.wav", start_point_in_second, duration_in_second
    )
    return filename_in, filename_in


def get_audio_from_yt_video_ref(yt_link: str, start_point_in_second=0, duration_in_second=30):
    """Fetch the *reference* track from YouTube.

    Returns the resulting file path twice (one value per Gradio output
    component wired to this callback), or ``(None, None)`` when the video
    is unavailable.
    """
    filename_ref = _download_yt_audio(
        yt_link, "reference.wav", start_point_in_second, duration_in_second
    )
    return filename_ref, filename_ref

def inference(file_uploaded_in, file_uploaded_ref):
    """Run mixing style transfer from a reference track onto an input track.

    Args:
        file_uploaded_in: Path of the input track (mix) to be converted.
        file_uploaded_ref: Path of the reference track whose mixing style
            is copied.

    Returns:
        Whatever ``Mixing_Style_Transfer_Inference.inference`` returns
        (named ``output_wav_path`` here).

    Raises:
        gr.Error: If either track is missing.
    """
    # Local import: keeps this function self-contained without touching the
    # file-level import block.
    import shutil

    if not file_uploaded_in or not file_uploaded_ref:
        raise gr.Error("Please provide both an input track and a reference track.")

    # clear out previously separated results
    shutil.rmtree(os.path.join(yt_video_dir, "separated"), ignore_errors=True)

    # change file path name
    # The paths come from user uploads, so copy with shutil rather than
    # interpolating them into a shell command line.
    staged = (
        (file_uploaded_in, "input.wav"),
        (file_uploaded_ref, "reference.wav"),
    )
    for source_path, staged_name in staged:
        try:
            shutil.copyfile(source_path, os.path.join(yt_video_dir, staged_name))
        except shutil.SameFileError:
            # The track already lives at the staged location; nothing to copy.
            pass

    # Perform music mixing style transfer
    args = set_up()

    inference_style_transfer = Mixing_Style_Transfer_Inference(args)
    output_wav_path = inference_style_transfer.inference(file_uploaded_in, file_uploaded_ref)

    return output_wav_path


# Gradio UI: two source pickers (input / reference), each with an upload tab
# and a YouTube tab, followed by the button that runs the style transfer.
with gr.Blocks() as demo:
    # Page header (both <div> elements are closed explicitly).
    gr.HTML(
        """
            <div style="text-align: center; max-width: 700px; margin: 0 auto;">
              <div
                style="
                  display: inline-flex;
                  align-items: center;
                  gap: 0.8rem;
                  font-size: 1.75rem;
                "
              >
                <h1 style="font-weight: 900; margin-bottom: 7px;">
                  Music Sorcery
                </h1>
              </div>
            </div>
        """
    )
    gr.Markdown(
        """
        This App Elevate your tracks effortlessly by transforming their mixing style to mirror your favorite songs. Unleash studio sorcery and let your music shine with unparalleled creativity and enhanced sound quality.
        """
    )
    with gr.Group():
        with gr.Column():
            # --- Input track -------------------------------------------------
            with gr.Blocks():
                with gr.Tab("Input Music"):
                    file_uploaded_in = gr.Audio(label="Input track (mix) to be mixing style transferred", type="filepath")
                with gr.Tab("YouTube url"):
                    with gr.Row():
                        yt_link_in = gr.Textbox(
                            label="Enter YouTube Link of the Video", autofocus=True, lines=3
                        )
                        yt_in_start_sec = gr.Number(
                            value=0,
                            label="starting point of the song (in seconds)"
                        )
                        yt_in_duration_sec = gr.Number(
                            value=30,
                            label="duration of the song (in seconds)"
                        )
                        yt_btn_in = gr.Button("Download Audio from YouTube Link", size="lg")
                    yt_audio_path_in = gr.Audio(
                        label="Input Audio Extracted from the YouTube Video", interactive=False
                    )
                    # The callback result feeds both the preview player and the
                    # upload component, which is the one passed to inference.
                    yt_btn_in.click(
                        get_audio_from_yt_video_input,
                        inputs=[yt_link_in, yt_in_start_sec, yt_in_duration_sec],
                        outputs=[yt_audio_path_in, file_uploaded_in],
                    )
            # --- Reference track ---------------------------------------------
            with gr.Blocks():
                with gr.Tab("Reference Music"):
                    file_uploaded_ref = gr.Audio(label="Reference track (mix) to copy mixing style", type="filepath")
                with gr.Tab("YouTube url"):
                    with gr.Row():
                        yt_link_ref = gr.Textbox(
                            label="Enter YouTube Link of the Video", autofocus=True, lines=3
                        )
                        yt_ref_start_sec = gr.Number(
                            value=0,
                            label="starting point of the song (in seconds)"
                        )
                        yt_ref_duration_sec = gr.Number(
                            value=30,
                            label="duration of the song (in seconds)"
                        )
                        yt_btn_ref = gr.Button("Download Audio from YouTube Link", size="lg")
                    yt_audio_path_ref = gr.Audio(
                        label="Reference Audio Extracted from the YouTube Video", interactive=False
                    )
                    # Same wiring as the input side, targeting the reference
                    # components.
                    yt_btn_ref.click(
                        get_audio_from_yt_video_ref,
                        inputs=[yt_link_ref, yt_ref_start_sec, yt_ref_duration_sec],
                        outputs=[yt_audio_path_ref, file_uploaded_ref],
                    )

    # --- Run style transfer ----------------------------------------------------
    with gr.Group():
        gr.HTML(
            """
            <div> <h3> <center> Mixing Style Transfer. Perform stem-wise audio-effects style conversion by first source separating the input mix. The inference computation time takes longer as the input samples' duration. so plz be patient...  </center> </h3> </div>
            """
        )
        with gr.Column():
            inference_btn = gr.Button("Run Mixing Style Transfer")
        with gr.Row():
            output_mix = gr.Audio(label="mixing style transferred music track")
            inference_btn.click(
                inference,
                inputs=[file_uploaded_in, file_uploaded_ref],
                outputs=[output_mix],
            )


# Start the Gradio app only when this file is executed as a script
# (not when it is imported); launch is called with debug=True.
if __name__ == "__main__":
    demo.launch(debug=True)