Spaces:

jhtonyKoo
/

ITO-Master

Running

App Files Files Community

jhtonyKoo committed on Feb 10

Commit

55e527f

verified ·

1 Parent(s): 45ea80f

Update app.py

Browse files

Files changed (1) hide show

app.py +310 -148

app.py CHANGED Viewed

@@ -1,172 +1,334 @@
-import os
-import binascii
-import warnings
-import json
-import argparse
-import copy
-import numpy as np
-import matplotlib.pyplot as plt
 import torch
-import tqdm
-import librosa
 import soundfile as sf
-import gradio as gr
-import pytube as pt
-from pytube.exceptions import VideoUnavailable
-from inference.style_transfer import *
-yt_video_dir = f"./yt_dir/0"
-os.makedirs(yt_video_dir, exist_ok=True)
-def get_audio_from_yt_video_input(yt_link: str, start_point_in_second=0, duration_in_second=30):
     try:
-        yt = pt.YouTube(yt_link)
-        t = yt.streams.filter(only_audio=True)
-        filename_in = os.path.join(yt_video_dir, "input.wav")
-        t[0].download(filename=filename_in)
-    except VideoUnavailable as e:
-        warnings.warn(f"Video Not Found at {yt_link} ({e})")
-        filename_in = None
-    # trim audio length - due to computation time on HuggingFace environment
-    trim_audio(target_file_path=filename_in, start_point_in_second=start_point_in_second, duration_in_second=duration_in_second)
-    return filename_in, filename_in
-def get_audio_from_yt_video_ref(yt_link: str, start_point_in_second=0, duration_in_second=30):
-    try:
-        yt = pt.YouTube(yt_link)
-        t = yt.streams.filter(only_audio=True)
-        filename_ref = os.path.join(yt_video_dir, "reference.wav")
-        t[0].download(filename=filename_ref)
-    except VideoUnavailable as e:
-        warnings.warn(f"Video Not Found at {yt_link} ({e})")
-        filename_ref = None
-    # trim audio length - due to computation time on HuggingFace environment
-    trim_audio(target_file_path=filename_ref, start_point_in_second=start_point_in_second, duration_in_second=duration_in_second)
-    return filename_ref, filename_ref
-def inference(file_uploaded_in, file_uploaded_ref):
-    # clear out previously separated results
-    os.system(f"rm -r {yt_video_dir}/separated")
-    # change file path name
-    os.system(f"cp {file_uploaded_in} {yt_video_dir}/input.wav")
-    os.system(f"cp {file_uploaded_ref} {yt_video_dir}/reference.wav")
-    # Perform music mixing style transfer
-    args = set_up()
-    inference_style_transfer = Mixing_Style_Transfer_Inference(args)
-    output_wav_path, fin_data_out_mix = inference_style_transfer.inference(file_uploaded_in, file_uploaded_ref)
-    return (44100, fin_data_out_mix)
 with gr.Blocks() as demo:
-    gr.HTML(
-        """
-            <div style="text-align: center; max-width: 700px; margin: 0 auto;">
-              <div
-                style="
-                  display: inline-flex;
-                  align-items: center;
-                  gap: 0.8rem;
-                  font-size: 1.75rem;
-                "
-              >
-                <h1 style="font-weight: 900; margin-bottom: 7px;">
-                  Music Mixing Style Transfer
-                </h1>
-              </div>
-        """
     )
-    gr.Markdown(
-        """
-        This page is a Hugging Face interactive demo of the paper ["Music Mixing Style Transfer: A Contrastive Learning Approach to Disentangle Audio Effects"](https://huggingface.co/papers/2211.02247) (ICASSP 2023).
-            - [project page](https://jhtonykoo.github.io/MixingStyleTransfer/)
-            - [GitHub](https://github.com/jhtonyKoo/music_mixing_style_transfer)
-            - [supplementary](https://pale-cicada-946.notion.site/Music-Mixing-Style-Transfer-A-Contrastive-Learning-Approach-to-Disentangle-Audio-Effects-Supplemen-e6eccd9a431a4a8fa4fdd5adb2d3f219)
-        """
     )
-    with gr.Group():
         with gr.Column():
-            with gr.Blocks():
-                with gr.Tab("Input Music"):
-                    file_uploaded_in = gr.Audio(label="Input track (mix) to be mixing style transferred", type='filepath')
-                with gr.Tab("YouTube url"):
-                    with gr.Row():
-                        yt_link_in = gr.Textbox(
-                            label="Enter YouTube Link of the Video", autofocus=True, lines=3
-                        )
-                        yt_in_start_sec = gr.Number(
-                            value=0,
-                            label="starting point of the song (in seconds)"
-                        )
-                        yt_in_duration_sec = gr.Number(
-                            value=30,
-                            label="duration of the song (in seconds)"
-                        )
-                        yt_btn_in = gr.Button("Download Audio from YouTube Link", size="lg")
-                    yt_audio_path_in = gr.Audio(
-                        label="Input Audio Extracted from the YouTube Video", interactive=False
-                    )
-                    yt_btn_in.click(
-                        get_audio_from_yt_video_input,
-                        inputs=[yt_link_in, yt_in_start_sec, yt_in_duration_sec],
-                        outputs=[yt_audio_path_in, file_uploaded_in],
-                    )
-            with gr.Blocks():
-                with gr.Tab("Reference Music"):
-                    file_uploaded_ref = gr.Audio(label="Reference track (mix) to copy mixing style", type='filepath')
-                with gr.Tab("YouTube url"):
-                    with gr.Row():
-                        yt_link_ref = gr.Textbox(
-                            label="Enter YouTube Link of the Video", autofocus=True, lines=3
-                        )
-                        yt_ref_start_sec = gr.Number(
-                            value=0,
-                            label="starting point of the song (in seconds)"
-                        )
-                        yt_ref_duration_sec = gr.Number(
-                            value=30,
-                            label="duration of the song (in seconds)"
-                        )
-                        yt_btn_ref = gr.Button("Download Audio from YouTube Link", size="lg")
-                    yt_audio_path_ref = gr.Audio(
-                        label="Reference Audio Extracted from the YouTube Video", interactive=False
-                    )
-                    yt_btn_ref.click(
-                        get_audio_from_yt_video_ref,
-                        inputs=[yt_link_ref, yt_ref_start_sec, yt_ref_duration_sec],
-                        outputs=[yt_audio_path_ref, file_uploaded_ref],
-                    )
-    with gr.Group():
-        gr.HTML(
-            """
-            <div> <h3> <center> Mixing Style Transfer. Perform stem-wise audio-effects style conversion by first source separating the input mix. The inference computation time takes longer as the input samples' duration. so plz be patient...  </h3> </div>
-            """
-        )
         with gr.Column():
-            inference_btn = gr.Button("Run Mixing Style Transfer")
-        with gr.Row():
-            output_mix = gr.Audio(label="mixing style transferred music track", type='numpy')
-            inference_btn.click(
-                inference,
-                inputs=[file_uploaded_in, file_uploaded_ref],
-                outputs=[output_mix],
             )
-if __name__ == "__main__":
-    demo.launch(debug=True)

+import gradio as gr
 import torch
 import soundfile as sf
+import numpy as np
+import yaml
+from inference import MasteringStyleTransfer
+from utils import download_youtube_audio
+from config import args
+import pyloudnorm as pyln
+import tempfile
+import os
+import pandas as pd
+# Module-level MasteringStyleTransfer instance, built once from the imported
+# config `args` and shared by every handler defined below.
+mastering_transfer = MasteringStyleTransfer(args)
+def denormalize_audio(audio, dtype=np.int16):
+    """
+    Convert audio expressed in [-1, 1] to the requested sample format.
+
+    For np.int16 the signal is clipped to [-1, 1] and scaled by 32767;
+    for np.float32 it is only cast. Any other dtype raises ValueError.
+    """
+    if dtype == np.float32:
+        return audio.astype(np.float32)
+    if dtype == np.int16:
+        # Clip first so out-of-range samples saturate instead of wrapping
+        bounded = np.clip(audio, -1, 1)
+        scaled = bounded * 32767
+        return scaled.astype(np.int16)
+    raise ValueError("Unsupported dtype. Use np.int16 or np.float32.")
+def loudness_normalize(audio, sample_rate, target_loudness=-12.0):
+    """
+    Normalize audio to a target integrated loudness using pyloudnorm.
+
+    Args:
+        audio: numpy array of samples; 1-D input is reshaped to (samples, 1).
+        sample_rate: sample rate passed to the pyloudnorm meter.
+        target_loudness: target loudness passed to pyloudnorm (default -12.0).
+
+    Returns:
+        The loudness-normalized float32 audio. If the measured loudness is
+        not finite, the float32 audio is returned without any gain applied.
+    """
+    # Ensure audio is float32
+    if audio.dtype != np.float32:
+        audio = audio.astype(np.float32)
+    # If audio is mono, reshape to (samples, 1)
+    if audio.ndim == 1:
+        audio = audio.reshape(-1, 1)
+    meter = pyln.Meter(sample_rate)  # create BS.1770 meter
+    loudness = meter.integrated_loudness(audio)
+    # Silent (or fully gated) input can measure as -inf; normalizing from a
+    # non-finite loudness would apply an infinite gain and fill the output
+    # with NaN/inf, so leave such audio untouched.
+    if not np.isfinite(loudness):
+        return audio
+    return pyln.normalize.loudness(audio, loudness, target_loudness)
+def process_youtube_url(url):
+    """
+    Download audio for a YouTube URL via download_youtube_audio.
+
+    Returns ((sr, audio), None) on success, or (None, error message string)
+    if any exception is raised while downloading.
+    """
     try:
+        audio, sr = download_youtube_audio(url)
+        return (sr, audio), None
+    except Exception as e:
+        # Errors are reported as text so the caller can show them in the UI
+        return None, f"Error processing YouTube URL: {str(e)}"
+def download_youtube_audios(input_youtube_url, reference_youtube_url):
+    """
+    Fetch the input and reference clips for the given YouTube URLs.
+
+    A falsy URL is skipped and yields (None, None) for that side.
+    Returns (input_audio, reference_audio, input_error, reference_error).
+    """
+    input_audio, input_error = None, None
+    if input_youtube_url:
+        input_audio, input_error = process_youtube_url(input_youtube_url)
+    reference_audio, reference_error = None, None
+    if reference_youtube_url:
+        reference_audio, reference_error = process_youtube_url(reference_youtube_url)
+    return input_audio, reference_audio, input_error, reference_error
+def process_audio_with_youtube(input_audio, input_youtube_url, reference_audio, reference_youtube_url):
+    """
+    Run mastering style transfer, optionally sourcing audio from YouTube.
+
+    A non-empty URL replaces the corresponding uploaded audio. On a download
+    error, or when either audio is still missing, returns
+    (None, None, error message); otherwise returns process_audio's result.
+    """
+    if input_youtube_url:
+        input_audio, input_error = process_youtube_url(input_youtube_url)
+        if input_error:
+            return None, None, input_error
+    if reference_youtube_url:
+        reference_audio, reference_error = process_youtube_url(reference_youtube_url)
+        if reference_error:
+            return None, None, reference_error
+    have_both = input_audio is not None and reference_audio is not None
+    if have_both:
+        return process_audio(input_audio, reference_audio)
+    return None, None, "Both input and reference audio are required."
+def to_numpy_audio(audio):
+    """
+    Convert audio to a 2-D numpy array laid out as (samples, channels).
+
+    Args:
+        audio: a torch.Tensor or numpy array.
+
+    Returns:
+        A numpy array. Input with fewer than two dimensions becomes a single
+        column; input with more than two dimensions is squeezed first; a 2-D
+        array with more columns than rows is transposed.
+    """
+    # Convert to a numpy array if it's a tensor. detach() is needed because
+    # .numpy() raises on tensors that still require grad.
+    if isinstance(audio, torch.Tensor):
+        audio = audio.detach().cpu().numpy()
+    # Drop singleton axes from higher-rank input
+    if audio.ndim > 2:
+        audio = audio.squeeze()
+    # Mono input, or input squeezed down to one axis, becomes a column
+    if audio.ndim < 2:
+        audio = audio.reshape(-1, 1)
+    # Ensure the audio is in the correct shape (samples, channels); assumes
+    # there are more samples than channels
+    if audio.shape[1] > audio.shape[0]:
+        audio = audio.transpose(1, 0)
+    return audio
+def process_audio(input_audio, reference_audio):
+    """
+    Run mastering style transfer and prepare the results for display.
+
+    Returns ((sr, int16 output audio), parameter string,
+    (sr, normalized input audio)).
+    """
+    transfer_result = mastering_transfer.process_audio(
+        input_audio, reference_audio
+    )
+    raw_output, predicted_params, sr, raw_input = transfer_result
+    param_output = mastering_transfer.get_param_output_string(predicted_params)
+    # Bring both signals into numpy form
+    output_np = to_numpy_audio(raw_output)
+    source_np = to_numpy_audio(raw_input)
+    # Loudness-normalize the output, then convert it to int16 samples
+    output_np = denormalize_audio(loudness_normalize(output_np, sr), dtype=np.int16)
+    return (sr, output_np), param_output, (sr, source_np)
+def perform_ito(input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights, loss_function, clap_target_type, clap_text_prompt, clap_distance_fn):
+    """
+    Run inference-time optimization (ITO) and return the last step's output.
+
+    When no ITO reference audio is given, the style-transfer reference is
+    used as the optimization target. `af_weights` is a comma-separated string
+    of floats.
+
+    Returns ((sample_rate, int16 audio), parameter string, num_steps,
+    concatenated log, loss DataFrame with "step"/"loss" columns, all results).
+    """
+    # Fall back to the style-transfer reference as the ITO target
+    if ito_reference_audio is None:
+        ito_reference_audio = reference_audio
+    af_weights = [float(token.strip()) for token in af_weights.split(',')]
+    ito_config = dict(
+        optimizer=optimizer,
+        learning_rate=learning_rate,
+        num_steps=num_steps,
+        af_weights=af_weights,
+        sample_rate=args.sample_rate,
+        loss_function=loss_function,
+        clap_target_type=clap_target_type,
+        clap_text_prompt=clap_text_prompt,
+        clap_distance_fn=clap_distance_fn,
+    )
+    input_tensor = mastering_transfer.preprocess_audio(input_audio, args.sample_rate)
+    reference_tensor = mastering_transfer.preprocess_audio(reference_audio, args.sample_rate)
+    ito_reference_tensor = mastering_transfer.preprocess_audio(ito_reference_audio, args.sample_rate)
+    # The optimization starts from the embedding of the original reference
+    initial_reference_feature = mastering_transfer.get_reference_embedding(reference_tensor)
+    all_results, min_loss_step = mastering_transfer.inference_time_optimization(
+        input_tensor, ito_reference_tensor, ito_config, initial_reference_feature
+    )
+    # Gather the per-step logs and the loss curve
+    ito_log = "".join(entry['log'] for entry in all_results)
+    loss_values = [{"step": entry['step'], "loss": entry['loss']} for entry in all_results]
+    # Report the results of the last step
+    final_entry = all_results[-1]
+    ito_param_output = mastering_transfer.get_param_output_string(final_entry['params'])
+    # numpy conversion -> loudness normalization -> int16
+    final_audio = to_numpy_audio(final_entry['audio'])
+    final_audio = loudness_normalize(final_audio, args.sample_rate)
+    final_audio = denormalize_audio(final_audio, dtype=np.int16)
+    return (args.sample_rate, final_audio), ito_param_output, num_steps, ito_log, pd.DataFrame(loss_values), all_results
+def update_ito_output(all_results, selected_step):
+    """
+    Return the audio, parameter string and log for one stored ITO step.
+
+    Args:
+        all_results: list of per-step result dicts; each is read for the
+            keys 'audio', 'params' and 'log'.
+        selected_step: 1-based step number chosen on the step slider.
+
+    Returns:
+        ((sample_rate, int16 audio), parameter string, step log), or
+        (None, "", "") when there are no stored results or no step selected.
+    """
+    # The slider can fire before any ITO run has stored results
+    if not all_results or selected_step is None:
+        return None, "", ""
+    # The slider range is fixed and may exceed the number of stored steps,
+    # and its value may not be an int; coerce and clamp to a valid step.
+    step = min(max(int(selected_step), 1), len(all_results))
+    selected_result = all_results[step - 1]
+    current_output = selected_result['audio']
+    ito_param_output = mastering_transfer.get_param_output_string(selected_result['params'])
+    # Convert to numpy audio
+    current_output = to_numpy_audio(current_output)
+    # Loudness normalize output audio
+    current_output = loudness_normalize(current_output, args.sample_rate)
+    # Denormalize the audio to int16
+    current_output = denormalize_audio(current_output, dtype=np.int16)
+    return (args.sample_rate, current_output), ito_param_output, selected_result['log']
+""" APP display """
 with gr.Blocks() as demo:
+    # Page title, then an introduction shown in a row with the pipeline image.
+    gr.Markdown("# ITO-Master: Inference Time Optimization for Mastering Style Transfer")
+    with gr.Row():
+        gr.Markdown("Interactive demo of Inference Time Optimization (ITO) for Music Mastering Style Transfer. \
+                    The mastering style transfer is performed by a differentiable audio processing model, and the predicted parameters are shown as the output. \
+                    Perform mastering style transfer with an input source audio and a reference mastering style audio. On top of this result, you can perform ITO to optimize the reference embedding $z_{ref}$ to further gain control over the output mastering style.")
+        gr.Image("ito_snow.png", width=500, height=300, label="ITO pipeline")
+    gr.Markdown("## Step 1: Mastering Style Transfer")
+    # Tab 1: uploaded source/reference audio; the button runs process_audio.
+    with gr.Tab("Upload Audio"):
+        with gr.Row():
+            input_audio = gr.Audio(label="Source Audio $x_{in}$")
+            reference_audio = gr.Audio(label="Reference Style Audio $x_{ref}$")
+        process_button = gr.Button("Process Mastering Style Transfer")
+        gr.Markdown('<span style="color: lightgray; font-style: italic;">all output samples are normalized to -12dB LUFS</span>')
+        with gr.Row():
+            with gr.Column():
+                output_audio = gr.Audio(label="Output Audio y'", type='numpy')
+                normalized_input = gr.Audio(label="Normalized Source Audio", type='numpy')
+            param_output = gr.Textbox(label="Predicted Parameters", lines=5)
+        process_button.click(
+            process_audio,
+            inputs=[input_audio, reference_audio],
+            outputs=[output_audio, param_output, normalized_input]
+        )
+    # Tab 2: same workflow, with audio optionally downloaded from YouTube URLs.
+    with gr.Tab("YouTube Audio"):
+        gr.Markdown("Seems like it's currently unavailable to download YouTube clips from HuggingFace... But you could try out yourself in your environment with the available source code.")
+        with gr.Row():
+            input_youtube_url = gr.Textbox(label="Input YouTube URL")
+            reference_youtube_url = gr.Textbox(label="Reference YouTube URL")
+        download_button = gr.Button("Download YouTube Audios")
+        # Hidden by default; made visible by the download handler on error.
+        error_message_yt = gr.Textbox(label="Error Message", visible=False)
+        with gr.Row():
+            input_audio_yt = gr.Audio(label="Source Audio (Do not put when using YouTube URL)")
+            reference_audio_yt = gr.Audio(label="Reference Style Audio (Do not put when using YouTube URL)")
+        process_button_yt = gr.Button("Process Mastering Style Transfer")
+        gr.Markdown('<span style="color: lightgray; font-style: italic;">all output samples are normalized to -12dB LUFS</span>')
+        with gr.Row():
+            with gr.Column():
+                output_audio_yt = gr.Audio(label="Output Audio y'", type='numpy')
+                normalized_input_yt = gr.Audio(label="Normalized Source Audio", type='numpy')
+            param_output_yt = gr.Textbox(label="Predicted Parameters", lines=5)
+        def handle_download_youtube_audios(input_youtube_url, reference_youtube_url):
+            """
+            Download both YouTube clips and update the audio/error widgets.
+
+            On any error both audio outputs are cleared and the error box is
+            shown with the first error; otherwise the error box is hidden.
+            """
+            input_audio, reference_audio, input_error, reference_error = download_youtube_audios(input_youtube_url, reference_youtube_url)
+            first_error = input_error or reference_error
+            if not first_error:
+                return input_audio, reference_audio, gr.update(visible=False, value="")
+            return None, None, gr.update(visible=True, value=first_error)
+        # Download button fills the YouTube-tab audio widgets (or shows an error).
+        download_button.click(
+            handle_download_youtube_audios,
+            inputs=[input_youtube_url, reference_youtube_url],
+            outputs=[input_audio_yt, reference_audio_yt, error_message_yt]
+        )
+        # Processing reuses process_audio on the YouTube-tab audio widgets.
+        process_button_yt.click(
+            process_audio,
+            inputs=[input_audio_yt, reference_audio_yt],
+            outputs=[output_audio_yt, param_output_yt, normalized_input_yt]
+        )
+        # Disabled alternative wiring that went through process_audio_with_youtube:
+        # def process_and_handle_errors(input_audio, input_youtube_url, reference_audio, reference_youtube_url):
+        #     result = process_audio_with_youtube(input_audio, input_youtube_url, reference_audio, reference_youtube_url)
+        #     if len(result) == 3 and isinstance(result[2], str):  # Error occurred check
+        #         return None, None, None, gr.update(visible=True, value=result[2])
+        #     return result[0], result[1], result[2], gr.update(visible=False, value="")
+        # process_button_yt.click(
+        #     process_and_handle_errors,
+        #     inputs=[input_audio_yt, input_youtube_url, reference_audio_yt, reference_youtube_url],
+        #     outputs=[output_audio_yt, param_output_yt, normalized_input_yt, error_message_yt]
+        # )
+    gr.Markdown("## Step 2: Inference Time Optimization (ITO)")
+    # ITO controls: optional reference audio plus optimizer and loss settings.
+    with gr.Row():
+        ito_reference_audio = gr.Audio(label="ITO Reference Style Audio $x'_{ref}$ (optional)")
+        with gr.Column():
+            num_steps = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Number of Steps for additional optimization")
+            optimizer = gr.Dropdown(["Adam", "RAdam", "SGD"], value="RAdam", label="Optimizer")
+            learning_rate = gr.Slider(minimum=0.0001, maximum=0.1, value=0.001, step=0.0001, label="Learning Rate")
+            loss_function = gr.Radio(["AudioFeatureLoss", "CLAPFeatureLoss"], label="Loss Function", value="AudioFeatureLoss")
+            # Audio Feature Loss weights (visible by default)
+            with gr.Column(visible=True) as audio_feature_weights:
+                af_weights = gr.Textbox(
+                    label="AudioFeatureLoss Weights (comma-separated)",
+                    value="0.1,0.001,1.0,1.0,0.1",
+                    info="RMS, Crest Factor, Stereo Width, Stereo Imbalance, Bark Spectrum"
+                )
+            # CLAP Loss options (hidden until CLAPFeatureLoss is selected)
+            with gr.Column(visible=False) as clap_options:
+                clap_target_type = gr.Radio(["Audio", "Text"], label="CLAP Target Type", value="Audio")
+                clap_text_prompt = gr.Textbox(label="CLAP Text Prompt", visible=False)
+                clap_distance_fn = gr.Dropdown(["cosine", "mse", "l1"], label="CLAP Distance Function", value="cosine")
+    def update_clap_options(loss_function):
+        """
+        Toggle the loss-specific option groups.
+
+        Returns visibility updates for (audio-feature weights, CLAP options):
+        exactly one is visible, depending on whether "CLAPFeatureLoss" is
+        the selected loss function.
+        """
+        use_clap = loss_function == "CLAPFeatureLoss"
+        return gr.update(visible=not use_clap), gr.update(visible=use_clap)
+    # Switch between the AudioFeatureLoss and CLAP option groups whenever the
+    # selected loss function changes.
+    loss_function.change(
+        update_clap_options,
+        inputs=[loss_function],
+        outputs=[audio_feature_weights, clap_options]
     )
+    def update_clap_text_prompt(clap_target_type):
+        """Show the CLAP text prompt only when the target type is "Text"."""
+        show_prompt = clap_target_type == "Text"
+        return gr.update(visible=show_prompt)
+    # Show or hide the text prompt box when the CLAP target type changes.
+    clap_target_type.change(
+        update_clap_text_prompt,
+        inputs=[clap_target_type],
+        outputs=[clap_text_prompt]
     )
+    ito_button = gr.Button("Perform ITO")
+    gr.Markdown('<span style="color: lightgray; font-style: italic;">all output samples are normalized to -12dB LUFS</span>')
+    # ITO results: audio, step selector and parameters on the left; loss curve
+    # and log on the right.
+    with gr.Row():
         with gr.Column():
+            ito_output_audio = gr.Audio(label="ITO Output Audio")
+            ito_step_slider = gr.Slider(minimum=1, maximum=100, step=1, label="ITO Step", interactive=True)
+            ito_param_output = gr.Textbox(label="ITO Predicted Parameters", lines=15)
         with gr.Column():
+            ito_loss_plot = gr.LinePlot(
+                x="step",
+                y="loss",
+                title="ITO Loss Curve",
+                x_title="Step",
+                y_title="Loss",
+                height=300,
+                width=600,
             )
+            ito_log = gr.Textbox(label="ITO Log", lines=10)
+    # Holds the full list of ITO results returned by perform_ito so that
+    # update_ito_output can look up any step later.
+    all_results = gr.State([])
+    # Run ITO on the Upload-tab normalized input and reference, then refresh
+    # the displayed step from the stored results.
+    ito_button.click(
+        perform_ito,
+        inputs=[normalized_input, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights, loss_function, clap_target_type, clap_text_prompt, clap_distance_fn],
+        outputs=[ito_output_audio, ito_param_output, ito_step_slider, ito_log, ito_loss_plot, all_results]
+    ).then(
+        update_ito_output,
+        inputs=[all_results, ito_step_slider],
+        outputs=[ito_output_audio, ito_param_output, ito_log]
+    )
+    # Moving the step slider shows the output stored for that step.
+    ito_step_slider.change(
+        update_ito_output,
+        inputs=[all_results, ito_step_slider],
+        outputs=[ito_output_audio, ito_param_output, ito_log]
+    )
+# Default launch, kept for reference:
+# demo.launch()
+# Launch with an explicit host and port (0.0.0.0:7860).
+demo.launch(server_name="0.0.0.0", server_port=7860)