pop2piano

Running

App Files Files Community

sweetcocoa committed on Sep 9, 2023

Commit

71a2b8b

1 Parent(s): db4880c

refactor ui

Browse files

Files changed (3) hide show

app.py +54 -91
requirements.txt +2 -2
utils.py +21 -0

app.py CHANGED Viewed

@@ -2,16 +2,17 @@ import os
 import binascii
 import warnings
-import torch
 import librosa
 import numpy as np
-import pytube as pt  # to download the youtube videos as audios
-import gradio as gr
-import soundfile as sf  # to make the stereo mix
 from pytube.exceptions import VideoUnavailable
 from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor
 yt_video_dir = "./yt_dir"
 outputs_dir = "./midi_wav_outputs"
@@ -24,7 +25,7 @@ processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano")
 composers = model.generation_config.composer_to_feature_token.keys()
-def get_audio_from_yt_video(yt_link):
     try:
         yt = pt.YouTube(yt_link)
         t = yt.streams.filter(only_audio=True)
@@ -40,55 +41,43 @@ def get_audio_from_yt_video(yt_link):
 def inference(file_uploaded, composer):
     # to save the native sampling rate of the file, sr=None is used, but this can cause some silent errors where the
     # generated output will not be upto the desired quality. If that happens please consider switching sr to 44100 Hz.
-    waveform, sr = librosa.load(file_uploaded, sr=None)
-    inputs = processor(audio=waveform, sampling_rate=sr, return_tensors="pt").to(device)
     model_output = model.generate(input_features=inputs["input_features"], composer=composer)
     tokenizer_output = processor.batch_decode(
         token_ids=model_output.to("cpu"), feature_extractor_output=inputs.to("cpu")
     )["pretty_midi_objects"]
-    return prepare_output_file(tokenizer_output, sr)
-def prepare_output_file(tokenizer_output, sr:int):
     # Add some random values so that no two file names are same
-    output_file_name = "output_" + binascii.hexlify(os.urandom(8)).decode()
     midi_output = os.path.join(outputs_dir, output_file_name + ".mid")
     # write the .mid and its wav files
     tokenizer_output[0].write(midi_output)
-    midi_wav:np.ndarray = tokenizer_output[0].fluidsynth(sr)
-    wav_output:str = midi_output.replace(".mid", ".wav")
-    sf.write(wav_output, midi_wav, samplerate=sr)
-    return wav_output, wav_output, midi_output
-def get_stereo(pop_path, midi, pop_scale=0.5):
-    pop_y, sr = librosa.load(pop_path, sr=None)
-    midi_y, _ = librosa.load(midi.name, sr=None)
     if len(pop_y) > len(midi_y):
         midi_y = np.pad(midi_y, (0, len(pop_y) - len(midi_y)))
     elif len(pop_y) < len(midi_y):
         pop_y = np.pad(pop_y, (0, -len(pop_y) + len(midi_y)))
-    stereo = np.stack((midi_y, pop_y * pop_scale))
-    stereo_mix_path = pop_path.replace("output", "output_stereo_mix")
-    sf.write(
-        file=stereo_mix_path,
-        data=stereo.T,
-        samplerate=sr,
-        format="wav",
-    )
-    return stereo_mix_path, stereo_mix_path
-# Thanks a lot to "https://huggingface.co/Taithrah" for this theme.
-# taken from https://huggingface.co/spaces/NoCrypt/miku
-block = gr.Blocks(theme="Taithrah/Minimal")
 with block:
     gr.HTML(
@@ -114,67 +103,48 @@ with block:
         """
     )
     with gr.Group():
-        with gr.Row(equal_height=True):
-            with gr.Column():
-                file_uploaded = gr.Audio(label="Upload an audio", type="filepath")
-            with gr.Column():
-                with gr.Row():
-                    yt_link = gr.Textbox(
-                        label="Enter YouTube Link of the Video", autofocus=True, lines=3
                     )
-                    yt_btn = gr.Button("Download Audio from YouTube Link", size="lg")
-                yt_audio_path = gr.Audio(
-                    label="Audio Extracted from the YouTube Video", interactive=False
-                )
-                yt_btn.click(
-                    get_audio_from_yt_video,
-                    inputs=[yt_link],
-                    outputs=[yt_audio_path, file_uploaded],
-                )
     with gr.Group():
-        with gr.Column():
-            composer = gr.Dropdown(label="Arranger", choices=composers, value="composer1")
-            generate_btn = gr.Button("Generate")
         with gr.Row().style(mobile_collapse=False, equal_height=True):
-            wav_output2 = gr.File(label="Download the Generated MIDI (.wav)")
             wav_output1 = gr.Audio(label="Listen to the Generated MIDI")
             midi_output = gr.File(label="Download the Generated MIDI (.mid)")
             generate_btn.click(
                 inference,
                 inputs=[file_uploaded, composer],
-                outputs=[wav_output1, wav_output2, midi_output],
             )
-    with gr.Group():
-        gr.HTML(
-            """
-            <div> <h3> <center> Get the Stereo Mix from the Pop Music and Generated MIDI </h3> </div>
-            """
-        )
-        pop_scale = (
-            gr.Slider(
-                0,
-                1,
-                value=0.5,
-                label="Choose the ratio between Pop and MIDI",
-                info="1.0 = Only Pop, 0.0=Only MIDI",
-                interactive=True,
-            ),
-        )
-        stereo_btn = gr.Button("Get Stereo Mix")
-        with gr.Row():
-            stereo_mix1 = gr.Audio(label="Listen to the Stereo Mix")
-            stereo_mix2 = gr.File(label="Download the Stereo Mix")
-        stereo_btn.click(
-            get_stereo,
-            inputs=[file_uploaded, wav_output2, pop_scale[0]],
-            outputs=[stereo_mix1, stereo_mix2],
-        )
     with gr.Group():
         gr.Examples(
             [
@@ -182,16 +152,9 @@ with block:
             ],
             fn=inference,
             inputs=[file_uploaded, composer],
-            outputs=[wav_output1, wav_output2, midi_output],
             cache_examples=True,
         )
-        gr.HTML(
-            """
-        <div class="footer">
-                    <center>The design for this Space is taken from <a href="https://huggingface.co/spaces/NoCrypt/miku"> NoCrypt/miku </a>
-        </div>
-        """
-        )
         gr.HTML(
             """

 import binascii
 import warnings
+import gradio as gr
 import librosa
 import numpy as np
+import torch
+import pretty_midi
+import pytube as pt
 from pytube.exceptions import VideoUnavailable
 from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor
+from utils import mp3_write, normalize
 yt_video_dir = "./yt_dir"
 outputs_dir = "./midi_wav_outputs"
 composers = model.generation_config.composer_to_feature_token.keys()
+def get_audio_from_yt_video(yt_link: str):
     try:
         yt = pt.YouTube(yt_link)
         t = yt.streams.filter(only_audio=True)
 def inference(file_uploaded, composer):
     # to save the native sampling rate of the file, sr=None is used, but this can cause some silent errors where the
     # generated output will not be upto the desired quality. If that happens please consider switching sr to 44100 Hz.
+    pop_y, sr = librosa.load(file_uploaded, sr=None)
+    inputs = processor(audio=pop_y, sampling_rate=sr, return_tensors="pt").to(device)
     model_output = model.generate(input_features=inputs["input_features"], composer=composer)
     tokenizer_output = processor.batch_decode(
         token_ids=model_output.to("cpu"), feature_extractor_output=inputs.to("cpu")
     )["pretty_midi_objects"]
+    return prepare_output_file(tokenizer_output, sr, pop_y)
def prepare_output_file(
    tokenizer_output: "list[pretty_midi.PrettyMIDI]",
    sr: int,
    pop_y: np.ndarray,
    pop_scale: float = 0.5,
):
    """Write the generated MIDI, its synthesized MP3 and a stereo mix to disk.

    Args:
        tokenizer_output: Decoded PrettyMIDI objects; only the first one is used.
        sr: Sampling rate used for the fluidsynth rendering and both MP3 files.
        pop_y: Waveform of the original audio that goes into the second
            channel of the stereo mix.
        pop_scale: Gain applied to ``pop_y`` before mixing (default 0.5).

    Returns:
        A 5-tuple ``(midi_mp3, midi_mp3, midi_file, mix_mp3, mix_mp3)`` of
        paths. The MP3 paths appear twice because the UI binds each of them
        to both an audio player and a download widget.
    """
    # Add some random values so that no two file names are the same.
    output_file_name = "p2p_" + binascii.hexlify(os.urandom(8)).decode()
    midi_output = os.path.join(outputs_dir, output_file_name + ".mid")
    # splitext strips only the final extension; str.replace(".mid", ...) would
    # also rewrite any ".mid" that happened to occur elsewhere in the path.
    stem, _ = os.path.splitext(midi_output)

    # Write the .mid file and its synthesized audio rendering.
    tokenizer_output[0].write(midi_output)
    midi_y: np.ndarray = tokenizer_output[0].fluidsynth(sr)
    midi_y_path: str = stem + ".mp3"
    mp3_write(midi_y_path, sr, normalize(midi_y), normalized=True)

    # Zero-pad the shorter signal so both channels have the same length.
    target_len = max(len(pop_y), len(midi_y))
    midi_y = np.pad(midi_y, (0, target_len - len(midi_y)))
    pop_y = np.pad(pop_y, (0, target_len - len(pop_y)))
    stereo = np.stack((midi_y, pop_y * pop_scale))

    # mp3_write detects stereo from an (n_samples, 2) array, hence the transpose.
    stereo_path = stem + ".mix.mp3"
    mp3_write(stereo_path, sr, normalize(stereo.T), normalized=True)
    return midi_y_path, midi_y_path, midi_output, stereo_path, stereo_path
+block = gr.Blocks()
 with block:
     gr.HTML(
         """
     )
     with gr.Group():
+        with gr.Column():
+            with gr.Blocks() as audio_select:
+                with gr.Tab("Upload Audio"):
+                    file_uploaded = gr.Audio(label="Upload an audio", type="filepath")
+                with gr.Tab("YouTube url"):
+                    with gr.Row():
+                        yt_link = gr.Textbox(
+                            label="Enter YouTube Link of the Video", autofocus=True, lines=3
+                        )
+                        yt_btn = gr.Button("Download Audio from YouTube Link", size="lg")
+                    yt_audio_path = gr.Audio(
+                        label="Audio Extracted from the YouTube Video", interactive=False
                     )
+                    yt_btn.click(
+                        get_audio_from_yt_video,
+                        inputs=[yt_link],
+                        outputs=[yt_audio_path, file_uploaded],
+                    )
+            with gr.Column():
+                composer = gr.Dropdown(label="Arranger", choices=composers, value="composer1")
+                generate_btn = gr.Button("Generate")
     with gr.Group():
+        gr.HTML(
+            """
+            <div> <h3> <center> Listen to the generated MIDI. </h3> </div>
+            """
+        )
         with gr.Row().style(mobile_collapse=False, equal_height=True):
+            stereo_mix1 = gr.Audio(label="Listen to the Stereo Mix")
             wav_output1 = gr.Audio(label="Listen to the Generated MIDI")
+        with gr.Row():
+            stereo_mix2 = gr.File(label="Download the Stereo Mix (.mp3")
+            wav_output2 = gr.File(label="Download the Generated MIDI (.mp3)")
             midi_output = gr.File(label="Download the Generated MIDI (.mid)")
             generate_btn.click(
                 inference,
                 inputs=[file_uploaded, composer],
+                outputs=[wav_output1, wav_output2, midi_output, stereo_mix1, stereo_mix2],
             )
     with gr.Group():
         gr.Examples(
             [
             ],
             fn=inference,
             inputs=[file_uploaded, composer],
+            outputs=[wav_output1, wav_output2, midi_output, stereo_mix1, stereo_mix2],
             cache_examples=True,
         )
         gr.HTML(
             """

requirements.txt CHANGED Viewed

@@ -3,8 +3,8 @@ librosa
 pretty-midi==0.2.9
 essentia==2.1b6.dev1034
 pyFluidSynth==1.3.0
-git+https://github.com/huggingface/transformers
 pytube
 gradio
 resampy
-soundfile

 pretty-midi==0.2.9
 essentia==2.1b6.dev1034
 pyFluidSynth==1.3.0
+transformers
 pytube
 gradio
 resampy
+pydub

utils.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import numpy as np
+import pydub
def mp3_write(f: str, sr: int, x: np.ndarray, normalized: bool = False):
    """Encode an audio array as a 256 kbit/s MP3 file using pydub.

    Args:
        f: Destination passed to ``AudioSegment.export``.
        sr: Frame rate (samples per second) of ``x``.
        x: Audio samples. An array of shape ``(n, 2)`` is written as stereo;
            any other shape is written as a single channel.
        normalized: When True, ``x`` holds floats in [-1, 1) and is scaled to
            the 16-bit range; otherwise ``x`` is taken as 16-bit sample values.
    """
    channels = 2 if (x.ndim == 2 and x.shape[1] == 2) else 1
    scaled = x * 2**15 if normalized else x
    # Clip before casting: a sample at +1.0 scales to 32768, which does not fit
    # in int16 and would wrap around to a loud click instead of saturating.
    pcm_range = np.iinfo(np.int16)
    y = np.clip(scaled, pcm_range.min, pcm_range.max).astype(np.int16)
    # tobytes() emits C order, i.e. interleaved frames for an (n, 2) array.
    song = pydub.AudioSegment(y.tobytes(), frame_rate=sr, sample_width=2, channels=channels)
    song.export(f, format="mp3", bitrate="256k")
def normalize(audio: np.ndarray, min_y: float = -1.0, max_y: float = 1.0, eps: float = 1e-8):
    """Linearly rescale ``audio`` so its minimum and maximum span the target range.

    Args:
        audio: Input samples of any shape; the extrema are taken over the
            whole array, so all channels share one scale.
        min_y: Lower bound of the target range.
        max_y: Upper bound of the target range.
        eps: Margin by which the target range is shrunk at both ends.

    Returns:
        A new floating-point array with the same shape as ``audio``. An empty
        input is returned unchanged, and a constant input (for example pure
        silence) maps to the midpoint of the target range instead of dividing
        by zero.
    """
    audio = np.asarray(audio)
    if not np.issubdtype(audio.dtype, np.floating):
        # Integer arithmetic could overflow in ``audio - amin``.
        audio = audio.astype(np.float64)
    if audio.size == 0:
        return audio

    max_y -= eps
    min_y += eps
    amax = audio.max()
    amin = audio.min()
    span = amax - amin
    if span == 0:
        # No dynamic range to stretch; avoid producing NaN/inf.
        return np.full_like(audio, (max_y + min_y) / 2.0)
    return (max_y - min_y) * (audio - amin) / span + min_y