File size: 11,804 Bytes
c978742
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137bd5e
 
 
 
c978742
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
# Experimental app to help with the process of generating music videos
# Requires youtube-dl to be installed
# pip install youtube-dl

import os
import random
from io import BytesIO
from pathlib import Path

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import torch
import youtube_dl
from diffusers.models import AutoencoderKL
from diffusers.schedulers import LMSDiscreteScheduler
from matplotlib import pyplot as plt
from stable_diffusion_videos import StableDiffusionWalkPipeline, generate_images, get_timesteps_arr

from huggingface_hub import HfFolder

# Persist the Hugging Face access token taken from the HF_TOKEN environment
# variable. Indexing os.environ directly means a missing variable raises
# KeyError at import time.
HfFolder().save_token(os.environ['HF_TOKEN'])

# Build the shared diffusion pipeline once, at import time; every handler below
# reuses this module-level `pipe`.
#   - Stable Diffusion v1.5 weights, "fp16" revision, loaded as torch.float16.
#   - The VAE is replaced with the "stabilityai/sd-vae-ft-ema" autoencoder.
#   - safety_checker=None: no safety checker component is passed to the pipeline.
#   - LMS discrete scheduler with a "scaled_linear" beta schedule.
# The pipeline is moved to the "cuda" device, so a CUDA-capable GPU is required.
pipe = StableDiffusionWalkPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    vae=AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-ema"),
    torch_dtype=torch.float16,
    revision="fp16",
    safety_checker=None,
    scheduler=LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"),
).to("cuda")


def download_example_clip(url, output_dir="./", output_filename="%(title)s.%(ext)s"):
    """Download the audio at *url* with youtube-dl and return the local file path.

    Args:
        url: Address of the clip to download.
        output_dir: Directory the file is written to.
        output_filename: File name, or a youtube-dl output template, used
            inside *output_dir*.

    Returns:
        The path of the downloaded (or already present) file, as a string.

    Raises:
        FileNotFoundError: If no file can be located in *output_dir* after the
            download finished.
    """
    out_dir = Path(output_dir)
    target = out_dir / output_filename

    # A literal (non-template) file name that is already on disk is reused.
    if target.exists():
        return str(target)

    files_before = set(os.listdir(output_dir)) if os.path.exists(output_dir) else set()
    # NOTE(review): the hyphenated keys below look like youtube-dl command-line
    # flag names rather than YoutubeDL API options -- confirm they have an effect.
    ydl_opts = {
        "outtmpl": str(target),
        "format": "bestaudio",
        "extract-audio": True,
        "audio-format": "mp3",
        "audio-quality": 0,
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

    # With a literal file name the download lands exactly on `target`.
    if target.exists():
        return str(target)

    # With a template the final name is unknown up front, so look for whatever
    # appeared in the directory. Sorting makes the choice deterministic when
    # more than one new entry shows up.
    new_files = sorted(set(os.listdir(output_dir)) - files_before)
    if not new_files:
        raise FileNotFoundError(
            f"No new file appeared in {str(output_dir)!r} after downloading {url!r}"
        )
    return str(out_dir / new_files[0])


def audio_data_to_buffer(y, sr):
    """Encode audio samples as WAV into an in-memory file-like object.

    Args:
        y: Audio samples to write.
        sr: Sample rate of *y*.

    Returns:
        A ``BytesIO`` holding the WAV data, rewound to position 0 and carrying
        a ``name`` attribute of ``"audio.wav"``.
    """
    wav_buffer = BytesIO()
    # Presumably lets consumers that inspect a file name see a ".wav" extension.
    wav_buffer.name = "audio.wav"
    sf.write(wav_buffer, y, format="WAV", samplerate=sr)
    # Rewind so the next reader starts at the beginning of the WAV data.
    wav_buffer.seek(0)
    return wav_buffer


def plot_array(y):
    """Plot *y* against its indices as a red line and return the figure.

    The chart is also written to ``timesteps_chart.png`` in the current
    working directory.

    Args:
        y: Array whose first dimension gives the number of points to plot.

    Returns:
        The matplotlib figure containing the plot.
    """
    # Draw through explicit Figure/Axes objects instead of pyplot's implicit
    # "current figure", so calls do not interfere with each other.
    fig, ax = plt.subplots()
    ax.plot(np.arange(y.shape[0]), y, color="red")
    ax.set_title("Line graph")
    ax.set_xlabel("X axis")
    ax.set_ylabel("Y axis")
    fig.savefig("timesteps_chart.png")
    # Unregister the figure from pyplot so repeated calls in a long-running app
    # do not accumulate open figures; the returned object itself stays usable.
    plt.close(fig)
    return fig


def on_slice_btn_click(audio, audio_start_sec, duration, fps, smooth, margin):
    """Handle the "Slice Audio" button: cut the clip and plot its weights.

    Args:
        audio: Path of the uploaded audio file, or None when nothing is loaded.
        audio_start_sec: Offset in seconds at which the slice starts.
        duration: Length of the slice in seconds.
        fps: Frames per second, forwarded to ``get_timesteps_arr``.
        smooth: Smoothing setting, forwarded to ``get_timesteps_arr``.
        margin: Margin setting, forwarded to ``get_timesteps_arr``.

    Returns:
        Two component updates: one for the sliced-audio player and one for the
        weights plot. Both are hidden when *audio* is None.
    """
    if audio is None:
        # Nothing to slice: hide both outputs.
        return [gr.update(visible=False) for _ in range(2)]

    samples, sample_rate = librosa.load(audio, offset=audio_start_sec, duration=duration)

    # The slice is re-encoded in memory and analysed from its own start (0).
    wav_buffer = audio_data_to_buffer(samples, sample_rate)
    weights = get_timesteps_arr(wav_buffer, 0, duration, fps=fps, margin=margin, smooth=smooth)

    audio_update = gr.update(value=(sample_rate, samples), visible=True)
    plot_update = gr.update(value=plot_array(weights), visible=True)
    return [audio_update, plot_update]


def on_audio_change_or_clear(audio):
    """Adjust the start/duration sliders to the newly loaded audio.

    Args:
        audio: Path of the audio file, or None when the input was cleared.

    Returns:
        Two slider updates (start, duration). Both sliders are hidden when
        *audio* is None; otherwise the start slider is capped at the clip
        length and the duration slider at the smaller of 10 and the clip
        length (both truncated to whole seconds).
    """
    if audio is None:
        return [gr.update(visible=False) for _ in range(2)]

    total_sec = librosa.get_duration(filename=audio)
    start_max = int(total_sec)
    duration_max = int(min(10, total_sec))
    return [
        gr.update(maximum=start_max, visible=True),
        gr.update(maximum=duration_max, visible=True),
    ]


def on_update_weight_settings_btn_click(sliced_audio, duration, fps, smooth, margin):
    """Recompute and redraw the weights plot for the current slice.

    Args:
        sliced_audio: The sliced audio value, or None when no slice exists.
        duration: Length of the slice in seconds.
        fps: Frames per second, forwarded to ``get_timesteps_arr``.
        smooth: Smoothing setting, forwarded to ``get_timesteps_arr``.
        margin: Margin setting, forwarded to ``get_timesteps_arr``.

    Returns:
        A plot update showing the new figure, or one hiding the plot when
        *sliced_audio* is None.
    """
    if sliced_audio is None:
        return gr.update(visible=False)

    weights = get_timesteps_arr(sliced_audio, 0, duration, fps=fps, margin=margin, smooth=smooth)
    figure = plot_array(weights)
    return gr.update(value=figure, visible=True)


def on_generate_images_btn_click(
    prompt_a,
    prompt_b,
    seed_a,
    seed_b,
    output_dir,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    upsample,
):
    """Generate the two preview images (A and B) for the given prompts.

    A seed of -1 is replaced by a random integer in [0, 9999999]; the seeds
    actually used are returned so the UI can display them.

    Returns:
        Four component updates: image A, image B, seed A, seed B.
    """
    images_dir = Path(output_dir) / "images"

    # Resolve both seeds before generating anything (A first, then B).
    seed_a = random.randint(0, 9999999) if seed_a == -1 else seed_a
    seed_b = random.randint(0, 9999999) if seed_b == -1 else seed_b

    # Settings shared by both generations.
    shared_kwargs = dict(
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        height=height,
        width=width,
        upsample=upsample,
        output_dir=images_dir,
    )
    # One image is generated per (prompt, seed) pair; the first element of the
    # result of each call is used as the image value.
    image_a_fpath, image_b_fpath = [
        generate_images(pipe, prompt, seeds=[seed], **shared_kwargs)[0]
        for prompt, seed in ((prompt_a, seed_a), (prompt_b, seed_b))
    ]

    image_updates = [gr.update(value=fpath, visible=True) for fpath in (image_a_fpath, image_b_fpath)]
    seed_updates = [gr.update(value=seed) for seed in (seed_a, seed_b)]
    return image_updates + seed_updates


def on_generate_music_video_btn_click(
    audio_filepath,
    audio_start_sec,
    duration,
    fps,
    smooth,
    margin,
    prompt_a,
    prompt_b,
    seed_a,
    seed_b,
    batch_size,
    output_dir,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    upsample,
):
    """Render the music video by walking from prompt A to prompt B.

    The number of interpolation steps is ``int(duration * fps)``, and the
    audio settings (start offset, margin, smoothing) are forwarded to
    ``pipe.walk`` together with the image-generation settings.

    A seed of -1 is replaced by a random integer in [0, 9999999], matching
    ``on_generate_images_btn_click`` and the "-1 means random seed" wording
    shown in the UI, instead of being passed through as a literal seed.

    Returns:
        A video component update showing the value returned by ``pipe.walk``,
        or one hiding the video when *audio_filepath* is None.
    """
    if audio_filepath is None:
        return gr.update(visible=False)

    # Resolve the "-1 = random" sentinel before handing the seeds to the pipeline.
    if seed_a == -1:
        seed_a = random.randint(0, 9999999)
    if seed_b == -1:
        seed_b = random.randint(0, 9999999)

    video_filepath = pipe.walk(
        prompts=[prompt_a, prompt_b],
        seeds=[seed_a, seed_b],
        num_interpolation_steps=int(duration * fps),
        output_dir=output_dir,
        fps=fps,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        height=height,
        width=width,
        upsample=upsample,
        batch_size=batch_size,
        audio_filepath=audio_filepath,
        audio_start_sec=audio_start_sec,
        margin=margin,
        smooth=smooth,
    )
    return gr.update(value=video_filepath, visible=True)


# UI components are created here, outside the Blocks context, and placed into
# the layout later with `.render()`. This lets event handlers reference them
# before the point where they appear on the page.

# Step 2: choose which part of the uploaded audio to use.
audio_start_sec = gr.Slider(0, 10, 0, step=1, label="Start (sec)", interactive=True)
duration = gr.Slider(0, 10, 1, step=1, label="Duration (sec)", interactive=True)
slice_btn = gr.Button("Slice Audio")

# Outputs of the slicing step: the sliced clip and its weights plot.
sliced_audio = gr.Audio(type="filepath")
wav_plot = gr.Plot(label="Interpolation Weights Per Frame")

# Step 3: settings forwarded to get_timesteps_arr / pipe.walk.
fps = gr.Slider(1, 60, 12, step=1, label="FPS", interactive=True)
smooth = gr.Slider(0, 1, 0.0, label="Smoothing", interactive=True)
margin = gr.Slider(1.0, 20.0, 1.0, step=0.5, label="Margin Max", interactive=True)
update_weight_settings_btn = gr.Button("Update Interpolation Weights")

# Step 4: prompts and seeds for the two end-point images. A seed of -1 is the
# "pick a random seed" sentinel handled in on_generate_images_btn_click.
prompt_a = gr.Textbox(value="blueberry spaghetti", label="Prompt A")
prompt_b = gr.Textbox(value="strawberry spaghetti", label="Prompt B")
seed_a = gr.Number(-1, label="Seed A", precision=0, interactive=True)
seed_b = gr.Number(-1, label="Seed B", precision=0, interactive=True)
generate_images_btn = gr.Button("Generate Images")
# The preview images start hidden; the generate-images handler makes them visible.
image_a = gr.Image(visible=False, label="Image A")
image_b = gr.Image(visible=False, label="Image B")

# Step 5: final video generation. The video starts hidden until one is produced.
batch_size = gr.Slider(1, 32, 1, step=1, label="Batch Size", interactive=True)
generate_music_video_btn = gr.Button("Generate Music Video")
video = gr.Video(visible=False, label="Video")

# Markdown shown above each numbered section of the UI. These strings are
# user-facing text rendered with gr.Markdown inside the Blocks layout.

# Section 1: audio upload.
STEP_1_MARKDOWN = """
## 1. Upload Some Audio
Upload an audio file to use as the source for the music video.
"""

# Section 2: slicing the audio.
STEP_2_MARKDOWN = """
## 2. Slice Portion of Audio for Generated Clip
Here you can slice a portion of the audio to use for the generated music video. The longer the audio, the more frames will be generated (which will take longer).
I suggest you use this app to make music videos in segments of 5-10 seconds at a time. Then, you can stitch the videos together using a video editor or ffmpeg later.
**Warning**: If your audio file is short, I do no check that the duration you chose is not longer than the audio. It may cause some issues, so just be mindful of that.
"""

# Section 3: interpolation-weight settings (FPS, smoothing, margin).
STEP_3_MARKDOWN = """
## 3. Set Interpolation Weight Settings
This section lets you play with the settings used to configure how we move through the latent space given the audio you sliced.
If you look at the graph on the right, you'll see in the X-axis how many frames. The Y-axis is the weight of Image A as we move through the latent space.
If you listen to the audio slice and look at the graph, you should see bumps at points where the audio energy is high (in our case, percussive energy).
"""

# Section 4: prompts, seeds and image-generation settings.
STEP_4_MARKDOWN = """
## 4. Select Prompts, Seeds, Settings, and Generate Images
Here you can select the settings for image generation.
Then, you can select prompts and seeds for generating images.
  - Image A will be first frame of the generated video.
  - Image B will be last frame of the generated video.
  - The video will be generated by interpolating between the two images using the audio you provided.
If you set the seeds to -1, a random seed will be used and saved for you, so you can explore different images given the same prompt.
"""


# Page layout and event wiring. Components declared at module level are placed
# with `.render()`; components only needed here are created inline.
with gr.Blocks() as demo:
    # --- Step 1: audio upload -------------------------------------------------
    gr.Markdown(STEP_1_MARKDOWN)
    audio = gr.Audio(type="filepath", interactive=True)
    # download_example_clip is called while the layout is being built, so the
    # example clip is fetched (or reused if ./music/thoughts.mp3 already exists)
    # when this module is loaded.
    gr.Examples(
        [
            download_example_clip(
                url="https://soundcloud.com/nateraw/thoughts", output_dir="./music", output_filename="thoughts.mp3"
            )
        ],
        inputs=audio,
        outputs=[audio_start_sec, duration],
        fn=on_audio_change_or_clear,
        cache_examples=True,
    )
    # Keep the start/duration sliders in sync with whatever audio is loaded.
    audio.change(on_audio_change_or_clear, audio, [audio_start_sec, duration])
    audio.clear(on_audio_change_or_clear, audio, [audio_start_sec, duration])

    # --- Step 2: slice the audio ----------------------------------------------
    gr.Markdown(STEP_2_MARKDOWN)
    audio_start_sec.render()
    duration.render()
    slice_btn.render()

    # Slicing updates both the sliced-audio player and the weights plot; fps,
    # smooth and margin are rendered further down in step 3.
    slice_btn.click(
        on_slice_btn_click, [audio, audio_start_sec, duration, fps, smooth, margin], [sliced_audio, wav_plot]
    )
    sliced_audio.render()

    # --- Step 3: interpolation weight settings --------------------------------
    gr.Markdown(STEP_3_MARKDOWN)

    # Settings on the left, weights plot on the right.
    with gr.Row():
        with gr.Column(scale=4):
            fps.render()
            smooth.render()
            margin.render()
            update_weight_settings_btn.render()
            update_weight_settings_btn.click(
                on_update_weight_settings_btn_click, [sliced_audio, duration, fps, smooth, margin], wav_plot
            )
        with gr.Column(scale=3):
            wav_plot.render()

    # --- Step 4: prompts, seeds and image settings ----------------------------
    gr.Markdown(STEP_4_MARKDOWN)

    # Generation settings shared by the image preview and the video; the
    # accordion starts collapsed.
    with gr.Accordion("Additional Settings", open=False):
        output_dir = gr.Textbox(value="./dreams", label="Output Directory")
        num_inference_steps = gr.Slider(1, 200, 50, step=10, label="Diffusion Inference Steps", interactive=True)
        guidance_scale = gr.Slider(1.0, 25.0, 7.5, step=0.5, label="Guidance Scale", interactive=True)
        height = gr.Slider(512, 1024, 512, step=64, label="Height", interactive=True)
        width = gr.Slider(512, 1024, 512, step=64, label="Width", interactive=True)
        upsample = gr.Checkbox(value=False, label="Upsample with Real-ESRGAN")

    # Prompt A with its seed beside it.
    with gr.Row():
        with gr.Column(scale=4):
            prompt_a.render()
        with gr.Column(scale=1):
            seed_a.render()

    # Prompt B with its seed beside it.
    with gr.Row():
        with gr.Column(scale=4):
            prompt_b.render()
        with gr.Column(scale=1):
            seed_b.render()

    generate_images_btn.render()

    # Preview images side by side.
    with gr.Row():
        with gr.Column(scale=1):
            image_a.render()
        with gr.Column(scale=1):
            image_b.render()

    # The handler also writes the seeds back, so a random seed chosen for -1
    # becomes visible in the seed fields.
    generate_images_btn.click(
        on_generate_images_btn_click,
        [prompt_a, prompt_b, seed_a, seed_b, output_dir, num_inference_steps, guidance_scale, height, width, upsample],
        [image_a, image_b, seed_a, seed_b],
    )

    # --- Step 5: generate the video -------------------------------------------
    gr.Markdown("## 5. Generate Music Video")
    # TODO - add equivalent code snippet to generate music video
    batch_size.render()
    generate_music_video_btn.render()
    # The input order must match the parameter order of
    # on_generate_music_video_btn_click.
    generate_music_video_btn.click(
        on_generate_music_video_btn_click,
        [
            audio,
            audio_start_sec,
            duration,
            fps,
            smooth,
            margin,
            prompt_a,
            prompt_b,
            seed_a,
            seed_b,
            batch_size,
            output_dir,
            num_inference_steps,
            guidance_scale,
            height,
            width,
            upsample,
        ],
        video,
    )
    video.render()


# Start the Gradio app (with debug=True) only when this file is run as a script.
if __name__ == "__main__":
    demo.launch(debug=True)