nateraw committed
Commit c978742 · 1 Parent(s): c2c87de

Upload app.py with huggingface_hub

Files changed (1)
  1. app.py +366 -0
app.py ADDED
@@ -0,0 +1,366 @@
# Experimental app to help with the process of generating music videos
# Requires youtube-dl to be installed
# pip install youtube-dl

import os
import random
from io import BytesIO
from pathlib import Path

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import torch
import youtube_dl
from diffusers.models import AutoencoderKL
from diffusers.schedulers import LMSDiscreteScheduler
from matplotlib import pyplot as plt
from stable_diffusion_videos import StableDiffusionWalkPipeline, generate_images, get_timesteps_arr

pipe = StableDiffusionWalkPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    vae=AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-ema"),
    torch_dtype=torch.float16,
    revision="fp16",
    safety_checker=None,
    scheduler=LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"),
).to("cuda")
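# The pipeline above loads Stable Diffusion v1.5 in fp16 on the GPU, swaps in the
# EMA-finetuned VAE for slightly cleaner decodes, disables the safety checker, and
# uses an LMS scheduler with the standard scaled-linear beta schedule. Note that
# .to("cuda") means the app assumes a CUDA-capable GPU is available.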


def download_example_clip(url, output_dir="./", output_filename="%(title)s.%(ext)s"):
    if (Path(output_dir) / output_filename).exists():
        return str(Path(output_dir) / output_filename)

    files_before = os.listdir(output_dir) if os.path.exists(output_dir) else []
    ydl_opts = {
        "outtmpl": str(Path(output_dir) / output_filename),
        "format": "bestaudio",
        "extract-audio": True,
        "audio-format": "mp3",
        "audio-quality": 0,
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

    files_after = os.listdir(output_dir)
    return str(Path(output_dir) / list(set(files_after) - set(files_before))[0])
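# download_example_clip returns the cached file if it already exists; otherwise it
# downloads the best audio stream and locates the new file by diffing the directory
# listing before and after the download. Example (the same call used for the demo
# clip further below):
#
#   clip = download_example_clip(
#       "https://soundcloud.com/nateraw/thoughts",
#       output_dir="./music",
#       output_filename="thoughts.mp3",
#   )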


def audio_data_to_buffer(y, sr):
    audio_filepath = BytesIO()
    audio_filepath.name = "audio.wav"
    sf.write(audio_filepath, y, samplerate=sr, format="WAV")
    audio_filepath.seek(0)
    return audio_filepath
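# audio_data_to_buffer writes the sliced waveform to an in-memory WAV (a BytesIO with
# a .name attribute so soundfile treats it like a file), which lets get_timesteps_arr
# consume the slice without anything being written to disk.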


def plot_array(y):
    fig = plt.figure()
    x = np.arange(y.shape[0])
    plt.title("Line graph")
    plt.xlabel("X axis")
    plt.ylabel("Y axis")
    plt.plot(x, y, color="red")
    plt.savefig("timesteps_chart.png")
    return fig


def on_slice_btn_click(audio, audio_start_sec, duration, fps, smooth, margin):
    if audio is None:
        return [
            gr.update(visible=False),
            gr.update(visible=False),
        ]

    y, sr = librosa.load(audio, offset=audio_start_sec, duration=duration)
    T = get_timesteps_arr(
        audio_data_to_buffer(y, sr),
        0,
        duration,
        fps=fps,
        margin=margin,
        smooth=smooth,
    )
    return [gr.update(value=(sr, y), visible=True), gr.update(value=plot_array(T), visible=True)]
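# T holds one interpolation weight per video frame (roughly duration * fps values),
# derived from the audio's energy, so the walk moves faster through the latent space
# where the music is busier. A rough standalone sketch of previewing those weights
# for a local file (same arguments as above; the path and values are just examples):
#
#   y, sr = librosa.load("./music/thoughts.mp3", offset=0, duration=5)
#   T = get_timesteps_arr(audio_data_to_buffer(y, sr), 0, 5, fps=12, margin=1.0, smooth=0.0)
#   plot_array(T)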


def on_audio_change_or_clear(audio):
    if audio is None:
        return [gr.update(visible=False), gr.update(visible=False)]

    duration = librosa.get_duration(filename=audio)
    return [gr.update(maximum=int(duration), visible=True), gr.update(maximum=int(min(10, duration)), visible=True)]


def on_update_weight_settings_btn_click(sliced_audio, duration, fps, smooth, margin):
    if sliced_audio is None:
        return gr.update(visible=False)

    T = get_timesteps_arr(
        sliced_audio,
        0,
        duration,
        fps=fps,
        margin=margin,
        smooth=smooth,
    )
    return gr.update(value=plot_array(T), visible=True)


def on_generate_images_btn_click(
    prompt_a,
    prompt_b,
    seed_a,
    seed_b,
    output_dir,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    upsample,
):
    output_dir = Path(output_dir) / "images"

    if seed_a == -1:
        seed_a = random.randint(0, 9999999)
    if seed_b == -1:
        seed_b = random.randint(0, 9999999)

    image_a_fpath = generate_images(
        pipe,
        prompt_a,
        seeds=[seed_a],
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        height=height,
        width=width,
        upsample=upsample,
        output_dir=output_dir,
    )[0]
    image_b_fpath = generate_images(
        pipe,
        prompt_b,
        seeds=[seed_b],
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        height=height,
        width=width,
        upsample=upsample,
        output_dir=output_dir,
    )[0]

    return [
        gr.update(value=image_a_fpath, visible=True),
        gr.update(value=image_b_fpath, visible=True),
        gr.update(value=seed_a),
        gr.update(value=seed_b),
    ]
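# This handler renders only the two endpoint frames. When a seed is left at -1, a
# random seed is drawn and written back into the corresponding seed field, so the
# same endpoint images can be reproduced when the full video is generated.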


def on_generate_music_video_btn_click(
    audio_filepath,
    audio_start_sec,
    duration,
    fps,
    smooth,
    margin,
    prompt_a,
    prompt_b,
    seed_a,
    seed_b,
    batch_size,
    output_dir,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    upsample,
):
    if audio_filepath is None:
        return gr.update(visible=False)

    video_filepath = pipe.walk(
        prompts=[prompt_a, prompt_b],
        seeds=[seed_a, seed_b],
        num_interpolation_steps=int(duration * fps),
        output_dir=output_dir,
        fps=fps,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        height=height,
        width=width,
        upsample=upsample,
        batch_size=batch_size,
        audio_filepath=audio_filepath,
        audio_start_sec=audio_start_sec,
        margin=margin,
        smooth=smooth,
    )
    return gr.update(value=video_filepath, visible=True)
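# For reference, an equivalent standalone pipe.walk call outside of Gradio (a rough
# sketch using the app's defaults; the seeds and audio path are placeholder values):
#
#   video_filepath = pipe.walk(
#       prompts=["blueberry spaghetti", "strawberry spaghetti"],
#       seeds=[42, 1337],
#       num_interpolation_steps=5 * 12,  # duration (sec) * fps
#       fps=12,
#       audio_filepath="./music/thoughts.mp3",
#       audio_start_sec=0,
#       batch_size=1,
#       output_dir="./dreams",
#   )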


audio_start_sec = gr.Slider(0, 10, 0, step=1, label="Start (sec)", interactive=True)
duration = gr.Slider(0, 10, 1, step=1, label="Duration (sec)", interactive=True)
slice_btn = gr.Button("Slice Audio")

sliced_audio = gr.Audio(type="filepath")
wav_plot = gr.Plot(label="Interpolation Weights Per Frame")

fps = gr.Slider(1, 60, 12, step=1, label="FPS", interactive=True)
smooth = gr.Slider(0, 1, 0.0, label="Smoothing", interactive=True)
margin = gr.Slider(1.0, 20.0, 1.0, step=0.5, label="Margin Max", interactive=True)
update_weight_settings_btn = gr.Button("Update Interpolation Weights")

prompt_a = gr.Textbox(value="blueberry spaghetti", label="Prompt A")
prompt_b = gr.Textbox(value="strawberry spaghetti", label="Prompt B")
seed_a = gr.Number(-1, label="Seed A", precision=0, interactive=True)
seed_b = gr.Number(-1, label="Seed B", precision=0, interactive=True)
generate_images_btn = gr.Button("Generate Images")
image_a = gr.Image(visible=False, label="Image A")
image_b = gr.Image(visible=False, label="Image B")

batch_size = gr.Slider(1, 32, 1, step=1, label="Batch Size", interactive=True)
generate_music_video_btn = gr.Button("Generate Music Video")
video = gr.Video(visible=False, label="Video")
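# The components above are instantiated outside the gr.Blocks context and placed with
# .render() calls inside the layout below. This lets event handlers and gr.Examples
# reference components (e.g. audio_start_sec, duration) before the point where they
# visually appear in the UI.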

STEP_1_MARKDOWN = """
## 1. Upload Some Audio
Upload an audio file to use as the source for the music video.
"""

STEP_2_MARKDOWN = """
## 2. Slice a Portion of the Audio for the Generated Clip
Here you can slice a portion of the audio to use for the generated music video. The longer the slice, the more frames will be generated (and the longer it will take).
I suggest making music videos in segments of 5-10 seconds at a time, then stitching the clips together later with a video editor or ffmpeg.
**Warning**: There is no check that the duration you choose fits within the audio file, so if your audio is short, be mindful that an overly long duration may cause issues.
"""

STEP_3_MARKDOWN = """
## 3. Set Interpolation Weight Settings
This section lets you play with the settings that configure how we move through the latent space given the audio you sliced.
In the graph on the right, the X-axis is the frame index and the Y-axis is the weight of Image A as we move through the latent space.
If you listen to the audio slice and look at the graph, you should see bumps at points where the audio energy is high (in our case, percussive energy).
"""

STEP_4_MARKDOWN = """
## 4. Select Prompts, Seeds, Settings, and Generate Images
Here you can choose the image generation settings, then pick prompts and seeds for the two endpoint images.
- Image A will be the first frame of the generated video.
- Image B will be the last frame of the generated video.
- The video will be generated by interpolating between the two images using the audio you provided.
If you set a seed to -1, a random seed will be used and saved for you, so you can explore different images for the same prompt.
"""


with gr.Blocks() as demo:
    gr.Markdown(STEP_1_MARKDOWN)
    audio = gr.Audio(type="filepath", interactive=True)
    gr.Examples(
        [
            download_example_clip(
                url="https://soundcloud.com/nateraw/thoughts", output_dir="./music", output_filename="thoughts.mp3"
            )
        ],
        inputs=audio,
        outputs=[audio_start_sec, duration],
        fn=on_audio_change_or_clear,
        cache_examples=True,
    )
    audio.change(on_audio_change_or_clear, audio, [audio_start_sec, duration])
    audio.clear(on_audio_change_or_clear, audio, [audio_start_sec, duration])

    gr.Markdown(STEP_2_MARKDOWN)
    audio_start_sec.render()
    duration.render()
    slice_btn.render()

    slice_btn.click(
        on_slice_btn_click, [audio, audio_start_sec, duration, fps, smooth, margin], [sliced_audio, wav_plot]
    )
    sliced_audio.render()

    gr.Markdown(STEP_3_MARKDOWN)

    with gr.Row():
        with gr.Column(scale=4):
            fps.render()
            smooth.render()
            margin.render()
            update_weight_settings_btn.render()
            update_weight_settings_btn.click(
                on_update_weight_settings_btn_click, [sliced_audio, duration, fps, smooth, margin], wav_plot
            )
        with gr.Column(scale=3):
            wav_plot.render()

    gr.Markdown(STEP_4_MARKDOWN)

    with gr.Accordion("Additional Settings", open=False):
        output_dir = gr.Textbox(value="./dreams", label="Output Directory")
        num_inference_steps = gr.Slider(1, 200, 50, step=10, label="Diffusion Inference Steps", interactive=True)
        guidance_scale = gr.Slider(1.0, 25.0, 7.5, step=0.5, label="Guidance Scale", interactive=True)
        height = gr.Slider(512, 1024, 512, step=64, label="Height", interactive=True)
        width = gr.Slider(512, 1024, 512, step=64, label="Width", interactive=True)
        upsample = gr.Checkbox(value=False, label="Upsample with Real-ESRGAN")

    with gr.Row():
        with gr.Column(scale=4):
            prompt_a.render()
        with gr.Column(scale=1):
            seed_a.render()

    with gr.Row():
        with gr.Column(scale=4):
            prompt_b.render()
        with gr.Column(scale=1):
            seed_b.render()

    generate_images_btn.render()

    with gr.Row():
        with gr.Column(scale=1):
            image_a.render()
        with gr.Column(scale=1):
            image_b.render()

    generate_images_btn.click(
        on_generate_images_btn_click,
        [prompt_a, prompt_b, seed_a, seed_b, output_dir, num_inference_steps, guidance_scale, height, width, upsample],
        [image_a, image_b, seed_a, seed_b],
    )

    gr.Markdown("## 5. Generate Music Video")
    # TODO - add equivalent code snippet to generate music video
    batch_size.render()
    generate_music_video_btn.render()
    generate_music_video_btn.click(
        on_generate_music_video_btn_click,
        [
            audio,
            audio_start_sec,
            duration,
            fps,
            smooth,
            margin,
            prompt_a,
            prompt_b,
            seed_a,
            seed_b,
            batch_size,
            output_dir,
            num_inference_steps,
            guidance_scale,
            height,
            width,
            upsample,
        ],
        video,
    )
    video.render()


if __name__ == "__main__":
    demo.launch(debug=True)
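# To run the app locally (assuming a CUDA GPU and the dependencies imported above,
# including youtube-dl and stable_diffusion_videos):
#
#   python app.py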