import spaces
import warnings
warnings.filterwarnings("ignore")

import gradio as gr
import torch
import torch.nn as nn

from diffusers.models import AutoencoderKL
from diffusers.schedulers import PNDMScheduler
from unet import AudioUNet3DConditionModel
from audio_encoder import ImageBindSegmaskAudioEncoder
from pipeline import AudioCondAnimationPipeline, generate_videos


device = torch.device("cuda")
dtype = torch.float16


def freeze_and_make_eval(model: nn.Module):
	for param in model.parameters():
		param.requires_grad = False
	model.eval()


def create_pipeline(device=torch.device("cuda"), dtype=torch.float32):
	# Prepare model components
	pretrained_stable_diffusion_path = "./pretrained/stable-diffusion-v1-5"
	
	checkpoint_path = "checkpoints/audio-cond_animation/avsync15_audio-cond_cfg/ckpts/checkpoint-37000/modules"
	category_text_encoding_mapping = torch.load('datasets/AVSync15/class_clip_text_encodings_stable-diffusion-v1-5.pt', map_location="cpu")
	
	scheduler = PNDMScheduler.from_pretrained(pretrained_stable_diffusion_path, subfolder="scheduler")
	vae = AutoencoderKL.from_pretrained(pretrained_stable_diffusion_path, subfolder="vae").to(device=device, dtype=dtype)
	audio_encoder = ImageBindSegmaskAudioEncoder(n_segment=12).to(device=device, dtype=dtype)
	freeze_and_make_eval(audio_encoder)
	unet = AudioUNet3DConditionModel.from_pretrained(checkpoint_path, subfolder="unet").to(device=device, dtype=dtype)
	
	pipeline = AudioCondAnimationPipeline(
		unet=unet,
		scheduler=scheduler,
		vae=vae,
		audio_encoder=audio_encoder,
		null_text_encodings_path="./pretrained/openai-clip-l_null_text_encoding.pt"
	)
	pipeline.to(torch_device=device, dtype=dtype)
	pipeline.set_progress_bar_config(disable=True)
	
	return pipeline, category_text_encoding_mapping

pipeline, category_text_encoding_mapping = create_pipeline(device, dtype)


@spaces.GPU(duration=120)
def generate_video(image, audio, text, audio_guidance_scale, denoising_step):
	
	# Look up the precomputed CLIP text encoding for the selected category (77 x 768)
	category_text_encoding = category_text_encoding_mapping[text].view(1, 77, 768)
	
	generate_videos(
		pipeline,
		audio_path=audio,
		image_path=image,
		category_text_encoding=category_text_encoding,
		image_size=(256, 256),
		video_fps=6,
		video_num_frame=12,
		text_guidance_scale=1.0,
		audio_guidance_scale=audio_guidance_scale,
		denoising_step=denoising_step,
		seed=123,
		save_path="./output_video.mp4",
		device=device
	)
	
	return "./output_video.mp4"


if __name__ == "__main__":
	
	categories = [
		"baby babbling crying", "dog barking", "hammering", "striking bowling", "cap gun shooting",
		"chicken crowing", "frog croaking", "lions roaring", "machine gun shooting", "playing cello",
		"playing trombone", "playing trumpet", "playing violin fiddle", "sharpen knife", "toilet flushing"
	]
	
	title = ""
	description = """
<div align="center">

<h1 style="font-size: 60px;">Audio-Synchronized Visual Animation</h1>

<p style="font-size: 30px;">
<a href="https://lzhangbj.github.io/projects/asva/asva.html">Project Webpage</a>
</p>

<p style="font-size: 30px;">
	<a href="https://lzhangbj.github.io/">Lin Zhang</a>,
	<a href="https://scholar.google.com/citations?user=6aYncPAAAAAJ">Shentong Mo</a>,
	<a href="https://yijingz02.github.io/">Yijing Zhang</a>,
	<a href="https://pedro-morgado.github.io/">Pedro Morgado</a>
</p>

<p style="font-size: 30px;">
University of Wisconsin Madison,
Carnegie Mellon University
</p>

<strong style="font-size: 30px;">ECCV 2024</strong>

<strong style="font-size: 25px;">Animate your images with audio-synchronized motion! </strong>

<p style="font-size: 18px;">Notes:</p>
<p style="font-size: 18px;">(1) Only the first 2 seconds of audio is used. </p>
<p style="font-size: 18px;">(2) Increase audio guidance scale for amplified visual dynamics. </p>
<p style="font-size: 18px;">(3) Increase sampling steps for higher visual quality. </p>

</div>
	"""
	
	# <p style="font-size: 20px;">Please be patient. Due to limited resources on huggingface, the generation may take up to 10mins </p>
	
	# Gradio Interface
	iface = gr.Interface(
		fn=generate_video,
		inputs=[
			gr.Image(label="Upload Image", type="filepath", height=256),
			gr.Audio(label="Upload Audio", type="filepath"),
			gr.Dropdown(choices=categories, label="Select Audio Category"),
			gr.Slider(minimum=1.0, maximum=12.0, step=0.1, value=4.0, label="Audio Guidance Scale"),
			gr.Slider(minimum=1, maximum=50, step=1, value=20, label="Sampling steps")
		],
		outputs=gr.Video(label="Generated Video", height=256),
		title=title,
		description=description,
		examples=[
			["./assets/lion_and_gun.png", "./assets/lions_roaring.wav", "lions roaring",  4.0, 20],
			["./assets/lion_and_gun.png", "./assets/machine_gun_shooting.wav", "machine gun shooting", 4.0, 20],
		]
	)
	
	# Launch the interface
	iface.launch()