from model.video_diffusion.models.controlnet3d import ControlNet3DModel
from model.video_diffusion.models.unet_3d_condition import UNetPseudo3DConditionModel
from model.video_diffusion.pipelines.pipeline_stable_diffusion_controlnet3d import Controlnet3DStableDiffusionPipeline
from transformers import DPTForDepthEstimation
from model.annotator.hed import HEDNetwork
import torch
from einops import rearrange, repeat
import imageio
import numpy as np
import cv2
import torch.nn.functional as F
from PIL import Image
import argparse
import tempfile
import os
import gradio as gr

control_mode = 'depth'
control_net_path = f"wf-genius/controlavideo-{control_mode}"

# Load the pseudo-3D UNet and the 3D ControlNet (fp16) from the selected repo.
unet = UNetPseudo3DConditionModel.from_pretrained(
    control_net_path,
    torch_dtype=torch.float16,
    subfolder='unet',
).to("cuda")
controlnet = ControlNet3DModel.from_pretrained(
    control_net_path,
    torch_dtype=torch.float16,
    subfolder='controlnet',
).to("cuda")

# Pick the annotator that produces the control signal for the chosen mode.
if control_mode == 'depth':
    annotator_model = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda")
elif control_mode == 'canny':
    annotator_model = None  # Canny edges are computed directly with OpenCV
elif control_mode == 'hed':
    # first download the weights from https://huggingface.co/wf-genius/controlavideo-hed/resolve/main/hed-network.pth
    annotator_model = HEDNetwork('hed-network.pth').to("cuda")
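
# Note: instead of downloading hed-network.pth by hand, the weights could be
# fetched programmatically. A minimal sketch (assumes the `huggingface_hub`
# package is available; not part of the original app):
#
#   from huggingface_hub import hf_hub_download
#   hed_path = hf_hub_download(repo_id="wf-genius/controlavideo-hed",
#                              filename="hed-network.pth")
#   annotator_model = HEDNetwork(hed_path).to("cuda")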

video_controlnet_pipe = Controlnet3DStableDiffusionPipeline.from_pretrained(
    control_net_path,
    unet=unet,
    controlnet=controlnet,
    annotator_model=annotator_model,
    torch_dtype=torch.float16,
).to("cuda")


def to_video(frames, fps: int) -> str:
    """Write the generated frames to a temporary mp4 file and return its path."""
    out_file = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
    writer = imageio.get_writer(out_file.name, format='FFMPEG', fps=fps)
    for frame in frames:
        writer.append_data(np.array(frame))
    writer.close()
    return out_file.name


def inference(input_video,
              prompt,
              seed,
              num_inference_steps,
              guidance_scale,
              sampling_rate,
              video_scale,
              init_noise_thres,
              each_sample_frame,
              iter_times,
              h,
              w,
              ):
    num_sample_frames = iter_times * each_sample_frame
    testing_prompt = [prompt]
    np_frames, fps_vid = Controlnet3DStableDiffusionPipeline.get_frames_preprocess(
        input_video, num_frames=num_sample_frames, sampling_rate=sampling_rate, return_np=True)

    # Compute per-frame control maps for the chosen control mode.
    if control_mode == 'depth':
        frames = torch.from_numpy(np_frames).div(255) * 2 - 1
        frames = rearrange(frames, "f h w c -> c f h w").unsqueeze(0)
        frames = rearrange(frames, 'b c f h w -> (b f) c h w')
        control_maps = video_controlnet_pipe.get_depth_map(frames, h, w, return_standard_norm=False)  # (b f) 1 h w
    elif control_mode == 'canny':
        control_maps = np.stack([cv2.Canny(inp, 100, 200) for inp in np_frames])
        control_maps = repeat(control_maps, 'f h w -> f c h w', c=1)
        control_maps = torch.from_numpy(control_maps).div(255)  # 0~1
    elif control_mode == 'hed':
        control_maps = np.stack([video_controlnet_pipe.get_hed_map(inp) for inp in np_frames])
        control_maps = repeat(control_maps, 'f h w -> f c h w', c=1)
        control_maps = torch.from_numpy(control_maps).div(255)  # 0~1

    control_maps = control_maps.to(dtype=controlnet.dtype, device=controlnet.device)
    control_maps = F.interpolate(control_maps, size=(h, w), mode='bilinear', align_corners=False)
    control_maps = rearrange(control_maps, "(b f) c h w -> b c f h w", f=num_sample_frames)
    if control_maps.shape[1] == 1:
        # tile single-channel maps to 3 channels
        control_maps = repeat(control_maps, 'b c f h w -> b (n c) f h w', n=3)

    # Resize the source frames that condition the video-to-video generation.
    frames = torch.from_numpy(np_frames).div(255)
    frames = rearrange(frames, 'f h w c -> f c h w')
    v2v_input_frames = torch.nn.functional.interpolate(
        frames,
        size=(h, w),
        mode="bicubic",
        antialias=True,
    )
    v2v_input_frames = rearrange(v2v_input_frames, '(b f) c h w -> b c f h w', f=num_sample_frames)

    # Generate the video in chunks of `each_sample_frame` frames. Chunks after
    # the first are conditioned on the previous chunk's last generated frame
    # (passed as `first_frame_output`) so consecutive chunks stay consistent.
    out = []
    for i in range(num_sample_frames // each_sample_frame):
        out1 = video_controlnet_pipe(
            controlnet_hint=control_maps[:, :, i*each_sample_frame-1:(i+1)*each_sample_frame-1, :, :] if i > 0 else control_maps[:, :, :each_sample_frame, :, :],
            images=v2v_input_frames[:, :, i*each_sample_frame-1:(i+1)*each_sample_frame-1, :, :] if i > 0 else v2v_input_frames[:, :, :each_sample_frame, :, :],
            first_frame_output=out[-1] if i > 0 else None,
            prompt=testing_prompt,
            num_inference_steps=num_inference_steps,
            width=w,
            height=h,
            guidance_scale=guidance_scale,
            generator=[torch.Generator(device="cuda").manual_seed(seed)],
            video_scale=video_scale,
            init_noise_by_residual_thres=init_noise_thres,  # residual-based init: a larger threshold gives a smoother video
            controlnet_conditioning_scale=1.0,
            fix_first_frame=True,
            in_domain=True,
        )
        out1 = out1.images[0]
        if len(out1) > 1:
            out1 = out1[1:]  # drop the first frame
        out.extend(out1)

    return to_video(out, 8)
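
# Example (illustrative, not part of the Gradio app): `inference` can be called
# directly on a local clip with the same defaults the UI exposes below.
#
#   video_path = inference("bear.mp4", "a bear walking through stars, artstation",
#                          seed=1, num_inference_steps=20, guidance_scale=7.5,
#                          sampling_rate=3, video_scale=1.5, init_noise_thres=0.1,
#                          each_sample_frame=8, iter_times=1, h=512, w=512)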

examples = [
    ["bear.mp4", "a bear walking through stars, artstation"],
    ["car-shadow.mp4", "a car, sunset, cartoon style, artstation."],
    ["libby.mp4", "a dog running, chinese ink painting."],
]


def preview_inference(
        input_video,
        prompt, seed,
        num_inference_steps, guidance_scale,
        sampling_rate, video_scale, init_noise_thres,
        each_sample_frame, iter_times, h, w,
):
    # Fast preview: a single frame (each_sample_frame=1, iter_times=1),
    # with video_scale and init_noise_thres set to 0.
    return inference(input_video,
                     prompt, seed,
                     num_inference_steps, guidance_scale,
                     sampling_rate, 0.0, 0.0, 1, 1, h, w,)


if __name__ == '__main__':
    with gr.Blocks() as demo:
        with gr.Row():
            # with gr.Column(scale=1):
            input_video = gr.Video(
                label="Input Video", source='upload', format="mp4", visible=True)
            with gr.Column():
                init_noise_thres = gr.Slider(0, 1, value=0.1, step=0.1, label="init_noise_thres")
                each_sample_frame = gr.Slider(6, 16, value=8, step=1, label="each_sample_frame")
                iter_times = gr.Slider(1, 4, value=1, step=1, label="iter_times")
                sampling_rate = gr.Slider(1, 8, value=3, step=1, label="sampling_rate")
                h = gr.Slider(256, 768, value=512, step=64, label="height")
                w = gr.Slider(256, 768, value=512, step=64, label="width")
            with gr.Column():
                seed = gr.Slider(0, 6666, value=1, step=1, label="seed")
                num_inference_steps = gr.Slider(5, 50, value=20, step=1, label="num_inference_steps")
                guidance_scale = gr.Slider(1, 20, value=7.5, step=0.5, label="guidance_scale")
                video_scale = gr.Slider(0, 2.5, value=1.5, step=0.1, label="video_scale")
                prompt = gr.Textbox(label='Prompt')
                # preview_button = gr.Button('Preview')
                run_button = gr.Button('Generate Video')
            # with gr.Column(scale=1):
            result = gr.Video(label="Generated Video")

        inputs = [
            input_video,
            prompt,
            seed,
            num_inference_steps,
            guidance_scale,
            sampling_rate,
            video_scale,
            init_noise_thres,
            each_sample_frame,
            iter_times,
            h,
            w,
        ]

        gr.Examples(examples=examples,
                    inputs=inputs,
                    outputs=result,
                    fn=inference,
                    cache_examples=False,
                    run_on_click=False,
                    )

        run_button.click(fn=inference,
                         inputs=inputs,
                         outputs=result,)
        # preview_button.click(fn=preview_inference,
        #                      inputs=inputs,
        #                      outputs=result,)

    demo.launch(server_name="0.0.0.0", server_port=7860)