from diffusers import DiffusionPipeline
from .invert import Inverter
from .generate import Generator
from .utils import init_model, seed_everything, get_frame_ids
import torch
from omegaconf import OmegaConf


class VidToMePipeline(DiffusionPipeline):

    def __init__(self, device="cuda", sd_version="1.5", float_precision="fp16", height=512, width=512):
        super().__init__()
        # Register configuration parameters
        self.register_to_config(device=device, sd_version=sd_version,
                                float_precision=float_precision, height=height, width=width)
        self.sd_version = sd_version
        self.float_precision = float_precision
        self.height = height
        self.width = width
        # Initialize the core pipeline components
        pipe, scheduler, model_key = init_model(device, sd_version, None, "none", float_precision)
        self.pipe = pipe
        self.scheduler = scheduler
        self.model_key = model_key

    def __call__(self, video_path=None, video_prompt=None, edit_prompt=None, control_type="none",
                 n_timesteps=50, guidance_scale=7.5, negative_prompt="ugly, blurry, low res",
                 frame_range=None, use_lora=False, seed=123,
                 local_merge_ratio=0.9, global_merge_ratio=0.8):
        # Dynamic config built from user inputs
        config = self._build_config(video_path, video_prompt, edit_prompt, control_type,
                                    n_timesteps, guidance_scale, negative_prompt, frame_range,
                                    use_lora, seed, local_merge_ratio, global_merge_ratio)

        # Seed for reproducibility - change as needed
        seed_everything(config['seed'])

        # Inversion stage
        print("Start inversion!")
        inversion = Inverter(self.pipe, self.scheduler, config)
        inversion(config['input_path'], config['inversion']['save_path'])

        # Generation stage
        print("Start generation!")
        generator = Generator(self.pipe, self.scheduler, config)
        frame_ids = get_frame_ids(config['generation']['frame_range'], None)
        generator(config['input_path'], config['generation']['latents_path'],
                  config['generation']['output_path'], frame_ids=frame_ids)
        print(f"Output generated at: {config['generation']['output_path']}")

    def
_build_config(self, video_path, video_prompt, edit_prompt, control_type,
                      n_timesteps, guidance_scale, negative_prompt, frame_range,
                      use_lora, seed, local_merge_ratio, global_merge_ratio, gen_control="none"):
        # Build the config with OmegaConf, abstracting as much as possible
        config = OmegaConf.create({
            'sd_version': self.sd_version,            # Stable Diffusion version from __init__
            'model_key': self.model_key or None,      # Optionally allow model_key to be None
            'input_path': video_path,                 # Path to the input video
            'work_dir': "workdir",                    # Default work directory
            'height': self.height,
            'width': self.width,
            'inversion': {
                'save_path': "${work_dir}/latents",   # Where latents are saved during inversion
                'prompt': video_prompt or "Default video prompt.",
                'n_frames': None,                     # None to invert all frames
                'steps': 50,                          # Default inversion steps
                'save_intermediate': False,           # Default, but can be exposed to the user
                'save_steps': 50,                     # Default
                'use_blip': False,                    # BLIP-based prompt creation
                'recon': False,                       # Reconstruct the input video from latents
                'control': control_type or "none",    # e.g. 'none', 'tile', 'softedge', 'depth'
                'control_scale': 1.0,                 # Default control scale
                'batch_size': 8,                      # Default batch size for inversion
                'force': False,                       # Force inversion even if latents exist
            },
            'generation': {
                'control': gen_control or "none",     # Generation control ('none' or e.g. Plug-and-Play)
                'pnp_attn_t': 0.5,                    # PnP attention injection threshold
                'pnp_f_t': 0.8,                       # PnP feature injection threshold
                'control_scale': 1.0,                 # Scale for ControlNet-like controls
                'guidance_scale': guidance_scale,     # Classifier-free guidance scale
                'n_timesteps': n_timesteps,           # Number of diffusion timesteps
                'negative_prompt': negative_prompt or "ugly, blurry, low res",
                'prompt': edit_prompt or None,        # Edit prompt used during generation
                'latents_path': "${work_dir}/latents",  # Latents path from inversion
                'output_path': "${work_dir}",         # Output directory for the edited video
                'chunk_size': 4,                      # Number of frames processed per chunk
                'chunk_ord': "mix-4",                 # Processing order for video chunks
                'local_merge_ratio': local_merge_ratio,    # Local token-merge ratio
                'merge_global': True,                 # Enable global merging
                'global_merge_ratio': global_merge_ratio,  # Global token-merge ratio
                'global_rand': 0.5,                   # Randomness in global merge
                'align_batch': True,                  # Align batch processing
                'frame_range': frame_range or [0, 32, 1],  # Default frame range [start, end, step]
                'frame_ids': None,                    # Specific frame IDs to edit
                'save_frame': True,                   # Save individual frames
                'use_lora': use_lora,                 # Enable LoRA if applicable
                # Additional LoRA configuration
                'lora': {
                    'pretrained_model_name_or_path_or_dict': None,  # Default LoRA model path
                    'lora_weight_name': None,
                    'lora_adapter': None,
                    'lora_weight': 1.0,
                },
            },
            'seed': seed,                             # Seed for reproducibility
            'device': "cuda",                         # Default to CUDA
            'float_precision': self.float_precision,  # Mixed-precision setting from __init__
            'enable_xformers_memory_efficient_attention': True  # xformers memory-efficient attention
        })
        return config


# Sample usage
# pipeline = VidToMePipeline(device="cuda", sd_version="2.1", float_precision="fp16")
# pipeline(video_path="path/to/video.mp4", video_prompt="A beautiful scene of a sunset",
#          edit_prompt="Make the sunset look more vibrant", control_type="depth", n_timesteps=50)
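
# Note: a minimal, illustrative sketch (not part of the pipeline) of how the
# "${work_dir}/latents" entries in _build_config are resolved by OmegaConf's
# variable interpolation when Inverter/Generator read them. The keys below
# simply mirror the config above with placeholder values.
# from omegaconf import OmegaConf
# cfg = OmegaConf.create({
#     'work_dir': "workdir",
#     'inversion': {'save_path': "${work_dir}/latents"},
#     'generation': {'latents_path': "${work_dir}/latents", 'output_path': "${work_dir}"},
# })
# print(cfg['inversion']['save_path'])     # -> "workdir/latents"
# print(cfg['generation']['output_path'])  # -> "workdir"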