from diffusers import DiffusionPipeline
from .invert import Inverter
from .generate import Generator
from .utils import init_model, seed_everything, get_frame_ids
import torch
from omegaconf import OmegaConf

class VidToMePipeline(DiffusionPipeline):
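    """DiffusionPipeline-style wrapper around the VidToMe video editing pipeline.

    A call runs two stages: an inversion stage (Inverter) that caches the input
    video's latents under the working directory, then a generation stage
    (Generator) that re-renders the selected frames according to the edit prompt.
    """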

    def __init__(self, device="cuda", sd_version="1.5", float_precision="fp16", height=512, width=512):
        super().__init__()
        # Register the user-facing configuration parameters with the pipeline config.
        self.register_to_config(device=device, sd_version=sd_version, float_precision=float_precision, height=height, width=width)
        self.sd_version = sd_version
        self.float_precision = float_precision
        self.height = height
        self.width = width
        # Initialize the core pipeline components (underlying pipe, scheduler, model key).
        pipe, scheduler, model_key = init_model(device, sd_version, None, "none", float_precision)
        self.pipe = pipe
        self.scheduler = scheduler
        self.model_key = model_key

    def __call__(self, video_path=None, video_prompt=None, edit_prompt=None, 
                 control_type="none", n_timesteps=50, guidance_scale=7.5, 
                 negative_prompt="ugly, blurry, low res", frame_range=None, 
                 use_lora=False, seed=123, local_merge_ratio=0.9, global_merge_ratio=0.8):
        
        # Build the run configuration from the user-provided arguments.
        config = self._build_config(video_path, video_prompt, edit_prompt, control_type, 
                                    n_timesteps, guidance_scale, negative_prompt, 
                                    frame_range, use_lora, seed, local_merge_ratio, global_merge_ratio)
        
        # Seed everything for reproducibility.
        seed_everything(config['seed'])

        # inversion stage
        print("Start inversion!")
        inversion = Inverter(self.pipe, self.scheduler, config)
        inversion(config['input_path'], config['inversion']['save_path'])

        # generation stage
        print("Start generation!")
        generator = Generator(self.pipe, self.scheduler, config)
        frame_ids = get_frame_ids(config['generation']['frame_range'], None)
        generator(config['input_path'], config['generation']['latents_path'], 
                  config['generation']['output_path'], frame_ids=frame_ids)
        print(f"Output generated at: {config['generation']['output_path']}")

    def _build_config(self, video_path, video_prompt, edit_prompt, control_type, 
                  n_timesteps, guidance_scale, negative_prompt, frame_range, 
                  use_lora, seed, local_merge_ratio, global_merge_ratio, gen_control="none"):
        # Build config using OmegaConf, abstracting as much as possible
        config = OmegaConf.create({
            'sd_version': self.sd_version,  # Default sd_version
            'model_key': self.model_key or None,  # Optionally allow model_key to be None
            'input_path': video_path,  # Path to the video
            'work_dir': "workdir",  # Default workdir, can be abstracted further
            'height': self.height,
            'width': self.width,
            'inversion': {
                'save_path': "${work_dir}/latents",  # Save latents during inversion
                'prompt': video_prompt or "Default video prompt.",
                'n_frames': None,  # None to invert all frames
                'steps': 50,  # Default inversion steps
                'save_intermediate': False,  # Default, but can be abstracted to user
                'save_steps': 50,  # Default
                'use_blip': False,  # Abstract BLIP prompt creation
                'recon': False,  # Reconstruct the input video from latents
                'control': control_type or "none",  # Default to 'none', can use 'tile', 'softedge', etc.
                'control_scale': 1.0,  # Default control scale
                'batch_size': 8,  # Default batch size for inversion
                'force': False,  # Default, force inversion even if latents exist
            },
            'generation': {
                'control': gen_control or "none",  # Generation-stage control (e.g. "pnp"); defaults to "none" here
                'pnp_attn_t': 0.5,  # PnP args
                'pnp_f_t': 0.8,  # PnP args
                'control_scale': 1.0,  # Scale for ControlNet-like controls
                'guidance_scale': guidance_scale,  # Guidance scale for CFG
                'n_timesteps': n_timesteps,  # Number of diffusion timesteps
                'negative_prompt': negative_prompt or "ugly, blurry, low res",  # Negative prompt to avoid undesired generations
                'prompt': edit_prompt or None,  # Edit prompt during generation
                'latents_path': "${work_dir}/latents",  # Latents path from inversion
                'output_path': "${work_dir}",  # Output directory for final images
                'chunk_size': 4,  # Number of frames processed per chunk
                'chunk_ord': "mix-4",  # Processing order for video chunks
                'local_merge_ratio': local_merge_ratio,  # Merge ratio for blending
                'merge_global': True,  # Enable global merging
                'global_merge_ratio': global_merge_ratio,  # Global merge ratio
                'global_rand': 0.5,  # Randomness in global merge
                'align_batch': True,  # Align batch processing
                'frame_range': frame_range or [0, 32, 1],  # Default frame range
                'frame_ids': None,  # Specify frame IDs to edit
                'save_frame': True,  # Save individual frames
                'use_lora': use_lora,  # Enable LoRA if applicable
                # Additional LoRA configurations
                'lora': {
                    'pretrained_model_name_or_path_or_dict': None,  # Default LoRA model path
                    'lora_weight_name': None,
                    'lora_adapter': None,
                    'lora_weight': 1.0
                }
            },
            'seed': seed,  # Seed for reproducibility
            'device': "cuda",  # Default to CUDA
            'float_precision': "fp16",  # Enable mixed-precision
            'enable_xformers_memory_efficient_attention': True  # Default to enable xformers memory-efficient attention
        })
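        # Note: the "${work_dir}/latents" values above are OmegaConf interpolations;
        # they resolve against 'work_dir' when the field is accessed (e.g.
        # config.inversion.save_path evaluates to "workdir/latents"), so both
        # stages share a single working directory.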
        
        return config


# # Sample usage
# pipeline = VidToMePipeline(device="cuda", sd_version="2.1", float_precision="fp16")
# pipeline(video_path="path/to/video.mp4", video_prompt="A beautiful scene of a sunset", 
#          edit_prompt="Make the sunset look more vibrant", control_type="depth", n_timesteps=50)
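
# A minimal, hedged sketch of a command-line entry point. The argparse flags and
# placeholder defaults are illustrative assumptions, not part of the original
# interface. Because this module uses relative imports, run it as part of its
# package (python -m <package>.<module>) rather than as a standalone script.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Run VidToMe video editing.")
    parser.add_argument("--video_path", required=True, help="Path to the input video")
    parser.add_argument("--video_prompt", default=None, help="Prompt describing the input video")
    parser.add_argument("--edit_prompt", default=None, help="Prompt describing the desired edit")
    parser.add_argument("--control_type", default="none", help="Inversion control, e.g. 'depth' or 'none'")
    parser.add_argument("--seed", type=int, default=123)
    args = parser.parse_args()

    pipeline = VidToMePipeline(device="cuda", sd_version="1.5", float_precision="fp16")
    pipeline(video_path=args.video_path, video_prompt=args.video_prompt,
             edit_prompt=args.edit_prompt, control_type=args.control_type, seed=args.seed)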