"""Prompt-conditioned image generation pipeline.

The public entry point is :func:`generate`; the remaining functions are the
individual pipeline stages (prompt encoding, latent initialisation, the
denoising loop and post-processing).
"""
import logging

import numpy as np
import torch
import torch.nn.functional as F
from tqdm import tqdm

from .ddpm import DDPMSampler
from .config import Config, default_config

WIDTH = 512
HEIGHT = 512
LATENTS_WIDTH = WIDTH // 8
LATENTS_HEIGHT = HEIGHT // 8

# Fixed token-sequence length every prompt is padded to before encoding.
_MAX_PROMPT_TOKENS = 77

logging.basicConfig(level=logging.INFO)


def validate_strength(strength):
    """Raise ``ValueError`` unless ``0 < strength <= 1``."""
    if not 0 < strength <= 1:
        raise ValueError("Strength must be between 0 and 1")


def initialize_generator(seed, device):
    """Create a ``torch.Generator`` on ``device``.

    Args:
        seed: Integer seed, or ``None`` to seed non-deterministically.
        device: Device the generator is created on.

    Returns:
        The seeded generator.
    """
    generator = torch.Generator(device=device)
    if seed is None:
        generator.seed()
    else:
        generator.manual_seed(seed)
    return generator


def _encode_text(text, tokenizer, clip, device):
    """Tokenise one string (padded to the fixed length) and run it through ``clip``."""
    token_ids = tokenizer.batch_encode_plus(
        [text], padding="max_length", max_length=_MAX_PROMPT_TOKENS
    ).input_ids
    token_ids = torch.tensor(token_ids, dtype=torch.long, device=device)
    return clip(token_ids)


def encode_prompt(prompt, uncond_prompt, do_cfg, tokenizer, clip, device):
    """Encode the text prompt(s) into the conditioning context.

    Args:
        prompt: Conditional prompt text.
        uncond_prompt: Unconditional prompt; ``None`` or empty is treated as
            ``""``. Only used when ``do_cfg`` is true.
        do_cfg: Whether classifier-free guidance is enabled.
        tokenizer: Object exposing ``batch_encode_plus(...).input_ids``.
        clip: Text-encoder module; it is moved to ``device``.
        device: Device on which the tokens are created and encoded.

    Returns:
        With ``do_cfg``: the conditional and unconditional encodings
        concatenated along dim 0, conditional first (``run_diffusion`` relies
        on this order when it chunks the model output). Otherwise the
        encoding of ``prompt`` alone.
    """
    clip.to(device)
    if not do_cfg:
        return _encode_text(prompt, tokenizer, clip, device)

    cond_context = _encode_text(prompt, tokenizer, clip, device)
    uncond_context = _encode_text(uncond_prompt or "", tokenizer, clip, device)
    return torch.cat([cond_context, uncond_context])


def rescale(x, old_range, new_range, clamp=False):
    """Linearly map ``x`` from ``old_range`` to ``new_range``.

    The computation is out-of-place, so the caller's tensor is never
    modified and integer inputs are promoted instead of failing on an
    in-place float multiply.

    Args:
        x: Tensor to rescale.
        old_range: ``(min, max)`` of the current value range.
        new_range: ``(min, max)`` of the target value range.
        clamp: When true, clamp the result into ``new_range``.

    Returns:
        A new tensor holding the rescaled values.
    """
    old_min, old_max = old_range
    new_min, new_max = new_range
    scale = (new_max - new_min) / (old_max - old_min)
    result = (x - old_min) * scale + new_min
    if clamp:
        result = result.clamp(new_min, new_max)
    return result


def preprocess_image(input_image):
    """Turn an input image into a normalised NCHW float tensor.

    Args:
        input_image: Object exposing ``resize((w, h))`` whose result
            ``np.array`` converts to a height x width x channels array
            (presumably a PIL image -- confirm against callers).

    Returns:
        A float32 tensor with a leading batch dimension of 1, channels
        moved ahead of the spatial dims, and values mapped from
        ``[0, 255]`` to ``[-1, 1]``.
    """
    resized = input_image.resize((WIDTH, HEIGHT))
    pixels = torch.tensor(np.array(resized), dtype=torch.float32)
    pixels = rescale(pixels, (0, 255), (-1, 1))
    # (H, W, C) -> (1, H, W, C) -> (1, C, H, W)
    return pixels.unsqueeze(0).permute(0, 3, 1, 2)


def encode_image(input_image, models, device):
    """Encode an input image into latents with ``models["encoder"]``.

    Args:
        input_image: Image accepted by :func:`preprocess_image`.
        models: Mapping that provides the ``"encoder"`` module.
        device: Device used for the encoder and all tensors.

    Returns:
        Whatever the encoder returns for the preprocessed image.
    """
    image_tensor = preprocess_image(input_image).to(device)

    encoder = models["encoder"]
    encoder.to(device)

    with torch.no_grad():
        # All-zero "noise" keeps the encoding deterministic.
        noise = torch.zeros((1, 4, LATENTS_HEIGHT, LATENTS_WIDTH), device=device)
        latents = encoder(image_tensor, noise)
    return latents


def _prepare_latent_mask(mask_image, device):
    """Convert a mask image into a 0-1 mask at latent resolution, one copy per latent channel."""
    # NOTE(review): assumes np.array(mask) is 2-D (single-channel mask); a
    # multi-channel mask would need reducing to one channel first -- confirm.
    mask = np.array(mask_image.resize((WIDTH, HEIGHT)))
    mask = torch.tensor(mask, dtype=torch.float32).to(device)
    mask = mask / 255.0  # normalise to 0-1
    mask = mask.unsqueeze(0).unsqueeze(0)  # add batch and channel dims
    # interpolate expects the output size as (height, width)
    mask = F.interpolate(mask, (LATENTS_HEIGHT, LATENTS_WIDTH))
    return mask.repeat(1, 4, 1, 1)


def initialize_latents(input_image, strength, generator, models, device,
                       sampler_name, n_inference_steps, mask_image=None):
    """Build the starting latents for the denoising loop.

    Without ``input_image`` the latents are pure Gaussian noise and
    ``strength`` / ``mask_image`` are ignored. With an image, it is encoded,
    optionally has its masked region replaced by noise, and is then blended
    with noise as ``(1 - strength) * latents + strength * noise``.

    All noise is drawn from ``generator`` so that a fixed seed reproduces
    the same result on every path.

    Args:
        input_image: Optional image accepted by :func:`preprocess_image`.
        strength: Noise blend weight for the image path.
        generator: ``torch.Generator`` on ``device`` (or ``None`` for the
            global RNG).
        models: Mapping that provides the ``"encoder"`` module.
        device: Device on which tensors are created.
        sampler_name: Unused; kept for interface compatibility.
        n_inference_steps: Unused; kept for interface compatibility.
        mask_image: Optional mask image; values near 255 mark the region
            that is replaced by noise.

    Returns:
        The initial latents tensor.
    """
    if input_image is None:
        return torch.randn(
            (1, 4, LATENTS_HEIGHT, LATENTS_WIDTH),
            generator=generator,
            device=device,
        )

    latents = encode_image(input_image, models, device)

    if mask_image is not None:
        mask = _prepare_latent_mask(mask_image, device)
        fill_noise = torch.randn(latents.shape, generator=generator, device=device)
        latents = latents * (1 - mask) + fill_noise * mask

    # Blend with noise according to strength (img2img).
    noise = torch.randn(latents.shape, generator=generator, device=device)
    return (1 - strength) * latents + strength * noise


def get_sampler(sampler_name, generator, n_inference_steps):
    """Create the sampler named ``sampler_name`` and set its inference steps.

    Raises:
        ValueError: If ``sampler_name`` is not ``"ddpm"``.
    """
    if sampler_name != "ddpm":
        raise ValueError(f"Unknown sampler value {sampler_name}.")

    sampler = DDPMSampler(generator)
    sampler.set_inference_timesteps(n_inference_steps)
    return sampler


def get_time_embedding(timestep):
    """Return the sinusoidal embedding of one timestep.

    Returns:
        A float32 tensor of shape ``(1, 320)``: 160 cosine terms followed by
        160 sine terms.
    """
    freqs = torch.pow(
        10000, -torch.arange(start=0, end=160, dtype=torch.float32) / 160
    )
    x = torch.tensor([timestep], dtype=torch.float32)[:, None] * freqs[None]
    return torch.cat([torch.cos(x), torch.sin(x)], dim=-1)


def run_diffusion(latents, context, do_cfg, cfg_scale, models, device,
                  sampler_name, n_inference_steps, generator):
    """Run the denoising loop and decode the final latents.

    Args:
        latents: Starting latents.
        context: Conditioning context from :func:`encode_prompt`.
        do_cfg: Whether to apply classifier-free guidance.
        cfg_scale: Guidance scale used when ``do_cfg`` is true.
        models: Mapping that provides ``"diffusion"`` and ``"decoder"``.
        device: Device the models are moved to.
        sampler_name: Sampler identifier passed to :func:`get_sampler`.
        n_inference_steps: Number of sampler steps.
        generator: Random generator handed to the sampler.

    Returns:
        The decoder output for the final latents.
    """
    diffusion = models["diffusion"]
    diffusion.to(device)

    sampler = get_sampler(sampler_name, generator, n_inference_steps)

    for timestep in tqdm(sampler.timesteps):
        time_embedding = get_time_embedding(timestep).to(device)

        # With guidance the same latents are evaluated under both contexts.
        model_input = latents.repeat(2, 1, 1, 1) if do_cfg else latents
        model_output = diffusion(model_input, context, time_embedding)

        if do_cfg:
            output_cond, output_uncond = model_output.chunk(2)
            model_output = cfg_scale * (output_cond - output_uncond) + output_uncond

        latents = sampler.step(timestep, latents, model_output)

    decoder = models["decoder"]
    decoder.to(device)
    return decoder(latents)


def postprocess_images(images):
    """Convert decoder output to a uint8 image array.

    Values are mapped from ``[-1, 1]`` to ``[0, 255]`` (clamped), the channel
    dim is moved last, and the first image of the batch is returned as a
    NumPy array on the CPU.
    """
    images = rescale(images, (-1, 1), (0, 255), clamp=True)
    images = images.permute(0, 2, 3, 1)
    images = images.to("cpu", torch.uint8).numpy()
    return images[0]


def generate(
    prompt,
    uncond_prompt=None,
    input_image=None,
    mask_image=None,
    config: Config = default_config,
):
    """Generate one image from a text prompt.

    Args:
        prompt: Non-empty prompt text.
        uncond_prompt: Optional unconditional prompt; ``None`` means ``""``.
        input_image: Optional starting image (img2img).
        mask_image: Optional mask used together with ``input_image``.
        config: Supplies the seed, device, tokenizer, models and diffusion
            settings read below.

    Returns:
        The generated image as produced by :func:`postprocess_images`.

    Raises:
        ValueError: If the prompt is empty or the configured strength is
            outside ``(0, 1]``.
    """
    with torch.no_grad():
        if prompt is None or prompt.strip() == "":
            raise ValueError("Prompt cannot be empty")
        if uncond_prompt is None:
            uncond_prompt = ""

        diffusion_cfg = config.diffusion
        device = config.device.device
        validate_strength(diffusion_cfg.strength)

        # Seeded generator for reproducibility.
        generator = initialize_generator(config.seed, device)

        context = encode_prompt(
            prompt,
            uncond_prompt,
            diffusion_cfg.do_cfg,
            config.tokenizer,
            config.models["clip"],
            device,
        )

        # Either pure noise or a noised encoding of the input image.
        latents = initialize_latents(
            input_image,
            diffusion_cfg.strength,
            generator,
            config.models,
            device,
            diffusion_cfg.sampler_name,
            diffusion_cfg.n_inference_steps,
            mask_image,
        )

        images = run_diffusion(
            latents,
            context,
            diffusion_cfg.do_cfg,
            diffusion_cfg.cfg_scale,
            config.models,
            device,
            diffusion_cfg.sampler_name,
            diffusion_cfg.n_inference_steps,
            generator,
        )

        return postprocess_images(images)