from typing import Optional, Union import torch from transformers import CLIPTextModel, CLIPTokenizer, BertModel, BertTokenizer from diffusers import UNet2DConditionModel, SchedulerMixin from diffusers.image_processor import VaeImageProcessor import sys import os # sys.path.append(os.path.join(os.path.dirname(__file__), "../")) # from imagesliders.model_util import SDXL_TEXT_ENCODER_TYPE from diffusers.utils.torch_utils import randn_tensor from transformers import CLIPTextModel, CLIPTokenizer, CLIPTextModelWithProjection SDXL_TEXT_ENCODER_TYPE = Union[CLIPTextModel, CLIPTextModelWithProjection] from tqdm import tqdm UNET_IN_CHANNELS = 4 # Stable Diffusion in_channels VAE_SCALE_FACTOR = 8 # 2 ** (len(vae.config.block_out_channels) - 1) = 8 UNET_ATTENTION_TIME_EMBED_DIM = 256 # XL TEXT_ENCODER_2_PROJECTION_DIM = 1280 UNET_PROJECTION_CLASS_EMBEDDING_INPUT_DIM = 2816 def get_random_noise( batch_size: int, height: int, width: int, generator: torch.Generator = None ) -> torch.Tensor: return torch.randn( ( batch_size, UNET_IN_CHANNELS, height // VAE_SCALE_FACTOR, width // VAE_SCALE_FACTOR, ), generator=generator, device="cpu", ) def apply_noise_offset(latents: torch.FloatTensor, noise_offset: float): latents = latents + noise_offset * torch.randn( (latents.shape[0], latents.shape[1], 1, 1), device=latents.device ) return latents def get_initial_latents( scheduler: SchedulerMixin, n_imgs: int, height: int, width: int, n_prompts: int, generator=None, ) -> torch.Tensor: noise = get_random_noise(n_imgs, height, width, generator=generator).repeat( n_prompts, 1, 1, 1 ) latents = noise * scheduler.init_noise_sigma return latents def text_tokenize( tokenizer, # 普通ならひとつ、XLならふたつ! prompts, ): return tokenizer( prompts, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt", ) def text_encode(text_encoder , tokens): tokens = tokens.to(text_encoder.device) if isinstance(text_encoder, BertModel): embed = text_encoder(**tokens, return_dict=False)[0] elif isinstance(text_encoder, CLIPTextModel): # embed = text_encoder(**tokens, return_dict=False)[0] embed = text_encoder(tokens.input_ids, return_dict=False)[0] else: raise ValueError("text_encoder must be BertModel or CLIPTextModel") return embed def encode_prompts( tokenizer, text_encoder, prompts: list[str], ): # print(f"prompts: {prompts}") text_tokens = text_tokenize(tokenizer, prompts) # print(f"text_tokens: {text_tokens}") text_embeddings = text_encode(text_encoder, text_tokens) # print(f"text_embeddings: {text_embeddings}") return text_embeddings def prompt_replace(original, key="{prompt}", prompt=""): if key not in original: return original if isinstance(prompt, list): ret =[] for p in prompt: p = p.replace(".", "") r = original.replace(key, p) r = r.capitalize() ret.append(r) else: prompt = prompt.replace(".", "") ret = original.replace(key, prompt) ret = ret.capitalize() return ret def text_encode_xl( text_encoder: SDXL_TEXT_ENCODER_TYPE, tokens: torch.FloatTensor, num_images_per_prompt: int = 1, ): prompt_embeds = text_encoder( tokens.to(text_encoder.device), output_hidden_states=True ) pooled_prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.hidden_states[-2] # always penultimate layer bs_embed, seq_len, _ = prompt_embeds.shape prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) return prompt_embeds, pooled_prompt_embeds def encode_prompts_xl( tokenizers: list[CLIPTokenizer], text_encoders: list[SDXL_TEXT_ENCODER_TYPE], prompts: list[str], num_images_per_prompt: int = 1, ) -> tuple[torch.FloatTensor, torch.FloatTensor]: # text_encoder and text_encoder_2's penuultimate layer's output text_embeds_list = [] pooled_text_embeds = None # always text_encoder_2's pool for tokenizer, text_encoder in zip(tokenizers, text_encoders): text_tokens_input_ids = text_tokenize(tokenizer, prompts) text_embeds, pooled_text_embeds = text_encode_xl( text_encoder, text_tokens_input_ids, num_images_per_prompt ) text_embeds_list.append(text_embeds) bs_embed = pooled_text_embeds.shape[0] pooled_text_embeds = pooled_text_embeds.repeat(1, num_images_per_prompt).view( bs_embed * num_images_per_prompt, -1 ) return torch.concat(text_embeds_list, dim=-1), pooled_text_embeds def concat_embeddings( unconditional: torch.FloatTensor, conditional: torch.FloatTensor, n_imgs: int, ): if conditional.shape[0] == n_imgs and unconditional.shape[0] == 1: return torch.cat([unconditional.repeat(n_imgs, 1, 1), conditional], dim=0) return torch.cat([unconditional, conditional]).repeat_interleave(n_imgs, dim=0) def predict_noise( unet: UNet2DConditionModel, scheduler: SchedulerMixin, timestep: int, latents: torch.FloatTensor, text_embeddings: torch.FloatTensor, # uncond な text embed と cond な text embed を結合したもの guidance_scale=7.5, ) -> torch.FloatTensor: # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes. latent_model_input = torch.cat([latents] * 2) latent_model_input = scheduler.scale_model_input(latent_model_input, timestep) # batch_size = latents.shape[0] # text_embeddings = text_embeddings.repeat_interleave(batch_size, dim=0) # predict the noise residual noise_pred = unet( latent_model_input, timestep, encoder_hidden_states=text_embeddings, ).sample # perform guidance noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) guided_target = noise_pred_uncond + guidance_scale * ( noise_pred_text - noise_pred_uncond ) return guided_target @torch.no_grad() def diffusion( unet: UNet2DConditionModel, scheduler: SchedulerMixin, latents: torch.FloatTensor, text_embeddings: torch.FloatTensor, total_timesteps: int = 1000, start_timesteps=0, **kwargs, ): # latents_steps = [] for timestep in scheduler.timesteps[start_timesteps:total_timesteps]: noise_pred = predict_noise( unet, scheduler, timestep, latents, text_embeddings, **kwargs ) # compute the previous noisy sample x_t -> x_t-1 latents = scheduler.step(noise_pred, timestep, latents).prev_sample # return latents_steps return latents @torch.no_grad() def get_noisy_image( img, vae, generator, unet: UNet2DConditionModel, scheduler: SchedulerMixin, total_timesteps: int = 1000, start_timesteps=0, **kwargs, ): # latents_steps = [] vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1) image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor) image = img # im_orig = image device = vae.device image = image_processor.preprocess(image).to(device) init_latents = vae.encode(image).latent_dist.sample(None) init_latents = vae.config.scaling_factor * init_latents init_latents = torch.cat([init_latents], dim=0) shape = init_latents.shape noise = randn_tensor(shape, generator=generator, device=device) time_ = total_timesteps timestep = scheduler.timesteps[time_:time_+1] # get latents noised_latents = scheduler.add_noise(init_latents, noise, timestep) return noised_latents, noise, init_latents def subtract_noise( latent: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor, scheduler: SchedulerMixin, ) -> torch.FloatTensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls scheduler.alphas_cumprod = scheduler.alphas_cumprod.to(device=latent.device) alphas_cumprod = scheduler.alphas_cumprod.to(dtype=latent.dtype) timesteps = timesteps.to(latent.device) sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(latent.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() while len(sqrt_one_minus_alpha_prod.shape) < len(latent.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) denoised_latent = (latent - sqrt_one_minus_alpha_prod * noise) / sqrt_alpha_prod return denoised_latent def get_denoised_image( latents: torch.FloatTensor, noise_pred: torch.FloatTensor, timestep: int, # total_timesteps: int, scheduler: SchedulerMixin, vae: VaeImageProcessor, ): denoised_latents = subtract_noise(latents, noise_pred, timestep, scheduler) denoised_latents = denoised_latents / vae.config.scaling_factor # 0.18215 denoised_img = vae.decode(denoised_latents).sample # denoised_img = denoised_img.clamp(-1,1) return denoised_img def rescale_noise_cfg( noise_cfg: torch.FloatTensor, noise_pred_text, guidance_rescale=0.0 ): std_text = noise_pred_text.std( dim=list(range(1, noise_pred_text.ndim)), keepdim=True ) std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) # rescale the results from guidance (fixes overexposure) noise_pred_rescaled = noise_cfg * (std_text / std_cfg) # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images noise_cfg = ( guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg ) return noise_cfg def predict_noise_xl( unet: UNet2DConditionModel, scheduler: SchedulerMixin, timestep: int, latents: torch.FloatTensor, text_embeddings: torch.FloatTensor, # uncond な text embed と cond な text embed を結合したもの add_text_embeddings: torch.FloatTensor, # pooled なやつ add_time_ids: torch.FloatTensor, guidance_scale=7.5, guidance_rescale=0.7, ) -> torch.FloatTensor: # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes. latent_model_input = torch.cat([latents] * 2) latent_model_input = scheduler.scale_model_input(latent_model_input, timestep) added_cond_kwargs = { "text_embeds": add_text_embeddings, "time_ids": add_time_ids, } # predict the noise residual noise_pred = unet( latent_model_input, timestep, encoder_hidden_states=text_embeddings, added_cond_kwargs=added_cond_kwargs, ).sample # perform guidance noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) guided_target = noise_pred_uncond + guidance_scale * ( noise_pred_text - noise_pred_uncond ) noise_pred = rescale_noise_cfg( noise_pred, noise_pred_text, guidance_rescale=guidance_rescale ) return guided_target @torch.no_grad() def diffusion_xl( unet: UNet2DConditionModel, scheduler: SchedulerMixin, latents: torch.FloatTensor, text_embeddings: tuple[torch.FloatTensor, torch.FloatTensor], add_text_embeddings: torch.FloatTensor, add_time_ids: torch.FloatTensor, guidance_scale: float = 1.0, total_timesteps: int = 1000, start_timesteps=0, ): # latents_steps = [] for timestep in tqdm(scheduler.timesteps[start_timesteps:total_timesteps]): noise_pred = predict_noise_xl( unet, scheduler, timestep, latents, text_embeddings, add_text_embeddings, add_time_ids, guidance_scale=guidance_scale, guidance_rescale=0.7, ) # compute the previous noisy sample x_t -> x_t-1 latents = scheduler.step(noise_pred, timestep, latents).prev_sample # return latents_steps return latents # for XL def get_add_time_ids( height: int, width: int, dynamic_crops: bool = False, dtype: torch.dtype = torch.float32, ): if dynamic_crops: # random float scale between 1 and 3 random_scale = torch.rand(1).item() * 2 + 1 original_size = (int(height * random_scale), int(width * random_scale)) # random position crops_coords_top_left = ( torch.randint(0, original_size[0] - height, (1,)).item(), torch.randint(0, original_size[1] - width, (1,)).item(), ) target_size = (height, width) else: original_size = (height, width) crops_coords_top_left = (0, 0) target_size = (height, width) # this is expected as 6 add_time_ids = list(original_size + crops_coords_top_left + target_size) # this is expected as 2816 passed_add_embed_dim = ( UNET_ATTENTION_TIME_EMBED_DIM * len(add_time_ids) # 256 * 6 + TEXT_ENCODER_2_PROJECTION_DIM # + 1280 ) if passed_add_embed_dim != UNET_PROJECTION_CLASS_EMBEDDING_INPUT_DIM: raise ValueError( f"Model expects an added time embedding vector of length {UNET_PROJECTION_CLASS_EMBEDDING_INPUT_DIM}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." ) add_time_ids = torch.tensor([add_time_ids], dtype=dtype) return add_time_ids def get_optimizer(name: str): name = name.lower() if name.startswith("dadapt"): import dadaptation if name == "dadaptadam": return dadaptation.DAdaptAdam elif name == "dadaptlion": return dadaptation.DAdaptLion else: raise ValueError("DAdapt optimizer must be dadaptadam or dadaptlion") elif name.endswith("8bit"): import bitsandbytes as bnb if name == "adam8bit": return bnb.optim.Adam8bit elif name == "lion8bit": return bnb.optim.Lion8bit else: raise ValueError("8bit optimizer must be adam8bit or lion8bit") else: if name == "adam": return torch.optim.Adam elif name == "adamw": return torch.optim.AdamW elif name == "lion": from lion_pytorch import Lion return Lion elif name == "prodigy": import prodigyopt return prodigyopt.Prodigy else: raise ValueError("Optimizer must be adam, adamw, lion or Prodigy") def get_lr_scheduler( name: Optional[str], optimizer: torch.optim.Optimizer, max_iterations: Optional[int], lr_min: Optional[float], **kwargs, ): if name == "cosine": return torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, T_max=max_iterations, eta_min=lr_min, **kwargs ) elif name == "cosine_with_restarts": return torch.optim.lr_scheduler.CosineAnnealingWarmRestarts( optimizer, T_0=max_iterations // 10, T_mult=2, eta_min=lr_min, **kwargs ) elif name == "step": return torch.optim.lr_scheduler.StepLR( optimizer, step_size=max_iterations // 100, gamma=0.999, **kwargs ) elif name == "constant": return torch.optim.lr_scheduler.ConstantLR(optimizer, factor=1, **kwargs) elif name == "linear": return torch.optim.lr_scheduler.LinearLR( optimizer, factor=0.5, total_iters=max_iterations // 100, **kwargs ) else: raise ValueError( "Scheduler must be cosine, cosine_with_restarts, step, linear or constant" ) def get_random_resolution_in_bucket(bucket_resolution: int = 512) -> tuple[int, int]: max_resolution = bucket_resolution min_resolution = bucket_resolution // 2 step = 64 min_step = min_resolution // step max_step = max_resolution // step height = torch.randint(min_step, max_step, (1,)).item() * step width = torch.randint(min_step, max_step, (1,)).item() * step return height, width