# Spaces: Running
# model.py | |
import os | |
from diffusers.models import AutoencoderKL, UNet2DConditionModel | |
from diffusers.models.attention_processor import AttnProcessor | |
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker | |
from diffusers.schedulers import KarrasDiffusionSchedulers | |
import torch | |
import torch.nn.functional as F | |
import tqdm | |
import numpy as np | |
import safetensors | |
from PIL import Image | |
from torchvision import transforms | |
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer | |
from diffusers import StableDiffusionPipeline | |
from argparse import ArgumentParser | |
import inspect | |
from utils.model_utils import get_img, slerp, do_replace_attn | |
from utils.lora_utils import train_lora, load_lora | |
from utils.alpha_scheduler import AlphaScheduler | |
class StoreProcessor:
    """Attention-processor wrapper that records self-attention inputs.

    Every self-attention call (one made without ``encoder_hidden_states``)
    stores a detached copy of ``hidden_states`` in
    ``value_dict[name][call_index]`` before delegating to the wrapped
    processor. Cross-attention calls are forwarded untouched and do not
    advance the call index.
    """

    def __init__(self, original_processor, value_dict, name):
        self.original_processor = original_processor
        self.value_dict = value_dict
        self.name = name
        # Start from an empty cache for this layer on every (re)installation.
        self.value_dict[self.name] = dict()
        self.id = 0

    def __call__(self, attn, hidden_states, *args, encoder_hidden_states=None, attention_mask=None, **kwargs):
        is_self_attention = encoder_hidden_states is None
        if is_self_attention:
            layer_cache = self.value_dict[self.name]
            layer_cache[self.id] = hidden_states.detach()
            self.id += 1
        return self.original_processor(
            attn,
            hidden_states,
            *args,
            encoder_hidden_states=encoder_hidden_states,
            attention_mask=attention_mask,
            **kwargs,
        )
class LoadProcessor():
    """Attention-processor wrapper that injects interpolated self-attention context.

    For self-attention calls (no ``encoder_hidden_states``) made during the
    first ``lamd`` fraction of the sampling run, the wrapped processor is
    given, as ``encoder_hidden_states``, a blend of the current
    ``hidden_states`` and the maps cached for the two endpoint images::

        beta * hidden_states + (1 - beta) * ((1 - alpha) * map0 + alpha * map1)

    Later self-attention calls and all cross-attention calls are forwarded
    unchanged.

    Args:
        original_processor: The processor being wrapped.
        name: Key of this layer inside ``img0_dict`` / ``img1_dict``.
        img0_dict: ``{layer_name: {call_index: tensor}}`` cache for image 0.
        img1_dict: Same layout, for image 1.
        alpha: Interpolation weight between the two cached maps.
        beta: Weight of the live ``hidden_states`` in the blend.
        lamd: Fraction of the run during which the blend is injected.
        num_inference_steps: Length of the run that ``lamd`` is a fraction
            of. ``None`` (default) uses the number of maps cached for this
            layer, so the fraction stays correct for schedules that are not
            50 steps long. Pass ``50`` to reproduce the previous fixed
            threshold.
    """

    def __init__(self, original_processor, name, img0_dict, img1_dict, alpha, beta=0, lamd=0.6,
                 num_inference_steps=None):
        super().__init__()
        self.original_processor = original_processor
        self.name = name
        self.img0_dict = img0_dict
        self.img1_dict = img1_dict
        self.alpha = alpha
        self.beta = beta
        self.lamd = lamd
        self.num_inference_steps = num_inference_steps
        # Index of the next self-attention call within the current run.
        self.id = 0

    def __call__(self, attn, hidden_states, *args, encoder_hidden_states=None, attention_mask=None, **kwargs):
        # Cross-attention: nothing to inject, and the call counter is untouched.
        if encoder_hidden_states is not None:
            return self.original_processor(attn, hidden_states, *args,
                                           encoder_hidden_states=encoder_hidden_states,
                                           attention_mask=attention_mask,
                                           **kwargs)

        maps0 = self.img0_dict[self.name]
        if self.num_inference_steps is None:
            # One map is cached per self-attention call of the reference run.
            total_steps = len(maps0)
        else:
            total_steps = self.num_inference_steps

        context = None
        if self.id < total_steps * self.lamd:
            map0 = maps0[self.id]
            map1 = self.img1_dict[self.name][self.id]
            context = self.beta * hidden_states + \
                (1 - self.beta) * ((1 - self.alpha) * map0 + self.alpha * map1)

        res = self.original_processor(attn, hidden_states, *args,
                                      encoder_hidden_states=context,
                                      attention_mask=attention_mask,
                                      **kwargs)

        # Wrap around so the same instance can serve another full run.
        self.id += 1
        if self.id == len(maps0):
            self.id = 0
        return res
class DiffMorpherPipeline(StableDiffusionPipeline): | |
def __init__(self,
             vae: AutoencoderKL,
             text_encoder: CLIPTextModel,
             tokenizer: CLIPTokenizer,
             unet: UNet2DConditionModel,
             scheduler: KarrasDiffusionSchedulers,
             safety_checker: StableDiffusionSafetyChecker,
             feature_extractor: CLIPImageProcessor,
             image_encoder=None,
             requires_safety_checker: bool = True,
             ):
    """Build the pipeline on top of the parent Stable Diffusion constructor.

    The parent ``__init__`` is inspected at runtime: ``image_encoder`` is
    forwarded only when the parent signature has a parameter of that name.
    Two empty dictionaries, ``img0_dict`` and ``img1_dict``, are created to
    hold the attention maps recorded for the two endpoint images.
    """
    parent_init = super().__init__
    positional = [vae, text_encoder, tokenizer, unet, scheduler,
                  safety_checker, feature_extractor]
    if 'image_encoder' in inspect.signature(parent_init).parameters:
        positional.append(image_encoder)
    positional.append(requires_safety_checker)
    parent_init(*positional)

    self.img0_dict = dict()
    self.img1_dict = dict()
def inv_step(
    self,
    model_output: torch.FloatTensor,
    timestep: int,
    x: torch.FloatTensor,
    eta=0.,
    verbose=False
):
    """
    One DDIM inversion step: move sample ``x`` to noise level ``timestep``.

    ``x`` is treated as living one scheduler stride below ``timestep``
    (stride = num_train_timesteps // num_inference_steps); when that lower
    index is negative, ``alphas_cumprod[0]`` is used instead.

    Returns ``(x_next, pred_x0)``. ``eta`` is accepted but not used.
    """
    if verbose:
        print("timestep: ", timestep)

    scheduler = self.scheduler
    stride = scheduler.config.num_train_timesteps // scheduler.num_inference_steps
    target_t = timestep
    source_t = min(timestep - stride, 999)

    cumprod = scheduler.alphas_cumprod
    if source_t >= 0:
        alpha_source = cumprod[source_t]
    else:
        alpha_source = cumprod[0]
    alpha_target = cumprod[target_t]

    noise_scale_source = (1 - alpha_source) ** 0.5
    pred_x0 = (x - noise_scale_source * model_output) / alpha_source ** 0.5
    direction = (1 - alpha_target) ** 0.5 * model_output
    x_next = alpha_target ** 0.5 * pred_x0 + direction
    return x_next, pred_x0
def invert(
        self,
        image: torch.Tensor,
        prompt,
        num_inference_steps=50,
        num_actual_inference_steps=None,
        guidance_scale=1.,
        eta=0.0,
        **kwds):
    """
    Invert a real image into a noise map with deterministic DDIM inversion.

    Args:
        image: Image batch passed to ``self.image2latent``. A single image
            is broadcast to ``len(prompt)`` when ``prompt`` is a list.
        prompt: A string or a list of strings. A string is repeated to
            match an image batch larger than one.
        num_inference_steps: Number of scheduler timesteps to set up.
        num_actual_inference_steps: If given, only this many inversion
            steps are executed.
        guidance_scale: Values above 1 enable classifier-free guidance with
            an empty-string unconditional prompt.
        eta: Accepted but not used.
        **kwds: Accepted but not used.

    Returns:
        The latents after the last executed inversion step.
    """
    DEVICE = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")

    if isinstance(prompt, list):
        if image.shape[0] == 1:
            image = image.expand(len(prompt), -1, -1, -1)
    elif isinstance(prompt, str):
        if image.shape[0] > 1:
            prompt = [prompt] * image.shape[0]
    # Read the batch size only AFTER the broadcast above: the unconditional
    # embeddings below must have as many rows as the latents, otherwise the
    # doubled classifier-free-guidance batch and its conditioning disagree.
    batch_size = image.shape[0]

    # text embeddings
    text_input = self.tokenizer(
        prompt,
        padding="max_length",
        max_length=77,
        return_tensors="pt"
    )
    text_embeddings = self.text_encoder(text_input.input_ids.to(DEVICE))[0]
    print("input text embeddings :", text_embeddings.shape)

    # define initial latents
    latents = self.image2latent(image)

    # unconditional embedding for classifier free guidance
    if guidance_scale > 1.:
        unconditional_input = self.tokenizer(
            [""] * batch_size,
            padding="max_length",
            max_length=77,
            return_tensors="pt"
        )
        unconditional_embeddings = self.text_encoder(
            unconditional_input.input_ids.to(DEVICE))[0]
        text_embeddings = torch.cat(
            [unconditional_embeddings, text_embeddings], dim=0)
    print("latents shape: ", latents.shape)

    # iterative sampling, walking the schedule from low noise to high noise
    self.scheduler.set_timesteps(num_inference_steps)
    inversion_timesteps = reversed(self.scheduler.timesteps)
    print("Valid timesteps: ", inversion_timesteps)
    for i, t in enumerate(tqdm.tqdm(inversion_timesteps, desc="DDIM Inversion")):
        if num_actual_inference_steps is not None and i >= num_actual_inference_steps:
            # Every remaining step would be skipped as well.
            break

        if guidance_scale > 1.:
            model_inputs = torch.cat([latents] * 2)
        else:
            model_inputs = latents

        # predict the noise
        noise_pred = self.unet(
            model_inputs, t, encoder_hidden_states=text_embeddings).sample
        if guidance_scale > 1.:
            noise_pred_uncon, noise_pred_con = noise_pred.chunk(2, dim=0)
            noise_pred = noise_pred_uncon + guidance_scale * \
                (noise_pred_con - noise_pred_uncon)

        # compute the next, noisier sample x_t-1 -> x_t; the predicted x0 is
        # not needed here, and intermediates are no longer accumulated since
        # only the final latents are returned.
        latents, _ = self.inv_step(noise_pred, t, latents)
    return latents
def ddim_inversion(self, latent, cond):
    """Run DDIM inversion over the scheduler's current timesteps.

    The timesteps already configured on ``self.scheduler`` are traversed in
    reverse order. At each step the UNet predicts noise for ``latent``
    conditioned on ``cond`` (repeated along dim 0 to ``latent.shape[0]``),
    and the latent is re-noised to that step's level. The first step uses
    ``alphas_cumprod[0]`` as its starting level.

    Returns the latent after the final step.
    """
    cumprod = self.scheduler.alphas_cumprod
    timesteps = reversed(self.scheduler.timesteps)
    with torch.autocast(device_type='cuda', dtype=torch.float32):
        for idx, t in enumerate(tqdm.tqdm(timesteps, desc="DDIM inversion")):
            cond_batch = cond.repeat(latent.shape[0], 1, 1)

            alpha_now = cumprod[t]
            if idx > 0:
                alpha_before = cumprod[timesteps[idx - 1]]
            else:
                alpha_before = cumprod[0]

            signal_now = alpha_now ** 0.5
            signal_before = alpha_before ** 0.5
            noise_now = (1 - alpha_now) ** 0.5
            noise_before = (1 - alpha_before) ** 0.5

            eps = self.unet(
                latent, t, encoder_hidden_states=cond_batch).sample

            pred_x0 = (latent - noise_before * eps) / signal_before
            latent = signal_now * pred_x0 + noise_now * eps
    return latent
def step(
    self,
    model_output: torch.FloatTensor,
    timestep: int,
    x: torch.FloatTensor,
):
    """
    One deterministic DDIM denoising step from ``timestep`` to the previous one.

    The previous timestep lies one scheduler stride
    (num_train_timesteps // num_inference_steps) below ``timestep``; when it
    is not positive, ``alphas_cumprod[0]`` is used.

    Returns ``(x_prev, pred_x0)``.
    """
    scheduler = self.scheduler
    cumprod = scheduler.alphas_cumprod
    stride = scheduler.config.num_train_timesteps // scheduler.num_inference_steps
    prev_timestep = timestep - stride

    alpha_now = cumprod[timestep]
    if prev_timestep > 0:
        alpha_prev = cumprod[prev_timestep]
    else:
        alpha_prev = cumprod[0]

    noise_scale_now = (1 - alpha_now) ** 0.5
    pred_x0 = (x - noise_scale_now * model_output) / alpha_now ** 0.5
    direction = (1 - alpha_prev) ** 0.5 * model_output
    x_prev = alpha_prev ** 0.5 * pred_x0 + direction
    return x_prev, pred_x0
def image2latent(self, image):
    """Encode an image into scaled VAE latents.

    Args:
        image: Either a ``torch.Tensor`` that is passed to the VAE as is,
            or any other object accepted by ``np.array`` (e.g. a PIL image
            or an array). Non-tensor input is assumed to be HWC with
            0-255 values - TODO confirm against callers - and is rescaled
            to [-1, 1] and reshaped to ``(1, C, H, W)``.

    Returns:
        The mean of the VAE posterior multiplied by 0.18215.
    """
    DEVICE = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")
    # The previous check, ``type(image) is Image``, compared against the PIL
    # *module* and therefore never matched, so PIL images reached the VAE
    # unconverted. Converting every non-tensor input covers PIL images
    # without depending on the PIL type.
    if not isinstance(image, torch.Tensor):
        image = np.array(image)
        image = torch.from_numpy(image).float() / 127.5 - 1
        image = image.permute(2, 0, 1).unsqueeze(0)
    # input image density range [-1, 1]
    latents = self.vae.encode(image.to(DEVICE))['latent_dist'].mean
    latents = latents * 0.18215
    return latents
def latent2image(self, latents, return_type='np'):
    """Decode latents with the VAE and convert the result.

    The latents are detached and divided by 0.18215 before decoding.

    * ``return_type='np'``: the first item of the batch as an HWC ``uint8``
      array, after mapping the decoded sample from [-1, 1] to [0, 255].
    * ``return_type='pt'``: the whole batch as a tensor clamped to [0, 1].
    * anything else: the raw decoded sample.
    """
    scaled = 1 / 0.18215 * latents.detach()
    decoded = self.vae.decode(scaled)['sample']

    if return_type == 'np':
        unit_range = (decoded / 2 + 0.5).clamp(0, 1)
        first_hwc = unit_range.cpu().permute(0, 2, 3, 1).numpy()[0]
        return (first_hwc * 255).astype(np.uint8)
    if return_type == "pt":
        return (decoded / 2 + 0.5).clamp(0, 1)
    return decoded
def latent2image_grad(self, latents):
    """Decode latents with the VAE without detaching them.

    The latents are divided by 0.18215 and decoded; the decoded sample is
    returned as is (range [-1, 1]).
    """
    rescaled = 1 / 0.18215 * latents
    decoded = self.vae.decode(rescaled)
    return decoded['sample']  # range [-1, 1]
def cal_latent(self, num_inference_steps, guidance_scale, unconditioning, img_noise_0, img_noise_1, text_embeddings_0, text_embeddings_1, lora_0, lora_1, alpha, use_lora, use_lcm, fix_lora=None):
    """Denoise the latent obtained by interpolating the two endpoint noises.

    The start latent is ``slerp(img_noise_0, img_noise_1, alpha,
    self.use_adain)`` and the conditioning is the linear blend of the two
    text embeddings with weight ``alpha``. When ``use_lora`` is set, the
    UNet is reloaded through ``load_lora`` with weight ``fix_lora`` if that
    is given, else ``alpha``. ``use_lcm`` only changes the progress-bar
    label; sampling always goes through ``self.scheduler.step``.

    If ``unconditioning`` is a list, entry ``i`` replaces the unconditional
    half of the text embeddings at step ``i``.

    Returns the latents after the last scheduler step.
    """
    latents = slerp(img_noise_0, img_noise_1, alpha, self.use_adain)
    text_embeddings = (1 - alpha) * text_embeddings_0 + \
        alpha * text_embeddings_1

    self.scheduler.set_timesteps(num_inference_steps)

    if use_lora:
        lora_weight = alpha if fix_lora is None else fix_lora
        self.unet = load_lora(self.unet, lora_0, lora_1, lora_weight)

    # currently always the scheduler's own step; the label is informational
    sampler_desc = "LCM multi-step sampler" if use_lcm else "DDIM Sampler"
    per_step_uncond = isinstance(unconditioning, list)

    for step_idx, t in enumerate(tqdm.tqdm(self.scheduler.timesteps, desc=f"{sampler_desc}, alpha={alpha}")):
        model_inputs = torch.cat([latents] * 2) if guidance_scale > 1. else latents

        if per_step_uncond:
            _, cond_half = text_embeddings.chunk(2)
            uncond_half = unconditioning[step_idx].expand(*cond_half.shape)
            text_embeddings = torch.cat([uncond_half, cond_half])

        # predict the noise
        noise_pred = self.unet(
            model_inputs, t, encoder_hidden_states=text_embeddings).sample

        if guidance_scale > 1.0:
            noise_uncond, noise_cond = noise_pred.chunk(2, dim=0)
            noise_pred = noise_uncond + guidance_scale * \
                (noise_cond - noise_uncond)

        # compute the previous noise sample x_t -> x_t-1
        step_output = self.scheduler.step(
            noise_pred, t, latents, return_dict=False)
        latents = step_output[0]
    return latents
def get_text_embeddings(self, prompt, guidance_scale, neg_prompt, batch_size):
    """Encode a prompt, optionally prefixed with unconditional embeddings.

    Args:
        prompt: Text passed to the tokenizer (padded to 77 tokens).
        guidance_scale: Values above 1 enable classifier-free guidance, in
            which case unconditional embeddings are prepended along dim 0.
        neg_prompt: Text used for the unconditional branch; an empty string
            is used when this is falsy.
        batch_size: Number of copies of the unconditional text to encode.

    Returns:
        The text-encoder output for ``prompt``, preceded by the
        unconditional embeddings when guidance is enabled.
    """
    DEVICE = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")

    # text embeddings
    text_input = self.tokenizer(
        prompt,
        padding="max_length",
        max_length=77,
        return_tensors="pt"
    )
    # Use the detected device rather than an unconditional ``.cuda()`` so the
    # call also works on CPU-only hosts and matches the unconditional branch.
    text_embeddings = self.text_encoder(text_input.input_ids.to(DEVICE))[0]

    if guidance_scale > 1.:
        uc_text = neg_prompt if neg_prompt else ""
        unconditional_input = self.tokenizer(
            [uc_text] * batch_size,
            padding="max_length",
            max_length=77,
            return_tensors="pt"
        )
        unconditional_embeddings = self.text_encoder(
            unconditional_input.input_ids.to(DEVICE))[0]
        text_embeddings = torch.cat(
            [unconditional_embeddings, text_embeddings], dim=0)

    return text_embeddings
def __call__(
        self,
        img_0=None,
        img_1=None,
        img_path_0=None,
        img_path_1=None,
        prompt_0="",
        prompt_1="",
        save_lora_dir="./lora",
        load_lora_path_0=None,
        load_lora_path_1=None,
        lora_steps=200,
        lora_lr=2e-4,
        lora_rank=16,
        batch_size=1,
        height=512,
        width=512,
        num_inference_steps=50,
        num_actual_inference_steps=None,
        guidance_scale=1,
        attn_beta=0,
        lamd=0.6,
        use_lora=True,
        use_lcm=False,
        use_adain=True,
        use_reschedule=True,
        output_path="./results",
        num_frames=50,
        fix_lora=None,
        progress=tqdm,
        unconditioning=None,
        neg_prompt=None,
        save_intermediates=False,
        **kwds):
    """Generate a morphing sequence between two images.

    Args:
        img_0, img_1: Endpoint images. When one is ``None`` it is opened
            from the matching ``img_path_*`` and converted to RGB.
        img_path_0, img_path_1: Paths used when the image is not given.
        prompt_0, prompt_1: Text prompts for the two endpoints.
        save_lora_dir: Directory in which trained LoRA weights are stored.
        load_lora_path_0, load_lora_path_1: Existing LoRA weight files. When
            falsy, ``<save_lora_dir>/<last part of output_path>_lora_<i>.ckpt``
            is used and trained first if it does not exist.
        lora_steps, lora_lr, lora_rank: Forwarded to ``train_lora``.
        batch_size: Forwarded to ``get_text_embeddings``.
        height, width, num_actual_inference_steps, **kwds: Accepted but not
            used by this method.
        num_inference_steps: Number of scheduler steps.
        guidance_scale: Classifier-free guidance scale.
        attn_beta: Weight of the live hidden states in the injected
            self-attention context; ``None`` disables attention injection.
        lamd: Forwarded to ``LoadProcessor``.
        use_lora, use_lcm, use_adain, use_reschedule, output_path: Stored on
            the instance and used as feature switches / output location.
        num_frames: Number of frames in the sequence.
        fix_lora: If not ``None``, used as the LoRA blend weight for every
            frame instead of the frame's alpha.
        progress: Object exposing ``tqdm(iterable, desc=...)``.
        unconditioning: Forwarded to ``cal_latent``.
        neg_prompt: Negative prompt for classifier-free guidance.
        save_intermediates: Save each frame as ``<output_path>/<idx>.png``.

    Returns:
        The list of generated PIL images.

    Raises:
        ValueError: If an endpoint has neither an image nor a path.
    """
    self.scheduler.set_timesteps(num_inference_steps)
    self.use_lora = use_lora
    self.use_adain = use_adain
    self.use_reschedule = use_reschedule
    self.output_path = output_path
    self.use_lcm = use_lcm

    if img_0 is None:
        if img_path_0 is None:
            raise ValueError("Either img_0 or img_path_0 must be provided.")
        img_0 = Image.open(img_path_0).convert("RGB")
    if img_1 is None:
        if img_path_1 is None:
            raise ValueError("Either img_1 or img_path_1 must be provided.")
        img_1 = Image.open(img_path_1).convert("RGB")

    if save_intermediates:
        # Frames are written straight into this directory further down.
        os.makedirs(self.output_path, exist_ok=True)

    if self.use_lora:
        print("Loading lora...")
        lora_0 = self._prepare_lora(
            img_0, prompt_0, 0, load_lora_path_0, save_lora_dir,
            output_path, lora_steps, lora_lr, lora_rank)
        lora_1 = self._prepare_lora(
            img_1, prompt_1, 1, load_lora_path_1, save_lora_dir,
            output_path, lora_steps, lora_lr, lora_rank)
    else:
        lora_0 = lora_1 = None

    # Encoding and inversion are inference only: run them without autograd
    # so no graph is kept across the whole chain of UNet calls.
    with torch.no_grad():
        text_embeddings_0 = self.get_text_embeddings(
            prompt_0, guidance_scale, neg_prompt, batch_size)
        text_embeddings_1 = self.get_text_embeddings(
            prompt_1, guidance_scale, neg_prompt, batch_size)
        img_0 = get_img(img_0)
        img_1 = get_img(img_1)
        if self.use_lora:
            self.unet = load_lora(self.unet, lora_0, lora_1, 0)
        img_noise_0 = self.ddim_inversion(
            self.image2latent(img_0), text_embeddings_0)
        if self.use_lora:
            self.unet = load_lora(self.unet, lora_0, lora_1, 1)
        img_noise_1 = self.ddim_inversion(
            self.image2latent(img_1), text_embeddings_1)
    print("latents shape: ", img_noise_0.shape)

    # Captured before any wrapper is installed, so the non-LoRA path never
    # wraps an already wrapped processor.
    original_processor = list(self.unet.attn_processors.values())[0]

    def install_processors(lora_weight, make_processor):
        """Reload LoRA (if enabled) and wrap the replaceable attention layers."""
        if self.use_lora:
            self.unet = load_lora(
                self.unet, lora_0, lora_1,
                lora_weight if fix_lora is None else fix_lora)
        current = self.unet.attn_processors
        attn_processor_dict = {}
        for k, processor in current.items():
            if do_replace_attn(k):
                inner = processor if self.use_lora else original_processor
                attn_processor_dict[k] = make_processor(inner, k)
            else:
                attn_processor_dict[k] = processor
        self.unet.set_attn_processor(attn_processor_dict)

    def render(alpha, apply_lora):
        """Sample one frame at blend weight ``alpha`` as a PIL image."""
        latents = self.cal_latent(
            num_inference_steps,
            guidance_scale,
            unconditioning,
            img_noise_0,
            img_noise_1,
            text_embeddings_0,
            text_embeddings_1,
            lora_0,
            lora_1,
            alpha,
            apply_lora,
            self.use_lcm,
            fix_lora
        )
        return Image.fromarray(self.latent2image(latents))

    def save_frame(image, index):
        """Write a frame to the output directory when requested."""
        if save_intermediates:
            image.save(f"{self.output_path}/{index:02d}.png")

    def morph(alpha_list, progress, desc):
        images = []
        if attn_beta is not None:
            # Endpoints first: their self-attention inputs are recorded and
            # later blended into the in-between frames.
            install_processors(
                0, lambda inner, k: StoreProcessor(inner, self.img0_dict, k))
            first_image = render(alpha_list[0], False)
            save_frame(first_image, 0)

            install_processors(
                1, lambda inner, k: StoreProcessor(inner, self.img1_dict, k))
            last_image = render(alpha_list[-1], False)
            save_frame(last_image, num_frames - 1)

            for i in progress.tqdm(range(1, num_frames - 1), desc=desc):
                alpha = alpha_list[i]
                install_processors(
                    alpha,
                    lambda inner, k, alpha=alpha: LoadProcessor(
                        inner, k, self.img0_dict, self.img1_dict, alpha, attn_beta, lamd))
                # LoRA was already applied by install_processors.
                image = render(alpha, False)
                save_frame(image, i)
                images.append(image)
            images = [first_image] + images + [last_image]
        else:
            for k, alpha in enumerate(alpha_list):
                image = render(alpha, self.use_lora)
                save_frame(image, k)
                images.append(image)
        return images

    with torch.no_grad():
        if self.use_reschedule:
            # First pass with uniform alphas, then resample the alphas from
            # the rendered frames and render again.
            alpha_scheduler = AlphaScheduler()
            alpha_list = list(torch.linspace(0, 1, num_frames))
            images_pt = morph(alpha_list, progress, "Sampling...")
            images_pt = [transforms.ToTensor()(img).unsqueeze(0)
                         for img in images_pt]
            alpha_scheduler.from_imgs(images_pt)
            alpha_list = alpha_scheduler.get_list()
            print(alpha_list)
            images = morph(alpha_list, progress, "Reschedule...")
        else:
            alpha_list = list(torch.linspace(0, 1, num_frames))
            print(alpha_list)
            images = morph(alpha_list, progress, "Sampling...")
    return images

def _prepare_lora(self, image, prompt, index, load_path, save_lora_dir,
                  output_path, lora_steps, lora_lr, lora_rank):
    """Resolve the LoRA file for endpoint ``index``, training it if needed, and load it."""
    if not load_path:
        weight_name = f"{output_path.split('/')[-1]}_lora_{index}.ckpt"
        load_path = save_lora_dir + "/" + weight_name
        if not os.path.exists(load_path):
            train_lora(image, prompt, save_lora_dir, None, self.tokenizer, self.text_encoder,
                       self.vae, self.unet, self.scheduler, lora_steps, lora_lr, lora_rank,
                       weight_name=weight_name)
    print(f"Load from {load_path}.")
    return self._read_lora_state(load_path)

def _read_lora_state(self, lora_path):
    """Load a LoRA state dict from ``lora_path`` onto the CPU."""
    if lora_path.endswith(".safetensors"):
        # Import the submodule explicitly: ``import safetensors`` alone does
        # not guarantee that ``safetensors.torch`` is bound.
        from safetensors.torch import load_file
        return load_file(lora_path, device="cpu")
    # NOTE(review): torch.load unpickles the file - only load checkpoints
    # from trusted sources.
    return torch.load(lora_path, map_location="cpu")