Spaces:

adaface-neurips
/

adaface-animate

Running on Zero

App Files Files Community

adaface-animate / adaface /adaface_wrapper.py

adaface-neurips

fix bug that prevents clip extension; extend clip from 97 to 147 tokens

7f97ed5 29 days ago

raw

history blame contribute delete

44.9 kB

	import torch
	import torch.nn as nn
	from transformers import CLIPTextModel
	from diffusers import (
	StableDiffusionPipeline,
	StableDiffusionImg2ImgPipeline,
	StableDiffusionXLPipeline,
	StableDiffusion3Pipeline,
	#FluxPipeline,
	DDIMScheduler,
	PNDMScheduler,
	DPMSolverSinglestepScheduler,
	AutoencoderKL,
	LCMScheduler,
	)
	from diffusers.loaders.single_file_utils import convert_ldm_unet_checkpoint
	from adaface.util import UNetEnsemble, extend_nn_embedding
	from adaface.face_id_to_ada_prompt import create_id2ada_prompt_encoder
	from adaface.diffusers_attn_lora_capture import set_up_attn_processors, set_up_ffn_loras, set_lora_and_capture_flags
	from safetensors.torch import load_file as safetensors_load_file
	import re, os
	import numpy as np
	from peft.utils.constants import DUMMY_TARGET_MODULES

	class AdaFaceWrapper(nn.Module):
	def __init__(self, pipeline_name, base_model_path, adaface_encoder_types,
	adaface_ckpt_paths, adaface_encoder_cfg_scales=None,
	enabled_encoders=None, use_lcm=False, default_scheduler_name='ddim',
	num_inference_steps=50, subject_string='z', negative_prompt=None,
	max_prompt_length=77, use_840k_vae=False, use_ds_text_encoder=False,
	main_unet_filepath=None, unet_types=None, extra_unet_dirpaths=None, unet_weights_in_ensemble=None,
	enable_static_img_suffix_embs=None, unet_uses_attn_lora=False,
	attn_lora_layer_names=['q', 'k', 'v', 'out'], normalize_cross_attn=False, q_lora_updates_query=False,
	device='cuda', is_training=False, is_on_hf_space=False):
	'''
	pipeline_name: "text2img", "text2imgxl", "img2img", "text2img3", "flux", or None.
	If None, it's used only as a face encoder, and the unet and vae are
	removed from the pipeline to release RAM.
	'''
	super().__init__()
	self.pipeline_name = pipeline_name
	self.base_model_path = base_model_path
	self.adaface_encoder_types = adaface_encoder_types

	self.adaface_ckpt_paths = adaface_ckpt_paths
	self.adaface_encoder_cfg_scales = adaface_encoder_cfg_scales
	self.enabled_encoders = enabled_encoders
	# None, or a list of two bools for two encoders. If None, both are disabled.
	self.enable_static_img_suffix_embs = enable_static_img_suffix_embs
	self.unet_uses_attn_lora = unet_uses_attn_lora
	self.attn_lora_layer_names = attn_lora_layer_names
	self.q_lora_updates_query = q_lora_updates_query
	self.use_lcm = use_lcm
	self.subject_string = subject_string
	self.normalize_cross_attn = normalize_cross_attn

	self.default_scheduler_name = default_scheduler_name
	self.num_inference_steps = num_inference_steps if not use_lcm else 4

	self.max_prompt_length = max_prompt_length

	self.use_840k_vae = use_840k_vae
	self.use_ds_text_encoder = use_ds_text_encoder
	self.main_unet_filepath = main_unet_filepath
	self.unet_types = unet_types
	self.extra_unet_dirpaths = extra_unet_dirpaths
	self.unet_weights_in_ensemble = unet_weights_in_ensemble
	self.device = device
	self.is_training = is_training
	self.is_on_hf_space = is_on_hf_space

	if negative_prompt is None:
	self.negative_prompt = \
	"flaws in the eyes, flaws in the face, lowres, non-HDRi, low quality, worst quality, artifacts, noise, text, watermark, glitch, " \
	"mutated, ugly, disfigured, hands, partially rendered objects, partially rendered eyes, deformed eyeballs, cross-eyed, blurry, " \
	"mutation, duplicate, out of frame, cropped, mutilated, bad anatomy, deformed, bad proportions, " \
	"nude, naked, nsfw, topless, bare breasts"
	else:
	self.negative_prompt = negative_prompt

	self.initialize_pipeline()
	# During inference, we never use static image suffix embeddings.
	# So num_id_vecs is the length of the returned adaface embeddings for each encoder.
	self.encoders_num_id_vecs = np.array(self.id2ada_prompt_encoder.encoders_num_id_vecs)
	self.encoders_num_static_img_suffix_embs = np.array(self.id2ada_prompt_encoder.encoders_num_static_img_suffix_embs)
	if self.enable_static_img_suffix_embs is not None:
	assert len(self.enable_static_img_suffix_embs) == len(self.encoders_num_id_vecs)
	self.encoders_num_static_img_suffix_embs *= np.array(self.enable_static_img_suffix_embs)
	self.encoders_num_id_vecs += self.encoders_num_static_img_suffix_embs

	self.img_prompt_embs = None
	self.extend_tokenizer_and_text_encoder()

	def to(self, device):
	self.device = device
	self.id2ada_prompt_encoder.to(device)
	self.pipeline.to(device)
	print(f"Moved AdaFaceWrapper to {device}.")
	return self

	def initialize_pipeline(self):
	self.id2ada_prompt_encoder = create_id2ada_prompt_encoder(self.adaface_encoder_types,
	self.adaface_ckpt_paths,
	self.adaface_encoder_cfg_scales,
	self.enabled_encoders,
	is_on_hf_space=self.is_on_hf_space,
	num_static_img_suffix_embs=4)

	self.id2ada_prompt_encoder.to(self.device)
	print(f"adaface_encoder_cfg_scales: {self.adaface_encoder_cfg_scales}")

	if self.use_840k_vae:
	# The 840000-step vae model is slightly better in face details than the original vae model.
	# https://huggingface.co/stabilityai/sd-vae-ft-mse-original
	vae = AutoencoderKL.from_single_file("models/diffusers/sd-vae-ft-mse-original/vae-ft-mse-840000-ema-pruned.ckpt",
	torch_dtype=torch.float16)
	else:
	vae = None

	remove_unet = False

	if self.pipeline_name == "img2img":
	PipelineClass = StableDiffusionImg2ImgPipeline
	elif self.pipeline_name == "text2img":
	PipelineClass = StableDiffusionPipeline
	elif self.pipeline_name == "text2imgxl":
	PipelineClass = StableDiffusionXLPipeline
	elif self.pipeline_name == "text2img3":
	PipelineClass = StableDiffusion3Pipeline
	#elif self.pipeline_name == "flux":
	# PipelineClass = FluxPipeline
	# pipeline_name is None means only use this instance to generate adaface embeddings, not to generate images.
	elif self.pipeline_name is None:
	PipelineClass = StableDiffusionPipeline
	remove_unet = True
	else:
	raise ValueError(f"Unknown pipeline name: {self.pipeline_name}")

	if self.base_model_path is None:
	base_model_path_dict = {
	'text2img': 'models/sd15-dste8-vae.safetensors',
	'text2imgxl': 'stabilityai/stable-diffusion-xl-base-1.0',
	'text2img3': 'stabilityai/stable-diffusion-3-medium-diffusers',
	'flux': 'black-forest-labs/FLUX.1-schnell',
	}
	self.base_model_path = base_model_path_dict[self.pipeline_name]

	if os.path.isfile(self.base_model_path):
	pipeline = PipelineClass.from_single_file(
	self.base_model_path,
	torch_dtype=torch.float16
	)
	else:
	pipeline = PipelineClass.from_pretrained(
	self.base_model_path,
	torch_dtype=torch.float16,
	safety_checker=None
	)

	if self.use_lcm:
	lcm_path_dict = {
	'text2img': 'latent-consistency/lcm-lora-sdv1-5',
	'text2imgxl': 'latent-consistency/lcm-lora-sdxl',
	}
	if self.pipeline_name not in lcm_path_dict:
	raise ValueError(f"Pipeline {self.pipeline_name} does not support LCM.")

	lcm_path = lcm_path_dict[self.pipeline_name]
	pipeline.load_lora_weights(lcm_path)
	pipeline.fuse_lora()
	print(f"Loaded LCM weights from {lcm_path}.")
	pipeline.scheduler = LCMScheduler.from_config(pipeline.scheduler.config)

	if self.main_unet_filepath is not None:
	print(f"Replacing the UNet with the UNet from {self.main_unet_filepath}.")
	ret = pipeline.unet.load_state_dict(self.load_unet_from_file(self.main_unet_filepath, device='cpu'))
	if len(ret.missing_keys) > 0:
	print(f"Missing keys: {ret.missing_keys}")
	if len(ret.unexpected_keys) > 0:
	print(f"Unexpected keys: {ret.unexpected_keys}")

	if (self.unet_types is not None and len(self.unet_types) > 0) \
	or (self.extra_unet_dirpaths is not None and len(self.extra_unet_dirpaths) > 0):
	unet_ensemble = UNetEnsemble([pipeline.unet], self.unet_types, self.extra_unet_dirpaths, self.unet_weights_in_ensemble,
	device=self.device, torch_dtype=torch.float16)
	pipeline.unet = unet_ensemble

	print(f"Loaded pipeline from {self.base_model_path}.")
	if not remove_unet and (self.unet_uses_attn_lora or self.normalize_cross_attn):
	unet2 = self.load_unet_lora_weights(pipeline.unet, use_attn_lora=self.unet_uses_attn_lora,
	attn_lora_layer_names=self.attn_lora_layer_names,
	normalize_cross_attn=self.normalize_cross_attn,
	q_lora_updates_query=self.q_lora_updates_query)

	pipeline.unet = unet2

	if self.use_ds_text_encoder:
	# The dreamshaper v7 finetuned text encoder follows the prompt slightly better than the original text encoder.
	# https://huggingface.co/Lykon/DreamShaper/tree/main/text_encoder
	pipeline.text_encoder = CLIPTextModel.from_pretrained("models/diffusers/ds_text_encoder",
	torch_dtype=torch.float16)
	print("Replaced the text encoder with the DreamShaper text encoder.")

	# Extending prompt length is for SD 1.5 only.
	if (self.pipeline_name == "text2img") and (self.max_prompt_length > 77):
	# pipeline.text_encoder.text_model.embeddings.position_embedding.weight: [77, 768] -> [max_length, 768]
	# We reuse the last EL position embeddings for the new position embeddings.
	# If we use the "neat" way, i.e., initialize CLIPTextModel with a CLIPTextConfig with
	# a larger max_position_embeddings, and set ignore_mismatched_sizes=True,
	# then the old position embeddings won't be loaded from the pretrained ckpt,
	# leading to degenerated performance.
	# max_prompt_length <= 77 + 70 = 147.
	self.max_prompt_length = min(self.max_prompt_length, 147)
	# Number of extra tokens is at most 70.
	EL = self.max_prompt_length - 77
	# position_embedding.weight: [77, 768] -> [max_length, 768]
	new_position_embedding = extend_nn_embedding(pipeline.text_encoder.text_model.embeddings.position_embedding,
	pipeline.text_encoder.text_model.embeddings.position_embedding.weight[-EL:])
	pipeline.text_encoder.text_model.embeddings.position_embedding = new_position_embedding
	pipeline.text_encoder.text_model.embeddings.position_ids = torch.arange(self.max_prompt_length).unsqueeze(0)
	pipeline.text_encoder.text_model.config.max_position_embeddings = self.max_prompt_length
	pipeline.tokenizer.model_max_length = self.max_prompt_length

	if self.use_840k_vae:
	pipeline.vae = vae
	print("Replaced the VAE with the 840k-step VAE.")

	if remove_unet:
	# Remove unet and vae to release RAM. Only keep tokenizer and text_encoder.
	pipeline.unet = None
	pipeline.vae = None
	print("Removed UNet and VAE from the pipeline.")

	if self.pipeline_name not in ["text2imgxl", "text2img3", "flux"] and not self.use_lcm:
	if self.default_scheduler_name == 'ddim':
	noise_scheduler = DDIMScheduler(
	num_train_timesteps=1000,
	beta_start=0.00085,
	beta_end=0.012,
	beta_schedule="scaled_linear",
	clip_sample=False,
	set_alpha_to_one=False,
	steps_offset=1,
	timestep_spacing="leading",
	rescale_betas_zero_snr=False,
	)
	elif self.default_scheduler_name == 'pndm':
	noise_scheduler = PNDMScheduler(
	num_train_timesteps=1000,
	beta_start=0.00085,
	beta_end=0.012,
	beta_schedule="scaled_linear",
	set_alpha_to_one=False,
	steps_offset=1,
	timestep_spacing="leading",
	skip_prk_steps=True,
	)
	elif self.default_scheduler_name == 'dpm++':
	noise_scheduler = DPMSolverSinglestepScheduler(
	beta_start=0.00085,
	beta_end=0.012,
	beta_schedule="scaled_linear",
	prediction_type="epsilon",
	num_train_timesteps=1000,
	trained_betas=None,
	thresholding=False,
	algorithm_type="dpmsolver++",
	solver_type="midpoint",
	lower_order_final=True,
	use_karras_sigmas=True,
	)
	else:
	breakpoint()

	pipeline.scheduler = noise_scheduler
	# Otherwise, if not use_lcm, pipeline.scheduler == FlowMatchEulerDiscreteScheduler
	# if use_lcm, pipeline.scheduler == LCMScheduler
	self.pipeline = pipeline.to(self.device)

	def set_adaface_encoder_cfg_scales(self, adaface_encoder_cfg_scales):
	self.adaface_encoder_cfg_scales = adaface_encoder_cfg_scales
	self.id2ada_prompt_encoder.set_out_id_embs_cfg_scale(adaface_encoder_cfg_scales)

	def load_unet_from_file(self, unet_path, device=None):
	if os.path.isfile(unet_path):
	if unet_path.endswith(".safetensors"):
	unet_state_dict = safetensors_load_file(unet_path, device=device)
	else:
	unet_state_dict = torch.load(unet_path, map_location=device)

	key0 = list(unet_state_dict.keys())[0]
	if key0.startswith("model.diffusion_model"):
	key_prefix = ""
	is_ldm_unet = True
	elif key0.startswith("diffusion_model"):
	key_prefix = "model."
	is_ldm_unet = True
	else:
	is_ldm_unet = False

	if is_ldm_unet:
	unet_state_dict2 = {}
	for key, value in unet_state_dict.items():
	key2 = key_prefix + key
	unet_state_dict2[key2] = value
	print(f"LDM UNet detected. Convert to diffusers")
	ldm_unet_config = { 'layers_per_block': 2 }
	unet_state_dict = convert_ldm_unet_checkpoint(unet_state_dict2, ldm_unet_config)
	else:
	raise ValueError(f"UNet path {unet_path} is not a file.")
	return unet_state_dict

	# Adapted from ConsistentIDPipeline:set_ip_adapter().
	def load_unet_loras(self, unet, unet_lora_modules_state_dict,
	use_attn_lora=True, use_ffn_lora=False,
	attn_lora_layer_names=['q', 'k', 'v', 'out'],
	normalize_cross_attn=False,
	q_lora_updates_query=False):
	attn_capture_procs, attn_opt_modules = \
	set_up_attn_processors(unet, use_attn_lora=True, attn_lora_layer_names=attn_lora_layer_names,
	lora_rank=192, lora_scale_down=8,
	q_lora_updates_query=q_lora_updates_query)
	# up_blocks.3.resnets.[1~2].conv1, conv2, conv_shortcut. [12] matches 1 or 2.
	if use_ffn_lora:
	target_modules_pat = 'up_blocks.3.resnets.[12].conv[a-z0-9_]+'
	else:
	# A special pattern, "dummy-target-modules" tells PEFT to add loras on NONE of the layers.
	# We couldn't simply skip PEFT initialization (converting unet to a PEFT model),
	# otherwise the attn lora layers will cause nan quickly during a fp16 training.
	target_modules_pat = DUMMY_TARGET_MODULES

	unet, ffn_lora_layers, ffn_opt_modules = \
	set_up_ffn_loras(unet, target_modules_pat=target_modules_pat, lora_uses_dora=True)

	# self.attn_capture_procs and ffn_lora_layers will be used in set_lora_and_capture_flags().
	self.attn_capture_procs = list(attn_capture_procs.values())
	self.ffn_lora_layers = list(ffn_lora_layers.values())
	# Combine attn_opt_modules and ffn_opt_modules into unet_lora_modules.
	# unet_lora_modules is for optimization and loading/saving.
	unet_lora_modules = {}
	# attn_opt_modules and ffn_opt_modules have different depths of keys.
	# attn_opt_modules:
	# up_blocks_3_attentions_1_transformer_blocks_0_attn2_processor_std_shrink_factor,
	# up_blocks_3_attentions_1_transformer_blocks_0_attn2_processor_to_q_lora_lora_A, ...
	# ffn_opt_modules:
	# base_model_model_up_blocks_3_resnets_1_conv1_lora_A, ...
	# with the prefix 'base_model_model_'. Because ffn_opt_modules are extracted from the peft-wrapped model,
	# and attn_opt_modules are extracted from the original unet model.
	# To be compatible with old param keys, we append 'base_model_model_' to the keys of attn_opt_modules.
	unet_lora_modules.update({ f'base_model_model_{k}': v for k, v in attn_opt_modules.items() })
	unet_lora_modules.update(ffn_opt_modules)
	# ParameterDict can contain both Parameter and nn.Module.
	# TODO: maybe in the future, we couldn't put nn.Module in nn.ParameterDict.
	self.unet_lora_modules = torch.nn.ParameterDict(unet_lora_modules)

	missing, unexpected = self.unet_lora_modules.load_state_dict(unet_lora_modules_state_dict, strict=False)
	if len(missing) > 0:
	print(f"Missing Keys: {missing}")
	if len(unexpected) > 0:
	print(f"Unexpected Keys: {unexpected}")

	print(f"Loaded {len(unet_lora_modules_state_dict)} LoRA weights on the UNet:\n{unet_lora_modules.keys()}")
	self.outfeat_capture_blocks.append(unet.up_blocks[3])

	# If normalize_cross_attn is True and use_attn_lora is False, we load all these params from ckpt,
	# but since we set use_attn_lora to False, attn loras won't be used during inference nonetheless.
	set_lora_and_capture_flags(unet, None, self.attn_capture_procs, self.outfeat_capture_blocks,
	use_attn_lora, use_ffn_lora, 'recon_loss', capture_ca_activations=False,
	normalize_cross_attn=normalize_cross_attn, mix_attn_mats_in_batch=False,
	res_hidden_states_gradscale=0)

	return unet

	def load_unet_lora_weights(self, unet, use_attn_lora=True, attn_lora_layer_names=['q', 'k', 'v', 'out'],
	normalize_cross_attn=False, q_lora_updates_query=False):
	unet_lora_weight_found = False
	if isinstance(self.adaface_ckpt_paths, str):
	adaface_ckpt_paths = [self.adaface_ckpt_paths]
	else:
	adaface_ckpt_paths = self.adaface_ckpt_paths

	for adaface_ckpt_path in adaface_ckpt_paths:
	ckpt_dict = torch.load(adaface_ckpt_path, map_location='cpu', weights_only=False)
	if 'unet_lora_modules' in ckpt_dict:
	unet_lora_modules_state_dict = ckpt_dict['unet_lora_modules']
	print(f"{len(unet_lora_modules_state_dict)} LoRA weights found in {adaface_ckpt_path}.")
	unet_lora_weight_found = True
	break

	# Since unet lora weights are not found in the adaface ckpt, we give up on loading unet attn processors.
	if not unet_lora_weight_found:
	print(f"LoRA weights not found in {self.adaface_ckpt_paths}.")
	return unet

	self.outfeat_capture_blocks = []

	if isinstance(unet, UNetEnsemble):
	for i, unet_ in enumerate(unet.unets):
	unet_ = self.load_unet_loras(unet_, unet_lora_modules_state_dict,
	use_attn_lora=use_attn_lora,
	attn_lora_layer_names=attn_lora_layer_names,
	normalize_cross_attn=normalize_cross_attn,
	q_lora_updates_query=q_lora_updates_query)
	unet.unets[i] = unet_
	print(f"Loaded LoRA processors on UNetEnsemble of {len(unet.unets)} UNets.")
	else:
	unet = self.load_unet_loras(unet, unet_lora_modules_state_dict,
	use_attn_lora=use_attn_lora,
	attn_lora_layer_names=attn_lora_layer_names,
	normalize_cross_attn=normalize_cross_attn,
	q_lora_updates_query=q_lora_updates_query)

	return unet

	def extend_tokenizer_and_text_encoder(self):
	if np.sum(self.encoders_num_id_vecs) < 1:
	raise ValueError(f"encoders_num_id_vecs has to be larger or equal to 1, but is {self.encoders_num_id_vecs}")

	tokenizer = self.pipeline.tokenizer
	# If adaface_encoder_types is ["arc2face", "consistentID"], then total_num_id_vecs = 20.
	# We add z_0_0, z_0_1, z_0_2, ..., z_0_15, z_1_0, z_1_1, z_1_2, z_1_3 to the tokenizer.
	self.all_placeholder_tokens = []
	self.placeholder_tokens_strs = []
	self.encoder_placeholder_tokens = []
	for i in range(len(self.adaface_encoder_types)):
	placeholder_tokens = []
	for j in range(self.encoders_num_id_vecs[i]):
	placeholder_tokens.append(f"{self.subject_string}_{i}_{j}")
	placeholder_tokens_str = " ".join(placeholder_tokens)

	self.all_placeholder_tokens.extend(placeholder_tokens)
	self.encoder_placeholder_tokens.append(placeholder_tokens)
	self.placeholder_tokens_strs.append(placeholder_tokens_str)

	self.all_placeholder_tokens_str = " ".join(self.placeholder_tokens_strs)
	self.updated_tokens_str = self.all_placeholder_tokens_str
	# all_null_placeholder_tokens_str: ", , , , ..." (20 times).
	# It just contains the commas and spaces with the same length, but no actual tokens.
	self.all_null_placeholder_tokens_str = " ".join([", "] * len(self.all_placeholder_tokens))

	# Add the new tokens to the tokenizer.
	num_added_tokens = tokenizer.add_tokens(self.all_placeholder_tokens)
	if num_added_tokens != np.sum(self.encoders_num_id_vecs):
	raise ValueError(
	f"The tokenizer already contains some of the tokens {self.all_placeholder_tokens_str}. Please pass a different"
	" `subject_string` that is not already in the tokenizer.")

	print(f"Added {num_added_tokens} tokens ({self.all_placeholder_tokens_str}) to the tokenizer.")

	# placeholder_token_ids: [49408, ..., 49427].
	self.placeholder_token_ids = tokenizer.convert_tokens_to_ids(self.all_placeholder_tokens)
	#print("New tokens:", self.placeholder_token_ids)
	# Resize the token embeddings as we are adding new special tokens to the tokenizer
	old_weight_shape = self.pipeline.text_encoder.get_input_embeddings().weight.shape
	self.pipeline.text_encoder.resize_token_embeddings(len(tokenizer))
	new_weight = self.pipeline.text_encoder.get_input_embeddings().weight
	print(f"Resized text encoder token embeddings from {old_weight_shape} to {new_weight.shape} on {new_weight.device}.")

	# Extend pipeline.text_encoder with the adaface subject emeddings.
	# subj_embs: [16, 768].
	def update_text_encoder_subj_embeddings(self, subj_embs, lens_subj_emb_segments):
	# Initialise the newly added placeholder token with the embeddings of the initializer token
	# token_embeds: [49412, 768]
	token_embeds = self.pipeline.text_encoder.get_input_embeddings().weight.data
	all_encoders_updated_tokens = []
	all_encoders_updated_token_strs = []
	idx = 0

	with torch.no_grad():
	# sum of lens_subj_emb_segments are probably shorter than self.placeholder_token_ids,
	# when some static_img_suffix_embs are disabled.
	for i, encoder_type in enumerate(self.adaface_encoder_types):
	encoder_updated_tokens = []
	if (self.enabled_encoders is not None) and (encoder_type not in self.enabled_encoders):
	idx += lens_subj_emb_segments[i]
	continue
	for j in range(lens_subj_emb_segments[i]):
	placeholder_token = f"{self.subject_string}_{i}_{j}"
	token_id = self.pipeline.tokenizer.convert_tokens_to_ids(placeholder_token)
	token_embeds[token_id] = subj_embs[idx]
	encoder_updated_tokens.append(placeholder_token)
	idx += 1

	all_encoders_updated_tokens.extend(encoder_updated_tokens)
	all_encoders_updated_token_strs.append(" ".join(encoder_updated_tokens))

	self.updated_tokens_str = " ".join(all_encoders_updated_token_strs)
	self.all_encoders_updated_token_strs = all_encoders_updated_token_strs
	print(f"Updated {len(all_encoders_updated_tokens)} tokens ({self.updated_tokens_str}) in the text encoder.")

	def update_prompt(self, prompt, placeholder_tokens_pos='append',
	repeat_prompt_for_each_encoder=True,
	use_null_placeholders=False):
	if prompt is None:
	prompt = ""

	if use_null_placeholders:
	all_placeholder_tokens_str = self.all_null_placeholder_tokens_str
	if not re.search(r"\b(man\|woman\|person\|child\|girl\|boy)\b", prompt.lower()):
	all_placeholder_tokens_str = "person " + all_placeholder_tokens_str
	repeat_prompt_for_each_encoder = False
	else:
	all_placeholder_tokens_str = self.updated_tokens_str

	# Delete the subject_string from the prompt.
	prompt = re.sub(r'\b(a\|an\|the)\s+' + self.subject_string + r'\b,?', "", prompt)
	prompt = re.sub(r'\b' + self.subject_string + r'\b,?', "", prompt)
	# Prevously, arc2face ada prompts work better if they are prepended to the prompt,
	# and consistentID ada prompts work better if they are appended to the prompt.
	# When we do joint training, seems both work better if they are appended to the prompt.
	# Therefore we simply appended all placeholder_tokens_str's to the prompt.
	# NOTE: Prepending them hurts compositional prompts.
	if repeat_prompt_for_each_encoder:
	encoder_prompts = []
	for encoder_updated_token_strs in self.all_encoders_updated_token_strs:
	if placeholder_tokens_pos == 'prepend':
	encoder_prompt = encoder_updated_token_strs + " " + prompt
	elif placeholder_tokens_pos == 'append':
	encoder_prompt = prompt + " " + encoder_updated_token_strs
	else:
	breakpoint()
	encoder_prompts.append(encoder_prompt)
	prompt = ", ".join(encoder_prompts)
	else:
	if placeholder_tokens_pos == 'prepend':
	prompt = all_placeholder_tokens_str + " " + prompt
	elif placeholder_tokens_pos == 'append':
	prompt = prompt + " " + all_placeholder_tokens_str
	else:
	breakpoint()

	return prompt

	# NOTE: all_adaface_subj_embs is the input to the CLIP text encoder.
	# ** DO NOT use it as prompt_embeds in the forward() method.
	# If face_id_embs is None, then it extracts face_id_embs from the images,
	# then map them to ada prompt embeddings.
	# avg_at_stage: 'id_emb', 'img_prompt_emb', or None.
	# avg_at_stage == ada_prompt_emb usually produces the worst results.
	# id_emb is slightly better than img_prompt_emb, but sometimes img_prompt_emb is better.
	def prepare_adaface_embeddings(self, image_paths, face_id_embs=None,
	avg_at_stage='id_emb', # id_emb, img_prompt_emb, ada_prompt_emb, or None.
	perturb_at_stage=None, # id_emb, img_prompt_emb, or None.
	perturb_std=0, update_text_encoder=True):

	all_adaface_subj_embs, img_prompt_embs, lens_subj_emb_segments = \
	self.id2ada_prompt_encoder.generate_adaface_embeddings(\
	image_paths, face_id_embs=face_id_embs,
	img_prompt_embs=None,
	avg_at_stage=avg_at_stage,
	perturb_at_stage=perturb_at_stage,
	perturb_std=perturb_std,
	enable_static_img_suffix_embs=self.enable_static_img_suffix_embs)

	if all_adaface_subj_embs is None:
	return None

	self.img_prompt_embs = img_prompt_embs

	if all_adaface_subj_embs.ndim == 4:
	# [1, 1, 20, 768] -> [20, 768]
	all_adaface_subj_embs = all_adaface_subj_embs.squeeze(0).squeeze(0)
	elif all_adaface_subj_embs.ndim == 3:
	# [1, 20, 768] -> [20, 768]
	all_adaface_subj_embs = all_adaface_subj_embs.squeeze(0)

	if update_text_encoder:
	self.update_text_encoder_subj_embeddings(all_adaface_subj_embs, lens_subj_emb_segments)
	return all_adaface_subj_embs

	def diffusers_encode_prompts(self, prompt, plain_prompt, negative_prompt, device):
	# pooled_prompt_embeds_, negative_pooled_prompt_embeds_ are used by text2img3 and flux.
	pooled_prompt_embeds_, negative_pooled_prompt_embeds_ = None, None

	# Compatible with older versions of diffusers.
	if not hasattr(self.pipeline, "encode_prompt"):
	# prompt_embeds_, negative_prompt_embeds_: [77, 768] -> [1, 77, 768].
	prompt_embeds_, negative_prompt_embeds_ = \
	self.pipeline._encode_prompt(prompt, device=device, num_images_per_prompt=1,
	do_classifier_free_guidance=True,
	negative_prompt=negative_prompt)
	prompt_embeds_ = prompt_embeds_.unsqueeze(0)
	negative_prompt_embeds_ = negative_prompt_embeds_.unsqueeze(0)
	else:
	if self.pipeline_name in ["text2imgxl", "text2img3", "flux"]:
	prompt_2 = plain_prompt
	# CLIP Text Encoder prompt uses a maximum sequence length of 77.
	# T5 Text Encoder prompt uses a maximum sequence length of 256.
	# 333 = 256 + 77.
	prompt_t5 = prompt + "".join([", "] * 256)

	# prompt_embeds_, negative_prompt_embeds_: [1, 333, 4096]
	# pooled_prompt_embeds_, negative_pooled_prompt_embeds_: [1, 2048]
	if self.pipeline_name == "text2imgxl":
	prompt_embeds_, negative_prompt_embeds_, \
	pooled_prompt_embeds_, negative_pooled_prompt_embeds_ = \
	self.pipeline.encode_prompt(prompt, prompt_2, device=device,
	num_images_per_prompt=1,
	do_classifier_free_guidance=True,
	negative_prompt=negative_prompt)
	elif self.pipeline_name == "text2img3":
	prompt_embeds_, negative_prompt_embeds_, \
	pooled_prompt_embeds_, negative_pooled_prompt_embeds_ = \
	self.pipeline.encode_prompt(prompt, prompt_2, prompt_t5, device=device,
	num_images_per_prompt=1,
	do_classifier_free_guidance=True,
	negative_prompt=negative_prompt)
	elif self.pipeline_name == "flux":
	# prompt_embeds_: [1, 512, 4096]
	# pooled_prompt_embeds_: [1, 768]
	prompt_embeds_, pooled_prompt_embeds_, text_ids = \
	self.pipeline.encode_prompt(prompt, prompt_t5, device=device,
	num_images_per_prompt=1)
	negative_prompt_embeds_ = negative_pooled_prompt_embeds_ = None
	else:
	breakpoint()
	else:
	# "text2img" and "img2img" pipelines.
	# prompt_embeds_, negative_prompt_embeds_: [1, 77, 768]
	prompt_embeds_, negative_prompt_embeds_ = \
	self.pipeline.encode_prompt(prompt, device=device,
	num_images_per_prompt=1,
	do_classifier_free_guidance=True,
	negative_prompt=negative_prompt)

	return prompt_embeds_, negative_prompt_embeds_, \
	pooled_prompt_embeds_, negative_pooled_prompt_embeds_

	# alt_prompt_embed_type: 'ada-nonmix', 'img'
	def mix_ada_embs_with_other_embs(self, prompt, prompt_embeds,
	alt_prompt_embed_type, alt_prompt_emb_weights):
	# Scan prompt and replace tokens in self.placeholder_token_ids
	# with the corresponding image embeddings.
	prompt_tokens = self.pipeline.tokenizer.tokenize(prompt)
	# prompt_embeds are the ada embeddings.
	prompt_embeds2 = prompt_embeds.clone()
	if alt_prompt_embed_type.startswith('img'):
	if self.img_prompt_embs is None:
	print("Unable to find img_prompt_embs. Either prepare_adaface_embeddings() hasn't been called, or faceless images were used.")
	return prompt_embeds
	# self.img_prompt_embs: [1, 20, 768]
	repl_embeddings = self.img_prompt_embs
	elif alt_prompt_embed_type == 'ada-nonmix':
	repl_embeddings_, _, _, _ = self.encode_prompt(prompt, ablate_prompt_only_placeholders=True,
	verbose=True)
	# repl_embeddings_: [1, 77, 768] -> [1, 20, 768]
	repl_embeddings = repl_embeddings_[:, 1:len(self.all_placeholder_tokens)+1]
	else:
	breakpoint()

	repl_tokens = {}
	ada_emb_weight = alt_prompt_emb_weights[0]
	for i in range(len(prompt_tokens)):
	if prompt_tokens[i] in self.all_placeholder_tokens:
	encoder_idx = next((i for i, sublist in enumerate(self.encoder_placeholder_tokens) \
	if prompt_tokens[i] in sublist), 0)
	alt_prompt_emb_weight = alt_prompt_emb_weights[encoder_idx + 1]
	prompt_embeds2[:, i] = prompt_embeds2[:, i] * ada_emb_weight \
	+ repl_embeddings[:, self.all_placeholder_tokens.index(prompt_tokens[i])] * alt_prompt_emb_weight
	repl_tokens[prompt_tokens[i]] = 1

	repl_token_count = len(repl_tokens)
	if ada_emb_weight == 0:
	print(f"Replaced {repl_token_count} tokens with {alt_prompt_embed_type} embeddings.")
	else:
	print(f"Mixed {repl_token_count} tokens with {alt_prompt_embed_type} embeddings, weight {alt_prompt_emb_weights}.")

	return prompt_embeds2


	def encode_prompt(self, prompt, negative_prompt=None,
	placeholder_tokens_pos='append',
	ablate_prompt_only_placeholders=False,
	ablate_prompt_no_placeholders=False,
	ablate_prompt_embed_type='ada', # 'ada', 'ada-nonmix', 'img', 'img1', 'img2'.
	nonmix_prompt_emb_weight=0,
	repeat_prompt_for_each_encoder=True,
	device=None, verbose=False):
	if negative_prompt is None:
	negative_prompt = self.negative_prompt

	if device is None:
	device = self.device

	plain_prompt = prompt
	if ablate_prompt_only_placeholders:
	prompt = self.updated_tokens_str
	else:
	prompt = self.update_prompt(prompt, placeholder_tokens_pos=placeholder_tokens_pos,
	repeat_prompt_for_each_encoder=repeat_prompt_for_each_encoder,
	use_null_placeholders=ablate_prompt_no_placeholders)

	if verbose:
	print(f"Subject prompt:\n{prompt}")

	# For some unknown reason, the text_encoder is still on CPU after self.pipeline.to(self.device).
	# So we manually move it to GPU here.
	self.pipeline.text_encoder.to(device)

	prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, negative_pooled_prompt_embeds_ = \
	self.diffusers_encode_prompts(prompt, plain_prompt, negative_prompt, device)

	if ablate_prompt_embed_type.startswith('img'):
	alt_prompt_embed_type = ablate_prompt_embed_type
	if alt_prompt_embed_type == 'img1':
	# The mixing weights of ada, img1, and img2 are 0, 1, and 0.
	alt_prompt_emb_weights = (0, 1, 0)
	elif alt_prompt_embed_type == 'img2':
	# The mixing weights of ada, img1, and img2 are 0, 0, and 1.
	alt_prompt_emb_weights = (0, 0, 1)
	else:
	# The mixing weights of ada, img1, and img2 are 0, 1, and 1.
	alt_prompt_emb_weights = (0, 1, 1)
	elif nonmix_prompt_emb_weight > 0:
	alt_prompt_embed_type = 'ada-nonmix'
	# The mixing weight of ada is 1 - nonmix_prompt_emb_weight, instead of 1 - nonmix_prompt_emb_weight * 2.
	# It means ada is mixed by this weight with both img1 and img2.
	alt_prompt_emb_weights = (1 - nonmix_prompt_emb_weight, nonmix_prompt_emb_weight, nonmix_prompt_emb_weight)
	else:
	# Don't change the prompt embeddings. So we set all the mixing weights to 0.
	alt_prompt_emb_weights = (0, 0, 0)

	if sum(alt_prompt_emb_weights) > 0:
	prompt_embeds_ = self.mix_ada_embs_with_other_embs(prompt, prompt_embeds_,
	alt_prompt_embed_type, alt_prompt_emb_weights)

	return prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, negative_pooled_prompt_embeds_

	# ref_img_strength is used only in the img2img pipeline.
	def forward(self, noise, prompt, prompt_embeds=None, negative_prompt=None,
	placeholder_tokens_pos='append',
	guidance_scale=6.0, out_image_count=4,
	ref_img_strength=0.8, generator=None,
	ablate_prompt_only_placeholders=False,
	ablate_prompt_no_placeholders=False,
	ablate_prompt_embed_type='ada', # 'ada', 'ada-nonmix', 'img1', 'img2'.
	nonmix_prompt_emb_weight=0,
	repeat_prompt_for_each_encoder=True,
	verbose=False):
	noise = noise.to(device=self.device, dtype=torch.float16)
	if self.use_lcm:
	guidance_scale = 0

	if negative_prompt is None:
	negative_prompt = self.negative_prompt
	# prompt_embeds_, negative_prompt_embeds_: [1, 77, 768]
	if prompt_embeds is None:
	prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, \
	negative_pooled_prompt_embeds_ = \
	self.encode_prompt(prompt, negative_prompt,
	placeholder_tokens_pos=placeholder_tokens_pos,
	ablate_prompt_only_placeholders=ablate_prompt_only_placeholders,
	ablate_prompt_no_placeholders=ablate_prompt_no_placeholders,
	ablate_prompt_embed_type=ablate_prompt_embed_type,
	nonmix_prompt_emb_weight=nonmix_prompt_emb_weight,
	repeat_prompt_for_each_encoder=repeat_prompt_for_each_encoder,
	device=self.device,
	verbose=verbose)
	else:
	if len(prompt_embeds) == 2:
	prompt_embeds_, negative_prompt_embeds_ = prompt_embeds
	pooled_prompt_embeds_, negative_pooled_prompt_embeds_ = None, None
	elif len(prompt_embeds) == 4:
	prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, \
	negative_pooled_prompt_embeds_ = prompt_embeds
	else:
	breakpoint()

	# Repeat the prompt embeddings for all images in the batch.
	prompt_embeds_ = prompt_embeds_.repeat(out_image_count, 1, 1)

	if negative_prompt_embeds_ is not None:
	negative_prompt_embeds_ = negative_prompt_embeds_.repeat(out_image_count, 1, 1)

	if self.pipeline_name in ["text2imgxl", "text2img3"]:
	pooled_prompt_embeds_ = pooled_prompt_embeds_.repeat(out_image_count, 1)
	negative_pooled_prompt_embeds_ = negative_pooled_prompt_embeds_.repeat(out_image_count, 1)

	# noise: [BS, 4, 64, 64]
	# When the pipeline is text2img, strength is ignored.
	images = self.pipeline(prompt_embeds=prompt_embeds_,
	negative_prompt_embeds=negative_prompt_embeds_,
	pooled_prompt_embeds=pooled_prompt_embeds_,
	negative_pooled_prompt_embeds=negative_pooled_prompt_embeds_,
	num_inference_steps=self.num_inference_steps,
	guidance_scale=guidance_scale,
	num_images_per_prompt=1,
	generator=generator).images
	elif self.pipeline_name == "flux":
	images = self.pipeline(prompt_embeds=prompt_embeds_,
	pooled_prompt_embeds=pooled_prompt_embeds_,
	num_inference_steps=4,
	guidance_scale=guidance_scale,
	num_images_per_prompt=1,
	generator=generator).images
	else:
	# When the pipeline is text2img, noise: [BS, 4, 64, 64], and strength is ignored.
	# When the pipeline is img2img, noise is an initiali image of [BS, 3, 512, 512],
	# whose pixels are normalized to [0, 1].
	images = self.pipeline(image=noise,
	prompt_embeds=prompt_embeds_,
	negative_prompt_embeds=negative_prompt_embeds_,
	num_inference_steps=self.num_inference_steps,
	guidance_scale=guidance_scale,
	num_images_per_prompt=1,
	strength=ref_img_strength,
	generator=generator).images
	# images: [BS, 3, 512, 512]
	return images