import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple, Dict, Any
from diffusers.models.attention_processor import Attention, AttnProcessor2_0
from diffusers.utils import logging, is_torch_version, deprecate
from diffusers.utils.torch_utils import fourier_filter
# UNet is a diffusers PeftAdapterMixin instance.
from diffusers.loaders.peft import PeftAdapterMixin
from peft import LoraConfig, get_peft_model
import peft.tuners.lora as peft_lora
from peft.tuners.lora.dora import DoraLinearLayer
from einops import rearrange
import math, re
import numpy as np
from peft.tuners.tuners_utils import BaseTunerLayer

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
def dummy_func(*args, **kwargs):
    pass

# Revised from RevGrad, by removing the grad negation.
class ScaleGrad(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input_, alpha_, debug=False):
        # alpha_ and debug are expected to be tensors, since save_for_backward()
        # only accepts tensors (or None).
        ctx.save_for_backward(alpha_, debug)
        output = input_
        if debug:
            print(f"input: {input_.abs().mean().item()}")
        return output

    @staticmethod
    def backward(ctx, grad_output):  # pragma: no cover
        # saved_tensors returns a tuple of tensors.
        alpha_, debug = ctx.saved_tensors
        if ctx.needs_input_grad[0]:
            grad_output2 = grad_output * alpha_
            if debug:
                print(f"grad_output2: {grad_output2.abs().mean().item()}")
        else:
            grad_output2 = None
        # One gradient per forward() input: input_, alpha_, debug.
        return grad_output2, None, None
class GradientScaler(nn.Module):
    def __init__(self, alpha=1., debug=False, *args, **kwargs):
        """
        A gradient scaling layer.
        This layer has no parameters, and simply scales the gradient in the backward pass.
        """
        super().__init__(*args, **kwargs)
        self._alpha = torch.tensor(alpha, requires_grad=False)
        self._debug = torch.tensor(debug, requires_grad=False)

    def forward(self, input_):
        _debug = self._debug if hasattr(self, '_debug') else False
        return ScaleGrad.apply(input_, self._alpha.to(input_.device), _debug)

def gen_gradient_scaler(alpha, debug=False):
    if alpha == 1:
        return nn.Identity()
    if alpha > 0:
        return GradientScaler(alpha, debug=debug)
    else:
        assert alpha == 0
        # Don't use lambda function here, otherwise the object can't be pickled.
        return torch.detach
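
# Illustrative sketch (not part of the original module): a minimal check of what
# gen_gradient_scaler() does. The helper name _demo_gradient_scaler is hypothetical.
# With alpha=0.1 the forward output is unchanged, but the gradient reaching x is
# scaled by 0.1; with alpha=0, torch.detach blocks the gradient entirely.
def _demo_gradient_scaler():
    x = torch.ones(3, requires_grad=True)
    scaler = gen_gradient_scaler(0.1)
    y = scaler(x).sum()
    y.backward()
    # x.grad is ~0.1 everywhere; without the scaler it would be 1.0.
    return x.grad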
def split_indices_by_instance(indices, as_dict=False):
    indices_B, indices_N = indices
    unique_indices_B = torch.unique(indices_B)
    if not as_dict:
        indices_by_instance = [ (indices_B[indices_B == uib], indices_N[indices_B == uib]) for uib in unique_indices_B ]
    else:
        indices_by_instance = { uib.item(): indices_N[indices_B == uib] for uib in unique_indices_B }
    return indices_by_instance
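
# Illustrative sketch (not part of the original module): split_indices_by_instance()
# groups flat (batch_index, token_index) pairs by batch instance. The helper name
# _demo_split_indices_by_instance is hypothetical.
def _demo_split_indices_by_instance():
    # Instance 0 owns tokens 4 and 5; instance 1 owns token 7.
    indices = (torch.tensor([0, 0, 1]), torch.tensor([4, 5, 7]))
    per_instance = split_indices_by_instance(indices)
    # [(tensor([0, 0]), tensor([4, 5])), (tensor([1]), tensor([7]))]
    as_dict = split_indices_by_instance(indices, as_dict=True)
    # {0: tensor([4, 5]), 1: tensor([7])}
    return per_instance, as_dict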
# If do_sum, returned emb_attns is 3D. Otherwise 4D.
# indices are applied on the first 2 dims of attn_mat.
def sel_emb_attns_by_indices(attn_mat, indices, all_token_weights=None, do_sum=True, do_mean=False):
    indices_by_instance = split_indices_by_instance(indices)

    # emb_attns[0]: [1, 9, 8, 64]
    # 8: 8 attention heads. Last dim 64: number of image tokens.
    emb_attns = [ attn_mat[inst_indices].unsqueeze(0) for inst_indices in indices_by_instance ]
    if all_token_weights is not None:
        # all_token_weights: [4, 77].
        # token_weights_by_instance[0]: [1, 9, 1, 1].
        token_weights = [ all_token_weights[inst_indices].reshape(1, -1, 1, 1) for inst_indices in indices_by_instance ]
    else:
        token_weights = [ 1 ] * len(indices_by_instance)

    # Apply token weights.
    emb_attns = [ emb_attns[i] * token_weights[i] for i in range(len(indices_by_instance)) ]

    # sum among K_subj_i subj embeddings -> [1, 8, 64]
    if do_sum:
        emb_attns = [ emb_attns[i].sum(dim=1) for i in range(len(indices_by_instance)) ]
    elif do_mean:
        emb_attns = [ emb_attns[i].mean(dim=1) for i in range(len(indices_by_instance)) ]

    emb_attns = torch.cat(emb_attns, dim=0)
    return emb_attns
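
# Illustrative sketch (not part of the original module): selecting and summing the
# attention received by each instance's subject tokens. Shapes follow the comments
# above; the helper name _demo_sel_emb_attns_by_indices is hypothetical.
def _demo_sel_emb_attns_by_indices():
    # attn_mat: [2 instances, 77 text tokens, 8 heads, 64 image tokens].
    attn_mat = torch.randn(2, 77, 8, 64)
    # Subject tokens: token 4 of instance 0; tokens 4 and 5 of instance 1.
    indices = (torch.tensor([0, 1, 1]), torch.tensor([4, 4, 5]))
    emb_attns = sel_emb_attns_by_indices(attn_mat, indices, do_sum=True)
    # emb_attns: [2, 8, 64], one summed attention map per instance.
    return emb_attns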
# Slow implementation equivalent to F.scaled_dot_product_attention.
def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0,
                                 shrink_cross_attn=False, cross_attn_shrink_factor=0.5,
                                 is_causal=False, scale=None, enable_gqa=False) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    B, L, S = query.size(0), query.size(-2), key.size(-2)
    scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
    # 1: head (to be broadcasted). L: query length. S: key length.
    attn_bias = torch.zeros(B, 1, L, S, device=query.device, dtype=query.dtype)
    if is_causal:
        assert attn_mask is None
        temp_mask = torch.ones(B, 1, L, S, device=query.device, dtype=torch.bool).tril(diagonal=0)
        attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
        attn_bias = attn_bias.to(query.dtype)

    if attn_mask is not None:
        if attn_mask.dtype == torch.bool:
            attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
        else:
            attn_bias += attn_mask

    if enable_gqa:
        key   = key.repeat_interleave(query.size(-3) // key.size(-3), -3)
        value = value.repeat_interleave(query.size(-3) // value.size(-3), -3)

    attn_weight = query @ key.transpose(-2, -1) * scale_factor
    if shrink_cross_attn:
        cross_attn_scale = cross_attn_shrink_factor
    else:
        cross_attn_scale = 1

    # attn_bias: [1, 1, 4096, 77], the same size as a single-head attn_weight.
    attn_weight += attn_bias
    attn_score = attn_weight
    attn_weight = torch.softmax(attn_weight, dim=-1)
    # NOTE: After scaling, the "probabilities" of the subject embeddings will sum to < 1.
    # But this is intended, as we want to scale down the impact of the subject embeddings
    # in the computed attention output tensors.
    attn_weight = attn_weight * cross_attn_scale
    attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
    output = attn_weight @ value
    return output, attn_score, attn_weight
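
# Illustrative sketch (not part of the original module): with shrink_cross_attn=False
# and dropout_p=0.0, the slow implementation above should match
# F.scaled_dot_product_attention up to numerical precision. The helper name
# _check_slow_sdpa_matches_fast is hypothetical.
def _check_slow_sdpa_matches_fast():
    q = torch.randn(2, 8, 16, 40)
    k = torch.randn(2, 8, 77, 40)
    v = torch.randn(2, 8, 77, 40)
    out_slow, _, _ = scaled_dot_product_attention(q, k, v)
    out_fast = F.scaled_dot_product_attention(q, k, v)
    return torch.allclose(out_slow, out_fast, atol=1e-5)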
# NOTE: Each attention layer gets its own processor instance (see set_up_attn_processors() below).
class AttnProcessor_LoRA_Capture(nn.Module):
    r"""
    Revised from AttnProcessor2_0
    """
    # lora_proj_layers is a dict of lora_layer_name -> lora_proj_layer.
    def __init__(self, capture_ca_activations: bool = False, enable_lora: bool = False,
                 lora_uses_dora=True, lora_proj_layers=None,
                 lora_rank: int = 192, lora_alpha: float = 16,
                 cross_attn_shrink_factor: float = 0.5,
                 q_lora_updates_query=False, attn_proc_idx=-1):
        super().__init__()

        self.global_enable_lora = enable_lora
        self.attn_proc_idx = attn_proc_idx
        # reset_attn_cache_and_flags() sets the local (call-specific) self.enable_lora flag.
        # By default, shrink_cross_attn is False. Later in layers 22, 23, 24 it will be set to True.
        self.reset_attn_cache_and_flags(capture_ca_activations, False, enable_lora)
        self.lora_rank = lora_rank
        self.lora_alpha = lora_alpha
        self.lora_scale = self.lora_alpha / self.lora_rank
        self.cross_attn_shrink_factor = cross_attn_shrink_factor
        self.q_lora_updates_query = q_lora_updates_query

        self.to_q_lora = self.to_k_lora = self.to_v_lora = self.to_out_lora = None
        if self.global_enable_lora:
            for lora_layer_name, lora_proj_layer in lora_proj_layers.items():
                if lora_layer_name == 'q':
                    self.to_q_lora = peft_lora.Linear(lora_proj_layer, 'default', r=lora_rank, lora_alpha=lora_alpha,
                                                      use_dora=lora_uses_dora, lora_dropout=0.1)
                elif lora_layer_name == 'k':
                    self.to_k_lora = peft_lora.Linear(lora_proj_layer, 'default', r=lora_rank, lora_alpha=lora_alpha,
                                                      use_dora=lora_uses_dora, lora_dropout=0.1)
                elif lora_layer_name == 'v':
                    self.to_v_lora = peft_lora.Linear(lora_proj_layer, 'default', r=lora_rank, lora_alpha=lora_alpha,
                                                      use_dora=lora_uses_dora, lora_dropout=0.1)
                elif lora_layer_name == 'out':
                    self.to_out_lora = peft_lora.Linear(lora_proj_layer, 'default', r=lora_rank, lora_alpha=lora_alpha,
                                                        use_dora=lora_uses_dora, lora_dropout=0.1)

    # LoRA layers can be enabled/disabled dynamically.
    def reset_attn_cache_and_flags(self, capture_ca_activations, shrink_cross_attn, enable_lora):
        self.capture_ca_activations = capture_ca_activations
        self.shrink_cross_attn = shrink_cross_attn
        self.cached_activations = {}
        # Only enable LoRA for the next call(s) if global_enable_lora is set to True.
        self.enable_lora = enable_lora and self.global_enable_lora
    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        temb: Optional[torch.Tensor] = None,
        img_mask: Optional[torch.Tensor] = None,
        subj_indices: Optional[Tuple[torch.IntTensor, torch.IntTensor]] = None,
        debug: bool = False,
        *args,
        **kwargs,
    ) -> torch.Tensor:
        if len(args) > 0 or kwargs.get("scale", None) is not None:
            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
            deprecate("scale", "1.0.0", deprecation_message)

        # hidden_states: [1, 4096, 320]
        residual = hidden_states

        # attn.spatial_norm is None.
        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim
        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            # Collapse the spatial dimensions to a single token dimension.
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )

        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)
        # NOTE: there's an inconsistency between the q LoRA and the k, v LoRAs.
        # The k, v LoRAs are directly applied to key and value (currently the k, v LoRAs are never enabled),
        # while the q LoRA is applied to query2, and we keep the original query unchanged.
        if self.enable_lora and self.to_q_lora is not None:
            # query2 will be used in ldm/util.py:calc_elastic_matching_loss() to get more accurate
            # cross attention scores between the latent images of the sc and mc instances.
            query2 = self.to_q_lora(hidden_states)
            # If not q_lora_updates_query, only query2 will be impacted by the LoRA layer.
            # The query, and thus the attention score and attn_out, will be the same
            # as the original ones.
            if self.q_lora_updates_query:
                query = query2
        else:
            query2 = query

        scale = 1 / math.sqrt(query.size(-1))
        is_cross_attn = (encoder_hidden_states is not None)
        if (not is_cross_attn) and (img_mask is not None):
            # NOTE: we assume the latent feature map is square; if it isn't, the resizing below will be wrong.
            # hidden_states: [BS, 4096, 320]. img_mask: [BS, 1, 64, 64]
            # Scale the mask to the same spatial size as hidden_states.
            mask_size = int(math.sqrt(hidden_states.shape[-2]))
            img_mask = F.interpolate(img_mask, size=(mask_size, mask_size), mode='nearest')
            if (img_mask.sum(dim=(2, 3)) == 0).any():
                img_mask = None
            else:
                # img_mask: [2, 1, 64, 64] -> [2, 4096]
                img_mask = rearrange(img_mask, 'b ... -> b (...)').contiguous()
                # max_neg_value = -torch.finfo(hidden_states.dtype).max
                # img_mask: [2, 4096] -> [2, 1, 1, 4096]
                img_mask = rearrange(img_mask.bool(), 'b j -> b () () j')
                # attn_score: [16, 4096, 4096]. img_mask will be broadcasted to [16, 4096, 4096].
                # So some rows in dim 1 (e.g. [0, :, 4095]) of attn_score will be masked out (all elements in [0, :, 4095] are -inf).
                # But not all elements in [0, 4095, :] are -inf. Since the softmax is done along dim 2, this is fine.
                # attn_score.masked_fill_(~img_mask, max_neg_value)
                # NOTE: If there's an attention mask, it will be replaced by img_mask.
                attention_mask = img_mask
        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        if self.enable_lora and self.to_k_lora is not None:
            key = self.to_k_lora(encoder_hidden_states)
        else:
            key = attn.to_k(encoder_hidden_states)

        if self.enable_lora and self.to_v_lora is not None:
            value = self.to_v_lora(encoder_hidden_states)
        else:
            value = attn.to_v(encoder_hidden_states)

        if attn.norm_q is not None:
            query = attn.norm_q(query)
            query2 = attn.norm_q(query2)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        query2 = query2.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        if debug and self.attn_proc_idx >= 0:
            breakpoint()

        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        if is_cross_attn and (self.capture_ca_activations or self.shrink_cross_attn):
            hidden_states, attn_score, attn_prob = \
                scaled_dot_product_attention(query, key, value, attn_mask=attention_mask,
                                             dropout_p=0.0, shrink_cross_attn=self.shrink_cross_attn,
                                             cross_attn_shrink_factor=self.cross_attn_shrink_factor)
        else:
            # Use the faster implementation of scaled_dot_product_attention
            # when not capturing the activations or suppressing the subject attention.
            hidden_states = \
                F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False)
            attn_prob = attn_score = None

        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        # linear proj
        if self.enable_lora and self.to_out_lora is not None:
            hidden_states = self.to_out_lora(hidden_states)
        else:
            hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        if is_cross_attn and self.capture_ca_activations:
            # cached q will be used in ddpm.py:calc_comp_fg_bg_preserve_loss(), in which two qs will multiply each other.
            # So sqrt(scale) will scale the product of two qs by scale.
            # ANCHOR[id=attention_caching]
            # query: [2, 8, 4096, 40] -> [2, 320, 4096]
            self.cached_activations['q'] = \
                rearrange(query, 'b h n d -> b (h d) n').contiguous() * math.sqrt(scale)
            self.cached_activations['q2'] = \
                rearrange(query2, 'b h n d -> b (h d) n').contiguous() * math.sqrt(scale)
            self.cached_activations['k'] = \
                rearrange(key, 'b h n d -> b (h d) n').contiguous() * math.sqrt(scale)
            self.cached_activations['v'] = \
                rearrange(value, 'b h n d -> b (h d) n').contiguous() * math.sqrt(scale)
            # attn_prob, attn_score: [2, 8, 4096, 77]
            self.cached_activations['attn'] = attn_prob
            self.cached_activations['attnscore'] = attn_score
            # attn_out: [b, n, h * d] -> [b, h * d, n]
            # [2, 4096, 320] -> [2, 320, 4096].
            self.cached_activations['attn_out'] = hidden_states.permute(0, 2, 1).contiguous()

        return hidden_states
def CrossAttnUpBlock2D_forward_capture(
    self,
    hidden_states: torch.Tensor,
    res_hidden_states_tuple: Tuple[torch.Tensor, ...],
    temb: Optional[torch.Tensor] = None,
    encoder_hidden_states: Optional[torch.Tensor] = None,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    upsample_size: Optional[int] = None,
    attention_mask: Optional[torch.Tensor] = None,
    encoder_attention_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    if cross_attention_kwargs is not None:
        if cross_attention_kwargs.get("scale", None) is not None:
            logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

    self.cached_outfeats = {}
    res_hidden_states_gradscale = getattr(self, "res_hidden_states_gradscale", 1)
    capture_outfeats = getattr(self, "capture_outfeats", False)
    layer_idx = 0
    res_grad_scaler = gen_gradient_scaler(res_hidden_states_gradscale)

    for resnet, attn in zip(self.resnets, self.attentions):
        # pop res hidden states
        res_hidden_states = res_hidden_states_tuple[-1]
        res_hidden_states_tuple = res_hidden_states_tuple[:-1]
        # Scale down the magnitudes of gradients to res_hidden_states
        # by res_hidden_states_gradscale=0.2, to match the scale of the cross-attn layer outputs.
        res_hidden_states = res_grad_scaler(res_hidden_states)
        hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

        if self.training and self.gradient_checkpointing:
            def create_custom_forward(module, return_dict=None):
                def custom_forward(*inputs):
                    if return_dict is not None:
                        return module(*inputs, return_dict=return_dict)
                    else:
                        return module(*inputs)

                return custom_forward

            ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
            hidden_states = torch.utils.checkpoint.checkpoint(
                create_custom_forward(resnet),
                hidden_states,
                temb,
                **ckpt_kwargs,
            )
            hidden_states = attn(
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                cross_attention_kwargs=cross_attention_kwargs,
                attention_mask=attention_mask,
                encoder_attention_mask=encoder_attention_mask,
                return_dict=False,
            )[0]
        else:
            # resnet: ResnetBlock2D instance.
            #LINK diffusers.models.resnet.ResnetBlock2D
            # up_blocks.3.resnets.2.conv_shortcut is a module within ResnetBlock2D,
            # it's not transforming the UNet shortcut features.
            hidden_states = resnet(hidden_states, temb)
            hidden_states = attn(
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                cross_attention_kwargs=cross_attention_kwargs,
                attention_mask=attention_mask,
                encoder_attention_mask=encoder_attention_mask,
                return_dict=False,
            )[0]

        if capture_outfeats:
            self.cached_outfeats[layer_idx] = hidden_states
            layer_idx += 1

    if self.upsamplers is not None:
        for upsampler in self.upsamplers:
            hidden_states = upsampler(hidden_states, upsample_size)

    return hidden_states
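
# Illustrative sketch (not part of the original module): CrossAttnUpBlock2D_forward_capture()
# is written with `self` as its first argument, so it is presumably meant to replace the
# forward() of the last CrossAttnUpBlock2D. A minimal way to bind it, assuming a standard
# SD 1.x UNet where up_blocks[3] is the last cross-attn up block; the helper name
# _example_patch_up_block is hypothetical.
def _example_patch_up_block(unet):
    up_block = unet.up_blocks[3]
    # Bind the plain function as a method of this block instance.
    up_block.forward = CrossAttnUpBlock2D_forward_capture.__get__(up_block)
    # Flags read via getattr() inside the patched forward.
    up_block.capture_outfeats = False
    up_block.res_hidden_states_gradscale = 0.2
    return up_block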
# Adapted from ConsistentIDPipeline:set_ip_adapter().
# attn_lora_layer_names: candidates are subsets of ['q', 'k', 'v', 'out'].
def set_up_attn_processors(unet, use_attn_lora, attn_lora_layer_names=['q', 'k', 'v', 'out'],
                           lora_rank=192, lora_scale_down=8, cross_attn_shrink_factor=0.5,
                           q_lora_updates_query=False):
    attn_procs = {}
    attn_capture_procs = {}
    unet_modules = dict(unet.named_modules())
    attn_opt_modules = {}
    attn_proc_idx = 0

    for name, attn_proc in unet.attn_processors.items():
        # Only capture the activations of the last 3 CA layers.
        if not name.startswith("up_blocks.3"):
            # Not the last 3 CA layers. Don't enable LoRA or capture activations.
            # Then the layer falls back to the original attention mechanism.
            # We still use AttnProcessor_LoRA_Capture, as it can handle img_mask.
            attn_procs[name] = AttnProcessor_LoRA_Capture(
                capture_ca_activations=False, enable_lora=False, attn_proc_idx=-1)
            continue
        # cross_attention_dim: 768.
        cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
        if cross_attention_dim is None:
            # Self attention. Don't enable LoRA or capture activations.
            # We replace the default attn_proc with AttnProcessor_LoRA_Capture,
            # so that it can incorporate img_mask into self-attention.
            attn_procs[name] = AttnProcessor_LoRA_Capture(
                capture_ca_activations=False, enable_lora=False, attn_proc_idx=-1)
            continue

        # block_id = 3
        # hidden_size: 320
        # hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
        # 'up_blocks.3.attentions.1.transformer_blocks.0.attn2.processor' ->
        # 'up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_q'
        lora_layer_dict = {}
        lora_layer_dict['q'] = unet_modules[name[:-9] + "to_q"]
        lora_layer_dict['k'] = unet_modules[name[:-9] + "to_k"]
        lora_layer_dict['v'] = unet_modules[name[:-9] + "to_v"]
        # to_out is a ModuleList(Linear, Dropout).
        lora_layer_dict['out'] = unet_modules[name[:-9] + "to_out"][0]

        lora_proj_layers = {}
        # Only apply LoRA to the specified layers.
        for lora_layer_name in attn_lora_layer_names:
            lora_proj_layers[lora_layer_name] = lora_layer_dict[lora_layer_name]

        attn_capture_proc = AttnProcessor_LoRA_Capture(
            capture_ca_activations=True, enable_lora=use_attn_lora,
            lora_uses_dora=True, lora_proj_layers=lora_proj_layers,
            # LoRA up is initialized to 0. So no need to worry that the LoRA output may be too large.
            lora_rank=lora_rank, lora_alpha=lora_rank // lora_scale_down,
            cross_attn_shrink_factor=cross_attn_shrink_factor,
            q_lora_updates_query=q_lora_updates_query, attn_proc_idx=attn_proc_idx)
        attn_proc_idx += 1
        # attn_procs has to use the original names.
        attn_procs[name] = attn_capture_proc
        # ModuleDict doesn't allow "." in the key.
        name = name.replace(".", "_")
        attn_capture_procs[name] = attn_capture_proc

        if use_attn_lora:
            for subname, module in attn_capture_proc.named_modules():
                if isinstance(module, peft_lora.LoraLayer):
                    # ModuleDict doesn't allow "." in the key.
                    lora_path = name + "_" + subname.replace(".", "_")
                    attn_opt_modules[lora_path + "_lora_A"] = module.lora_A
                    attn_opt_modules[lora_path + "_lora_B"] = module.lora_B
                    # lora_uses_dora is always True, so we don't check it here.
                    attn_opt_modules[lora_path + "_lora_magnitude_vector"] = module.lora_magnitude_vector
                    # We will manage the attn adapters directly. By default, LoraLayer is an instance of BaseTunerLayer,
                    # so according to the code logic in diffusers/loaders/peft.py,
                    # it would be managed by the diffusers PeftAdapterMixin instance, through its
                    # enable_adapters() and set_adapter() methods.
                    # Therefore, we disable these calls on the module.
                    # (disable_adapters() is a property; overriding it would raise an exception.)
                    module.enable_adapters = dummy_func
                    module.set_adapter = dummy_func

    unet.set_attn_processor(attn_procs)
    print(f"Set up {len(attn_capture_procs)} CrossAttn processors on {attn_capture_procs.keys()}.")
    print(f"Set up {len(attn_opt_modules)} attn LoRA params: {attn_opt_modules.keys()}.")
    return attn_capture_procs, attn_opt_modules
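
# Illustrative sketch (not part of the original module): typical wiring of the attention
# processors on a Stable Diffusion 1.x UNet. The checkpoint name and the helper name
# _example_set_up_attn_processors are assumptions for illustration only.
def _example_set_up_attn_processors():
    from diffusers import UNet2DConditionModel
    unet = UNet2DConditionModel.from_pretrained(
        "runwayml/stable-diffusion-v1-5", subfolder="unet")
    attn_capture_procs, attn_opt_modules = set_up_attn_processors(
        unet, use_attn_lora=True, attn_lora_layer_names=['q', 'k', 'v', 'out'],
        lora_rank=192, lora_scale_down=8, cross_attn_shrink_factor=0.5,
        q_lora_updates_query=False)
    # attn_capture_procs: the 3 capture processors on up_blocks.3's cross-attn layers.
    # attn_opt_modules:   their LoRA A/B matrices and DoRA magnitude vectors.
    return unet, attn_capture_procs, attn_opt_modules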
# NOTE: cross-attn LoRA layers are also included in the returned ffn_opt_modules.
def set_up_ffn_loras(unet, target_modules_pat, lora_uses_dora=False, lora_rank=192, lora_alpha=16):
    # target_modules_pat = 'up_blocks.3.resnets.[12].conv[a-z0-9_]+'
    # up_blocks.3.resnets.[1~2].conv1, conv2, conv_shortcut
    # Cannot set to conv.+ as it will match added adapter module names, including
    # up_blocks.3.resnets.1.conv1.base_layer, up_blocks.3.resnets.1.conv1.lora_dropout
    if target_modules_pat is not None:
        peft_config = LoraConfig(use_dora=lora_uses_dora, inference_mode=False, r=lora_rank,
                                 lora_alpha=lora_alpha, lora_dropout=0.1,
                                 target_modules=target_modules_pat)
        # UNet is a diffusers PeftAdapterMixin instance. Using get_peft_model on it will
        # cause weird errors. Instead, we directly use the diffusers peft adapter methods.
        unet.add_adapter(peft_config, "recon_loss")
        unet.add_adapter(peft_config, "unet_distill")
        unet.add_adapter(peft_config, "comp_distill")
        unet.enable_adapters()

    # ffn_lora_layers contain both the LoRA A and B matrices, as well as the original layers.
    # ffn_lora_layers are used to set the flags, not used for optimization.
    # ffn_opt_modules contain only the LoRA A and B matrices, so they are used for optimization.
    # NOTE: ffn_opt_modules contain both FFN and cross-attn LoRA modules.
    ffn_lora_layers = {}
    ffn_opt_modules = {}
    for name, module in unet.named_modules():
        if isinstance(module, peft_lora.LoraLayer):
            # We don't want to include cross-attn layers in ffn_lora_layers.
            if target_modules_pat is not None and re.search(target_modules_pat, name):
                ffn_lora_layers[name] = module
            # ModuleDict doesn't allow "." in the key.
            name = name.replace(".", "_")
            # Since ModuleDict doesn't allow "." in the key, we manually collect
            # the LoRA matrices in each module.
            # NOTE: We cannot put every sub-module of module into ffn_opt_modules,
            # as base_layer is also a sub-module of module, which we shouldn't optimize.
            # Each value in ffn_opt_modules is a ModuleDict:
            '''
            (Pdb) ffn_opt_modules['up_blocks_3_resnets_1_conv1_lora_A']
            ModuleDict(
              (unet_distill): Conv2d(640, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
              (recon_loss): Conv2d(640, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
            )
            '''
            ffn_opt_modules[name + "_lora_A"] = module.lora_A
            ffn_opt_modules[name + "_lora_B"] = module.lora_B
            if lora_uses_dora:
                ffn_opt_modules[name + "_lora_magnitude_vector"] = module.lora_magnitude_vector

    print(f"Set up {len(ffn_lora_layers)} FFN LoRA layers: {ffn_lora_layers.keys()}.")
    print(f"Set up {len(ffn_opt_modules)} FFN LoRA params: {ffn_opt_modules.keys()}.")
    return ffn_lora_layers, ffn_opt_modules
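
# Illustrative sketch (not part of the original module): attaching the FFN LoRAs with the
# regex pattern quoted in the comment inside set_up_ffn_loras(). The helper name
# _example_set_up_ffn_loras is hypothetical; the adapter names are the three added above.
def _example_set_up_ffn_loras(unet):
    ffn_lora_layers, ffn_opt_modules = set_up_ffn_loras(
        unet, target_modules_pat='up_blocks.3.resnets.[12].conv[a-z0-9_]+',
        lora_uses_dora=False, lora_rank=192, lora_alpha=16)
    return ffn_lora_layers, ffn_opt_modules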
def set_lora_and_capture_flags(unet, unet_lora_modules, attn_capture_procs,
                               outfeat_capture_blocks, res_hidden_states_gradscale_blocks,
                               use_attn_lora, use_ffn_lora, ffn_lora_adapter_name, capture_ca_activations,
                               shrink_cross_attn, res_hidden_states_gradscale):
    # For attn capture procs, capture_ca_activations and use_attn_lora are set in reset_attn_cache_and_flags().
    for attn_capture_proc in attn_capture_procs:
        attn_capture_proc.reset_attn_cache_and_flags(capture_ca_activations, shrink_cross_attn, enable_lora=use_attn_lora)
    # outfeat_capture_blocks only contains the last up block, up_blocks[3].
    # It contains 3 FFN layers. We want to capture their output features.
    for block in outfeat_capture_blocks:
        block.capture_outfeats = capture_ca_activations
    for block in res_hidden_states_gradscale_blocks:
        block.res_hidden_states_gradscale = res_hidden_states_gradscale

    if not use_ffn_lora:
        unet.disable_adapters()
    else:
        # ffn_lora_adapter_name: 'recon_loss', 'unet_distill', 'comp_distill'.
        if ffn_lora_adapter_name is not None:
            unet.set_adapter(ffn_lora_adapter_name)
            # NOTE: Don't forget to enable_adapters().
            # The adapters are not enabled by default after set_adapter().
            unet.enable_adapters()
        else:
            breakpoint()

    # During training, disable_adapters() and set_adapter() will set all/inactive adapters with requires_grad=False,
    # which might cause issues during DDP training.
    # So we restore them to requires_grad=True.
    # During test, unet_lora_modules will be passed as None, so this block will be skipped.
    if unet_lora_modules is not None:
        for param in unet_lora_modules.parameters():
            param.requires_grad = True
def get_captured_activations(capture_ca_activations, attn_capture_procs, outfeat_capture_blocks,
                             captured_layer_indices=[22, 23, 24], out_dtype=torch.float32):
    captured_activations = { k: {} for k in ('outfeat', 'attn', 'attnscore',
                                             'q', 'q2', 'k', 'v', 'attn_out') }
    if not capture_ca_activations:
        return captured_activations

    all_cached_outfeats = []
    for block in outfeat_capture_blocks:
        all_cached_outfeats.append(block.cached_outfeats)
        # Clear the capture flag and cached outfeats.
        block.cached_outfeats = {}
        block.capture_outfeats = False

    for layer_idx in captured_layer_indices:
        # Subtract 22 from layer_idx to match the layer index in up_blocks[3].cached_outfeats.
        # 23, 24 -> 1, 2 (!! not 0, 1 !!)
        internal_idx = layer_idx - 22
        for k in captured_activations.keys():
            if k == 'outfeat':
                # Currently we only capture one block, up_blocks.3. So we hard-code the index 0.
                captured_activations['outfeat'][layer_idx] = all_cached_outfeats[0][internal_idx].to(out_dtype)
            else:
                # internal_idx is the index of layers in up_blocks.3.
                # Layers 22, 23 and 24 map to 0, 1 and 2.
                cached_activations = attn_capture_procs[internal_idx].cached_activations
                captured_activations[k][layer_idx] = cached_activations[k].to(out_dtype)

    return captured_activations
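
# Illustrative sketch (not part of the original module): one capture cycle around a UNet
# forward pass. `noisy_latents`, `timesteps` and `text_embs` are hypothetical tensors from
# the caller's training loop. It assumes up_blocks[3].forward has been patched with
# CrossAttnUpBlock2D_forward_capture, and that attn_capture_procs is the *list* of the three
# capture processors (e.g. list(attn_capture_procs.values())), since it is indexed by
# position in get_captured_activations().
def _example_capture_cycle(unet, unet_lora_modules, attn_capture_procs,
                           noisy_latents, timesteps, text_embs):
    outfeat_capture_blocks = [unet.up_blocks[3]]
    set_lora_and_capture_flags(unet, unet_lora_modules, attn_capture_procs,
                               outfeat_capture_blocks, outfeat_capture_blocks,
                               use_attn_lora=True, use_ffn_lora=True,
                               ffn_lora_adapter_name='recon_loss',
                               capture_ca_activations=True, shrink_cross_attn=False,
                               res_hidden_states_gradscale=0.2)
    noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states=text_embs).sample
    captured = get_captured_activations(True, attn_capture_procs, outfeat_capture_blocks,
                                        captured_layer_indices=[22, 23, 24])
    return noise_pred, captured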