|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import torch |
|
from typing import List, Union, Optional, Tuple, Dict, Any, Callable |
|
from diffusers.models.attention_processor import Attention
import torch.nn.functional as F
|
from .lora_controller import enable_lora |
|
from einops import rearrange |
|
import math |
|
from diffusers.models.embeddings import apply_rotary_emb |
|
|
|
|
|
def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None) -> torch.Tensor: |
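    """Reference (non-fused) attention that mirrors F.scaled_dot_product_attention but
    additionally returns the softmax attention weights.

    Used when attention maps need to be stored; dropout is applied in training mode.
    """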
|
|
|
L, S = query.size(-2), key.size(-2) |
|
B = query.size(0) |
|
scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale |
|
attn_bias = torch.zeros(B, 1, L, S, dtype=query.dtype, device=query.device) |
|
if is_causal: |
|
assert attn_mask is None |
|
assert False |
|
        temp_mask = torch.ones(L, S, dtype=torch.bool, device=query.device).tril(diagonal=0)

        attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))

        attn_bias = attn_bias.to(query.dtype)
|
|
|
if attn_mask is not None: |
|
if attn_mask.dtype == torch.bool: |
|
attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf")) |
|
else: |
|
attn_bias += attn_mask |
|
attn_weight = query @ key.transpose(-2, -1) * scale_factor |
|
attn_weight += attn_bias.to(attn_weight.device) |
|
attn_weight = torch.softmax(attn_weight, dim=-1) |
|
|
|
return torch.dropout(attn_weight, dropout_p, train=True) @ value, attn_weight |
|
|
|
def attn_forward( |
|
attn: Attention, |
|
hidden_states: torch.FloatTensor, |
|
encoder_hidden_states: torch.FloatTensor = None, |
|
condition_latents: torch.FloatTensor = None, |
|
text_cond_mask: Optional[torch.FloatTensor] = None, |
|
attention_mask: Optional[torch.FloatTensor] = None, |
|
image_rotary_emb: Optional[torch.Tensor] = None, |
|
cond_rotary_emb: Optional[torch.Tensor] = None, |
|
model_config: Optional[Dict[str, Any]] = {}, |
|
store_attn_map: bool = False, |
|
latent_height: Optional[int] = None, |
|
timestep: Optional[torch.Tensor] = None, |
|
last_attn_map: Optional[torch.Tensor] = None, |
|
condition_sblora_weight: Optional[float] = None, |
|
latent_sblora_weight: Optional[float] = None, |
|
) -> torch.FloatTensor: |
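    """Shared attention forward for double-stream (encoder_hidden_states given) and
    single-stream blocks.

    Optionally projects `condition_latents` into the same attention, builds a boolean or
    additive attention mask according to `model_config`, applies rotary embeddings, and can
    store text<->image attention maps on `attn` when `store_attn_map` is set.
    """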
|
batch_size, _, _ = ( |
|
hidden_states.shape |
|
if encoder_hidden_states is None |
|
else encoder_hidden_states.shape |
|
) |
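    # encoder_hidden_states is only provided by double-stream blocks; single-stream blocks
    # pass the fused [text, image] sequence as hidden_states. The query/key/value LoRA
    # branch is gated accordingly (latent_lora vs. sblock_lora).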
|
|
|
is_sblock = encoder_hidden_states is None |
|
is_dblock = not is_sblock |
|
|
|
with enable_lora( |
|
(attn.to_q, attn.to_k, attn.to_v), |
|
(is_dblock and model_config["latent_lora"]) or (is_sblock and model_config["sblock_lora"]), latent_sblora_weight=latent_sblora_weight |
|
): |
|
query = attn.to_q(hidden_states) |
|
key = attn.to_k(hidden_states) |
|
value = attn.to_v(hidden_states) |
|
|
|
inner_dim = key.shape[-1] |
|
head_dim = inner_dim // attn.heads |
|
|
|
query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) |
|
key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) |
|
value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) |
|
|
|
if attn.norm_q is not None: |
|
query = attn.norm_q(query) |
|
if attn.norm_k is not None: |
|
key = attn.norm_k(key) |
|
|
|
|
|
if encoder_hidden_states is not None: |
|
|
|
with enable_lora((attn.add_q_proj, attn.add_k_proj, attn.add_v_proj), model_config["text_lora"]): |
|
encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states) |
|
encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) |
|
encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) |
|
|
|
encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) |
|
encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) |
|
encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) |
|
|
|
if attn.norm_added_q is not None: |
|
encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj) |
|
if attn.norm_added_k is not None: |
|
encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj) |
|
|
|
|
|
query = torch.cat([encoder_hidden_states_query_proj, query], dim=2) |
|
key = torch.cat([encoder_hidden_states_key_proj, key], dim=2) |
|
value = torch.cat([encoder_hidden_states_value_proj, value], dim=2) |
|
|
|
if image_rotary_emb is not None: |
|
query = apply_rotary_emb(query, image_rotary_emb) |
|
key = apply_rotary_emb(key, image_rotary_emb) |
|
|
|
if condition_latents is not None: |
|
assert condition_latents.shape[0] == batch_size |
|
cond_length = condition_latents.shape[1] |
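        # Project the condition latents with either the base DiT weights or the
        # condition-specific LoRA branch, depending on the per-block-type config flags.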
|
|
|
cond_lora_activate = (is_dblock and model_config["use_condition_dblock_lora"]) or (is_sblock and model_config["use_condition_sblock_lora"]) |
|
with enable_lora( |
|
(attn.to_q, attn.to_k, attn.to_v), |
|
dit_activated=not cond_lora_activate, cond_activated=cond_lora_activate, latent_sblora_weight=condition_sblora_weight |
|
): |
|
cond_query = attn.to_q(condition_latents) |
|
cond_key = attn.to_k(condition_latents) |
|
cond_value = attn.to_v(condition_latents) |
|
|
|
cond_query = cond_query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) |
|
cond_key = cond_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) |
|
cond_value = cond_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) |
|
if attn.norm_q is not None: |
|
cond_query = attn.norm_q(cond_query) |
|
if attn.norm_k is not None: |
|
cond_key = attn.norm_k(cond_key) |
|
|
|
if cond_rotary_emb is not None: |
|
cond_query = apply_rotary_emb(cond_query, cond_rotary_emb) |
|
cond_key = apply_rotary_emb(cond_key, cond_rotary_emb) |
|
|
|
if model_config.get("text_cond_attn", False): |
|
if encoder_hidden_states is not None: |
|
assert text_cond_mask is not None |
|
img_length = hidden_states.shape[1] |
|
seq_length = encoder_hidden_states_query_proj.shape[2] |
|
assert len(text_cond_mask.shape) == 2 or len(text_cond_mask.shape) == 3 |
|
if len(text_cond_mask.shape) == 2: |
|
text_cond_mask = text_cond_mask.unsqueeze(-1) |
|
N = text_cond_mask.shape[-1] |
|
else: |
|
raise NotImplementedError() |
|
|
|
query = torch.cat([query, cond_query], dim=2) |
|
key = torch.cat([key, cond_key], dim=2) |
|
value = torch.cat([value, cond_value], dim=2) |
|
|
|
assert query.shape[2] == key.shape[2] |
|
assert query.shape[2] == cond_length + img_length + seq_length |
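            # text_cond_attn: block all text/image <-> condition attention by default, then
            # re-open text -> condition attention only for the condition(s) each text token
            # is associated with (text_cond_mask). Mask layout is [text, image, condition]
            # for both queries (rows) and keys (columns).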
|
|
|
attention_mask = torch.ones(batch_size, 1, query.shape[2], key.shape[2], device=query.device, dtype=torch.bool) |
|
attention_mask[..., -cond_length:, :-cond_length] = False |
|
attention_mask[..., :-cond_length, -cond_length:] = False |
|
|
|
if encoder_hidden_states is not None: |
|
tokens_per_cond = cond_length // N |
|
for i in range(batch_size): |
|
for j in range(N): |
|
start = seq_length + img_length + tokens_per_cond * j |
|
attention_mask[i, 0, :seq_length, start:start+tokens_per_cond] = text_cond_mask[i, :, j].unsqueeze(-1) |
|
|
|
elif model_config.get("union_cond_attn", False): |
|
query = torch.cat([query, cond_query], dim=2) |
|
key = torch.cat([key, cond_key], dim=2) |
|
value = torch.cat([value, cond_value], dim=2) |
|
|
|
attention_mask = torch.ones(batch_size, 1, query.shape[2], key.shape[2], device=query.device, dtype=torch.bool) |
|
cond_length = condition_latents.shape[1] |
|
assert len(text_cond_mask.shape) == 2 or len(text_cond_mask.shape) == 3 |
|
if len(text_cond_mask.shape) == 2: |
|
text_cond_mask = text_cond_mask.unsqueeze(-1) |
|
N = text_cond_mask.shape[-1] |
|
tokens_per_cond = cond_length // N |
|
|
|
seq_length = 0 |
|
if encoder_hidden_states is not None: |
|
seq_length = encoder_hidden_states_query_proj.shape[2] |
|
img_length = hidden_states.shape[1] |
|
else: |
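                # Single-stream block: hidden_states already contains text followed by
                # image tokens; the text prefix is assumed to be 128 tokens long.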
|
seq_length = 128 |
|
img_length = hidden_states.shape[1] - seq_length |
|
|
|
if not model_config.get("cond_cond_cross_attn", True): |
|
|
|
cond_start = seq_length + img_length |
|
attention_mask[:, :, cond_start:, cond_start:] = False |
|
|
|
for j in range(N): |
|
start = cond_start + tokens_per_cond * j |
|
end = cond_start + tokens_per_cond * (j + 1) |
|
attention_mask[..., start:end, start:end] = True |
|
|
|
|
|
if encoder_hidden_states is not None: |
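                # Double-stream block: text/image queries never attend to condition keys,
                # while condition queries keep full attention over the sequence.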
|
|
|
|
|
attention_mask[..., :-cond_length, -cond_length:] = False |
|
|
|
if model_config.get("use_attention_double", False) and last_attn_map is not None: |
|
attention_mask = torch.zeros(batch_size, 1, query.shape[2], key.shape[2], device=query.device, dtype=torch.bfloat16) |
|
last_attn_map = last_attn_map.to(query.device) |
|
attention_mask[..., seq_length:-cond_length, :seq_length] = torch.log(last_attn_map/last_attn_map.mean()*model_config["use_atten_lambda"]).view(-1, seq_length) |
|
|
|
|
|
else: |
|
|
|
if model_config.get("use_attention_single", False) and last_attn_map is not None: |
|
attention_mask = torch.zeros(batch_size, 1, query.shape[2], key.shape[2], device=query.device, dtype=torch.bfloat16) |
|
attention_mask[..., :seq_length, -cond_length:] = float('-inf') |
|
|
|
                    use_atten_lambdas = model_config["use_atten_lambda"] if len(model_config["use_atten_lambda"]) != 1 else model_config["use_atten_lambda"] * (N + 1)
|
attention_mask[..., -cond_length:, seq_length:-cond_length] = math.log(use_atten_lambdas[0]) |
|
last_attn_map = last_attn_map.to(query.device) |
|
|
|
cond2latents = [] |
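                    # For each sample and condition, pool the previous text -> image attention
                    # over that condition's text tokens, normalize it, and use its log as an
                    # additive bias steering image -> condition attention.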
|
for i in range(batch_size): |
|
AM = last_attn_map[i] |
|
for j in range(N): |
|
start = seq_length + img_length + tokens_per_cond * j |
|
mask = text_cond_mask[i, :, j] |
|
weighted_AM = AM * mask.unsqueeze(0).unsqueeze(0) |
|
|
|
cond2latent = weighted_AM.mean(-1) |
|
if model_config.get("attention_norm", "mean") == "max": |
|
cond2latent = cond2latent / cond2latent.max() |
|
else: |
|
cond2latent = cond2latent / cond2latent.mean() |
|
cond2latent = cond2latent.view(-1,) |
|
|
|
|
|
current_lambda = use_atten_lambdas[j+1] |
|
|
|
attention_mask[i, 0, seq_length:-cond_length, start:start+tokens_per_cond] = torch.log(current_lambda * cond2latent.unsqueeze(-1)) |
|
|
|
|
|
cond = mask.unsqueeze(-1).expand(-1, tokens_per_cond) |
|
sub_mask = attention_mask[i, 0, :seq_length, start:start+tokens_per_cond] |
|
attention_mask[i, 0, :seq_length, start:start+tokens_per_cond] = torch.where(cond, math.log(current_lambda), sub_mask) |
|
cond2latents.append( |
|
cond2latent.reshape(latent_height, -1).detach().cpu() |
|
) |
|
if store_attn_map: |
|
if not hasattr(attn, "cond2latents"): |
|
attn.cond2latents = [] |
|
attn.cond_timesteps = [] |
|
attn.cond2latents.append(torch.stack(cond2latents, dim=0)) |
|
attn.cond_timesteps.append(timestep.cpu()) |
|
|
|
pass |
|
else: |
|
raise NotImplementedError() |
|
if hasattr(attn, "c_factor"): |
|
assert False |
|
attention_mask = torch.zeros( |
|
query.shape[2], key.shape[2], device=query.device, dtype=query.dtype |
|
) |
|
bias = torch.log(attn.c_factor[0]) |
|
attention_mask[-cond_length:, :-cond_length] = bias |
|
attention_mask[:-cond_length, -cond_length:] = bias |
|
|
|
|
|
if store_attn_map and encoder_hidden_states is not None: |
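        # Use the explicit attention implementation so the per-head weights can be
        # sliced and stored on the Attention module.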
|
seq_length = encoder_hidden_states_query_proj.shape[2] |
|
img_length = hidden_states.shape[1] |
|
hidden_states, attention_probs = scaled_dot_product_attention( |
|
query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False |
|
) |
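        # Token layout is [text, image(, condition)]: t2i holds image-query x text-key
        # weights, i2t holds text-query x image-key weights transposed to image-major.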
|
|
|
t2i_attention_probs = attention_probs[:, :, seq_length:seq_length+img_length, :seq_length] |
|
|
|
i2t_attention_probs = attention_probs[:, :, :seq_length, seq_length:seq_length+img_length].transpose(-1, -2) |
|
|
|
if not hasattr(attn, "attn_maps"): |
|
attn.attn_maps = [] |
|
attn.timestep = [] |
|
|
|
attn.attn_maps.append( |
|
( |
|
rearrange(t2i_attention_probs, 'B attn_head (H W) attn_dim -> B attn_head H W attn_dim', H=latent_height), |
|
rearrange(i2t_attention_probs, 'B attn_head (H W) attn_dim -> B attn_head H W attn_dim', H=latent_height), |
|
) |
|
) |
|
|
|
attn.timestep.append(timestep.cpu()) |
|
has_nan = torch.isnan(hidden_states).any().item() |
|
if has_nan: |
|
print("[attn_forward] detect nan hidden_states in store_attn_map") |
|
else: |
|
hidden_states = F.scaled_dot_product_attention( |
|
query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False |
|
) |
|
has_nan = torch.isnan(hidden_states).any().item() |
|
if has_nan: |
|
print("[attn_forward] detect nan hidden_states") |
|
|
|
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim).to(query.dtype) |
|
|
|
if encoder_hidden_states is not None: |
|
if condition_latents is not None: |
|
encoder_hidden_states, hidden_states, condition_latents = ( |
|
hidden_states[:, : encoder_hidden_states.shape[1]], |
|
hidden_states[ |
|
:, encoder_hidden_states.shape[1] : -condition_latents.shape[1] |
|
], |
|
hidden_states[:, -condition_latents.shape[1] :], |
|
) |
|
if model_config.get("latent_cond_by_text_attn", False): |
|
|
|
hidden_states = new_hidden_states |
|
|
|
else: |
|
encoder_hidden_states, hidden_states = ( |
|
hidden_states[:, : encoder_hidden_states.shape[1]], |
|
hidden_states[:, encoder_hidden_states.shape[1] :], |
|
) |
|
|
|
|
|
with enable_lora((attn.to_out[0],), model_config["latent_lora"]): |
|
hidden_states = attn.to_out[0](hidden_states) |
|
hidden_states = attn.to_out[1](hidden_states) |
|
with enable_lora((attn.to_add_out,), model_config["text_lora"]): |
|
encoder_hidden_states = attn.to_add_out(encoder_hidden_states) |
|
|
|
if condition_latents is not None: |
|
cond_lora_activate = model_config["use_condition_dblock_lora"] |
|
with enable_lora( |
|
(attn.to_out[0],), |
|
dit_activated=not cond_lora_activate, cond_activated=cond_lora_activate, |
|
): |
|
condition_latents = attn.to_out[0](condition_latents) |
|
condition_latents = attn.to_out[1](condition_latents) |
|
|
|
|
|
return ( |
|
(hidden_states, encoder_hidden_states, condition_latents) |
|
if condition_latents is not None |
|
else (hidden_states, encoder_hidden_states) |
|
) |
|
elif condition_latents is not None: |
|
hidden_states, condition_latents = ( |
|
hidden_states[:, : -condition_latents.shape[1]], |
|
hidden_states[:, -condition_latents.shape[1] :], |
|
) |
|
return hidden_states, condition_latents |
|
else: |
|
return hidden_states |
|
|
|
|
|
def set_delta_by_start_end( |
|
start_ends, |
|
src_delta_emb, src_delta_emb_pblock, |
|
delta_emb, delta_emb_pblock, delta_emb_mask, |
|
): |
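    """Copy per-token modulation deltas from source spans (sample j, src_s:src_e) into
    target spans (sample i, tar_s:tar_e) and mark those positions in delta_emb_mask."""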
|
for (i, j, src_s, src_e, tar_s, tar_e) in start_ends: |
|
if src_delta_emb is not None: |
|
delta_emb[i, tar_s:tar_e] = src_delta_emb[j, src_s:src_e] |
|
if src_delta_emb_pblock is not None: |
|
delta_emb_pblock[i, tar_s:tar_e] = src_delta_emb_pblock[j, src_s:src_e] |
|
delta_emb_mask[i, tar_s:tar_e] = True |
|
return delta_emb, delta_emb_pblock, delta_emb_mask |
|
|
|
def norm1_context_forward( |
|
self, |
|
x: torch.Tensor, |
|
condition_latents: Optional[torch.Tensor] = None, |
|
timestep: Optional[torch.Tensor] = None, |
|
class_labels: Optional[torch.LongTensor] = None, |
|
hidden_dtype: Optional[torch.dtype] = None, |
|
emb: Optional[torch.Tensor] = None, |
|
delta_emb: Optional[torch.Tensor] = None, |
|
delta_emb_cblock: Optional[torch.Tensor] = None, |
|
delta_emb_mask: Optional[torch.Tensor] = None, |
|
delta_start_ends = None, |
|
mod_adapter = None, |
|
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: |
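    """AdaLayerNormZero-style modulation for the text stream.

    When `delta_emb` (and optionally `delta_emb_cblock`) is given, per-token deltas are added
    to the pooled embedding before the modulation projection and applied only where
    `delta_emb_mask` is True; other tokens keep the unmodified modulation.
    """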
|
batch_size, seq_length = x.shape[:2] |
|
|
|
if mod_adapter is not None: |
|
assert False |
|
|
|
if delta_emb is None: |
|
emb = self.linear(self.silu(emb)) |
|
emb = emb.unsqueeze(1) |
|
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=-1) |
|
x = self.norm(x) * (1 + scale_msa) + shift_msa |
|
return x, gate_msa, shift_mlp, scale_mlp, gate_mlp |
|
else: |
|
|
|
emb_orig = self.linear(self.silu(emb)).unsqueeze(1).expand((-1, seq_length, -1)) |
|
|
|
if delta_emb_cblock is None: |
|
emb_new = self.linear(self.silu(emb.unsqueeze(1) + delta_emb)) |
|
else: |
|
emb_new = self.linear(self.silu(emb.unsqueeze(1) + delta_emb + delta_emb_cblock)) |
|
emb = torch.where(delta_emb_mask.unsqueeze(-1), emb_new, emb_orig) |
|
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=-1) |
|
x = self.norm(x) * (1 + scale_msa) + shift_msa |
|
return x, gate_msa, shift_mlp, scale_mlp, gate_mlp |
|
|
|
|
|
def norm1_forward( |
|
self, |
|
x: torch.Tensor, |
|
timestep: Optional[torch.Tensor] = None, |
|
class_labels: Optional[torch.LongTensor] = None, |
|
hidden_dtype: Optional[torch.dtype] = None, |
|
emb: Optional[torch.Tensor] = None, |
|
delta_emb: Optional[torch.Tensor] = None, |
|
delta_emb_cblock: Optional[torch.Tensor] = None, |
|
delta_emb_mask: Optional[torch.Tensor] = None, |
|
t2i_attn_map: Optional[torch.Tensor] = None, |
|
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: |
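    """AdaLayerNormZero-style modulation for the image stream; only the unmodulated path
    (delta_emb is None) is implemented."""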
|
if delta_emb is None: |
|
emb = self.linear(self.silu(emb)) |
|
emb = emb.unsqueeze(1) |
|
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=-1) |
|
x = self.norm(x) * (1 + scale_msa) + shift_msa |
|
return x, gate_msa, shift_mlp, scale_mlp, gate_mlp |
|
else: |
|
raise NotImplementedError() |
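        # Unreachable: kept for reference. It would spread text-token deltas onto image
        # tokens via the text -> image attention map (t2i_attn_map @ emb).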
|
batch_size, HW = x.shape[:2] |
|
seq_length = t2i_attn_map.shape[-1] |
|
|
|
emb_orig = self.linear(self.silu(emb)).unsqueeze(1).expand((-1, seq_length, -1)) |
|
|
|
if delta_emb_cblock is None: |
|
emb_new = self.linear(self.silu(emb.unsqueeze(1) + delta_emb)) |
|
else: |
|
emb_new = self.linear(self.silu(emb.unsqueeze(1) + delta_emb + delta_emb_cblock)) |
|
|
|
emb = torch.where(delta_emb_mask.unsqueeze(-1), emb_new, emb_orig) |
|
emb = t2i_attn_map @ emb |
|
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=-1) |
|
x = self.norm(x) * (1 + scale_msa) + shift_msa |
|
return x, gate_msa, shift_mlp, scale_mlp, gate_mlp |
|
|
|
|
|
def block_forward( |
|
self, |
|
hidden_states: torch.FloatTensor, |
|
encoder_hidden_states: torch.FloatTensor, |
|
condition_latents: torch.FloatTensor, |
|
temb: torch.FloatTensor, |
|
cond_temb: torch.FloatTensor, |
|
text_cond_mask: Optional[torch.FloatTensor] = None, |
|
delta_emb: Optional[torch.FloatTensor] = None, |
|
delta_emb_cblock: Optional[torch.FloatTensor] = None, |
|
delta_emb_mask: Optional[torch.Tensor] = None, |
|
delta_start_ends = None, |
|
cond_rotary_emb=None, |
|
image_rotary_emb=None, |
|
model_config: Optional[Dict[str, Any]] = {}, |
|
store_attn_map: bool = False, |
|
use_text_mod: bool = True, |
|
use_img_mod: bool = False, |
|
mod_adapter = None, |
|
latent_height: Optional[int] = None, |
|
timestep: Optional[torch.Tensor] = None, |
|
last_attn_map: Optional[torch.Tensor] = None, |
|
): |
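    """Double-stream transformer block: modulate the text/image/condition streams, run joint
    attention (attn_forward), then the per-stream feed-forwards, with LoRA branches toggled
    by model_config."""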
|
batch_size = hidden_states.shape[0] |
|
use_cond = condition_latents is not None |
|
|
|
train_partial_latent_lora = model_config.get("train_partial_latent_lora", False) |
|
train_partial_text_lora = model_config.get("train_partial_text_lora", False) |
|
if train_partial_latent_lora: |
|
train_partial_latent_lora_layers = model_config.get("train_partial_latent_lora_layers", "") |
|
activate_norm1 = activate_ff = True |
|
if "norm1" not in train_partial_latent_lora_layers: |
|
activate_norm1 = False |
|
if "ff" not in train_partial_latent_lora_layers: |
|
activate_ff = False |
|
|
|
if train_partial_text_lora: |
|
train_partial_text_lora_layers = model_config.get("train_partial_text_lora_layers", "") |
|
activate_norm1_context = activate_ff_context = True |
|
if "norm1" not in train_partial_text_lora_layers: |
|
activate_norm1_context = False |
|
if "ff" not in train_partial_text_lora_layers: |
|
activate_ff_context = False |
|
|
|
if use_cond: |
|
cond_lora_activate = model_config["use_condition_dblock_lora"] |
|
with enable_lora( |
|
(self.norm1.linear,), |
|
dit_activated=activate_norm1 if train_partial_latent_lora else not cond_lora_activate, cond_activated=cond_lora_activate, |
|
): |
|
norm_condition_latents, cond_gate_msa, cond_shift_mlp, cond_scale_mlp, cond_gate_mlp = ( |
|
norm1_forward( |
|
self.norm1, |
|
condition_latents, |
|
emb=cond_temb, |
|
) |
|
) |
|
delta_emb_img = delta_emb_img_cblock = None |
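    # When both image and text modulation are active, the delta embedding carries the
    # image half and the text half concatenated along the channel dimension.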
|
if use_img_mod and use_text_mod: |
|
if delta_emb is not None: |
|
delta_emb_img, delta_emb = delta_emb.chunk(2, dim=-1) |
|
if delta_emb_cblock is not None: |
|
delta_emb_img_cblock, delta_emb_cblock = delta_emb_cblock.chunk(2, dim=-1) |
|
|
|
with enable_lora((self.norm1.linear,), activate_norm1 if train_partial_latent_lora else model_config["latent_lora"]): |
|
if use_img_mod and encoder_hidden_states is not None: |
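            # Build a detached text -> image attention map from the unmodulated streams; it
            # is meant to let norm1_forward spread text-token deltas onto image tokens.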
|
with torch.no_grad(): |
|
attn = self.attn |
|
|
|
norm_img = self.norm1(hidden_states, emb=temb)[0] |
|
norm_text = self.norm1_context(encoder_hidden_states, emb=temb)[0] |
|
|
|
img_query = attn.to_q(norm_img) |
|
img_key = attn.to_k(norm_img) |
|
text_query = attn.add_q_proj(norm_text) |
|
text_key = attn.add_k_proj(norm_text) |
|
|
|
inner_dim = img_key.shape[-1] |
|
head_dim = inner_dim // attn.heads |
|
|
|
img_query = img_query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) |
|
img_key = img_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) |
|
text_query = text_query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) |
|
text_key = text_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) |
|
|
|
if attn.norm_q is not None: |
|
img_query = attn.norm_q(img_query) |
|
if attn.norm_added_q is not None: |
|
text_query = attn.norm_added_q(text_query) |
|
if attn.norm_k is not None: |
|
img_key = attn.norm_k(img_key) |
|
if attn.norm_added_k is not None: |
|
text_key = attn.norm_added_k(text_key) |
|
|
|
query = torch.cat([text_query, img_query], dim=2) |
|
key = torch.cat([text_key, img_key], dim=2) |
|
if image_rotary_emb is not None: |
|
query = apply_rotary_emb(query, image_rotary_emb) |
|
key = apply_rotary_emb(key, image_rotary_emb) |
|
|
|
seq_length = text_query.shape[2] |
|
|
|
scale_factor = 1 / math.sqrt(query.size(-1)) |
|
t2i_attn_map = query @ key.transpose(-2, -1) * scale_factor |
|
t2i_attn_map = t2i_attn_map.mean(1)[:, seq_length:, :seq_length] |
|
t2i_attn_map = torch.softmax(t2i_attn_map, dim=-1) |
|
|
|
else: |
|
t2i_attn_map = None |
|
|
|
norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = ( |
|
norm1_forward( |
|
self.norm1, |
|
hidden_states, |
|
emb=temb, |
|
delta_emb=delta_emb_img, |
|
delta_emb_cblock=delta_emb_img_cblock, |
|
delta_emb_mask=delta_emb_mask, |
|
t2i_attn_map=t2i_attn_map, |
|
) |
|
) |
|
|
|
with enable_lora((self.norm1_context.linear,), activate_norm1_context if train_partial_text_lora else model_config["text_lora"]): |
|
norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = ( |
|
norm1_context_forward( |
|
self.norm1_context, |
|
encoder_hidden_states, |
|
emb=temb, |
|
delta_emb=delta_emb if use_text_mod else None, |
|
delta_emb_cblock=delta_emb_cblock if use_text_mod else None, |
|
delta_emb_mask=delta_emb_mask if use_text_mod else None, |
|
delta_start_ends=delta_start_ends if use_text_mod else None, |
|
mod_adapter=mod_adapter, |
|
condition_latents=condition_latents, |
|
) |
|
) |
|
|
|
|
|
result = attn_forward( |
|
self.attn, |
|
model_config=model_config, |
|
hidden_states=norm_hidden_states, |
|
encoder_hidden_states=norm_encoder_hidden_states, |
|
condition_latents=norm_condition_latents if use_cond else None, |
|
text_cond_mask=text_cond_mask if use_cond else None, |
|
image_rotary_emb=image_rotary_emb, |
|
cond_rotary_emb=cond_rotary_emb if use_cond else None, |
|
store_attn_map=store_attn_map, |
|
latent_height=latent_height, |
|
timestep=timestep, |
|
last_attn_map=last_attn_map, |
|
) |
|
attn_output, context_attn_output = result[:2] |
|
cond_attn_output = result[2] if use_cond else None |
|
|
|
|
|
|
|
attn_output = gate_msa * attn_output |
|
hidden_states = hidden_states + attn_output |
|
|
|
context_attn_output = c_gate_msa * context_attn_output |
|
encoder_hidden_states = encoder_hidden_states + context_attn_output |
|
|
|
if use_cond: |
|
cond_attn_output = cond_gate_msa * cond_attn_output |
|
condition_latents = condition_latents + cond_attn_output |
|
if model_config.get("add_cond_attn", False): |
|
hidden_states += cond_attn_output |
|
|
|
|
|
|
|
norm_hidden_states = self.norm2(hidden_states) |
|
norm_hidden_states = ( |
|
norm_hidden_states * (1 + scale_mlp) + shift_mlp |
|
) |
|
|
|
norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states) |
|
norm_encoder_hidden_states = ( |
|
norm_encoder_hidden_states * (1 + c_scale_mlp) + c_shift_mlp |
|
) |
|
|
|
if use_cond: |
|
norm_condition_latents = self.norm2(condition_latents) |
|
norm_condition_latents = ( |
|
norm_condition_latents * (1 + cond_scale_mlp) + cond_shift_mlp |
|
) |
|
|
|
|
|
with enable_lora((self.ff.net[2],), activate_ff if train_partial_latent_lora else model_config["latent_lora"]): |
|
|
|
ff_output = self.ff(norm_hidden_states) |
|
ff_output = gate_mlp * ff_output |
|
|
|
with enable_lora((self.ff_context.net[2],), activate_ff_context if train_partial_text_lora else model_config["text_lora"]): |
|
context_ff_output = self.ff_context(norm_encoder_hidden_states) |
|
context_ff_output = c_gate_mlp * context_ff_output |
|
|
|
if use_cond: |
|
cond_lora_activate = model_config["use_condition_dblock_lora"] |
|
with enable_lora( |
|
(self.ff.net[2],), |
|
dit_activated=activate_ff if train_partial_latent_lora else not cond_lora_activate, cond_activated=cond_lora_activate, |
|
): |
|
cond_ff_output = self.ff(norm_condition_latents) |
|
cond_ff_output = cond_gate_mlp * cond_ff_output |
|
|
|
|
|
hidden_states = hidden_states + ff_output |
|
encoder_hidden_states = encoder_hidden_states + context_ff_output |
|
if use_cond: |
|
condition_latents = condition_latents + cond_ff_output |
|
|
|
|
|
if encoder_hidden_states.dtype == torch.float16: |
|
encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504) |
|
|
|
return encoder_hidden_states, hidden_states, condition_latents if use_cond else None |
|
|
|
def single_norm_forward( |
|
self, |
|
x: torch.Tensor, |
|
timestep: Optional[torch.Tensor] = None, |
|
class_labels: Optional[torch.LongTensor] = None, |
|
hidden_dtype: Optional[torch.dtype] = None, |
|
emb: Optional[torch.Tensor] = None, |
|
delta_emb: Optional[torch.Tensor] = None, |
|
delta_emb_cblock: Optional[torch.Tensor] = None, |
|
delta_emb_mask: Optional[torch.Tensor] = None, |
|
) -> Tuple[torch.Tensor, torch.Tensor]:
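    """AdaLayerNormZeroSingle-style modulation for single-stream blocks. With per-token
    deltas, only the leading text positions (length of delta_emb_mask) receive the modified
    embedding; the remaining image positions keep the original one."""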
|
if delta_emb is None: |
|
emb = self.linear(self.silu(emb)) |
|
emb = emb.unsqueeze(1) |
|
shift_msa, scale_msa, gate_msa = emb.chunk(3, dim=-1) |
|
x = self.norm(x) * (1 + scale_msa) + shift_msa |
|
return x, gate_msa |
|
else: |
|
img_text_seq_length = x.shape[1] |
|
text_seq_length = delta_emb_mask.shape[1] |
|
|
|
emb_orig = self.linear(self.silu(emb)).unsqueeze(1).expand((-1, img_text_seq_length, -1)) |
|
|
|
if delta_emb_cblock is None: |
|
emb_new = self.linear(self.silu(emb.unsqueeze(1) + delta_emb)) |
|
else: |
|
emb_new = self.linear(self.silu(emb.unsqueeze(1) + delta_emb + delta_emb_cblock)) |
|
|
|
emb_text = torch.where(delta_emb_mask.unsqueeze(-1), emb_new, emb_orig[:, :text_seq_length]) |
|
emb_img = emb_orig[:, text_seq_length:] |
|
emb = torch.cat([emb_text, emb_img], dim=1) |
|
|
|
shift_msa, scale_msa, gate_msa = emb.chunk(3, dim=-1) |
|
x = self.norm(x) * (1 + scale_msa) + shift_msa |
|
return x, gate_msa |
|
|
|
|
|
def single_block_forward( |
|
self, |
|
hidden_states: torch.FloatTensor, |
|
temb: torch.FloatTensor, |
|
image_rotary_emb=None, |
|
condition_latents: torch.FloatTensor = None, |
|
text_cond_mask: torch.FloatTensor = None, |
|
cond_temb: torch.FloatTensor = None, |
|
delta_emb: Optional[torch.FloatTensor] = None, |
|
delta_emb_cblock: Optional[torch.FloatTensor] = None, |
|
delta_emb_mask: Optional[torch.Tensor] = None, |
|
use_text_mod: bool = True, |
|
use_img_mod: bool = False, |
|
cond_rotary_emb=None, |
|
latent_height: Optional[int] = None, |
|
timestep: Optional[torch.Tensor] = None, |
|
store_attn_map: bool = False, |
|
model_config: Optional[Dict[str, Any]] = {}, |
|
last_attn_map: Optional[torch.Tensor] = None, |
|
latent_sblora_weight=None, |
|
condition_sblora_weight=None, |
|
): |
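    """Single-stream transformer block: parallel attention + MLP over the fused
    [text, image] sequence, with an optional condition-latent stream routed through
    condition-specific LoRA branches."""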
|
|
|
using_cond = condition_latents is not None |
|
residual = hidden_states |
|
|
|
train_partial_lora = model_config.get("train_partial_lora", False) |
|
if train_partial_lora: |
|
train_partial_lora_layers = model_config.get("train_partial_lora_layers", "") |
|
activate_norm = activate_projmlp = activate_projout = True |
|
|
|
if "norm" not in train_partial_lora_layers: |
|
activate_norm = False |
|
if "projmlp" not in train_partial_lora_layers: |
|
activate_projmlp = False |
|
if "projout" not in train_partial_lora_layers: |
|
activate_projout = False |
|
|
|
with enable_lora((self.norm.linear,), activate_norm if train_partial_lora else model_config["sblock_lora"], latent_sblora_weight=latent_sblora_weight): |
|
|
|
norm_hidden_states, gate = single_norm_forward( |
|
self.norm, |
|
hidden_states, |
|
emb=temb, |
|
delta_emb=delta_emb if use_text_mod else None, |
|
delta_emb_cblock=delta_emb_cblock if use_text_mod else None, |
|
delta_emb_mask=delta_emb_mask if use_text_mod else None, |
|
) |
|
with enable_lora((self.proj_mlp,), activate_projmlp if train_partial_lora else model_config["sblock_lora"], latent_sblora_weight=latent_sblora_weight): |
|
mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states)) |
|
if using_cond: |
|
cond_lora_activate = model_config["use_condition_sblock_lora"] |
|
with enable_lora( |
|
(self.norm.linear,), |
|
dit_activated=activate_norm if train_partial_lora else not cond_lora_activate, cond_activated=cond_lora_activate, latent_sblora_weight=condition_sblora_weight |
|
): |
|
residual_cond = condition_latents |
|
norm_condition_latents, cond_gate = self.norm(condition_latents, emb=cond_temb) |
|
with enable_lora( |
|
(self.proj_mlp,), |
|
dit_activated=activate_projmlp if train_partial_lora else not cond_lora_activate, cond_activated=cond_lora_activate, latent_sblora_weight=condition_sblora_weight |
|
): |
|
mlp_cond_hidden_states = self.act_mlp(self.proj_mlp(norm_condition_latents)) |
|
|
|
attn_output = attn_forward( |
|
self.attn, |
|
model_config=model_config, |
|
hidden_states=norm_hidden_states, |
|
image_rotary_emb=image_rotary_emb, |
|
last_attn_map=last_attn_map, |
|
latent_height=latent_height, |
|
store_attn_map=store_attn_map, |
|
timestep=timestep, |
|
latent_sblora_weight=latent_sblora_weight, |
|
condition_sblora_weight=condition_sblora_weight, |
|
**( |
|
{ |
|
"condition_latents": norm_condition_latents, |
|
"cond_rotary_emb": cond_rotary_emb if using_cond else None, |
|
"text_cond_mask": text_cond_mask if using_cond else None, |
|
} |
|
if using_cond |
|
else {} |
|
), |
|
) |
|
if using_cond: |
|
attn_output, cond_attn_output = attn_output |
|
|
|
with enable_lora((self.proj_out,), activate_projout if train_partial_lora else model_config["sblock_lora"], latent_sblora_weight=latent_sblora_weight): |
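        # Parallel-block design: concatenate the attention output with the MLP activations,
        # project back with proj_out, then gate and add the residual.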
|
hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2) |
|
|
|
hidden_states = gate * self.proj_out(hidden_states) |
|
hidden_states = residual + hidden_states |
|
if using_cond: |
|
cond_lora_activate = model_config["use_condition_sblock_lora"] |
|
with enable_lora( |
|
(self.proj_out,), |
|
dit_activated=activate_projout if train_partial_lora else not cond_lora_activate, cond_activated=cond_lora_activate, latent_sblora_weight=condition_sblora_weight |
|
): |
|
condition_latents = torch.cat([cond_attn_output, mlp_cond_hidden_states], dim=2) |
|
cond_gate = cond_gate.unsqueeze(1) |
|
condition_latents = cond_gate * self.proj_out(condition_latents) |
|
condition_latents = residual_cond + condition_latents |
|
|
|
if hidden_states.dtype == torch.float16: |
|
hidden_states = hidden_states.clip(-65504, 65504) |
|
|
|
return hidden_states if not using_cond else (hidden_states, condition_latents) |
|
|