# unik3d/layers/attention.py

from functools import partial
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange

from .layer_scale import LayerScale
from .mlp import MLP


class SimpleAttention(nn.Module):
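    """Multi-head (optionally cosine-similarity) attention without residual
    connections or an MLP; queries come from ``x`` and keys/values from
    ``context`` (``x`` itself when no context is given).

    Example (illustrative sketch; the shapes below are assumptions, not values
    from the original demo)::

        attn = SimpleAttention(dim=256, num_heads=4)
        x = torch.randn(2, 100, 256)      # (batch, tokens, dim)
        ctx = torch.randn(2, 50, 256)     # optional cross-attention context
        out = attn(x, context=ctx)        # -> (2, 100, 256)
    """
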
def __init__(
self,
dim: int,
num_heads: int = 4,
dropout: float = 0.0,
cosine: bool = False,
context_dim: int | None = None,
):
super().__init__()
self.dropout = dropout
self.num_heads = num_heads
self.hidden_dim = dim
context_dim = context_dim or dim
self.kv = nn.Linear(context_dim, dim * 2, bias=False)
self.q = nn.Linear(dim, dim, bias=False)
self.norm_attnx = nn.LayerNorm(dim)
self.norm_attnctx = nn.LayerNorm(context_dim)
self.cosine = cosine
self.out = nn.Linear(dim, dim, bias=False)

    def forward(
self,
x: torch.Tensor,
attn_bias: torch.Tensor | None = None,
context: torch.Tensor | None = None,
pos_embed: torch.Tensor | None = None,
pos_embed_context: torch.Tensor | None = None,
rope: nn.Module | None = None,
rope_pos: torch.Tensor | None = None,
) -> torch.Tensor:
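        """Attend from ``x`` to ``context`` (defaults to ``x``).

        Positional information comes either from ``rope`` (applied to queries
        and keys) or from additive ``pos_embed`` / ``pos_embed_context``;
        ``rope_pos`` is accepted for interface compatibility but unused here.
        """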
context = x if context is None else context
x = self.norm_attnx(x)
context = self.norm_attnctx(context)
k, v = rearrange(
self.kv(context), "b n (kv h d) -> b h n d kv", h=self.num_heads, kv=2
).unbind(dim=-1)
q = rearrange(self.q(x), "b n (h d) -> b h n d", h=self.num_heads)
if rope is not None:
q = rope(q)
k = rope(k)
else:
if pos_embed is not None:
pos_embed = rearrange(
pos_embed, "b n (h d) -> b h n d", h=self.num_heads
)
q = q + pos_embed
if pos_embed_context is not None:
pos_embed_context = rearrange(
pos_embed_context, "b n (h d) -> b h n d", h=self.num_heads
)
k = k + pos_embed_context
if self.cosine:
q, k = map(partial(F.normalize, p=2, dim=-1), (q, k)) # cosine sim
        x = F.scaled_dot_product_attention(
            q, k, v,
            dropout_p=self.dropout if self.training else 0.0,  # no attention dropout at eval
            attn_mask=attn_bias,
        )
x = rearrange(x, "b h n d -> b n (h d)")
x = self.out(x)
return x


class AttentionBlock(nn.Module):
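    """Pre-norm attention block: attention followed by an MLP, each with a
    residual connection and optional LayerScale.

    ``residual_ls`` moves the first LayerScale from the attention branch onto
    its residual path, and ``detach_query`` blocks gradients from flowing back
    through the query input.

    Example (illustrative sketch; shapes are assumptions)::

        block = AttentionBlock(dim=256, num_heads=8, expansion=4)
        x = torch.randn(2, 100, 256)
        y = block(x)                      # self-attention, -> (2, 100, 256)
    """
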
def __init__(
self,
dim: int,
num_heads: int = 4,
expansion: int = 4,
dropout: float = 0.0,
cosine: bool = False,
gated: bool = False,
layer_scale: float = 1.0,
context_dim: int | None = None,
detach_query: bool = False,
residual_ls: bool = False,
):
super().__init__()
self.dropout = dropout
self.num_heads = num_heads
self.hidden_dim = dim
context_dim = dim if context_dim is None else context_dim
self.mlp = MLP(dim, expansion=expansion, dropout=dropout, gated=gated)
self.kv = nn.Linear(context_dim, dim * 2, bias=False)
self.q = nn.Linear(dim, dim, bias=False)
self.norm_attnx = nn.LayerNorm(dim)
self.norm_attnctx = nn.LayerNorm(context_dim)
self.cosine = cosine
self.out = nn.Linear(dim, dim, bias=False)
self.ls1_1 = (
LayerScale(dim, layer_scale)
if layer_scale > 0.0 and not residual_ls
else nn.Identity()
)
self.ls1_2 = (
LayerScale(dim, layer_scale)
if layer_scale > 0.0 and residual_ls
else nn.Identity()
)
self.ls2 = LayerScale(dim, layer_scale) if layer_scale > 0.0 else nn.Identity()
self.detach_query = detach_query

    def attn(
self,
x: torch.Tensor,
attn_bias: torch.Tensor | None = None,
context: torch.Tensor | None = None,
pos_embed: torch.Tensor | None = None,
pos_embed_context: torch.Tensor | None = None,
rope: nn.Module | None = None,
rope_pos: torch.Tensor | None = None,
) -> torch.Tensor:
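        """Multi-head scaled-dot-product attention from ``x`` to ``context``.

        ``rope`` (with ``rope_pos`` as input positions) is applied to queries
        and keys when given; otherwise ``pos_embed`` / ``pos_embed_context``
        are added per head. ``context`` is filled in by ``forward`` and must
        not be ``None`` here.
        """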
if self.detach_query:
x = x.detach()
x = self.norm_attnx(x)
context = self.norm_attnctx(context)
k, v = rearrange(
self.kv(context), "b n (kv h d) -> b h n d kv", h=self.num_heads, kv=2
).unbind(dim=-1)
q = rearrange(self.q(x), "b n (h d) -> b h n d", h=self.num_heads)
if rope is not None:
q = rope(q.permute(0, 2, 1, 3), input_pos=rope_pos).permute(0, 2, 1, 3)
k = rope(k.permute(0, 2, 1, 3), input_pos=rope_pos).permute(0, 2, 1, 3)
else:
if pos_embed is not None:
pos_embed = rearrange(
pos_embed, "b n (h d) -> b h n d", h=self.num_heads
)
q = q + pos_embed
if pos_embed_context is not None:
pos_embed_context = rearrange(
pos_embed_context, "b n (h d) -> b h n d", h=self.num_heads
)
k = k + pos_embed_context
if self.cosine:
q, k = map(partial(F.normalize, p=2, dim=-1), (q, k)) # cosine sim
        x = F.scaled_dot_product_attention(
            q, k, v,
            dropout_p=self.dropout if self.training else 0.0,  # no attention dropout at eval
            attn_mask=attn_bias,
        )
x = rearrange(x, "b h n d -> b n (h d)")
x = self.out(x)
return x

    def forward(
self,
x: torch.Tensor,
context: torch.Tensor | None = None,
pos_embed: torch.Tensor | None = None,
pos_embed_context: torch.Tensor | None = None,
attn_bias: torch.Tensor | None = None,
rope: nn.Module | None = None,
rope_pos: torch.Tensor | None = None,
) -> torch.Tensor:
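        """Apply attention (self-attention when ``context`` is ``None``) and
        the MLP, each behind its residual connection and optional LayerScale."""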
context = x if context is None else context
x = self.ls1_1(
self.attn(
x,
rope=rope,
rope_pos=rope_pos,
attn_bias=attn_bias,
context=context,
pos_embed=pos_embed,
pos_embed_context=pos_embed_context,
)
) + self.ls1_2(x)
x = self.ls2(self.mlp(x)) + x
return x


class AttentionLayer(nn.Module):
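    """Stack of ``num_blocks`` identical ``AttentionBlock`` modules applied in
    sequence.

    Example (illustrative sketch; shapes are assumptions)::

        layer = AttentionLayer(num_blocks=2, dim=256, num_heads=8)
        x = torch.randn(2, 100, 256)
        y = layer(x)                      # -> (2, 100, 256)
    """
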
def __init__(
self,
num_blocks: int,
dim: int,
num_heads: int = 4,
expansion: int = 4,
dropout: float = 0.0,
cosine: bool = False,
gated: bool = False,
layer_scale: float = 1.0,
context_dim: int | None = None,
detach_query: bool = False,
residual_ls: bool = False,
):
super().__init__()
self.layers = nn.ModuleList(
[
AttentionBlock(
dim=dim,
num_heads=num_heads,
expansion=expansion,
dropout=dropout,
cosine=cosine,
gated=gated,
layer_scale=layer_scale,
context_dim=context_dim,
detach_query=detach_query,
residual_ls=residual_ls,
)
for _ in range(num_blocks)
]
)

    def forward(
self,
x: torch.Tensor,
context: torch.Tensor | None = None,
pos_embed: torch.Tensor | None = None,
pos_embed_context: torch.Tensor | None = None,
attn_bias: torch.Tensor | None = None,
rope: nn.Module | None = None,
rope_pos: torch.Tensor | None = None,
) -> torch.Tensor:
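        """Run ``x`` through each block in turn, reusing the same context,
        positional embeddings, and attention bias."""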
for layer in self.layers:
x = layer(
x,
context=context,
pos_embed=pos_embed,
pos_embed_context=pos_embed_context,
attn_bias=attn_bias,
rope=rope,
rope_pos=rope_pos,
)
return x


class AttentionDecoderBlock(nn.Module):
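    """Decoder-style block: cross-attention to ``context``, then
    self-attention, then an MLP, each with a residual connection and optional
    LayerScale. With ``single_head_ca=True`` the cross-attention uses a single
    head while the self-attention keeps ``num_heads`` heads.

    Example (illustrative sketch; shapes are assumptions)::

        block = AttentionDecoderBlock(dim=256, num_heads=8, context_dim=512)
        queries = torch.randn(2, 64, 256)
        memory = torch.randn(2, 100, 512)
        out = block(queries, context=memory)   # -> (2, 64, 256)
    """
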
def __init__(
self,
dim: int,
num_heads: int = 4,
expansion: int = 4,
dropout: float = 0.0,
cosine: bool = False,
gated: bool = False,
layer_scale: float = 1.0,
context_dim: int | None = None,
single_head_ca: bool = True,
):
super().__init__()
self.dropout = dropout
self.num_heads = num_heads
self.hidden_dim = dim
self.single_head_ca = single_head_ca
context_dim = context_dim or dim
self.mlp = MLP(dim, expansion=expansion, dropout=dropout, gated=gated)
self.kv_ca = nn.Linear(context_dim, dim * 2, bias=False)
self.q_ca = nn.Linear(dim, dim, bias=False)
self.kv_sa = nn.Linear(dim, dim * 2, bias=False)
self.q_sa = nn.Linear(dim, dim, bias=False)
self.norm_x_sa = nn.LayerNorm(dim)
self.norm_x_ca = nn.LayerNorm(dim)
self.norm_ctx_ca = nn.LayerNorm(context_dim)
self.cosine = cosine
self.out_ca = nn.Linear(dim, dim, bias=False)
self.out_sa = nn.Linear(dim, dim, bias=False)
self.ls1 = LayerScale(dim, layer_scale) if layer_scale > 0.0 else nn.Identity()
self.ls2 = LayerScale(dim, layer_scale) if layer_scale > 0.0 else nn.Identity()
self.ls3 = LayerScale(dim, layer_scale) if layer_scale > 0.0 else nn.Identity()

    def cross_attn(
self,
x: torch.Tensor,
attn_bias: torch.Tensor | None = None,
context: torch.Tensor | None = None,
pos_embed: torch.Tensor | None = None,
pos_embed_context: torch.Tensor | None = None,
rope: nn.Module | None = None,
rope_pos: torch.Tensor | None = None,
) -> torch.Tensor:
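        """Cross-attention from ``x`` to ``context``, using a single head when
        ``single_head_ca`` is set and ``num_heads`` heads otherwise."""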
num_heads = 1 if self.single_head_ca else self.num_heads
x = self.norm_x_ca(x)
context = self.norm_ctx_ca(context)
k, v = rearrange(
self.kv_ca(context), "b n (kv h d) -> b h n d kv", h=num_heads, kv=2
).unbind(dim=-1)
q = rearrange(self.q_ca(x), "b n (h d) -> b h n d", h=num_heads)
if rope is not None:
q = rope(q)
k = rope(k)
else:
if pos_embed is not None:
pos_embed = rearrange(pos_embed, "b n (h d) -> b h n d", h=num_heads)
q = q + pos_embed
if pos_embed_context is not None:
pos_embed_context = rearrange(
pos_embed_context, "b n (h d) -> b h n d", h=num_heads
)
k = k + pos_embed_context
if self.cosine:
q, k = map(partial(F.normalize, p=2, dim=-1), (q, k)) # cosine sim
        x = F.scaled_dot_product_attention(
            q, k, v,
            dropout_p=self.dropout if self.training else 0.0,  # no attention dropout at eval
            attn_mask=attn_bias,
        )
x = rearrange(x, "b h n d -> b n (h d)")
x = self.out_ca(x)
return x

    def self_attn(
self,
x: torch.Tensor,
attn_bias: torch.Tensor | None = None,
pos_embed: torch.Tensor | None = None,
rope: nn.Module | None = None,
rope_pos: torch.Tensor | None = None,
) -> torch.Tensor:
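        """Multi-head self-attention over ``x`` with optional rotary or
        additive positional embeddings."""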
x = self.norm_x_sa(x)
k, v = rearrange(
self.kv_sa(x), "b n (kv h d) -> b h n d kv", h=self.num_heads, kv=2
).unbind(dim=-1)
q = rearrange(self.q_sa(x), "b n (h d) -> b h n d", h=self.num_heads)
if rope is not None:
q = rope(q)
k = rope(k)
elif pos_embed is not None:
pos_embed = rearrange(pos_embed, "b n (h d) -> b h n d", h=self.num_heads)
q = q + pos_embed
if self.cosine:
q, k = map(partial(F.normalize, p=2, dim=-1), (q, k)) # cosine sim
        x = F.scaled_dot_product_attention(
            q, k, v,
            dropout_p=self.dropout if self.training else 0.0,  # no attention dropout at eval
            attn_mask=attn_bias,
        )
x = rearrange(x, "b h n d -> b n (h d)")
x = self.out_sa(x)
return x

    def forward(
self,
x: torch.Tensor,
attn_bias: torch.Tensor | None = None,
context: torch.Tensor | None = None,
pos_embed: torch.Tensor | None = None,
pos_embed_context: torch.Tensor | None = None,
rope: nn.Module | None = None,
rope_pos: torch.Tensor | None = None,
) -> torch.Tensor:
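        """Cross-attend to ``context`` (defaults to ``x``), self-attend, then
        apply the MLP, each with a residual connection."""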
context = x if context is None else context
x = (
self.ls1(
self.cross_attn(
x,
rope=rope,
attn_bias=attn_bias,
context=context,
pos_embed=pos_embed,
pos_embed_context=pos_embed_context,
)
)
+ x
)
x = (
self.ls2(
self.self_attn(x, rope=rope, attn_bias=attn_bias, pos_embed=pos_embed)
)
+ x
)
x = self.ls3(self.mlp(x)) + x
return x
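

if __name__ == "__main__":
    # Minimal shape-check sketch. Run it as a module, e.g.
    # `python -m unik3d.layers.attention`, so the relative imports resolve.
    # All dimensions below are arbitrary illustrative choices, not values
    # taken from the UniK3D demo.
    torch.manual_seed(0)
    x = torch.randn(2, 64, 128)    # (batch, tokens, dim)
    ctx = torch.randn(2, 32, 256)  # cross-attention context with its own width

    encoder = AttentionLayer(num_blocks=2, dim=128, num_heads=4)
    decoder = AttentionDecoderBlock(dim=128, num_heads=4, context_dim=256)

    y = encoder(x)                 # self-attention stack: (2, 64, 128)
    z = decoder(y, context=ctx)    # cross- then self-attention: (2, 64, 128)
    print(y.shape, z.shape)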