"""Attention primitives: plain multi-head attention, pre-norm transformer blocks, and a decoder block with cross- and self-attention."""

from functools import partial

import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange

from .layer_scale import LayerScale
from .mlp import MLP


class SimpleAttention(nn.Module):
    """Single multi-head (cross-)attention with pre-norm on input and context."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 4,
        dropout: float = 0.0,
        cosine: bool = False,
        context_dim: int | None = None,
    ):
        super().__init__()
        self.dropout = dropout
        self.num_heads = num_heads
        self.hidden_dim = dim
        context_dim = context_dim or dim
        self.kv = nn.Linear(context_dim, dim * 2, bias=False)
        self.q = nn.Linear(dim, dim, bias=False)
        self.norm_attnx = nn.LayerNorm(dim)
        self.norm_attnctx = nn.LayerNorm(context_dim)
        self.cosine = cosine
        self.out = nn.Linear(dim, dim, bias=False)

    def forward(
        self,
        x: torch.Tensor,
        attn_bias: torch.Tensor | None = None,
        context: torch.Tensor | None = None,
        pos_embed: torch.Tensor | None = None,
        pos_embed_context: torch.Tensor | None = None,
        rope: nn.Module | None = None,
        rope_pos: torch.Tensor | None = None,
    ) -> torch.Tensor:
        # Falls back to self-attention when no context is given.
        context = x if context is None else context
        x = self.norm_attnx(x)
        context = self.norm_attnctx(context)
        k, v = rearrange(
            self.kv(context), "b n (kv h d) -> b h n d kv", h=self.num_heads, kv=2
        ).unbind(dim=-1)
        q = rearrange(self.q(x), "b n (h d) -> b h n d", h=self.num_heads)
        if rope is not None:
            # Rotary embeddings on queries and keys; additive positional
            # embeddings are used only when no RoPE module is passed.
            q = rope(q)
            k = rope(k)
        else:
            if pos_embed is not None:
                pos_embed = rearrange(
                    pos_embed, "b n (h d) -> b h n d", h=self.num_heads
                )
                q = q + pos_embed
            if pos_embed_context is not None:
                pos_embed_context = rearrange(
                    pos_embed_context, "b n (h d) -> b h n d", h=self.num_heads
                )
                k = k + pos_embed_context
        if self.cosine:
            q, k = map(partial(F.normalize, p=2, dim=-1), (q, k))  # cosine sim
        x = F.scaled_dot_product_attention(
            q, k, v, dropout_p=self.dropout, attn_mask=attn_bias
        )
        x = rearrange(x, "b h n d -> b n (h d)")
        x = self.out(x)
        return x


class AttentionBlock(nn.Module):
    """Pre-norm transformer block: (cross-)attention then MLP, each with optional LayerScale."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 4,
        expansion: int = 4,
        dropout: float = 0.0,
        cosine: bool = False,
        gated: bool = False,
        layer_scale: float = 1.0,
        context_dim: int | None = None,
        detach_query: bool = False,
        residual_ls: bool = False,
    ):
        super().__init__()
        self.dropout = dropout
        self.num_heads = num_heads
        self.hidden_dim = dim
        context_dim = dim if context_dim is None else context_dim
        self.mlp = MLP(dim, expansion=expansion, dropout=dropout, gated=gated)
        self.kv = nn.Linear(context_dim, dim * 2, bias=False)
        self.q = nn.Linear(dim, dim, bias=False)
        self.norm_attnx = nn.LayerNorm(dim)
        self.norm_attnctx = nn.LayerNorm(context_dim)
        self.cosine = cosine
        self.out = nn.Linear(dim, dim, bias=False)
        # LayerScale goes on the attention branch (default) or on the residual
        # path, depending on `residual_ls`.
        self.ls1_1 = (
            LayerScale(dim, layer_scale)
            if layer_scale > 0.0 and not residual_ls
            else nn.Identity()
        )
        self.ls1_2 = (
            LayerScale(dim, layer_scale)
            if layer_scale > 0.0 and residual_ls
            else nn.Identity()
        )
        self.ls2 = LayerScale(dim, layer_scale) if layer_scale > 0.0 else nn.Identity()
        self.detach_query = detach_query

    def attn(
        self,
        x: torch.Tensor,
        attn_bias: torch.Tensor | None = None,
        context: torch.Tensor | None = None,
        pos_embed: torch.Tensor | None = None,
        pos_embed_context: torch.Tensor | None = None,
        rope: nn.Module | None = None,
        rope_pos: torch.Tensor | None = None,
    ) -> torch.Tensor:
        if self.detach_query:
            # Stop gradients from flowing back through the query branch.
            x = x.detach()
        x = self.norm_attnx(x)
        context = self.norm_attnctx(context)
        k, v = rearrange(
            self.kv(context), "b n (kv h d) -> b h n d kv", h=self.num_heads, kv=2
        ).unbind(dim=-1)
        q = rearrange(self.q(x), "b n (h d) -> b h n d", h=self.num_heads)
        if rope is not None:
            # RoPE is applied in (b, n, h, d) layout, hence the permutes.
            q = rope(q.permute(0, 2, 1, 3), input_pos=rope_pos).permute(0, 2, 1, 3)
            k = rope(k.permute(0, 2, 1, 3), input_pos=rope_pos).permute(0, 2, 1, 3)
        else:
            if pos_embed is not None:
                pos_embed = rearrange(
                    pos_embed, "b n (h d) -> b h n d", h=self.num_heads
                )
                q = q + pos_embed
            if pos_embed_context is not None:
                pos_embed_context = rearrange(
                    pos_embed_context, "b n (h d) -> b h n d", h=self.num_heads
                )
                k = k + pos_embed_context
        if self.cosine:
            q, k = map(partial(F.normalize, p=2, dim=-1), (q, k))  # cosine sim
        x = F.scaled_dot_product_attention(
            q, k, v, dropout_p=self.dropout, attn_mask=attn_bias
        )
        x = rearrange(x, "b h n d -> b n (h d)")
        x = self.out(x)
        return x

    def forward(
        self,
        x: torch.Tensor,
        context: torch.Tensor | None = None,
        pos_embed: torch.Tensor | None = None,
        pos_embed_context: torch.Tensor | None = None,
        attn_bias: torch.Tensor | None = None,
        rope: nn.Module | None = None,
        rope_pos: torch.Tensor | None = None,
    ) -> torch.Tensor:
        context = x if context is None else context
        x = self.ls1_1(
            self.attn(
                x,
                rope=rope,
                rope_pos=rope_pos,
                attn_bias=attn_bias,
                context=context,
                pos_embed=pos_embed,
                pos_embed_context=pos_embed_context,
            )
        ) + self.ls1_2(x)
        x = self.ls2(self.mlp(x)) + x
        return x


class AttentionLayer(nn.Module):
    """A stack of `num_blocks` identical AttentionBlocks applied sequentially."""

    def __init__(
        self,
        num_blocks: int,
        dim: int,
        num_heads: int = 4,
        expansion: int = 4,
        dropout: float = 0.0,
        cosine: bool = False,
        gated: bool = False,
        layer_scale: float = 1.0,
        context_dim: int | None = None,
        detach_query: bool = False,
        residual_ls: bool = False,
    ):
        super().__init__()
        self.layers = nn.ModuleList(
            [
                AttentionBlock(
                    dim=dim,
                    num_heads=num_heads,
                    expansion=expansion,
                    dropout=dropout,
                    cosine=cosine,
                    gated=gated,
                    layer_scale=layer_scale,
                    context_dim=context_dim,
                    detach_query=detach_query,
                    residual_ls=residual_ls,
                )
                for _ in range(num_blocks)
            ]
        )

    def forward(
        self,
        x: torch.Tensor,
        context: torch.Tensor | None = None,
        pos_embed: torch.Tensor | None = None,
        pos_embed_context: torch.Tensor | None = None,
        attn_bias: torch.Tensor | None = None,
        rope: nn.Module | None = None,
        rope_pos: torch.Tensor | None = None,
    ) -> torch.Tensor:
        for layer in self.layers:
            x = layer(
                x,
                context=context,
                pos_embed=pos_embed,
                pos_embed_context=pos_embed_context,
                attn_bias=attn_bias,
                rope=rope,
                rope_pos=rope_pos,
            )
        return x


class AttentionDecoderBlock(nn.Module):
    """Decoder-style block: cross-attention (optionally single-head), self-attention, then an MLP, each as a residual branch with LayerScale."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 4,
        expansion: int = 4,
        dropout: float = 0.0,
        cosine: bool = False,
        gated: bool = False,
        layer_scale: float = 1.0,
        context_dim: int | None = None,
        single_head_ca: bool = True,
    ):
        super().__init__()
        self.dropout = dropout
        self.num_heads = num_heads
        self.hidden_dim = dim
        self.single_head_ca = single_head_ca
        context_dim = context_dim or dim
        self.mlp = MLP(dim, expansion=expansion, dropout=dropout, gated=gated)
        self.kv_ca = nn.Linear(context_dim, dim * 2, bias=False)
        self.q_ca = nn.Linear(dim, dim, bias=False)
        self.kv_sa = nn.Linear(dim, dim * 2, bias=False)
        self.q_sa = nn.Linear(dim, dim, bias=False)
        self.norm_x_sa = nn.LayerNorm(dim)
        self.norm_x_ca = nn.LayerNorm(dim)
        self.norm_ctx_ca = nn.LayerNorm(context_dim)
        self.cosine = cosine
        self.out_ca = nn.Linear(dim, dim, bias=False)
        self.out_sa = nn.Linear(dim, dim, bias=False)
        self.ls1 = LayerScale(dim, layer_scale) if layer_scale > 0.0 else nn.Identity()
        self.ls2 = LayerScale(dim, layer_scale) if layer_scale > 0.0 else nn.Identity()
        self.ls3 = LayerScale(dim, layer_scale) if layer_scale > 0.0 else nn.Identity()

    def cross_attn(
        self,
        x: torch.Tensor,
        attn_bias: torch.Tensor | None = None,
        context: torch.Tensor | None = None,
        pos_embed: torch.Tensor | None = None,
        pos_embed_context: torch.Tensor | None = None,
        rope: nn.Module | None = None,
        rope_pos: torch.Tensor | None = None,
    ) -> torch.Tensor:
        # Cross-attention runs with a single head when `single_head_ca` is set.
        num_heads = 1 if self.single_head_ca else self.num_heads
        x = self.norm_x_ca(x)
        context = self.norm_ctx_ca(context)
        k, v = rearrange(
            self.kv_ca(context), "b n (kv h d) -> b h n d kv", h=num_heads, kv=2
        ).unbind(dim=-1)
        q = rearrange(self.q_ca(x), "b n (h d) -> b h n d", h=num_heads)
        if rope is not None:
            q = rope(q)
            k = rope(k)
        else:
            if pos_embed is not None:
                pos_embed = rearrange(pos_embed, "b n (h d) -> b h n d", h=num_heads)
                q = q + pos_embed
            if pos_embed_context is not None:
                pos_embed_context = rearrange(
                    pos_embed_context, "b n (h d) -> b h n d", h=num_heads
                )
                k = k + pos_embed_context
        if self.cosine:
            q, k = map(partial(F.normalize, p=2, dim=-1), (q, k))  # cosine sim
        x = F.scaled_dot_product_attention(
            q, k, v, dropout_p=self.dropout, attn_mask=attn_bias
        )
        x = rearrange(x, "b h n d -> b n (h d)")
        x = self.out_ca(x)
        return x

    def self_attn(
        self,
        x: torch.Tensor,
        attn_bias: torch.Tensor | None = None,
        pos_embed: torch.Tensor | None = None,
        rope: nn.Module | None = None,
        rope_pos: torch.Tensor | None = None,
    ) -> torch.Tensor:
        x = self.norm_x_sa(x)
        k, v = rearrange(
            self.kv_sa(x), "b n (kv h d) -> b h n d kv", h=self.num_heads, kv=2
        ).unbind(dim=-1)
        q = rearrange(self.q_sa(x), "b n (h d) -> b h n d", h=self.num_heads)
        if rope is not None:
            q = rope(q)
            k = rope(k)
        elif pos_embed is not None:
            pos_embed = rearrange(pos_embed, "b n (h d) -> b h n d", h=self.num_heads)
            q = q + pos_embed
        if self.cosine:
            q, k = map(partial(F.normalize, p=2, dim=-1), (q, k))  # cosine sim
        x = F.scaled_dot_product_attention(
            q, k, v, dropout_p=self.dropout, attn_mask=attn_bias
        )
        x = rearrange(x, "b h n d -> b n (h d)")
        x = self.out_sa(x)
        return x

    def forward(
        self,
        x: torch.Tensor,
        attn_bias: torch.Tensor | None = None,
        context: torch.Tensor | None = None,
        pos_embed: torch.Tensor | None = None,
        pos_embed_context: torch.Tensor | None = None,
        rope: nn.Module | None = None,
        rope_pos: torch.Tensor | None = None,
    ) -> torch.Tensor:
        context = x if context is None else context
        x = (
            self.ls1(
                self.cross_attn(
                    x,
                    rope=rope,
                    attn_bias=attn_bias,
                    context=context,
                    pos_embed=pos_embed,
                    pos_embed_context=pos_embed_context,
                )
            )
            + x
        )
        x = (
            self.ls2(
                self.self_attn(x, rope=rope, attn_bias=attn_bias, pos_embed=pos_embed)
            )
            + x
        )
        x = self.ls3(self.mlp(x)) + x
        return x
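

if __name__ == "__main__":
    # Minimal smoke test: an illustrative sketch, not part of the original module.
    # It relies on the relative imports above (LayerScale, MLP), so run it as a
    # module, e.g. `python -m <package>.attention`. Shapes are arbitrary example
    # values chosen only to exercise the shape contracts of the classes above.
    B, N, M, D = 2, 16, 32, 64
    x = torch.randn(B, N, D)
    ctx = torch.randn(B, M, D)

    # Cross-attention transformer block: queries from `x`, keys/values from `ctx`.
    block = AttentionBlock(dim=D, num_heads=4, context_dim=D)
    assert block(x, context=ctx).shape == (B, N, D)

    # Stacked blocks and the decoder-style block keep the query shape unchanged.
    stack = AttentionLayer(num_blocks=2, dim=D, num_heads=4)
    assert stack(x).shape == (B, N, D)

    decoder = AttentionDecoderBlock(dim=D, num_heads=4, context_dim=D)
    assert decoder(x, context=ctx).shape == (B, N, D)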