"""Attention primitives: plain multi-head attention, pre-norm transformer blocks, and a decoder block with cross- and self-attention."""

from functools import partial

import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange

from .layer_scale import LayerScale
from .mlp import MLP


class SimpleAttention(nn.Module):
    """Single multi-head (cross-)attention with pre-norm on input and context."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 4,
        dropout: float = 0.0,
        cosine: bool = False,
        context_dim: int | None = None,
    ):
        super().__init__()
        self.dropout = dropout
        self.num_heads = num_heads
        self.hidden_dim = dim
        context_dim = context_dim or dim
        self.kv = nn.Linear(context_dim, dim * 2, bias=False)
        self.q = nn.Linear(dim, dim, bias=False)
        self.norm_attnx = nn.LayerNorm(dim)
        self.norm_attnctx = nn.LayerNorm(context_dim)
        self.cosine = cosine
        self.out = nn.Linear(dim, dim, bias=False)

    def forward(
        self,
        x: torch.Tensor,
        attn_bias: torch.Tensor | None = None,
        context: torch.Tensor | None = None,
        pos_embed: torch.Tensor | None = None,
        pos_embed_context: torch.Tensor | None = None,
        rope: nn.Module | None = None,
        rope_pos: torch.Tensor | None = None,
    ) -> torch.Tensor:
        # Falls back to self-attention when no context is given.
        context = x if context is None else context
        x = self.norm_attnx(x)
        context = self.norm_attnctx(context)
        k, v = rearrange(
            self.kv(context), "b n (kv h d) -> b h n d kv", h=self.num_heads, kv=2
        ).unbind(dim=-1)
        q = rearrange(self.q(x), "b n (h d) -> b h n d", h=self.num_heads)
        if rope is not None:
            # Rotary embeddings on queries and keys; additive positional
            # embeddings are used only when no RoPE module is passed.
            q = rope(q)
            k = rope(k)
        else:
            if pos_embed is not None:
                pos_embed = rearrange(
                    pos_embed, "b n (h d) -> b h n d", h=self.num_heads
                )
                q = q + pos_embed
            if pos_embed_context is not None:
                pos_embed_context = rearrange(
                    pos_embed_context, "b n (h d) -> b h n d", h=self.num_heads
                )
                k = k + pos_embed_context
        if self.cosine:
            q, k = map(partial(F.normalize, p=2, dim=-1), (q, k))  # cosine sim
        x = F.scaled_dot_product_attention(
            q, k, v, dropout_p=self.dropout, attn_mask=attn_bias
        )
        x = rearrange(x, "b h n d -> b n (h d)")
        x = self.out(x)
        return x


class AttentionBlock(nn.Module):
    """Pre-norm transformer block: (cross-)attention then MLP, each with optional LayerScale."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 4,
        expansion: int = 4,
        dropout: float = 0.0,
        cosine: bool = False,
        gated: bool = False,
        layer_scale: float = 1.0,
        context_dim: int | None = None,
        detach_query: bool = False,
        residual_ls: bool = False,
    ):
        super().__init__()
        self.dropout = dropout
        self.num_heads = num_heads
        self.hidden_dim = dim
        context_dim = dim if context_dim is None else context_dim
        self.mlp = MLP(dim, expansion=expansion, dropout=dropout, gated=gated)
        self.kv = nn.Linear(context_dim, dim * 2, bias=False)
        self.q = nn.Linear(dim, dim, bias=False)
        self.norm_attnx = nn.LayerNorm(dim)
        self.norm_attnctx = nn.LayerNorm(context_dim)
        self.cosine = cosine
        self.out = nn.Linear(dim, dim, bias=False)
        # LayerScale goes on the attention branch (default) or on the residual
        # path, depending on `residual_ls`.
        self.ls1_1 = (
            LayerScale(dim, layer_scale)
            if layer_scale > 0.0 and not residual_ls
            else nn.Identity()
        )
        self.ls1_2 = (
            LayerScale(dim, layer_scale)
            if layer_scale > 0.0 and residual_ls
            else nn.Identity()
        )
        self.ls2 = LayerScale(dim, layer_scale) if layer_scale > 0.0 else nn.Identity()
        self.detach_query = detach_query

    def attn(
        self,
        x: torch.Tensor,
        attn_bias: torch.Tensor | None = None,
        context: torch.Tensor | None = None,
        pos_embed: torch.Tensor | None = None,
        pos_embed_context: torch.Tensor | None = None,
        rope: nn.Module | None = None,
        rope_pos: torch.Tensor | None = None,
    ) -> torch.Tensor:
        if self.detach_query:
            # Stop gradients from flowing back through the query branch.
            x = x.detach()
        x = self.norm_attnx(x)
        context = self.norm_attnctx(context)
        k, v = rearrange(
            self.kv(context), "b n (kv h d) -> b h n d kv", h=self.num_heads, kv=2
        ).unbind(dim=-1)
        q = rearrange(self.q(x), "b n (h d) -> b h n d", h=self.num_heads)
        if rope is not None:
            # RoPE is applied in (b, n, h, d) layout, hence the permutes.
            q = rope(q.permute(0, 2, 1, 3), input_pos=rope_pos).permute(0, 2, 1, 3)
            k = rope(k.permute(0, 2, 1, 3), input_pos=rope_pos).permute(0, 2, 1, 3)
        else:
            if pos_embed is not None:
                pos_embed = rearrange(
                    pos_embed, "b n (h d) -> b h n d", h=self.num_heads
                )
                q = q + pos_embed
            if pos_embed_context is not None:
                pos_embed_context = rearrange(
                    pos_embed_context, "b n (h d) -> b h n d", h=self.num_heads
                )
                k = k + pos_embed_context
        if self.cosine:
            q, k = map(partial(F.normalize, p=2, dim=-1), (q, k))  # cosine sim
        x = F.scaled_dot_product_attention(
            q, k, v, dropout_p=self.dropout, attn_mask=attn_bias
        )
        x = rearrange(x, "b h n d -> b n (h d)")
        x = self.out(x)
        return x

    def forward(
        self,
        x: torch.Tensor,
        context: torch.Tensor | None = None,
        pos_embed: torch.Tensor | None = None,
        pos_embed_context: torch.Tensor | None = None,
        attn_bias: torch.Tensor | None = None,
        rope: nn.Module | None = None,
        rope_pos: torch.Tensor | None = None,
    ) -> torch.Tensor:
        context = x if context is None else context
        x = self.ls1_1(
            self.attn(
                x,
                rope=rope,
                rope_pos=rope_pos,
                attn_bias=attn_bias,
                context=context,
                pos_embed=pos_embed,
                pos_embed_context=pos_embed_context,
            )
        ) + self.ls1_2(x)
        x = self.ls2(self.mlp(x)) + x
        return x


class AttentionLayer(nn.Module):
    """A stack of `num_blocks` identical AttentionBlocks applied sequentially."""

    def __init__(
        self,
        num_blocks: int,
        dim: int,
        num_heads: int = 4,
        expansion: int = 4,
        dropout: float = 0.0,
        cosine: bool = False,
        gated: bool = False,
        layer_scale: float = 1.0,
        context_dim: int | None = None,
        detach_query: bool = False,
        residual_ls: bool = False,
    ):
        super().__init__()
        self.layers = nn.ModuleList(
            [
                AttentionBlock(
                    dim=dim,
                    num_heads=num_heads,
                    expansion=expansion,
                    dropout=dropout,
                    cosine=cosine,
                    gated=gated,
                    layer_scale=layer_scale,
                    context_dim=context_dim,
                    detach_query=detach_query,
                    residual_ls=residual_ls,
                )
                for _ in range(num_blocks)
            ]
        )

    def forward(
        self,
        x: torch.Tensor,
        context: torch.Tensor | None = None,
        pos_embed: torch.Tensor | None = None,
        pos_embed_context: torch.Tensor | None = None,
        attn_bias: torch.Tensor | None = None,
        rope: nn.Module | None = None,
        rope_pos: torch.Tensor | None = None,
    ) -> torch.Tensor:
        for layer in self.layers:
            x = layer(
                x,
                context=context,
                pos_embed=pos_embed,
                pos_embed_context=pos_embed_context,
                attn_bias=attn_bias,
                rope=rope,
                rope_pos=rope_pos,
            )
        return x


class AttentionDecoderBlock(nn.Module):
    """Decoder-style block: cross-attention (optionally single-head), self-attention, then an MLP, each as a residual branch with LayerScale."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 4,
        expansion: int = 4,
        dropout: float = 0.0,
        cosine: bool = False,
        gated: bool = False,
        layer_scale: float = 1.0,
        context_dim: int | None = None,
        single_head_ca: bool = True,
    ):
        super().__init__()
        self.dropout = dropout
        self.num_heads = num_heads
        self.hidden_dim = dim
        self.single_head_ca = single_head_ca
        context_dim = context_dim or dim
        self.mlp = MLP(dim, expansion=expansion, dropout=dropout, gated=gated)
        self.kv_ca = nn.Linear(context_dim, dim * 2, bias=False)
        self.q_ca = nn.Linear(dim, dim, bias=False)
        self.kv_sa = nn.Linear(dim, dim * 2, bias=False)
        self.q_sa = nn.Linear(dim, dim, bias=False)
        self.norm_x_sa = nn.LayerNorm(dim)
        self.norm_x_ca = nn.LayerNorm(dim)
        self.norm_ctx_ca = nn.LayerNorm(context_dim)
        self.cosine = cosine
        self.out_ca = nn.Linear(dim, dim, bias=False)
        self.out_sa = nn.Linear(dim, dim, bias=False)
        self.ls1 = LayerScale(dim, layer_scale) if layer_scale > 0.0 else nn.Identity()
        self.ls2 = LayerScale(dim, layer_scale) if layer_scale > 0.0 else nn.Identity()
        self.ls3 = LayerScale(dim, layer_scale) if layer_scale > 0.0 else nn.Identity()

    def cross_attn(
        self,
        x: torch.Tensor,
        attn_bias: torch.Tensor | None = None,
        context: torch.Tensor | None = None,
        pos_embed: torch.Tensor | None = None,
        pos_embed_context: torch.Tensor | None = None,
        rope: nn.Module | None = None,
        rope_pos: torch.Tensor | None = None,
    ) -> torch.Tensor:
        # Cross-attention runs with a single head when `single_head_ca` is set.
        num_heads = 1 if self.single_head_ca else self.num_heads
        x = self.norm_x_ca(x)
        context = self.norm_ctx_ca(context)
        k, v = rearrange(
            self.kv_ca(context), "b n (kv h d) -> b h n d kv", h=num_heads, kv=2
        ).unbind(dim=-1)
        q = rearrange(self.q_ca(x), "b n (h d) -> b h n d", h=num_heads)
        if rope is not None:
            q = rope(q)
            k = rope(k)
        else:
            if pos_embed is not None:
                pos_embed = rearrange(pos_embed, "b n (h d) -> b h n d", h=num_heads)
                q = q + pos_embed
            if pos_embed_context is not None:
                pos_embed_context = rearrange(
                    pos_embed_context, "b n (h d) -> b h n d", h=num_heads
                )
                k = k + pos_embed_context
        if self.cosine:
            q, k = map(partial(F.normalize, p=2, dim=-1), (q, k))  # cosine sim
        x = F.scaled_dot_product_attention(
            q, k, v, dropout_p=self.dropout, attn_mask=attn_bias
        )
        x = rearrange(x, "b h n d -> b n (h d)")
        x = self.out_ca(x)
        return x

    def self_attn(
        self,
        x: torch.Tensor,
        attn_bias: torch.Tensor | None = None,
        pos_embed: torch.Tensor | None = None,
        rope: nn.Module | None = None,
        rope_pos: torch.Tensor | None = None,
    ) -> torch.Tensor:
        x = self.norm_x_sa(x)
        k, v = rearrange(
            self.kv_sa(x), "b n (kv h d) -> b h n d kv", h=self.num_heads, kv=2
        ).unbind(dim=-1)
        q = rearrange(self.q_sa(x), "b n (h d) -> b h n d", h=self.num_heads)
        if rope is not None:
            q = rope(q)
            k = rope(k)
        elif pos_embed is not None:
            pos_embed = rearrange(pos_embed, "b n (h d) -> b h n d", h=self.num_heads)
            q = q + pos_embed
        if self.cosine:
            q, k = map(partial(F.normalize, p=2, dim=-1), (q, k))  # cosine sim
        x = F.scaled_dot_product_attention(
            q, k, v, dropout_p=self.dropout, attn_mask=attn_bias
        )
        x = rearrange(x, "b h n d -> b n (h d)")
        x = self.out_sa(x)
        return x

    def forward(
        self,
        x: torch.Tensor,
        attn_bias: torch.Tensor | None = None,
        context: torch.Tensor | None = None,
        pos_embed: torch.Tensor | None = None,
        pos_embed_context: torch.Tensor | None = None,
        rope: nn.Module | None = None,
        rope_pos: torch.Tensor | None = None,
    ) -> torch.Tensor:
        context = x if context is None else context
        x = (
            self.ls1(
                self.cross_attn(
                    x,
                    rope=rope,
                    attn_bias=attn_bias,
                    context=context,
                    pos_embed=pos_embed,
                    pos_embed_context=pos_embed_context,
                )
            )
            + x
        )
        x = (
            self.ls2(
                self.self_attn(x, rope=rope, attn_bias=attn_bias, pos_embed=pos_embed)
            )
            + x
        )
        x = self.ls3(self.mlp(x)) + x
        return x
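

if __name__ == "__main__":
    # Minimal smoke test: an illustrative sketch, not part of the original module.
    # It relies on the relative imports above (LayerScale, MLP), so run it as a
    # module, e.g. `python -m <package>.attention`. Shapes are arbitrary example
    # values chosen only to exercise the shape contracts of the classes above.
    B, N, M, D = 2, 16, 32, 64
    x = torch.randn(B, N, D)
    ctx = torch.randn(B, M, D)

    # Cross-attention transformer block: queries from `x`, keys/values from `ctx`.
    block = AttentionBlock(dim=D, num_heads=4, context_dim=D)
    assert block(x, context=ctx).shape == (B, N, D)

    # Stacked blocks and the decoder-style block keep the query shape unchanged.
    stack = AttentionLayer(num_blocks=2, dim=D, num_heads=4)
    assert stack(x).shape == (B, N, D)

    decoder = AttentionDecoderBlock(dim=D, num_heads=4, context_dim=D)
    assert decoder(x, context=ctx).shape == (B, N, D)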