import torch
import torch.nn.functional as F

from diffusers.models.attention_processor import Attention
try:
    from fla.ops.linear_attn import chunk_linear_attn

    FLA_ENABLE = True
except ImportError:
    print("Warning: FLA is not installed, falling back to default attention.")
    FLA_ENABLE = False


def get_non_linear_projection(query_dim, mid_dim=None):
    # If mid_dim is None, the hidden width defaults to query_dim.
    # If mid_dim is -1, no non-linear projection is used and an identity
    # module is returned instead.
    return (
        torch.nn.Sequential(
            torch.nn.Linear(query_dim, mid_dim or query_dim),
            torch.nn.LayerNorm(mid_dim or query_dim),
            torch.nn.LeakyReLU(inplace=True),
            torch.nn.Linear(mid_dim or query_dim, query_dim),
        )
        if mid_dim != -1
        else torch.nn.Identity()
    )
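

# Example (a sketch; the sizes are illustrative): the projection preserves the
# last dimension, so it can be added residually to its input.
#
#     proj = get_non_linear_projection(320, mid_dim=64)
#     x = torch.randn(2, 77, 320)
#     proj(x).shape                               # torch.Size([2, 77, 320])
#     get_non_linear_projection(320, mid_dim=-1)  # torch.nn.Identity()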


class GeneralizedLinearAttention(Attention):
    def __init__(self, *args, projection_mid_dim=None, **kwargs):
        """
        Args:
            query_dim: the dimension of the query.
            out_dim: the dimension of the output.
            dim_head: the dimension of each head (dim_head * num_heads = query_dim).
            projection_mid_dim: the dimension of the intermediate layer in the
                non-linear projection. If `None`, it matches the query dimension;
                if `-1`, the projection is replaced by an identity module.
        """
        super().__init__(*args, **kwargs)
        self.add_non_linear_model(projection_mid_dim)
    @staticmethod
    def from_attention_instance(attention_instance, projection_mid_dim=None):
        assert isinstance(attention_instance, Attention)
        # Build a shell instance (the query_dim argument is a placeholder; its
        # state is discarded), then adopt the source module's state wholesale.
        new_instance = GeneralizedLinearAttention(128)
        new_instance.__dict__ = attention_instance.__dict__
        new_instance.add_non_linear_model(mid_dim=projection_mid_dim)
        return new_instance
    def add_non_linear_model(self, mid_dim=None, **kwargs):
        # nn.Linear weights have shape (out_features, in_features); index 1 is
        # the input width, which the residual projections must match.
        query_dim = self.to_q.weight.shape[1]
        context_dim = self.to_k.weight.shape[1]
        self.to_q_ = get_non_linear_projection(query_dim, mid_dim, **kwargs)
        self.to_k_ = get_non_linear_projection(context_dim, mid_dim, **kwargs)
    def forward(
        self,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        **kwargs,
    ):
        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        # Normalize over the key/value sequence (it differs from the query
        # length under cross-attention).
        _, sequence_length, _ = encoder_hidden_states.shape

        # Residual non-linear projections feed the usual q/k/v projections.
        query = self.to_q(hidden_states + self.to_q_(hidden_states))
        key = self.to_k(encoder_hidden_states + self.to_k_(encoder_hidden_states))
        value = self.to_v(encoder_hidden_states)

        query = self.head_to_batch_dim(query)
        key = self.head_to_batch_dim(key)
        value = self.head_to_batch_dim(value)

        # Positive feature map phi(x) = elu(x) + 1 keeps q.k non-negative.
        query = F.elu(query) + 1.0
        key = F.elu(key) + 1.0
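
        # The fallback path below evaluates linear attention in closed form,
        #   out = phi(Q) (phi(K)^T V) / (phi(Q) phi(K)^T 1),
        # aggregating key-value outer products once (O(n d^2) rather than the
        # O(n^2 d) of softmax attention) and normalizing each query by its
        # total key mass.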
        if FLA_ENABLE and False:
            # TODO: intentionally disabled (`and False`) until a bug in the
            # FLA chunk_linear_attn implementation is resolved.
            raise NotImplementedError
        else:
            # Denominator: q . mean(k), with a small epsilon for stability.
            z = query @ key.mean(dim=-2, keepdim=True).transpose(-2, -1) + 1e-4
            # Numerator: (k^T v) / sequence_length, split into two sqrt factors
            # to keep intermediate magnitudes moderate.
            kv = (key.transpose(-2, -1) * (sequence_length**-0.5)) @ (
                value * (sequence_length**-0.5)
            )
            hidden_states = query @ kv / z
        hidden_states = self.batch_to_head_dim(hidden_states)
        # linear proj
        hidden_states = self.to_out[0](hidden_states)
        # dropout
        hidden_states = self.to_out[1](hidden_states)
        return hidden_states
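

# Minimal smoke test (a sketch; the shapes and hyper-parameters below are
# illustrative, not taken from any training configuration).
if __name__ == "__main__":
    x = torch.randn(2, 77, 320)

    # Construct directly ...
    attn = GeneralizedLinearAttention(query_dim=320, heads=8, dim_head=40)
    assert attn(x).shape == x.shape

    # ... or convert an existing diffusers Attention block.
    vanilla = Attention(query_dim=320, heads=8, dim_head=40)
    converted = GeneralizedLinearAttention.from_attention_instance(vanilla)
    assert converted(x).shape == x.shape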