Update enhance.py
enhance.py  +14 -9  CHANGED
@@ -12,8 +12,10 @@ class LTXEnhanceAttnProcessor2_0:
         if not hasattr(torch.nn.functional, "scaled_dot_product_attention"):
             raise ImportError("LTXEnhanceAttnProcessor2_0 requires PyTorch 2.0.")

-    def _get_enhance_scores(self, query, key,
+    def _get_enhance_scores(self, query, key, inner_dim, num_heads, num_frames, text_seq_length=None):
         """Calculate enhancement scores for the attention mechanism"""
+        head_dim = inner_dim // num_heads
+
         if text_seq_length is not None:
             img_q = query[:, :, :-text_seq_length] if text_seq_length > 0 else query
             img_k = key[:, :, :-text_seq_length] if text_seq_length > 0 else key
@@ -48,15 +50,17 @@ class LTXEnhanceAttnProcessor2_0:
         if encoder_hidden_states is None:
             encoder_hidden_states = hidden_states

-
+        inner_dim = attn.to_q.out_features
+        num_heads = attn.heads
+        head_dim = inner_dim // num_heads

         query = attn.to_q(hidden_states)
         key = attn.to_k(encoder_hidden_states)
         value = attn.to_v(encoder_hidden_states)

-        query = query.view(batch_size, -1,
-        key = key.view(batch_size, -1,
-        value = value.view(batch_size, -1,
+        query = query.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)

         if attn.upcast_attention:
             query = query.float()
@@ -65,8 +69,9 @@ class LTXEnhanceAttnProcessor2_0:
         enhance_scores = None
         if is_enhance_enabled():
             enhance_scores = self._get_enhance_scores(
-                query, key,
-
+                query, key,
+                inner_dim,
+                num_heads,
                 get_num_frames(),
                 text_seq_length
             )
@@ -78,7 +83,7 @@ class LTXEnhanceAttnProcessor2_0:
             is_causal=False
         )

-        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1,
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, inner_dim)
         hidden_states = hidden_states.to(query.dtype)

         # Apply enhancement if enabled
@@ -112,4 +117,4 @@ def num_frames_hook(module, args, kwargs):
     hidden_states = args[0]
     num_frames = hidden_states.shape[2]
     set_num_frames(num_frames)
-    return args, kwargs
+    return args, kwargs
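The q/k/v reshape added in this commit is the standard multi-head attention layout. A minimal shape sketch (sizes are illustrative, not taken from the model; inner_dim stands in for attn.to_q.out_features):

import torch

batch_size, seq_len, num_heads, head_dim = 2, 64, 8, 32
inner_dim = num_heads * head_dim  # mirrors attn.to_q.out_features

query = torch.randn(batch_size, seq_len, inner_dim)

# view + transpose: (batch, seq, inner_dim) -> (batch, heads, seq, head_dim)
query = query.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
assert query.shape == (batch_size, num_heads, seq_len, head_dim)

# The inverse, applied after attention, restores (batch, seq, inner_dim):
out = query.transpose(1, 2).reshape(batch_size, -1, inner_dim)
assert out.shape == (batch_size, seq_len, inner_dim)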
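The negative slice in _get_enhance_scores suggests the sequence axis holds image tokens first with text tokens appended. A small sketch with hypothetical token counts, using the same (batch, heads, seq, head_dim) layout the processor produces after the transpose:

import torch

batch, heads, img_len, txt_len, head_dim = 1, 8, 128, 16, 32
query = torch.randn(batch, heads, img_len + txt_len, head_dim)

text_seq_length = txt_len
# Same guard as the diff: only slice when there are text tokens to drop.
img_q = query[:, :, :-text_seq_length] if text_seq_length > 0 else query
assert img_q.shape == (batch, heads, img_len, head_dim)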
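num_frames_hook matches the signature PyTorch 2.x expects from a kwargs-aware forward pre-hook, and such a hook must return None or (args, kwargs), which is what the restored return statement provides. A hypothetical registration sketch (the Identity module is a stand-in for whichever block receives the 5-D video latents; num_frames_hook and set_num_frames come from this file):

import torch

video_block = torch.nn.Identity()  # stand-in module for illustration
handle = video_block.register_forward_pre_hook(num_frames_hook, with_kwargs=True)

latents = torch.randn(1, 128, 9, 32, 32)  # frame count lives on dim 2
video_block(latents)                      # hook records num_frames = 9
handle.remove()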