Updated flash attention usage
configuration_megatron_gpt.py
CHANGED
@@ -81,7 +81,7 @@ class MegatronGPTConfig(PretrainedConfig):
             Whether to calculate and apply the relative position bias within the attention function.
             If this is False, then model.generate will require you to calculate the triangular attention
             mask and pass it through in the attention mask.
-
+        use_flash_attention (`bool`, *optional*, defaults to `False`):
             When calculating attention, whether to attempt to use flash attention if it's installed, or to always skip and use the regular method.
         rope_scaling (`Dict`, *optional*):
             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports three scaling
@@ -120,7 +120,7 @@ class MegatronGPTConfig(PretrainedConfig):
         eos_token_id=2,
         tie_word_embeddings=False,
         rope_scaling=None,
-
+        use_flash_attention=False,
         **kwargs,
     ):
         super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@@ -144,7 +144,7 @@ class MegatronGPTConfig(PretrainedConfig):
         self.use_cache = use_cache
         self.self_attention_relative_position_bias = self_attention_relative_position_bias
         self.tie_word_embeddings = tie_word_embeddings
-        self.
+        self.use_flash_attention = use_flash_attention
         self.rope_scaling = rope_scaling
         self._rope_scaling_validation()

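The configuration change threads a single new boolean, use_flash_attention, through the docstring, the __init__ signature, and the stored attributes, with a default of False. A minimal sketch of how a user might opt in when loading a checkpoint that ships this custom code (the repo id below is a placeholder, not the actual model name):

from transformers import AutoConfig, AutoModelForCausalLM

# Placeholder repo id; substitute the actual MegatronGPT checkpoint.
checkpoint = "org-name/megatron-gpt-model"

# The new flag defaults to False, so flash attention is opt-in.
config = AutoConfig.from_pretrained(checkpoint, trust_remote_code=True)
config.use_flash_attention = True

model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    config=config,
    trust_remote_code=True,
)

Because the default stays False, existing configs keep their current behavior and flash attention only engages when explicitly requested.
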
modeling_megatron_gpt.py
CHANGED
@@ -222,7 +222,7 @@ class MegatronGPTAttention(nn.Module):
         present = (key, value) if use_cache else None

         # Compute attention
-        if not HAS_FLASH or output_attentions or head_mask is not None or self.config.
+        if not HAS_FLASH or output_attentions or head_mask is not None or not self.config.use_flash_attention:
             attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
         else:
             attn_output = self._flash_attn(query, key, value, attention_mask)
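
The updated gate falls back to the regular attention path whenever flash attention is not installed, attention weights are requested, a head mask is supplied, or the new config flag is left off. A standalone sketch of the same decision logic, assuming HAS_FLASH is set by an import probe (the probe shown here is an assumption, not necessarily how the module defines it):

import importlib.util

# Probe for the flash-attn package once at import time (one common pattern;
# the module's actual HAS_FLASH definition may differ).
HAS_FLASH = importlib.util.find_spec("flash_attn") is not None

def pick_attention_path(config, output_attentions, head_mask):
    """Mirror the gating condition above: use flash attention only when it is
    available and enabled, and nothing requires the explicit weight matrix."""
    use_flash = (
        HAS_FLASH
        and not output_attentions        # fused kernels don't return attn weights
        and head_mask is None            # head masking needs the explicit weights
        and config.use_flash_attention   # new opt-in flag from the config
    )
    return "flash" if use_flash else "regular"

Routing through the regular path for output_attentions or head_mask reflects that the fused flash kernel never materializes the attention-weight matrix those options depend on.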