Update modeling_plm.py
modeling_plm.py +4 -4
@@ -408,7 +408,7 @@ class PLMAttention(nn.Module):
 
 class PLMFlashAttention2(PLMAttention):
     """
-
+    PLM flash attention module. This module inherits from `PLMAttention` as the weights of the module stays
     untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
     flash attention and deal with padding tokens in case the input contains any of them.
     """
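The restored docstring states the design constraint: PLMFlashAttention2 reuses PLMAttention's weights unchanged and only alters the forward pass, which has to call the public flash-attn API and strip padding tokens when the attention mask contains any. Below is a minimal sketch of that call pattern, assuming the flash-attn 2.x package and ignoring the KV cache; the helper name and shapes are illustrative, not taken from modeling_plm.py.

```python
import torch
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import pad_input, unpad_input


def flash_forward(query, key, value, attention_mask=None, causal=True, dropout_p=0.0):
    # query/key/value: (batch, seq_len, num_heads, head_dim), fp16 or bf16.
    if attention_mask is None:
        # No padding anywhere: the dense kernel can be called directly.
        return flash_attn_func(query, key, value, dropout_p=dropout_p, causal=causal)

    batch_size, seq_len = query.shape[:2]
    # Drop padded positions so the varlen kernel only sees real tokens.
    # ([:4] keeps the sketch compatible with flash-attn versions whose
    # unpad_input returns an extra element.)
    q, indices, cu_q, max_q = unpad_input(query, attention_mask)[:4]
    k, _, cu_k, max_k = unpad_input(key, attention_mask)[:4]
    v, _, _, _ = unpad_input(value, attention_mask)[:4]
    out = flash_attn_varlen_func(
        q, k, v,
        cu_seqlens_q=cu_q, cu_seqlens_k=cu_k,
        max_seqlen_q=max_q, max_seqlen_k=max_k,
        dropout_p=dropout_p, causal=causal,
    )
    # Scatter the outputs back into the padded (batch, seq_len, ...) layout.
    return pad_input(out, indices, batch_size, seq_len)
```

The unpad/pad round trip is what lets the kernel skip padded positions entirely instead of masking them after the fact.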
@@ -431,7 +431,7 @@ class PLMFlashAttention2(PLMAttention):
         use_cache: bool = False,
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        #
+        # PLMFlashAttention2 attention does not support output_attentions
 
         if "padding_mask" in kwargs:
             warnings.warn(
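The filled-in comment records that Flash Attention 2 never materializes the full attention matrix, so the module cannot return attention weights and output_attentions is effectively treated as False. The context lines below it also fold the deprecated padding_mask kwarg into attention_mask. A hedged sketch of that prologue, following the usual Transformers convention rather than this file's exact code (the helper name and warning text are assumptions):

```python
import warnings


def normalize_flash_attn_kwargs(attention_mask, output_attentions, **kwargs):
    # Flash Attention 2 computes attention without ever building the full
    # score matrix, so attention probabilities cannot be returned.
    output_attentions = False

    # Accept the deprecated `padding_mask` kwarg from older callers and fold
    # it into `attention_mask`, as the diff context above does.
    if "padding_mask" in kwargs:
        warnings.warn(
            "Passing `padding_mask` is deprecated; please use `attention_mask` instead."
        )
        attention_mask = kwargs.pop("padding_mask")

    return attention_mask, output_attentions
```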
@@ -509,7 +509,7 @@ class PLMFlashAttention2(PLMAttention):
         # therefore the input hidden states gets silently casted in float32. Hence, we need
         # cast them back in the correct dtype just to be sure everything works as expected.
         # This might slowdown training & inference so it is recommended to not cast the LayerNorms
-        # in fp32. (
+        # in fp32. (PLMV2RMSNorm handles it correctly)
 
         input_dtype = query_states.dtype
         if input_dtype == torch.float32:
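The completed comment closes the note about the fp32 upcast pitfall: flash-attn kernels only run in fp16/bf16, so if autocast or an upcasting norm has left the hidden states in float32 they must be cast back before the kernel call, and the parenthetical now records that PLMV2RMSNorm already handles this correctly. A sketch of the cast-back guard under the common Transformers pattern; the dtype fallback chain is an assumption, not PLM's exact logic:

```python
import torch


def recast_for_flash_attn(module, query_states, key_states, value_states):
    # flash-attn only supports fp16/bf16 inputs. If the states were silently
    # upcasted to float32 (e.g. by autocast or an fp32 norm), cast them back to
    # the dtype the layer actually computes in.
    if query_states.dtype != torch.float32:
        return query_states, key_states, value_states

    if torch.is_autocast_enabled():
        target_dtype = torch.get_autocast_gpu_dtype()
    elif hasattr(module.config, "_pre_quantization_dtype"):
        # Quantized checkpoints keep the pre-quantization dtype on the config.
        target_dtype = module.config._pre_quantization_dtype
    else:
        target_dtype = module.q_proj.weight.dtype

    return (
        query_states.to(target_dtype),
        key_states.to(target_dtype),
        value_states.to(target_dtype),
    )
```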
@@ -587,7 +587,7 @@ class PLMFlashAttention2(PLMAttention):
         if not self._flash_attn_uses_top_left_mask:
             causal = self.is_causal
         else:
-            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in PLMFlashAttention2 __init__.
             causal = self.is_causal and query_length != 1
 
         # Contains at least one padding token in the sequence
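The extended TODO now points at PLMFlashAttention2 __init__, where _flash_attn_uses_top_left_mask is set for flash-attn builds older than 2.1 (the RoCm case): those kernels align the causal mask to the top-left corner instead of the bottom-right, so a single decoding query would only be allowed to attend to the first key. Disabling causal for query_length == 1 works around that. A hedged sketch of how the flag and the causal value are typically derived; the is_flash_attn_greater_or_equal_2_10 helper exists in transformers.utils, the rest is illustrative:

```python
from transformers.utils import is_flash_attn_greater_or_equal_2_10


def resolve_causal(is_causal: bool, query_length: int) -> bool:
    # flash-attn >= 2.1 aligns the causal mask to the bottom-right corner, which
    # is what decoding with a KV cache needs; older builds (e.g. RoCm < 2.1)
    # align it to the top-left.
    uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    if not uses_top_left_mask:
        return is_causal
    # With a top-left aligned mask, a single query token would only see the
    # first key position, so causal is dropped for the q_len == 1 decode step.
    return is_causal and query_length != 1
```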