Update modeling_chatglm.py
modeling_chatglm.py  CHANGED  (+5 -7)
@@ -220,11 +220,10 @@ class CoreAttention(torch.nn.Module):
 
     def forward(self, query_layer, key_layer, value_layer, attention_mask):
         pytorch_major_version = int(torch.__version__.split('.')[0])
-        if
+        if pytorch_major_version >= 2:
             query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]]
             if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
-                context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
-                                                                                 is_causal=True)
+                context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,is_causal=True)
             else:
                 if attention_mask is not None:
                     attention_mask = ~attention_mask
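For context, the PyTorch >= 2 branch in the hunk above hands the whole attention computation to torch.nn.functional.scaled_dot_product_attention. Below is a minimal, self-contained sketch of that branch under the layout the permute implies; the helper name sdpa_attention and the trailing reshape back to [sq, b, np * hn] are assumptions taken from the surrounding CoreAttention code, not part of this commit.

# Sketch only: inputs assumed to arrive as [seq_len, batch, num_heads, head_dim].
import torch
import torch.nn.functional as F

def sdpa_attention(query_layer, key_layer, value_layer, attention_mask=None):
    # [sq, b, np, hn] -> [b, np, sq, hn], the layout scaled_dot_product_attention expects
    query_layer, key_layer, value_layer = [
        k.permute(1, 2, 0, 3) for k in (query_layer, key_layer, value_layer)
    ]
    if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
        # Equal-length q/k with no explicit mask: let SDPA build the causal mask itself
        context_layer = F.scaled_dot_product_attention(
            query_layer, key_layer, value_layer, is_causal=True
        )
    else:
        if attention_mask is not None:
            # The model's mask marks blocked positions with True, while a boolean
            # attn_mask for SDPA means "may attend", hence the inversion
            attention_mask = ~attention_mask
        context_layer = F.scaled_dot_product_attention(
            query_layer, key_layer, value_layer, attention_mask
        )
    # Back to [sq, b, np, hn], then merge the head dimensions -> [sq, b, np * hn]
    context_layer = context_layer.permute(2, 0, 1, 3)
    return context_layer.reshape(context_layer.size(0), context_layer.size(1), -1)

Passing is_causal=True lets SDPA apply causal masking internally instead of materializing a full [sq, sk] mask tensor on this path.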
@@ -237,7 +236,7 @@ class CoreAttention(torch.nn.Module):
         # Raw attention scores
 
         # [b, np, sq, sk]
-        output_size = (query_layer.size(
+        output_size = (query_layer.size(0), query_layer.size(2), query_layer.size(1), key_layer.size(0))
 
         # [sq, b, np, hn] -> [sq, b * np, hn]
         query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
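A toy shape check of the rewritten output_size tuple in the hunk above. The dimension names follow the "# [b, np, sq, sk]" comment; the tensor layouts below are assumptions inferred from which indices the tuple reads, not something the commit states.

# Toy sizes only; the layouts are assumed, see note above.
import torch

b, sq, sk, np_heads, hn = 2, 5, 5, 4, 8
query_layer = torch.randn(b, sq, np_heads, hn)   # assumed [b, sq, np, hn]
key_layer = torch.randn(sk, b, np_heads, hn)     # assumed [sk, b, np, hn]

output_size = (query_layer.size(0), query_layer.size(2),
               query_layer.size(1), key_layer.size(0))
assert output_size == (b, np_heads, sq, sk)      # i.e. [b, np, sq, sk]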
@@ -312,7 +311,6 @@ class CoreAttention(torch.nn.Module):
 
 class SelfAttention(torch.nn.Module):
     """Parallel self-attention layer abstract class.
-
     Self-attention layer takes input with size [s, b, h]
     and returns output of the same size.
     """
@@ -448,7 +446,6 @@ class SelfAttention(torch.nn.Module):
 
         return output, kv_cache
 
-
 def _config_to_kwargs(args):
     common_kwargs = {
         "dtype": args.torch_dtype,
@@ -504,7 +501,6 @@ class MLP(torch.nn.Module):
 
 class GLMBlock(torch.nn.Module):
     """A single transformer layer.
-
     Transformer layer takes input with size [s, b, h] and returns an
     output of the same size.
     """
@@ -862,6 +858,8 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
         all_hidden_states = () if output_hidden_states else None
 
         hidden_states = inputs_embeds
+        # To comply with former chat-glm format that expects (seqlen, bs, hd)
+        hidden_states = hidden_states.permute(1, 0, 2)
 
         for index, layer in enumerate(self.layers):
             if output_hidden_states:
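The two lines added in the last hunk convert inputs_embeds from (batch, seqlen, hidden) into the (seqlen, batch, hidden) layout the GLM transformer blocks consume, as the new comment says. A small illustration with toy sizes (the concrete numbers are illustrative only):

import torch

batch, seq_len, hidden = 2, 16, 32
inputs_embeds = torch.randn(batch, seq_len, hidden)   # (bs, seqlen, hd)

# Same permute as in the hunk: -> (seqlen, bs, hd)
hidden_states = inputs_embeds.permute(1, 0, 2)
assert hidden_states.shape == (seq_len, batch, hidden)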