hiyouga committed on
Commit
f762044
1 Parent(s): 6dd1576

fix tensor shape error when torch version less than 2 (#4)

Browse files

- Update modeling_chatglm.py (5853f34804e757ba1bb3496331d108e2674088cc)


Co-authored-by: Yaowei Zheng <[email protected]>

Files changed (1) hide show
  1. modeling_chatglm.py +4 -7
modeling_chatglm.py CHANGED
@@ -247,15 +247,12 @@ class CoreAttention(torch.nn.Module):
247
  # This is actually dropping out entire tokens to attend to, which might
248
  # seem a bit unusual, but is taken from the original Transformer paper.
249
  attention_probs = self.attention_dropout(attention_probs)
250
- # =========================
251
- # Context layer. [sq, b, hp]
252
- # =========================
253
-
254
- # value_layer -> context layer.
255
- # [sk, b, np, hn] --> [b, np, sq, hn]
256
 
 
 
 
257
  # context layer shape: [b, np, sq, hn]
258
- output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3))
259
  # change view [b * np, sk, hn]
260
  value_layer = value_layer.view(output_size[0] * output_size[1], value_layer.size(2), -1)
261
  # change view [b * np, sq, sk]
 
247
  # This is actually dropping out entire tokens to attend to, which might
248
  # seem a bit unusual, but is taken from the original Transformer paper.
249
  attention_probs = self.attention_dropout(attention_probs)
 
 
 
 
 
 
250
 
251
+ # query layer shape: [b * np, sq, hn]
252
+ # value layer shape: [b, np, sk, hn]
253
+ # attention shape: [b, np, sq, sk]
254
  # context layer shape: [b, np, sq, hn]
255
+ output_size = (value_layer.size(0), value_layer.size(1), query_layer.size(1), value_layer.size(3))
256
  # change view [b * np, sk, hn]
257
  value_layer = value_layer.view(output_size[0] * output_size[1], value_layer.size(2), -1)
258
  # change view [b * np, sq, sk]