d-Matrix committed
Commit 37c7c7e · verified · 1 Parent(s): 67938d7

Update modeling_gptj.py


To support model parallelism after the fx transformation.
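When an fx-traced copy of the model is split across devices, tensors produced by one submodule flow into ops running on another, and PyTorch refuses to mix devices in elementwise ops; each hunk below moves one operand onto the other's device. A minimal sketch of the failure mode this commit patches, with illustrative variable names that are not from this file:

    import torch

    # Stand-ins for two intermediate tensors that end up on different
    # devices after a model-parallel split.
    attn_output = torch.randn(2, 4)   # stays on the CPU
    ffn_output = torch.randn(2, 4)

    if torch.cuda.is_available():
        ffn_output = ffn_output.to("cuda:0")
        # attn_output + ffn_output would raise:
        # RuntimeError: Expected all tensors to be on the same device
        hidden = attn_output.to(ffn_output.device) + ffn_output  # the pattern the commit applies
    else:
        hidden = attn_output + ffn_output

    print(hidden.device)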

Files changed (1)
  1. modeling_gptj.py +10 -4
modeling_gptj.py CHANGED
@@ -77,8 +77,8 @@ def rotate_every_two(x: torch.Tensor) -> torch.Tensor:
 def apply_rotary_pos_emb(
     tensor: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor
 ) -> torch.Tensor:
-    sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3)
-    cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3)
+    sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3).to(tensor.device)
+    cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3).to(tensor.device)
     return (tensor * cos) + (rotate_every_two(tensor) * sin)
 
 
@@ -181,7 +181,9 @@ class GPTJAttention(nn.Module):
         mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(
             attn_weights.device
         )
-        attn_weights = torch.where(causal_mask, attn_weights, mask_value)
+        attn_weights = torch.where(
+            causal_mask.to(attn_weights.device), attn_weights, mask_value
+        )
 
         attn_weights = attn_weights / self.scale_attn
 
@@ -349,7 +351,11 @@ class GPTJBlock(nn.Module):
         outputs = attn_outputs[1:]
 
         feed_forward_hidden_states = self.mlp(hidden_states)
-        hidden_states = attn_output + feed_forward_hidden_states + residual
+        hidden_states = (
+            attn_output.to(feed_forward_hidden_states.device)
+            + feed_forward_hidden_states
+            + residual.to(feed_forward_hidden_states.device)
+        )
 
         if use_cache:
             outputs = (hidden_states,) + outputs
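A hedged usage sketch of the patched apply_rotary_pos_emb: the sin/cos tables are built on the CPU while the query tensor sits on a GPU, as happens when blocks are spread across devices. The shapes and sinusoid computation follow the standard GPT-J rotary setup, and rotate_every_two is reproduced from the same file; none of this is part of the commit itself.

    import torch

    def rotate_every_two(x: torch.Tensor) -> torch.Tensor:
        # Interleave (-x2, x1) pairs along the last dim, as in modeling_gptj.py.
        x1 = x[:, :, :, ::2]
        x2 = x[:, :, :, 1::2]
        return torch.stack((-x2, x1), dim=-1).flatten(-2)

    def apply_rotary_pos_emb(tensor, sin, cos):
        # Patched version: sin/cos are moved onto the input tensor's device.
        sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3).to(tensor.device)
        cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3).to(tensor.device)
        return (tensor * cos) + (rotate_every_two(tensor) * sin)

    batch, seq, heads, rotary_dim = 1, 8, 4, 16
    q = torch.randn(batch, seq, heads, rotary_dim)

    # Standard GPT-J sinusoid table, shape (1, seq, rotary_dim // 2), on the CPU.
    inv_freq = 1.0 / (10000 ** (torch.arange(0, rotary_dim, 2).float() / rotary_dim))
    sinusoid = torch.arange(seq, dtype=torch.float)[:, None] * inv_freq[None, :]
    sin, cos = torch.sin(sinusoid)[None], torch.cos(sinusoid)[None]

    if torch.cuda.is_available():
        q = q.to("cuda:0")  # sin/cos deliberately stay on the CPU

    out = apply_rotary_pos_emb(q, sin, cos)  # no device-mismatch error
    print(out.shape, out.device)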