update
toolbox/torchaudio/models/nx_clean_unet/transformer/attention.py
CHANGED
@@ -108,7 +108,7 @@ class MultiHeadedAttention(nn.Module):
         return self.forward_attention(v, scores, mask), new_cache


-class RelativeMultiHeadedAttention(nn.Module):
+class RelativeMultiHeadSelfAttention(nn.Module):

    def __init__(self, n_head: int, n_feat: int, dropout_rate: float, max_relative_position: int = 5120):
        """
@@ -203,30 +203,37 @@ class RelativeMultiHeadedAttention(nn.Module):
             mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
             cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
     ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # attention! self attention.

         q, k, v = self.forward_qkv(query, key, value)
+        # q shape: [batch_size, self.h, time_steps, self.d_k]

         if cache.size(0) > 0:
             key_cache, value_cache = torch.split(
                 cache, cache.size(-1) // 2, dim=-1)
             k = torch.cat([key_cache, k], dim=2)
             v = torch.cat([value_cache, v], dim=2)
-        # NOTE: We do cache slicing in encoder.forward_chunk, since it's
-        # non-trivial to calculate `next_cache_start` here.

-        # new_cache shape: [batch_size, self.h, time_steps, self.
+        # new_cache shape: [batch_size, self.h, time_steps, self.d_k * 2]
         new_cache = torch.cat((k, v), dim=-1)

         # Compute relative position encoding
-
-        relative_position = self.relative_position_encoding(
-
+        q_length, k_length = q.size(2), k.size(2)
+        relative_position = self.relative_position_encoding(k_length)
+
+        # During streaming inference, q_length and k_length differ.
+        relative_position = relative_position[-q_length:]

-        relative_position_k = relative_position_k.
-
+        relative_position_k = self.relative_position_k[relative_position.view(-1)].view(q_length, k_length, -1)
+
+        relative_position_k = relative_position_k.unsqueeze(0).unsqueeze(0)    # (1, 1, q_length, k_length, d_k)
+        relative_position_k = relative_position_k.expand(q.size(0), q.size(1), -1, -1, -1)    # (batch, head, q_length, k_length, d_k)

         native_scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
+        # native_scores shape: [batch_size, self.h, q_time_steps, k_time_steps]
+
         relative_position_scores = torch.matmul(q.unsqueeze(3), relative_position_k.transpose(-2, -1)).squeeze(3) / math.sqrt(self.d_k)
+        # relative_position_scores shape: [batch_size, self.h, q_time_steps, k_time_steps]
         scores = native_scores + relative_position_scores

         return self.forward_attention(v, scores, mask), new_cache
@@ -235,12 +242,13 @@ class RelativeMultiHeadedAttention(nn.Module):
 def main():
     rel_attention = RelativeMultiHeadedAttention(n_head=4, n_feat=256, dropout_rate=0.1)

-
+    x = torch.ones(size=(1, 200, 256), dtype=torch.float32)
+    xt, new_cache = rel_attention.forward(x, x, x)

-    x = torch.ones(size=(1, 1, 256), dtype=torch.float32)
-    cache = torch.ones(size=(1, 4, 199, 128), dtype=torch.float32)
+    # x = torch.ones(size=(1, 1, 256), dtype=torch.float32)
+    # cache = torch.ones(size=(1, 4, 199, 128), dtype=torch.float32)
+    # xt, new_cache = rel_attention.forward(x, x, x, cache=cache)

-    xt, new_cache = rel_attention.forward(x, x, x, cache=cache)

     print(xt.shape)
     print(new_cache.shape)
     return
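The new lines index into self.relative_position_k via self.relative_position_encoding(k_length), but the diff does not show how those members are defined in __init__. Below is a minimal sketch of the usual construction, assuming a learned embedding table over relative distances clipped to max_relative_position; the class name RelativePositionSketch and all shapes here are assumptions for illustration, not code from this repository.

import torch
import torch.nn as nn


class RelativePositionSketch(nn.Module):
    """Hypothetical stand-in for the relative-position members referenced in the diff."""

    def __init__(self, d_k: int, max_relative_position: int = 5120):
        super().__init__()
        self.max_relative_position = max_relative_position
        # One learned vector per clipped relative distance; distances in
        # [-max_relative_position, +max_relative_position] map to rows [0, 2 * max_relative_position].
        self.relative_position_k = nn.Parameter(torch.randn(2 * max_relative_position + 1, d_k))

    def relative_position_encoding(self, length: int) -> torch.Tensor:
        # (length, length) matrix of relative distances (j - i), clipped and shifted
        # so it can be used directly as row indices into self.relative_position_k.
        positions = torch.arange(length)
        distance = positions[None, :] - positions[:, None]
        distance = torch.clamp(distance, -self.max_relative_position, self.max_relative_position)
        return distance + self.max_relative_position

Under that assumption, self.relative_position_k[relative_position.view(-1)].view(q_length, k_length, -1) in the new code yields a (q_length, k_length, d_k) tensor of relative key embeddings, and the relative_position[-q_length:] slice keeps only the rows for the current queries, which is what the streaming comment in the diff refers to.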
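The commented-out lines in main() point at the streaming path, where a single new frame attends over cached keys and values. The following is a rough usage sketch of that loop, assuming the attention module from the diff with n_feat=256 and n_head=4, so d_k = 64 and the cache's last dimension is d_k * 2 = 128, matching the (1, 4, 199, 128) cache in the comments. The helper name streaming_sketch is made up for illustration and is not part of the repository.

import torch


def streaming_sketch(attention, frames: torch.Tensor):
    # Feed frames one step at a time, carrying the (k, v) cache between calls.
    # `attention` is assumed to behave like the attention class in the diff:
    # forward(q, k, v, cache=...) -> (output, new_cache).
    total_steps = frames.size(1)
    cache = torch.zeros((0, 0, 0, 0))        # empty cache on the first step, as in the default argument
    outputs = []
    for t in range(total_steps):
        x = frames[:, t:t + 1, :]            # (batch, 1, n_feat), so q_length == 1
        y, cache = attention.forward(x, x, x, cache=cache)
        outputs.append(y)                    # cache now holds [batch, h, t + 1, d_k * 2]
    return torch.cat(outputs, dim=1), cache

Because k_length grows with the cache while q_length stays at 1, the relative_position[-q_length:] slice added in this commit is what keeps the relative-position lookup consistent between the one-shot call on x of shape (1, 200, 256) and this step-by-step variant.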