Update model_simple.py
model_simple.py (CHANGED, +68 -68)
@@ -9,7 +9,7 @@ import numpy as np
 from datetime import datetime
 from dataclasses import dataclass
 from torch.nn.functional import scaled_dot_product_attention
-
+from echoutils import *
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 dtype = torch.float32
 warnings.filterwarnings("ignore")
@@ -31,7 +31,7 @@ class rotary(nn.Module):
         self.dims = dims
         self.head = head
         self.head_dim = dims // head
-        self.theta = nn.Parameter((torch.tensor(
+        self.theta = nn.Parameter((torch.tensor(36000, device=device, dtype=dtype)), requires_grad=True)
         self.register_buffer('freqs_base', self._compute_freqs_base(), persistent=False)

     def _compute_freqs_base(self):
@@ -53,6 +53,16 @@ class rotary(nn.Module):
         x1 = x1.view(orig_shape)
         return torch.cat([x1.type_as(x), x2], dim=-1)

+def shape(dims, head, q, k, v):
+    head_dim = dims // head
+    scale = head_dim ** -0.25
+    q = q * scale
+    k = k * scale
+    v = v
+    def _shape(tensor):
+        return tensor.view(*tensor.shape[:2], head, -1).permute(0, 2, 1, 3).contiguous()
+    return _shape(q), _shape(k), _shape(v)
+
 def qkv_init(dims: int, head: int):
     head_dim = dims // head
     q = nn.Linear(dims, dims)
@@ -60,60 +70,51 @@ def qkv_init(dims: int, head: int):
     v = nn.Linear(dims, dims)
     o = nn.Linear(dims, dims)
     lna = nn.LayerNorm(dims, bias=False)
-    lnb = nn.LayerNorm(
-
-
-
-    head_dim = dims // head
-    scale = head_dim ** -0.25
-    q = q(x) * scale
-    k = k(xa) * scale
-    v = v(xa)
-    batch, ctx, dims = x.shape
-    def _shape(tensor):
-        return tensor.view(batch, ctx, head, head_dim).transpose(1, 2).contiguous()
-    return _shape(q), _shape(k), _shape(v)
+    lnb = nn.LayerNorm(dims, bias=False)
+    lnc = nn.LayerNorm(head_dim, bias=False)
+    lnd = nn.LayerNorm(head_dim, bias=False)
+    return q, k, v, o, lna, lnb, lnc, lnd

-def calculate_attention(q, k, v, mask=None,
+def calculate_attention(q, k, v, mask=None, temp=1.0):
     scaled_q = q
-    if
-    scaled_q = q * (1.0 /
-
+    if temp != 1.0 and temp > 0:
+        scaled_q = q * (1.0 / temp)**.5
     out = scaled_dot_product_attention(scaled_q, k, v, is_causal=mask is not None and q.shape[1] > 1)
     return out

 class LocalOut(nn.Module):
-    def __init__(self,
+    def __init__(self, dims: int, head: int):
         super().__init__()
-
+        head_dim = dims // head
         self.query_module = nn.Linear(head_dim, head_dim)
         self.key_module = nn.Linear(head_dim, head_dim)
         self.value_module = nn.Linear(head_dim, head_dim)
         self.out_proj = nn.Linear(head_dim, head_dim)
-
-    def _reshape_to_output(self, x):
-        return x

-
-
-
-
+    def _reshape_to_output(self, attn_output: Tensor) -> Tensor:
+        batch, _, ctx, _ = attn_output.shape
+        return attn_output.transpose(1, 2).contiguous().view(batch, ctx, self.dims)
+
+class attentionb(nn.Module):
+    def __init__(self, dims: int, head: int, max_iter: int = 3, threshold: float = 0.01, factor: float = 0.1, dropout: float = 0.1, temp = 1.0):
+        super(attentionb, self).__init__()
+        self.q, self.k, self.v, self.o, self.lna, self.lnb, self.lnc, self.lnd = qkv_init(dims, head)
         self.dims = dims
         self.head = head
-        self.head_dim = dims // head
         self.max_iter = max_iter
         self.threshold = nn.Parameter(torch.tensor(threshold))
+        self.temp = nn.Parameter(torch.tensor(temp), requires_grad=True)
         self.factor = nn.Parameter(torch.tensor(factor))
-        self.
-        self.lnc = nn.LayerNorm(self.head_dim, bias=False)
-        self.lnd = nn.LayerNorm(self.head_dim, bias=False)
-        self.attn_local = LocalOut(self.head_dim)
+        self.alocal = LocalOut(dims, head)

     def _focus(self, x: Tensor, xa: Optional[Tensor] = None, mask: Optional[Tensor] = None):
-
-
+        q = self.q(self.lna(x))
+        k = self.k(self.lnb(x if xa is None else xa))
+        v = self.v(self.lnb(x if xa is None else xa))
+        q, k, v = shape(self.dims, self.head, q, k, v)

         iteration = 0
+        temp = self.temp.item()
         prev_out = torch.zeros_like(q)
         attn_out = torch.zeros_like(q)
         threshold = self.threshold.item()
@@ -121,7 +122,7 @@ class attentiona(nn.Module):
         qcur = q

         while iteration < self.max_iter:
-            eff_span = min(
+            eff_span = min(qcur.shape[1], k.shape[1])
             if xa is not None:
                 eff_span = min(eff_span, xa.shape[1])
             if eff_span == 0:
@@ -130,9 +131,9 @@ class attentiona(nn.Module):
             qiter = qcur[:, :, :eff_span, :]
             kiter = k[:, :, :eff_span, :]
             viter = v[:, :, :eff_span, :]
-            q = self.
-            k = self.
-            v = self.
+            q = self.alocal.query_module(qiter)
+            k = self.alocal.key_module(kiter)
+            v = self.alocal.value_module(viter)

             iter_mask = None
             if mask is not None:
@@ -143,7 +144,7 @@ class attentiona(nn.Module):

             attn_iter = calculate_attention(
                 self.lnc(q), self.lnd(k), v,
-                mask=iter_mask)
+                mask=iter_mask, temp=temp)

             iter_out = torch.zeros_like(qcur)
             iter_out[:, :, :eff_span, :] = attn_iter
@@ -157,21 +158,22 @@ class attentiona(nn.Module):
             qcur = qcur + iter_out
             attn_out = iter_out
             iteration += 1
+            temp += 0.005

         output = attn_out.permute(0, 2, 1, 3).flatten(start_dim=2)
         return self.o(output), None

     def _slide_win_local(self, x: Tensor, win_size: int, span_len: int, mask: Optional[Tensor] = None) -> Tensor:

-        batch, ctx, dims = x.
+        batch, ctx, dims = x.shape
         output = torch.zeros_like(x)
         num_win = (ctx + win_size - 1) // win_size

         for i in range(num_win):
             qstart = i * win_size
             qend = min(qstart + win_size, ctx)
-
-            if
+            win_qlen = qend - qstart
+            if win_qlen == 0:
                 continue

             kstart = max(0, qend - span_len)
@@ -186,10 +188,7 @@ class attentiona(nn.Module):
             elif mask.dim() == 2:
                 win_mask = mask[qstart:qend, kstart:kend]

-            attn_out, _ = self._focus(
-                x=qwin,
-                xa=kwin,
-                mask=win_mask)
+            attn_out, _ = self._focus(x=qwin, xa=kwin, mask=win_mask)
             output[:, qstart:qend, :] = attn_out
         return output

@@ -201,21 +200,21 @@ class attentiona(nn.Module):
         output, _ = self._focus(x, xa, mask)
         return output

-class
+class attentiona(nn.Module):
     def __init__(self, dims: int, head: int):
-        super(
-        self.q, self.k, self.v, self.o, self.lna, self.lnb = qkv_init(dims, head)
+        super(attentiona, self).__init__()
+        self.q, self.k, self.v, self.o, self.lna, self.lnb, self.lnc, self.lnd = qkv_init(dims, head)
         self.dims = dims
         self.head = head
-        self.
-        self.rope = rotary(dims=dims, head=head)
-
+        self.rope = rotary(dims=dims, head=head)
     def forward(self, x: Tensor, xa = None, mask = None):
-
-
+        q = self.q(self.lna(x))
+        k = self.k(self.lnb(x if xa is None else xa))
+        v = self.v(self.lnb(x if xa is None else xa))
+        q, k, v = shape(self.dims, self.head, q, k, v)
         q = self.rope(q, q.shape[2])
         k = self.rope(k, k.shape[2])
-        a = scaled_dot_product_attention(self.
+        a = scaled_dot_product_attention(self.lnc(q), self.lnd(k), v, is_causal=mask is not None and q.shape[1] > 1)
         out = a.permute(0, 2, 1, 3).flatten(start_dim=2)
         return self.o(out)

@@ -224,14 +223,15 @@ class Residual(nn.Module):
         super().__init__()

         self.lna = nn.LayerNorm(dims, bias=False)
-        self.
-        self.
+        self.attna = attentiona(dims, head)
+        self.attnb = attentionb(dims, head, max_iter=3)
         self.mlp = nn.Sequential(Linear(dims, dims*4), get_activation(act), Linear(dims*4, dims))

     def forward(self, x, xa = None, mask = None) -> Tensor:
-        x = x + self.
+        x = x + self.attna(self.lna(x), mask=mask)
         if xa is not None:
-            x = x + self.attna(self.lna(x), xa, None
+            x = x + self.attna(self.lna(x), xa, mask=None)
+            x = x + self.attnb(self.lna(x), xa, mask=None, use_sliding_win=True, win_size=256, span_len=512)
         x = x + self.mlp(self.lna(x))
         return x

@@ -239,8 +239,9 @@ class processor(nn.Module):
     def __init__(self, vocab: int, mels: int, ctx: int, dims: int, head: int, layer: int, act: str = "gelu"):
         super(processor, self).__init__()

-        self.
-        self.
+        self.lna = nn.LayerNorm(dims)
+        self.lnb = nn.LayerNorm(dims)
+        self.lnc = nn.LayerNorm(dims)
         self.token_emb = nn.Embedding(vocab, dims)
         self.positions = nn.Parameter(torch.empty(ctx, dims), requires_grad=True)
         self.audio_emb = lambda length, dims, max_tscale: sinusoids(length, dims, max_tscale)
@@ -252,7 +253,7 @@ class processor(nn.Module):
             Conv1d(dims, dims, kernel_size=3, stride=1, padding=1, groups=dims), act_fn)

         self.bA = nn.ModuleList([Residual(dims, head, act_fn) for _ in range(layer)])
-
+
         mask = torch.empty(ctx, ctx).fill_(-np.inf).triu_(1)
         self.register_buffer("mask", mask, persistent=False)

@@ -263,14 +264,13 @@ class processor(nn.Module):
         xa = xa + self.audio_emb(xa.shape[1], xa.shape[-1], 36000.0).to(device, dtype)

         for b in chain(self.bA or []):
-            xa = b(xa
-            x
-
-            xc = b(torch.cat([x, xa], dim=1), xa=xa, mask=self.mask) if modal else None
+            xa = b(self.lna(xa))
+            x = b(self.lnb(x), xa=xa, mask=self.mask)
+            xc = b(torch.cat([x, xa], dim=1), xa=None, mask=self.mask) if modal else None
             x = b(x=xc[:, :x.shape[1]], xa=xc[:, x.shape[1]:], mask=None) if modal else x

         x = nn.functional.dropout(x, p=0.001, training=self.training)
-        x = self.
+        x = self.lnc(x)
         x = x @ torch.transpose(self.token_emb.weight.to(dtype), 0, 1).float()
         return x
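A minimal smoke test for the helpers introduced in this commit. This is a sketch, not part of the commit: it assumes the updated model_simple.py (and its echoutils dependency) imports cleanly, and the dims/head/batch/ctx values are illustrative only.

# Exercise the new shape() and calculate_attention() helpers with dummy tensors.
import torch
from model_simple import shape, calculate_attention, qkv_init

dims, head, batch, ctx = 64, 4, 2, 16
x = torch.randn(batch, ctx, dims)

# shape() scales q/k by head_dim ** -0.25 and splits heads:
# (batch, ctx, dims) -> (batch, head, ctx, head_dim)
q, k, v = shape(dims, head, x, x, x)

# calculate_attention() wraps scaled_dot_product_attention; temp rescales q.
out = calculate_attention(q, k, v, mask=None, temp=1.0)
print(out.shape)  # torch.Size([2, 4, 16, 16])

# qkv_init() now also returns the four LayerNorms used by the attention blocks.
q_lin, k_lin, v_lin, o_lin, lna, lnb, lnc, lnd = qkv_init(dims, head)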