Sin2pi
/

asr-model

@@ -1,19 +1,17 @@
 import os
-import math
 import warnings
 import logging
 from itertools import chain
 import torch
-import torch.nn.functional as feature
 from torch import nn, Tensor
-from typing import Optional, Dict, Union, List, Tuple
 import numpy as np
-from functools import partial
 from datetime import datetime
 from transformers.trainer_seq2seq import Seq2SeqTrainer
 from transformers.training_args_seq2seq import Seq2SeqTrainingArguments
 from echoutils import *
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 dtype = torch.float32
 warnings.filterwarnings("ignore")
@@ -36,11 +34,18 @@ class rotary(nn.Module):
         self.head = head
         self.head_dim = dims // head
         self.theta = nn.Parameter((torch.tensor(10000, device=device, dtype=dtype)), requires_grad=True)
     def forward(self, x, ctx) -> Tensor:
-        freqs = (self.theta / 220.0) * 700 * (torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 8000/700)), self.head_dim // 2, device=device, dtype=dtype) / 2595) - 1) / 1000
-        t = torch.arange(ctx, device=device, dtype=dtype)
-        freqs = t[:, None] * freqs
         freqs=torch.polar(torch.ones_like(freqs), freqs)
         x1 = x[..., :freqs.shape[-1]*2]
         x2 = x[..., freqs.shape[-1]*2:]
         orig_shape = x1.shape
@@ -63,8 +68,8 @@ class attention(nn.Module):
         self.rope = rotary(dims=dims, head=head)
         self.lny = nn.LayerNorm(self.head_dim, bias = False)
         self.lnx = nn.LayerNorm(dims, bias = False)
     def forward(self, x: Tensor, xa = None, mask = None):
-        scale = (self.dims // self.head) ** -0.25
         q = self.q(self.lnx(x))
         k = self.k(self.lnx(x if xa is None else xa))
         v = self.v(self.lnx(x if xa is None else xa))
@@ -80,59 +85,58 @@ class attention(nn.Module):
 class tgate(nn.Module):
     def __init__(self, dims, num_types=4):
         super().__init__()
-        self.gate_projections = nn.ModuleList([
-            nn.Sequential(Linear(dims, 1), nn.Sigmoid())
-            for _ in range(num_types)])
-        self.type_classifier = nn.Sequential(
-            Linear(dims, num_types),
-            nn.Softmax(dim=-1))
     def forward(self, x):
-        type_probs = self.type_classifier(x)
-        gates = torch.stack([gate(x) for gate in self.gate_projections], dim=-1)
-        comb_gate = torch.sum(gates * type_probs.unsqueeze(2), dim=-1)
-        return comb_gate
 class Residual(nn.Module):
     _seen = set()
     def __init__(self, dims: int, head: int, act: str = "silu"):
         super().__init__()
-        act_fn = get_activation(act)
         self.blend = nn.Parameter(torch.tensor(0.5))
         self.attn = attention(dims, head)
-        self.mlp = nn.Sequential(Linear(dims, dims*4), act_fn, Linear(dims*4, dims))
         self.tgate = tgate(dims=dims, num_types=4*2)
-        self.lna = nn.LayerNorm(dims, bias = False)
     def forward(self, x, xa=None, mask=None) -> Tensor:
-        xb = x + self.attn(self.lna(x), xa=None, mask=mask)[0]
         if xa is not None:
-            x = x + self.attn(self.lna(x), xa=xa, mask=None)[0]
             b = torch.sigmoid(self.blend)
-            x = b * xb + (1 - b) * x
-        out = self.mlp(self.lna(x))
-        gate = self.tgate(self.lna(x))
         x = x + gate * out
         return x
 class processor(nn.Module):
     def __init__(self, vocab: int, mels: int, ctx: int, dims: int, head: int, layer: int, act: str = "gelu"):
         super(processor, self).__init__()
-        act_fn = get_activation(act)
         self.token = nn.Embedding(vocab, dims, device=device, dtype=dtype)
         self.positional = nn.Parameter(torch.empty(ctx, dims, device=device, dtype=dtype), requires_grad=True)
-        self.blend = nn.Parameter(torch.tensor(0.5, device=device, dtype=dtype), requires_grad=True)
         self.positional_sin = lambda length, dims, max_tscale: sinusoids(length, dims, max_tscale)
         self.encoder = nn.Sequential(
             Conv1d(1, dims, kernel_size=3, stride=1, padding=1), act_fn,
             Conv1d(dims, dims, kernel_size=3, stride=1, padding=1), act_fn,
             Conv1d(dims, dims, kernel_size=3, stride=1, padding=1, groups=dims), act_fn)
         self.bA = nn.ModuleList([Residual(dims=dims, head=head, act=act_fn) for _ in range(layer)])
         self.bB = nn.ModuleList([Residual(dims=dims, head=head, act=act_fn) for _ in range(layer)])
         mask = torch.empty(ctx, ctx).fill_(-np.inf).triu_(1)
         self.register_buffer("mask", mask, persistent=False)
-        self.norm = nn.LayerNorm(dims, device=device, dtype=dtype)
     def forward(self, x, xa) -> Tensor:
-        x = self.token(x.long()) + self.positional[:x.shape[1]]
         xa = self.encoder(xa).permute(0, 2, 1)
         xa = xa + self.positional_sin(xa.shape[1], xa.shape[-1], 10000.0).to(device, dtype)
         for b in chain(self.bA or []):
@@ -141,7 +145,7 @@ class processor(nn.Module):
             x = b(x=x, xa=None, mask=self.mask)
             x = b(x, xa=xa, mask=None)
         x = nn.functional.dropout(x, p=0.001, training=self.training)
-        x = self.norm(x)
         x = x @ torch.transpose(self.token.weight.to(dtype), 0, 1).float()
         return x
@@ -149,7 +153,6 @@ class Model(nn.Module):
     def __init__(self, param: Dimensions):
         super().__init__()
         self.param = param
         self.processor = processor(
             vocab=param.vocab,
             mels=param.mels,
@@ -157,14 +160,12 @@ class Model(nn.Module):
             dims=param.dims,
             head=param.head,
             layer=param.layer,
-            act=param.act,
-            )
     def forward(self,
         labels=None, input_ids=None, pitch: Optional[torch.Tensor]=None) -> Dict[str, Optional[torch.Tensor]]:
-        if pitch is not None:
-            xa = pitch
         x = input_ids
         logits = self.processor(x, xa)
         loss = None
         if labels is not None:
@@ -210,92 +211,3 @@ class Model(nn.Module):
             if count > 0:
                 print(f"{module_type}: {count}")
-def main():
-    token = ""
-    log_dir = os.path.join('D:/newmodel/output/logs', datetime.now().strftime('%m-%d_%H_%M_%S'))
-    os.makedirs(log_dir, exist_ok=True)
-    tokenizer = setup_tokenizer("D:/newmodel/mod5/tokenizer.json")
-    extract_args = {
-        "waveform": False,
-        "spec": False,
-        "f0": False,
-        "f0t": False,
-        "pitch": True,
-        "harmonics": False,
-        "aperiodics": False,
-        "phase_mod": False,
-        "crepe": False,
-        "sample_rate": 16000,
-        "hop_length": 256,
-        "mode": "mean",
-        "debug": False,
-    }
-    param = Dimensions(
-        vocab=40000,
-        mels=128,
-        ctx=2048,
-        dims=512,
-        head=4,
-        layer=4,
-        act="swish",
-        )
-    train_dataset, test_dataset = prepare_datasets(tokenizer, token, sanity_check=False, sample_rate=16000, streaming=False,
-        load_saved=False, save_dataset=False, cache_dir=None, extract_args=extract_args, max_ctx=param.ctx)
-    model = Model(param).to('cuda')
-    print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
-    print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")
-    from functools import partial
-    metrics_fn = partial(compute_metrics, print_pred=True, num_samples=1, tokenizer=tokenizer, model=model)
-    training_args = Seq2SeqTrainingArguments(
-        output_dir=log_dir,
-        per_device_train_batch_size=1,
-        per_device_eval_batch_size=1,
-        max_steps=1000,
-        eval_steps=100,
-        save_steps=1000,
-        warmup_steps=100,
-        logging_steps=10,
-        logging_dir=log_dir,
-        logging_strategy="steps",
-        eval_strategy="steps",
-        save_strategy="no",
-        report_to=["tensorboard"],
-        push_to_hub=False,
-        save_total_limit=1,
-        label_names=["labels"],
-        save_safetensors=False,
-        eval_on_start=False,
-        batch_eval_metrics=False,
-        disable_tqdm=False,
-        include_tokens_per_second=True,
-        include_num_input_tokens_seen=True,
-        learning_rate=0.00025,
-        weight_decay=0.025,
-    )
-    optimizer = torch.optim.AdamW(model.parameters(), lr=training_args.learning_rate, eps=1e-8, weight_decay=training_args.weight_decay, betas=(0.9, 0.999),
-    amsgrad=False, foreach=False, fused=False, capturable=False, differentiable=False, maximize=False)
-    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=training_args.max_steps, eta_min=1e-9, last_epoch=-1)
-    trainer = Seq2SeqTrainer(
-        args=training_args,
-        model=model,
-        train_dataset=train_dataset,
-        eval_dataset=test_dataset,
-        data_collator=DataCollator(tokenizer=tokenizer),
-        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
-        compute_metrics=metrics_fn,
-        optimizers=(optimizer, scheduler)
-    )
-    model.init_weights()
-    trainer.train()
-if __name__ == "__main__":
-    main()

 import os
 import warnings
 import logging
 from itertools import chain
 import torch
 from torch import nn, Tensor
+from typing import Optional, Dict
 import numpy as np
 from datetime import datetime
+from dataclasses import dataclass
 from transformers.trainer_seq2seq import Seq2SeqTrainer
 from transformers.training_args_seq2seq import Seq2SeqTrainingArguments
+from torch.nn.functional import scaled_dot_product_attention
 from echoutils import *
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 dtype = torch.float32
 warnings.filterwarnings("ignore")
         self.head = head
         self.head_dim = dims // head
         self.theta = nn.Parameter((torch.tensor(10000, device=device, dtype=dtype)), requires_grad=True)
+        self.register_buffer('freqs_base', self._compute_freqs_base(), persistent=False)
+    def _compute_freqs_base(self):
+        mel_scale = torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 4000/200)), self.head_dim // 2, device=device, dtype=dtype) / 2595) - 1
+        return 200 * mel_scale / 1000
     def forward(self, x, ctx) -> Tensor:
+        freqs = (self.theta / 220.0) * self.freqs_base
+        pos = torch.arange(ctx, device=device, dtype=dtype)
+        freqs = pos[:, None] * freqs
         freqs=torch.polar(torch.ones_like(freqs), freqs)
         x1 = x[..., :freqs.shape[-1]*2]
         x2 = x[..., freqs.shape[-1]*2:]
         orig_shape = x1.shape
         self.rope = rotary(dims=dims, head=head)
         self.lny = nn.LayerNorm(self.head_dim, bias = False)
         self.lnx = nn.LayerNorm(dims, bias = False)
     def forward(self, x: Tensor, xa = None, mask = None):
         q = self.q(self.lnx(x))
         k = self.k(self.lnx(x if xa is None else xa))
         v = self.v(self.lnx(x if xa is None else xa))
 class tgate(nn.Module):
     def __init__(self, dims, num_types=4):
         super().__init__()
+        self.gates = nn.ModuleList([nn.Sequential(Linear(dims, 1), nn.Sigmoid()) for _ in range(num_types)])
+        self.classifier = nn.Sequential(Linear(dims, num_types), nn.Softmax(dim=-1))
     def forward(self, x):
+        types = self.classifier(x)
+        gates = torch.stack([gate(x) for gate in self.gates], dim=-1)
+        cgate = torch.sum(gates * types.unsqueeze(2), dim=-1)
+        return cgate
 class Residual(nn.Module):
     _seen = set()
     def __init__(self, dims: int, head: int, act: str = "silu"):
         super().__init__()
+        self.ln = nn.LayerNorm(dims, bias = False)
         self.blend = nn.Parameter(torch.tensor(0.5))
         self.attn = attention(dims, head)
+        self.mlp = nn.Sequential(Linear(dims, dims*4), get_activation(act), Linear(dims*4, dims))
         self.tgate = tgate(dims=dims, num_types=4*2)
     def forward(self, x, xa=None, mask=None) -> Tensor:
+        xb = x + self.attn(self.ln(x), xa=None, mask=mask)
         if xa is not None:
+            x = x + self.attn(self.ln(x), xa=xa, mask=None)
             b = torch.sigmoid(self.blend)
+            x = b * xb + (1 - b) * x
+        out = self.mlp(self.ln(x))
+        gate = self.tgate(self.ln(x))
         x = x + gate * out
         return x
 class processor(nn.Module):
     def __init__(self, vocab: int, mels: int, ctx: int, dims: int, head: int, layer: int, act: str = "gelu"):
         super(processor, self).__init__()
+        self.ln = nn.LayerNorm(dims, device=device, dtype=dtype)
+        self.blend = nn.Parameter(torch.tensor(0.5, device=device, dtype=dtype), requires_grad=True)
         self.token = nn.Embedding(vocab, dims, device=device, dtype=dtype)
         self.positional = nn.Parameter(torch.empty(ctx, dims, device=device, dtype=dtype), requires_grad=True)
         self.positional_sin = lambda length, dims, max_tscale: sinusoids(length, dims, max_tscale)
+        act_fn = get_activation(act)
         self.encoder = nn.Sequential(
             Conv1d(1, dims, kernel_size=3, stride=1, padding=1), act_fn,
             Conv1d(dims, dims, kernel_size=3, stride=1, padding=1), act_fn,
             Conv1d(dims, dims, kernel_size=3, stride=1, padding=1, groups=dims), act_fn)
         self.bA = nn.ModuleList([Residual(dims=dims, head=head, act=act_fn) for _ in range(layer)])
         self.bB = nn.ModuleList([Residual(dims=dims, head=head, act=act_fn) for _ in range(layer)])
         mask = torch.empty(ctx, ctx).fill_(-np.inf).triu_(1)
         self.register_buffer("mask", mask, persistent=False)
     def forward(self, x, xa) -> Tensor:
+        x = self.token(x.long()) + self.positional[:x.shape[1]]
         xa = self.encoder(xa).permute(0, 2, 1)
         xa = xa + self.positional_sin(xa.shape[1], xa.shape[-1], 10000.0).to(device, dtype)
         for b in chain(self.bA or []):
             x = b(x=x, xa=None, mask=self.mask)
             x = b(x, xa=xa, mask=None)
         x = nn.functional.dropout(x, p=0.001, training=self.training)
+        x = self.ln(x)
         x = x @ torch.transpose(self.token.weight.to(dtype), 0, 1).float()
         return x
     def __init__(self, param: Dimensions):
         super().__init__()
         self.param = param
         self.processor = processor(
             vocab=param.vocab,
             mels=param.mels,
             dims=param.dims,
             head=param.head,
             layer=param.layer,
+            act=param.act)
     def forward(self,
         labels=None, input_ids=None, pitch: Optional[torch.Tensor]=None) -> Dict[str, Optional[torch.Tensor]]:
         x = input_ids
+        xa = pitch if pitch is not None else torch.zeros(1, 1, self.param.mels, device=device, dtype=dtype)
         logits = self.processor(x, xa)
         loss = None
         if labels is not None:
             if count > 0:
                 print(f"{module_type}: {count}")