Update model_simple.py
model_simple.py  CHANGED  (+9 −99)
@@ -11,7 +11,7 @@ from dataclasses import dataclass
 from transformers.trainer_seq2seq import Seq2SeqTrainer
 from transformers.training_args_seq2seq import Seq2SeqTrainingArguments
 from torch.nn.functional import scaled_dot_product_attention
-
+
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 dtype = torch.float32
 warnings.filterwarnings("ignore")
@@ -106,7 +106,7 @@ class LocalAttentionModule(nn.Module):
 
 class attentiona(nn.Module):
     def __init__(self, dims: int, head: int, max_iters: int = 3, threshold: float = 0.01, factor: float = 0.1, dropout: float = 0.1):
-        super(
+        super(attentiona, self).__init__()
 
         self.q, self.k, self.v, self.o, self.lna, self.lnb = qkv_init(dims, head)
         self.dims = dims
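The truncated super( line was the real defect in this hunk: nn.Module.__init__() has to run before any submodules are assigned, otherwise PyTorch's attribute bookkeeping does not exist yet. A minimal, self-contained illustration of the failure and the fix (class names here are illustrative, not the ones in model_simple.py):

import torch
import torch.nn as nn

class Broken(nn.Module):
    def __init__(self):
        # super().__init__() deliberately missing, as in the old truncated line
        self.proj = nn.Linear(4, 4)

class Fixed(nn.Module):
    def __init__(self):
        super().__init__()           # sets up _parameters/_modules before any assignment
        self.proj = nn.Linear(4, 4)  # now registered and visible to .parameters()

try:
    Broken()
except AttributeError as err:
    print("Broken:", err)            # "cannot assign module before Module.__init__() call"

print("Fixed params:", sum(p.numel() for p in Fixed().parameters()))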
@@ -122,9 +122,8 @@ class attentiona(nn.Module):
 
     def _focus(self, x: Tensor, xa: Optional[Tensor] = None, mask: Optional[Tensor] = None):
         z = default(xa, x)
+
         q, k, v = create_qkv(self.dims, self.head, self.q, self.k, self.v, self.lna(x), self.lna(z))
-        # q=self.lnb(q)
-        # k=self.lnb(k)
         iteration = 0
         prev_attn = torch.zeros_like(q)
         attn_out = torch.zeros_like(q)
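For orientation only: the names kept as context here (iteration, prev_attn, max_iters, threshold, factor) suggest _focus runs scaled dot-product attention inside a convergence loop. The sketch below is a guess at that general shape, not the actual body of _focus; the feedback rule and the stopping test are assumptions.

import torch
from torch.nn.functional import scaled_dot_product_attention

def iterative_attention(q, k, v, max_iters=3, threshold=0.01, factor=0.1):
    # Hypothetical refinement loop: rerun attention, feeding back a scaled copy of
    # the previous output, until the result stops changing or max_iters is reached.
    prev_attn = torch.zeros_like(q)
    attn_out = torch.zeros_like(q)
    for iteration in range(max_iters):
        attn_out = scaled_dot_product_attention(q + factor * prev_attn, k, v)
        if torch.norm(attn_out - prev_attn) < threshold:
            break
        prev_attn = attn_out
    return attn_out

q = k = v = torch.randn(1, 4, 16, 32)   # (batch, head, ctx, head_dim)
print(iterative_attention(q, k, v).shape)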
@@ -231,6 +230,7 @@ class attentiona(nn.Module):
 class attentionb(nn.Module):
     def __init__(self, dims: int, head: int):
         super(attentionb, self).__init__()
+
         self.q, self.k, self.v, self.o, self.lna, self.lnb = qkv_init(dims, head)
         self.dims = dims
         self.head = head
@@ -344,7 +344,7 @@ class Model(nn.Module):
     def _init_weights(self, module):
         self.init_counts = {
             "Linear": 0, "Conv1d": 0, "LayerNorm": 0, "RMSNorm": 0,
-            "Conv2d": 0, "processor": 0, "
+            "Conv2d": 0, "processor": 0, "attentiona": 0, "attentionb": 0, "Residual": 0}
         for name, module in self.named_modules():
             if isinstance(module, RMSNorm):
                 nn.init.ones_(module.weight)
@@ -365,11 +365,10 @@ class Model(nn.Module):
                 if module.bias is not None:
                     nn.init.zeros_(module.bias)
                 self.init_counts["Conv2d"] += 1
-            elif isinstance(module,
-                self.init_counts["
-            elif isinstance(module,
-                self.init_counts["
-            elif isinstance(module, processor):
+            elif isinstance(module, attentiona):
+                self.init_counts["attentiona"] += 1
+            elif isinstance(module, attentionb):
+                self.init_counts["attentionb"] += 1
                 self.init_counts["processor"] += 1
 
     def init_weights(self):
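The repaired isinstance chain is easier to check in isolation: the counting half of it is just a walk over named_modules() keyed by module type. A minimal sketch of the same bookkeeping on a toy model (not the project's Model class, and without the per-type initialization):

import torch.nn as nn
from collections import Counter

def count_module_types(model: nn.Module) -> Counter:
    # Tally every submodule by class name, mirroring the init_counts bookkeeping
    # in Model._init_weights.
    counts = Counter()
    for name, module in model.named_modules():
        counts[type(module).__name__] += 1
    return counts

toy = nn.Sequential(nn.Linear(8, 8), nn.LayerNorm(8), nn.Conv1d(4, 4, 3))
for module_type, count in count_module_types(toy).items():
    if count > 0:
        print(f"{module_type}: {count}")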
@@ -380,92 +379,3 @@ class Model(nn.Module):
             if count > 0:
                 print(f"{module_type}: {count}")
 
-def main():
-    token = ""
-    log_dir = os.path.join('D:/newmodel/output/logs/', datetime.now().strftime('%m-%d_%H_%M_%S'))
-    os.makedirs(log_dir, exist_ok=True)
-    tokenizer = setup_tokenizer("D:/newmodel/mod5/tokenizer.json")
-
-    extract_args = {
-        "waveform": False,
-        "spec": False,
-        "f0": False,
-        "f0t": False,
-        "pitch": True,
-        "harmonics": False,
-        "aperiodics": False,
-        "phase_mod": False,
-        "crepe": False,
-        "sample_rate": 16000,
-        "hop_length": 256,
-        "mode": "mean",
-        "debug": False,
-    }
-
-    param = Dimensions(
-        vocab=40000,
-        mels=128,
-        ctx=2048,
-        dims=512,
-        head=4,
-        layer=4,
-        act="swish",
-    )
-
-    train_dataset, test_dataset = prepare_datasets(tokenizer, token, sanity_check=False, sample_rate=16000, streaming=False,
-        load_saved=False, save_dataset=False, cache_dir=None, extract_args=extract_args, max_ctx=param.ctx)
-
-    model = Model(param).to('cuda')
-    print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
-    print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")
-
-    from functools import partial
-    metrics_fn = partial(compute_metrics, print_pred=True, num_samples=1, tokenizer=tokenizer, model=model)
-
-    training_args = Seq2SeqTrainingArguments(
-        output_dir=log_dir,
-        per_device_train_batch_size=1,
-        per_device_eval_batch_size=1,
-        max_steps=1000,
-        eval_steps=100,
-        save_steps=1000,
-        warmup_steps=100,
-        logging_steps=10,
-        logging_dir=log_dir,
-        logging_strategy="steps",
-        eval_strategy="steps",
-        save_strategy="no",
-        report_to=["tensorboard"],
-        push_to_hub=False,
-        save_total_limit=1,
-        label_names=["labels"],
-        save_safetensors=False,
-        eval_on_start=False,
-        batch_eval_metrics=False,
-        disable_tqdm=False,
-        include_tokens_per_second=True,
-        include_num_input_tokens_seen=True,
-        learning_rate=0.00025,
-        weight_decay=0.025,
-    )
-
-    optimizer = torch.optim.AdamW(model.parameters(), lr=training_args.learning_rate, eps=1e-8, weight_decay=training_args.weight_decay, betas=(0.9, 0.999),
-        amsgrad=False, foreach=False, fused=False, capturable=False, differentiable=False, maximize=False)
-    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=training_args.max_steps, eta_min=1e-9, last_epoch=-1)
-
-    trainer = Seq2SeqTrainer(
-        args=training_args,
-        model=model,
-        train_dataset=train_dataset,
-        eval_dataset=test_dataset,
-        data_collator=DataCollator(tokenizer=tokenizer),
-        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
-        compute_metrics=metrics_fn,
-        optimizers=(optimizer, scheduler)
-    )
-
-    model.init_weights()
-    trainer.train()
-if __name__ == "__main__":
-
-    main()
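With main() deleted, model_simple.py no longer trains anything on its own. If the removed driver is still wanted, a trimmed external script could carry it instead. The sketch below is reconstructed from the deleted code; it assumes Model, Dimensions, setup_tokenizer, prepare_datasets, and DataCollator are importable from model_simple, drops the machine-specific D:/ paths, and is not part of this commit.

# train.py -- hypothetical external driver, reconstructed from the deleted main()
import torch
from transformers.trainer_seq2seq import Seq2SeqTrainer
from transformers.training_args_seq2seq import Seq2SeqTrainingArguments
from model_simple import Model, Dimensions, setup_tokenizer, prepare_datasets, DataCollator  # assumed exports

def main():
    tokenizer = setup_tokenizer("tokenizer.json")   # path is illustrative
    param = Dimensions(vocab=40000, mels=128, ctx=2048, dims=512, head=4, layer=4, act="swish")
    # prepare_datasets took many more knobs in the deleted code; trimmed here for brevity.
    train_dataset, test_dataset = prepare_datasets(tokenizer, "", max_ctx=param.ctx)

    model = Model(param).to("cuda" if torch.cuda.is_available() else "cpu")
    model.init_weights()

    training_args = Seq2SeqTrainingArguments(
        output_dir="./output",
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        max_steps=1000,
        eval_strategy="steps",
        eval_steps=100,
        logging_steps=10,
        learning_rate=2.5e-4,
        weight_decay=0.025,
        save_strategy="no",
        label_names=["labels"],
        report_to=["tensorboard"],
    )
    trainer = Seq2SeqTrainer(
        args=training_args,
        model=model,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        data_collator=DataCollator(tokenizer=tokenizer),
    )
    trainer.train()

if __name__ == "__main__":
    main()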