Dionyssos committed
Commit b399825 · 1 Parent(s): 86b9ce4
api.py CHANGED
@@ -2,7 +2,6 @@
2
  # -*- coding: utf-8 -*-
3
  import numpy as np
4
  import soundfile
5
- import audresample
6
  from Utils.text_utils import split_into_sentences
7
  import msinference
8
  import re
@@ -15,10 +14,12 @@ from flask import Flask, request, send_from_directory
15
  from moviepy.video.io.VideoFileClip import VideoFileClip
16
  from moviepy.video.VideoClip import ImageClip
17
  from audiocraft.builders import AudioGen
18
- CACHE_DIR = 'flask_cache/'
19
- NUM_SOUND_GENERATIONS = 3 # batch size to generate same text (same soundscape for long video)
20
 
21
- sound_generator = AudioGen(duration=4.74, device='cuda:0').to('cuda:0').eval()
22
 
23
  Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
24
 
@@ -57,62 +58,17 @@ def _resize(image, width=None, height=None, inter=cv2.INTER_AREA):
57
  # return the resized image
58
  return resized
59
 
60
-
61
-
62
- def _shift(x):
63
- n = x.shape[0]
64
- i = np.random.randint(.24 * n, max(1, .74 * n)) # high should be above >= 0
65
- x = np.roll(x, i)
66
- # we can add the one or fade it and then amplify
67
- # the audio is so short 6s that is difficult to not hear the shift somewhere
68
- # Just concatenate - raw - and then shift - the longconcat audio - many times may fix it
69
- # fade_in = 1 - .5 * np.tanh(-4*(np.linspace(-10, 10, n) - 9.4)) + .5 * np.tanh(4*(np.linspace(-10, 10, n) + 9.4))
70
- return x #* fade_in # silence this
71
-
72
  def overlay(x, soundscape=None):
73
-
74
  if soundscape is not None:
75
-
76
- # SOUNDS
77
-
78
- background = sound_generator.generate(
79
- [soundscape] * NUM_SOUND_GENERATIONS
80
- ).reshape(-1).detach().cpu().numpy() # bs, 11400 @.74s
81
-
82
- # upsample 16 kHz AudioGen to 24kHZ of VITS/StyleTTS2
83
-
84
- print('Resampling') # soundscape each generation in batch differs from the other generations thus clone/shift each element in batch, finally concat w/o shift
85
-
86
-
87
- background = audresample.resample(
88
- background,
89
- original_rate=16000, # sound_generator.sample_rate,
90
- target_rate=24000)[0, :-250] # last samples have splash sounds DISCARD 25000 last samples
91
- n_repeat = len(x) // background.shape[0] + 1
104
-
105
- total = np.tile(background, n_repeat)
106
-
107
- # less periodic
108
-
109
- for _ in range(4):
110
- total = _shift(total)
111
-
112
- # amplify sounds full [-1,1]
113
-
114
- total /= np.abs(total).max() + 1e-7
115
- x = .5 * x + .5 * total[:len(x)]
116
 
117
  else:
118
 
 
2
  # -*- coding: utf-8 -*-
3
  import numpy as np
4
  import soundfile
 
5
  from Utils.text_utils import split_into_sentences
6
  import msinference
7
  import re
 
14
  from moviepy.video.io.VideoFileClip import VideoFileClip
15
  from moviepy.video.VideoClip import ImageClip
16
  from audiocraft.builders import AudioGen
 
 
17
 
18
+ CACHE_DIR = 'flask_cache/'
19
+ PIECE_OF_SOUND_DURATION = 4.74 # seconds
20
+ sound_generator = AudioGen(
21
+ duration=PIECE_OF_SOUND_DURATION
22
+ ).to('cuda:0').eval()
23
 
24
  Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
25
 
 
58
  # return the resized image
59
  return resized
60
 
61
  def overlay(x, soundscape=None):
62
+ # pre-calculate n_repeat here, then apply torchaudio.resample and repeat inside sound_gen's forward()
63
  if soundscape is not None:
64
 
65
+ background = sound_generator.generate(soundscape,
66
+ n_repeat=int(len(x) / (PIECE_OF_SOUND_DURATION * 16000)) + 1
67
+ ).detach().cpu().numpy() # bs, 11400 @.74s
68
 
69
+ # blend TTS
70
 
71
+ x = .5 * x + .5 * background[:len(x)]
72
 
73
  else:
74
 
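The rewritten overlay() above now leaves tiling, shifting and peak-normalisation to AudioGen.generate() and only blends the returned background with the TTS waveform. A minimal sketch of the new call path, assuming x is the 24 kHz TTS waveform and sound_generator is the AudioGen instance built at module load (the 16 000 factor in n_repeat mirrors the committed code, i.e. AudioGen's native sample rate):

import numpy as np

PIECE_OF_SOUND_DURATION = 4.74   # seconds per generated AudioGen piece
AUDIOGEN_SR = 16_000             # AudioGen's native sample rate

def overlay_sketch(x, soundscape, sound_generator):
    # x: 1-D float array (TTS speech); soundscape: text prompt or None
    if soundscape is None:
        return x
    # enough ~4.74 s pieces to cover the speech duration
    n_repeat = int(len(x) / (PIECE_OF_SOUND_DURATION * AUDIOGEN_SR)) + 1
    background = sound_generator.generate(
        soundscape, n_repeat=n_repeat
    ).detach().cpu().numpy()     # 1-D, peak-normalised inside generate()
    return .5 * x + .5 * background[:len(x)]   # equal-weight blend
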
audiocraft/activations.py DELETED
@@ -1,96 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- import torch
8
- import torch.nn as nn
9
- from torch import Tensor
10
- from typing import Union, Callable
11
-
12
-
13
- class CustomGLU(nn.Module):
14
- """Custom Gated Linear Unit activation.
15
- Applies a modified gated linear unit :math:`a * f(b)` where :math:`a` is the first half
16
- of the input matrices, :math:`b` is the second half, and :math:`f` is a provided activation
17
- function (i.e. sigmoid, swish, etc.).
18
-
19
- Args:
20
- activation (nn.Module): The custom activation to apply in the Gated Linear Unit
21
- dim (int): the dimension on which to split the input. Default: -1
22
-
23
- Shape:
24
- - Input: :math:`(\ast_1, N, \ast_2)` where `*` means, any number of additional
25
- dimensions
26
- - Output: :math:`(\ast_1, M, \ast_2)` where :math:`M=N/2`
27
-
28
- Examples::
29
- >>> m = CustomGLU(nn.Sigmoid())
30
- >>> input = torch.randn(4, 2)
31
- >>> output = m(input)
32
- """
33
- def __init__(self, activation: nn.Module, dim: int = -1):
34
- super(CustomGLU, self).__init__()
35
- self.dim = dim
36
- self.activation = activation
37
-
38
- def forward(self, x: Tensor):
39
- assert x.shape[self.dim] % 2 == 0 # M = N / 2
40
- a, b = torch.chunk(x, 2, dim=self.dim)
41
- return a * self.activation(b)
42
-
43
-
44
- class SwiGLU(CustomGLU):
45
- """SiLU Gated Linear Unit activation.
46
- Applies SiLU Gated Linear Unit :math:`a * SiLU(b)` where :math:`a` is
47
- the first half of the input matrices, :math:`b` is the second half.
48
-
49
- Args:
50
- dim (int): the dimension on which to split the input. Default: -1
51
- """
52
- def __init__(self, dim: int = -1):
53
- super(SwiGLU, self).__init__(nn.SiLU(), dim)
54
-
55
-
56
- class GeGLU(CustomGLU):
57
- """GeLU Gated Linear Unit activation.
58
- Applies GeLU Gated Linear Unit :math:`a * GELU(b)` where :math:`a` is
59
- the first half of the input matrices, :math:`b` is the second half.
60
-
61
- Args:
62
- dim (int): the dimension on which to split the input. Default: -1
63
- """
64
- def __init__(self, dim: int = -1):
65
- super(GeGLU, self).__init__(nn.GELU(), dim)
66
-
67
-
68
- class ReGLU(CustomGLU):
69
- """ReLU Gated Linear Unit activation.
70
- Applies ReLU Gated Linear Unit :math:`a * ReLU(b)` where :math:`a` is
71
- the first half of the input matrices, :math:`b` is the second half.
72
-
73
- Args:
74
- dim (int): the dimension on which to split the input. Default: -1
75
- """
76
- def __init__(self, dim: int = -1):
77
- super(ReGLU, self).__init__(nn.ReLU(), dim)
78
-
79
-
80
- def get_activation_fn(
81
- activation: Union[str, Callable[[Tensor], Tensor]]
82
- ) -> Union[str, Callable[[Tensor], Tensor]]:
83
- """Helper function to map an activation string to the activation class.
84
- If the supplied activation is not a string that is recognized, the activation is passed back.
85
-
86
- Args:
87
- activation (str, or Callable[[Tensor], Tensor]): Activation to check
88
- """
89
- if isinstance(activation, str):
90
- if activation == "reglu":
91
- return ReGLU()
92
- elif activation == "geglu":
93
- return GeGLU()
94
- elif activation == "swiglu":
95
- return SwiGLU()
96
- return activation
 
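The deleted activations.py only provided the gated-linear-unit variants (a * f(b) on a channel split) plus a string-to-module lookup, which the simplified LM below no longer needs. For reference, a few lines restating what the removed CustomGLU/SwiGLU/GeGLU computed (not code from this commit):

import torch
import torch.nn as nn

def glu_gate(x, activation, dim=-1):
    # equivalent of the removed CustomGLU: split x in half along dim
    # and gate the first half with activation(second half)
    a, b = torch.chunk(x, 2, dim=dim)
    return a * activation(b)

y_swiglu = glu_gate(torch.randn(4, 8), nn.SiLU())   # SwiGLU behaviour
y_geglu = glu_gate(torch.randn(4, 8), nn.GELU())    # GeGLU behaviour
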
audiocraft/builders.py CHANGED
@@ -1,22 +1,25 @@
1
- import typing as tp
2
  import omegaconf
 
3
  from torch import nn
4
  import torch
 
5
  from huggingface_hub import hf_hub_download
6
  import os
7
- from omegaconf import OmegaConf, DictConfig
8
-
9
  from .encodec import EncodecModel
10
  from .lm import LMModel
11
  from .seanet import SEANetDecoder
12
- from .codebooks_patterns import DelayedPatternProvider
13
- from .conditioners import T5Conditioner
14
  from .vq import ResidualVectorQuantizer
15
 
 
 
 
 
 
 
 
16
 
17
-
18
-
19
- def _delete_param(cfg: DictConfig, full_name: str):
20
  parts = full_name.split('.')
21
  for part in parts[:-1]:
22
  if part in cfg:
@@ -35,48 +38,53 @@ def dict_from_config(cfg):
35
  return dct
36
 
37
 
38
-
39
-
40
-
41
-
42
-
43
-
44
- # ============================================== DEFINE AUDIOGEN
45
-
46
-
47
-
48
-
49
-
50
-
51
  class AudioGen(nn.Module):
52
 
53
  # https://huggingface.co/facebook/audiogen-medium
54
 
55
  def __init__(self,
56
- duration=0.024,
57
- device='cpu'):
58
 
59
  super().__init__()
60
- self.device = device # needed for loading & select float16 LM
61
  self.load_compression_model()
62
  self.load_lm_model()
63
  self.duration = duration
 
 
64
 
65
  @property
66
  def frame_rate(self):
67
  return self.compression_model.frame_rate
68
 
69
  def generate(self,
70
- descriptions):
 
 
71
  with torch.no_grad():
72
  gen_tokens = self.lm.generate(
73
- descriptions=descriptions,
74
  max_gen_len=int(self.duration * self.frame_rate)) # [bs, 4, 37 * self.lm.n_draw]
75
  x = self.compression_model.decode(gen_tokens, None) #[bs, 1, 11840]
76
- # print('______________\nAudioGen Tokens', gen_tokens)
 
 
 
77
 
 
 
 
 
 
 
 
78
 
79
- return x / x.abs().max(2, keepdims=True)[0] + 1e-7
 
 
 
 
 
80
 
81
  # == BUILD Fn
82
  def get_quantizer(self, quantizer, cfg, dimension):
@@ -126,58 +134,7 @@ class AudioGen(nn.Module):
126
  ).to(cfg.device)
127
  else:
128
  raise KeyError(f"Unexpected compression model {cfg.compression_model}")
129
-
130
-
131
- def get_lm_model(self, cfg):
132
- """Instantiate a transformer LM."""
133
- if cfg.lm_model in ['transformer_lm',
134
- 'transformer_lm_magnet']:
135
- kwargs = dict_from_config(getattr(cfg, 'transformer_lm'))
136
- n_q = kwargs['n_q']
137
- q_modeling = kwargs.pop('q_modeling', None)
138
- codebooks_pattern_cfg = getattr(cfg, 'codebooks_pattern')
139
- attribute_dropout = dict_from_config(getattr(cfg, 'attribute_dropout'))
140
- cls_free_guidance = dict_from_config(getattr(cfg, 'classifier_free_guidance'))
141
- cfg_prob, cfg_coef = cls_free_guidance['training_dropout'], cls_free_guidance['inference_coef']
142
-
143
-
144
-
145
- # if len(fuser.fuse2cond['cross']) > 0: # enforce cross-att programmatically
146
- kwargs['cross_attention'] = True
147
- if codebooks_pattern_cfg.modeling is None:
148
- print('Q MODELING\n=\n=><')
149
- assert q_modeling is not None, \
150
- "LM model should either have a codebook pattern defined or transformer_lm.q_modeling"
151
- codebooks_pattern_cfg = omegaconf.OmegaConf.create(
152
- {'modeling': q_modeling, 'delay': {'delays': list(range(n_q))}}
153
- )
154
-
155
- pattern_provider = self.get_codebooks_pattern_provider(n_q, codebooks_pattern_cfg)
156
- return LMModel(
157
- pattern_provider=pattern_provider,
158
- condition_provider=T5Conditioner(name='t5-large', output_dim=kwargs["dim"], device=self.device),
159
- cfg_dropout=cfg_prob,
160
- cfg_coef=cfg_coef,
161
- attribute_dropout=attribute_dropout,
162
- dtype=getattr(torch, cfg.dtype),
163
- device=self.device,
164
- **kwargs
165
- ).to(cfg.device)
166
- else:
167
- raise KeyError(f"Unexpected LM model {cfg.lm_model}")
168
-
169
-
170
- def get_codebooks_pattern_provider(self, n_q, cfg):
171
- pattern_providers = {
172
- 'delay': DelayedPatternProvider, # THIS
173
- }
174
- name = cfg.modeling
175
- kwargs = dict_from_config(cfg.get(name)) if hasattr(cfg, name) else {}
176
-
177
- klass = pattern_providers[name]
178
- return klass(n_q, **kwargs)
179
-
180
- # ======================
181
  def load_compression_model(self):
182
  file = hf_hub_download(
183
  repo_id='facebook/audiogen-medium',
@@ -204,24 +161,20 @@ class AudioGen(nn.Module):
204
  library_name="audiocraft",
205
  library_version= '1.3.0a1') # Found at __init__.py #audiocraft.__version__)
206
  pkg = torch.load(file,
207
- map_location=self.device) #'cpu')
208
- cfg = OmegaConf.create(pkg['xp.cfg'])
209
- # cfg.device = 'cpu'
210
- if self.device == 'cpu':
211
- cfg.dtype = 'float32'
212
- else:
213
- cfg.dtype = 'float16'
214
  _delete_param(cfg, 'conditioners.self_wav.chroma_stem.cache_path')
215
  _delete_param(cfg, 'conditioners.args.merge_text_conditions_p')
216
  _delete_param(cfg, 'conditioners.args.drop_desc_p')
217
- model = self.get_lm_model(cfg)
218
-
 
 
219
  _best = pkg['best_state']
220
  _best['condition_provider.output_proj.weight'] = _best.pop('condition_provider.conditioners.description.output_proj.weight')
221
  _best['condition_provider.output_proj.bias'] = _best.pop('condition_provider.conditioners.description.output_proj.bias')
222
  model.load_state_dict(pkg['best_state'])
223
- model.cfg = cfg
224
- # return model
225
  self.lm = model.to(torch.float)
226
 
227
  # def _flush(self):
 
 
1
  import omegaconf
2
+ import torchaudio
3
  from torch import nn
4
  import torch
5
+ import numpy as np
6
  from huggingface_hub import hf_hub_download
7
  import os
8
+ from omegaconf import OmegaConf
 
9
  from .encodec import EncodecModel
10
  from .lm import LMModel
11
  from .seanet import SEANetDecoder
 
 
12
  from .vq import ResidualVectorQuantizer
13
 
14
+ def _shift(x):
15
+ # [bs, samples]: circularly shift each batch element of the sound
16
+ n = x.shape[1]
17
+ for i, batch_elem in enumerate(x):
18
+ offset = np.random.randint(.24 * n, max(1, .74 * n)) # high should be above >= 0 TBD
19
+ x[i, :] = torch.roll(batch_elem, offset, dims=0) # batch_elem = [400000, ]
20
+ return x
21
 
22
+ def _delete_param(cfg, full_name):
 
 
23
  parts = full_name.split('.')
24
  for part in parts[:-1]:
25
  if part in cfg:
 
38
  return dct
39
 
40
 
41
  class AudioGen(nn.Module):
42
 
43
  # https://huggingface.co/facebook/audiogen-medium
44
 
45
  def __init__(self,
46
+ duration=2.24, # s
47
+ ):
48
 
49
  super().__init__()
 
50
  self.load_compression_model()
51
  self.load_lm_model()
52
  self.duration = duration
53
+ # AudioGen = 16 kHz / StyleTTS2 = 24 kHz / MMS-TTS = 24 kHz
54
+ self.resample_fn = torchaudio.transforms.Resample(16000, 24000)
55
 
56
  @property
57
  def frame_rate(self):
58
  return self.compression_model.frame_rate
59
 
60
  def generate(self,
61
+ descriptions,
62
+ n_repeat=3):
63
+
64
  with torch.no_grad():
65
  gen_tokens = self.lm.generate(
66
+ descriptions=[descriptions]*3,
67
  max_gen_len=int(self.duration * self.frame_rate)) # [bs, 4, 37 * self.lm.n_draw]
68
  x = self.compression_model.decode(gen_tokens, None) #[bs, 1, 11840]
69
+
70
+ x = x[:, 0, :-250] # the last samples have splash sounds; discard the final 250 samples
71
+
72
+ # AudioGen 16 kHz / StyleTTS2 24 kHz / MMS-TTS 24 kHz
73
 
74
+ # x = self.resample_fn(x)
75
+
76
+ # batch size = different sounds for same txt
77
+
78
+ x = x.repeat(1, n_repeat)
79
+
80
+ # less periodic - shift every batch elem
81
 
82
+ for _ in range(7):
83
+ x = _shift(x)
84
+
85
+ x = x.reshape(-1)
86
+ print(x.abs().max(), 'MAX')
87
+ return x / (x.abs().max() + 1e-7)
88
 
89
  # == BUILD Fn
90
  def get_quantizer(self, quantizer, cfg, dimension):
 
134
  ).to(cfg.device)
135
  else:
136
  raise KeyError(f"Unexpected compression model {cfg.compression_model}")
137
+
138
  def load_compression_model(self):
139
  file = hf_hub_download(
140
  repo_id='facebook/audiogen-medium',
 
161
  library_name="audiocraft",
162
  library_version= '1.3.0a1') # Found at __init__.py #audiocraft.__version__)
163
  pkg = torch.load(file,
164
+ map_location='cpu')
165
+ cfg = OmegaConf.create(pkg['xp.cfg']) # CFG inside torch bin
 
 
 
 
 
166
  _delete_param(cfg, 'conditioners.self_wav.chroma_stem.cache_path')
167
  _delete_param(cfg, 'conditioners.args.merge_text_conditions_p')
168
  _delete_param(cfg, 'conditioners.args.drop_desc_p')
169
+ print('___________________________CFG___________________',cfg,'\n=======================')
170
+ kwargs = dict_from_config(getattr(cfg, 'transformer_lm'))
171
+ print('___________________________Kwarg___________________',kwargs,'\n=======================')
172
+ model = LMModel().to(getattr(torch, cfg.dtype)) #.to(cfg.device)
173
  _best = pkg['best_state']
174
  _best['condition_provider.output_proj.weight'] = _best.pop('condition_provider.conditioners.description.output_proj.weight')
175
  _best['condition_provider.output_proj.bias'] = _best.pop('condition_provider.conditioners.description.output_proj.bias')
176
  model.load_state_dict(pkg['best_state'])
177
+ # model.cfg = cfg
 
178
  self.lm = model.to(torch.float)
179
 
180
  # def _flush(self):
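The circular shift that api.py previously applied to a flat numpy array now operates on the [bs, samples] tensor inside builders.py, and generate() handles repeat, shift and peak-normalisation itself. A self-contained sketch of that repeat-then-shift idea, assuming offsets drawn from 24–74 % of the length as in _shift() above:

import torch
import numpy as np

def shift_batch(x):
    # x: [bs, samples]; roll each batch element by a random offset
    n = x.shape[1]
    for i in range(x.shape[0]):
        offset = np.random.randint(int(.24 * n), max(1, int(.74 * n)))
        x[i] = torch.roll(x[i], offset, dims=0)
    return x

sounds = torch.randn(3, 16000)      # 3 different pieces generated for the same prompt
tiled = sounds.repeat(1, 4)         # tile along time to cover a longer clip
for _ in range(7):                  # repeated shifts make the loop less periodic
    tiled = shift_batch(tiled)
background = tiled.reshape(-1)      # interleave the batch into one long track
background = background / (background.abs().max() + 1e-7)   # peak-normalise
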
audiocraft/conditioners.py CHANGED
@@ -25,7 +25,7 @@ class T5Conditioner(nn.Module):
25
  def __init__(self,
26
  name,
27
  output_dim,
28
- device,
29
  finetune=False):
30
  print(f'{finetune=}')
31
  assert name in self.MODELS, f"Unrecognized t5 model name (should in {self.MODELS})"
@@ -36,7 +36,7 @@ class T5Conditioner(nn.Module):
36
  self.device = device
37
  self.name = name
38
 
39
- self.t5_tokenizer = T5Tokenizer.from_pretrained(name)
40
  t5 = T5EncoderModel.from_pretrained(name).eval() #.train(mode=finetune)
41
  if finetune:
42
  self.t5 = t5
@@ -65,7 +65,7 @@ class T5Conditioner(nn.Module):
65
  embeds = self.t5(input_ids=d['input_ids'],
66
  attention_mask=d['attention_mask']
67
  ).last_hidden_state # no kvcache for txt conditioning
68
- embeds = self.output_proj(embeds.to(self.output_proj.weight))
69
  embeds = (embeds * d['attention_mask'].unsqueeze(-1))
70
 
71
  return embeds # , d['attention_mask']
 
25
  def __init__(self,
26
  name,
27
  output_dim,
28
+ device='cuda:0',
29
  finetune=False):
30
  print(f'{finetune=}')
31
  assert name in self.MODELS, f"Unrecognized t5 model name (should in {self.MODELS})"
 
36
  self.device = device
37
  self.name = name
38
 
39
+ self.t5_tokenizer = T5Tokenizer.from_pretrained(name, legacy=True)
40
  t5 = T5EncoderModel.from_pretrained(name).eval() #.train(mode=finetune)
41
  if finetune:
42
  self.t5 = t5
 
65
  embeds = self.t5(input_ids=d['input_ids'],
66
  attention_mask=d['attention_mask']
67
  ).last_hidden_state # no kvcache for txt conditioning
68
+ embeds = self.output_proj(embeds.to(self.output_proj.weight))
69
  embeds = (embeds * d['attention_mask'].unsqueeze(-1))
70
 
71
  return embeds # , d['attention_mask']
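Only the default device, the legacy=True tokenizer flag and the dtype cast before the output projection change here; the conditioning path itself stays the same: tokenise, encode with a frozen T5, project to the transformer width, zero out padding positions. A hedged sketch of that path (the 1536 output width is taken from the dims hard-coded in lm.py below):

import torch
from transformers import T5Tokenizer, T5EncoderModel

tok = T5Tokenizer.from_pretrained('t5-large', legacy=True)
t5 = T5EncoderModel.from_pretrained('t5-large').eval()
proj = torch.nn.Linear(t5.config.d_model, 1536)   # 1024 -> LM dim

with torch.no_grad():
    d = tok(['dogs barking in the rain'], return_tensors='pt', padding=True)
    h = t5(input_ids=d['input_ids'],
           attention_mask=d['attention_mask']).last_hidden_state
    emb = proj(h.to(proj.weight)) * d['attention_mask'].unsqueeze(-1)  # [1, T, 1536]
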
audiocraft/lm.py CHANGED
@@ -1,237 +1,45 @@
1
- from dataclasses import dataclass
2
- import logging
3
- import math
4
- import typing as tp
5
  import torch
6
  import torch.nn.functional as F
7
  from audiocraft.transformer import StreamingTransformer
8
- from dataclasses import dataclass
9
- from functools import partial
10
  from torch import nn
11
- from audiocraft.activations import get_activation_fn
 
12
  import numpy as np
13
 
14
- def _shift(x):
15
- # cyclic shift of [1, 4, seq_len] slices from [bs, 4, seq_len]
16
- print(x.shape, 'SHIFT\n= = = = = ')
17
- for i, _slice in enumerate(x):
18
- n = x.shape[2]
19
- offset = np.random.randint(.24 * n, max(1, .74 * n)) # high should be above >= 0 TBD
20
- print(offset)
21
- x[i, :, :] = torch.roll(_slice, offset, dims=1)
22
- return x
23
-
24
-
25
-
26
- def get_init_fn(method: str, input_dim: int, init_depth: tp.Optional[int] = None):
27
- """LM layer initialization.
28
- Inspired from xlformers: https://github.com/fairinternal/xlformers
29
-
30
- Args:
31
- method (str): Method name for init function. Valid options are:
32
- 'gaussian', 'uniform'.
33
- input_dim (int): Input dimension of the initialized module.
34
- init_depth (int, optional): Optional init depth value used to rescale
35
- the standard deviation if defined.
36
- """
37
- # Compute std
38
- std = 1 / math.sqrt(input_dim)
39
- # Rescale with depth
40
- if init_depth is not None:
41
- std = std / math.sqrt(2 * init_depth)
42
-
43
- if method == 'gaussian':
44
- return partial(
45
- torch.nn.init.trunc_normal_, mean=0.0, std=std, a=-3 * std, b=3 * std
46
- )
47
- elif method == 'uniform':
48
- bound = math.sqrt(3) * std # ensure the standard deviation is `std`
49
- return partial(torch.nn.init.uniform_, a=-bound, b=bound)
50
- else:
51
- raise ValueError("Unsupported layer initialization method")
52
-
53
-
54
- def init_layer(m: nn.Module,
55
- method: str,
56
- init_depth: tp.Optional[int] = None,
57
- zero_bias_init: bool = False):
58
- """Wrapper around ``get_init_fn`` for proper initialization of LM modules.
59
-
60
- Args:
61
- m (nn.Module): Module to initialize.
62
- method (str): Method name for the init function.
63
- init_depth (int, optional): Optional init depth value used to rescale
64
- the standard deviation if defined.
65
- zero_bias_init (bool): Whether to initialize the bias to 0 or not.
66
- """
67
- if isinstance(m, nn.Linear):
68
- init_fn = get_init_fn(method, m.in_features, init_depth=init_depth)
69
- if m.weight.device.type == 'cpu' and m.weight.dtype == torch.float16:
70
- weight = m.weight.float()
71
- init_fn(weight)
72
- m.weight.data[:] = weight.half()
73
- else:
74
- init_fn(m.weight)
75
- if zero_bias_init and m.bias is not None:
76
- nn.init.constant_(m.bias, 0)
77
- elif isinstance(m, nn.Embedding):
78
- init_fn = get_init_fn(method, m.embedding_dim, init_depth=None)
79
- if m.weight.device.type == 'cpu' and m.weight.dtype == torch.float16:
80
- weight = m.weight.float()
81
- init_fn(weight)
82
- m.weight.data[:] = weight.half()
83
- else:
84
- init_fn(m.weight)
85
-
86
-
87
- class ScaledEmbedding(nn.Embedding):
88
- """Boost learning rate for embeddings (with `scale`).
89
- """
90
- def __init__(self, *args, lr=None, **kwargs):
91
- super().__init__(*args, **kwargs)
92
- self.lr = lr
93
-
94
- def make_optim_group(self):
95
- group = {"params": list(self.parameters())}
96
- if self.lr is not None:
97
- group["lr"] = self.lr
98
- return group
99
-
100
-
101
- @dataclass
102
- class LMOutput:
103
- # The logits are already re-aligned with the input codes
104
- # hence no extra shift is required, e.g. when computing CE
105
- logits: torch.Tensor # [B, K, T, card]
106
- mask: torch.Tensor # [B, K, T]
107
-
108
 
109
  class LMModel(nn.Module):
110
- """Transformer-based language model on multiple streams of codes.
111
-
112
- Args:
113
- pattern_provider (CodebooksPatternProvider): Pattern provider for codebook interleaving.
114
- condition_provider (MusicConditioningProvider): Conditioning provider from metadata.
115
- fuser (ConditionFuser): Fuser handling the fusing of conditions with language model input.
116
- n_q (int): Number of parallel streams to model.
117
- card (int): Cardinality, vocabulary size.
118
- dim (int): Dimension of the transformer encoder.
119
- num_heads (int): Number of heads for the transformer encoder.
120
- hidden_scale (int): Scale for hidden feed forward dimension of the transformer encoder.
121
- norm (str): Normalization method.
122
- norm_first (bool): Use pre-norm instead of post-norm.
123
- emb_lr (float, optional): Embedding-specific learning rate.
124
- bias_proj (bool): Use bias for output projections.
125
- weight_init (str, optional): Method for weight initialization.
126
- depthwise_init (str, optional): Method for depthwise weight initialization.
127
- zero_bias_init (bool): If true and bias in Linears, initialize bias to zeros.
128
- cfg_dropout (float): Classifier-free guidance dropout.
129
- cfg_coef (float): Classifier-free guidance coefficient.
130
- attribute_dropout (dict): Attribute dropout probabilities.
131
- two_step_cfg (bool): Whether to run classifier free-guidance with 2 distinct steps.
132
- **kwargs: Additional parameters for the transformer encoder.
133
- """
134
  def __init__(self,
135
- pattern_provider,
136
- condition_provider,
137
- n_q: int = 8,
138
- card: int = 1024,
139
- dim: int = 128,
140
- num_heads: int = 8,
141
- hidden_scale: int = 4,
142
- norm: str = 'layer_norm',
143
- norm_first: bool = False,
144
- emb_lr: tp.Optional[float] = None,
145
- bias_proj: bool = True,
146
- weight_init: tp.Optional[str] = None,
147
- depthwise_init: tp.Optional[str] = None,
148
- zero_bias_init: bool = False, cfg_dropout: float = 0,
149
- cfg_coef: float = 1.0,
150
- two_step_cfg: bool = False,
151
- **kwargs):
152
  super().__init__()
153
- self.cfg_coef = cfg_coef
154
- self.condition_provider = condition_provider
155
  self.card = card # 2048 ?
156
  self.n_draw = 1 # replicate so many times the generation of each text in batch
 
 
157
  embed_dim = self.card + 1
158
  self.n_q = n_q
159
  self.dim = dim
160
- self.pattern_provider = pattern_provider
161
- self.two_step_cfg = two_step_cfg
162
- self.emb = nn.ModuleList([ScaledEmbedding(embed_dim, dim, lr=emb_lr) for _ in range(n_q)])
163
- if 'activation' in kwargs:
164
- kwargs['activation'] = get_activation_fn(kwargs['activation'])
165
- # ========================================================================
166
- # {
167
- # 'dtype': torch.float16, 'device': 'cuda',
168
- # 'num_layers': 48, 'dropout': 0.0, 'activation': 'gelu',
169
- # 'bias_ff': False, 'bias_attn': False,
170
- # 'past_context': None, 'causal': True,
171
- # 'custom': False, 'memory_efficient': True,
172
- # 'attention_as_float32': False, 'positional_embedding': 'sin', 'xpos': False,
173
- # 'checkpointing': 'none', 'cross_attention': True, 'qk_layer_norm': False,
174
- # 'qk_layer_norm_cross': False, 'attention_dropout': None, 'kv_repeat': 1
175
- # }
176
- # ==========================================================================
177
- kwargs.pop('layer_scale') # nn.Indentity()
178
-
179
  self.transformer = StreamingTransformer(
180
  d_model=dim,
181
  num_heads=num_heads,
182
  dim_feedforward=int(hidden_scale * dim),
183
- norm=norm,
184
- norm_first=norm_first, **kwargs)
185
- self.out_norm: tp.Optional[nn.Module] = None
186
- if norm_first:
187
- self.out_norm = nn.LayerNorm(dim, eps=1e-5)
188
- self.linears = nn.ModuleList([nn.Linear(dim, self.card, bias=bias_proj) for _ in range(n_q)])
189
- self._init_weights(weight_init, depthwise_init, zero_bias_init)
190
- self._fsdp: tp.Optional[nn.Module]
191
- self.__dict__['_fsdp'] = None
192
-
193
- def _init_weights(self, weight_init: tp.Optional[str], depthwise_init: tp.Optional[str], zero_bias_init: bool):
194
- """Initialization of the transformer module weights.
195
-
196
- Args:
197
- weight_init (str, optional): Weight initialization strategy. See ``get_init_fn`` for valid options.
198
- depthwise_init (str, optional): Depthwise initialization strategy. The following options are valid:
199
- 'current' where the depth corresponds to the current layer index or 'global' where the total number
200
- of layer is used as depth. If not set, no depthwise initialization strategy is used.
201
- zero_bias_init (bool): Whether to initialize bias to zero or not.
202
- """
203
- assert depthwise_init is None or depthwise_init in ['current', 'global']
204
- assert depthwise_init is None or weight_init is not None, \
205
- "If 'depthwise_init' is defined, a 'weight_init' method should be provided."
206
- assert not zero_bias_init or weight_init is not None, \
207
- "If 'zero_bias_init', a 'weight_init' method should be provided"
208
-
209
- if weight_init is None:
210
- return
211
-
212
- for emb_layer in self.emb:
213
- init_layer(emb_layer, method=weight_init, init_depth=None, zero_bias_init=zero_bias_init)
214
-
215
- for layer_idx, tr_layer in enumerate(self.transformer.layers):
216
- depth = None
217
- if depthwise_init == 'current':
218
- depth = layer_idx + 1
219
- elif depthwise_init == 'global':
220
- depth = len(self.transformer.layers)
221
- init_fn = partial(init_layer,
222
- method=weight_init,
223
- init_depth=depth,
224
- zero_bias_init=zero_bias_init)
225
- tr_layer.apply(init_fn)
226
-
227
- for linear in self.linears:
228
- init_layer(linear, method=weight_init, init_depth=None, zero_bias_init=zero_bias_init)
229
 
230
- @property
231
- def special_token_id(self) -> int:
232
- return self.card
233
-
234
-
235
 
236
  def forward(self,
237
  sequence,
@@ -293,7 +101,7 @@ class LMModel(nn.Module):
293
  max_gen_len), -1, dtype=torch.long,
294
  device=text_condition.device)
295
 
296
- gen_sequence, _, mask = pattern.build_pattern_sequence(gen_codes, self.special_token_id)
297
  _, _, audiodur = gen_sequence.shape # bs, 4, 7=audiodur
298
 
299
  # print(gen_sequence.shape, mask.shape, 'F') # mask has no batch = [4,audio_duration]
@@ -313,7 +121,7 @@ class LMModel(nn.Module):
313
  for offset in range(1, audiodur):
314
 
315
  # forward duplicates the query to nullcond - then cfg & returns deduplicate token
316
- next_token = self.forward(gen_sequence[:, 0, :, offset-1:offset],
317
  condition_tensors=text_condition, # utilisation of the attention mask of txt condition ?
318
  token_count=offset-1) # [bs, 4, 1, 2048]
319
 
@@ -322,7 +130,7 @@ class LMModel(nn.Module):
322
 
323
  # MASK is not full 1---- HAS 4 x audioduration PATTERN
324
  m = mask[:, :, :, offset]
325
- next_token[~m] = self.special_token_id
326
  gen_sequence[:, :, :, offset] = torch.where(
327
  gen_sequence[:, :, :, offset] == -1, #unknown_token,
328
  next_token,
@@ -333,7 +141,7 @@ class LMModel(nn.Module):
333
  # 1. reshape n_draw as bs * n_draw
334
  # 2. invert all short-sequences
335
  # 3. reshape bs * n_draw -> bs, n_draw * audiodur ELONGATION
336
- out_codes, _, _ = pattern.revert_pattern_sequence(
337
  gen_sequence.reshape(bs * self.n_draw, 4, audiodur), # [3,8,4,7]
338
  special_token=-1)
339
  # print(f'{gen_sequence.shape=} {out_codes.shape=} Ha') # REVERT PATTERN REDUCES DURATION?
@@ -341,12 +149,10 @@ class LMModel(nn.Module):
341
  out_codes = out_codes.reshape(bs, self.n_draw, 4, new_len)
342
  out_codes = out_codes.transpose(1, 2).reshape(bs, 4, self.n_draw * new_len)
343
  print(out_codes.shape, 'o')
344
- for _ in range(7):
345
- out_codes = _shift(out_codes)
346
 
347
- # Clear Transformer k/v history (Different history is kept by 48x selfattn)
348
  for lay in self.transformer.layers:
349
  lay.self_attn.k_history = None
350
  lay.self_attn.v_history = None
351
 
352
- return out_codes
 
 
 
 
 
1
  import torch
2
  import torch.nn.functional as F
3
  from audiocraft.transformer import StreamingTransformer
 
 
4
  from torch import nn
5
+ from audiocraft.codebooks_patterns import DelayedPatternProvider
6
+ from audiocraft.conditioners import T5Conditioner
7
  import numpy as np
8
 
9
 
10
  class LMModel(nn.Module):
11
+
12
  def __init__(self,
13
+ n_q = 4,
14
+ card = 2048,
15
+ dim = 1536,
16
+ num_heads = 24,
17
+ hidden_scale = 4, # FFN of Transformer
18
+ ):
19
  super().__init__()
20
+ self.condition_provider = T5Conditioner(name='t5-large',
21
+ output_dim=dim)
22
  self.card = card # 2048 ?
23
  self.n_draw = 1 # replicate so many times the generation of each text in batch
24
+ # the batch is more expensive than n_draw as it re-runs the model bs times
25
+ # n_draw just draws more phonemes from the multinomial - after running the lm
26
  embed_dim = self.card + 1
27
  self.n_q = n_q
28
  self.dim = dim
29
+ self.pattern_provider = DelayedPatternProvider()
30
+ self.emb = nn.ModuleList([nn.Embedding(embed_dim, dim) for _ in range(n_q)]) # EMBEDDING HAS 2049
31
  self.transformer = StreamingTransformer(
32
  d_model=dim,
33
  num_heads=num_heads,
34
  dim_feedforward=int(hidden_scale * dim),
35
+ num_layers=48,
36
+ positional_embedding='sin',
37
+ )
38
+ self.out_norm = nn.LayerNorm(dim, eps=1e-5)
39
+ self.linears = nn.ModuleList([nn.Linear(dim, self.card, bias=False) for _ in range(n_q)]) # LINEAR DOESNT HAVE 2049
40
+ # self._init_weights(weight_init, depthwise_init, zero_bias_init)
41
+ # self.__dict__['_fsdp'] = None
42
 
 
 
 
 
 
43
 
44
  def forward(self,
45
  sequence,
 
101
  max_gen_len), -1, dtype=torch.long,
102
  device=text_condition.device)
103
 
104
+ gen_sequence, _, mask = pattern.build_pattern_sequence(gen_codes, self.card)
105
  _, _, audiodur = gen_sequence.shape # bs, 4, 7=audiodur
106
 
107
  # print(gen_sequence.shape, mask.shape, 'F') # mask has no batch = [4,audio_duration]
 
121
  for offset in range(1, audiodur):
122
 
123
  # forward duplicates the query to nullcond - then cfg & returns deduplicate token
124
+ next_token = self.forward(gen_sequence[:, 0, :, offset-1:offset], # DIAGINDEXING for setting prediction of lm into gen_sequence THE GENSEQUENCE has to be un-delayed in the end [Because it has to be de-delayed for the vocoder then is actually only the lm input that requires to see the delay thus we could just feed by diaggather] so it matches gen_codes -1 a[[0, 1, 2, 3], torch.tensor([0, 1, 2, 3]) + 5] the gen_sequence is indexed by vertical column and fed to lm however the prediction of lm is place diagonally with delay to the gen_sequence
125
  condition_tensors=text_condition, # utilisation of the attention mask of txt condition ?
126
  token_count=offset-1) # [bs, 4, 1, 2048]
127
 
 
130
 
131
  # MASK is not full 1---- HAS 4 x audioduration PATTERN
132
  m = mask[:, :, :, offset]
133
+ next_token[~m] = self.card
134
  gen_sequence[:, :, :, offset] = torch.where(
135
  gen_sequence[:, :, :, offset] == -1, #unknown_token,
136
  next_token,
 
141
  # 1. reshape n_draw as bs * n_draw
142
  # 2. invert all short-sequences
143
  # 3. reshape bs * n_draw -> bs, n_draw * audiodur ELONGATION
144
+ out_codes = pattern.revert_pattern_sequence(
145
  gen_sequence.reshape(bs * self.n_draw, 4, audiodur), # [3,8,4,7]
146
  special_token=-1)
147
  # print(f'{gen_sequence.shape=} {out_codes.shape=} Ha') # REVERT PATTERN REDUCES DURATION?
 
149
  out_codes = out_codes.reshape(bs, self.n_draw, 4, new_len)
150
  out_codes = out_codes.transpose(1, 2).reshape(bs, 4, self.n_draw * new_len)
151
  print(out_codes.shape, 'o')
 
 
152
 
153
+ # Clear k/v cache (Different kv is saved by every 48x selfattn)
154
  for lay in self.transformer.layers:
155
  lay.self_attn.k_history = None
156
  lay.self_attn.v_history = None
157
 
158
+ return out_codes # bs*n_draw, duration -> repeat/shift in api.py
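The trimmed LMModel keeps DelayedPatternProvider with its default delays [0, 1, 2, 3]: codebook k is generated k steps later and the gaps are filled with the special token (here self.card). A toy illustration of that interleaving, independent of the real provider (whose exact padding may differ at the sequence edges):

import torch

def delay_pattern(codes, special):
    # codes: [K, T] codebook indices -> [K, T + K - 1] delayed layout
    K, T = codes.shape
    out = torch.full((K, T + K - 1), special, dtype=codes.dtype)
    for k in range(K):
        out[k, k:k + T] = codes[k]   # codebook k starts k steps later
    return out

codes = torch.arange(8).reshape(4, 2)            # 4 codebooks, 2 timesteps
print(delay_pattern(codes, special=2048))
# tensor([[   0,    1, 2048, 2048, 2048],
#         [2048,    2,    3, 2048, 2048],
#         [2048, 2048,    4,    5, 2048],
#         [2048, 2048, 2048,    6,    7]])
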
audiocraft/transformer.py CHANGED
@@ -1,26 +1,12 @@
1
- import typing as tp
2
- from einops import rearrange
3
  import torch
4
  import torch.nn as nn
5
  from torch.nn import functional as F
6
- from torch.utils.checkpoint import checkpoint as torch_checkpoint
7
-
8
-
9
- _efficient_attention_backend: str = 'torch'
10
-
11
-
12
-
13
-
14
- def _get_attention_time_dimension(memory_efficient: bool) -> int:
15
- if _efficient_attention_backend == 'torch' and memory_efficient:
16
- return 2
17
- else:
18
- return 1
19
-
20
-
21
 
22
- def create_sin_embedding(positions: torch.Tensor, dim: int, max_period: float = 10000,
23
- dtype: torch.dtype = torch.float32) -> torch.Tensor:
 
 
24
  """Create sinusoidal positional embedding, with shape `[B, T, C]`.
25
 
26
  Args:
@@ -41,256 +27,102 @@ def create_sin_embedding(positions: torch.Tensor, dim: int, max_period: float =
41
  return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
42
 
43
 
44
- def expand_repeated_kv(x: torch.Tensor, n_rep: int, memory_efficient: bool) -> torch.Tensor:
45
- """torch.repeat_interleave(x, dim=2, repeats=n_rep) from xlformers."""
46
- if n_rep == 1:
47
- return x
48
- if _efficient_attention_backend == 'torch' and memory_efficient:
49
- bs, n_kv_heads, slen, head_dim = x.shape
50
- return (
51
- x[:, :, None, :, :]
52
- .expand(bs, n_kv_heads, n_rep, slen, head_dim)
53
- .reshape(bs, n_kv_heads * n_rep, slen, head_dim)
54
- )
55
- else:
56
- bs, slen, n_kv_heads, head_dim = x.shape
57
- return (
58
- x[:, :, :, None, :]
59
- .expand(bs, slen, n_kv_heads, n_rep, head_dim)
60
- .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
61
- )
62
-
63
-
64
-
65
-
66
-
67
  class StreamingMultiheadAttention(nn.Module):
68
 
69
  def __init__(self,
70
  embed_dim,
71
- num_heads, dropout: float = 0.0, bias: bool = True,
72
- causal: bool = False, past_context: tp.Optional[int] = None, custom: bool = False,
73
- memory_efficient: bool = False, attention_as_float32: bool = False,
74
- cross_attention: bool = False,
75
- kv_repeat: int = 1,
76
- device=None, dtype=None):
77
  super().__init__()
78
- factory_kwargs = {'device': device, 'dtype': dtype}
79
- if past_context is not None:
80
- assert causal
81
-
82
  self.embed_dim = embed_dim
83
-
84
  self.k_history = None # previous k from the previous tokens seen in the current generation - only for selt.attn
85
- self.v_history = None # clean up IN LM after finishing GENERATION - Each 1...47 mha has different kv history
86
-
87
- self.memory_efficient = memory_efficient
88
-
89
-
90
- self.cross_attention = cross_attention
91
-
92
  self.num_heads = num_heads
93
- self.dropout = dropout
94
- self.kv_repeat = kv_repeat
95
-
96
-
97
-
98
-
99
- self.custom = True #_is_custom(custom, memory_efficient)
100
- if not self.custom:
101
- print(f'{self.custom}')
102
- if self.custom:
103
- out_dim = embed_dim
104
- assert num_heads % kv_repeat == 0
105
- assert not cross_attention or kv_repeat == 1
106
- num_kv = num_heads // kv_repeat
107
- kv_dim = (embed_dim // num_heads) * num_kv
108
- out_dim += 2 * kv_dim
109
- in_proj = nn.Linear(embed_dim, out_dim, bias=bias, **factory_kwargs)
110
- # We try to follow the default PyTorch MHA convention, to easily compare results.
111
- self.in_proj_weight = in_proj.weight
112
- self.in_proj_bias = in_proj.bias
113
- if bias:
114
- self.in_proj_bias.data.zero_() # Following Pytorch convention
115
- self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias, **factory_kwargs)
116
- if bias:
117
- self.out_proj.bias.data.zero_()
118
- else:
119
- assert kv_repeat == 1
120
- self.mha = nn.MultiheadAttention(
121
- embed_dim, num_heads, dropout=dropout, bias=bias, batch_first=True,
122
- **factory_kwargs)
123
-
124
-
125
- def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
126
- if not self.custom:
127
- # Support compat with regular MHA
128
- keys = [n for n, _ in self.mha.named_parameters()]
129
- for key in keys:
130
- if prefix + key in state_dict:
131
- state_dict[prefix + "mha." + key] = state_dict.pop(prefix + key)
132
- super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
133
-
134
-
135
 
136
-
137
-
138
-
139
-
140
- def forward(self,
141
- query,
142
- key=None, # ignores those 2 args if not self.cross_attn
143
  value=None):
144
-
145
-
146
- # time_dim = _get_attention_time_dimension(self.memory_efficient)
147
- # if time_dim == 2:
148
  layout = "b h t d"
149
- # else:
150
- # layout = "b t h d"
151
- # dtype = query.dtype
152
-
153
-
154
-
155
-
156
-
157
 
158
- if self.custom:
 
 
159
 
160
- if self.cross_attention:
161
- # Different queries, keys, values, we have to spit manually the weights
162
- # before applying the linear.
163
- dim = self.in_proj_weight.shape[0] // 3
164
- if self.in_proj_bias is None:
165
- bias_q, bias_k, bias_v = None, None, None
166
- else:
167
- bias_q = self.in_proj_bias[:dim]
168
- bias_k = self.in_proj_bias[dim: 2 * dim]
169
- bias_v = self.in_proj_bias[2 * dim:]
170
- q = nn.functional.linear(query, self.in_proj_weight[:dim], bias_q)
171
- # todo: when streaming, we could actually save k, v and check the shape actually match.
172
- k = nn.functional.linear(key, self.in_proj_weight[dim: 2 * dim], bias_k)
173
- v = nn.functional.linear(value, self.in_proj_weight[2 * dim:], bias_v)
174
-
175
- q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
176
- # print(q.shape, k.shape, v.shape, q.sum(), k.sum(), v.sum(),'CROSS A5')
177
- else:
178
- # 1st projected makes k,v (instantaneous)
179
- # 2nd cat
180
 
181
-
182
- # HISTORY - DIFFERENT FOR EACH TRANSF LAYER
183
-
184
- projected = nn.functional.linear(query, self.in_proj_weight, self.in_proj_bias)
185
- if self.kv_repeat == 1:
186
- # if time_dim == 2:
187
- bound_layout = "b h p t d"
188
- # else:
189
- # bound_layout = "b t p h d"
190
- packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
191
- q, k, v = packed.unbind(dim=2)
192
-
193
-
194
- if self.k_history is not None:
195
- #
196
- # pk.shape=torch.Size([2, 24, 3, 64]) k.shape=torch.Size([2, 24, 1, 64]) CONCAT
197
- # has to be 4D with batch 1 due to single condition 3=seqlen
198
- # 24 heads 64 dimofh
199
- self.k_history = torch.cat([self.k_history, k], 2)
200
- self.v_history = torch.cat([self.v_history, v], 2)
201
 
202
- else:
203
- # init on 1st token (for all 47 transf layers)
204
- print(f'AudioGen kv cache Flush')
205
- self.k_history = k
206
- self.v_history = v
207
-
208
- k = self.k_history
209
- v = self.v_history
210
 
211
 
212
-
213
- # KV COMPLETION ONLY ON SELF ATTENTION
214
- # print('KV5', self.k_history.sum(), self.v_history.sum(), self.k_history.shape, self.v_history.shape)
215
-
216
 
217
- if self.memory_efficient:
218
- # print('EVER IN MEMORY EFFICIENT A')
219
-
220
 
221
- p = self.dropout if self.training else 0
222
- if _efficient_attention_backend == 'torch':
223
- x = torch.nn.functional.scaled_dot_product_attention(
224
- q, k, v, is_causal=False, dropout_p=p
225
- )
226
-
227
- x = x.to(q.dtype)
228
- x = rearrange(x, f"{layout} -> b t (h d)", h=self.num_heads)
229
- x = self.out_proj(x)
230
  return x
231
 
232
 
233
- class StreamingTransformerLayer(nn.Module): #nn.TransformerEncoderLayer):
234
- # INHERITS MHA !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
235
 
236
  def __init__(self,
237
- d_model: int,
238
- num_heads: int,
239
- dim_feedforward: int = 2048,
240
- dropout: float = 0.1,
241
- bias_ff: bool = True,
242
- bias_attn: bool = True,
243
- custom: bool = False,
244
- memory_efficient: bool = False,
245
- attention_as_float32: bool = False,
246
- cross_attention: bool = False,
247
- attention_dropout: tp.Optional[float] = None,
248
- kv_repeat: int = 1,
249
- norm: str = 'layer_norm',
250
- device=None,
251
- dtype=None,
252
- **kwargs):
253
-
254
 
255
- super().__init__() #d_model, num_heads, dim_feedforward, dropout,
256
- #device=device, dtype=dtype, batch_first=True, **kwargs)
257
- # print(kwargs['activation'], 'ACTIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII\n\n\n\n')
258
- # -- EN Layer
259
- # DOES NOT INHERIT NO VARIABLE FROM nn.TransformerEncoderLayer only the _sa_block function
260
 
261
- # -- EN layer
262
 
263
- factory_kwargs = {'device': device, 'dtype': dtype}
264
- # Redefine self_attn to our streaming multi-head attention
265
- attn_kwargs: tp.Dict[str, tp.Any] = {
266
- 'embed_dim': d_model,
267
- 'num_heads': num_heads,
268
- 'dropout': dropout if attention_dropout is None else attention_dropout,
269
- 'bias': bias_attn,
270
- 'custom': custom,
271
- 'memory_efficient': memory_efficient,
272
- 'attention_as_float32': attention_as_float32,
273
- }
274
- self.self_attn = StreamingMultiheadAttention(
275
- kv_repeat=kv_repeat,
276
- **attn_kwargs,
277
- **factory_kwargs) # type: ignore
278
- # Redefine feedforward layers to expose bias parameter
279
- self.linear1 = nn.Linear(d_model, dim_feedforward, bias=bias_ff, **factory_kwargs)
280
- self.linear2 = nn.Linear(dim_feedforward, d_model, bias=bias_ff, **factory_kwargs)
281
- # print('LAYER scale', layer_scale, '\n\n\n\n\n\n\n\n\n') # always
282
-
283
-
284
- self.cross_attention= None
285
- if cross_attention:
286
- self.cross_attention = StreamingMultiheadAttention(
287
- cross_attention=True,
288
- **attn_kwargs,
289
- **factory_kwargs)
290
-
291
- self.dropout_cross = nn.Dropout(dropout)
292
-
293
- self.norm_cross = nn.LayerNorm(d_model, eps=1e-5, **factory_kwargs)
294
  self.norm1 = nn.LayerNorm(d_model, eps=1e-5)
295
  self.norm2 = nn.LayerNorm(d_model, eps=1e-5)
296
 
@@ -316,59 +148,34 @@ class StreamingTransformerLayer(nn.Module): #nn.TransformerEncoderLayer):
316
 
317
  class StreamingTransformer(nn.Module):
318
 
319
- def __init__(self, d_model: int,
320
- num_heads: int,
321
- num_layers: int,
322
- dim_feedforward: int = 2048,
323
- dropout: float = 0.1,
324
- bias_ff: bool = True,
325
- bias_attn: bool = True,
326
- custom: bool = False,
327
- memory_efficient: bool = False,
328
- attention_as_float32: bool = False,
329
- cross_attention: bool = False,
330
  positional_embedding: str = 'sin',
331
- max_period: float = 10_000,
332
- layer_class=StreamingTransformerLayer,
333
- checkpointing: str = 'none',
334
- device=None,
335
- dtype=None,
336
- **kwargs):
337
  super().__init__()
338
  assert d_model % num_heads == 0
339
 
340
  self.positional_embedding = positional_embedding
341
  self.max_period = max_period
342
-
343
-
344
-
345
- # self._stream_off = 0 # the llm should reinitialize this at ery generate()
346
-
347
- self.checkpointing = checkpointing
348
-
349
-
350
-
351
-
352
  self.layers = nn.ModuleList()
353
  for idx in range(num_layers):
354
  self.layers.append(
355
- layer_class(
356
- d_model=d_model, num_heads=num_heads, dim_feedforward=dim_feedforward,
357
- dropout=dropout, bias_ff=bias_ff, bias_attn=bias_attn,
358
- custom=custom,
359
- memory_efficient=memory_efficient, attention_as_float32=attention_as_float32,
360
- cross_attention=cross_attention,
361
- device=device, dtype=dtype, **kwargs))
362
-
363
- if self.checkpointing != 'none':
364
- for layer in self.layers:
365
- # see audiocraft/optim/fsdp.py, magic signal to indicate this requires fixing the
366
- # backward hook inside of FSDP...
367
- layer._magma_checkpointed = True # type: ignore
368
-
369
-
370
 
371
- def forward(self, x: torch.Tensor, *args, **kwargs):
 
 
 
372
 
373
  B, T, C = x.shape
374
 
@@ -376,7 +183,7 @@ class StreamingTransformer(nn.Module):
376
  if self.positional_embedding in ['sin', 'sin_rope']:
377
 
378
  positions = torch.arange(T, device=x.device).view(1, -1, 1)
379
- positions = positions + kwargs['token_count'] #offsets.view(-1, 1, 1)
380
  pos_emb = create_sin_embedding(positions, C, max_period=self.max_period, dtype=x.dtype)
381
  x = x + pos_emb
382
 
@@ -384,6 +191,6 @@ class StreamingTransformer(nn.Module):
384
 
385
  for j, lay in enumerate(self.layers):
386
  # print(f'Transf Layer{j} {pos_emb.sum()=} {pos_emb.shape=}{x.shape=}___________________')
387
- x = lay(x, cross_attention_src=kwargs["cross_attention_src"]) # cross_attention_src = txt-cond
388
  # each layer (mha) keeps history of its own k,v for all tokens
389
  return x
 
 
 
1
  import torch
2
  import torch.nn as nn
3
  from torch.nn import functional as F
4
+ from einops import rearrange
5
 
6
+ def create_sin_embedding(positions,
7
+ dim,
8
+ max_period = 10000,
9
+ dtype = torch.float32):
10
  """Create sinusoidal positional embedding, with shape `[B, T, C]`.
11
 
12
  Args:
 
27
  return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
28
 
29
 
30
  class StreamingMultiheadAttention(nn.Module):
31
 
32
  def __init__(self,
33
  embed_dim,
34
+ num_heads,
35
+ cross_attention = False):
 
 
 
 
36
  super().__init__()
37
+ self.cross_attention = cross_attention
 
 
 
38
  self.embed_dim = embed_dim
 
39
  self.k_history = None # previous k from the previous tokens seen in the current generation - only for selt.attn
40
+ self.v_history = None # clean up IN LM after finishing GENERATION - Each 1...47 mha has different kv history
 
 
 
 
 
 
41
  self.num_heads = num_heads
42
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False)
43
+ self.register_buffer('in_proj_weight', torch.ones((3 * embed_dim, embed_dim),
44
+ dtype=torch.float))
45
 
46
+ def forward(self,
47
+ query,
48
+ key=None,
 
 
 
 
49
  value=None):
 
 
 
 
50
  layout = "b h t d"
51
+ if self.cross_attention:
52
+
53
+ # Different queries, keys, values; we have to split the in_proj_weight manually
54
+
55
+ dim = self.in_proj_weight.shape[0] // 3
56
+
57
+ q = nn.functional.linear(query, self.in_proj_weight[:dim])
58
+ k = nn.functional.linear(key, self.in_proj_weight[dim: 2 * dim])
59
+ v = nn.functional.linear(value, self.in_proj_weight[2 * dim:])
60
+
61
+ q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
62
+ # print(q.shape, k.shape, v.shape, q.sum(), k.sum(), v.sum(),'CROSS A5')
63
+ else:
64
+ # 1st projected makes k,v (instantaneous)
65
+ # 2nd cat
66
+
67
+
68
+ # HISTORY - DIFFERENT FOR EACH TRANSF LAYER
69
+
70
+ projected = nn.functional.linear(query, self.in_proj_weight)
71
 
72
+ bound_layout = "b h p t d"
73
+ packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
74
+ q, k, v = packed.unbind(dim=2)
75
 
76
 
77
+ if self.k_history is not None:
78
+ #
79
+ # pk.shape=torch.Size([2, 24, 3, 64]) k.shape=torch.Size([2, 24, 1, 64]) CONCAT
80
+ # has to be 4D with batch 1 due to single condition 3=seqlen
81
+ # 24 heads 64 dimofh
82
+ self.k_history = torch.cat([self.k_history, k], 2)
83
+ self.v_history = torch.cat([self.v_history, v], 2)
84
 
85
+ else:
86
+ # init on 1st token (for all 47 transf layers)
87
+ print(f'AudioGen kv cache Flush')
88
+ self.k_history = k
89
+ self.v_history = v
90
+
91
+ k = self.k_history
92
+ v = self.v_history
93
 
94
 
 
 
 
 
95
 
96
+ # KV COMPLETION ONLY ON SELF ATTENTION
 
 
97
 
98
+ x = torch.nn.functional.scaled_dot_product_attention(
99
+ q, k, v, is_causal=False, dropout_p=0
100
+ )
101
+
102
+ x = x.to(q.dtype)
103
+ x = rearrange(x, f"{layout} -> b t (h d)", h=self.num_heads)
104
+ x = self.out_proj(x)
 
 
105
  return x
106
 
107
 
108
+ class StreamingTransformerLayer(nn.Module):
 
109
 
110
  def __init__(self,
111
+ d_model,
112
+ num_heads,
113
+ dim_feedforward):
114
 
 
 
 
 
 
115
 
116
+ super().__init__()
117
 
118
+ self.self_attn = StreamingMultiheadAttention(embed_dim=d_model,
119
+ num_heads=num_heads)
120
+ self.linear1 = nn.Linear(d_model, dim_feedforward, bias=False)
121
+ self.linear2 = nn.Linear(dim_feedforward, d_model, bias=False)
122
+ self.cross_attention = StreamingMultiheadAttention(embed_dim=d_model,
123
+ num_heads=num_heads,
124
+ cross_attention=True)
125
+ self.norm_cross = nn.LayerNorm(d_model, eps=1e-5)
 
126
  self.norm1 = nn.LayerNorm(d_model, eps=1e-5)
127
  self.norm2 = nn.LayerNorm(d_model, eps=1e-5)
128
 
 
148
 
149
  class StreamingTransformer(nn.Module):
150
 
151
+ def __init__(self,
152
+ d_model=1536,
153
+ num_heads=24,
154
+ num_layers=48,
155
+ dim_feedforward=6144,
156
+ cross_attention = True,
 
 
 
 
 
157
  positional_embedding: str = 'sin',
158
+ max_period: float = 10_000
159
+ ):
 
 
 
 
160
  super().__init__()
161
  assert d_model % num_heads == 0
162
 
163
  self.positional_embedding = positional_embedding
164
  self.max_period = max_period
 
165
  self.layers = nn.ModuleList()
166
  for idx in range(num_layers):
167
  self.layers.append(
168
+ StreamingTransformerLayer(
169
+ d_model=d_model,
170
+ num_heads=num_heads,
171
+ dim_feedforward=dim_feedforward
172
+ )
173
+ )
174
 
175
+ def forward(self,
176
+ x,
177
+ token_count=None,
178
+ cross_attention_src=None):
179
 
180
  B, T, C = x.shape
181
 
 
183
  if self.positional_embedding in ['sin', 'sin_rope']:
184
 
185
  positions = torch.arange(T, device=x.device).view(1, -1, 1)
186
+ positions = positions + token_count #offsets.view(-1, 1, 1)
187
  pos_emb = create_sin_embedding(positions, C, max_period=self.max_period, dtype=x.dtype)
188
  x = x + pos_emb
189
 
 
191
 
192
  for j, lay in enumerate(self.layers):
193
  # print(f'Transf Layer{j} {pos_emb.sum()=} {pos_emb.shape=}{x.shape=}___________________')
194
+ x = lay(x, cross_attention_src=cross_attention_src) # cross_attention_src = txt-cond
195
  # each layer (mha) keeps history of its own k,v for all tokens
196
  return x
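Each self-attention layer now keeps its own k_history/v_history by concatenating along the time axis, and lm.py resets both to None once a generation finishes. A minimal sketch of that caching discipline with the shapes from the comments above ([B, heads, T, head_dim]); illustrative only, not the committed class:

import torch

class KVCache:
    # append-only key/value cache, one instance per self-attention layer
    def __init__(self):
        self.k = self.v = None

    def update(self, k_new, v_new):
        # k_new / v_new: [B, heads, 1, head_dim] for the current token
        if self.k is None:                       # first token: initialise ('flush')
            self.k, self.v = k_new, v_new
        else:                                    # later tokens: grow along the time dim
            self.k = torch.cat([self.k, k_new], dim=2)
            self.v = torch.cat([self.v, v_new], dim=2)
        return self.k, self.v

    def reset(self):                             # call after generation, as lm.py does
        self.k = self.v = None

cache = KVCache()
for _ in range(7):                               # 7 decoding steps
    k, v = cache.update(torch.randn(1, 24, 1, 64), torch.randn(1, 24, 1, 64))
print(k.shape)                                   # torch.Size([1, 24, 7, 64])
cache.reset()
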
msinference.py CHANGED
@@ -293,10 +293,41 @@ with open(f"Utils/all_langs.csv") as f:
293
 
294
 
295
 
296
- # LOAD hun / ron / serbian - rmc-script_latin / cyrillic-Carpathian (not Vlax)
297
-
298
-
299
-
 
300
 
301
  def has_cyrillic(text):
302
  # https://stackoverflow.com/questions/48255244/python-check-if-a-string-contains-cyrillic-characters
@@ -358,7 +389,7 @@ class TextForeign(object):
358
  def foreign(text=None, # list of text
359
  lang='romanian',
360
  speed=None):
361
-
362
  lang = lang.lower() # https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv
363
 
364
  # https://huggingface.co/spaces/mms-meta/MMS
@@ -367,11 +398,11 @@ def foreign(text=None, # list of text
367
 
368
  lang_code = 'hun'
369
 
370
- elif 'ser' in lang or 'bosn' in lang or 'macedon' in lang or 'croatia' in lang:
371
 
372
  if has_cyrillic(text[0]): # check 0-th sentence if is cyrillic
373
 
374
- lang_code = 'rmc-script_cyrillic' # romani carpathian (also has lating/cyrillic Vlax)
375
 
376
  else:
377
 
@@ -387,6 +418,11 @@ def foreign(text=None, # list of text
387
  lang_code = 'deu'
388
  speed = 1.14 if speed is None else speed
389
 
 
 
 
 
 
390
  else:
391
 
392
  lang_code = lang.split()[0].strip()
@@ -431,20 +467,29 @@ def foreign(text=None, # list of text
431
  x = []
432
 
433
  for _t in text:
434
-
435
-
436
-
437
  if is_uroman:
438
  uroman_dir = "Utils/uroman"
439
  assert os.path.exists(uroman_dir)
440
  uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
441
  _t = text_mapper.uromanize(_t, uroman_pl)
442
 
443
- _t = _t.lower().replace("ţ", "ț").replace('ț','ts').replace('î', 'u') # Parse STTS2 pronounciation on tts_mult()
 
444
 
445
  _t = text_mapper.filter_oov(_t, lang=lang)
446
 
447
- # print(f'{speed=}\n\n\n\n_______________________________ {_t}')
448
  stn_tst = text_mapper.get_text(_t, hps)
449
  with torch.no_grad():
450
  x_tst = stn_tst.unsqueeze(0).to(device)
@@ -468,14 +513,3 @@ def foreign(text=None, # list of text
468
  original_rate=16000,
469
  target_rate=24000)[0, :] # reshapes (64,) -> (1,64)
470
  return x
471
-
472
-
473
-
474
-
475
- # LANG = 'eng'
476
- # _t = 'Converts a string of text to a sequence of IDs corresponding to the symbols in the text. Args: text: string to convert to a sequence'
477
-
478
- # x = synthesize(text=_t, lang=LANG, speed=1.14)
479
- # audiofile.write('_r.wav', x, 16000) # mms-tts = 16,000
480
-
481
-
 
293
 
294
 
295
 
296
+ # LOAD hun / ron / serbian - rmc-script_latin / cyrillic-Carpathian (not Vlax)
297
+ # ==============================================================================================
298
+ import re
299
+ from num2words import num2words
300
+
301
+ PHONEME_MAP = {
302
+ 'q': 'ku',
303
+ 'w': 'aou',
304
+ 'z': 's',
305
+ "š": "s",
306
+ 'th': 'ta',
307
+ 'v': 'vv',
308
+ # "ć": "č",
309
+ # "đ": "ď",
310
+ # "lj": "ľ",
311
+ # "nj": "ň",
312
+ "ž": "z",
313
+ # "c": "č"
314
+ }
315
+
316
+ # ALLOWED_PHONEMES = set("šč_bďph`-3žt 'ľzj5yuoóx1vfnaiedt́sṁkň2rčlg")
317
+
318
+ def number_to_phonemes(match):
319
+ number = int(match.group())
320
+ words = num2words(number, lang='sr')
321
+ return fix_phones(words.lower())
322
+ # return words
323
+
324
+ def fix_phones(text):
325
+ for src, target in PHONEME_MAP.items():
326
+ text = text.replace(src, target)
327
+ # text = re.sub(r'\s+', '` `', text) #.strip() #.lower()
328
+ # text = re.sub(r'\s+', '_ _', text) # almost proper pausing
329
+
330
+ return text.replace(',', '_ _').replace('.', '_ _')
331
 
332
  def has_cyrillic(text):
333
  # https://stackoverflow.com/questions/48255244/python-check-if-a-string-contains-cyrillic-characters
 
389
  def foreign(text=None, # list of text
390
  lang='romanian',
391
  speed=None):
392
+
393
  lang = lang.lower() # https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv
394
 
395
  # https://huggingface.co/spaces/mms-meta/MMS
 
398
 
399
  lang_code = 'hun'
400
 
401
+ elif any([i in lang for i in ['ser', 'bosn', 'herzegov', 'montenegr', 'macedon']]):
402
 
403
  if has_cyrillic(text[0]): # check 0-th sentence if is cyrillic
404
 
405
+ lang_code = 'rmc-script_cyrillic' # romani carpathian (also has latin / cyrillic Vlax)
406
 
407
  else:
408
 
 
418
  lang_code = 'deu'
419
  speed = 1.14 if speed is None else speed
420
 
421
+ elif 'alban' in lang:
422
+
423
+ lang_code = 'sqi'
424
+ speed = 1.04 if speed is None else speed
425
+
426
  else:
427
 
428
  lang_code = lang.split()[0].strip()
 
467
  x = []
468
 
469
  for _t in text:
 
 
 
470
  if is_uroman:
471
  uroman_dir = "Utils/uroman"
472
  assert os.path.exists(uroman_dir)
473
  uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
474
  _t = text_mapper.uromanize(_t, uroman_pl)
475
 
476
+ _t = _t.lower()
477
+
478
+ if lang_code == 'rmc-script_latin':
479
+
480
+ _t = re.sub(r'\d+', number_to_phonemes, _t)
481
+ _t = fix_phones(_t)
482
+
483
+ elif lang_code == 'ron':
484
+
485
+ _t = _t.replace("ţ", "ț"
486
+ ).replace('ț','ts').replace('î', 'u')
487
+
488
+ # /data/dkounadis/.hf7/hub/models--facebook--mms-tts/snapshots/44cc7fb408064ef9ea6e7c59130d88cac1274671/models/rmc-script_latin/vocab.txt
489
 
490
  _t = text_mapper.filter_oov(_t, lang=lang)
491
 
492
+ print(f'{speed=}\n\n\n\n_______________________________ {_t}')
493
  stn_tst = text_mapper.get_text(_t, hps)
494
  with torch.no_grad():
495
  x_tst = stn_tst.unsqueeze(0).to(device)
 
513
  original_rate=16000,
514
  target_rate=24000)[0, :] # reshapes (64,) -> (1,64)
515
  return x
 
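For the rmc-script_latin voice, digits are now spelled out with num2words in Serbian and run through the same character substitutions as the rest of the text before MMS-TTS sees it. A compact sketch of that pre-processing using the PHONEME_MAP entries added above (the example sentence is illustrative):

import re
from num2words import num2words

PHONEME_MAP = {'q': 'ku', 'w': 'aou', 'z': 's', 'š': 's', 'th': 'ta', 'v': 'vv', 'ž': 'z'}

def fix_phones(text):
    for src, target in PHONEME_MAP.items():
        text = text.replace(src, target)
    return text.replace(',', '_ _').replace('.', '_ _')   # punctuation becomes pauses

def verbalise_numbers(text):
    # replace every digit run with its Serbian wording, then re-map phonemes
    return re.sub(r'\d+',
                  lambda m: fix_phones(num2words(int(m.group()), lang='sr').lower()),
                  text)

print(fix_phones(verbalise_numbers('ima 12 kuca.')))
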
requirements.txt CHANGED
@@ -2,7 +2,7 @@ torch
2
  torchaudio
3
  numpy
4
  audiofile
5
- audresample
6
  cached_path
7
  einops
8
  flask
 
2
  torchaudio
3
  numpy
4
  audiofile
5
+ num2words
6
  cached_path
7
  einops
8
  flask