Dionyssos committed
Commit e366cd5 · 1 Parent(s): 0a8807e

clean unused functions

audiocraft/builders.py CHANGED
@@ -10,22 +10,13 @@ from the Hydra config.
 """
 
 import typing as tp
-
-import audiocraft
 import omegaconf
 import torch
 
 from .encodec import CompressionModel, EncodecModel
 from .lm import LMModel
 from .seanet import SEANetDecoder
-from .codebooks_patterns import (
-    CodebooksPatternProvider,
-    DelayedPatternProvider,
-    MusicLMPattern,
-    ParallelPatternProvider,
-    UnrolledPatternProvider,
-    CoarseFirstPattern,
-)
+from .codebooks_patterns import DelayedPatternProvider
 from .conditioners import (
     BaseConditioner,
     ConditionFuser,
@@ -159,45 +150,18 @@ def get_condition_fuser(cfg: omegaconf.DictConfig) -> ConditionFuser:
     return fuser
 
 
-def get_codebooks_pattern_provider(n_q: int, cfg: omegaconf.DictConfig) -> CodebooksPatternProvider:
-    """Instantiate a codebooks pattern provider object."""
+def get_codebooks_pattern_provider(n_q, cfg):
     pattern_providers = {
-        'parallel': ParallelPatternProvider,
-        'delay': DelayedPatternProvider,
-        'unroll': UnrolledPatternProvider,
-        'coarse_first': CoarseFirstPattern,
-        'musiclm': MusicLMPattern,
+        'delay': DelayedPatternProvider,  # THIS
     }
     name = cfg.modeling
     kwargs = dict_from_config(cfg.get(name)) if hasattr(cfg, name) else {}
+
     klass = pattern_providers[name]
     return klass(n_q, **kwargs)
 
 
-def get_debug_compression_model(device='cpu', sample_rate: int = 32000):
-    """Instantiate a debug compression model to be used for unit tests."""
-    assert sample_rate in [16000, 32000], "unsupported sample rate for debug compression model"
-    model_ratios = {
-        16000: [10, 8, 8],   # 25 Hz at 16kHz
-        32000: [10, 8, 16]   # 25 Hz at 32kHz
-    }
-    ratios: tp.List[int] = model_ratios[sample_rate]
-    frame_rate = 25
-    seanet_kwargs: dict = {
-        'n_filters': 4,
-        'n_residual_layers': 1,
-        'dimension': 32,
-        'ratios': ratios,
-    }
-    encoder = SEANetEncoder(**seanet_kwargs)
-    decoder = SEANetDecoder(**seanet_kwargs)
-    quantizer = qt.ResidualVectorQuantizer(dimension=32, bins=400, n_q=4)
-    init_x = torch.randn(8, 32, 128)
-    quantizer(init_x, 1)  # initialize kmeans etc.
-    compression_model = EncodecModel(
-        encoder, decoder, quantizer,
-        frame_rate=frame_rate, sample_rate=sample_rate, channels=1).to(device)
-    return compression_model.eval()
+
 
 
 def get_diffusion_model(cfg: omegaconf.DictConfig):
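
Editor's note: a minimal usage sketch of the trimmed factory above, not part of the commit. After this change only the 'delay' pattern can be resolved. The audiocraft.builders import path and the config values below are assumptions for illustration.

import omegaconf
from audiocraft.builders import get_codebooks_pattern_provider  # assumed module path

cfg = omegaconf.OmegaConf.create({
    'modeling': 'delay',
    'delay': {'delays': [0, 1, 2, 3]},      # hypothetical per-codebook delays
})
provider = get_codebooks_pattern_provider(n_q=4, cfg=cfg)   # -> DelayedPatternProvider
pattern = provider.get_pattern(8)                           # interleaved layout over 8 timesteps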
audiocraft/codebooks_patterns.py CHANGED
@@ -52,7 +52,7 @@ class Pattern:
         self._validate_layout()
         self._build_reverted_sequence_scatter_indexes = lru_cache(100)(self._build_reverted_sequence_scatter_indexes)
         self._build_pattern_sequence_scatter_indexes = lru_cache(100)(self._build_pattern_sequence_scatter_indexes)
-        logger.info("New pattern, time steps: %d, sequence steps: %d", self.timesteps, len(self.layout))
+        print("New pattern, time steps: %d, sequence steps: %d", self.timesteps, len(self.layout))
 
     def _validate_layout(self):
         """Runs checks on the layout to ensure a valid pattern is defined.
@@ -356,193 +356,4 @@ class DelayedPatternProvider(CodebooksPatternProvider):
356
  return Pattern(out, n_q=self.n_q, timesteps=timesteps)
357
 
358
 
359
- class ParallelPatternProvider(DelayedPatternProvider):
360
- """Provider for parallel pattern across codebooks.
361
- This pattern provider is a special case of the delayed pattern with actually no delay,
362
- hence delays=repeat(0, n_q).
363
 
364
- Args:
365
- n_q (int): Number of codebooks.
366
- empty_initial (int): Prepend with N empty list of coordinates.
367
- """
368
- def __init__(self, n_q: int, empty_initial: int = 0):
369
- super().__init__(n_q, [0] * n_q, empty_initial=empty_initial)
370
-
371
-
372
- class UnrolledPatternProvider(CodebooksPatternProvider):
373
- """Provider for unrolling codebooks pattern.
374
- This pattern provider enables to represent the codebook flattened completely or only to some extend
375
- while also specifying a given delay between the flattened codebooks representation, allowing to
376
- unroll the codebooks in the sequence.
377
-
378
- Example:
379
- 1. Flattening of the codebooks.
380
- By default, the pattern provider will fully flatten the codebooks such as flattening=range(n_q),
381
- taking n_q = 3 and timesteps = 4:
382
- [[1, 2, 3, 4],
383
- [1, 2, 3, 4],
384
- [1, 2, 3, 4]]
385
- will result into:
386
- [[S, S, 1, S, S, 2, S, S, 3, S, S, 4],
387
- [S, 1, S, S, 2, S, S, 3, S, S, 4, S],
388
- [1, S, S, 2, S, S, 3, S, S, 4, S, S]]
389
- 2. Partial flattening of the codebooks. The ``flattening`` parameter allows to specify the inner step
390
- for each of the codebook, allowing to define which codebook to flatten (or keep in parallel), for example
391
- taking n_q = 3, timesteps = 4 and flattening = [0, 1, 1]:
392
- [[1, 2, 3, 4],
393
- [1, 2, 3, 4],
394
- [1, 2, 3, 4]]
395
- will result into:
396
- [[S, 1, S, S, 2, S, S, 3, S, S, 4, S],
397
- [S, 1, S, S, 2, S, S, 3, S, S, 4, S],
398
- [1, S, S, 2, S, S, 3, S, S, 4, S, S]]
399
- 3. Flattening with delay. The ``delay`` parameter allows to further unroll the sequence of codebooks
400
- allowing to specify the delay per codebook. Note that the delay between codebooks flattened to the
401
- same inner timestep should be coherent. For example, taking n_q = 3, timesteps = 4, flattening = [0, 1, 1]
402
- and delays = [0, 3, 3]:
403
- [[1, 2, 3, 4],
404
- [1, 2, 3, 4],
405
- [1, 2, 3, 4]]
406
- will result into:
407
- [[S, S, S, 1, S, 2, S, 3, S, 4],
408
- [S, S, S, 1, S, 2, S, 3, S, 4],
409
- [1, 2, 3, S, 4, S, 5, S, 6, S]]
410
-
411
- Args:
412
- n_q (int): Number of codebooks.
413
- flattening (list of int, optional): Flattening schema over the codebooks. If not defined,
414
- the codebooks will be flattened to 1 codebook per step, meaning that the sequence will
415
- have n_q extra steps for each timestep.
416
- delays (list of int, optional): Delay for each of the codebooks. If not defined,
417
- no delay is added and therefore will default to [0] * ``n_q``.
418
- Note that two codebooks that will be flattened to the same inner step
419
- should have the same delay, otherwise the pattern is considered as invalid.
420
- """
421
- FlattenedCodebook = namedtuple('FlattenedCodebook', ['codebooks', 'delay'])
422
-
423
- def __init__(self, n_q: int, flattening: tp.Optional[tp.List[int]] = None,
424
- delays: tp.Optional[tp.List[int]] = None):
425
- super().__init__(n_q)
426
- if flattening is None:
427
- flattening = list(range(n_q))
428
- if delays is None:
429
- delays = [0] * n_q
430
- assert len(flattening) == n_q
431
- assert len(delays) == n_q
432
- assert sorted(flattening) == flattening
433
- assert sorted(delays) == delays
434
- self._flattened_codebooks = self._build_flattened_codebooks(delays, flattening)
435
- self.max_delay = max(delays)
436
-
437
- def _build_flattened_codebooks(self, delays: tp.List[int], flattening: tp.List[int]):
438
- """Build a flattened codebooks representation as a dictionary of inner step
439
- and the actual codebook indices corresponding to the flattened codebook. For convenience, we
440
- also store the delay associated to the flattened codebook to avoid maintaining an extra mapping.
441
- """
442
- flattened_codebooks: dict = {}
443
- for q, (inner_step, delay) in enumerate(zip(flattening, delays)):
444
- if inner_step not in flattened_codebooks:
445
- flat_codebook = UnrolledPatternProvider.FlattenedCodebook(codebooks=[q], delay=delay)
446
- else:
447
- flat_codebook = flattened_codebooks[inner_step]
448
- assert flat_codebook.delay == delay, (
449
- "Delay and flattening between codebooks is inconsistent: ",
450
- "two codebooks flattened to the same position should have the same delay."
451
- )
452
- flat_codebook.codebooks.append(q)
453
- flattened_codebooks[inner_step] = flat_codebook
454
- return flattened_codebooks
455
-
456
- @property
457
- def _num_inner_steps(self):
458
- """Number of inner steps to unroll between timesteps in order to flatten the codebooks.
459
- """
460
- return max([inner_step for inner_step in self._flattened_codebooks.keys()]) + 1
461
-
462
- def num_virtual_steps(self, timesteps: int) -> int:
463
- return timesteps * self._num_inner_steps + 1
464
-
465
- def get_pattern(self, timesteps: int) -> Pattern:
466
- """Builds pattern for delay across codebooks.
467
-
468
- Args:
469
- timesteps (int): Total number of timesteps.
470
- """
471
- # the PatternLayout is built as a tuple of sequence position and list of coordinates
472
- # so that it can be reordered properly given the required delay between codebooks of given timesteps
473
- indexed_out: list = [(-1, [])]
474
- max_timesteps = timesteps + self.max_delay
475
- for t in range(max_timesteps):
476
- # for each timestep, we unroll the flattened codebooks,
477
- # emitting the sequence step with the corresponding delay
478
- for step in range(self._num_inner_steps):
479
- if step in self._flattened_codebooks:
480
- # we have codebooks at this virtual step to emit
481
- step_codebooks = self._flattened_codebooks[step]
482
- t_for_q = t + step_codebooks.delay
483
- coords = [LayoutCoord(t, q) for q in step_codebooks.codebooks]
484
- if t_for_q < max_timesteps and t < max_timesteps:
485
- indexed_out.append((t_for_q, coords))
486
- else:
487
- # there is no codebook in this virtual step so we emit an empty list
488
- indexed_out.append((t, []))
489
- out = [coords for _, coords in sorted(indexed_out)]
490
- return Pattern(out, n_q=self.n_q, timesteps=timesteps)
491
-
492
-
493
- class CoarseFirstPattern(CodebooksPatternProvider):
494
- """First generates all the codebooks #1 (e.g. coarser), then the remaining ones,
495
- potentially with delays.
496
-
497
- ..Warning:: You must always generate the full training duration at test time, for instance,
498
- 30 seconds, as otherwise, the fine codebooks will start being generated in an unexpected
499
- location. This is due to the non causality of the remaining codebooks with respect to
500
- the first ones.
501
-
502
- Args:
503
- n_q (int): Number of codebooks.
504
- delays (list of int, optional): Delay for each of the codebooks.
505
- If delays not defined, each codebook is delayed by 1 compared to the previous one.
506
- """
507
- def __init__(self, n_q: int, delays: tp.Optional[tp.List[int]] = None):
508
- super().__init__(n_q)
509
- if delays is None:
510
- delays = [0] * (n_q - 1)
511
- self.delays = delays
512
- assert len(self.delays) == self.n_q - 1
513
- assert sorted(self.delays) == self.delays
514
-
515
- def get_pattern(self, timesteps: int) -> Pattern:
516
- out: PatternLayout = [[]]
517
- for t in range(timesteps):
518
- out.append([LayoutCoord(t, 0)])
519
- max_delay = max(self.delays)
520
- for t in range(timesteps + max_delay):
521
- v = []
522
- for q, delay in enumerate(self.delays):
523
- t_for_q = t - delay
524
- if t_for_q >= 0:
525
- v.append(LayoutCoord(t_for_q, q + 1))
526
- out.append(v)
527
- return Pattern(out, n_q=self.n_q, timesteps=timesteps)
528
-
529
-
530
- class MusicLMPattern(CodebooksPatternProvider):
531
- """Almost MusicLM style pattern. This is equivalent to full flattening
532
- but in a different order.
533
-
534
- Args:
535
- n_q (int): Number of codebooks.
536
- group_by (int): Number of codebooks to group together.
537
- """
538
- def __init__(self, n_q: int, group_by: int = 2):
539
- super().__init__(n_q)
540
- self.group_by = group_by
541
-
542
- def get_pattern(self, timesteps: int) -> Pattern:
543
- out: PatternLayout = [[]]
544
- for offset in range(0, self.n_q, self.group_by):
545
- for t in range(timesteps):
546
- for q in range(offset, offset + self.group_by):
547
- out.append([LayoutCoord(t, q)])
548
- return Pattern(out, n_q=self.n_q, timesteps=timesteps)
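
Editor's note: only DelayedPatternProvider survives this cleanup. Below is a hedged sketch of what it produces, not part of the commit; the default per-codebook delays and the 2048 special-token id are assumptions for illustration, while the build/revert calls mirror their use in lm.py further down.

import torch
from audiocraft.codebooks_patterns import DelayedPatternProvider

n_q, T = 4, 8
provider = DelayedPatternProvider(n_q=n_q)             # default delays shift codebook k by k steps
pattern = provider.get_pattern(T)
print("sequence steps:", len(pattern.layout))          # the value reported by the print() added above

codes = torch.randint(0, 2048, (2, n_q, T))            # [B, K, T] codec tokens (hypothetical values)
special = 2048                                         # hypothetical id used for padded slots
seq, idx, mask = pattern.build_pattern_sequence(codes, special)                       # -> [B, K, S] interleaved
out, out_idx, out_mask = pattern.revert_pattern_sequence(seq, special_token=special)  # back to [B, K, T]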
 
audiocraft/conditioners.py CHANGED
@@ -410,7 +410,10 @@ class ConditionFuser(StreamingModule):
         # print(f'{self.cond2fuse=}') - self.cond2fuse={'description': 'cross'}
 
         cross_attention_output = cond
-
+        # print(f'{cross_attention_output.shape=} for {input.sum()=}')
+        # cross_attention_output.shape=torch.Size([2, 5, 1536]) for input.sum()=tensor(-0.0650, device='cuda:0')
+        # cross_attention_output.shape=torch.Size([2, 5, 1536]) for input.sum()=tensor(3.7672, device='cuda:0')
+
 
         if self._is_streaming:
             self._streaming_state['offsets'] = offsets + T
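
Editor's note: a hedged sketch of the 'cross' fuse path these debug prints trace, not part of the commit. It assumes ConditionFuser keeps the constructor and forward interface of the copy removed from lm.py below; the shapes match the logged comments.

import torch
from audiocraft.conditioners import ConditionFuser

fuser = ConditionFuser(fuse2cond={'cross': ['description']})
x = torch.randn(2, 10, 1536)                     # transformer input [B, T, D]
desc = torch.randn(2, 5, 1536)                   # text conditioning [B, 5, D]
desc_mask = torch.ones(2, 5, dtype=torch.int)
x_out, cross = fuser(x, {'description': (desc, desc_mask)})
# cross is the tensor whose shape is printed above: torch.Size([2, 5, 1536])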
audiocraft/encodec.py CHANGED
@@ -77,42 +77,7 @@ class CompressionModel(ABC, nn.Module):
         """Set the active number of codebooks used by the quantizer."""
         ...
 
-    @staticmethod
-    def get_pretrained(
-            name: str, device: tp.Union[torch.device, str] = 'cpu'
-            ) -> 'CompressionModel':
-        """Instantiate a CompressionModel from a given pretrained model.
-
-        Args:
-            name (Path or str): name of the pretrained model. See after.
-            device (torch.device or str): Device on which the model is loaded.
-
-        Pretrained models:
-            - dac_44khz (https://github.com/descriptinc/descript-audio-codec)
-            - dac_24khz (same)
-            - facebook/encodec_24khz (https://huggingface.co/facebook/encodec_24khz)
-            - facebook/encodec_32khz (https://huggingface.co/facebook/encodec_32khz)
-            - your own model on Hugging Face. Export instructions to come...
-        """
-
-        from . import builders, loaders
-        model: CompressionModel
-        if name in ['dac_44khz', 'dac_24khz']:
-            model_type = name.split('_')[1]
-            logger.info("Getting pretrained compression model from DAC %s", model_type)
-            model = DAC(model_type)
-        elif name in ['debug_compression_model']:
-            logger.info("Getting pretrained compression model for debug")
-            model = builders.get_debug_compression_model()
-        elif Path(name).exists():
-            # We assume here if the path exists that it is in fact an AC checkpoint
-            # that was exported using `audiocraft.utils.export` functions.
-            model = loaders.load_compression_model(name, device=device)
-        else:
-            logger.info("Getting pretrained compression model from HF %s", name)
-            hf_model = HFEncodecModel.from_pretrained(name)
-            model = HFEncodecCompressionModel(hf_model).to(device)
-        return model.to(device).eval()
+
 
 
 class EncodecModel(CompressionModel):
@@ -196,20 +161,13 @@ class EncodecModel(CompressionModel):
         return x
 
     def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
-        """Decode the given codes to a reconstructed representation, using the scale to perform
-        audio denormalization if needed.
-
-        Args:
-            codes (torch.Tensor): Int tensor of shape [B, K, T]
-            scale (torch.Tensor, optional): Float tensor containing the scale value.
-
-        Returns:
-            out (torch.Tensor): Float tensor of shape [B, C, T], the reconstructed audio.
-        """
+        # B,K,T -> B,C,T
         emb = self.decode_latent(codes)
+
         out = self.decoder(emb)
+
         out = self.postprocess(out, scale)
-        # out contains extra padding added by the encoder and decoder
+
         return out
 
     def decode_latent(self, codes: torch.Tensor):
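
Editor's note: a short, hedged usage sketch of the decode contract noted in the new comment; nothing here is added by the commit, and `model` is assumed to be an already-constructed EncodecModel from this repo.

import typing as tp
import torch

def codes_to_waveform(model, codes: torch.Tensor,
                      scale: tp.Optional[torch.Tensor] = None) -> torch.Tensor:
    # codes: int tensor [B, K, T] from the quantizer; returns float audio [B, C, T'].
    with torch.no_grad():
        return model.decode(codes, scale)   # decode_latent -> decoder -> postprocess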
audiocraft/lm.py CHANGED
@@ -1,769 +1,27 @@
1
- # ========================= From conditioners.py
2
- import soundfile
3
- from collections import defaultdict
4
- from copy import deepcopy
5
  from dataclasses import dataclass, field
6
  from itertools import chain
7
  import logging
8
  import math
9
- from pathlib import Path
10
- import random
11
  import re
12
  import typing as tp
13
- import warnings
14
- import einops
15
- from num2words import num2words
16
- import spacy
17
- from transformers import T5EncoderModel, T5Tokenizer # type: ignore
18
  import torch
19
  import torch.nn.functional as F
20
- from torch.nn.utils.rnn import pad_sequence
21
  from audiocraft.streaming import StreamingModule
22
- from audiocraft.transformer import create_sin_embedding
23
- from audiocraft.utils.autocast import TorchAutocast
24
- from audiocraft.utils.utils import collate, length_to_mask
25
  from audiocraft.transformer import StreamingTransformer, create_norm_fn
26
  from dataclasses import dataclass
27
  from functools import partial
28
- import logging
29
- import math
30
- import typing as tp
31
-
32
-
33
  from torch import nn
34
-
35
  from audiocraft.utils import utils
36
- from audiocraft.codebooks_patterns import CodebooksPatternProvider
37
  from audiocraft.activations import get_activation_fn
38
 
39
 
40
-
41
 
42
 
43
  logger = logging.getLogger(__name__)
44
  TextCondition = tp.Optional[str] # a text condition can be a string or None (if doesn't exist)
45
  ConditionType = tp.Tuple[torch.Tensor, torch.Tensor] # condition, mask
46
 
47
-
48
- class WavCondition(tp.NamedTuple):
49
- wav: torch.Tensor
50
- length: torch.Tensor
51
- sample_rate: tp.List[int]
52
- path: tp.List[tp.Optional[str]] = []
53
- seek_time: tp.List[tp.Optional[float]] = []
54
-
55
-
56
- class JointEmbedCondition(tp.NamedTuple):
57
- wav: torch.Tensor
58
- text: tp.List[tp.Optional[str]]
59
- length: torch.Tensor
60
- sample_rate: tp.List[int]
61
- path: tp.List[tp.Optional[str]] = []
62
- seek_time: tp.List[tp.Optional[float]] = []
63
-
64
-
65
- @dataclass
66
- class ConditioningAttributes:
67
- text: tp.Dict[str, tp.Optional[str]] = field(default_factory=dict)
68
- wav: tp.Dict[str, WavCondition] = field(default_factory=dict)
69
- joint_embed: tp.Dict[str, JointEmbedCondition] = field(default_factory=dict)
70
-
71
- def __getitem__(self, item):
72
- return getattr(self, item)
73
-
74
- @property
75
- def text_attributes(self):
76
- return self.text.keys()
77
-
78
- @property
79
- def wav_attributes(self):
80
- return self.wav.keys()
81
-
82
- @property
83
- def joint_embed_attributes(self):
84
- return self.joint_embed.keys()
85
-
86
- @property
87
- def attributes(self):
88
- return {
89
- "text": self.text_attributes,
90
- "wav": self.wav_attributes,
91
- "joint_embed": self.joint_embed_attributes,
92
- }
93
-
94
- def to_flat_dict(self):
95
- return {
96
- **{f"text.{k}": v for k, v in self.text.items()},
97
- **{f"wav.{k}": v for k, v in self.wav.items()},
98
- **{f"joint_embed.{k}": v for k, v in self.joint_embed.items()}
99
- }
100
-
101
- @classmethod
102
- def from_flat_dict(cls, x):
103
- out = cls()
104
- for k, v in x.items():
105
- kind, att = k.split(".")
106
- out[kind][att] = v
107
- return out
108
-
109
-
110
-
111
-
112
-
113
- def nullify_condition(condition: ConditionType, dim: int = 1):
114
- """Transform an input condition to a null condition.
115
- The way it is done by converting it to a single zero vector similarly
116
- to how it is done inside WhiteSpaceTokenizer and NoopTokenizer.
117
-
118
- Args:
119
- condition (ConditionType): A tuple of condition and mask (tuple[torch.Tensor, torch.Tensor])
120
- dim (int): The dimension that will be truncated (should be the time dimension)
121
- WARNING!: dim should not be the batch dimension!
122
- Returns:
123
- ConditionType: A tuple of null condition and mask
124
- """
125
- assert dim != 0, "dim cannot be the batch dimension!"
126
- assert isinstance(condition, tuple) and \
127
- isinstance(condition[0], torch.Tensor) and \
128
- isinstance(condition[1], torch.Tensor), "'nullify_condition' got an unexpected input type!"
129
- cond, mask = condition
130
- B = cond.shape[0]
131
- last_dim = cond.dim() - 1
132
- out = cond.transpose(dim, last_dim)
133
- out = 0. * out[..., :1]
134
- out = out.transpose(dim, last_dim)
135
- mask = torch.zeros((B, 1), device=out.device).int()
136
- assert cond.dim() == out.dim()
137
- return out, mask
138
-
139
-
140
- def nullify_wav(cond: WavCondition) -> WavCondition:
141
- """Transform a WavCondition to a nullified WavCondition.
142
- It replaces the wav by a null tensor, forces its length to 0, and replaces metadata by dummy attributes.
143
-
144
- Args:
145
- cond (WavCondition): Wav condition with wav, tensor of shape [B, T].
146
- Returns:
147
- WavCondition: Nullified wav condition.
148
- """
149
- null_wav, _ = nullify_condition((cond.wav, torch.zeros_like(cond.wav)), dim=cond.wav.dim() - 1)
150
- return WavCondition(
151
- wav=null_wav,
152
- length=torch.tensor([0] * cond.wav.shape[0], device=cond.wav.device),
153
- sample_rate=cond.sample_rate,
154
- path=[None] * cond.wav.shape[0],
155
- seek_time=[None] * cond.wav.shape[0],
156
- )
157
-
158
-
159
- def nullify_joint_embed(embed: JointEmbedCondition) -> JointEmbedCondition:
160
- """Nullify the joint embedding condition by replacing it by a null tensor, forcing its length to 0,
161
- and replacing metadata by dummy attributes.
162
-
163
- Args:
164
- cond (JointEmbedCondition): Joint embedding condition with wav and text, wav tensor of shape [B, C, T].
165
- """
166
- null_wav, _ = nullify_condition((embed.wav, torch.zeros_like(embed.wav)), dim=embed.wav.dim() - 1)
167
- return JointEmbedCondition(
168
- wav=null_wav, text=[None] * len(embed.text),
169
- length=torch.LongTensor([0]).to(embed.wav.device),
170
- sample_rate=embed.sample_rate,
171
- path=[None] * embed.wav.shape[0],
172
- seek_time=[0] * embed.wav.shape[0],
173
- )
174
-
175
-
176
- class Tokenizer:
177
- """Base tokenizer implementation
178
- (in case we want to introduce more advances tokenizers in the future).
179
- """
180
- def __call__(self, texts: tp.List[tp.Optional[str]]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
181
- raise NotImplementedError()
182
-
183
-
184
- class WhiteSpaceTokenizer(Tokenizer):
185
- """This tokenizer should be used for natural language descriptions.
186
- For example:
187
- ["he didn't, know he's going home.", 'shorter sentence'] =>
188
- [[78, 62, 31, 4, 78, 25, 19, 34],
189
- [59, 77, 0, 0, 0, 0, 0, 0]]
190
- """
191
- PUNCTUATION = "?:!.,;"
192
-
193
- def __init__(self, n_bins: int, pad_idx: int = 0, language: str = "en_core_web_sm",
194
- lemma: bool = True, stopwords: bool = True) -> None:
195
- self.n_bins = n_bins
196
- self.pad_idx = pad_idx
197
- self.lemma = lemma
198
- self.stopwords = stopwords
199
- try:
200
- self.nlp = spacy.load(language)
201
- except IOError:
202
- spacy.cli.download(language) # type: ignore
203
- self.nlp = spacy.load(language)
204
-
205
- @tp.no_type_check
206
- def __call__(self, texts: tp.List[tp.Optional[str]],
207
- return_text: bool = False) -> tp.Tuple[torch.Tensor, torch.Tensor]:
208
- """Take a list of strings and convert them to a tensor of indices.
209
-
210
- Args:
211
- texts (list[str]): List of strings.
212
- return_text (bool, optional): Whether to return text as additional tuple item. Defaults to False.
213
- Returns:
214
- tuple[torch.Tensor, torch.Tensor]:
215
- - Indices of words in the LUT.
216
- - And a mask indicating where the padding tokens are
217
- """
218
- output, lengths = [], []
219
- texts = deepcopy(texts)
220
- for i, text in enumerate(texts):
221
- # if current sample doesn't have a certain attribute, replace with pad token
222
- if text is None:
223
- output.append(torch.Tensor([self.pad_idx]))
224
- lengths.append(0)
225
- continue
226
-
227
- # convert numbers to words
228
- text = re.sub(r"(\d+)", lambda x: num2words(int(x.group(0))), text) # type: ignore
229
- # normalize text
230
- text = self.nlp(text) # type: ignore
231
- # remove stopwords
232
- if self.stopwords:
233
- text = [w for w in text if not w.is_stop] # type: ignore
234
- # remove punctuation
235
- text = [w for w in text if w.text not in self.PUNCTUATION] # type: ignore
236
- # lemmatize if needed
237
- text = [getattr(t, "lemma_" if self.lemma else "text") for t in text] # type: ignore
238
-
239
- texts[i] = " ".join(text)
240
- lengths.append(len(text))
241
- # convert to tensor
242
- tokens = torch.Tensor([hash_trick(w, self.n_bins) for w in text])
243
- output.append(tokens)
244
-
245
- mask = length_to_mask(torch.IntTensor(lengths)).int()
246
- padded_output = pad_sequence(output, padding_value=self.pad_idx).int().t()
247
- if return_text:
248
- return padded_output, mask, texts # type: ignore
249
- return padded_output, mask
250
-
251
-
252
- class NoopTokenizer(Tokenizer):
253
- """This tokenizer should be used for global conditioners such as: artist, genre, key, etc.
254
- The difference between this and WhiteSpaceTokenizer is that NoopTokenizer does not split
255
- strings, so "Jeff Buckley" will get it's own index. Whereas WhiteSpaceTokenizer will
256
- split it to ["Jeff", "Buckley"] and return an index per word.
257
-
258
- For example:
259
- ["Queen", "ABBA", "Jeff Buckley"] => [43, 55, 101]
260
- ["Metal", "Rock", "Classical"] => [0, 223, 51]
261
- """
262
- def __init__(self, n_bins: int, pad_idx: int = 0):
263
- self.n_bins = n_bins
264
- self.pad_idx = pad_idx
265
-
266
- def __call__(self, texts: tp.List[tp.Optional[str]]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
267
- output, lengths = [], []
268
- for text in texts:
269
- # if current sample doesn't have a certain attribute, replace with pad token
270
- if text is None:
271
- output.append(self.pad_idx)
272
- lengths.append(0)
273
- else:
274
- output.append(hash_trick(text, self.n_bins))
275
- lengths.append(1)
276
-
277
- tokens = torch.LongTensor(output).unsqueeze(1)
278
- mask = length_to_mask(torch.IntTensor(lengths)).int()
279
- return tokens, mask
280
-
281
-
282
- class BaseConditioner(nn.Module):
283
- """Base model for all conditioner modules.
284
- We allow the output dim to be different than the hidden dim for two reasons:
285
- 1) keep our LUTs small when the vocab is large;
286
- 2) make all condition dims consistent.
287
-
288
- Args:
289
- dim (int): Hidden dim of the model.
290
- output_dim (int): Output dim of the conditioner.
291
- """
292
- def __init__(self, dim: int, output_dim: int):
293
- super().__init__()
294
- self.dim = dim
295
- self.output_dim = output_dim
296
- self.output_proj = nn.Linear(dim, output_dim)
297
-
298
-
299
-
300
- def forward(self, inputs: tp.Any) -> ConditionType:
301
- """Gets input that should be used as conditioning (e.g, genre, description or a waveform).
302
- Outputs a ConditionType, after the input data was embedded as a dense vector.
303
-
304
- Returns:
305
- ConditionType:
306
- - A tensor of size [B, T, D] where B is the batch size, T is the length of the
307
- output embedding and D is the dimension of the embedding.
308
- - And a mask indicating where the padding tokens.
309
- """
310
- raise NotImplementedError()
311
-
312
-
313
- class TextConditioner(BaseConditioner):
314
- ...
315
-
316
-
317
-
318
-
319
-
320
- class T5Conditioner(TextConditioner):
321
- """T5-based TextConditioner.
322
-
323
- Args:
324
- name (str): Name of the T5 model.
325
- output_dim (int): Output dim of the conditioner.
326
- finetune (bool): Whether to fine-tune T5 at train time.
327
- device (str): Device for T5 Conditioner.
328
- autocast_dtype (tp.Optional[str], optional): Autocast dtype.
329
- word_dropout (float, optional): Word dropout probability.
330
- normalize_text (bool, optional): Whether to apply text normalization.
331
- """
332
- MODELS = ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b",
333
- "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large",
334
- "google/flan-t5-xl", "google/flan-t5-xxl"]
335
- MODELS_DIMS = {
336
- "t5-small": 512,
337
- "t5-base": 768,
338
- "t5-large": 1024,
339
- "t5-3b": 1024,
340
- "t5-11b": 1024,
341
- "google/flan-t5-small": 512,
342
- "google/flan-t5-base": 768,
343
- "google/flan-t5-large": 1024,
344
- "google/flan-t5-3b": 1024,
345
- "google/flan-t5-11b": 1024,
346
- }
347
-
348
- def __init__(self, name: str, output_dim: int, finetune: bool, device: str,
349
- autocast_dtype: tp.Optional[str] = 'float32', word_dropout: float = 0.,
350
- normalize_text: bool = False):
351
- assert name in self.MODELS, f"Unrecognized t5 model name (should in {self.MODELS})"
352
- super().__init__(self.MODELS_DIMS[name], output_dim)
353
- self.device = device
354
- self.name = name
355
- self.finetune = finetune
356
- self.word_dropout = word_dropout
357
- if autocast_dtype is None or self.device == 'cpu':
358
- self.autocast = TorchAutocast(enabled=False)
359
- if self.device != 'cpu':
360
- logger.warning("T5 has no autocast, this might lead to NaN")
361
- else:
362
- dtype = getattr(torch, autocast_dtype)
363
- assert isinstance(dtype, torch.dtype)
364
- logger.info(f"T5 will be evaluated with autocast as {autocast_dtype}")
365
- self.autocast = TorchAutocast(enabled=True, device_type=self.device, dtype=dtype)
366
- # Let's disable logging temporarily because T5 will vomit some errors otherwise.
367
- # thanks https://gist.github.com/simon-weber/7853144
368
- previous_level = logging.root.manager.disable
369
- logging.disable(logging.ERROR)
370
- with warnings.catch_warnings():
371
- warnings.simplefilter("ignore")
372
- try:
373
- self.t5_tokenizer = T5Tokenizer.from_pretrained(name)
374
- t5 = T5EncoderModel.from_pretrained(name).train(mode=finetune)
375
- finally:
376
- logging.disable(previous_level)
377
- if finetune:
378
- self.t5 = t5
379
- else:
380
- # this makes sure that the t5 models is not part
381
- # of the saved checkpoint
382
- self.__dict__['t5'] = t5.to(device)
383
-
384
- self.normalize_text = normalize_text
385
- if normalize_text:
386
- self.text_normalizer = WhiteSpaceTokenizer(1, lemma=True, stopwords=True)
387
-
388
- def tokenize(self, x: tp.List[tp.Optional[str]]) -> tp.Dict[str, torch.Tensor]:
389
- # if current sample doesn't have a certain attribute, replace with empty string
390
- entries: tp.List[str] = [xi if xi is not None else "" for xi in x]
391
- if self.normalize_text:
392
- _, _, entries = self.text_normalizer(entries, return_text=True)
393
- if self.word_dropout > 0. and self.training:
394
- new_entries = []
395
- for entry in entries:
396
- words = [word for word in entry.split(" ") if random.random() >= self.word_dropout]
397
- new_entries.append(" ".join(words))
398
- entries = new_entries
399
-
400
- empty_idx = torch.LongTensor([i for i, xi in enumerate(entries) if xi == ""])
401
-
402
- inputs = self.t5_tokenizer(entries, return_tensors='pt', padding=True).to(self.device)
403
- mask = inputs['attention_mask']
404
- mask[empty_idx, :] = 0 # zero-out index where the input is non-existant
405
- return inputs
406
-
407
- def forward(self, inputs: tp.Dict[str, torch.Tensor]) -> ConditionType:
408
- mask = inputs['attention_mask']
409
- with torch.set_grad_enabled(self.finetune), self.autocast:
410
- embeds = self.t5(**inputs).last_hidden_state
411
- embeds = self.output_proj(embeds.to(self.output_proj.weight))
412
- embeds = (embeds * mask.unsqueeze(-1))
413
- return embeds, mask
414
-
415
-
416
-
417
-
418
-
419
-
420
-
421
-
422
- class JointEmbeddingConditioner(BaseConditioner):
423
- """Joint embedding conditioning supporting both audio or text conditioning.
424
-
425
- Args:
426
- dim (int): Dimension.
427
- output_dim (int): Output dimension.
428
- device (str): Device.
429
- attribute (str): Attribute used by the conditioner.
430
- autocast_dtype (str): Autocast for the conditioner.
431
- quantize (bool): Whether to quantize the CLAP embedding.
432
- n_q (int): Number of residual quantizers (used if quantize is true).
433
- bins (int): Quantizers' codebooks size (used if quantize is true).
434
- kwargs: Additional parameters for residual vector quantizer.
435
- """
436
- def __init__(self, dim: int, output_dim: int, device: str, attribute: str,
437
- autocast_dtype: tp.Optional[str] = 'float32', quantize: bool = True,
438
- n_q: int = 12, bins: int = 1024, **kwargs):
439
- super().__init__(dim=dim, output_dim=output_dim)
440
- self.device = device
441
- self.attribute = attribute
442
- if autocast_dtype is None or device == 'cpu':
443
- self.autocast = TorchAutocast(enabled=False)
444
- logger.warning("JointEmbeddingConditioner has no autocast, this might lead to NaN.")
445
- else:
446
- dtype = getattr(torch, autocast_dtype)
447
- assert isinstance(dtype, torch.dtype)
448
- logger.info(f"JointEmbeddingConditioner will be evaluated with autocast as {autocast_dtype}.")
449
- self.autocast = TorchAutocast(enabled=True, device_type=self.device, dtype=dtype)
450
- # residual vector quantizer to discretize the conditioned embedding
451
- self.quantizer=None
452
- if quantize:
453
- print('\n\n\n\nWANTS TO QUANTIZE on Inference\n\n\n\n')
454
- # self.quantizer = ResidualVectorQuantizer(dim, n_q=n_q, bins=bins, **kwargs)
455
-
456
- def _get_embed(self, x: JointEmbedCondition) -> tp.Tuple[torch.Tensor, torch.Tensor]:
457
- """Get joint embedding in latent space from the inputs.
458
-
459
- Returns:
460
- tuple[torch.Tensor, torch.Tensor]: Tensor for the latent embedding
461
- and corresponding empty indexes.
462
- """
463
- raise NotImplementedError()
464
-
465
- def forward(self, x: JointEmbedCondition) -> ConditionType:
466
- with self.autocast:
467
- embed, empty_idx = self._get_embed(x)
468
- if self.quantizer is not None:
469
- embed = embed.view(-1, self.dim, 1)
470
- q_res = self.quantizer(embed, frame_rate=1)
471
- out_embed = q_res.x.view(-1, self.dim)
472
- else:
473
- out_embed = embed
474
- out_embed = self.output_proj(out_embed).view(-1, 1, self.output_dim)
475
- mask = torch.ones(*out_embed.shape[:2], device=out_embed.device)
476
- mask[empty_idx, :] = 0 # zero-out index where the input is non-existant
477
- out_embed = (out_embed * mask.unsqueeze(-1))
478
- return out_embed, mask
479
-
480
- def tokenize(self, x: JointEmbedCondition) -> JointEmbedCondition:
481
- return x
482
-
483
-
484
-
485
-
486
-
487
-
488
-
489
-
490
-
491
-
492
-
493
- class ConditioningProvider(nn.Module):
494
- """Prepare and provide conditions given all the supported conditioners.
495
-
496
- Args:
497
- conditioners (dict): Dictionary of conditioners.
498
- device (torch.device or str, optional): Device for conditioners and output condition types.
499
- """
500
- def __init__(self, conditioners: tp.Dict[str, BaseConditioner], device: tp.Union[torch.device, str] = "cpu"):
501
- super().__init__()
502
- self.device = device
503
- self.conditioners = nn.ModuleDict(conditioners)
504
-
505
- @property
506
- def joint_embed_conditions(self):
507
- return [m.attribute for m in self.conditioners.values() if isinstance(m, JointEmbeddingConditioner)]
508
-
509
- @property
510
- def has_joint_embed_conditions(self):
511
- return len(self.joint_embed_conditions) > 0
512
-
513
- @property
514
- def text_conditions(self):
515
- return [k for k, v in self.conditioners.items() if isinstance(v, TextConditioner)]
516
-
517
- @property
518
- def wav_conditions(self):
519
- return [k for k, v in self.conditioners.items() if isinstance(v, WaveformConditioner)]
520
-
521
- @property
522
- def has_wav_condition(self):
523
- return len(self.wav_conditions) > 0
524
-
525
- def forward(self, tokenized: tp.Dict[str, tp.Any]) -> tp.Dict[str, ConditionType]:
526
- """Compute pairs of `(embedding, mask)` using the configured conditioners and the tokenized representations.
527
- The output is for example:
528
- {
529
- "genre": (torch.Tensor([B, 1, D_genre]), torch.Tensor([B, 1])),
530
- "description": (torch.Tensor([B, T_desc, D_desc]), torch.Tensor([B, T_desc])),
531
- ...
532
- }
533
-
534
- Args:
535
- tokenized (dict): Dict of tokenized representations as returned by `tokenize()`.
536
- """
537
- output = {}
538
- for attribute, inputs in tokenized.items():
539
- condition, mask = self.conditioners[attribute](inputs)
540
- output[attribute] = (condition, mask)
541
- return output
542
-
543
- def _collate_text(self, samples: tp.List[ConditioningAttributes]) -> tp.Dict[str, tp.List[tp.Optional[str]]]:
544
- """Given a list of ConditioningAttributes objects, compile a dictionary where the keys
545
- are the attributes and the values are the aggregated input per attribute.
546
- For example:
547
- Input:
548
- [
549
- ConditioningAttributes(text={"genre": "Rock", "description": "A rock song with a guitar solo"}, wav=...),
550
- ConditioningAttributes(text={"genre": "Hip-hop", "description": "A hip-hop verse"}, wav=...),
551
- ]
552
- Output:
553
- {
554
- "genre": ["Rock", "Hip-hop"],
555
- "description": ["A rock song with a guitar solo", "A hip-hop verse"]
556
- }
557
-
558
- Args:
559
- samples (list of ConditioningAttributes): List of ConditioningAttributes samples.
560
- Returns:
561
- dict[str, list[str, optional]]: A dictionary mapping an attribute name to text batch.
562
- """
563
- out: tp.Dict[str, tp.List[tp.Optional[str]]] = defaultdict(list)
564
- texts = [x.text for x in samples]
565
- for text in texts:
566
- for condition in self.text_conditions:
567
- out[condition].append(text[condition])
568
- return out
569
-
570
- def _collate_wavs(self, samples: tp.List[ConditioningAttributes]) -> tp.Dict[str, WavCondition]:
571
- """Generate a dict where the keys are attributes by which we fetch similar wavs,
572
- and the values are Tensors of wavs according to said attributes.
573
-
574
- *Note*: by the time the samples reach this function, each sample should have some waveform
575
- inside the "wav" attribute. It should be either:
576
- 1. A real waveform
577
- 2. A null waveform due to the sample having no similar waveforms (nullified by the dataset)
578
- 3. A null waveform due to it being dropped in a dropout module (nullified by dropout)
579
-
580
- Args:
581
- samples (list of ConditioningAttributes): List of ConditioningAttributes samples.
582
- Returns:
583
- dict[str, WavCondition]: A dictionary mapping an attribute name to wavs.
584
- """
585
- wavs = defaultdict(list)
586
- lengths = defaultdict(list)
587
- sample_rates = defaultdict(list)
588
- paths = defaultdict(list)
589
- seek_times = defaultdict(list)
590
- out: tp.Dict[str, WavCondition] = {}
591
-
592
- for sample in samples:
593
- for attribute in self.wav_conditions:
594
- wav, length, sample_rate, path, seek_time = sample.wav[attribute]
595
- assert wav.dim() == 3, f"Got wav with dim={wav.dim()}, but expected 3 [1, C, T]"
596
- assert wav.size(0) == 1, f"Got wav [B, C, T] with shape={wav.shape}, but expected B == 1"
597
- # mono-channel conditioning
598
- wav = wav.mean(1, keepdim=True) # [1, 1, T]
599
- wavs[attribute].append(wav.flatten()) # [T]
600
- lengths[attribute].append(length)
601
- sample_rates[attribute].extend(sample_rate)
602
- paths[attribute].extend(path)
603
- seek_times[attribute].extend(seek_time)
604
-
605
- # stack all wavs to a single tensor
606
- for attribute in self.wav_conditions:
607
- stacked_wav, _ = collate(wavs[attribute], dim=0)
608
- out[attribute] = WavCondition(
609
- stacked_wav.unsqueeze(1), torch.cat(lengths[attribute]), sample_rates[attribute],
610
- paths[attribute], seek_times[attribute])
611
-
612
- return out
613
-
614
- def _collate_joint_embeds(self, samples: tp.List[ConditioningAttributes]) -> tp.Dict[str, JointEmbedCondition]:
615
- """Generate a dict where the keys are attributes by which we compute joint embeddings,
616
- and the values are Tensors of pre-computed embeddings and the corresponding text attributes.
617
-
618
- Args:
619
- samples (list[ConditioningAttributes]): List of ConditioningAttributes samples.
620
- Returns:
621
- A dictionary mapping an attribute name to joint embeddings.
622
- """
623
- texts = defaultdict(list)
624
- wavs = defaultdict(list)
625
- lengths = defaultdict(list)
626
- sample_rates = defaultdict(list)
627
- paths = defaultdict(list)
628
- seek_times = defaultdict(list)
629
- channels: int = 0
630
-
631
- out = {}
632
- for sample in samples:
633
- for attribute in self.joint_embed_conditions:
634
- wav, text, length, sample_rate, path, seek_time = sample.joint_embed[attribute]
635
- assert wav.dim() == 3
636
- if channels == 0:
637
- channels = wav.size(1)
638
- else:
639
- assert channels == wav.size(1), "not all audio has same number of channels in batch"
640
- assert wav.size(0) == 1, "Expecting single-wav batch in the collate method"
641
- wav = einops.rearrange(wav, "b c t -> (b c t)") # [1, C, T] => [C * T]
642
- wavs[attribute].append(wav)
643
- texts[attribute].extend(text)
644
- lengths[attribute].append(length)
645
- sample_rates[attribute].extend(sample_rate)
646
- paths[attribute].extend(path)
647
- seek_times[attribute].extend(seek_time)
648
-
649
- for attribute in self.joint_embed_conditions:
650
- stacked_texts = texts[attribute]
651
- stacked_paths = paths[attribute]
652
- stacked_seek_times = seek_times[attribute]
653
- stacked_wavs = pad_sequence(wavs[attribute]).to(self.device)
654
- stacked_wavs = einops.rearrange(stacked_wavs, "(c t) b -> b c t", c=channels)
655
- stacked_sample_rates = sample_rates[attribute]
656
- stacked_lengths = torch.cat(lengths[attribute]).to(self.device)
657
- assert stacked_lengths.size(0) == stacked_wavs.size(0)
658
- assert len(stacked_sample_rates) == stacked_wavs.size(0)
659
- assert len(stacked_texts) == stacked_wavs.size(0)
660
- out[attribute] = JointEmbedCondition(
661
- text=stacked_texts, wav=stacked_wavs,
662
- length=stacked_lengths, sample_rate=stacked_sample_rates,
663
- path=stacked_paths, seek_time=stacked_seek_times)
664
-
665
- return out
666
-
667
-
668
- class ConditionFuser(StreamingModule):
669
- """Condition fuser handles the logic to combine the different conditions
670
- to the actual model input.
671
-
672
- Args:
673
- fuse2cond (tp.Dict[str, str]): A dictionary that says how to fuse
674
- each condition. For example:
675
- {
676
- "prepend": ["description"],
677
- "sum": ["genre", "bpm"],
678
- "cross": ["description"],
679
- }
680
- cross_attention_pos_emb (bool, optional): Use positional embeddings in cross attention.
681
- cross_attention_pos_emb_scale (int): Scale for positional embeddings in cross attention if used.
682
- """
683
- FUSING_METHODS = ["sum", "prepend", "cross", "input_interpolate"]
684
-
685
- def __init__(self, fuse2cond: tp.Dict[str, tp.List[str]], cross_attention_pos_emb: bool = False,
686
- cross_attention_pos_emb_scale: float = 1.0):
687
- super().__init__()
688
- assert all(
689
- [k in self.FUSING_METHODS for k in fuse2cond.keys()]
690
- ), f"Got invalid fuse method, allowed methods: {self.FUSING_METHODS}"
691
- self.cross_attention_pos_emb = cross_attention_pos_emb
692
- self.cross_attention_pos_emb_scale = cross_attention_pos_emb_scale
693
- self.fuse2cond: tp.Dict[str, tp.List[str]] = fuse2cond
694
- self.cond2fuse: tp.Dict[str, str] = {}
695
- for fuse_method, conditions in fuse2cond.items():
696
- for condition in conditions:
697
- self.cond2fuse[condition] = fuse_method
698
-
699
- def forward(
700
- self,
701
- input: torch.Tensor,
702
- conditions: tp.Dict[str, ConditionType]
703
- ) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
704
- """Fuse the conditions to the provided model input.
705
-
706
- Args:
707
- input (torch.Tensor): Transformer input.
708
- conditions (dict[str, ConditionType]): Dict of conditions.
709
- Returns:
710
- tuple[torch.Tensor, torch.Tensor]: The first tensor is the transformer input
711
- after the conditions have been fused. The second output tensor is the tensor
712
- used for cross-attention or None if no cross attention inputs exist.
713
- """
714
- B, T, _ = input.shape
715
-
716
- if 'offsets' in self._streaming_state:
717
- first_step = False
718
- offsets = self._streaming_state['offsets']
719
- else:
720
- first_step = True
721
- offsets = torch.zeros(input.shape[0], dtype=torch.long, device=input.device)
722
-
723
- assert set(conditions.keys()).issubset(set(self.cond2fuse.keys())), \
724
- f"given conditions contain unknown attributes for fuser, " \
725
- f"expected {self.cond2fuse.keys()}, got {conditions.keys()}"
726
- cross_attention_output = None
727
- for cond_type, (cond, cond_mask) in conditions.items():
728
- op = self.cond2fuse[cond_type]
729
- if op == 'sum':
730
- input += cond
731
- elif op == 'input_interpolate':
732
- cond = einops.rearrange(cond, "b t d -> b d t")
733
- cond = F.interpolate(cond, size=input.shape[1])
734
- input += einops.rearrange(cond, "b d t -> b t d")
735
- elif op == 'prepend':
736
- if first_step:
737
- input = torch.cat([cond, input], dim=1)
738
- elif op == 'cross':
739
- if cross_attention_output is not None:
740
- cross_attention_output = torch.cat([cross_attention_output, cond], dim=1)
741
- else:
742
- cross_attention_output = cond
743
- else:
744
- raise ValueError(f"unknown op ({op})")
745
-
746
- if self.cross_attention_pos_emb and cross_attention_output is not None:
747
- print('SIN EMBED')
748
- positions = torch.arange(
749
- cross_attention_output.shape[1],
750
- device=cross_attention_output.device
751
- ).view(1, -1, 1)
752
- pos_emb = create_sin_embedding(positions, cross_attention_output.shape[-1])
753
- cross_attention_output = cross_attention_output + self.cross_attention_pos_emb_scale * pos_emb
754
-
755
- if self._is_streaming:
756
- self._streaming_state['offsets'] = offsets + T
757
-
758
- return input, cross_attention_output
759
-
760
-
761
-
762
- # ============================================== From LM.py
763
-
764
-
765
-
766
- logger = logging.getLogger(__name__)
767
  ConditionTensors = tp.Dict[str, ConditionType]
768
  CFGConditions = tp.Union[ConditionTensors, tp.Tuple[ConditionTensors, ConditionTensors]]
769
 
@@ -876,8 +134,11 @@ class LMModel(StreamingModule):
876
  two_step_cfg (bool): Whether to run classifier free-guidance with 2 distinct steps.
877
  **kwargs: Additional parameters for the transformer encoder.
878
  """
879
- def __init__(self, pattern_provider: CodebooksPatternProvider, condition_provider: ConditioningProvider,
880
- fuser: ConditionFuser, n_q: int = 8, card: int = 1024, dim: int = 128, num_heads: int = 8,
 
 
 
881
  hidden_scale: int = 4, norm: str = 'layer_norm', norm_first: bool = False,
882
  emb_lr: tp.Optional[float] = None, bias_proj: bool = True,
883
  weight_init: tp.Optional[str] = None, depthwise_init: tp.Optional[str] = None,
@@ -952,27 +213,11 @@ class LMModel(StreamingModule):
952
  def num_codebooks(self) -> int:
953
  return self.n_q
954
 
955
- def forward(self, sequence: torch.Tensor,
956
- conditions: tp.List[ConditioningAttributes],
957
- condition_tensors: tp.Optional[ConditionTensors] = None,
958
- stage: int = -1) -> torch.Tensor:
959
- """Apply language model on sequence and conditions.
960
- Given a tensor of sequence of shape [B, K, S] with K the number of codebooks and
961
- S the sequence steps, return the logits with shape [B, card, K, S].
962
-
963
- Args:
964
- indices (torch.Tensor): Indices of the codes to model.
965
- conditions (list of ConditioningAttributes): Conditions to use when modeling
966
- the given codes. Note that when evaluating multiple time with the same conditioning
967
- you should pre-compute those and pass them as `condition_tensors`.
968
- condition_tensors (dict[str, ConditionType], optional): Pre-computed conditioning
969
- tensors, see `conditions`.
970
- stage (int): The codebook level that is being predicted. Relevant for MAGNeT
971
- in which prediction is done in a codebook-by-codebook manner.
972
- Takes values in range(n_q), and ignored by default.
973
- Returns:
974
- torch.Tensor: Logits.
975
- """
976
  B, K, S = sequence.shape
977
  assert K == self.num_codebooks, "Sequence shape must match the specified number of codebooks"
978
  input_ = sum([self.emb[k](sequence[:, k]) for k in range(K)])
@@ -983,8 +228,8 @@ class LMModel(StreamingModule):
983
  condition_tensors = self.condition_provider(tokenized)
984
  else:
985
  assert not conditions, "Shouldn't pass both conditions and condition_tensors."
986
-
987
- input_, cross_attention_input = self.fuser(input_, condition_tensors)
988
 
989
  out = self.transformer(input_, cross_attention_src=cross_attention_input,
990
  src_mask=(self.attn_mask_per_stage[stage] if stage >= 0 else None))
@@ -999,60 +244,6 @@ class LMModel(StreamingModule):
999
 
1000
  return logits # [B, K, S, card]
1001
 
1002
- def compute_predictions(
1003
- self, codes: torch.Tensor,
1004
- conditions: tp.List[ConditioningAttributes],
1005
- condition_tensors: tp.Optional[ConditionTensors] = None,
1006
- stage: int = -1,
1007
- keep_only_valid_steps: bool = True) -> LMOutput:
1008
- """Given an input tensor of codes [B, K, T] and list of conditions, runs the model
1009
- forward using the specified codes interleaving pattern.
1010
-
1011
- Args:
1012
- codes (torch.Tensor): Input codes of shape [B, K, T] with B the batch size,
1013
- K the number of codebooks and T the number of timesteps.
1014
- conditions (list of ConditioningAttributes): conditionings to use when modeling
1015
- the given codes. Note that when evaluating multiple time with the same conditioning
1016
- you should pre-compute those and pass them as `condition_tensors`.
1017
- condition_tensors (dict[str, ConditionType], optional): pre-computed conditioning
1018
- tensors, see `conditions`.
1019
- stage (int): The codebook level that is being predicted. Relevant for MAGNeT
1020
- in which prediction is done in a codebook-by-codebook manner.
1021
- Takes values in range(n_q), and ignored by default.
1022
- keep_only_valid_steps (bool): Build a sequence from the pattern up to valid (= fully defined) steps.
1023
- Steps that are beyond valid steps will be replaced by the special_token in that case.
1024
- Returns:
1025
- LMOutput: Language model outputs
1026
- logits (torch.Tensor) of shape [B, K, T, card] corresponding to the provided codes,
1027
- i.e. the first item corresponds to logits to predict the first code, meaning that
1028
- no additional shifting of codes and logits is required.
1029
- mask (torch.Tensor) of shape [B, K, T], mask over valid and invalid positions.
1030
- Given the specified interleaving strategies, parts of the logits and codes should
1031
- not be considered as valid predictions because of invalid context.
1032
- """
1033
- B, K, T = codes.shape
1034
- codes = codes.contiguous()
1035
- # map codes [B, K, T] into pattern sequence [B, K, S] using special_token_id for masked tokens
1036
- # what is the T is it 2048 ?
1037
- # and then what is pattern -> another function?
1038
- pattern = self.pattern_provider.get_pattern(T)
1039
- sequence_codes, sequence_indexes, sequence_mask = pattern.build_pattern_sequence(
1040
- codes, self.special_token_id, keep_only_valid_steps=keep_only_valid_steps,
1041
- )
1042
-
1043
- # apply model on pattern sequence
1044
- model = self if self._fsdp is None else self._fsdp
1045
- logits = model(sequence_codes, conditions, condition_tensors, stage=stage) # [B, K, S, card]
1046
- # map back the logits on pattern sequence to logits on original codes: [B, K, S, card] -> [B, K, T, card]
1047
- # and provide the corresponding mask over invalid positions of tokens
1048
- logits = logits.permute(0, 3, 1, 2) # [B, card, K, S]
1049
- # note: we use nans as special token to make it obvious if we feed unexpected logits
1050
- logits, logits_indexes, logits_mask = pattern.revert_pattern_logits(
1051
- logits, float('nan'), keep_only_valid_steps=keep_only_valid_steps
1052
- )
1053
- logits = logits.permute(0, 2, 3, 1) # [B, K, T, card]
1054
- logits_mask = logits_mask[None, :, :].expand(B, -1, -1) # [K, T] -> [B, K, T]
1055
- return LMOutput(logits, logits_mask)
1056
 
1057
  def _sample_next_token(self,
1058
  sequence,
@@ -1127,11 +318,12 @@ class LMModel(StreamingModule):
1127
 
1128
  return next_token
1129
 
 
1130
  @torch.no_grad()
1131
  def generate(self,
1132
- prompt: tp.Optional[torch.Tensor] = None,
1133
- conditions: tp.List[ConditioningAttributes] = [],
1134
- num_samples: tp.Optional[int] = None,
1135
  max_gen_len: int = 256,
1136
  use_sampling: bool = True,
1137
  temp: float = 1.0,
@@ -1143,25 +335,12 @@ class LMModel(StreamingModule):
1143
  check: bool = False,
1144
  callback: tp.Optional[tp.Callable[[int, int], None]] = None,
1145
  **kwargs) -> torch.Tensor:
1146
- """Generate tokens sampling from the model given a prompt or unconditionally. Generation can
1147
- be performed in a greedy fashion or using sampling with top K and top P strategies.
1148
 
1149
  Args:
1150
- prompt (torch.Tensor, optional): Prompt tokens of shape [B, K, T].
1151
- conditions_tensors (list of ConditioningAttributes, optional): List of conditions.
1152
- num_samples (int, optional): Number of samples to generate when no prompt and no conditions are given.
1153
- max_gen_len (int): Maximum generation length.
1154
- use_sampling (bool): Whether to use a sampling strategy or not.
1155
- temp (float): Sampling temperature.
1156
- top_k (int): K for "top-k" sampling.
1157
- top_p (float): P for "top-p" sampling.
1158
- cfg_coeff (float, optional): Classifier-free guidance coefficient.
1159
- two_step_cfg (bool, optional): Whether to perform classifier-free guidance with two steps generation.
1160
- remove_prompts (bool): Whether to remove prompts from generation or not.
1161
- check (bool): Whether to apply further checks on generated sequence.
1162
- callback (Callback, optional): Callback function to report generation progress.
1163
  Returns:
1164
- torch.Tensor: Generated tokens.
1165
  """
1166
  assert not self.training, "generation shouldn't be used in training mode."
1167
  first_param = next(iter(self.parameters()))
@@ -1190,20 +369,13 @@ class LMModel(StreamingModule):
1190
  # the padding structure is exactly the same between train and test.
1191
  # With a batch size of 1, this can be slower though.
1192
  cfg_conditions: CFGConditions
1193
- two_step_cfg = self.two_step_cfg if two_step_cfg is None else two_step_cfg
1194
- if conditions:
1195
- null_conditions = conditions
1196
- if two_step_cfg:
1197
- cfg_conditions = (
1198
- self.condition_provider(self.condition_provider.tokenize(conditions)),
1199
- self.condition_provider(self.condition_provider.tokenize(null_conditions)),
1200
- )
1201
- else:
1202
- conditions = conditions + null_conditions
1203
- tokenized = self.condition_provider.tokenize(conditions)
1204
- cfg_conditions = self.condition_provider(tokenized)
1205
- else:
1206
- cfg_conditions = {}
1207
 
1208
  if prompt is None:
1209
  assert num_samples > 0
@@ -1222,18 +394,26 @@ class LMModel(StreamingModule):
1222
 
1223
  gen_codes[..., :start_offset] = prompt
1224
  # create the gen_sequence with proper interleaving from the pattern: [B, K, S]
1225
- gen_sequence, indexes, mask = pattern.build_pattern_sequence(gen_codes, self.special_token_id)
1226
- # retrieve the start_offset in the sequence:
1227
- # it is the first sequence step that contains the `start_offset` timestep
1228
  start_offset_sequence = pattern.get_first_step_with_timesteps(start_offset)
 
1229
  assert start_offset_sequence is not None
1230
 
1231
  with self.streaming():
1232
  unconditional_state = self.get_streaming_state()
1233
  prev_offset = 0
1234
  gen_sequence_len = gen_sequence.shape[-1] # gen_sequence shape is [B, K, S]
1235
  for offset in range(start_offset_sequence, gen_sequence_len):
1236
  # get current sequence (note that the streaming API is providing the caching over previous offsets)
 
1237
  curr_sequence = gen_sequence[..., prev_offset:offset]
1238
  curr_mask = mask[None, ..., prev_offset:offset].expand(B, -1, -1)
1239
  if check:
@@ -1268,11 +448,13 @@ class LMModel(StreamingModule):
1268
  callback(1 + offset - start_offset_sequence, gen_sequence_len - start_offset_sequence)
1269
  unconditional_state.clear()
1270
 
1271
- out_codes, out_indexes, out_mask = pattern.revert_pattern_sequence(gen_sequence, special_token=unknown_token)
1272
 
1273
  out_start_offset = start_offset if remove_prompts else 0
1274
  out_codes = out_codes[..., out_start_offset:max_gen_len]
1275
 
1276
  # ensure the returned codes are all valid
 
1277
  # assert (out_codes >= 0).all() and (out_codes <= self.card).all()
 
1278
  return out_codes
 
1
  from dataclasses import dataclass, field
2
  from itertools import chain
3
  import logging
4
  import math
 
 
5
  import re
6
  import typing as tp
7
  import torch
8
  import torch.nn.functional as F
 
9
  from audiocraft.streaming import StreamingModule
10
  from audiocraft.transformer import StreamingTransformer, create_norm_fn
11
  from dataclasses import dataclass
12
  from functools import partial
13
  from torch import nn
 
14
  from audiocraft.utils import utils
 
15
  from audiocraft.activations import get_activation_fn
16
 
17
 
18
+ # ============================================== From LM.py
19
 
20
 
21
  logger = logging.getLogger(__name__)
22
  TextCondition = tp.Optional[str] # a text condition can be a string or None (if doesn't exist)
23
  ConditionType = tp.Tuple[torch.Tensor, torch.Tensor] # condition, mask
24
 
 
 
25
  ConditionTensors = tp.Dict[str, ConditionType]
26
  CFGConditions = tp.Union[ConditionTensors, tp.Tuple[ConditionTensors, ConditionTensors]]
27
 
 
134
  two_step_cfg (bool): Whether to run classifier free-guidance with 2 distinct steps.
135
  **kwargs: Additional parameters for the transformer encoder.
136
  """
137
+ def __init__(self,
138
+ pattern_provider,
139
+ condition_provider,
140
+ fuser,
141
+ n_q: int = 8, card: int = 1024, dim: int = 128, num_heads: int = 8,
142
  hidden_scale: int = 4, norm: str = 'layer_norm', norm_first: bool = False,
143
  emb_lr: tp.Optional[float] = None, bias_proj: bool = True,
144
  weight_init: tp.Optional[str] = None, depthwise_init: tp.Optional[str] = None,
 
213
  def num_codebooks(self) -> int:
214
  return self.n_q
215
 
216
+ def forward(self,
217
+ sequence,
218
+ conditions,
219
+ condition_tensors=None,
220
+ stage = -1):
221
  B, K, S = sequence.shape
222
  assert K == self.num_codebooks, "Sequence shape must match the specified number of codebooks"
223
  input_ = sum([self.emb[k](sequence[:, k]) for k in range(K)])
 
228
  condition_tensors = self.condition_provider(tokenized)
229
  else:
230
  assert not conditions, "Shouldn't pass both conditions and condition_tensors."
231
+
232
+ input_, cross_attention_input = self.fuser(input_, condition_tensors)  # ConditionFuser (defined in conditioners.py)
233
 
234
  out = self.transformer(input_, cross_attention_src=cross_attention_input,
235
  src_mask=(self.attn_mask_per_stage[stage] if stage >= 0 else None))
 
244
 
245
  return logits # [B, K, S, card]
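The forward() hunk above sums the K per-codebook embeddings into a single transformer input and reads out one set of logits per codebook. A minimal standalone sketch of that idea follows; the module names, batch size and sequence length here are illustrative assumptions, not the exact audiocraft code.

```python
import torch
import torch.nn as nn

K, card, dim = 4, 1024, 128                      # codebooks, vocabulary size, model width (assumed)
emb = nn.ModuleList([nn.Embedding(card + 1, dim) for _ in range(K)])  # +1 slot for the special token
linears = nn.ModuleList([nn.Linear(dim, card) for _ in range(K)])

sequence = torch.randint(0, card, (2, K, 35))    # [B, K, S] interleaved token codes
input_ = sum(emb[k](sequence[:, k]) for k in range(K))                # [B, S, dim]
out = input_                                     # stand-in for self.transformer(...) + output norm
logits = torch.stack([linears[k](out) for k in range(K)], dim=1)      # [B, K, S, card]
print(logits.shape)                              # torch.Size([2, 4, 35, 1024])
```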
246
 
247
 
248
  def _sample_next_token(self,
249
  sequence,
 
318
 
319
  return next_token
320
 
321
+ # generation: interleave codes with the codebook pattern, then revert_pattern_sequence() at the end
322
  @torch.no_grad()
323
  def generate(self,
324
+ prompt = None,
325
+ conditions = [],
326
+ num_samples = None,
327
  max_gen_len: int = 256,
328
  use_sampling: bool = True,
329
  temp: float = 1.0,
 
335
  check: bool = False,
336
  callback: tp.Optional[tp.Callable[[int, int], None]] = None,
337
  **kwargs) -> torch.Tensor:
338
+ """Default generation samples each next token from the top-250 logits.
 
339
 
340
  Args:
341
+
342
  Returns:
343
+ torch.Tensor: Generated tokens.
344
  """
345
  assert not self.training, "generation shouldn't be used in training mode."
346
  first_param = next(iter(self.parameters()))
 
369
  # the padding structure is exactly the same between train and test.
370
  # With a batch size of 1, this can be slower though.
371
  cfg_conditions: CFGConditions
372
+ # two_step_cfg = self.two_step_cfg if two_step_cfg is None else two_step_cfg
373
+
374
+ null_conditions = conditions
375
+ conditions = conditions + null_conditions
376
+ tokenized = self.condition_provider.tokenize(conditions)
377
+ cfg_conditions = self.condition_provider(tokenized)
378
+
379
 
380
  if prompt is None:
381
  assert num_samples > 0
 
394
 
395
  gen_codes[..., :start_offset] = prompt
396
  # create the gen_sequence with proper interleaving from the pattern: [B, K, S]
397
+ gen_sequence, _, mask = pattern.build_pattern_sequence(gen_codes, self.special_token_id)
398
+
 
399
  start_offset_sequence = pattern.get_first_step_with_timesteps(start_offset)
400
+ # print('\n=', start_offset_sequence, '\n=') # 1
401
  assert start_offset_sequence is not None
402
 
403
  with self.streaming():
404
  unconditional_state = self.get_streaming_state()
405
  prev_offset = 0
406
  gen_sequence_len = gen_sequence.shape[-1] # gen_sequence shape is [B, K, S]
407
+
408
+ # --
409
+ # print(mask.shape, mask.sum(), 'MSK LM')
410
+ # e.g. torch.Size([4, 39]), sum 140: mask is all ones, i.e. no special tokens in the pattern
411
+ # --
412
+
413
+
414
  for offset in range(start_offset_sequence, gen_sequence_len):
415
  # get current sequence (note that the streaming API is providing the caching over previous offsets)
416
+
417
  curr_sequence = gen_sequence[..., prev_offset:offset]
418
  curr_mask = mask[None, ..., prev_offset:offset].expand(B, -1, -1)
419
  if check:
 
448
  callback(1 + offset - start_offset_sequence, gen_sequence_len - start_offset_sequence)
449
  unconditional_state.clear()
450
 
451
+ out_codes, _, _ = pattern.revert_pattern_sequence(gen_sequence, special_token=unknown_token)
452
 
453
  out_start_offset = start_offset if remove_prompts else 0
454
  out_codes = out_codes[..., out_start_offset:max_gen_len]
455
 
456
  # ensure the returned codes are all valid
457
+
458
  # assert (out_codes >= 0).all() and (out_codes <= self.card).all()
459
+
460
  return out_codes
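For orientation, the simplified generate() above (i) duplicates the conditions so the conditional and null batches run together for classifier-free guidance, (ii) interleaves the codes with the delayed codebook pattern, filling not-yet-valid positions with a special token, and (iii) reverts the pattern once the loop finishes. A hedged round-trip sketch of steps (ii)/(iii), reusing the pattern API that appears in this diff; the import path follows this repo's layout, and the special-token id and shapes are assumptions:

```python
import torch
from audiocraft.codebooks_patterns import DelayedPatternProvider

B, K, T = 1, 4, 10
special_token = 2048                              # assumed id outside the codebook range
codes = torch.randint(0, 2048, (B, K, T))         # [B, K, T] time-aligned codes

pattern = DelayedPatternProvider(n_q=K).get_pattern(T)

# Interleave: codebook k is delayed by k steps; holes are filled with the special token.
gen_sequence, _, mask = pattern.build_pattern_sequence(codes, special_token)   # [B, K, S]

# ... LMModel.generate fills gen_sequence autoregressively between these two calls ...

# Undo the interleaving to recover time-aligned codes (invalid slots keep special_token).
out_codes, _, _ = pattern.revert_pattern_sequence(gen_sequence, special_token=special_token)
print(gen_sequence.shape, out_codes.shape)        # e.g. [1, 4, S] and [1, 4, 10]
```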
audiocraft/loaders.py CHANGED
@@ -101,7 +101,8 @@ def _delete_param(cfg: DictConfig, full_name: str):
101
  OmegaConf.set_struct(cfg, True)
102
 
103
 
104
- def load_lm_model(file_or_url_or_id: tp.Union[Path, str], device='cpu', cache_dir: tp.Optional[str] = None):
 
105
  pkg = load_lm_model_ckpt(file_or_url_or_id, cache_dir=cache_dir)
106
  cfg = OmegaConf.create(pkg['xp.cfg'])
107
  cfg.device = str(device)
 
101
  OmegaConf.set_struct(cfg, True)
102
 
103
 
104
+ def load_lm_model(file_or_url_or_id: tp.Union[Path, str], device='cpu',
105
+ cache_dir: tp.Optional[str] = None):
106
  pkg = load_lm_model_ckpt(file_or_url_or_id, cache_dir=cache_dir)
107
  cfg = OmegaConf.create(pkg['xp.cfg'])
108
  cfg.device = str(device)
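A hedged usage sketch for the reformatted loader above; that `load_lm_model` is importable at module level and that the demo's Hugging Face id resolves here are assumptions, not something this diff guarantees.

```python
# Minimal sketch, assuming audiocraft/loaders.py exposes load_lm_model as shown above.
from audiocraft.loaders import load_lm_model

lm = load_lm_model('facebook/audiogen-medium', device='cpu')
print(type(lm).__name__)   # expected: LMModel
```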
demo.py CHANGED
@@ -1,15 +1,14 @@
1
  from audiocraft.audiogen import AudioGen #, audio_write
2
- import audiofile
3
- import numpy as np
4
 
5
  print('\n\n\n\n___________________')
6
 
7
- txt = 'car'
8
 
9
  sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
10
- sound_generator.set_generation_params(duration=1) # why is generating so long at 14 seconds
11
 
12
  x = sound_generator.generate([txt])[0].detach().cpu().numpy()[0, :]
13
  x /= np.abs(x).max() + 1e-7
14
 
15
- audiofile.write('_audio1_.wav', x, 16000)
 
1
  from audiocraft.audiogen import AudioGen #, audio_write
2
+
 
3
 
4
  print('\n\n\n\n___________________')
5
 
6
+ txt = 'austrian music'
7
 
8
  sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
9
+ sound_generator.set_generation_params(duration=4.7) # why is generating so long at 14 seconds
10
 
11
  x = sound_generator.generate([txt])[0].detach().cpu().numpy()[0, :]
12
  x /= np.abs(x).max() + 1e-7
13
 
14
+ audiofile.write('del_seane.wav', x, 16000)
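Note that the updated demo.py still calls np.abs and audiofile.write in its unchanged lines even though those imports were dropped above. A self-contained variant would keep them; the numpy/audiofile dependency is an assumption carried over from the previous version of the script.

```python
import audiofile                 # re-added: still needed by the write() call below
import numpy as np               # re-added: still needed for peak normalisation
from audiocraft.audiogen import AudioGen

txt = 'austrian music'

sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
sound_generator.set_generation_params(duration=4.7)

x = sound_generator.generate([txt])[0].detach().cpu().numpy()[0, :]
x /= np.abs(x).max() + 1e-7      # peak-normalise to avoid clipping

audiofile.write('del_seane.wav', x, 16000)   # 16 kHz output, as in the original demo
```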