Dionyssos committed
Commit 531e776 · 1 Parent(s): 06aa0fc

del xtr funs

audiocraft/builders.py CHANGED
@@ -28,10 +28,8 @@ from .codebooks_patterns import (
 )
 from .conditioners import (
     BaseConditioner,
-    CLAPEmbeddingConditioner,
     ConditionFuser,
     ConditioningProvider,
-    LUTConditioner,
     T5Conditioner,
 )
 from .unet import DiffusionUnet
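
For context, T5Conditioner is now the only text conditioner left in the builder imports. Below is a minimal, hypothetical sketch (not part of this commit) of wiring a text-only provider under these imports; the constructor arguments (name, output_dim, finetune, device), the "description" attribute key, and the import path are assumptions taken from upstream audiocraft and from the file layout shown here.

    # Hypothetical sketch, not part of this commit.
    from audiocraft.conditioners import ConditioningProvider, T5Conditioner

    def build_text_provider(output_dim: int, device: str = "cpu") -> ConditioningProvider:
        # One conditioner per attribute; only the text path remains after this change.
        conditioners = {
            "description": T5Conditioner(name="t5-base", output_dim=output_dim,
                                         finetune=False, device=device),
        }
        return ConditioningProvider(conditioners, device=device)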
audiocraft/conditioners.py CHANGED
@@ -19,7 +19,7 @@ import soundfile
 import einops
 from num2words import num2words
 import spacy
-from transformers import RobertaTokenizer, T5EncoderModel, T5Tokenizer  # type: ignore
+from transformers import T5EncoderModel, T5Tokenizer  # type: ignore
 import torch
 from torch import nn
 import torch.nn.functional as F
@@ -317,39 +317,7 @@ class TextConditioner(BaseConditioner):
     ...
 
 
-class LUTConditioner(TextConditioner):
-    """Lookup table TextConditioner.
-
-    Args:
-        n_bins (int): Number of bins.
-        dim (int): Hidden dim of the model (text-encoder/LUT).
-        output_dim (int): Output dim of the conditioner.
-        tokenizer (str): Name of the tokenizer.
-        pad_idx (int, optional): Index for padding token. Defaults to 0.
-    """
-    def __init__(self, n_bins: int, dim: int, output_dim: int, tokenizer: str, pad_idx: int = 0):
-        super().__init__(dim, output_dim)
-        self.embed = nn.Embedding(n_bins, dim)
-        self.tokenizer: Tokenizer
-        if tokenizer == 'whitespace':
-            self.tokenizer = WhiteSpaceTokenizer(n_bins, pad_idx=pad_idx)
-        elif tokenizer == 'noop':
-            self.tokenizer = NoopTokenizer(n_bins, pad_idx=pad_idx)
-        else:
-            raise ValueError(f"unrecognized tokenizer `{tokenizer}`.")
-
-    def tokenize(self, x: tp.List[tp.Optional[str]]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
-        device = self.embed.weight.device
-        tokens, mask = self.tokenizer(x)
-        tokens, mask = tokens.to(device), mask.to(device)
-        return tokens, mask
-
-    def forward(self, inputs: tp.Tuple[torch.Tensor, torch.Tensor]) -> ConditionType:
-        tokens, mask = inputs
-        embeds = self.embed(tokens)
-        embeds = self.output_proj(embeds)
-        embeds = (embeds * mask.unsqueeze(-1))
-        return embeds, mask
+
 
 
 class T5Conditioner(TextConditioner):
@@ -448,357 +416,7 @@ class T5Conditioner(TextConditioner):
448
  return embeds, mask
449
 
450
 
451
- class WaveformConditioner(BaseConditioner):
452
- """Base class for all conditioners that take a waveform as input.
453
- Classes that inherit must implement `_get_wav_embedding` that outputs
454
- a continuous tensor, and `_downsampling_factor` that returns the down-sampling
455
- factor of the embedding model.
456
-
457
- Args:
458
- dim (int): The internal representation dimension.
459
- output_dim (int): Output dimension.
460
- device (tp.Union[torch.device, str]): Device.
461
- """
462
- def __init__(self, dim: int, output_dim: int, device: tp.Union[torch.device, str]):
463
- super().__init__(dim, output_dim)
464
- self.device = device
465
- # if False no masking is done, used in ChromaStemConditioner when completing by periodicity a sample.
466
- self._use_masking = True
467
-
468
- def tokenize(self, x: WavCondition) -> WavCondition:
469
- wav, length, sample_rate, path, seek_time = x
470
- assert length is not None
471
- return WavCondition(wav.to(self.device), length.to(self.device), sample_rate, path, seek_time)
472
-
473
- def _get_wav_embedding(self, x: WavCondition) -> torch.Tensor:
474
- """Gets as input a WavCondition and returns a dense embedding."""
475
- raise NotImplementedError()
476
-
477
- def _downsampling_factor(self):
478
- """Returns the downsampling factor of the embedding model."""
479
- raise NotImplementedError()
480
-
481
- def forward(self, x: WavCondition) -> ConditionType:
482
- """Extract condition embedding and mask from a waveform and its metadata.
483
- Args:
484
- x (WavCondition): Waveform condition containing raw waveform and metadata.
485
- Returns:
486
- ConditionType: a dense vector representing the conditioning along with its mask
487
- """
488
- wav, lengths, *_ = x
489
- with torch.no_grad():
490
- embeds = self._get_wav_embedding(x)
491
- embeds = embeds.to(self.output_proj.weight)
492
- embeds = self.output_proj(embeds)
493
-
494
- if lengths is not None and self._use_masking:
495
- lengths = lengths / self._downsampling_factor()
496
- mask = length_to_mask(lengths, max_len=embeds.shape[1]).int() # type: ignore
497
- else:
498
- mask = torch.ones_like(embeds[..., 0])
499
- embeds = (embeds * mask.unsqueeze(-1))
500
- return embeds, mask
501
-
502
-
503
-
504
-
505
 
506
- class JointEmbeddingConditioner(BaseConditioner):
507
- """Joint embedding conditioning supporting both audio or text conditioning.
508
-
509
- Args:
510
- dim (int): Dimension.
511
- output_dim (int): Output dimension.
512
- device (str): Device.
513
- attribute (str): Attribute used by the conditioner.
514
- autocast_dtype (str): Autocast for the conditioner.
515
- quantize (bool): Whether to quantize the CLAP embedding.
516
- n_q (int): Number of residual quantizers (used if quantize is true).
517
- bins (int): Quantizers' codebooks size (used if quantize is true).
518
- kwargs: Additional parameters for residual vector quantizer.
519
- """
520
- def __init__(self, dim: int, output_dim: int, device: str, attribute: str,
521
- autocast_dtype: tp.Optional[str] = 'float32', quantize: bool = True,
522
- n_q: int = 12, bins: int = 1024, **kwargs):
523
- super().__init__(dim=dim, output_dim=output_dim)
524
- self.device = device
525
- self.attribute = attribute
526
- if autocast_dtype is None or device == 'cpu':
527
- self.autocast = TorchAutocast(enabled=False)
528
- logger.warning("JointEmbeddingConditioner has no autocast, this might lead to NaN.")
529
- else:
530
- dtype = getattr(torch, autocast_dtype)
531
- assert isinstance(dtype, torch.dtype)
532
- logger.info(f"JointEmbeddingConditioner will be evaluated with autocast as {autocast_dtype}.")
533
- self.autocast = TorchAutocast(enabled=True, device_type=self.device, dtype=dtype)
534
- # residual vector quantizer to discretize the conditioned embedding
535
- self.quantizer: tp.Optional[ResidualVectorQuantizer] = None
536
- if quantize:
537
- self.quantizer = ResidualVectorQuantizer(dim, n_q=n_q, bins=bins, **kwargs)
538
-
539
- def _get_embed(self, x: JointEmbedCondition) -> tp.Tuple[torch.Tensor, torch.Tensor]:
540
- """Get joint embedding in latent space from the inputs.
541
-
542
- Returns:
543
- tuple[torch.Tensor, torch.Tensor]: Tensor for the latent embedding
544
- and corresponding empty indexes.
545
- """
546
- raise NotImplementedError()
547
-
548
- def forward(self, x: JointEmbedCondition) -> ConditionType:
549
- with self.autocast:
550
- embed, empty_idx = self._get_embed(x)
551
- if self.quantizer is not None:
552
- embed = embed.view(-1, self.dim, 1)
553
- q_res = self.quantizer(embed, frame_rate=1)
554
- out_embed = q_res.x.view(-1, self.dim)
555
- else:
556
- out_embed = embed
557
- out_embed = self.output_proj(out_embed).view(-1, 1, self.output_dim)
558
- mask = torch.ones(*out_embed.shape[:2], device=out_embed.device)
559
- mask[empty_idx, :] = 0 # zero-out index where the input is non-existant
560
- out_embed = (out_embed * mask.unsqueeze(-1))
561
- return out_embed, mask
562
-
563
- def tokenize(self, x: JointEmbedCondition) -> JointEmbedCondition:
564
- return x
565
-
566
-
567
- class CLAPEmbeddingConditioner(JointEmbeddingConditioner):
568
- """Joint Embedding conditioner based on pre-trained CLAP model.
569
-
570
- This CLAP-based conditioner supports a caching mechanism
571
- over the computed embeddings for faster training.
572
-
573
- Args:
574
- dim (int): Dimension.
575
- output_dim (int): Output dimension.
576
- device (str): Device.
577
- attribute (str): Attribute used by the conditioner.
578
- quantize (bool): Whether to quantize the CLAP embedding.
579
- n_q (int): Number of residual quantizers (used if quantize is true).
580
- bins (int): Quantizers' codebooks size (used if quantize is true).
581
- checkpoint (str): Path to CLAP checkpoint.
582
- model_arch (str): CLAP model architecture.
583
- enable_fusion (bool): Enable fusion for CLAP model.
584
- sample_rate (int): Sample rate used by CLAP model.
585
- max_audio_length (float): Maximum audio length for CLAP model.
586
- audio_stride (float): Stride to use for getting a CLAP embedding on the full sequence.
587
- normalize (bool): Whether to normalize the CLAP embedding.
588
- text_p (float): Probability of using text representation instead of audio at train time.
589
- batch_size (Optional[int]): Batch size for CLAP embedding computation.
590
- autocast_dtype (str): Autocast for the conditioner.
591
- cache_path (Optional[str]): Path for pre-computed embeddings caching.
592
- kwargs: Additional parameters for residual vector quantizer.
593
- """
594
- def __init__(self, dim: int, output_dim: int, device: str, attribute: str,
595
- quantize: bool, n_q: int, bins: int, checkpoint: tp.Union[str, Path], model_arch: str,
596
- enable_fusion: bool, sample_rate: int, max_audio_length: int, audio_stride: int,
597
- normalize: bool, text_p: bool, batch_size: tp.Optional[int] = None,
598
- autocast_dtype: tp.Optional[str] = 'float32', cache_path: tp.Optional[str] = None, **kwargs):
599
- try:
600
- import laion_clap # type: ignore
601
- except ImportError:
602
- raise ImportError("Please install CLAP to use the CLAPEmbeddingConditioner: 'pip install laion_clap'")
603
- warnings.warn("Sample rate for CLAP conditioner was fixed in version v1.1.0, (from 44.1 to 48 kHz). "
604
- "Please retrain all models.")
605
- # checkpoint = AudioCraftEnvironment.resolve_reference_path(checkpoint)
606
- clap_tokenize = RobertaTokenizer.from_pretrained('roberta-base')
607
- clap_model = laion_clap.CLAP_Module(enable_fusion=enable_fusion, amodel=model_arch)
608
- load_clap_state_dict(clap_model, checkpoint)
609
- clap_model.eval()
610
- clap_model.to(device)
611
- super().__init__(dim=dim, output_dim=output_dim, device=device, attribute=attribute,
612
- autocast_dtype=autocast_dtype, quantize=quantize, n_q=n_q, bins=bins,
613
- **kwargs)
614
- self.checkpoint = checkpoint
615
- self.enable_fusion = enable_fusion
616
- self.model_arch = model_arch
617
- self.clap: laion_clap.CLAP_Module
618
- self.clap_tokenize: RobertaTokenizer
619
- self.clap_sample_rate = sample_rate
620
- self.clap_max_frames = int(self.clap_sample_rate * max_audio_length)
621
- self.clap_stride = int(self.clap_sample_rate * audio_stride)
622
- self.batch_size = batch_size or 1
623
- self.normalize = normalize
624
- self.text_p = text_p
625
- self.__dict__['clap_tokenize'] = clap_tokenize
626
- self.__dict__['clap'] = clap_model
627
- self.wav_cache, self.text_cache = None, None
628
- if cache_path is not None:
629
- self.wav_cache = EmbeddingCache(Path(cache_path) / 'wav', self.device,
630
- compute_embed_fn=self._get_wav_embedding_for_cache,
631
- extract_embed_fn=self._extract_wav_embedding_chunk)
632
- self.text_cache = EmbeddingCache(Path(cache_path) / 'text', self.device,
633
- compute_embed_fn=self._get_text_embedding_for_cache)
634
-
635
- def _tokenizer(self, texts: tp.Union[str, tp.List[str]]) -> dict:
636
- # we use the default params from CLAP module here as well
637
- return self.clap_tokenize(texts, padding="max_length", truncation=True, max_length=77, return_tensors="pt")
638
-
639
- def _compute_text_embedding(self, text: tp.List[str]) -> torch.Tensor:
640
- """Compute text embedding from CLAP model on a given a batch of text.
641
-
642
- Args:
643
- text (list[str]): List of text for the batch, with B items.
644
- Returns:
645
- torch.Tensor: CLAP embedding derived from text, of shape [B, 1, D], with D the CLAP embedding dimension.
646
- """
647
- with torch.no_grad():
648
- embed = self.clap.get_text_embedding(text, tokenizer=self._tokenizer, use_tensor=True)
649
- return embed.view(embed.size(0), 1, embed.size(-1))
650
-
651
- def _get_text_embedding_for_cache(self, path: tp.Union[Path, str],
652
- x: JointEmbedCondition, idx: int) -> torch.Tensor:
653
- """Get text embedding function for the cache."""
654
- text = x.text[idx]
655
- text = text if text is not None else ""
656
- return self._compute_text_embedding([text])[0]
657
-
658
- def _preprocess_wav(self, wav: torch.Tensor, length: torch.Tensor, sample_rates: tp.List[int]) -> torch.Tensor:
659
- """Preprocess wav to expected format by CLAP model.
660
-
661
- Args:
662
- wav (torch.Tensor): Audio wav, of shape [B, C, T].
663
- length (torch.Tensor): Actual length of the audio for each item in the batch, of shape [B].
664
- sample_rates (list[int]): Sample rates for each sample in the batch
665
- Returns:
666
- torch.Tensor: Audio wav of shape [B, T].
667
- """
668
- assert wav.dim() == 3, "Expecting wav to be [B, C, T]"
669
- if sample_rates is not None:
670
- _wav = []
671
- for i, audio in enumerate(wav):
672
- sr = sample_rates[i]
673
- audio = convert_audio(audio, from_rate=sr, to_rate=self.clap_sample_rate, to_channels=1)
674
- _wav.append(audio)
675
- wav = torch.stack(_wav, dim=0)
676
- wav = wav.mean(dim=1)
677
- return wav
678
-
679
- def _compute_wav_embedding(self, wav: torch.Tensor, length: torch.Tensor,
680
- sample_rates: tp.List[int], reduce_mean: bool = False) -> torch.Tensor:
681
- """Compute audio wave embedding from CLAP model.
682
-
683
- Since CLAP operates on a fixed sequence length audio inputs and we need to process longer audio sequences,
684
- we calculate the wav embeddings on `clap_max_frames` windows with `clap_stride`-second stride and
685
- average the resulting embeddings.
686
-
687
- Args:
688
- wav (torch.Tensor): Audio wav, of shape [B, C, T].
689
- length (torch.Tensor): Actual length of the audio for each item in the batch, of shape [B].
690
- sample_rates (list[int]): Sample rates for each sample in the batch.
691
- reduce_mean (bool): Whether to get the average tensor.
692
- Returns:
693
- torch.Tensor: Audio embedding of shape [B, F, D], F being the number of chunks, D the dimension.
694
- """
695
- with torch.no_grad():
696
- wav = self._preprocess_wav(wav, length, sample_rates)
697
- B, T = wav.shape
698
- if T >= self.clap_max_frames:
699
- wav = wav.unfold(-1, self.clap_max_frames, self.clap_stride) # [B, F, T]
700
- else:
701
- wav = wav.view(-1, 1, T) # [B, F, T] with F=1
702
- wav = einops.rearrange(wav, 'b f t -> (b f) t')
703
- embed_list = []
704
- for i in range(0, wav.size(0), self.batch_size):
705
- _wav = wav[i:i+self.batch_size, ...]
706
- _embed = self.clap.get_audio_embedding_from_data(_wav, use_tensor=True)
707
- embed_list.append(_embed)
708
- embed = torch.cat(embed_list, dim=0)
709
- embed = einops.rearrange(embed, '(b f) d -> b f d', b=B)
710
- if reduce_mean:
711
- embed = embed.mean(dim=1, keepdim=True)
712
- return embed # [B, F, D] with F=1 if reduce_mean is True
713
-
714
- def _get_wav_embedding_for_cache(self, path: tp.Union[str, Path],
715
- x: JointEmbedCondition, idx: int) -> torch.Tensor:
716
- """Compute audio wave embedding for the cache.
717
- The embedding is computed on a given audio read from file.
718
-
719
- Args:
720
- path (str or Path): Path to the full audio file.
721
- Returns:
722
- torch.Tensor: Single-item tensor of shape [F, D], F being the number of chunks, D the dimension.
723
- """
724
- wav, sr = soundfile.read(path) # [C, T]
725
- wav = wav.unsqueeze(0).to(self.device) # [1, C, T]
726
- wav_len = torch.LongTensor([wav.shape[-1]]).to(self.device)
727
- embed = self._compute_wav_embedding(wav, wav_len, [sr], reduce_mean=False) # [B, F, D]
728
- return embed.squeeze(0) # [F, D]
729
-
730
- def _extract_wav_embedding_chunk(self, full_embed: torch.Tensor, x: JointEmbedCondition, idx: int) -> torch.Tensor:
731
- """Extract the chunk of embedding matching the seek_time and length from the full CLAP audio embedding.
732
-
733
- Args:
734
- full_embed (torch.Tensor): CLAP embedding computed on the full wave, of shape [F, D].
735
- x (JointEmbedCondition): Joint embedding condition for the full batch.
736
- idx (int): Index considered for the given embedding to extract.
737
- Returns:
738
- torch.Tensor: Wav embedding averaged on sliding window, of shape [1, D].
739
- """
740
- sample_rate = x.sample_rate[idx]
741
- seek_time = x.seek_time[idx]
742
- seek_time = 0. if seek_time is None else seek_time
743
- clap_stride = int(self.clap_stride / self.clap_sample_rate) * sample_rate
744
- end_seek_time = seek_time + self.clap_max_frames / self.clap_sample_rate
745
- start_offset = int(seek_time * sample_rate // clap_stride)
746
- end_offset = int(end_seek_time * sample_rate // clap_stride)
747
- wav_embed = full_embed[start_offset:end_offset, ...]
748
- wav_embed = wav_embed.mean(dim=0, keepdim=True)
749
- return wav_embed.to(self.device) # [F, D]
750
-
751
- def _get_text_embedding(self, x: JointEmbedCondition) -> torch.Tensor:
752
- """Get CLAP embedding from a batch of text descriptions."""
753
- no_nullified_cond = x.wav.shape[-1] > 1 # we don't want to read from cache when condition dropout
754
- if self.text_cache is not None and no_nullified_cond:
755
- assert all(p is not None for p in x.path), "Cache requires all JointEmbedCondition paths to be provided"
756
- paths = [Path(p) for p in x.path if p is not None]
757
- embed = self.text_cache.get_embed_from_cache(paths, x)
758
- else:
759
- text = [xi if xi is not None else "" for xi in x.text]
760
- embed = self._compute_text_embedding(text)
761
- if self.normalize:
762
- embed = torch.nn.functional.normalize(embed, p=2.0, dim=-1)
763
- return embed
764
-
765
- def _get_wav_embedding(self, x: JointEmbedCondition) -> torch.Tensor:
766
- """Get CLAP embedding from a batch of audio tensors (and corresponding sample rates)."""
767
- no_undefined_paths = all(p is not None for p in x.path)
768
- no_nullified_cond = x.wav.shape[-1] > 1 # we don't want to read from cache when condition dropout
769
- if self.wav_cache is not None and no_undefined_paths and no_nullified_cond:
770
- paths = [Path(p) for p in x.path if p is not None]
771
- embed = self.wav_cache.get_embed_from_cache(paths, x)
772
- else:
773
- embed = self._compute_wav_embedding(x.wav, x.length, x.sample_rate, reduce_mean=True)
774
- if self.normalize:
775
- embed = torch.nn.functional.normalize(embed, p=2.0, dim=-1)
776
- return embed
777
-
778
- def tokenize(self, x: JointEmbedCondition) -> JointEmbedCondition:
779
- # Trying to limit as much as possible sync points when the cache is warm.
780
- no_undefined_paths = all(p is not None for p in x.path)
781
- if self.wav_cache is not None and no_undefined_paths:
782
- assert all([p is not None for p in x.path]), "Cache requires all JointEmbedCondition paths to be provided"
783
- paths = [Path(p) for p in x.path if p is not None]
784
- self.wav_cache.populate_embed_cache(paths, x)
785
- if self.text_cache is not None and no_undefined_paths:
786
- assert all([p is not None for p in x.path]), "Cache requires all JointEmbedCondition paths to be provided"
787
- paths = [Path(p) for p in x.path if p is not None]
788
- self.text_cache.populate_embed_cache(paths, x)
789
- return x
790
-
791
- def _get_embed(self, x: JointEmbedCondition) -> tp.Tuple[torch.Tensor, torch.Tensor]:
792
- """Extract shared latent representation from either the wav or the text using CLAP."""
793
- # decide whether to use text embedding at train time or not
794
- use_text_embed = random.random() < self.text_p
795
- if self.training and not use_text_embed:
796
- embed = self._get_wav_embedding(x)
797
- empty_idx = torch.LongTensor([]) # we assume we always have the audio wav
798
- else:
799
- embed = self._get_text_embedding(x)
800
- empty_idx = torch.LongTensor([i for i, xi in enumerate(x.text) if xi is None or xi == ""])
801
- return embed, empty_idx
802
 
803
 
804
  def dropout_condition(sample: ConditioningAttributes, condition_type: str, condition: str) -> ConditioningAttributes:
@@ -938,25 +556,19 @@ class ConditioningProvider(nn.Module):
         self.device = device
         self.conditioners = nn.ModuleDict(conditioners)
 
-    @property
-    def joint_embed_conditions(self):
-        return [m.attribute for m in self.conditioners.values() if isinstance(m, JointEmbeddingConditioner)]
+    # @property
+    # def joint_embed_conditions(self):
+    #     return [m.attribute for m in self.conditioners.values() if isinstance(m, JointEmbeddingConditioner)]
 
-    @property
-    def has_joint_embed_conditions(self):
-        return len(self.joint_embed_conditions) > 0
+    # @property
+    # def has_joint_embed_conditions(self):
+    #     return len(self.joint_embed_conditions) > 0
 
     @property
     def text_conditions(self):
         return [k for k, v in self.conditioners.items() if isinstance(v, TextConditioner)]
 
-    @property
-    def wav_conditions(self):
-        return [k for k, v in self.conditioners.items() if isinstance(v, WaveformConditioner)]
 
-    @property
-    def has_wav_condition(self):
-        return len(self.wav_conditions) > 0
 
     def tokenize(self, inputs: tp.List[ConditioningAttributes]) -> tp.Dict[str, tp.Any]:
         """Match attributes/wavs with existing conditioners in self, and compute tokenize them accordingly.
@@ -974,15 +586,15 @@ class ConditioningProvider(nn.Module):
 
         output = {}
         text = self._collate_text(inputs)
-        wavs = self._collate_wavs(inputs)
-        joint_embeds = self._collate_joint_embeds(inputs)
+        # wavs = self._collate_wavs(inputs)
+        # joint_embeds = self._collate_joint_embeds(inputs)
 
-        assert set(text.keys() | wavs.keys() | joint_embeds.keys()).issubset(set(self.conditioners.keys())), (
-            f"Got an unexpected attribute! Expected {self.conditioners.keys()}, ",
-            f"got {text.keys(), wavs.keys(), joint_embeds.keys()}"
-        )
+        # assert set(text.keys() | wavs.keys() | joint_embeds.keys()).issubset(set(self.conditioners.keys())), (
+        #     f"Got an unexpected attribute! Expected {self.conditioners.keys()}, ",
+        #     f"got {text.keys(), wavs.keys(), joint_embeds.keys()}"
+        # )
 
-        for attribute, batch in chain(text.items(), wavs.items(), joint_embeds.items()):
+        for attribute, batch in text.items(): #, joint_embeds.items()):
             output[attribute] = self.conditioners[attribute].tokenize(batch)
         return output
 
@@ -1031,102 +643,9 @@ class ConditioningProvider(nn.Module):
                 out[condition].append(text[condition])
         return out
 
-    def _collate_wavs(self, samples: tp.List[ConditioningAttributes]) -> tp.Dict[str, WavCondition]:
-        """Generate a dict where the keys are attributes by which we fetch similar wavs,
-        and the values are Tensors of wavs according to said attributes.
-
-        *Note*: by the time the samples reach this function, each sample should have some waveform
-        inside the "wav" attribute. It should be either:
-        1. A real waveform
-        2. A null waveform due to the sample having no similar waveforms (nullified by the dataset)
-        3. A null waveform due to it being dropped in a dropout module (nullified by dropout)
-
-        Args:
-            samples (list of ConditioningAttributes): List of ConditioningAttributes samples.
-        Returns:
-            dict[str, WavCondition]: A dictionary mapping an attribute name to wavs.
-        """
-        wavs = defaultdict(list)
-        lengths = defaultdict(list)
-        sample_rates = defaultdict(list)
-        paths = defaultdict(list)
-        seek_times = defaultdict(list)
-        out: tp.Dict[str, WavCondition] = {}
-
-        for sample in samples:
-            for attribute in self.wav_conditions:
-                wav, length, sample_rate, path, seek_time = sample.wav[attribute]
-                assert wav.dim() == 3, f"Got wav with dim={wav.dim()}, but expected 3 [1, C, T]"
-                assert wav.size(0) == 1, f"Got wav [B, C, T] with shape={wav.shape}, but expected B == 1"
-                # mono-channel conditioning
-                wav = wav.mean(1, keepdim=True)  # [1, 1, T]
-                wavs[attribute].append(wav.flatten())  # [T]
-                lengths[attribute].append(length)
-                sample_rates[attribute].extend(sample_rate)
-                paths[attribute].extend(path)
-                seek_times[attribute].extend(seek_time)
-
-        # stack all wavs to a single tensor
-        for attribute in self.wav_conditions:
-            stacked_wav, _ = collate(wavs[attribute], dim=0)
-            out[attribute] = WavCondition(
-                stacked_wav.unsqueeze(1), torch.cat(lengths[attribute]), sample_rates[attribute],
-                paths[attribute], seek_times[attribute])
-
-        return out
-
-    def _collate_joint_embeds(self, samples: tp.List[ConditioningAttributes]) -> tp.Dict[str, JointEmbedCondition]:
-        """Generate a dict where the keys are attributes by which we compute joint embeddings,
-        and the values are Tensors of pre-computed embeddings and the corresponding text attributes.
-
-        Args:
-            samples (list[ConditioningAttributes]): List of ConditioningAttributes samples.
-        Returns:
-            A dictionary mapping an attribute name to joint embeddings.
-        """
-        texts = defaultdict(list)
-        wavs = defaultdict(list)
-        lengths = defaultdict(list)
-        sample_rates = defaultdict(list)
-        paths = defaultdict(list)
-        seek_times = defaultdict(list)
-        channels: int = 0
-
-        out = {}
-        for sample in samples:
-            for attribute in self.joint_embed_conditions:
-                wav, text, length, sample_rate, path, seek_time = sample.joint_embed[attribute]
-                assert wav.dim() == 3
-                if channels == 0:
-                    channels = wav.size(1)
-                else:
-                    assert channels == wav.size(1), "not all audio has same number of channels in batch"
-                assert wav.size(0) == 1, "Expecting single-wav batch in the collate method"
-                wav = einops.rearrange(wav, "b c t -> (b c t)")  # [1, C, T] => [C * T]
-                wavs[attribute].append(wav)
-                texts[attribute].extend(text)
-                lengths[attribute].append(length)
-                sample_rates[attribute].extend(sample_rate)
-                paths[attribute].extend(path)
-                seek_times[attribute].extend(seek_time)
-
-        for attribute in self.joint_embed_conditions:
-            stacked_texts = texts[attribute]
-            stacked_paths = paths[attribute]
-            stacked_seek_times = seek_times[attribute]
-            stacked_wavs = pad_sequence(wavs[attribute]).to(self.device)
-            stacked_wavs = einops.rearrange(stacked_wavs, "(c t) b -> b c t", c=channels)
-            stacked_sample_rates = sample_rates[attribute]
-            stacked_lengths = torch.cat(lengths[attribute]).to(self.device)
-            assert stacked_lengths.size(0) == stacked_wavs.size(0)
-            assert len(stacked_sample_rates) == stacked_wavs.size(0)
-            assert len(stacked_texts) == stacked_wavs.size(0)
-            out[attribute] = JointEmbedCondition(
-                text=stacked_texts, wav=stacked_wavs,
-                length=stacked_lengths, sample_rate=stacked_sample_rates,
-                path=stacked_paths, seek_time=stacked_seek_times)
-
-        return out
+
 
 
 class ConditionFuser(StreamingModule):
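
With the wav and joint-embedding collation disabled above, ConditioningProvider.tokenize only routes text attributes. A rough, hypothetical usage sketch of the remaining text-only path follows; ConditioningAttributes(text={...}) and the "description" key mirror upstream audiocraft and are assumptions about this fork, and `provider` stands for an instance built as in the builders.py sketch earlier.

    # Hypothetical sketch, not part of this commit.
    from audiocraft.conditioners import ConditioningAttributes

    attrs = [
        ConditioningAttributes(text={"description": "a calm male voice"}),
        ConditioningAttributes(text={"description": None}),  # nullified (dropped-out) condition
    ]
    tokenized = provider.tokenize(attrs)     # {'description': (tokens, mask)} from T5Conditioner.tokenize
    condition_tensors = provider(tokenized)  # {'description': (embeds, mask)} ready for the ConditionFuser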
 
audiocraft/lm.py CHANGED
@@ -322,39 +322,7 @@ class TextConditioner(BaseConditioner):
322
  ...
323
 
324
 
325
- class LUTConditioner(TextConditioner):
326
- """Lookup table TextConditioner.
327
 
328
- Args:
329
- n_bins (int): Number of bins.
330
- dim (int): Hidden dim of the model (text-encoder/LUT).
331
- output_dim (int): Output dim of the conditioner.
332
- tokenizer (str): Name of the tokenizer.
333
- pad_idx (int, optional): Index for padding token. Defaults to 0.
334
- """
335
- def __init__(self, n_bins: int, dim: int, output_dim: int, tokenizer: str, pad_idx: int = 0):
336
- super().__init__(dim, output_dim)
337
- self.embed = nn.Embedding(n_bins, dim)
338
- self.tokenizer: Tokenizer
339
- if tokenizer == 'whitespace':
340
- self.tokenizer = WhiteSpaceTokenizer(n_bins, pad_idx=pad_idx)
341
- elif tokenizer == 'noop':
342
- self.tokenizer = NoopTokenizer(n_bins, pad_idx=pad_idx)
343
- else:
344
- raise ValueError(f"unrecognized tokenizer `{tokenizer}`.")
345
-
346
- def tokenize(self, x: tp.List[tp.Optional[str]]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
347
- device = self.embed.weight.device
348
- tokens, mask = self.tokenizer(x)
349
- tokens, mask = tokens.to(device), mask.to(device)
350
- return tokens, mask
351
-
352
- def forward(self, inputs: tp.Tuple[torch.Tensor, torch.Tensor]) -> ConditionType:
353
- tokens, mask = inputs
354
- embeds = self.embed(tokens)
355
- embeds = self.output_proj(embeds)
356
- embeds = (embeds * mask.unsqueeze(-1))
357
- return embeds, mask
358
 
359
 
360
  class T5Conditioner(TextConditioner):
@@ -453,56 +421,7 @@ class T5Conditioner(TextConditioner):
453
  return embeds, mask
454
 
455
 
456
- class WaveformConditioner(BaseConditioner):
457
- """Base class for all conditioners that take a waveform as input.
458
- Classes that inherit must implement `_get_wav_embedding` that outputs
459
- a continuous tensor, and `_downsampling_factor` that returns the down-sampling
460
- factor of the embedding model.
461
-
462
- Args:
463
- dim (int): The internal representation dimension.
464
- output_dim (int): Output dimension.
465
- device (tp.Union[torch.device, str]): Device.
466
- """
467
- def __init__(self, dim: int, output_dim: int, device: tp.Union[torch.device, str]):
468
- super().__init__(dim, output_dim)
469
- self.device = device
470
- # if False no masking is done, used in ChromaStemConditioner when completing by periodicity a sample.
471
- self._use_masking = True
472
-
473
- def tokenize(self, x: WavCondition) -> WavCondition:
474
- wav, length, sample_rate, path, seek_time = x
475
- assert length is not None
476
- return WavCondition(wav.to(self.device), length.to(self.device), sample_rate, path, seek_time)
477
-
478
- def _get_wav_embedding(self, x: WavCondition) -> torch.Tensor:
479
- """Gets as input a WavCondition and returns a dense embedding."""
480
- raise NotImplementedError()
481
 
482
- def _downsampling_factor(self):
483
- """Returns the downsampling factor of the embedding model."""
484
- raise NotImplementedError()
485
-
486
- def forward(self, x: WavCondition) -> ConditionType:
487
- """Extract condition embedding and mask from a waveform and its metadata.
488
- Args:
489
- x (WavCondition): Waveform condition containing raw waveform and metadata.
490
- Returns:
491
- ConditionType: a dense vector representing the conditioning along with its mask
492
- """
493
- wav, lengths, *_ = x
494
- with torch.no_grad():
495
- embeds = self._get_wav_embedding(x)
496
- embeds = embeds.to(self.output_proj.weight)
497
- embeds = self.output_proj(embeds)
498
-
499
- if lengths is not None and self._use_masking:
500
- lengths = lengths / self._downsampling_factor()
501
- mask = length_to_mask(lengths, max_len=embeds.shape[1]).int() # type: ignore
502
- else:
503
- mask = torch.ones_like(embeds[..., 0])
504
- embeds = (embeds * mask.unsqueeze(-1))
505
- return embeds, mask
506
 
507
 
508
 
@@ -570,366 +489,13 @@ class JointEmbeddingConditioner(BaseConditioner):
570
  return x
571
 
572
 
573
- class CLAPEmbeddingConditioner(JointEmbeddingConditioner):
574
- """Joint Embedding conditioner based on pre-trained CLAP model.
575
-
576
- This CLAP-based conditioner supports a caching mechanism
577
- over the computed embeddings for faster training.
578
 
579
- Args:
580
- dim (int): Dimension.
581
- output_dim (int): Output dimension.
582
- device (str): Device.
583
- attribute (str): Attribute used by the conditioner.
584
- quantize (bool): Whether to quantize the CLAP embedding.
585
- n_q (int): Number of residual quantizers (used if quantize is true).
586
- bins (int): Quantizers' codebooks size (used if quantize is true).
587
- checkpoint (str): Path to CLAP checkpoint.
588
- model_arch (str): CLAP model architecture.
589
- enable_fusion (bool): Enable fusion for CLAP model.
590
- sample_rate (int): Sample rate used by CLAP model.
591
- max_audio_length (float): Maximum audio length for CLAP model.
592
- audio_stride (float): Stride to use for getting a CLAP embedding on the full sequence.
593
- normalize (bool): Whether to normalize the CLAP embedding.
594
- text_p (float): Probability of using text representation instead of audio at train time.
595
- batch_size (Optional[int]): Batch size for CLAP embedding computation.
596
- autocast_dtype (str): Autocast for the conditioner.
597
- cache_path (Optional[str]): Path for pre-computed embeddings caching.
598
- kwargs: Additional parameters for residual vector quantizer.
599
- """
600
- def __init__(self, dim: int, output_dim: int, device: str, attribute: str,
601
- quantize: bool, n_q: int, bins: int, checkpoint: tp.Union[str, Path], model_arch: str,
602
- enable_fusion: bool, sample_rate: int, max_audio_length: int, audio_stride: int,
603
- normalize: bool, text_p: bool, batch_size: tp.Optional[int] = None,
604
- autocast_dtype: tp.Optional[str] = 'float32', cache_path: tp.Optional[str] = None, **kwargs):
605
- try:
606
- import laion_clap # type: ignore
607
- except ImportError:
608
- raise ImportError("Please install CLAP to use the CLAPEmbeddingConditioner: 'pip install laion_clap'")
609
- warnings.warn("Sample rate for CLAP conditioner was fixed in version v1.1.0, (from 44.1 to 48 kHz). "
610
- "Please retrain all models.")
611
- checkpoint = AudioCraftEnvironment.resolve_reference_path(checkpoint)
612
- clap_tokenize = RobertaTokenizer.from_pretrained('roberta-base')
613
- clap_model = laion_clap.CLAP_Module(enable_fusion=enable_fusion, amodel=model_arch)
614
- load_clap_state_dict(clap_model, checkpoint)
615
- clap_model.eval()
616
- clap_model.to(device)
617
- super().__init__(dim=dim, output_dim=output_dim, device=device, attribute=attribute,
618
- autocast_dtype=autocast_dtype, quantize=quantize, n_q=n_q, bins=bins,
619
- **kwargs)
620
- self.checkpoint = checkpoint
621
- self.enable_fusion = enable_fusion
622
- self.model_arch = model_arch
623
- self.clap: laion_clap.CLAP_Module
624
- self.clap_tokenize: RobertaTokenizer
625
- self.clap_sample_rate = sample_rate
626
- self.clap_max_frames = int(self.clap_sample_rate * max_audio_length)
627
- self.clap_stride = int(self.clap_sample_rate * audio_stride)
628
- self.batch_size = batch_size or 1
629
- self.normalize = normalize
630
- self.text_p = text_p
631
- self.__dict__['clap_tokenize'] = clap_tokenize
632
- self.__dict__['clap'] = clap_model
633
- self.wav_cache, self.text_cache = None, None
634
- if cache_path is not None:
635
- self.wav_cache = EmbeddingCache(Path(cache_path) / 'wav', self.device,
636
- compute_embed_fn=self._get_wav_embedding_for_cache,
637
- extract_embed_fn=self._extract_wav_embedding_chunk)
638
- self.text_cache = EmbeddingCache(Path(cache_path) / 'text', self.device,
639
- compute_embed_fn=self._get_text_embedding_for_cache)
640
-
641
- def _tokenizer(self, texts: tp.Union[str, tp.List[str]]) -> dict:
642
- # we use the default params from CLAP module here as well
643
- return self.clap_tokenize(texts, padding="max_length", truncation=True, max_length=77, return_tensors="pt")
644
-
645
- def _compute_text_embedding(self, text: tp.List[str]) -> torch.Tensor:
646
- """Compute text embedding from CLAP model on a given a batch of text.
647
 
648
- Args:
649
- text (list[str]): List of text for the batch, with B items.
650
- Returns:
651
- torch.Tensor: CLAP embedding derived from text, of shape [B, 1, D], with D the CLAP embedding dimension.
652
- """
653
- with torch.no_grad():
654
- embed = self.clap.get_text_embedding(text, tokenizer=self._tokenizer, use_tensor=True)
655
- return embed.view(embed.size(0), 1, embed.size(-1))
656
-
657
- def _get_text_embedding_for_cache(self, path: tp.Union[Path, str],
658
- x: JointEmbedCondition, idx: int) -> torch.Tensor:
659
- """Get text embedding function for the cache."""
660
- text = x.text[idx]
661
- text = text if text is not None else ""
662
- return self._compute_text_embedding([text])[0]
663
 
664
- def _preprocess_wav(self, wav: torch.Tensor, length: torch.Tensor, sample_rates: tp.List[int]) -> torch.Tensor:
665
- """Preprocess wav to expected format by CLAP model.
666
-
667
- Args:
668
- wav (torch.Tensor): Audio wav, of shape [B, C, T].
669
- length (torch.Tensor): Actual length of the audio for each item in the batch, of shape [B].
670
- sample_rates (list[int]): Sample rates for each sample in the batch
671
- Returns:
672
- torch.Tensor: Audio wav of shape [B, T].
673
- """
674
- assert wav.dim() == 3, "Expecting wav to be [B, C, T]"
675
- if sample_rates is not None:
676
- _wav = []
677
- for i, audio in enumerate(wav):
678
- sr = sample_rates[i]
679
- audio = convert_audio(audio, from_rate=sr, to_rate=self.clap_sample_rate, to_channels=1)
680
- _wav.append(audio)
681
- wav = torch.stack(_wav, dim=0)
682
- wav = wav.mean(dim=1)
683
- return wav
684
-
685
- def _compute_wav_embedding(self, wav: torch.Tensor, length: torch.Tensor,
686
- sample_rates: tp.List[int], reduce_mean: bool = False) -> torch.Tensor:
687
- """Compute audio wave embedding from CLAP model.
688
-
689
- Since CLAP operates on a fixed sequence length audio inputs and we need to process longer audio sequences,
690
- we calculate the wav embeddings on `clap_max_frames` windows with `clap_stride`-second stride and
691
- average the resulting embeddings.
692
-
693
- Args:
694
- wav (torch.Tensor): Audio wav, of shape [B, C, T].
695
- length (torch.Tensor): Actual length of the audio for each item in the batch, of shape [B].
696
- sample_rates (list[int]): Sample rates for each sample in the batch.
697
- reduce_mean (bool): Whether to get the average tensor.
698
- Returns:
699
- torch.Tensor: Audio embedding of shape [B, F, D], F being the number of chunks, D the dimension.
700
- """
701
- with torch.no_grad():
702
- wav = self._preprocess_wav(wav, length, sample_rates)
703
- B, T = wav.shape
704
- if T >= self.clap_max_frames:
705
- wav = wav.unfold(-1, self.clap_max_frames, self.clap_stride) # [B, F, T]
706
- else:
707
- wav = wav.view(-1, 1, T) # [B, F, T] with F=1
708
- wav = einops.rearrange(wav, 'b f t -> (b f) t')
709
- embed_list = []
710
- for i in range(0, wav.size(0), self.batch_size):
711
- _wav = wav[i:i+self.batch_size, ...]
712
- _embed = self.clap.get_audio_embedding_from_data(_wav, use_tensor=True)
713
- embed_list.append(_embed)
714
- embed = torch.cat(embed_list, dim=0)
715
- embed = einops.rearrange(embed, '(b f) d -> b f d', b=B)
716
- if reduce_mean:
717
- embed = embed.mean(dim=1, keepdim=True)
718
- return embed # [B, F, D] with F=1 if reduce_mean is True
719
-
720
- def _get_wav_embedding_for_cache(self, path: tp.Union[str, Path],
721
- x: JointEmbedCondition, idx: int) -> torch.Tensor:
722
- """Compute audio wave embedding for the cache.
723
- The embedding is computed on a given audio read from file.
724
 
725
- Args:
726
- path (str or Path): Path to the full audio file.
727
- Returns:
728
- torch.Tensor: Single-item tensor of shape [F, D], F being the number of chunks, D the dimension.
729
- """
730
- wav, sr = soundfile.read(path) # [C, T]
731
- wav = wav.unsqueeze(0).to(self.device) # [1, C, T]
732
- wav_len = torch.LongTensor([wav.shape[-1]]).to(self.device)
733
- embed = self._compute_wav_embedding(wav, wav_len, [sr], reduce_mean=False) # [B, F, D]
734
- return embed.squeeze(0) # [F, D]
735
-
736
- def _extract_wav_embedding_chunk(self, full_embed: torch.Tensor, x: JointEmbedCondition, idx: int) -> torch.Tensor:
737
- """Extract the chunk of embedding matching the seek_time and length from the full CLAP audio embedding.
738
-
739
- Args:
740
- full_embed (torch.Tensor): CLAP embedding computed on the full wave, of shape [F, D].
741
- x (JointEmbedCondition): Joint embedding condition for the full batch.
742
- idx (int): Index considered for the given embedding to extract.
743
- Returns:
744
- torch.Tensor: Wav embedding averaged on sliding window, of shape [1, D].
745
- """
746
- sample_rate = x.sample_rate[idx]
747
- seek_time = x.seek_time[idx]
748
- seek_time = 0. if seek_time is None else seek_time
749
- clap_stride = int(self.clap_stride / self.clap_sample_rate) * sample_rate
750
- end_seek_time = seek_time + self.clap_max_frames / self.clap_sample_rate
751
- start_offset = int(seek_time * sample_rate // clap_stride)
752
- end_offset = int(end_seek_time * sample_rate // clap_stride)
753
- wav_embed = full_embed[start_offset:end_offset, ...]
754
- wav_embed = wav_embed.mean(dim=0, keepdim=True)
755
- return wav_embed.to(self.device) # [F, D]
756
-
757
- def _get_text_embedding(self, x: JointEmbedCondition) -> torch.Tensor:
758
- """Get CLAP embedding from a batch of text descriptions."""
759
- no_nullified_cond = x.wav.shape[-1] > 1 # we don't want to read from cache when condition dropout
760
- if self.text_cache is not None and no_nullified_cond:
761
- assert all(p is not None for p in x.path), "Cache requires all JointEmbedCondition paths to be provided"
762
- paths = [Path(p) for p in x.path if p is not None]
763
- embed = self.text_cache.get_embed_from_cache(paths, x)
764
- else:
765
- text = [xi if xi is not None else "" for xi in x.text]
766
- embed = self._compute_text_embedding(text)
767
- if self.normalize:
768
- embed = torch.nn.functional.normalize(embed, p=2.0, dim=-1)
769
- return embed
770
-
771
- def _get_wav_embedding(self, x: JointEmbedCondition) -> torch.Tensor:
772
- """Get CLAP embedding from a batch of audio tensors (and corresponding sample rates)."""
773
- no_undefined_paths = all(p is not None for p in x.path)
774
- no_nullified_cond = x.wav.shape[-1] > 1 # we don't want to read from cache when condition dropout
775
- if self.wav_cache is not None and no_undefined_paths and no_nullified_cond:
776
- paths = [Path(p) for p in x.path if p is not None]
777
- embed = self.wav_cache.get_embed_from_cache(paths, x)
778
- else:
779
- embed = self._compute_wav_embedding(x.wav, x.length, x.sample_rate, reduce_mean=True)
780
- if self.normalize:
781
- embed = torch.nn.functional.normalize(embed, p=2.0, dim=-1)
782
- return embed
783
-
784
- def tokenize(self, x: JointEmbedCondition) -> JointEmbedCondition:
785
- # Trying to limit as much as possible sync points when the cache is warm.
786
- no_undefined_paths = all(p is not None for p in x.path)
787
- if self.wav_cache is not None and no_undefined_paths:
788
- assert all([p is not None for p in x.path]), "Cache requires all JointEmbedCondition paths to be provided"
789
- paths = [Path(p) for p in x.path if p is not None]
790
- self.wav_cache.populate_embed_cache(paths, x)
791
- if self.text_cache is not None and no_undefined_paths:
792
- assert all([p is not None for p in x.path]), "Cache requires all JointEmbedCondition paths to be provided"
793
- paths = [Path(p) for p in x.path if p is not None]
794
- self.text_cache.populate_embed_cache(paths, x)
795
- return x
796
-
797
- def _get_embed(self, x: JointEmbedCondition) -> tp.Tuple[torch.Tensor, torch.Tensor]:
798
- """Extract shared latent representation from either the wav or the text using CLAP."""
799
- # decide whether to use text embedding at train time or not
800
- use_text_embed = random.random() < self.text_p
801
- if self.training and not use_text_embed:
802
- embed = self._get_wav_embedding(x)
803
- empty_idx = torch.LongTensor([]) # we assume we always have the audio wav
804
- else:
805
- embed = self._get_text_embedding(x)
806
- empty_idx = torch.LongTensor([i for i, xi in enumerate(x.text) if xi is None or xi == ""])
807
- return embed, empty_idx
808
-
809
-
810
- def dropout_condition(sample: ConditioningAttributes, condition_type: str, condition: str) -> ConditioningAttributes:
811
- """Utility function for nullifying an attribute inside an ConditioningAttributes object.
812
- If the condition is of type "wav", then nullify it using `nullify_condition` function.
813
- If the condition is of any other type, set its value to None.
814
- Works in-place.
815
- """
816
- if condition_type not in ['text', 'wav', 'joint_embed']:
817
- raise ValueError(
818
- "dropout_condition got an unexpected condition type!"
819
- f" expected 'text', 'wav' or 'joint_embed' but got '{condition_type}'"
820
- )
821
-
822
- if condition not in getattr(sample, condition_type):
823
- raise ValueError(
824
- "dropout_condition received an unexpected condition!"
825
- f" expected wav={sample.wav.keys()} and text={sample.text.keys()}"
826
- f" but got '{condition}' of type '{condition_type}'!"
827
- )
828
-
829
- if condition_type == 'wav':
830
- wav_cond = sample.wav[condition]
831
- sample.wav[condition] = nullify_wav(wav_cond)
832
- elif condition_type == 'joint_embed':
833
- embed = sample.joint_embed[condition]
834
- sample.joint_embed[condition] = nullify_joint_embed(embed)
835
- else:
836
- sample.text[condition] = None
837
-
838
- return sample
839
-
840
-
841
- class DropoutModule(nn.Module):
842
- """Base module for all dropout modules."""
843
- def __init__(self, seed: int = 1234):
844
- super().__init__()
845
- self.rng = torch.Generator()
846
- self.rng.manual_seed(seed)
847
-
848
-
849
- class AttributeDropout(DropoutModule):
850
- """Dropout with a given probability per attribute.
851
- This is different from the behavior of ClassifierFreeGuidanceDropout as this allows for attributes
852
- to be dropped out separately. For example, "artist" can be dropped while "genre" remains.
853
- This is in contrast to ClassifierFreeGuidanceDropout where if "artist" is dropped "genre"
854
- must also be dropped.
855
-
856
- Args:
857
- p (tp.Dict[str, float]): A dict mapping between attributes and dropout probability. For example:
858
- ...
859
- "genre": 0.1,
860
- "artist": 0.5,
861
- "wav": 0.25,
862
- ...
863
- active_on_eval (bool, optional): Whether the dropout is active at eval. Default to False.
864
- seed (int, optional): Random seed.
865
- """
866
- def __init__(self, p: tp.Dict[str, tp.Dict[str, float]], active_on_eval: bool = False, seed: int = 1234):
867
- super().__init__(seed=seed)
868
- self.active_on_eval = active_on_eval
869
- # construct dict that return the values from p otherwise 0
870
- self.p = {}
871
- for condition_type, probs in p.items():
872
- self.p[condition_type] = defaultdict(lambda: 0, probs)
873
-
874
- def forward(self, samples: tp.List[ConditioningAttributes]) -> tp.List[ConditioningAttributes]:
875
- """
876
- Args:
877
- samples (list[ConditioningAttributes]): List of conditions.
878
- Returns:
879
- list[ConditioningAttributes]: List of conditions after certain attributes were set to None.
880
- """
881
- if not self.training and not self.active_on_eval:
882
- return samples
883
-
884
- samples = deepcopy(samples)
885
- for condition_type, ps in self.p.items(): # for condition types [text, wav]
886
- for condition, p in ps.items(): # for attributes of each type (e.g., [artist, genre])
887
- if torch.rand(1, generator=self.rng).item() < p:
888
- for sample in samples:
889
- dropout_condition(sample, condition_type, condition)
890
- return samples
891
-
892
- def __repr__(self):
893
- return f"AttributeDropout({dict(self.p)})"
894
-
895
-
896
- class ClassifierFreeGuidanceDropout(DropoutModule):
897
- """Classifier Free Guidance dropout.
898
- All attributes are dropped with the same probability.
899
-
900
- Args:
901
- p (float): Probability to apply condition dropout during training.
902
- seed (int): Random seed.
903
- """
904
- def __init__(self, p: float, seed: int = 1234):
905
- super().__init__(seed=seed)
906
- self.p = p
907
-
908
- def forward(self, samples: tp.List[ConditioningAttributes]) -> tp.List[ConditioningAttributes]:
909
- """
910
- Args:
911
- samples (list[ConditioningAttributes]): List of conditions.
912
- Returns:
913
- list[ConditioningAttributes]: List of conditions after all attributes were set to None.
914
- """
915
- if not self.training:
916
- return samples
917
 
918
- # decide on which attributes to drop in a batched fashion
919
- drop = torch.rand(1, generator=self.rng).item() < self.p
920
- if not drop:
921
- return samples
922
 
923
- # nullify conditions of all attributes
924
- samples = deepcopy(samples)
925
- for condition_type in ["wav", "text"]:
926
- for sample in samples:
927
- for condition in sample.attributes[condition_type]:
928
- dropout_condition(sample, condition_type, condition)
929
- return samples
930
 
931
- def __repr__(self):
932
- return f"ClassifierFreeGuidanceDropout(p={self.p})"
933
 
934
 
935
  class ConditioningProvider(nn.Module):
@@ -1355,8 +921,8 @@ class LMModel(StreamingModule):
                  **kwargs):
         super().__init__()
         self.cfg_coef = cfg_coef
-        self.cfg_dropout = ClassifierFreeGuidanceDropout(p=cfg_dropout)
-        self.att_dropout = AttributeDropout(p=attribute_dropout)
+
+
         self.condition_provider = condition_provider
         self.fuser = fuser
         self.card = card
@@ -1447,10 +1013,7 @@ class LMModel(StreamingModule):
         input_ = sum([self.emb[k](sequence[:, k]) for k in range(K)])
         if condition_tensors is None:
             assert not self._is_streaming, "Conditions tensors should be precomputed when streaming."
-            # apply dropout modules
-            conditions = self.cfg_dropout(conditions)
-            conditions = self.att_dropout(conditions)
-            tokenized = self.condition_provider.tokenize(conditions)
+
             # encode conditions and fuse, both have a streaming cache to not recompute when generating.
             condition_tensors = self.condition_provider(tokenized)
         else:
@@ -1661,7 +1224,7 @@ class LMModel(StreamingModule):
         cfg_conditions: CFGConditions
         two_step_cfg = self.two_step_cfg if two_step_cfg is None else two_step_cfg
         if conditions:
-            null_conditions = ClassifierFreeGuidanceDropout(p=1.0)(conditions)
+            null_conditions = conditions
             if two_step_cfg:
                 cfg_conditions = (
                     self.condition_provider(self.condition_provider.tokenize(conditions)),
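
For reference, the ClassifierFreeGuidanceDropout(p=1.0)(conditions) call removed in the last hunk returned a deep copy of the conditions with every attribute nullified (the freshly constructed module starts in training mode and p=1.0 always triggers the drop, per the deleted forward above), whereas null_conditions = conditions now reuses the conditioned attributes for the unconditional branch. A hedged, hypothetical sketch of an equivalent text-only nullification, expressed with the dropout_condition helper that remains in conditioners.py, in case the old behaviour is wanted:

    # Hypothetical sketch, not part of this commit.
    import typing as tp
    from copy import deepcopy

    from audiocraft.conditioners import ConditioningAttributes, dropout_condition

    def null_text_conditions(conditions: tp.List[ConditioningAttributes]) -> tp.List[ConditioningAttributes]:
        # Same effect as ClassifierFreeGuidanceDropout(p=1.0) restricted to text attributes:
        # copy the batch and set every text condition to None.
        null = deepcopy(conditions)
        for sample in null:
            for condition in list(sample.text.keys()):
                dropout_condition(sample, "text", condition)
        return null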
 