cleanup Vq

Browse files

Files changed (8) hide show

audiocraft/builders.py +4 -10
audiocraft/encodec.py +0 -3
audiocraft/lm.py +4 -2
audiocraft/quantization/__init__.py +0 -9
audiocraft/quantization/base.py +0 -99
audiocraft/quantization/core_vq.py +0 -405
audiocraft/{quantization/vq.py → vq.py} +149 -14
demo.py +4 -3

audiocraft/builders.py CHANGED Viewed

@@ -4,15 +4,9 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
-"""
-All the functions to build the relevant models and modules
-from the Hydra config.
-"""
 import typing as tp
 import omegaconf
 import torch
 from .encodec import CompressionModel, EncodecModel
 from .lm import LMModel
 from .seanet import SEANetDecoder
@@ -24,15 +18,15 @@ from .conditioners import (
     T5Conditioner,
 )
 from .unet import DiffusionUnet
-import audiocraft.quantization as qt
 from .utils.utils import dict_from_config
 from .diffusion_schedule import MultiBandProcessor, SampleProcessor
-def get_quantizer(quantizer: str, cfg: omegaconf.DictConfig, dimension: int) -> qt.BaseQuantizer:
     klass = {
-        'no_quant': qt.DummyQuantizer,
-        'rvq': qt.ResidualVectorQuantizer
     }[quantizer]
     kwargs = dict_from_config(getattr(cfg, quantizer))
     if quantizer != 'no_quant':

 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 import typing as tp
 import omegaconf
 import torch
 from .encodec import CompressionModel, EncodecModel
 from .lm import LMModel
 from .seanet import SEANetDecoder
     T5Conditioner,
 )
 from .unet import DiffusionUnet
+from .vq import ResidualVectorQuantizer
 from .utils.utils import dict_from_config
 from .diffusion_schedule import MultiBandProcessor, SampleProcessor
+def get_quantizer(quantizer, cfg, dimension):
     klass = {
+        'no_quant': None,
+        'rvq': ResidualVectorQuantizer
     }[quantizer]
     kwargs = dict_from_config(getattr(cfg, quantizer))
     if quantizer != 'no_quant':

audiocraft/encodec.py CHANGED Viewed

@@ -9,7 +9,6 @@ Also defines the main interface that a model must follow to be usable as an audi
 from abc import ABC, abstractmethod
 import logging
-import math
 from pathlib import Path
 import typing as tp
@@ -19,8 +18,6 @@ import torch
 from torch import nn
 from transformers import EncodecModel as HFEncodecModel
-import audiocraft.quantization as qt
 logger = logging.getLogger()

 from abc import ABC, abstractmethod
 import logging
 from pathlib import Path
 import typing as tp
 from torch import nn
 from transformers import EncodecModel as HFEncodecModel
 logger = logging.getLogger()

audiocraft/lm.py CHANGED Viewed

@@ -433,7 +433,9 @@ class LMModel(StreamingModule):
                 # print(f'{unconditional_state=} \n
                 # print('Set All to Special')
-                # next_token[:] = self.special_token_id
@@ -449,7 +451,7 @@ class LMModel(StreamingModule):
         unconditional_state.clear()
         out_codes, _, _ = pattern.revert_pattern_sequence(gen_sequence, special_token=unknown_token)
         out_start_offset = start_offset if remove_prompts else 0
         out_codes = out_codes[..., out_start_offset:max_gen_len]

                 # print(f'{unconditional_state=} \n
                 # print('Set All to Special')
+                # Runs with = 2047 too, just differently than with self.special_token_id  -> 2047 gives drill noise
+                # next_token[:] = self.special_token_id
         unconditional_state.clear()
         out_codes, _, _ = pattern.revert_pattern_sequence(gen_sequence, special_token=unknown_token)
+        print(f'{out_codes.shape=} {out_codes.min()}  {out_codes.max()}\n')
         out_start_offset = start_offset if remove_prompts else 0
         out_codes = out_codes[..., out_start_offset:max_gen_len]

audiocraft/quantization/__init__.py DELETED Viewed

@@ -1,9 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""RVQ."""
-# flake8: noqa
-from .vq import ResidualVectorQuantizer
-from .base import BaseQuantizer, DummyQuantizer, QuantizedResult

audiocraft/quantization/base.py DELETED Viewed

@@ -1,99 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Base class for all quantizers.
-"""
-from dataclasses import dataclass, field
-import typing as tp
-import torch
-from torch import nn
-@dataclass
-class QuantizedResult:
-    x: torch.Tensor
-    codes: torch.Tensor
-    bandwidth: torch.Tensor  # bandwidth in kb/s used, per batch item.
-    penalty: tp.Optional[torch.Tensor] = None
-    metrics: dict = field(default_factory=dict)
-class BaseQuantizer(nn.Module):
-    """Base class for quantizers.
-    """
-    def forward(self, x: torch.Tensor, frame_rate: int) -> QuantizedResult:
-        """
-        Given input tensor x, returns first the quantized (or approximately quantized)
-        representation along with quantized codes, bandwidth, and any penalty term for the loss.
-        Finally, this returns a dict of metrics to update logging etc.
-        Frame rate must be passed so that the bandwidth is properly computed.
-        """
-        raise NotImplementedError()
-    def encode(self, x: torch.Tensor) -> torch.Tensor:
-        """Encode a given input tensor with the specified sample rate at the given bandwidth."""
-        raise NotImplementedError()
-    def decode(self, codes: torch.Tensor) -> torch.Tensor:
-        """Decode the given codes to the quantized representation."""
-        raise NotImplementedError()
-    @property
-    def total_codebooks(self):
-        """Total number of codebooks."""
-        raise NotImplementedError()
-    @property
-    def num_codebooks(self):
-        """Number of active codebooks."""
-        raise NotImplementedError()
-    def set_num_codebooks(self, n: int):
-        """Set the number of active codebooks."""
-        raise NotImplementedError()
-class DummyQuantizer(BaseQuantizer):
-    """Fake quantizer that actually does not perform any quantization.
-    """
-    def __init__(self):
-        super().__init__()
-    def forward(self, x: torch.Tensor, frame_rate: int):
-        q = x.unsqueeze(1)
-        return QuantizedResult(x, q, torch.tensor(q.numel() * 32 * frame_rate / 1000 / len(x)).to(x))
-    def encode(self, x: torch.Tensor) -> torch.Tensor:
-        """Encode a given input tensor with the specified sample rate at the given bandwidth.
-        In the case of the DummyQuantizer, the codes are actually identical
-        to the input and resulting quantized representation as no quantization is done.
-        """
-        return x.unsqueeze(1)
-    def decode(self, codes: torch.Tensor) -> torch.Tensor:
-        """Decode the given codes to the quantized representation.
-        In the case of the DummyQuantizer, the codes are actually identical
-        to the input and resulting quantized representation as no quantization is done.
-        """
-        return codes.squeeze(1)
-    @property
-    def total_codebooks(self):
-        """Total number of codebooks."""
-        return 1
-    @property
-    def num_codebooks(self):
-        """Total number of codebooks."""
-        return self.total_codebooks
-    def set_num_codebooks(self, n: int):
-        """Set the number of active codebooks."""
-        raise AttributeError("Cannot override the number of codebooks for the dummy quantizer")

audiocraft/quantization/core_vq.py DELETED Viewed

@@ -1,405 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-import typing as tp
-from einops import rearrange, repeat
-import flashy
-import torch
-from torch import nn, einsum
-import torch.nn.functional as F
-def exists(val: tp.Optional[tp.Any]) -> bool:
-    return val is not None
-def default(val: tp.Any, d: tp.Any) -> tp.Any:
-    return val if exists(val) else d
-def l2norm(t):
-    return F.normalize(t, p=2, dim=-1)
-def ema_inplace(moving_avg, new, decay: float):
-    moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
-def laplace_smoothing(x, n_categories: int, epsilon: float = 1e-5):
-    return (x + epsilon) / (x.sum() + n_categories * epsilon)
-def uniform_init(*shape: int):
-    t = torch.empty(shape)
-    nn.init.kaiming_uniform_(t)
-    return t
-def sample_vectors(samples, num: int):
-    num_samples, device = samples.shape[0], samples.device
-    if num_samples >= num:
-        indices = torch.randperm(num_samples, device=device)[:num]
-    else:
-        indices = torch.randint(0, num_samples, (num,), device=device)
-    return samples[indices]
-def kmeans(samples, num_clusters: int, num_iters: int = 10):
-    dim, dtype = samples.shape[-1], samples.dtype
-    means = sample_vectors(samples, num_clusters)
-    for _ in range(num_iters):
-        diffs = rearrange(samples, "n d -> n () d") - rearrange(
-            means, "c d -> () c d"
-        )
-        dists = -(diffs ** 2).sum(dim=-1)
-        buckets = dists.max(dim=-1).indices
-        bins = torch.bincount(buckets, minlength=num_clusters)
-        zero_mask = bins == 0
-        bins_min_clamped = bins.masked_fill(zero_mask, 1)
-        new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype)
-        new_means.scatter_add_(0, repeat(buckets, "n -> n d", d=dim), samples)
-        new_means = new_means / bins_min_clamped[..., None]
-        means = torch.where(zero_mask[..., None], means, new_means)
-    return means, bins
-def orthogonal_loss_fn(t):
-    # eq (2) from https://arxiv.org/abs/2112.00384
-    n = t.shape[0]
-    normed_codes = l2norm(t)
-    identity = torch.eye(n, device=t.device)
-    cosine_sim = einsum("i d, j d -> i j", normed_codes, normed_codes)
-    return ((cosine_sim - identity) ** 2).sum() / (n ** 2)
-class EuclideanCodebook(nn.Module):
-    """Codebook with Euclidean distance.
-    Args:
-        dim (int): Dimension.
-        codebook_size (int): Codebook size.
-        kmeans_init (bool): Whether to use k-means to initialize the codebooks.
-            If set to true, run the k-means algorithm on the first training batch and use
-            the learned centroids as initialization.
-        kmeans_iters (int): Number of iterations used for k-means algorithm at initialization.
-        decay (float): Decay for exponential moving average over the codebooks.
-        epsilon (float): Epsilon value for numerical stability.
-        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
-            that have an exponential moving average cluster size less than the specified threshold with
-            randomly selected vector from the current batch.
-    """
-    def __init__(
-        self,
-        dim: int,
-        codebook_size: int,
-        kmeans_init: int = False,
-        kmeans_iters: int = 10,
-        decay: float = 0.8,
-        epsilon: float = 1e-5,
-        threshold_ema_dead_code: int = 2,
-    ):
-        super().__init__()
-        self.decay = decay
-        init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = uniform_init if not kmeans_init else torch.zeros
-        embed = init_fn(codebook_size, dim)
-        self.codebook_size = codebook_size
-        self.kmeans_iters = kmeans_iters
-        self.epsilon = epsilon
-        self.threshold_ema_dead_code = threshold_ema_dead_code
-        self.register_buffer("inited", torch.Tensor([not kmeans_init]))
-        self.register_buffer("cluster_size", torch.zeros(codebook_size))
-        self.register_buffer("embed", embed)
-        self.register_buffer("embed_avg", embed.clone())
-    @torch.jit.ignore
-    def init_embed_(self, data):
-        if self.inited:
-            return
-        embed, cluster_size = kmeans(data, self.codebook_size, self.kmeans_iters)
-        self.embed.data.copy_(embed)
-        self.embed_avg.data.copy_(embed.clone())
-        self.cluster_size.data.copy_(cluster_size)
-        self.inited.data.copy_(torch.Tensor([True]))
-        # Make sure all buffers across workers are in sync after initialization
-        flashy.distrib.broadcast_tensors(self.buffers())
-    def replace_(self, samples, mask):
-        modified_codebook = torch.where(
-            mask[..., None], sample_vectors(samples, self.codebook_size), self.embed
-        )
-        self.embed.data.copy_(modified_codebook)
-    def expire_codes_(self, batch_samples):
-        if self.threshold_ema_dead_code == 0:
-            return
-        expired_codes = self.cluster_size < self.threshold_ema_dead_code
-        if not torch.any(expired_codes):
-            return
-        batch_samples = rearrange(batch_samples, "... d -> (...) d")
-        self.replace_(batch_samples, mask=expired_codes)
-        flashy.distrib.broadcast_tensors(self.buffers())
-    def preprocess(self, x):
-        x = rearrange(x, "... d -> (...) d")
-        return x
-    def quantize(self, x):
-        embed = self.embed.t()
-        dist = -(
-            x.pow(2).sum(1, keepdim=True)
-            - 2 * x @ embed
-            + embed.pow(2).sum(0, keepdim=True)
-        )
-        embed_ind = dist.max(dim=-1).indices
-        return embed_ind
-    def postprocess_emb(self, embed_ind, shape):
-        return embed_ind.view(*shape[:-1])
-    def dequantize(self, embed_ind):
-        quantize = F.embedding(embed_ind, self.embed)
-        return quantize
-    def encode(self, x):
-        shape = x.shape
-        # pre-process
-        x = self.preprocess(x)
-        # quantize
-        embed_ind = self.quantize(x)
-        # post-process
-        embed_ind = self.postprocess_emb(embed_ind, shape)
-        return embed_ind
-    def decode(self, embed_ind):
-        quantize = self.dequantize(embed_ind)
-        return quantize
-    def forward(self, x):
-        shape, dtype = x.shape, x.dtype
-        x = self.preprocess(x)
-        self.init_embed_(x)
-        embed_ind = self.quantize(x)
-        embed_onehot = F.one_hot(embed_ind, self.codebook_size).type(dtype)
-        embed_ind = self.postprocess_emb(embed_ind, shape)
-        quantize = self.dequantize(embed_ind)
-        if self.training:
-            # We do the expiry of code at that point as buffers are in sync
-            # and all the workers will take the same decision.
-            self.expire_codes_(x)
-            ema_inplace(self.cluster_size, embed_onehot.sum(0), self.decay)
-            embed_sum = x.t() @ embed_onehot
-            ema_inplace(self.embed_avg, embed_sum.t(), self.decay)
-            cluster_size = (
-                laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon)
-                * self.cluster_size.sum()
-            )
-            embed_normalized = self.embed_avg / cluster_size.unsqueeze(1)
-            self.embed.data.copy_(embed_normalized)
-        return quantize, embed_ind
-class VectorQuantization(nn.Module):
-    """Vector quantization implementation.
-    Currently supports only euclidean distance.
-    Args:
-        dim (int): Dimension
-        codebook_size (int): Codebook size
-        codebook_dim (int): Codebook dimension. If not defined, uses the specified dimension in dim.
-        decay (float): Decay for exponential moving average over the codebooks.
-        epsilon (float): Epsilon value for numerical stability.
-        kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
-        kmeans_iters (int): Number of iterations used for kmeans initialization.
-        threshold_ema_dead_code (int):
-        channels_last (bool): Channels are the last dimension in the input tensors.
-        commitment_weight (float): Weight for commitment loss.
-        orthogonal_reg_weight (float): Orthogonal regularization weights.
-        orthogonal_reg_active_codes_only (bool): Apply orthogonal regularization only on active codes.
-        orthogonal_reg_max_codes (optional int): Maximum number of codes to consider
-            for orthogonal regularization.
-        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
-            that have an exponential moving average cluster size less than the specified threshold with
-            randomly selected vector from the current batch.
-    """
-    def __init__(
-        self,
-        dim: int,
-        codebook_size: int,
-        codebook_dim: tp.Optional[int] = None,
-        decay: float = 0.8,
-        epsilon: float = 1e-5,
-        kmeans_init: bool = False,
-        kmeans_iters: int = 10,
-        threshold_ema_dead_code: int = 2,
-        channels_last: bool = False,
-        commitment_weight: float = 1.,
-        orthogonal_reg_weight: float = 0.0,
-        orthogonal_reg_active_codes_only: bool = False,
-        orthogonal_reg_max_codes: tp.Optional[int] = None,
-    ):
-        super().__init__()
-        _codebook_dim: int = default(codebook_dim, dim)
-        requires_projection = _codebook_dim != dim
-        self.project_in = (nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity())
-        self.project_out = (nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity())
-        self.epsilon = epsilon
-        self.commitment_weight = commitment_weight
-        self.orthogonal_reg_weight = orthogonal_reg_weight
-        self.orthogonal_reg_active_codes_only = orthogonal_reg_active_codes_only
-        self.orthogonal_reg_max_codes = orthogonal_reg_max_codes
-        self._codebook = EuclideanCodebook(dim=_codebook_dim, codebook_size=codebook_size,
-                                           kmeans_init=kmeans_init, kmeans_iters=kmeans_iters,
-                                           decay=decay, epsilon=epsilon,
-                                           threshold_ema_dead_code=threshold_ema_dead_code)
-        self.codebook_size = codebook_size
-        self.channels_last = channels_last
-    @property
-    def codebook(self):
-        return self._codebook.embed
-    @property
-    def inited(self):
-        return self._codebook.inited
-    def _preprocess(self, x):
-        if not self.channels_last:
-            x = rearrange(x, "b d n -> b n d")
-        return x
-    def _postprocess(self, quantize):
-        if not self.channels_last:
-            quantize = rearrange(quantize, "b n d -> b d n")
-        return quantize
-    def encode(self, x):
-        x = self._preprocess(x)
-        x = self.project_in(x)
-        embed_in = self._codebook.encode(x)
-        return embed_in
-    def decode(self, embed_ind):
-        quantize = self._codebook.decode(embed_ind)
-        quantize = self.project_out(quantize)
-        quantize = self._postprocess(quantize)
-        return quantize
-    def forward(self, x):
-        device = x.device
-        x = self._preprocess(x)
-        x = self.project_in(x)
-        quantize, embed_ind = self._codebook(x)
-        if self.training:
-            quantize = x + (quantize - x).detach()
-        loss = torch.tensor([0.0], device=device, requires_grad=self.training)
-        if self.training:
-            if self.commitment_weight > 0:
-                commit_loss = F.mse_loss(quantize.detach(), x)
-                loss = loss + commit_loss * self.commitment_weight
-            if self.orthogonal_reg_weight > 0:
-                codebook = self.codebook
-                if self.orthogonal_reg_active_codes_only:
-                    # only calculate orthogonal loss for the activated codes for this batch
-                    unique_code_ids = torch.unique(embed_ind)
-                    codebook = codebook[unique_code_ids]
-                num_codes = codebook.shape[0]
-                if exists(self.orthogonal_reg_max_codes) and num_codes > self.orthogonal_reg_max_codes:
-                    rand_ids = torch.randperm(num_codes, device=device)[:self.orthogonal_reg_max_codes]
-                    codebook = codebook[rand_ids]
-                orthogonal_reg_loss = orthogonal_loss_fn(codebook)
-                loss = loss + orthogonal_reg_loss * self.orthogonal_reg_weight
-        quantize = self.project_out(quantize)
-        quantize = self._postprocess(quantize)
-        return quantize, embed_ind, loss
-class ResidualVectorQuantization(nn.Module):
-    """Residual vector quantization implementation.
-    Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf
-    """
-    def __init__(self, *, num_quantizers, **kwargs):
-        super().__init__()
-        self.layers = nn.ModuleList(
-            [VectorQuantization(**kwargs) for _ in range(num_quantizers)]
-        )
-    def forward(self, x, n_q: tp.Optional[int] = None):
-        quantized_out = 0.0
-        residual = x
-        all_losses = []
-        all_indices = []
-        n_q = n_q or len(self.layers)
-        for i, layer in enumerate(self.layers[:n_q]):
-            quantized, indices, loss = layer(residual)
-            quantized = quantized.detach()
-            residual = residual - quantized
-            quantized_out = quantized_out + quantized
-            all_indices.append(indices)
-            all_losses.append(loss)
-        if self.training:
-            # Solving subtle bug with STE and RVQ: https://github.com/facebookresearch/encodec/issues/25
-            quantized_out = x + (quantized_out - x).detach()
-        out_losses, out_indices = map(torch.stack, (all_losses, all_indices))
-        return quantized_out, out_indices, out_losses
-    def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None) -> torch.Tensor:
-        residual = x
-        all_indices = []
-        n_q = n_q or len(self.layers)
-        for layer in self.layers[:n_q]:
-            indices = layer.encode(residual)
-            quantized = layer.decode(indices)
-            residual = residual - quantized
-            all_indices.append(indices)
-        out_indices = torch.stack(all_indices)
-        return out_indices
-    def decode(self, q_indices: torch.Tensor) -> torch.Tensor:
-        quantized_out = torch.tensor(0.0, device=q_indices.device)
-        for i, indices in enumerate(q_indices):
-            layer = self.layers[i]
-            quantized = layer.decode(indices)
-            quantized_out = quantized_out + quantized
-        return quantized_out

audiocraft/{quantization/vq.py → vq.py} RENAMED Viewed

@@ -1,19 +1,157 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
 import math
 import typing as tp
 import torch
-from .base import BaseQuantizer, QuantizedResult
-from .core_vq import ResidualVectorQuantization
-class ResidualVectorQuantizer(BaseQuantizer):
     """Residual Vector Quantizer.
     Args:
@@ -59,6 +197,7 @@ class ResidualVectorQuantizer(BaseQuantizer):
         self.orthogonal_reg_weight = orthogonal_reg_weight
         self.orthogonal_reg_active_codes_only = orthogonal_reg_active_codes_only
         self.orthogonal_reg_max_codes = orthogonal_reg_max_codes
         self.vq = ResidualVectorQuantization(
             dim=self.dimension,
             codebook_size=self.bins,
@@ -66,10 +205,6 @@ class ResidualVectorQuantizer(BaseQuantizer):
             decay=self.decay,
             kmeans_init=self.kmeans_init,
             kmeans_iters=self.kmeans_iters,
-            threshold_ema_dead_code=self.threshold_ema_dead_code,
-            orthogonal_reg_weight=self.orthogonal_reg_weight,
-            orthogonal_reg_active_codes_only=self.orthogonal_reg_active_codes_only,
-            orthogonal_reg_max_codes=self.orthogonal_reg_max_codes,
             channels_last=False
         )

 import math
 import typing as tp
+from dataclasses import dataclass, field
+import typing as tp
 import torch
+from torch import nn
+from einops import rearrange
+import torch.nn.functional as F
+@dataclass
+class QuantizedResult:
+    x: torch.Tensor
+    codes: torch.Tensor
+    bandwidth: torch.Tensor  # bandwidth in kb/s used, per batch item.
+    penalty: tp.Optional[torch.Tensor] = None
+    metrics: dict = field(default_factory=dict)
+class EuclideanCodebook(nn.Module):
+    def __init__(
+        self,
+        dim,
+        codebook_size,
+        kmeans_init=False,
+        kmeans_iters=10,
+        decay=0.8,
+        epsilon=1e-5,
+    ):
+        super().__init__()
+        self.decay=decay
+        init_fn=uniform_init if not kmeans_init else torch.zeros
+        embed = init_fn(codebook_size, dim)
+        self.codebook_size = codebook_size
+        self.kmeans_iters = kmeans_iters
+        self.epsilon = epsilon
+        self.register_buffer("inited", torch.Tensor([not kmeans_init]))
+        self.register_buffer("cluster_size", torch.zeros(codebook_size))
+        self.register_buffer("embed", embed)
+        self.register_buffer("embed_avg", embed.clone())
+    @torch.jit.ignore
+    def init_embed_(self, data):
+        if self.inited:
+            return
+        embed, cluster_size = kmeans(data, self.codebook_size, self.kmeans_iters)
+        self.embed.data.copy_(embed)
+        self.embed_avg.data.copy_(embed.clone())
+        self.cluster_size.data.copy_(cluster_size)
+        self.inited.data.copy_(torch.Tensor([True]))
+        # Make sure all buffers across workers are in sync after initialization
+        # flashy.distrib.broadcast_tensors(self.buffers())   # broadcast buffer values to all GPUs
+    def postprocess_emb(self, embed_ind, shape):
+        return embed_ind.view(*shape[:-1])
+    def dequantize(self, embed_ind):
+        quantize = F.embedding(embed_ind, self.embed)
+        # print('\n\nDE QUANT\n\n', quantize.shape)  # (1, 35, 128) -> also arrives here for special_token
+        return quantize
+    def decode(self, embed_ind):
+        quantize = self.dequantize(embed_ind)
+        return quantize
+class VectorQuantization(nn.Module):
+    def __init__(
+        self,
+        dim,
+        codebook_size,
+        codebook_dim=None,
+        decay=0.8,
+        epsilon=1e-5,
+        kmeans_init=False,
+        kmeans_iters=10,
+        channels_last=False,
+    ):
+        super().__init__()
+        # _codebook_dim: int = default(codebook_dim, dim)
+        _codebook_dim = codebook_dim if codebook_dim is not None else dim
+        requires_projection = _codebook_dim != dim
+        self.project_in = (nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity())
+        self.project_out = (nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity())
+        self._codebook = EuclideanCodebook(dim=_codebook_dim,
+                                           codebook_size=codebook_size,
+                                           kmeans_init=kmeans_init,
+                                           kmeans_iters=kmeans_iters,
+                                           decay=decay,
+                                           epsilon=epsilon)
+        self.codebook_size = codebook_size
+        self.channels_last = channels_last
+    @property
+    def codebook(self):
+        return self._codebook.embed
+    @property
+    def inited(self):
+        return self._codebook.inited
+    def _postprocess(self, quantize):
+        if not self.channels_last:
+            quantize = rearrange(quantize, "b n d -> b d n")
+        return quantize
+    def decode(self, embed_ind):
+        quantize = self._codebook.decode(embed_ind)
+        quantize = self.project_out(quantize)
+        quantize = self._postprocess(quantize)
+        return quantize
+class ResidualVectorQuantization(nn.Module):
+    """Residual vector quantization implementation.
+    Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf
+    """
+    def __init__(self, *, num_quantizers, **kwargs):
+        super().__init__()
+        self.layers = nn.ModuleList(
+            [VectorQuantization(**kwargs) for _ in range(num_quantizers)]
+        )
+    def decode(self, q_indices: torch.Tensor) -> torch.Tensor:
+        quantized_out = torch.tensor(0.0, device=q_indices.device)
+        for i, indices in enumerate(q_indices):
+            layer = self.layers[i]
+            quantized = layer.decode(indices)
+            quantized_out = quantized_out + quantized
+        return quantized_out
+# ------------------------------------- END core_vq.py
+class ResidualVectorQuantizer(nn.Module):
     """Residual Vector Quantizer.
     Args:
         self.orthogonal_reg_weight = orthogonal_reg_weight
         self.orthogonal_reg_active_codes_only = orthogonal_reg_active_codes_only
         self.orthogonal_reg_max_codes = orthogonal_reg_max_codes
+        print(f'         {kmeans_init=}\n\n\n\n')
         self.vq = ResidualVectorQuantization(
             dim=self.dimension,
             codebook_size=self.bins,
             decay=self.decay,
             kmeans_init=self.kmeans_init,
             kmeans_iters=self.kmeans_iters,
             channels_last=False
         )

demo.py CHANGED Viewed

@@ -1,12 +1,13 @@
 from audiocraft.audiogen import AudioGen #, audio_write
 print('\n\n\n\n___________________')
-txt = 'austrian music'
 sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
-sound_generator.set_generation_params(duration=4.7)   # why is generating so long at 14 seconds
 x = sound_generator.generate([txt])[0].detach().cpu().numpy()[0, :]
 x /= np.abs(x).max() + 1e-7

 from audiocraft.audiogen import AudioGen #, audio_write
+import audiofile
+import numpy as np
 print('\n\n\n\n___________________')
+txt = 'sea waves rock crash pirates'
 sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
+sound_generator.set_generation_params(duration=.7)   # why is generating so long at 14 seconds
 x = sound_generator.generate([txt])[0].detach().cpu().numpy()[0, :]
 x /= np.abs(x).max() + 1e-7