Dionyssos committed
Commit 5067878 · 1 Parent(s): c5e1f80
README.md CHANGED
@@ -2,20 +2,17 @@
 license: mit
 language:
 - en
-pipeline_tag: text-to-speech
+pipeline_tag: audio-generation
 tags:
 - audiocraft
 - audiogen
 - styletts2
-- audio
-- synthesis
 - shift
 - audeering
-- dkounadis
 - sound
-- scene
-- acoustic-scene
 - audio-generation
+- text-to-speech
+- mimic3
 ---
 
 
@@ -35,13 +32,23 @@ tags:
 
 ```
 git clone https://huggingface.co/dkounadis/artificial-styletts2
+```
+
+<details>
+<summary>
+Create virtualenv
+</summary>
 
+```
 virtualenv --python=python3 ~/.envs/.my_env
 source ~/.envs/.my_env/bin/activate
 cd artificial-styletts2/
 pip install -r requirements.txt
 ```
 
+
+</details>
+
 Start Flask
 
 ```
@@ -128,4 +135,10 @@ Client - Describe any sound with words and it will be played back to you.
 
 ```python
 python live_demo.py # will ask text input & play soundscape
+```
+
+# Simple Demo
+
+```python
+CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=4 python demo.py
 ```
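The updated README describes a Flask server plus a live client (`live_demo.py`) that asks for a text prompt and plays back the generated soundscape. As a rough, non-authoritative illustration of that client side, the sketch below posts a description to a locally running server and saves the response as a WAV file; the URL, the `text` form field, and the raw-WAV response are assumptions for illustration, not the repository's documented API.

```python
# Hypothetical client sketch: the endpoint URL, "text" field and WAV response are assumptions.
import requests

description = input("Describe a sound: ")        # e.g. "dogs barking in the rain"
resp = requests.post("http://localhost:5000/",   # assumed address of the Flask app
                     data={"text": description},
                     timeout=300)
resp.raise_for_status()
with open("soundscape.wav", "wb") as f:          # save whatever audio the server returned
    f.write(resp.content)
print("wrote soundscape.wav")
```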
audiocraft/builders.py CHANGED
@@ -15,7 +15,7 @@ import audiocraft
 import omegaconf
 import torch
 
-from .encodec import CompressionModel, EncodecModel, InterleaveStereoCompressionModel
+from .encodec import CompressionModel, EncodecModel
 from .lm import LMModel
 from .seanet import SEANetEncoder, SEANetDecoder
 from .codebooks_patterns import (
@@ -211,20 +211,3 @@ def get_processor(cfg, sample_rate: int = 24000):
         if cfg.name == "multi_band_processor":
             sample_processor = MultiBandProcessor(sample_rate=sample_rate, **kw)
     return sample_processor
-
-
-
-
-
-def get_wrapped_compression_model(
-        compression_model: CompressionModel,
-        cfg: omegaconf.DictConfig) -> CompressionModel:
-    if hasattr(cfg, 'interleave_stereo_codebooks'):
-        if cfg.interleave_stereo_codebooks.use:
-            kwargs = dict_from_config(cfg.interleave_stereo_codebooks)
-            kwargs.pop('use')
-            compression_model = InterleaveStereoCompressionModel(compression_model, **kwargs)
-    if hasattr(cfg, 'compression_model_n_q'):
-        if cfg.compression_model_n_q is not None:
-            compression_model.set_num_codebooks(cfg.compression_model_n_q)
-    return compression_model
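For context on the dropped helper: `get_wrapped_compression_model` only applied optional, config-driven tweaks to a compression model. A minimal sketch of that pattern, using plain `omegaconf` and a stand-in `DummyCodec` in place of the real `CompressionModel`:

```python
from omegaconf import OmegaConf

class DummyCodec:
    """Stand-in for CompressionModel; only the call used by the removed helper."""
    def set_num_codebooks(self, n: int) -> None:
        print(f"using {n} codebooks")

cfg = OmegaConf.create({
    "interleave_stereo_codebooks": {"use": False},   # stereo wrapping disabled
    "compression_model_n_q": 4,                      # restrict the active codebooks
})

codec = DummyCodec()
# same checks the removed get_wrapped_compression_model performed
if hasattr(cfg, "interleave_stereo_codebooks") and cfg.interleave_stereo_codebooks.use:
    pass  # would wrap the codec for stereo; dropped together with InterleaveStereoCompressionModel
if hasattr(cfg, "compression_model_n_q") and cfg.compression_model_n_q is not None:
    codec.set_num_codebooks(cfg.compression_model_n_q)
```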
audiocraft/conditioners.py CHANGED
@@ -1,5 +1,4 @@
 from collections import defaultdict
-from copy import deepcopy
 from dataclasses import dataclass, field
 from itertools import chain
 import logging
@@ -10,20 +9,12 @@ import re
 import typing as tp
 import warnings
 import soundfile
-from num2words import num2words
-import spacy
 from transformers import T5EncoderModel, T5Tokenizer  # type: ignore
 import torch
 from torch import nn
-import torch.nn.functional as F
-from torch.nn.utils.rnn import pad_sequence
 from .streaming import StreamingModule
 
 
-from .streaming import StreamingModule
-from .transformer import create_sin_embedding
-
-
 from .quantization import ResidualVectorQuantizer
 from .utils.autocast import TorchAutocast
 from .utils.cache import EmbeddingCache
@@ -112,102 +103,10 @@ class Tokenizer:
         raise NotImplementedError()
 
 
-class WhiteSpaceTokenizer(Tokenizer):
-    """This tokenizer should be used for natural language descriptions.
-    For example:
-        ["he didn't, know he's going home.", 'shorter sentence'] =>
-        [[78, 62, 31, 4, 78, 25, 19, 34],
-        [59, 77, 0, 0, 0, 0, 0, 0]]
-    """
-    PUNCTUATION = "?:!.,;"
-
-    def __init__(self, n_bins: int, pad_idx: int = 0, language: str = "en_core_web_sm",
-                 lemma: bool = True, stopwords: bool = True) -> None:
-        self.n_bins = n_bins
-        self.pad_idx = pad_idx
-        self.lemma = lemma
-        self.stopwords = stopwords
-        try:
-            self.nlp = spacy.load(language)
-        except IOError:
-            spacy.cli.download(language)  # type: ignore
-            self.nlp = spacy.load(language)
-
-    @tp.no_type_check
-    def __call__(self, texts: tp.List[tp.Optional[str]],
-                 return_text: bool = False) -> tp.Tuple[torch.Tensor, torch.Tensor]:
-        """Take a list of strings and convert them to a tensor of indices.
-
-        Args:
-            texts (list[str]): List of strings.
-            return_text (bool, optional): Whether to return text as additional tuple item. Defaults to False.
-        Returns:
-            tuple[torch.Tensor, torch.Tensor]:
-                - Indices of words in the LUT.
-                - And a mask indicating where the padding tokens are
-        """
-        output, lengths = [], []
-        texts = deepcopy(texts)
-        for i, text in enumerate(texts):
-            # if current sample doesn't have a certain attribute, replace with pad token
-            if text is None:
-                output.append(torch.Tensor([self.pad_idx]))
-                lengths.append(0)
-                continue
-
-            # convert numbers to words
-            text = re.sub(r"(\d+)", lambda x: num2words(int(x.group(0))), text)  # type: ignore
-            # normalize text
-            text = self.nlp(text)  # type: ignore
-            # remove stopwords
-            if self.stopwords:
-                text = [w for w in text if not w.is_stop]  # type: ignore
-            # remove punctuation
-            text = [w for w in text if w.text not in self.PUNCTUATION]  # type: ignore
-            # lemmatize if needed
-            text = [getattr(t, "lemma_" if self.lemma else "text") for t in text]  # type: ignore
-
-            texts[i] = " ".join(text)
-            lengths.append(len(text))
-            # convert to tensor
-            tokens = torch.Tensor([hash_trick(w, self.n_bins) for w in text])
-            output.append(tokens)
-
-        mask = length_to_mask(torch.IntTensor(lengths)).int()
-        padded_output = pad_sequence(output, padding_value=self.pad_idx).int().t()
-        if return_text:
-            return padded_output, mask, texts  # type: ignore
-        return padded_output, mask
-
-
-class NoopTokenizer(Tokenizer):
-    """This tokenizer should be used for global conditioners such as: artist, genre, key, etc.
-    The difference between this and WhiteSpaceTokenizer is that NoopTokenizer does not split
-    strings, so "Jeff Buckley" will get it's own index. Whereas WhiteSpaceTokenizer will
-    split it to ["Jeff", "Buckley"] and return an index per word.
-
-    For example:
-        ["Queen", "ABBA", "Jeff Buckley"] => [43, 55, 101]
-        ["Metal", "Rock", "Classical"] => [0, 223, 51]
-    """
-    def __init__(self, n_bins: int, pad_idx: int = 0):
-        self.n_bins = n_bins
-        self.pad_idx = pad_idx
-
-    def __call__(self, texts: tp.List[tp.Optional[str]]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
-        output, lengths = [], []
-        for text in texts:
-            # if current sample doesn't have a certain attribute, replace with pad token
-            if text is None:
-                output.append(self.pad_idx)
-                lengths.append(0)
-            else:
-                output.append(hash_trick(text, self.n_bins))
-                lengths.append(1)
-
-        tokens = torch.LongTensor(output).unsqueeze(1)
-        mask = length_to_mask(torch.IntTensor(lengths)).int()
-        return tokens, mask
+
+
 
 
 class BaseConditioner(nn.Module):
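Both deleted tokenizers reduce to hashing strings into a fixed number of bins and building a padding mask. The sketch below shows that core idea in plain torch; `hash_to_bins` is a simple stand-in for audiocraft's `hash_trick`, and this is not the removed classes themselves.

```python
import torch
from torch.nn.utils.rnn import pad_sequence

def hash_to_bins(word: str, n_bins: int) -> int:
    # stand-in for audiocraft's hash_trick: map any string to one of n_bins ids
    return hash(word) % n_bins

def tokenize(texts, n_bins: int = 1000, pad_idx: int = 0):
    """Whitespace-split each description, hash every word to an id, pad to equal length."""
    ids = [torch.tensor([hash_to_bins(w, n_bins) for w in t.split()], dtype=torch.long)
           for t in texts]
    lengths = torch.tensor([len(x) for x in ids])
    tokens = pad_sequence(ids, batch_first=True, padding_value=pad_idx)
    mask = torch.arange(tokens.shape[1])[None, :] < lengths[:, None]   # True on real tokens
    return tokens, mask.int()

tokens, mask = tokenize(["dogs barking in the rain", "water"])
print(tokens.shape, mask)
```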
audiocraft/encodec.py CHANGED
@@ -256,251 +256,4 @@ class EncodecModel(CompressionModel):
256
 
257
  def decode_latent(self, codes: torch.Tensor):
258
  """Decode from the discrete codes to continuous latent space."""
259
- return self.quantizer.decode(codes)
260
-
261
-
262
- class DAC(CompressionModel):
263
- def __init__(self, model_type: str = "44khz"):
264
- super().__init__()
265
- try:
266
- import dac.utils
267
- except ImportError:
268
- raise RuntimeError("Could not import dac, make sure it is installed, "
269
- "please run `pip install descript-audio-codec`")
270
- self.model = dac.utils.load_model(model_type=model_type)
271
- self.n_quantizers = self.total_codebooks
272
- self.model.eval()
273
-
274
- def forward(self, x: torch.Tensor) -> qt.QuantizedResult:
275
- # We don't support training with this.
276
- raise NotImplementedError("Forward and training with DAC not supported.")
277
-
278
- def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
279
- codes = self.model.encode(x, self.n_quantizers)[1]
280
- return codes[:, :self.n_quantizers], None
281
-
282
- def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
283
- assert scale is None
284
- z_q = self.decode_latent(codes)
285
- return self.model.decode(z_q)
286
-
287
- def decode_latent(self, codes: torch.Tensor):
288
- """Decode from the discrete codes to continuous latent space."""
289
- return self.model.quantizer.from_codes(codes)[0]
290
-
291
- @property
292
- def channels(self) -> int:
293
- return 1
294
-
295
- @property
296
- def frame_rate(self) -> float:
297
- return self.model.sample_rate / self.model.hop_length
298
-
299
- @property
300
- def sample_rate(self) -> int:
301
- return self.model.sample_rate
302
-
303
- @property
304
- def cardinality(self) -> int:
305
- return self.model.codebook_size
306
-
307
- @property
308
- def num_codebooks(self) -> int:
309
- return self.n_quantizers
310
-
311
- @property
312
- def total_codebooks(self) -> int:
313
- return self.model.n_codebooks
314
-
315
- def set_num_codebooks(self, n: int):
316
- """Set the active number of codebooks used by the quantizer.
317
- """
318
- assert n >= 1
319
- assert n <= self.total_codebooks
320
- self.n_quantizers = n
321
-
322
-
323
- class HFEncodecCompressionModel(CompressionModel):
324
- """Wrapper around HuggingFace Encodec.
325
- """
326
- def __init__(self, model: HFEncodecModel):
327
- super().__init__()
328
- self.model = model
329
- bws = self.model.config.target_bandwidths
330
- num_codebooks = [
331
- bw * 1000 / (self.frame_rate * math.log2(self.cardinality))
332
- for bw in bws
333
- ]
334
- deltas = [nc - int(nc) for nc in num_codebooks]
335
- # Checking we didn't do some bad maths and we indeed have integers!
336
- assert all(deltas) <= 1e-3, deltas
337
- self.possible_num_codebooks = [int(nc) for nc in num_codebooks]
338
- self.set_num_codebooks(max(self.possible_num_codebooks))
339
-
340
- def forward(self, x: torch.Tensor) -> qt.QuantizedResult:
341
- # We don't support training with this.
342
- raise NotImplementedError("Forward and training with HF EncodecModel not supported.")
343
-
344
- def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
345
- bandwidth_index = self.possible_num_codebooks.index(self.num_codebooks)
346
- bandwidth = self.model.config.target_bandwidths[bandwidth_index]
347
- res = self.model.encode(x, None, bandwidth)
348
- assert len(res[0]) == 1
349
- assert len(res[1]) == 1
350
- return res[0][0], res[1][0]
351
-
352
- def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
353
- if scale is None:
354
- scales = [None] # type: ignore
355
- else:
356
- scales = scale # type: ignore
357
- res = self.model.decode(codes[None], scales)
358
- return res[0]
359
-
360
- def decode_latent(self, codes: torch.Tensor):
361
- """Decode from the discrete codes to continuous latent space."""
362
- return self.model.quantizer.decode(codes.transpose(0, 1))
363
-
364
- @property
365
- def channels(self) -> int:
366
- return self.model.config.audio_channels
367
-
368
- @property
369
- def frame_rate(self) -> float:
370
- hop_length = int(np.prod(self.model.config.upsampling_ratios))
371
- return self.sample_rate / hop_length
372
-
373
- @property
374
- def sample_rate(self) -> int:
375
- return self.model.config.sampling_rate
376
-
377
- @property
378
- def cardinality(self) -> int:
379
- return self.model.config.codebook_size
380
-
381
- @property
382
- def num_codebooks(self) -> int:
383
- return self._num_codebooks
384
-
385
- @property
386
- def total_codebooks(self) -> int:
387
- return max(self.possible_num_codebooks)
388
-
389
- def set_num_codebooks(self, n: int):
390
- """Set the active number of codebooks used by the quantizer.
391
- """
392
- if n not in self.possible_num_codebooks:
393
- raise ValueError(f"Allowed values for num codebooks: {self.possible_num_codebooks}")
394
- self._num_codebooks = n
395
-
396
-
397
- class InterleaveStereoCompressionModel(CompressionModel):
398
- """Wraps a CompressionModel to support stereo inputs. The wrapped model
399
- will be applied independently to the left and right channels, and both codebooks
400
- will be interleaved. If the wrapped model returns a representation `[B, K ,T]` per
401
- channel, then the output will be `[B, K * 2, T]` or `[B, K, T * 2]` depending on
402
- `per_timestep`.
403
-
404
- Args:
405
- model (CompressionModel): Compression model to wrap.
406
- per_timestep (bool): Whether to interleave on the timestep dimension
407
- or on the codebooks dimension.
408
- """
409
- def __init__(self, model: CompressionModel, per_timestep: bool = False):
410
- super().__init__()
411
- self.model = model
412
- self.per_timestep = per_timestep
413
- assert self.model.channels == 1, "Wrapped model is expected to be for monophonic audio"
414
-
415
- @property
416
- def total_codebooks(self):
417
- return self.model.total_codebooks
418
-
419
- @property
420
- def num_codebooks(self):
421
- """Active number of codebooks used by the quantizer.
422
-
423
- ..Warning:: this reports the number of codebooks after the interleaving
424
- of the codebooks!
425
- """
426
- return self.model.num_codebooks if self.per_timestep else self.model.num_codebooks * 2
427
-
428
- def set_num_codebooks(self, n: int):
429
- """Set the active number of codebooks used by the quantizer.
430
-
431
- ..Warning:: this sets the number of codebooks before the interleaving!
432
- """
433
- self.model.set_num_codebooks(n)
434
-
435
- @property
436
- def num_virtual_steps(self) -> float:
437
- """Return the number of virtual steps, e.g. one real step
438
- will be split into that many steps.
439
- """
440
- return 2 if self.per_timestep else 1
441
-
442
- @property
443
- def frame_rate(self) -> float:
444
- return self.model.frame_rate * self.num_virtual_steps
445
-
446
- @property
447
- def sample_rate(self) -> int:
448
- return self.model.sample_rate
449
-
450
- @property
451
- def channels(self) -> int:
452
- return 2
453
-
454
- @property
455
- def cardinality(self):
456
- """Cardinality of each codebook.
457
- """
458
- return self.model.cardinality
459
-
460
- def forward(self, x: torch.Tensor) -> qt.QuantizedResult:
461
- raise NotImplementedError("Not supported, use encode and decode.")
462
-
463
- def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
464
- B, C, T = x.shape
465
- assert C == self.channels, f"Expecting stereo audio but audio num channels is {C}"
466
-
467
- indices_c0, scales_c0 = self.model.encode(x[:, 0, ...].unsqueeze(1))
468
- indices_c1, scales_c1 = self.model.encode(x[:, 1, ...].unsqueeze(1))
469
- indices = torch.stack([indices_c0, indices_c1], dim=0)
470
- scales: tp.Optional[torch.Tensor] = None
471
- if scales_c0 is not None and scales_c1 is not None:
472
- scales = torch.stack([scales_c0, scales_c1], dim=1)
473
-
474
- if self.per_timestep:
475
- indices = rearrange(indices, 'c b k t -> b k (t c)', c=2)
476
- else:
477
- indices = rearrange(indices, 'c b k t -> b (k c) t', c=2)
478
-
479
- return (indices, scales)
480
-
481
- def get_left_right_codes(self, codes: torch.Tensor) -> tp.Tuple[torch.Tensor, torch.Tensor]:
482
- if self.per_timestep:
483
- codes = rearrange(codes, 'b k (t c) -> c b k t', c=2)
484
- else:
485
- codes = rearrange(codes, 'b (k c) t -> c b k t', c=2)
486
- return codes[0], codes[1]
487
-
488
- def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
489
- B, K, T = codes.shape
490
- assert T % self.num_virtual_steps == 0, "Provided codes' number of timesteps does not match"
491
- assert K == self.num_codebooks, "Provided codes' number of codebooks does not match"
492
-
493
- scale_c0, scale_c1 = None, None
494
- if scale is not None:
495
- assert scale.size(0) == B and scale.size(1) == 2, f"Scale has unexpected shape: {scale.shape}"
496
- scale_c0 = scale[0, ...]
497
- scale_c1 = scale[1, ...]
498
-
499
- codes_c0, codes_c1 = self.get_left_right_codes(codes)
500
- audio_c0 = self.model.decode(codes_c0, scale_c0)
501
- audio_c1 = self.model.decode(codes_c1, scale_c1)
502
- return torch.cat([audio_c0, audio_c1], dim=1)
503
-
504
- def decode_latent(self, codes: torch.Tensor):
505
- """Decode from the discrete codes to continuous latent space."""
506
- raise NotImplementedError("Not supported by interleaved stereo wrapped models.")
 
 
     def decode_latent(self, codes: torch.Tensor):
         """Decode from the discrete codes to continuous latent space."""
+        return self.quantizer.decode(codes)
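The deleted `InterleaveStereoCompressionModel` encoded the left and right channels independently and interleaved the two code streams, turning per-channel codes of shape `[B, K, T]` into `[B, 2K, T]` (per-codebook) or `[B, K, 2T]` (per-timestep). A small self-contained sketch of just that interleaving step, on dummy code tensors:

```python
import torch
from einops import rearrange

B, K, T = 2, 4, 10
codes_left = torch.randint(0, 1024, (B, K, T))    # dummy codes for the left channel
codes_right = torch.randint(0, 1024, (B, K, T))   # dummy codes for the right channel
stacked = torch.stack([codes_left, codes_right], dim=0)           # [2, B, K, T]

per_codebook = rearrange(stacked, 'c b k t -> b (k c) t', c=2)    # [B, 2K, T]
per_timestep = rearrange(stacked, 'c b k t -> b k (t c)', c=2)    # [B, K, 2T]

# undo the interleaving (the deleted get_left_right_codes did exactly this)
left, right = rearrange(per_codebook, 'b (k c) t -> c b k t', c=2)
assert torch.equal(left, codes_left) and torch.equal(right, codes_right)
print(per_codebook.shape, per_timestep.shape)
```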
audiocraft/genmodel.py CHANGED
@@ -6,7 +6,6 @@ import torch
 
 from .encodec import CompressionModel
 from .lm import LMModel
-from .builders import get_wrapped_compression_model
 from .utils.audio_utils import convert_audio
 from .conditioners import ConditioningAttributes
 from .utils.autocast import TorchAutocast
@@ -38,9 +37,6 @@ class BaseGenModel(ABC):
         assert isinstance(cfg, omegaconf.DictConfig)
         self.cfg = cfg
 
-        if self.cfg is not None:
-            self.compression_model = get_wrapped_compression_model(self.compression_model, self.cfg)
-
         if max_duration is None:
             if self.cfg is not None:
                 max_duration = lm.cfg.dataset.segment_duration  # type: ignore
audiocraft/multibanddiffusion.py DELETED
@@ -1,392 +0,0 @@
1
- #====================================== From CompressionSolver.py
2
-
3
- # Copyright (c) Meta Platforms, Inc. and affiliates.
4
- # All rights reserved.
5
- #
6
- # This source code is licensed under the license found in the
7
- # LICENSE file in the root directory of this source tree.
8
-
9
- import logging
10
- import multiprocessing
11
- from pathlib import Path
12
- import typing as tp
13
-
14
- import flashy
15
- import omegaconf
16
- import torch
17
- from torch import nn
18
-
19
- # from . import base, builders
20
- from .. import models, quantization
21
- from ..utils import checkpoint
22
- from ..utils.samples.manager import SampleManager
23
- from ..utils.utils import get_pool_executor
24
-
25
-
26
-
27
-
28
-
29
- class CompressionSolver(): #base.StandardSolver):
30
- """Solver for compression task.
31
-
32
- The compression task combines a set of perceptual and objective losses
33
- to train an EncodecModel (composed of an encoder-decoder and a quantizer)
34
- to perform high fidelity audio reconstruction.
35
- """
36
- def __init__(self, cfg: omegaconf.DictConfig):
37
- # super().__init__(cfg)
38
- self.cfg = cfg
39
- self.rng: torch.Generator # set at each epoch
40
- self.adv_losses = builders.get_adversarial_losses(self.cfg)
41
- self.aux_losses = nn.ModuleDict()
42
- self.info_losses = nn.ModuleDict()
43
- assert not cfg.fsdp.use, "FSDP not supported by CompressionSolver."
44
- loss_weights = dict()
45
- for loss_name, weight in self.cfg.losses.items():
46
- if loss_name in ['adv', 'feat']:
47
- for adv_name, _ in self.adv_losses.items():
48
- loss_weights[f'{loss_name}_{adv_name}'] = weight
49
- elif weight > 0:
50
- self.aux_losses[loss_name] = builders.get_loss(loss_name, self.cfg)
51
- loss_weights[loss_name] = weight
52
- else:
53
- self.info_losses[loss_name] = builders.get_loss(loss_name, self.cfg)
54
- self.balancer = builders.get_balancer(loss_weights, self.cfg.balancer)
55
- self.register_stateful('adv_losses')
56
-
57
- @property
58
- def best_metric_name(self) -> tp.Optional[str]:
59
- # best model is the last for the compression model
60
- return None
61
-
62
- def build_model(self):
63
- """Instantiate model and optimizer."""
64
- # Model and optimizer
65
- self.model = models.builders.get_compression_model(self.cfg).to(self.device)
66
- self.optimizer = builders.get_optimizer(self.model.parameters(), self.cfg.optim)
67
- self.register_stateful('model', 'optimizer')
68
- self.register_best_state('model')
69
- self.register_ema('model')
70
-
71
-
72
-
73
- def evaluate(self):
74
- """Evaluate stage. Runs audio reconstruction evaluation."""
75
- self.model.eval()
76
- evaluate_stage_name = str(self.current_stage)
77
-
78
- loader = self.dataloaders['evaluate']
79
- updates = len(loader)
80
- lp = self.log_progress(f'{evaluate_stage_name} inference', loader, total=updates, updates=self.log_updates)
81
- average = flashy.averager()
82
-
83
- pendings = []
84
- ctx = multiprocessing.get_context('spawn')
85
- with get_pool_executor(self.cfg.evaluate.num_workers, mp_context=ctx) as pool:
86
- for idx, batch in enumerate(lp):
87
- x = batch.to(self.device)
88
- with torch.no_grad():
89
- qres = self.model(x)
90
-
91
- y_pred = qres.x.cpu()
92
- y = batch.cpu() # should already be on CPU but just in case
93
- pendings.append(pool.submit(evaluate_audio_reconstruction, y_pred, y, self.cfg))
94
-
95
- metrics_lp = self.log_progress(f'{evaluate_stage_name} metrics', pendings, updates=self.log_updates)
96
- for pending in metrics_lp:
97
- metrics = pending.result()
98
- metrics = average(metrics)
99
-
100
- metrics = flashy.distrib.average_metrics(metrics, len(loader))
101
- return metrics
102
-
103
- def generate(self):
104
- """Generate stage."""
105
- self.model.eval()
106
- sample_manager = SampleManager(self.xp, map_reference_to_sample_id=True)
107
- generate_stage_name = str(self.current_stage)
108
-
109
- loader = self.dataloaders['generate']
110
- updates = len(loader)
111
- lp = self.log_progress(generate_stage_name, loader, total=updates, updates=self.log_updates)
112
-
113
- for batch in lp:
114
- reference, _ = batch
115
- reference = reference.to(self.device)
116
- with torch.no_grad():
117
- qres = self.model(reference)
118
- assert isinstance(qres, quantization.QuantizedResult)
119
-
120
- reference = reference.cpu()
121
- estimate = qres.x.cpu()
122
- sample_manager.add_samples(estimate, self.epoch, ground_truth_wavs=reference)
123
-
124
- flashy.distrib.barrier()
125
-
126
- def load_from_pretrained(self, name: str) -> dict:
127
- model = models.CompressionModel.get_pretrained(name)
128
- if isinstance(model, models.DAC):
129
- raise RuntimeError("Cannot fine tune a DAC model.")
130
- elif isinstance(model, models.HFEncodecCompressionModel):
131
- self.logger.warning('Trying to automatically convert a HuggingFace model '
132
- 'to AudioCraft, this might fail!')
133
- state = model.model.state_dict()
134
- new_state = {}
135
- for k, v in state.items():
136
- if k.startswith('decoder.layers') and '.conv.' in k and '.block.' not in k:
137
- # We need to determine if this a convtr or a regular conv.
138
- layer = int(k.split('.')[2])
139
- if isinstance(model.model.decoder.layers[layer].conv, torch.nn.ConvTranspose1d):
140
-
141
- k = k.replace('.conv.', '.convtr.')
142
- k = k.replace('encoder.layers.', 'encoder.model.')
143
- k = k.replace('decoder.layers.', 'decoder.model.')
144
- k = k.replace('conv.', 'conv.conv.')
145
- k = k.replace('convtr.', 'convtr.convtr.')
146
- k = k.replace('quantizer.layers.', 'quantizer.vq.layers.')
147
- k = k.replace('.codebook.', '._codebook.')
148
- new_state[k] = v
149
- state = new_state
150
- elif isinstance(model, models.EncodecModel):
151
- state = model.state_dict()
152
- else:
153
- raise RuntimeError(f"Cannot fine tune model type {type(model)}.")
154
- return {
155
- 'best_state': {'model': state}
156
- }
157
-
158
- @staticmethod
159
- def model_from_checkpoint(checkpoint_path: tp.Union[Path, str],
160
- device: tp.Union[torch.device, str] = 'cpu') -> models.CompressionModel:
161
- """Instantiate a CompressionModel from a given checkpoint path or dora sig.
162
- This method is a convenient endpoint to load a CompressionModel to use in other solvers.
163
-
164
- Args:
165
- checkpoint_path (Path or str): Path to checkpoint or dora sig from where the checkpoint is resolved.
166
- This also supports pre-trained models by using a path of the form //pretrained/NAME.
167
- See `model_from_pretrained` for a list of supported pretrained models.
168
- use_ema (bool): Use EMA variant of the model instead of the actual model.
169
- device (torch.device or str): Device on which the model is loaded.
170
- """
171
- checkpoint_path = str(checkpoint_path)
172
- if checkpoint_path.startswith('//pretrained/'):
173
- name = checkpoint_path.split('/', 3)[-1]
174
- return models.CompressionModel.get_pretrained(name, device)
175
- logger = logging.getLogger(__name__)
176
- logger.info(f"Loading compression model from checkpoint: {checkpoint_path}")
177
- _checkpoint_path = checkpoint.resolve_checkpoint_path(checkpoint_path, use_fsdp=False)
178
- assert _checkpoint_path is not None, f"Could not resolve compression model checkpoint path: {checkpoint_path}"
179
- state = checkpoint.load_checkpoint(_checkpoint_path)
180
- assert state is not None and 'xp.cfg' in state, f"Could not load compression model from ckpt: {checkpoint_path}"
181
- cfg = state['xp.cfg']
182
- cfg.device = device
183
- compression_model = models.builders.get_compression_model(cfg).to(device)
184
- assert compression_model.sample_rate == cfg.sample_rate, "Compression model sample rate should match"
185
-
186
- assert 'best_state' in state and state['best_state'] != {}
187
- assert 'exported' not in state, "When loading an exported checkpoint, use the //pretrained/ prefix."
188
- compression_model.load_state_dict(state['best_state']['model'])
189
- compression_model.eval()
190
- logger.info("Compression model loaded!")
191
- return compression_model
192
-
193
- @staticmethod
194
- def wrapped_model_from_checkpoint(cfg: omegaconf.DictConfig,
195
- checkpoint_path: tp.Union[Path, str],
196
- device: tp.Union[torch.device, str] = 'cpu') -> models.CompressionModel:
197
- """Instantiate a wrapped CompressionModel from a given checkpoint path or dora sig.
198
-
199
- Args:
200
- cfg (omegaconf.DictConfig): Configuration to read from for wrapped mode.
201
- checkpoint_path (Path or str): Path to checkpoint or dora sig from where the checkpoint is resolved.
202
- use_ema (bool): Use EMA variant of the model instead of the actual model.
203
- device (torch.device or str): Device on which the model is loaded.
204
- """
205
- compression_model = CompressionSolver.model_from_checkpoint(checkpoint_path, device)
206
- compression_model = models.builders.get_wrapped_compression_model(compression_model, cfg)
207
- return compression_model
208
-
209
-
210
-
211
-
212
-
213
- #=========================================================================== ORIG
214
-
215
- import typing as tp
216
-
217
- import torch
218
- import julius
219
-
220
- from .unet import DiffusionUnet
221
- from ..modules.diffusion_schedule import NoiseSchedule
222
- from .encodec import CompressionModel
223
- from .loaders import load_compression_model, load_diffusion_models
224
-
225
-
226
- class DiffusionProcess:
227
- """Sampling for a diffusion Model.
228
-
229
- Args:
230
- model (DiffusionUnet): Diffusion U-Net model.
231
- noise_schedule (NoiseSchedule): Noise schedule for diffusion process.
232
- """
233
- def __init__(self, model: DiffusionUnet, noise_schedule: NoiseSchedule) -> None:
234
- self.model = model
235
- self.schedule = noise_schedule
236
-
237
- def generate(self, condition: torch.Tensor, initial_noise: torch.Tensor,
238
- step_list: tp.Optional[tp.List[int]] = None):
239
- """Perform one diffusion process to generate one of the bands.
240
-
241
- Args:
242
- condition (torch.Tensor): The embeddings from the compression model.
243
- initial_noise (torch.Tensor): The initial noise to start the process.
244
- """
245
- return self.schedule.generate_subsampled(model=self.model, initial=initial_noise, step_list=step_list,
246
- condition=condition)
247
-
248
-
249
- class MultiBandDiffusion:
250
- """Sample from multiple diffusion models.
251
-
252
- Args:
253
- DPs (list of DiffusionProcess): Diffusion processes.
254
- codec_model (CompressionModel): Underlying compression model used to obtain discrete tokens.
255
- """
256
- def __init__(self, DPs: tp.List[DiffusionProcess], codec_model: CompressionModel) -> None:
257
- self.DPs = DPs
258
- self.codec_model = codec_model
259
- self.device = next(self.codec_model.parameters()).device
260
-
261
- @property
262
- def sample_rate(self) -> int:
263
- return self.codec_model.sample_rate
264
-
265
- @staticmethod
266
- def get_mbd_musicgen(device=None):
267
- """Load our diffusion models trained for MusicGen."""
268
- if device is None:
269
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
270
- path = 'facebook/multiband-diffusion'
271
- filename = 'mbd_musicgen_32khz.th'
272
- name = 'facebook/musicgen-small'
273
- codec_model = load_compression_model(name, device=device)
274
- models, processors, cfgs = load_diffusion_models(path, filename=filename, device=device)
275
- DPs = []
276
- for i in range(len(models)):
277
- schedule = NoiseSchedule(**cfgs[i].schedule, sample_processor=processors[i], device=device)
278
- DPs.append(DiffusionProcess(model=models[i], noise_schedule=schedule))
279
- return MultiBandDiffusion(DPs=DPs, codec_model=codec_model)
280
-
281
- @staticmethod
282
- def get_mbd_24khz(bw: float = 3.0,
283
- device: tp.Optional[tp.Union[torch.device, str]] = None,
284
- n_q: tp.Optional[int] = None):
285
- """Get the pretrained Models for MultibandDiffusion.
286
-
287
- Args:
288
- bw (float): Bandwidth of the compression model.
289
- device (torch.device or str, optional): Device on which the models are loaded.
290
- n_q (int, optional): Number of quantizers to use within the compression model.
291
- """
292
- if device is None:
293
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
294
- assert bw in [1.5, 3.0, 6.0], f"bandwidth {bw} not available"
295
- if n_q is not None:
296
- assert n_q in [2, 4, 8]
297
- assert {1.5: 2, 3.0: 4, 6.0: 8}[bw] == n_q, \
298
- f"bandwidth and number of codebooks missmatch to use n_q = {n_q} bw should be {n_q * (1.5 / 2)}"
299
- n_q = {1.5: 2, 3.0: 4, 6.0: 8}[bw]
300
- codec_model = CompressionSolver.model_from_checkpoint(
301
- '//pretrained/facebook/encodec_24khz', device=device)
302
- codec_model.set_num_codebooks(n_q)
303
- codec_model = codec_model.to(device)
304
- path = 'facebook/multiband-diffusion'
305
- filename = f'mbd_comp_{n_q}.pt'
306
- models, processors, cfgs = load_diffusion_models(path, filename=filename, device=device)
307
- DPs = []
308
- for i in range(len(models)):
309
- schedule = NoiseSchedule(**cfgs[i].schedule, sample_processor=processors[i], device=device)
310
- DPs.append(DiffusionProcess(model=models[i], noise_schedule=schedule))
311
- return MultiBandDiffusion(DPs=DPs, codec_model=codec_model)
312
-
313
- @torch.no_grad()
314
- def get_condition(self, wav: torch.Tensor, sample_rate: int) -> torch.Tensor:
315
- """Get the conditioning (i.e. latent representations of the compression model) from a waveform.
316
- Args:
317
- wav (torch.Tensor): The audio that we want to extract the conditioning from.
318
- sample_rate (int): Sample rate of the audio."""
319
- if sample_rate != self.sample_rate:
320
- wav = julius.resample_frac(wav, sample_rate, self.sample_rate)
321
- codes, scale = self.codec_model.encode(wav)
322
- assert scale is None, "Scaled compression models not supported."
323
- emb = self.get_emb(codes)
324
- return emb
325
-
326
- @torch.no_grad()
327
- def get_emb(self, codes: torch.Tensor):
328
- """Get latent representation from the discrete codes.
329
- Args:
330
- codes (torch.Tensor): Discrete tokens."""
331
- emb = self.codec_model.decode_latent(codes)
332
- return emb
333
-
334
- def generate(self, emb: torch.Tensor, size: tp.Optional[torch.Size] = None,
335
- step_list: tp.Optional[tp.List[int]] = None):
336
- """Generate waveform audio from the latent embeddings of the compression model.
337
- Args:
338
- emb (torch.Tensor): Conditioning embeddings
339
- size (None, torch.Size): Size of the output
340
- if None this is computed from the typical upsampling of the model.
341
- step_list (list[int], optional): list of Markov chain steps, defaults to 50 linearly spaced step.
342
- """
343
- if size is None:
344
- upsampling = int(self.codec_model.sample_rate / self.codec_model.frame_rate)
345
- size = torch.Size([emb.size(0), self.codec_model.channels, emb.size(-1) * upsampling])
346
- assert size[0] == emb.size(0)
347
- out = torch.zeros(size).to(self.device)
348
- for DP in self.DPs:
349
- out += DP.generate(condition=emb, step_list=step_list, initial_noise=torch.randn_like(out))
350
- return out
351
-
352
- def re_eq(self, wav: torch.Tensor, ref: torch.Tensor, n_bands: int = 32, strictness: float = 1):
353
- """Match the eq to the encodec output by matching the standard deviation of some frequency bands.
354
- Args:
355
- wav (torch.Tensor): Audio to equalize.
356
- ref (torch.Tensor): Reference audio from which we match the spectrogram.
357
- n_bands (int): Number of bands of the eq.
358
- strictness (float): How strict the matching. 0 is no matching, 1 is exact matching.
359
- """
360
- split = julius.SplitBands(n_bands=n_bands, sample_rate=self.codec_model.sample_rate).to(wav.device)
361
- bands = split(wav)
362
- bands_ref = split(ref)
363
- out = torch.zeros_like(ref)
364
- for i in range(n_bands):
365
- out += bands[i] * (bands_ref[i].std() / bands[i].std()) ** strictness
366
- return out
367
-
368
- def regenerate(self, wav: torch.Tensor, sample_rate: int):
369
- """Regenerate a waveform through compression and diffusion regeneration.
370
- Args:
371
- wav (torch.Tensor): Original 'ground truth' audio.
372
- sample_rate (int): Sample rate of the input (and output) wav.
373
- """
374
- if sample_rate != self.codec_model.sample_rate:
375
- wav = julius.resample_frac(wav, sample_rate, self.codec_model.sample_rate)
376
- emb = self.get_condition(wav, sample_rate=self.codec_model.sample_rate)
377
- size = wav.size()
378
- out = self.generate(emb, size=size)
379
- if sample_rate != self.codec_model.sample_rate:
380
- out = julius.resample_frac(out, self.codec_model.sample_rate, sample_rate)
381
- return out
382
-
383
- def tokens_to_wav(self, tokens: torch.Tensor, n_bands: int = 32):
384
- """Generate Waveform audio with diffusion from the discrete codes.
385
- Args:
386
- tokens (torch.Tensor): Discrete codes.
387
- n_bands (int): Bands for the eq matching.
388
- """
389
- wav_encodec = self.codec_model.decode(tokens)
390
- condition = self.get_emb(tokens)
391
- wav_diffusion = self.generate(emb=condition, size=wav_encodec.size())
392
- return self.re_eq(wav=wav_diffusion, ref=wav_encodec, n_bands=n_bands)
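
One piece of the deleted file worth noting is `re_eq`, which matched the diffusion output's spectrum to the EnCodec output band by band. A minimal re-implementation sketch of that idea follows; it assumes the `julius` package (which the deleted file also imported) and adds a small epsilon that the original did not have.

```python
import julius
import torch

def match_eq(wav: torch.Tensor, ref: torch.Tensor, sample_rate: int = 24000,
             n_bands: int = 32, strictness: float = 1.0) -> torch.Tensor:
    """Rescale each frequency band of `wav` so its std matches the same band of `ref`."""
    split = julius.SplitBands(sample_rate=sample_rate, n_bands=n_bands).to(wav.device)
    bands, bands_ref = split(wav), split(ref)
    out = torch.zeros_like(ref)
    for b, b_ref in zip(bands, bands_ref):
        # strictness 0 = no matching, 1 = match the reference band energy exactly
        out += b * (b_ref.std() / (b.std() + 1e-8)) ** strictness
    return out

# e.g. match a diffusion-decoded waveform to the EnCodec-decoded reference:
# wav_matched = match_eq(wav_diffusion, wav_encodec, sample_rate=24000)
```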
demo.py CHANGED
@@ -12,4 +12,4 @@ sound_generator.set_generation_params(duration=1)  # why is generating so long
 x = sound_generator.generate([txt])[0].detach().cpu().numpy()[0, :]
 x /= np.abs(x).max() + 1e-7
 
-audiofile.write('_audio_.wav', x, 16000)
+audiofile.write('_audio3_.wav', x, 16000)
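The lines touched here are the tail of a short script: demo.py peak-normalises the generated waveform and writes it as a 16 kHz WAV. The sketch below reproduces just that post-processing on a placeholder signal; loading the sound generator itself is not shown.

```python
import audiofile
import numpy as np

# placeholder for sound_generator.generate([txt])[0] ... (1 s of noise at 16 kHz)
x = np.random.randn(16000).astype(np.float32)
x /= np.abs(x).max() + 1e-7                  # peak normalisation, as in demo.py
audiofile.write('_audio3_.wav', x, 16000)    # filename matches the updated script
```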