Seed-VC

Running

File size: 3,211 Bytes

f32cd36

import math
import warnings
from pathlib import Path

import argbind
import numpy as np
import torch
from audiotools import AudioSignal
from audiotools.core import util
from tqdm import tqdm

from dac.utils import load_model

warnings.filterwarnings("ignore", category=UserWarning)


@argbind.bind(group="encode", positional=True, without_prefix=True)
@torch.inference_mode()
@torch.no_grad()
def encode(
    input: str,
    output: str = "",
    weights_path: str = "",
    model_tag: str = "latest",
    model_bitrate: str = "8kbps",
    n_quantizers: int = None,
    device: str = "cuda",
    model_type: str = "44khz",
    win_duration: float = 5.0,
    verbose: bool = False,
):
    """Encode audio files in input path to .dac format.

    Parameters
    ----------
    input : str
        Path to input audio file or directory
    output : str, optional
        Path to output directory, by default "". If `input` is a directory, the directory sub-tree relative to `input` is re-created in `output`.
    weights_path : str, optional
        Path to weights file, by default "". If not specified, the weights file will be downloaded from the internet using the
        model_tag and model_type.
    model_tag : str, optional
        Tag of the model to use, by default "latest". Ignored if `weights_path` is specified.
    model_bitrate: str
        Bitrate of the model. Must be one of "8kbps", or "16kbps". Defaults to "8kbps".
    n_quantizers : int, optional
        Number of quantizers to use, by default None. If not specified, all the quantizers will be used and the model will compress at maximum bitrate.
    device : str, optional
        Device to use, by default "cuda"
    model_type : str, optional
        The type of model to use. Must be one of "44khz", "24khz", or "16khz". Defaults to "44khz". Ignored if `weights_path` is specified.
    win_duration : float, optional
        Window duration forwarded to the model's `compress` call, by default 5.0
    verbose : bool, optional
        Verbosity flag forwarded to the model's `compress` call, by default False
    """
    generator = load_model(
        model_type=model_type,
        model_bitrate=model_bitrate,
        tag=model_tag,
        load_path=weights_path,
    )
    generator.to(device)
    generator.eval()
    kwargs = {"n_quantizers": n_quantizers}

    # Find all audio files in input path
    input_path = Path(input)
    audio_files = util.find_audio(input_path)

    output_root = Path(output)
    output_root.mkdir(parents=True, exist_ok=True)

    for audio_file in tqdm(audio_files, desc="Encoding files"):
        # Load file
        signal = AudioSignal(audio_file)

        # Encode audio to .dac format
        artifact = generator.compress(signal, win_duration, verbose=verbose, **kwargs)

        # Compute output path. When `input` is itself the audio file, the
        # relative path collapses to "." (empty name), so the artifact goes
        # directly into the output root; otherwise the sub-directory of the
        # file relative to `input` is mirrored under the output root.
        relative_path = audio_file.relative_to(input_path)
        if relative_path.name:
            output_dir = output_root / relative_path.parent
        else:
            output_dir = output_root
        output_path = output_dir / audio_file.with_suffix(".dac").name
        output_path.parent.mkdir(parents=True, exist_ok=True)

        artifact.save(output_path)


if __name__ == "__main__":
    # Parse the command line with argbind, then call `encode` inside the
    # resulting scope so its bound arguments take the parsed values.
    cli_args = argbind.parse_args()
    with argbind.scope(cli_args):
        encode()