MusicGen

Build error

File size: 9,694 Bytes

5325fcc

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import csv
import json
import logging
from pathlib import Path
import tempfile
import typing as tp
import subprocess
import shutil

import torch
import torchaudio

logger = logging.getLogger(__name__)


class ViSQOL:
    """ViSQOL wrapper to run ViSQOL from Python using a pre-installed binary.

    To learn more about ViSQOL and how to build ViSQOL binary using bazel, please refer to the
    instructions available in the open source repository: https://github.com/google/visqol

    ViSQOL is capable of running in two modes:

    Audio Mode:
        When running in audio mode, input signals must have a 48kHz sample rate. Input should be resampled to 48kHz.
        Input signals can be multi-channel, but they will be down-mixed to mono for performing the comparison.
        Audio mode uses support vector regression, with the maximum range at ~4.75.

    Speech Mode:
        When running in speech mode, ViSQOL uses a wideband model. It therefore expects input sample rates of 16kHz.
            Input should be resampled to 16kHz.
        As part of the speech mode processing, a root mean square implementation for voice activity detection
            is performed on the reference signal to determine what parts of the signal have voice activity and
            should therefore be included in the comparison. The signal is normalized before performing the voice
            activity detection.
        Input signals can be multi-channel, but they will be down-mixed to mono for performing the comparison.
        Speech mode is scaled to have a maximum MOS of 5.0 to match previous version behavior.

    For more details, check the guidelines: https://github.com/google/visqol#general-guidelines-for-input

    Args:
        visqol_bin (str): Path to the ViSQOL binary.
        mode (str): ViSQOL computation mode, expecting "audio" or "speech".
        model (str): Name of the model to use for similarity to quality model.
        debug (bool): Whether to also get debug metrics from ViSQOL or not.
    """
    SAMPLE_RATES_MODES = {"audio": 48_000, "speech": 16_000}
    ALLOWED_SAMPLE_RATES = frozenset(SAMPLE_RATES_MODES.values())

    def __init__(self, bin: tp.Union[Path, str], mode: str = "audio",
                 model: str = "libsvm_nu_svr_model.txt", debug: bool = False):
        assert bin is not None and Path(bin).exists(), f"Could not find ViSQOL binary in specified path: {bin}"
        self.visqol_bin = str(bin)
        self.visqol_mode = mode
        self.target_sr = self._get_target_sr(self.visqol_mode)
        self.model = model
        self.debug = debug
        assert Path(self.visqol_model).exists(), \
            f"Could not find the specified model in ViSQOL install: {self.visqol_model}"

    def _get_target_sr(self, mode: str) -> int:
        # returns target sampling rate for the corresponding ViSQOL mode.
        if mode not in ViSQOL.SAMPLE_RATES_MODES:
            raise ValueError(
                f"Unsupported mode! Allowed are: {', '.join(ViSQOL.SAMPLE_RATES_MODES.keys())}"
            )
        return ViSQOL.SAMPLE_RATES_MODES[mode]

    def _prepare_files(
        self, ref_sig: torch.Tensor, deg_sig: torch.Tensor, sr: int, target_sr: int, pad_with_silence: bool = False
    ):
        # prepare files for ViSQOL evaluation.
        assert target_sr in ViSQOL.ALLOWED_SAMPLE_RATES
        assert len(ref_sig) == len(deg_sig), (
            "Expects same number of ref and degraded inputs",
            f" but ref len {len(ref_sig)} != deg len {len(deg_sig)}"
        )
        # resample audio if needed
        if sr != target_sr:
            transform = torchaudio.transforms.Resample(sr, target_sr)
            pad = int(0.5 * target_sr)
            rs_ref = []
            rs_deg = []
            for i in range(len(ref_sig)):
                rs_ref_i = transform(ref_sig[i])
                rs_deg_i = transform(deg_sig[i])
                if pad_with_silence:
                    rs_ref_i = torch.nn.functional.pad(rs_ref_i, (pad, pad), mode='constant', value=0)
                    rs_deg_i = torch.nn.functional.pad(rs_deg_i, (pad, pad), mode='constant', value=0)
                rs_ref.append(rs_ref_i)
                rs_deg.append(rs_deg_i)
            ref_sig = torch.stack(rs_ref)
            deg_sig = torch.stack(rs_deg)
        # save audio chunks to tmp dir and create csv
        tmp_dir = Path(tempfile.mkdtemp())
        try:
            tmp_input_csv_path = tmp_dir / "input.csv"
            tmp_results_csv_path = tmp_dir / "results.csv"
            tmp_debug_json_path = tmp_dir / "debug.json"
            with open(tmp_input_csv_path, "w") as csv_file:
                csv_writer = csv.writer(csv_file)
                csv_writer.writerow(["reference", "degraded"])
                for i in range(len(ref_sig)):
                    tmp_ref_filename = tmp_dir / f"ref_{i}.wav"
                    tmp_deg_filename = tmp_dir / f"deg_{i}.wav"
                    torchaudio.save(
                        tmp_ref_filename,
                        torch.clamp(ref_sig[i], min=-0.99, max=0.99),
                        sample_rate=target_sr,
                        bits_per_sample=16,
                        encoding="PCM_S"
                    )
                    torchaudio.save(
                        tmp_deg_filename,
                        torch.clamp(deg_sig[i], min=-0.99, max=0.99),
                        sample_rate=target_sr,
                        bits_per_sample=16,
                        encoding="PCM_S"
                    )
                    csv_writer.writerow([str(tmp_ref_filename), str(tmp_deg_filename)])
            return tmp_dir, tmp_input_csv_path, tmp_results_csv_path, tmp_debug_json_path
        except Exception as e:
            logger.error("Exception occurred when preparing files for ViSQOL: %s", e)
            return tmp_dir, None, None, None

    def _flush_files(self, tmp_dir: tp.Union[Path, str]):
        # flush tmp files used to compute ViSQOL.
        shutil.rmtree(str(tmp_dir))

    def _collect_moslqo_score(self, results_csv_path: tp.Union[Path, str]) -> float:
        # collect results for each evaluated pair and return averaged moslqo score.
        with open(results_csv_path, "r") as csv_file:
            reader = csv.DictReader(csv_file)
            moslqo_scores = [float(row["moslqo"]) for row in reader]
            if len(moslqo_scores) > 0:
                return sum(moslqo_scores) / len(moslqo_scores)
            else:
                return 0.0

    def _collect_debug_data(self, debug_json_path: tp.Union[Path, str]) -> dict:
        # collect debug data for the visqol inference.
        with open(debug_json_path, "r") as f:
            data = json.load(f)
            return data

    @property
    def visqol_model(self):
        return f'{self.visqol_bin}/model/{self.model}'

    def _run_visqol(
        self,
        input_csv_path: tp.Union[Path, str],
        results_csv_path: tp.Union[Path, str],
        debug_csv_path: tp.Optional[tp.Union[Path, str]],
    ):
        input_csv_path = str(input_csv_path)
        results_csv_path = str(results_csv_path)
        debug_csv_path = str(debug_csv_path)
        cmd = [
            f'{self.visqol_bin}/bazel-bin/visqol',
            '--batch_input_csv', f'{input_csv_path}',
            '--results_csv', f'{results_csv_path}'
        ]
        if debug_csv_path is not None:
            cmd += ['--output_debug', f'{debug_csv_path}']
        if self.visqol_mode == "speech":
            cmd += ['--use_speech_mode']
        cmd += ['--similarity_to_quality_model', f'{self.visqol_model}']
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode:
            logger.error("Error with visqol: \n %s \n %s", result.stdout.decode(), result.stderr.decode())
            raise RuntimeError("Error while executing visqol")
        result.check_returncode()

    def __call__(
        self,
        ref_sig: torch.Tensor,
        deg_sig: torch.Tensor,
        sr: int,
        pad_with_silence: bool = False,
    ):
        """Calculate the ViSQOL metric for a pair of audio signals at a given sample rate.
        Args:
            ref_sig (torch.Tensor): Reference signals as [B, C, T].
            deg_sig (torch.Tensor): Degraded signals as [B, C, T].
            sr (int): Sample rate of the two audio signals.
            pad_with_silence (bool): Whether to pad the file with silences as recommended
                in visqol guidelines (see: https://github.com/google/visqol#general-guidelines-for-input).
        Returns:
            float: The ViSQOL score or mean score for the batch.
        """
        logger.debug(f"Calculating visqol with mode={self.visqol_mode} on {len(ref_sig)} samples")
        tmp_dir, input_csv, results_csv, debug_json = self._prepare_files(
            ref_sig, deg_sig, sr, self.target_sr, pad_with_silence
        )
        try:
            if input_csv and results_csv:
                self._run_visqol(
                    input_csv,
                    results_csv,
                    debug_json if self.debug else None,
                )
                mosqol = self._collect_moslqo_score(results_csv)
                return mosqol
            else:
                raise RuntimeError("Something unexpected happened when running VISQOL!")
        except Exception as e:
            logger.error("Exception occurred when running ViSQOL: %s", e)
        finally:
            self._flush_files(tmp_dir)