import json
import shlex
import subprocess
import tempfile
from pathlib import Path
from typing import Tuple

import ffmpy
import numpy as np
import torch


def r128stats(filepath: str, quiet: bool):
    """Takes a path to an audio file, returns a dict with the loudness
    stats computed by the ffmpeg ebur128 filter.

    Parameters
    ----------
    filepath : str
        Path to compute loudness stats on.
    quiet : bool
        Whether to suppress FFMPEG output during computation.

    Returns
    -------
    dict
        Dictionary containing loudness stats.
    """
    ffargs = [
        "ffmpeg",
        "-nostats",
        "-i",
        filepath,
        "-filter_complex",
        "ebur128",
        "-f",
        "null",
        "-",
    ]
    if quiet:
        ffargs += ["-hide_banner"]
    proc = subprocess.Popen(ffargs, stderr=subprocess.PIPE, universal_newlines=True)
    stats = proc.communicate()[1]
    summary_index = stats.rfind("Summary:")

    summary_list = stats[summary_index:].split()
    i_lufs = float(summary_list[summary_list.index("I:") + 1])
    i_thresh = float(summary_list[summary_list.index("I:") + 4])
    lra = float(summary_list[summary_list.index("LRA:") + 1])
    lra_thresh = float(summary_list[summary_list.index("LRA:") + 4])
    lra_low = float(summary_list[summary_list.index("low:") + 1])
    lra_high = float(summary_list[summary_list.index("high:") + 1])
    stats_dict = {
        "I": i_lufs,
        "I Threshold": i_thresh,
        "LRA": lra,
        "LRA Threshold": lra_thresh,
        "LRA Low": lra_low,
        "LRA High": lra_high,
    }

    return stats_dict
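

# Example usage (a sketch: assumes ffmpeg is on PATH and that "speech.wav" is a
# hypothetical local file). "I" is the integrated loudness in LUFS:
#
#     stats = r128stats("speech.wav", quiet=True)
#     print(stats["I"], stats["LRA"])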


def ffprobe_offset_and_codec(path: str) -> Tuple[float, str]:
    """Given a path to a file, returns the start time offset and codec of
    the first audio stream.
    """
    ff = ffmpy.FFprobe(
        inputs={path: None},
        global_options="-show_entries format=start_time:stream=duration,start_time,codec_type,codec_name,start_pts,time_base -of json -v quiet",
    )
    streams = json.loads(ff.run(stdout=subprocess.PIPE)[0])["streams"]
    seconds_offset = 0.0
    codec = None

    # Get the offset and codec of the first audio stream we find
    # and return its start time, if it has one.
    for stream in streams:
        if stream["codec_type"] == "audio":
            seconds_offset = stream.get("start_time", 0.0)
            codec = stream.get("codec_name")
            break

    return float(seconds_offset), codec
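

# Example usage (a sketch, with a hypothetical container file "clip.mp4"):
#
#     offset, codec = ffprobe_offset_and_codec("clip.mp4")
#     # offset is the first audio stream's start_time in seconds (0.0 if absent);
#     # codec is its codec name, e.g. "aac" or "mp3".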


class FFMPEGMixin:
    _loudness = None

    def ffmpeg_loudness(self, quiet: bool = True):
        """Computes loudness of audio file using FFMPEG.

        Parameters
        ----------
        quiet : bool, optional
            Whether to suppress FFMPEG output during computation,
            by default True

        Returns
        -------
        torch.Tensor
            Loudness of every item in the batch, computed via
            FFMPEG.
        """
        loudness = []
        with tempfile.NamedTemporaryFile(suffix=".wav") as f:
            for i in range(self.batch_size):
                self[i].write(f.name)
                loudness_stats = r128stats(f.name, quiet=quiet)
                loudness.append(loudness_stats["I"])

        self._loudness = torch.from_numpy(np.array(loudness)).float()
        return self.loudness()
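
    # Example usage (a sketch; audiotools.AudioSignal mixes in this class, and
    # "speech.wav" is a hypothetical file):
    #
    #     signal = AudioSignal("speech.wav")
    #     lufs = signal.ffmpeg_loudness()  # one LUFS value per item in the batch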

    def ffmpeg_resample(self, sample_rate: int, quiet: bool = True):
        """Resamples AudioSignal using FFMPEG. More memory-efficient
        than using julius.resample for long audio files.

        Parameters
        ----------
        sample_rate : int
            Sample rate to resample to.
        quiet : bool, optional
            Whether to suppress FFMPEG output during computation,
            by default True

        Returns
        -------
        AudioSignal
            Resampled AudioSignal.
        """
        from audiotools import AudioSignal

        if sample_rate == self.sample_rate:
            return self

        with tempfile.NamedTemporaryFile(suffix=".wav") as f:
            self.write(f.name)
            f_out = f.name.replace("wav", "rs.wav")
            command = f"ffmpeg -i {f.name} -ar {sample_rate} {f_out}"
            if quiet:
                command += " -hide_banner -loglevel error"

            subprocess.check_call(shlex.split(command))
            resampled = AudioSignal(f_out)
            Path.unlink(Path(f_out))
            return resampled
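
    # Example usage (a sketch with a hypothetical long recording; useful when the
    # file is too long to resample comfortably in memory with julius):
    #
    #     signal = AudioSignal("long_recording.wav")
    #     signal_16k = signal.ffmpeg_resample(16_000)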

    @classmethod
    def load_from_file_with_ffmpeg(cls, audio_path: str, quiet: bool = True, **kwargs):
        """Loads AudioSignal object after decoding it to a wav file using FFMPEG.
        Useful for loading audio that isn't covered by librosa's loading mechanism. Also
        useful for loading mp3 files, without any offset.

        Parameters
        ----------
        audio_path : str
            Path to load AudioSignal from.
        quiet : bool, optional
            Whether to suppress FFMPEG output during computation,
            by default True

        Returns
        -------
        AudioSignal
            AudioSignal loaded from file with FFMPEG.
        """
        audio_path = str(audio_path)
        with tempfile.TemporaryDirectory() as d:
            wav_file = str(Path(d) / "extracted.wav")
            padded_wav = str(Path(d) / "padded.wav")

            global_options = "-y"
            if quiet:
                global_options += " -loglevel error"

            ff = ffmpy.FFmpeg(
                inputs={audio_path: None},
                outputs={wav_file: None},
                global_options=global_options,
            )
            ff.run()

            # We pad the file using the start time offset in case it's an audio
            # stream starting at some offset in a video container.
            pad, codec = ffprobe_offset_and_codec(audio_path)

            # For mp3s, don't pad files with discrepancies less than 0.027s -
            # it's likely due to codec latency. The amount of latency introduced
            # by mp3 encoding is 1152 samples, which is about 0.0261s at 44.1kHz,
            # so we set the threshold here slightly above that.
            # Source: https://lame.sourceforge.io/tech-FAQ.txt.
            if codec == "mp3" and pad < 0.027:
                pad = 0.0

            ff = ffmpy.FFmpeg(
                inputs={wav_file: None},
                outputs={padded_wav: f"-af 'adelay={pad*1000}:all=true'"},
                global_options=global_options,
            )
            ff.run()
            signal = cls(padded_wav, **kwargs)

        return signal
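

if __name__ == "__main__":
    # A minimal demo sketch: assumes ffmpeg/ffprobe are on PATH and that a
    # hypothetical "speech.mp4" exists. audiotools.AudioSignal mixes in
    # FFMPEGMixin, so the methods above are available on it.
    from audiotools import AudioSignal

    signal = AudioSignal.load_from_file_with_ffmpeg("speech.mp4")
    print(signal.ffmpeg_loudness())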