Commit 4a367ac · Parent(s): d257d3d
renator committed: fixed environment issues

Files changed (5)
  1. Dockerfile +4 -0
  2. constantq.py +1497 -0
  3. filters.py +1661 -0
  4. sequence.py +2059 -0
  5. utils.py +316 -0
Dockerfile CHANGED
@@ -26,6 +26,10 @@ COPY . /app/
 # Replace the librosa notation.py with notation.py from your project
 COPY notation.py /usr/local/lib/python3.10/site-packages/librosa/core/notation.py
 COPY audio.py /usr/local/lib/python3.10/site-packages/librosa/core/audio.py
+COPY constantq.py /usr/local/lib/python3.10/site-packages/librosa/core/constantq.py
+COPY filters.py /usr/local/lib/python3.10/site-packages/librosa/filters.py
+COPY sequence.py /usr/local/lib/python3.10/site-packages/librosa/sequence.py
+COPY utils.py /usr/local/lib/python3.10/site-packages/librosa/feature/utils.py

 # RUN cd /tmp && mkdir cache1
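The four new COPY lines overwrite modules inside the installed librosa package, so a quick way to confirm that the patched files are the ones actually loaded is to print the module paths at container runtime. A minimal sketch (the check script name and image tag are hypothetical; run it inside the built container, e.g. `docker run <image> python check.py`):

    import librosa.core.constantq
    import librosa.filters
    import librosa.sequence
    import librosa.feature.utils

    for mod in (librosa.core.constantq, librosa.filters,
                librosa.sequence, librosa.feature.utils):
        # Each path should point at the site-packages target of its COPY line.
        print(mod.__name__, "->", mod.__file__)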
constantq.py ADDED
@@ -0,0 +1,1497 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Constant-Q transforms"""
import warnings
import numpy as np
from numba import jit

from . import audio
from .intervals import interval_frequencies
from .fft import get_fftlib
from .convert import cqt_frequencies, note_to_hz
from .spectrum import stft, istft
from .pitch import estimate_tuning
from .._cache import cache
from .. import filters
from .. import util
from ..util.exceptions import ParameterError
from numpy.typing import DTypeLike
from typing import Optional, Union, Collection, List
from .._typing import _WindowSpec, _PadMode, _FloatLike_co, _ensure_not_reachable

__all__ = ["cqt", "hybrid_cqt", "pseudo_cqt", "icqt", "griffinlim_cqt", "vqt"]

# TODO: ivqt, griffinlim_vqt


@cache(level=20)
def cqt(
    y: np.ndarray,
    *,
    sr: float = 22050,
    hop_length: int = 512,
    fmin: Optional[_FloatLike_co] = None,
    n_bins: int = 84,
    bins_per_octave: int = 12,
    tuning: Optional[float] = 0.0,
    filter_scale: float = 1,
    norm: Optional[float] = 1,
    sparsity: float = 0.01,
    window: _WindowSpec = "hann",
    scale: bool = True,
    pad_mode: _PadMode = "constant",
    res_type: Optional[str] = "soxr_hq",
    dtype: Optional[DTypeLike] = None,
) -> np.ndarray:
    """Compute the constant-Q transform of an audio signal.

    This implementation is based on the recursive sub-sampling method
    described by [#]_.

    .. [#] Schoerkhuber, Christian, and Anssi Klapuri.
        "Constant-Q transform toolbox for music processing."
        7th Sound and Music Computing Conference, Barcelona, Spain. 2010.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time series. Multi-channel is supported.

    sr : number > 0 [scalar]
        sampling rate of ``y``

    hop_length : int > 0 [scalar]
        number of samples between successive CQT columns.

    fmin : float > 0 [scalar]
        Minimum frequency. Defaults to `C1 ~= 32.70 Hz`

    n_bins : int > 0 [scalar]
        Number of frequency bins, starting at ``fmin``

    bins_per_octave : int > 0 [scalar]
        Number of bins per octave

    tuning : None or float
        Tuning offset in fractions of a bin.

        If ``None``, tuning will be automatically estimated from the signal.

        The minimum frequency of the resulting CQT will be modified to
        ``fmin * 2**(tuning / bins_per_octave)``.

    filter_scale : float > 0
        Filter scale factor. Small values (<1) use shorter windows
        for improved time resolution.

    norm : {inf, -inf, 0, float > 0}
        Type of norm to use for basis function normalization.
        See `librosa.util.normalize`.

    sparsity : float in [0, 1)
        Sparsify the CQT basis by discarding up to ``sparsity``
        fraction of the energy in each basis.

        Set ``sparsity=0`` to disable sparsification.

    window : str, tuple, number, or function
        Window specification for the basis filters.
        See `filters.get_window` for details.

    scale : bool
        If ``True``, scale the CQT response by square-root the length of
        each channel's filter. This is analogous to ``norm='ortho'`` in FFT.

        If ``False``, do not scale the CQT. This is analogous to
        ``norm=None`` in FFT.

    pad_mode : string
        Padding mode for centered frame analysis.

        See also: `librosa.stft` and `numpy.pad`.

    res_type : string
        The resampling mode for recursive downsampling.

    dtype : np.dtype
        The (complex) data type of the output array. By default, this is inferred to match
        the numerical precision of the input signal.

    Returns
    -------
    CQT : np.ndarray [shape=(..., n_bins, t)]
        Constant-Q value for each frequency at each time.

    See Also
    --------
    vqt
    librosa.resample
    librosa.util.normalize

    Notes
    -----
    This function caches at level 20.

    Examples
    --------
    Generate and plot a constant-Q power spectrum

    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> C = np.abs(librosa.cqt(y, sr=sr))
    >>> fig, ax = plt.subplots()
    >>> img = librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max),
    ...                                sr=sr, x_axis='time', y_axis='cqt_note', ax=ax)
    >>> ax.set_title('Constant-Q power spectrum')
    >>> fig.colorbar(img, ax=ax, format="%+2.0f dB")

    Limit the frequency range

    >>> C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'),
    ...                        n_bins=60))
    >>> C
    array([[6.830e-04, 6.361e-04, ..., 7.362e-09, 9.102e-09],
           [5.366e-04, 4.818e-04, ..., 8.953e-09, 1.067e-08],
           ...,
           [4.288e-02, 4.580e-01, ..., 1.529e-05, 5.572e-06],
           [2.965e-03, 1.508e-01, ..., 8.965e-06, 1.455e-05]])

    Using a higher frequency resolution

    >>> C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'),
    ...                        n_bins=60 * 2, bins_per_octave=12 * 2))
    >>> C
    array([[5.468e-04, 5.382e-04, ..., 5.911e-09, 6.105e-09],
           [4.118e-04, 4.014e-04, ..., 7.788e-09, 8.160e-09],
           ...,
           [2.780e-03, 1.424e-01, ..., 4.225e-06, 2.388e-05],
           [5.147e-02, 6.959e-02, ..., 1.694e-05, 5.811e-06]])
    """

    # CQT is the special case of VQT with gamma=0
    return vqt(
        y=y,
        sr=sr,
        hop_length=hop_length,
        fmin=fmin,
        n_bins=n_bins,
        intervals="equal",
        gamma=0,
        bins_per_octave=bins_per_octave,
        tuning=tuning,
        filter_scale=filter_scale,
        norm=norm,
        sparsity=sparsity,
        window=window,
        scale=scale,
        pad_mode=pad_mode,
        res_type=res_type,
        dtype=dtype,
    )

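# --- Editor's illustrative sketch (hypothetical helper, not part of librosa).
# As the comment in the body above notes, `cqt` is just `vqt` pinned to
# gamma=0 with equal-tempered intervals, so the two transforms agree exactly
# under those settings:
def _cqt_equals_vqt_sketch(y: np.ndarray, sr: float) -> None:
    C_cqt = cqt(y=y, sr=sr, n_bins=84)
    C_vqt = vqt(y=y, sr=sr, n_bins=84, gamma=0, intervals="equal")
    np.testing.assert_allclose(C_cqt, C_vqt)
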
@cache(level=20)
def hybrid_cqt(
    y: np.ndarray,
    *,
    sr: float = 22050,
    hop_length: int = 512,
    fmin: Optional[_FloatLike_co] = None,
    n_bins: int = 84,
    bins_per_octave: int = 12,
    tuning: Optional[float] = 0.0,
    filter_scale: float = 1,
    norm: Optional[float] = 1,
    sparsity: float = 0.01,
    window: _WindowSpec = "hann",
    scale: bool = True,
    pad_mode: _PadMode = "constant",
    res_type: str = "soxr_hq",
    dtype: Optional[DTypeLike] = None,
) -> np.ndarray:
    """Compute the hybrid constant-Q transform of an audio signal.

    Here, the hybrid CQT uses the pseudo CQT for higher frequencies where
    the hop_length is longer than half the filter length and the full CQT
    for lower frequencies.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time series. Multi-channel is supported.

    sr : number > 0 [scalar]
        sampling rate of ``y``

    hop_length : int > 0 [scalar]
        number of samples between successive CQT columns.

    fmin : float > 0 [scalar]
        Minimum frequency. Defaults to `C1 ~= 32.70 Hz`

    n_bins : int > 0 [scalar]
        Number of frequency bins, starting at ``fmin``

    bins_per_octave : int > 0 [scalar]
        Number of bins per octave

    tuning : None or float
        Tuning offset in fractions of a bin.

        If ``None``, tuning will be automatically estimated from the signal.

        The minimum frequency of the resulting CQT will be modified to
        ``fmin * 2**(tuning / bins_per_octave)``.

    filter_scale : float > 0
        Filter scale factor. Larger values use longer windows.

    norm : {inf, -inf, 0, float > 0}
        Type of norm to use for basis function normalization.
        See `librosa.util.normalize`.

    sparsity : float in [0, 1)
        Sparsify the CQT basis by discarding up to ``sparsity``
        fraction of the energy in each basis.

        Set ``sparsity=0`` to disable sparsification.

    window : str, tuple, number, or function
        Window specification for the basis filters.
        See `filters.get_window` for details.

    scale : bool
        If ``True``, scale the CQT response by square-root the length of
        each channel's filter. This is analogous to ``norm='ortho'`` in FFT.

        If ``False``, do not scale the CQT. This is analogous to
        ``norm=None`` in FFT.

    pad_mode : string
        Padding mode for centered frame analysis.

        See also: `librosa.stft` and `numpy.pad`.

    res_type : string
        Resampling mode. See `librosa.cqt` for details.

    dtype : np.dtype, optional
        The complex dtype to use for computing the CQT.
        By default, this is inferred to match the precision of
        the input signal.

    Returns
    -------
    CQT : np.ndarray [shape=(..., n_bins, t), dtype=np.float]
        Constant-Q energy for each frequency at each time.

    See Also
    --------
    cqt
    pseudo_cqt

    Notes
    -----
    This function caches at level 20.

    """

    if fmin is None:
        # C1 by default
        fmin = note_to_hz("C1")

    if tuning is None:
        tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave)

    # Apply tuning correction
    fmin = fmin * 2.0 ** (tuning / bins_per_octave)

    # Get all CQT frequencies
    freqs = cqt_frequencies(n_bins, fmin=fmin, bins_per_octave=bins_per_octave)

    # Compute an alpha parameter, just in case we need it
    alpha = __bpo_to_alpha(bins_per_octave)

    # Compute the length of each constant-Q basis function
    lengths, _ = filters.wavelet_lengths(
        freqs=freqs, sr=sr, filter_scale=filter_scale, window=window, alpha=alpha
    )

    # Determine which filters to use with Pseudo CQT
    # These are the ones that fit within 2 hop lengths after padding
    pseudo_filters = 2.0 ** np.ceil(np.log2(lengths)) < 2 * hop_length

    n_bins_pseudo = int(np.sum(pseudo_filters))

    n_bins_full = n_bins - n_bins_pseudo
    cqt_resp = []

    if n_bins_pseudo > 0:
        fmin_pseudo = np.min(freqs[pseudo_filters])

        cqt_resp.append(
            pseudo_cqt(
                y,
                sr=sr,
                hop_length=hop_length,
                fmin=fmin_pseudo,
                n_bins=n_bins_pseudo,
                bins_per_octave=bins_per_octave,
                filter_scale=filter_scale,
                norm=norm,
                sparsity=sparsity,
                window=window,
                scale=scale,
                pad_mode=pad_mode,
                dtype=dtype,
            )
        )

    if n_bins_full > 0:
        cqt_resp.append(
            np.abs(
                cqt(
                    y,
                    sr=sr,
                    hop_length=hop_length,
                    fmin=fmin,
                    n_bins=n_bins_full,
                    bins_per_octave=bins_per_octave,
                    filter_scale=filter_scale,
                    norm=norm,
                    sparsity=sparsity,
                    window=window,
                    scale=scale,
                    pad_mode=pad_mode,
                    res_type=res_type,
                    dtype=dtype,
                )
            )
        )

    # Propagate dtype from the last component
    return __trim_stack(cqt_resp, n_bins, cqt_resp[-1].dtype)

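# --- Editor's illustrative sketch (hypothetical helper, not part of librosa).
# hybrid_cqt splits the bins using the test applied above: a bin is routed to
# the pseudo-CQT path when its filter, padded to the next power of two, fits
# within two hop lengths; all remaining (lower) bins use the full CQT.
def _hybrid_split_sketch(lengths: np.ndarray, hop_length: int = 512) -> int:
    pseudo = 2.0 ** np.ceil(np.log2(lengths)) < 2 * hop_length
    return int(np.sum(pseudo))  # number of bins handled by pseudo_cqt
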
@cache(level=20)
def pseudo_cqt(
    y: np.ndarray,
    *,
    sr: float = 22050,
    hop_length: int = 512,
    fmin: Optional[_FloatLike_co] = None,
    n_bins: int = 84,
    bins_per_octave: int = 12,
    tuning: Optional[float] = 0.0,
    filter_scale: float = 1,
    norm: Optional[float] = 1,
    sparsity: float = 0.01,
    window: _WindowSpec = "hann",
    scale: bool = True,
    pad_mode: _PadMode = "constant",
    dtype: Optional[DTypeLike] = None,
) -> np.ndarray:
    """Compute the pseudo constant-Q transform of an audio signal.

    This uses a single fft size that is the smallest power of 2 that is greater
    than or equal to the max of:

        1. The longest CQT filter
        2. 2x the hop_length

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time series. Multi-channel is supported.

    sr : number > 0 [scalar]
        sampling rate of ``y``

    hop_length : int > 0 [scalar]
        number of samples between successive CQT columns.

    fmin : float > 0 [scalar]
        Minimum frequency. Defaults to `C1 ~= 32.70 Hz`

    n_bins : int > 0 [scalar]
        Number of frequency bins, starting at ``fmin``

    bins_per_octave : int > 0 [scalar]
        Number of bins per octave

    tuning : None or float
        Tuning offset in fractions of a bin.

        If ``None``, tuning will be automatically estimated from the signal.

        The minimum frequency of the resulting CQT will be modified to
        ``fmin * 2**(tuning / bins_per_octave)``.

    filter_scale : float > 0
        Filter scale factor. Larger values use longer windows.

    norm : {inf, -inf, 0, float > 0}
        Type of norm to use for basis function normalization.
        See `librosa.util.normalize`.

    sparsity : float in [0, 1)
        Sparsify the CQT basis by discarding up to ``sparsity``
        fraction of the energy in each basis.

        Set ``sparsity=0`` to disable sparsification.

    window : str, tuple, number, or function
        Window specification for the basis filters.
        See `filters.get_window` for details.

    scale : bool
        If ``True``, scale the CQT response by square-root the length of
        each channel's filter. This is analogous to ``norm='ortho'`` in FFT.

        If ``False``, do not scale the CQT. This is analogous to
        ``norm=None`` in FFT.

    pad_mode : string
        Padding mode for centered frame analysis.

        See also: `librosa.stft` and `numpy.pad`.

    dtype : np.dtype, optional
        The complex data type for CQT calculations.
        By default, this is inferred to match the precision of the input signal.

    Returns
    -------
    CQT : np.ndarray [shape=(..., n_bins, t), dtype=np.float]
        Pseudo Constant-Q energy for each frequency at each time.

    Notes
    -----
    This function caches at level 20.

    """

    if fmin is None:
        # C1 by default
        fmin = note_to_hz("C1")

    if tuning is None:
        tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave)

    if dtype is None:
        dtype = util.dtype_r2c(y.dtype)

    # Apply tuning correction
    fmin = fmin * 2.0 ** (tuning / bins_per_octave)

    freqs = cqt_frequencies(fmin=fmin, n_bins=n_bins, bins_per_octave=bins_per_octave)

    alpha = __bpo_to_alpha(bins_per_octave)

    lengths, _ = filters.wavelet_lengths(
        freqs=freqs, sr=sr, window=window, filter_scale=filter_scale, alpha=alpha
    )

    fft_basis, n_fft, _ = __vqt_filter_fft(
        sr,
        freqs,
        filter_scale,
        norm,
        sparsity,
        hop_length=hop_length,
        window=window,
        dtype=dtype,
        alpha=alpha,
    )

    fft_basis = np.abs(fft_basis)

    # Compute the magnitude-only CQT response
    C: np.ndarray = __cqt_response(
        y,
        n_fft,
        hop_length,
        fft_basis,
        pad_mode,
        window="hann",
        dtype=dtype,
        phase=False,
    )

    if scale:
        C /= np.sqrt(n_fft)
    else:
        # reshape lengths to match dimension properly
        lengths = util.expand_to(lengths, ndim=C.ndim, axes=-2)

        C *= np.sqrt(lengths / n_fft)

    return C

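# --- Editor's illustrative sketch (hypothetical helper, not part of librosa).
# The single FFT size described in the pseudo_cqt docstring mirrors the n_fft
# growth rule in __vqt_filter_fft below: start from the longest filter padded
# to a power of two, then widen to cover twice the hop length if needed.
def _pseudo_cqt_nfft_sketch(max_filter_length: float, hop_length: int) -> int:
    n_fft = int(2.0 ** np.ceil(np.log2(max_filter_length)))
    if n_fft < 2.0 ** (1 + np.ceil(np.log2(hop_length))):
        n_fft = int(2.0 ** (1 + np.ceil(np.log2(hop_length))))
    return n_fft
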
@cache(level=40)
def icqt(
    C: np.ndarray,
    *,
    sr: float = 22050,
    hop_length: int = 512,
    fmin: Optional[_FloatLike_co] = None,
    bins_per_octave: int = 12,
    tuning: float = 0.0,
    filter_scale: float = 1,
    norm: Optional[float] = 1,
    sparsity: float = 0.01,
    window: _WindowSpec = "hann",
    scale: bool = True,
    length: Optional[int] = None,
    res_type: str = "soxr_hq",
    dtype: Optional[DTypeLike] = None,
) -> np.ndarray:
    """Compute the inverse constant-Q transform.

    Given a constant-Q transform representation ``C`` of an audio signal ``y``,
    this function produces an approximation ``y_hat``.

    Parameters
    ----------
    C : np.ndarray, [shape=(..., n_bins, n_frames)]
        Constant-Q representation as produced by `cqt`

    sr : number > 0 [scalar]
        sampling rate of the signal

    hop_length : int > 0 [scalar]
        number of samples between successive frames

    fmin : float > 0 [scalar]
        Minimum frequency. Defaults to `C1 ~= 32.70 Hz`

    bins_per_octave : int > 0 [scalar]
        Number of bins per octave

    tuning : float [scalar]
        Tuning offset in fractions of a bin.

        The minimum frequency of the CQT will be modified to
        ``fmin * 2**(tuning / bins_per_octave)``.

    filter_scale : float > 0 [scalar]
        Filter scale factor. Small values (<1) use shorter windows
        for improved time resolution.

    norm : {inf, -inf, 0, float > 0}
        Type of norm to use for basis function normalization.
        See `librosa.util.normalize`.

    sparsity : float in [0, 1)
        Sparsify the CQT basis by discarding up to ``sparsity``
        fraction of the energy in each basis.

        Set ``sparsity=0`` to disable sparsification.

    window : str, tuple, number, or function
        Window specification for the basis filters.
        See `filters.get_window` for details.

    scale : bool
        If ``True``, scale the CQT response by square-root the length
        of each channel's filter. This is analogous to ``norm='ortho'`` in FFT.

        If ``False``, do not scale the CQT. This is analogous to ``norm=None``
        in FFT.

    length : int > 0, optional
        If provided, the output ``y`` is zero-padded or clipped to exactly
        ``length`` samples.

    res_type : string
        Resampling mode.
        See `librosa.resample` for supported modes.

    dtype : numeric type
        Real numeric type for ``y``. Default is inferred to match the numerical
        precision of the input CQT.

    Returns
    -------
    y : np.ndarray, [shape=(..., n_samples), dtype=np.float]
        Audio time-series reconstructed from the CQT representation.

    See Also
    --------
    cqt
    librosa.resample

    Notes
    -----
    This function caches at level 40.

    Examples
    --------
    Using default parameters

    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> C = librosa.cqt(y=y, sr=sr)
    >>> y_hat = librosa.icqt(C=C, sr=sr)

    Or with a different hop length and frequency resolution:

    >>> hop_length = 256
    >>> bins_per_octave = 12 * 3
    >>> C = librosa.cqt(y=y, sr=sr, hop_length=256, n_bins=7*bins_per_octave,
    ...                 bins_per_octave=bins_per_octave)
    >>> y_hat = librosa.icqt(C=C, sr=sr, hop_length=hop_length,
    ...                      bins_per_octave=bins_per_octave)
    """
    if fmin is None:
        fmin = note_to_hz("C1")

    # Apply tuning correction
    fmin = fmin * 2.0 ** (tuning / bins_per_octave)

    # Get the top octave of frequencies
    n_bins = C.shape[-2]

    n_octaves = int(np.ceil(float(n_bins) / bins_per_octave))

    # truncate the cqt to max frames if helpful
    freqs = cqt_frequencies(fmin=fmin, n_bins=n_bins, bins_per_octave=bins_per_octave)
    alpha = __bpo_to_alpha(bins_per_octave)

    lengths, f_cutoff = filters.wavelet_lengths(
        freqs=freqs, sr=sr, window=window, filter_scale=filter_scale, alpha=alpha
    )

    # Trim the CQT to only what's necessary for reconstruction
    if length is not None:
        n_frames = int(np.ceil((length + max(lengths)) / hop_length))
        C = C[..., :n_frames]

    C_scale = np.sqrt(lengths)

    # This shape array will be used for broadcasting the basis scale
    # we'll have to adapt this per octave within the loop
    y: Optional[np.ndarray] = None

    # Assume the top octave is at the full rate
    srs = [sr]
    hops = [hop_length]

    for i in range(n_octaves - 1):
        if hops[0] % 2 == 0:
            # We can downsample:
            srs.insert(0, srs[0] * 0.5)
            hops.insert(0, hops[0] // 2)
        else:
            # We're out of downsamplings, carry forward
            srs.insert(0, srs[0])
            hops.insert(0, hops[0])

    for i, (my_sr, my_hop) in enumerate(zip(srs, hops)):
        # How many filters are in this octave?
        n_filters = min(bins_per_octave, n_bins - bins_per_octave * i)

        # Slice out the current octave
        sl = slice(bins_per_octave * i, bins_per_octave * i + n_filters)

        fft_basis, n_fft, _ = __vqt_filter_fft(
            my_sr,
            freqs[sl],
            filter_scale,
            norm,
            sparsity,
            window=window,
            dtype=dtype,
            alpha=alpha,
        )

        # Transpose the basis
        inv_basis = fft_basis.H.todense()

        # Compute each filter's frequency-domain power
        freq_power = 1 / np.sum(util.abs2(np.asarray(inv_basis)), axis=0)

        # Compensate for length normalization in the forward transform
        freq_power *= n_fft / lengths[sl]

        # Inverse-project the basis for each octave
        if scale:
            # scale=True ==> re-scale by sqrt(lengths)
            D_oct = np.einsum(
                "fc,c,c,...ct->...ft",
                inv_basis,
                C_scale[sl],
                freq_power,
                C[..., sl, :],
                optimize=True,
            )
        else:
            D_oct = np.einsum(
                "fc,c,...ct->...ft", inv_basis, freq_power, C[..., sl, :], optimize=True
            )

        y_oct = istft(D_oct, window="ones", hop_length=my_hop, dtype=dtype)

        y_oct = audio.resample(
            y_oct,
            orig_sr=1,
            target_sr=sr // my_sr,
            res_type=res_type,
            scale=False,
            fix=False,
        )

        if y is None:
            y = y_oct
        else:
            y[..., : y_oct.shape[-1]] += y_oct

    # make mypy happy
    assert y is not None

    if length:
        y = util.fix_length(y, size=length)

    return y

@cache(level=20)
def vqt(
    y: np.ndarray,
    *,
    sr: float = 22050,
    hop_length: int = 512,
    fmin: Optional[_FloatLike_co] = None,
    n_bins: int = 84,
    intervals: Union[str, Collection[float]] = "equal",
    gamma: Optional[float] = None,
    bins_per_octave: int = 12,
    tuning: Optional[float] = 0.0,
    filter_scale: float = 1,
    norm: Optional[float] = 1,
    sparsity: float = 0.01,
    window: _WindowSpec = "hann",
    scale: bool = True,
    pad_mode: _PadMode = "constant",
    res_type: Optional[str] = "soxr_hq",
    dtype: Optional[DTypeLike] = None,
) -> np.ndarray:
    """Compute the variable-Q transform of an audio signal.

    This implementation is based on the recursive sub-sampling method
    described by [#]_.

    .. [#] Schörkhuber, Christian, Anssi Klapuri, Nicki Holighaus, and Monika Dörfler.
        "A Matlab toolbox for efficient perfect reconstruction time-frequency
        transforms with log-frequency resolution."
        In Audio Engineering Society Conference: 53rd International Conference: Semantic Audio.
        Audio Engineering Society, 2014.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time series. Multi-channel is supported.

    sr : number > 0 [scalar]
        sampling rate of ``y``

    hop_length : int > 0 [scalar]
        number of samples between successive VQT columns.

    fmin : float > 0 [scalar]
        Minimum frequency. Defaults to `C1 ~= 32.70 Hz`

    n_bins : int > 0 [scalar]
        Number of frequency bins, starting at ``fmin``

    intervals : str or array of floats in [1, 2)
        Either a string specification for an interval set, e.g.,
        `'equal'`, `'pythagorean'`, `'ji3'`, etc. or an array of
        intervals expressed as numbers between 1 and 2.

        .. see also:: librosa.interval_frequencies

    gamma : number > 0 [scalar]
        Bandwidth offset for determining filter lengths.

        If ``gamma=0``, produces the constant-Q transform.

        If ``gamma=None``, gamma will be calculated such that filter bandwidths are equal to a
        constant fraction of the equivalent rectangular bandwidths (ERB). This is accomplished
        by solving for the gamma which gives::

            B_k = alpha * f_k + gamma = C * ERB(f_k),

        where ``B_k`` is the bandwidth of filter ``k`` with center frequency ``f_k``, alpha
        is the inverse of what would be the constant Q-factor, and ``C = alpha / 0.108`` is the
        constant fraction across all filters.

        Here we use ``ERB(f_k) = 24.7 + 0.108 * f_k``, the best-fit curve derived
        from experimental data in [#]_.

        .. [#] Glasberg, Brian R., and Brian CJ Moore.
            "Derivation of auditory filter shapes from notched-noise data."
            Hearing research 47.1-2 (1990): 103-138.

    bins_per_octave : int > 0 [scalar]
        Number of bins per octave

    tuning : None or float
        Tuning offset in fractions of a bin.

        If ``None``, tuning will be automatically estimated from the signal.

        The minimum frequency of the resulting VQT will be modified to
        ``fmin * 2**(tuning / bins_per_octave)``.

    filter_scale : float > 0
        Filter scale factor. Small values (<1) use shorter windows
        for improved time resolution.

    norm : {inf, -inf, 0, float > 0}
        Type of norm to use for basis function normalization.
        See `librosa.util.normalize`.

    sparsity : float in [0, 1)
        Sparsify the VQT basis by discarding up to ``sparsity``
        fraction of the energy in each basis.

        Set ``sparsity=0`` to disable sparsification.

    window : str, tuple, number, or function
        Window specification for the basis filters.
        See `filters.get_window` for details.

    scale : bool
        If ``True``, scale the VQT response by square-root the length of
        each channel's filter. This is analogous to ``norm='ortho'`` in FFT.

        If ``False``, do not scale the VQT. This is analogous to
        ``norm=None`` in FFT.

    pad_mode : string
        Padding mode for centered frame analysis.

        See also: `librosa.stft` and `numpy.pad`.

    res_type : string
        The resampling mode for recursive downsampling.

    dtype : np.dtype
        The dtype of the output array. By default, this is inferred to match the
        numerical precision of the input signal.

    Returns
    -------
    VQT : np.ndarray [shape=(..., n_bins, t), dtype=np.complex]
        Variable-Q value for each frequency at each time.

    See Also
    --------
    cqt

    Notes
    -----
    This function caches at level 20.

    Examples
    --------
    Generate and plot a variable-Q power spectrum

    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.ex('choice'), duration=5)
    >>> C = np.abs(librosa.cqt(y, sr=sr))
    >>> V = np.abs(librosa.vqt(y, sr=sr))
    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max),
    ...                          sr=sr, x_axis='time', y_axis='cqt_note', ax=ax[0])
    >>> ax[0].set(title='Constant-Q power spectrum', xlabel=None)
    >>> ax[0].label_outer()
    >>> img = librosa.display.specshow(librosa.amplitude_to_db(V, ref=np.max),
    ...                                sr=sr, x_axis='time', y_axis='cqt_note', ax=ax[1])
    >>> ax[1].set_title('Variable-Q power spectrum')
    >>> fig.colorbar(img, ax=ax, format="%+2.0f dB")
    """

    # If intervals are provided as an array, override BPO
    if not isinstance(intervals, str):
        bins_per_octave = len(intervals)

    # How many octaves are we dealing with?
    n_octaves = int(np.ceil(float(n_bins) / bins_per_octave))
    n_filters = min(bins_per_octave, n_bins)

    if fmin is None:
        # C1 by default
        fmin = note_to_hz("C1")

    if tuning is None:
        tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave)

    if dtype is None:
        dtype = util.dtype_r2c(y.dtype)

    # Apply tuning correction
    fmin = fmin * 2.0 ** (tuning / bins_per_octave)

    # First thing, get the freqs of the top octave
    freqs = interval_frequencies(
        n_bins=n_bins,
        fmin=fmin,
        intervals=intervals,
        bins_per_octave=bins_per_octave,
        sort=True,
    )

    freqs_top = freqs[-bins_per_octave:]

    fmax_t: float = np.max(freqs_top)
    alpha = __bpo_to_alpha(bins_per_octave)

    lengths, filter_cutoff = filters.wavelet_lengths(
        freqs=freqs,
        sr=sr,
        window=window,
        filter_scale=filter_scale,
        gamma=gamma,
        alpha=alpha,
    )

    # Determine required resampling quality
    nyquist = sr / 2.0

    if filter_cutoff > nyquist:
        raise ParameterError(
            f"Wavelet basis with max frequency={fmax_t} would exceed the Nyquist frequency={nyquist}. "
            "Try reducing the number of frequency bins."
        )

    if res_type is None:
        warnings.warn(
            "Support for VQT with res_type=None is deprecated in librosa 0.10\n"
            "and will be removed in version 1.0.",
            category=FutureWarning,
            stacklevel=2,
        )
        res_type = "soxr_hq"

    y, sr, hop_length = __early_downsample(
        y, sr, hop_length, res_type, n_octaves, nyquist, filter_cutoff, scale
    )

    vqt_resp = []

    # Iterate down the octaves
    my_y, my_sr, my_hop = y, sr, hop_length

    for i in range(n_octaves):
        # Slice out the current octave of filters
        if i == 0:
            sl = slice(-n_filters, None)
        else:
            sl = slice(-n_filters * (i + 1), -n_filters * i)

        # This may be incorrect with early downsampling
        freqs_oct = freqs[sl]

        fft_basis, n_fft, _ = __vqt_filter_fft(
            my_sr,
            freqs_oct,
            filter_scale,
            norm,
            sparsity,
            window=window,
            gamma=gamma,
            dtype=dtype,
            alpha=alpha,
        )

        # Re-scale the filters to compensate for downsampling
        fft_basis[:] *= np.sqrt(sr / my_sr)

        # Compute the vqt filter response and append to the stack
        vqt_resp.append(
            __cqt_response(my_y, n_fft, my_hop, fft_basis, pad_mode, dtype=dtype)
        )

        if my_hop % 2 == 0:
            my_hop //= 2
            my_sr /= 2.0
            my_y = audio.resample(
                my_y, orig_sr=2, target_sr=1, res_type=res_type, scale=True
            )

    V = __trim_stack(vqt_resp, n_bins, dtype)

    if scale:
        # Recompute lengths here because early downsampling may have changed
        # our sampling rate
        lengths, _ = filters.wavelet_lengths(
            freqs=freqs,
            sr=sr,
            window=window,
            filter_scale=filter_scale,
            gamma=gamma,
            alpha=alpha,
        )

        # reshape lengths to match V shape
        lengths = util.expand_to(lengths, ndim=V.ndim, axes=-2)
        V /= np.sqrt(lengths)

    return V

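# --- Editor's illustrative sketch (hypothetical helper, not part of librosa).
# Working out the gamma=None default from the vqt docstring above: requiring
# B_k = alpha * f_k + gamma = C * ERB(f_k), with ERB(f) = 24.7 + 0.108 * f
# and C = alpha / 0.108, forces gamma = C * 24.7 = 24.7 * alpha / 0.108.
def _default_gamma_sketch(bins_per_octave: int = 12) -> float:
    alpha = __bpo_to_alpha(bins_per_octave)
    return 24.7 * alpha / 0.108  # ~13.2 Hz for 12 bins per octave
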
@cache(level=10)
def __vqt_filter_fft(
    sr,
    freqs,
    filter_scale,
    norm,
    sparsity,
    hop_length=None,
    window="hann",
    gamma=0.0,
    dtype=np.complex64,
    alpha=None,
):
    """Generate the frequency domain variable-Q filter basis."""

    basis, lengths = filters.wavelet(
        freqs=freqs,
        sr=sr,
        filter_scale=filter_scale,
        norm=norm,
        pad_fft=True,
        window=window,
        gamma=gamma,
        alpha=alpha,
    )

    # Filters are padded up to the nearest integral power of 2
    n_fft = basis.shape[1]

    if hop_length is not None and n_fft < 2.0 ** (1 + np.ceil(np.log2(hop_length))):
        n_fft = int(2.0 ** (1 + np.ceil(np.log2(hop_length))))

    # re-normalize bases with respect to the FFT window length
    basis *= lengths[:, np.newaxis] / float(n_fft)

    # FFT and retain only the non-negative frequencies
    fft = get_fftlib()
    fft_basis = fft.fft(basis, n=n_fft, axis=1)[:, : (n_fft // 2) + 1]

    # sparsify the basis
    fft_basis = util.sparsify_rows(fft_basis, quantile=sparsity, dtype=dtype)

    return fft_basis, n_fft, lengths


def __trim_stack(
    cqt_resp: List[np.ndarray], n_bins: int, dtype: DTypeLike
) -> np.ndarray:
    """Helper function to trim and stack a collection of CQT responses"""

    max_col = min(c_i.shape[-1] for c_i in cqt_resp)
    # Grab any leading dimensions
    shape = list(cqt_resp[0].shape)
    shape[-2] = n_bins
    shape[-1] = max_col
    cqt_out = np.empty(shape, dtype=dtype, order="F")

    # Copy per-octave data into output array
    end = n_bins
    for c_i in cqt_resp:
        # By default, take the whole octave
        n_oct = c_i.shape[-2]
        # If the whole octave is more than we can fit,
        # take the highest bins from c_i
        if end < n_oct:
            cqt_out[..., :end, :] = c_i[..., -end:, :max_col]
        else:
            cqt_out[..., end - n_oct : end, :] = c_i[..., :max_col]

        end -= n_oct

    return cqt_out


def __cqt_response(
    y, n_fft, hop_length, fft_basis, mode, window="ones", phase=True, dtype=None
):
    """Compute the filter response with a target STFT hop."""

    # Compute the STFT matrix
    D = stft(
        y, n_fft=n_fft, hop_length=hop_length, window=window, pad_mode=mode, dtype=dtype
    )

    if not phase:
        D = np.abs(D)

    # Reshape D to Dr
    Dr = D.reshape((-1, D.shape[-2], D.shape[-1]))
    output_flat = np.empty(
        (Dr.shape[0], fft_basis.shape[0], Dr.shape[-1]), dtype=D.dtype
    )

    # iterate over channels
    # project fft_basis.dot(Dr[i])
    for i in range(Dr.shape[0]):
        output_flat[i] = fft_basis.dot(Dr[i])

    # reshape Dr to match D's leading dimensions again
    shape = list(D.shape)
    shape[-2] = fft_basis.shape[0]
    return output_flat.reshape(shape)


def __early_downsample_count(nyquist, filter_cutoff, hop_length, n_octaves):
    """Compute the number of early downsampling operations"""

    downsample_count1 = max(0, int(np.ceil(np.log2(nyquist / filter_cutoff)) - 1) - 1)

    num_twos = __num_two_factors(hop_length)
    downsample_count2 = max(0, num_twos - n_octaves + 1)

    return min(downsample_count1, downsample_count2)


def __early_downsample(
    y, sr, hop_length, res_type, n_octaves, nyquist, filter_cutoff, scale
):
    """Perform early downsampling on an audio signal, if it applies."""

    downsample_count = __early_downsample_count(
        nyquist, filter_cutoff, hop_length, n_octaves
    )

    if downsample_count > 0:
        downsample_factor = 2 ** (downsample_count)

        hop_length //= downsample_factor

        if y.shape[-1] < downsample_factor:
            raise ParameterError(
                f"Input signal length={len(y):d} is too short for "
                f"{n_octaves:d}-octave CQT"
            )

        new_sr = sr / float(downsample_factor)
        y = audio.resample(
            y, orig_sr=downsample_factor, target_sr=1, res_type=res_type, scale=True
        )

        # If we're not going to length-scale after CQT, we
        # need to compensate for the downsampling factor here
        if not scale:
            y *= np.sqrt(downsample_factor)

        sr = new_sr

    return y, sr, hop_length


@jit(nopython=True, cache=False)
def __num_two_factors(x):
    """Return how many times integer x can be evenly divided by 2.

    Returns 0 for non-positive integers.
    """
    if x <= 0:
        return 0
    num_twos = 0
    while x % 2 == 0:
        num_twos += 1
        x //= 2

    return num_twos

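# --- Editor's illustrative check (hypothetical helper, not part of librosa).
# __num_two_factors counts the factors of two in the hop length, which bounds
# how many octaves of early downsampling __early_downsample_count will allow:
def _num_two_factors_sketch() -> None:
    assert __num_two_factors(512) == 9  # 512 = 2**9
    assert __num_two_factors(96) == 5   # 96 = 2**5 * 3
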
def griffinlim_cqt(
    C: np.ndarray,
    *,
    n_iter: int = 32,
    sr: float = 22050,
    hop_length: int = 512,
    fmin: Optional[_FloatLike_co] = None,
    bins_per_octave: int = 12,
    tuning: float = 0.0,
    filter_scale: float = 1,
    norm: Optional[float] = 1,
    sparsity: float = 0.01,
    window: _WindowSpec = "hann",
    scale: bool = True,
    pad_mode: _PadMode = "constant",
    res_type: str = "soxr_hq",
    dtype: Optional[DTypeLike] = None,
    length: Optional[int] = None,
    momentum: float = 0.99,
    init: Optional[str] = "random",
    random_state: Optional[
        Union[int, np.random.RandomState, np.random.Generator]
    ] = None,
) -> np.ndarray:
    """Approximate constant-Q magnitude spectrogram inversion using the "fast" Griffin-Lim
    algorithm.

    Given the magnitude of a constant-Q spectrogram (``C``), the algorithm randomly initializes
    phase estimates, and then alternates forward- and inverse-CQT operations. [#]_

    This implementation is based on the (fast) Griffin-Lim method for Short-time Fourier Transforms, [#]_
    but adapted for use with constant-Q spectrograms.

    .. [#] D. W. Griffin and J. S. Lim,
        "Signal estimation from modified short-time Fourier transform,"
        IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984.

    .. [#] Perraudin, N., Balazs, P., & Søndergaard, P. L.
        "A fast Griffin-Lim algorithm,"
        IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (pp. 1-4),
        Oct. 2013.

    Parameters
    ----------
    C : np.ndarray [shape=(..., n_bins, n_frames)]
        The constant-Q magnitude spectrogram

    n_iter : int > 0
        The number of iterations to run

    sr : number > 0
        Audio sampling rate

    hop_length : int > 0
        The hop length of the CQT

    fmin : number > 0
        Minimum frequency for the CQT.

        If not provided, it defaults to `C1`.

    bins_per_octave : int > 0
        Number of bins per octave

    tuning : float
        Tuning deviation from A440, in fractions of a bin

    filter_scale : float > 0
        Filter scale factor. Small values (<1) use shorter windows
        for improved time resolution.

    norm : {inf, -inf, 0, float > 0}
        Type of norm to use for basis function normalization.
        See `librosa.util.normalize`.

    sparsity : float in [0, 1)
        Sparsify the CQT basis by discarding up to ``sparsity``
        fraction of the energy in each basis.

        Set ``sparsity=0`` to disable sparsification.

    window : str, tuple, or function
        Window specification for the basis filters.
        See `filters.get_window` for details.

    scale : bool
        If ``True``, scale the CQT response by square-root the length
        of each channel's filter. This is analogous to ``norm='ortho'``
        in FFT.

        If ``False``, do not scale the CQT. This is analogous to ``norm=None``
        in FFT.

    pad_mode : string
        Padding mode for centered frame analysis.

        See also: `librosa.stft` and `numpy.pad`.

    res_type : string
        The resampling mode for recursive downsampling.

        See ``librosa.resample`` for a list of available options.

    dtype : numeric type
        Real numeric type for ``y``. Default is inferred to match the precision
        of the input CQT.

    length : int > 0, optional
        If provided, the output ``y`` is zero-padded or clipped to exactly
        ``length`` samples.

    momentum : float > 0
        The momentum parameter for fast Griffin-Lim.
        Setting this to 0 recovers the original Griffin-Lim method.
        Values near 1 can lead to faster convergence, but above 1 may not converge.

    init : None or 'random' [default]
        If 'random' (the default), then phase values are initialized randomly
        according to ``random_state``. This is recommended when the input ``C`` is
        a magnitude spectrogram with no initial phase estimates.

        If ``None``, then the phase is initialized from ``C``. This is useful when
        an initial guess for phase can be provided, or when you want to resume
        Griffin-Lim from a previous output.

    random_state : None, int, np.random.RandomState, or np.random.Generator
        If int, random_state is the seed used by the random number generator
        for phase initialization.

        If `np.random.RandomState` or `np.random.Generator` instance, the random number generator itself.

        If ``None``, defaults to the `np.random.default_rng()` object.

    Returns
    -------
    y : np.ndarray [shape=(..., n)]
        time-domain signal reconstructed from ``C``

    See Also
    --------
    cqt
    icqt
    griffinlim
    filters.get_window
    resample

    Examples
    --------
    A basic CQT inverse example

    >>> y, sr = librosa.load(librosa.ex('trumpet', hq=True), sr=None)
    >>> # Get the CQT magnitude, 7 octaves at 36 bins per octave
    >>> C = np.abs(librosa.cqt(y=y, sr=sr, bins_per_octave=36, n_bins=7*36))
    >>> # Invert using Griffin-Lim
    >>> y_inv = librosa.griffinlim_cqt(C, sr=sr, bins_per_octave=36)
    >>> # And invert without estimating phase
    >>> y_icqt = librosa.icqt(C, sr=sr, bins_per_octave=36)

    Wave-plot the results

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=3, sharex=True, sharey=True)
    >>> librosa.display.waveshow(y, sr=sr, color='b', ax=ax[0])
    >>> ax[0].set(title='Original', xlabel=None)
    >>> ax[0].label_outer()
    >>> librosa.display.waveshow(y_inv, sr=sr, color='g', ax=ax[1])
    >>> ax[1].set(title='Griffin-Lim reconstruction', xlabel=None)
    >>> ax[1].label_outer()
    >>> librosa.display.waveshow(y_icqt, sr=sr, color='r', ax=ax[2])
    >>> ax[2].set(title='Magnitude-only icqt reconstruction')
    """
    if fmin is None:
        fmin = note_to_hz("C1")

    if random_state is None:
        rng = np.random.default_rng()
    elif isinstance(random_state, int):
        rng = np.random.RandomState(seed=random_state)  # type: ignore
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state  # type: ignore
    else:
        _ensure_not_reachable(random_state)
        raise ParameterError(f"Unsupported random_state={random_state!r}")

    if momentum > 1:
        warnings.warn(
            f"Griffin-Lim with momentum={momentum} > 1 can be unstable. "
            "Proceed with caution!",
            stacklevel=2,
        )
    elif momentum < 0:
        raise ParameterError(f"griffinlim_cqt() called with momentum={momentum} < 0")

    # using complex64 will keep the result to minimal necessary precision
    angles = np.empty(C.shape, dtype=np.complex64)
    eps = util.tiny(angles)

    if init == "random":
        # randomly initialize the phase
        angles[:] = util.phasor(2 * np.pi * rng.random(size=C.shape))
    elif init is None:
        # Initialize an all ones complex matrix
        angles[:] = 1.0
    else:
        raise ParameterError(f"init={init} must be either None or 'random'")

    # And initialize the previous iterate to 0
    rebuilt: np.ndarray = np.array(0.0)

    for _ in range(n_iter):
        # Store the previous iterate
        tprev = rebuilt

        # Invert with our current estimate of the phases
        inverse = icqt(
            C * angles,
            sr=sr,
            hop_length=hop_length,
            bins_per_octave=bins_per_octave,
            fmin=fmin,
            tuning=tuning,
            filter_scale=filter_scale,
            window=window,
            length=length,
            res_type=res_type,
            norm=norm,
            scale=scale,
            sparsity=sparsity,
            dtype=dtype,
        )

        # Rebuild the spectrogram
        rebuilt = cqt(
            inverse,
            sr=sr,
            bins_per_octave=bins_per_octave,
            n_bins=C.shape[-2],
            hop_length=hop_length,
            fmin=fmin,
            tuning=tuning,
            filter_scale=filter_scale,
            window=window,
            norm=norm,
            scale=scale,
            sparsity=sparsity,
            pad_mode=pad_mode,
            res_type=res_type,
        )

        # Update our phase estimates
        angles[:] = rebuilt - (momentum / (1 + momentum)) * tprev
        angles[:] /= np.abs(angles) + eps

    # Return the final phase estimates
    return icqt(
        C * angles,
        sr=sr,
        hop_length=hop_length,
        bins_per_octave=bins_per_octave,
        tuning=tuning,
        filter_scale=filter_scale,
        fmin=fmin,
        window=window,
        length=length,
        res_type=res_type,
        norm=norm,
        scale=scale,
        sparsity=sparsity,
        dtype=dtype,
    )


def __bpo_to_alpha(bins_per_octave: int) -> float:
    """Compute the alpha coefficient for a given number of bins per octave

    Parameters
    ----------
    bins_per_octave : int

    Returns
    -------
    alpha : number > 0
    """

    r = 2 ** (1 / bins_per_octave)
    return (r**2 - 1) / (r**2 + 1)
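For a concrete sense of the alpha coefficient computed by `__bpo_to_alpha` above: with the default 12 bins per octave, r = 2^(1/12), so alpha = (r² − 1)/(r² + 1) ≈ 0.0577, i.e., each filter's bandwidth is roughly 5.8% of its center frequency. A quick check (illustrative only, not part of the commit):

    r = 2 ** (1 / 12)
    alpha = (r**2 - 1) / (r**2 + 1)
    print(round(alpha, 4))  # 0.0577 -> ~5.8% relative bandwidth per filter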
filters.py ADDED
@@ -0,0 +1,1661 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Filters
=======

Filter bank construction
------------------------
.. autosummary::
    :toctree: generated/

    mel
    chroma
    wavelet
    semitone_filterbank

Window functions
----------------
.. autosummary::
    :toctree: generated/

    window_bandwidth
    get_window

Miscellaneous
-------------
.. autosummary::
    :toctree: generated/

    wavelet_lengths
    cq_to_chroma
    mr_frequencies
    window_sumsquare
    diagonal_filter

Deprecated
----------
.. autosummary::
    :toctree: generated/

    constant_q
    constant_q_lengths

"""
import warnings

import numpy as np
import scipy
import scipy.signal
import scipy.ndimage

from numba import jit

from ._cache import cache
from . import util
from .util.exceptions import ParameterError
from .util.decorators import deprecated

from .core.convert import note_to_hz, hz_to_midi, midi_to_hz, hz_to_octs
from .core.convert import fft_frequencies, mel_frequencies
from numpy.typing import ArrayLike, DTypeLike
from typing import Any, List, Optional, Tuple, Union
from typing_extensions import Literal
from ._typing import _WindowSpec, _FloatLike_co

__all__ = [
    "mel",
    "chroma",
    "constant_q",
    "constant_q_lengths",
    "cq_to_chroma",
    "window_bandwidth",
    "get_window",
    "mr_frequencies",
    "semitone_filterbank",
    "window_sumsquare",
    "diagonal_filter",
    "wavelet",
    "wavelet_lengths",
]

# Dictionary of window function bandwidths

WINDOW_BANDWIDTHS = {
    "bart": 1.3334961334912805,
    "barthann": 1.4560255965133932,
    "bartlett": 1.3334961334912805,
    "bkh": 2.0045975283585014,
    "black": 1.7269681554262326,
    "blackharr": 2.0045975283585014,
    "blackman": 1.7269681554262326,
    "blackmanharris": 2.0045975283585014,
    "blk": 1.7269681554262326,
    "bman": 1.7859588613860062,
    "bmn": 1.7859588613860062,
    "bohman": 1.7859588613860062,
    "box": 1.0,
    "boxcar": 1.0,
    "brt": 1.3334961334912805,
    "brthan": 1.4560255965133932,
    "bth": 1.4560255965133932,
    "cosine": 1.2337005350199792,
    "flat": 2.7762255046484143,
    "flattop": 2.7762255046484143,
    "flt": 2.7762255046484143,
    "halfcosine": 1.2337005350199792,
    "ham": 1.3629455320350348,
    "hamm": 1.3629455320350348,
    "hamming": 1.3629455320350348,
    "han": 1.50018310546875,
    "hann": 1.50018310546875,
    "nut": 1.9763500280946082,
    "nutl": 1.9763500280946082,
    "nuttall": 1.9763500280946082,
    "ones": 1.0,
    "par": 1.9174603174603191,
    "parz": 1.9174603174603191,
    "parzen": 1.9174603174603191,
    "rect": 1.0,
    "rectangular": 1.0,
    "tri": 1.3331706523555851,
    "triang": 1.3331706523555851,
    "triangle": 1.3331706523555851,
}

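# --- Editor's illustrative sketch (hypothetical helper, not part of librosa).
# The table above stores each window's bandwidth in FFT bins under every
# accepted alias; window_bandwidth() reads from it, and the wavelet filter
# sizing functions use it. For example, a Hann window is ~1.5 bins wide:
def _bandwidth_lookup_sketch() -> float:
    return WINDOW_BANDWIDTHS["hann"]  # 1.50018310546875
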
127
+ @cache(level=10)
128
+ def mel(
129
+ *,
130
+ sr: float,
131
+ n_fft: int,
132
+ n_mels: int = 128,
133
+ fmin: float = 0.0,
134
+ fmax: Optional[float] = None,
135
+ htk: bool = False,
136
+ norm: Optional[Union[Literal["slaney"], float]] = "slaney",
137
+ dtype: DTypeLike = np.float32,
138
+ ) -> np.ndarray:
139
+ """Create a Mel filter-bank.
140
+
141
+ This produces a linear transformation matrix to project
142
+ FFT bins onto Mel-frequency bins.
143
+
144
+ Parameters
145
+ ----------
146
+ sr : number > 0 [scalar]
147
+ sampling rate of the incoming signal
148
+
149
+ n_fft : int > 0 [scalar]
150
+ number of FFT components
151
+
152
+ n_mels : int > 0 [scalar]
153
+ number of Mel bands to generate
154
+
155
+ fmin : float >= 0 [scalar]
156
+ lowest frequency (in Hz)
157
+
158
+ fmax : float >= 0 [scalar]
159
+ highest frequency (in Hz).
160
+ If `None`, use ``fmax = sr / 2.0``
161
+
162
+ htk : bool [scalar]
163
+ use HTK formula instead of Slaney
164
+
165
+ norm : {None, 'slaney', or number} [scalar]
166
+ If 'slaney', divide the triangular mel weights by the width of the mel band
167
+ (area normalization).
168
+
169
+ If numeric, use `librosa.util.normalize` to normalize each filter to unit l_p norm.
170
+ See `librosa.util.normalize` for a full description of supported norm values
171
+ (including `+-np.inf`).
172
+
173
+ Otherwise, leave all the triangles aiming for a peak value of 1.0
174
+
175
+ dtype : np.dtype
176
+ The data type of the output basis.
177
+ By default, uses 32-bit (single-precision) floating point.
178
+
179
+ Returns
180
+ -------
181
+ M : np.ndarray [shape=(n_mels, 1 + n_fft/2)]
182
+ Mel transform matrix
183
+
184
+ See Also
185
+ --------
186
+ librosa.util.normalize
187
+
188
+ Notes
189
+ -----
190
+ This function caches at level 10.
191
+
192
+ Examples
193
+ --------
194
+ >>> melfb = librosa.filters.mel(sr=22050, n_fft=2048)
195
+ >>> melfb
196
+ array([[ 0. , 0.016, ..., 0. , 0. ],
197
+ [ 0. , 0. , ..., 0. , 0. ],
198
+ ...,
199
+ [ 0. , 0. , ..., 0. , 0. ],
200
+ [ 0. , 0. , ..., 0. , 0. ]])
201
+
202
+ Clip the maximum frequency to 8 kHz
203
+
204
+ >>> librosa.filters.mel(sr=22050, n_fft=2048, fmax=8000)
205
+ array([[ 0. , 0.02, ..., 0. , 0. ],
206
+ [ 0. , 0. , ..., 0. , 0. ],
207
+ ...,
208
+ [ 0. , 0. , ..., 0. , 0. ],
209
+ [ 0. , 0. , ..., 0. , 0. ]])
210
+
211
+ >>> import matplotlib.pyplot as plt
212
+ >>> fig, ax = plt.subplots()
213
+ >>> img = librosa.display.specshow(melfb, x_axis='linear', ax=ax)
214
+ >>> ax.set(ylabel='Mel filter', title='Mel filter bank')
215
+ >>> fig.colorbar(img, ax=ax)
216
+ """
217
+
218
+ if fmax is None:
219
+ fmax = float(sr) / 2
220
+
221
+ # Initialize the weights
222
+ n_mels = int(n_mels)
223
+ weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
224
+
225
+ # Center freqs of each FFT bin
226
+ fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft)
227
+
228
+ # 'Center freqs' of mel bands - uniformly spaced between limits
229
+ mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk)
230
+
231
+ fdiff = np.diff(mel_f)
232
+ ramps = np.subtract.outer(mel_f, fftfreqs)
233
+
234
+ for i in range(n_mels):
235
+ # lower and upper slopes for all bins
236
+ lower = -ramps[i] / fdiff[i]
237
+ upper = ramps[i + 2] / fdiff[i + 1]
238
+
239
+ # .. then intersect them with each other and zero
240
+ weights[i] = np.maximum(0, np.minimum(lower, upper))
241
+
242
+ if isinstance(norm, str):
243
+ if norm == "slaney":
244
+ # Slaney-style mel is scaled to be approx constant energy per channel
245
+ enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels])
246
+ weights *= enorm[:, np.newaxis]
247
+ else:
248
+ raise ParameterError(f"Unsupported norm={norm}")
249
+ else:
250
+ weights = util.normalize(weights, norm=norm, axis=-1)
251
+
252
+ # Only check weights if f_mel[0] is positive
253
+ if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)):
254
+ # This means we have an empty channel somewhere
255
+ warnings.warn(
256
+ "Empty filters detected in mel frequency basis. "
257
+ "Some channels will produce empty responses. "
258
+ "Try increasing your sampling rate (and fmax) or "
259
+ "reducing n_mels.",
260
+ stacklevel=2,
261
+ )
262
+
263
+ return weights
264
+
265
+
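A minimal usage sketch (not part of the committed file) showing how a mel basis from this function is typically applied to a power spectrogram; it assumes only the public librosa calls documented above:

```python
import numpy as np
import librosa

y, sr = librosa.load(librosa.ex("trumpet"))
S = np.abs(librosa.stft(y, n_fft=2048)) ** 2                 # (1 + n_fft//2, n_frames)
melfb = librosa.filters.mel(sr=sr, n_fft=2048, n_mels=128)   # (n_mels, 1 + n_fft//2)

# Project FFT bins onto mel bins: result is (n_mels, n_frames)
M = melfb @ S
```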
266
+ @cache(level=10)
267
+ def chroma(
268
+ *,
269
+ sr: float,
270
+ n_fft: int,
271
+ n_chroma: int = 12,
272
+ tuning: float = 0.0,
273
+ ctroct: float = 5.0,
274
+ octwidth: Union[float, None] = 2,
275
+ norm: Optional[float] = 2,
276
+ base_c: bool = True,
277
+ dtype: DTypeLike = np.float32,
278
+ ) -> np.ndarray:
279
+ """Create a chroma filter bank.
280
+
281
+ This creates a linear transformation matrix to project
282
+ FFT bins onto chroma bins (i.e. pitch classes).
283
+
284
+ Parameters
285
+ ----------
286
+ sr : number > 0 [scalar]
287
+ audio sampling rate
288
+
289
+ n_fft : int > 0 [scalar]
290
+ number of FFT bins
291
+
292
+ n_chroma : int > 0 [scalar]
293
+ number of chroma bins
294
+
295
+ tuning : float
296
+ Tuning deviation from A440 in fractions of a chroma bin.
297
+
298
+ ctroct : float > 0 [scalar]
299
+
300
+ octwidth : float > 0 or None [scalar]
301
+ ``ctroct`` and ``octwidth`` specify a dominance window:
302
+ a Gaussian weighting centered on ``ctroct`` (in octs, A0 = 27.5Hz)
303
+ and with a gaussian half-width of ``octwidth``.
304
+
305
+ Set ``octwidth`` to `None` to use a flat weighting.
306
+
307
+ norm : float > 0 or np.inf
308
+ Normalization factor for each filter
309
+
310
+ base_c : bool
311
+ If True, the filter bank will start at 'C'.
312
+ If False, the filter bank will start at 'A'.
313
+
314
+ dtype : np.dtype
315
+ The data type of the output basis.
316
+ By default, uses 32-bit (single-precision) floating point.
317
+
318
+ Returns
319
+ -------
320
+ wts : ndarray [shape=(n_chroma, 1 + n_fft / 2)]
321
+ Chroma filter matrix
322
+
323
+ See Also
324
+ --------
325
+ librosa.util.normalize
326
+ librosa.feature.chroma_stft
327
+
328
+ Notes
329
+ -----
330
+ This function caches at level 10.
331
+
332
+ Examples
333
+ --------
334
+ Build a simple chroma filter bank
335
+
336
+ >>> chromafb = librosa.filters.chroma(sr=22050, n_fft=4096)
337
+ array([[ 1.689e-05, 3.024e-04, ..., 4.639e-17, 5.327e-17],
338
+ [ 1.716e-05, 2.652e-04, ..., 2.674e-25, 3.176e-25],
339
+ ...,
340
+ [ 1.578e-05, 3.619e-04, ..., 8.577e-06, 9.205e-06],
341
+ [ 1.643e-05, 3.355e-04, ..., 1.474e-10, 1.636e-10]])
342
+
343
+ Use quarter-tones instead of semitones
344
+
345
+ >>> librosa.filters.chroma(sr=22050, n_fft=4096, n_chroma=24)
346
+ array([[ 1.194e-05, 2.138e-04, ..., 6.297e-64, 1.115e-63],
347
+ [ 1.206e-05, 2.009e-04, ..., 1.546e-79, 2.929e-79],
348
+ ...,
349
+ [ 1.162e-05, 2.372e-04, ..., 6.417e-38, 9.923e-38],
350
+ [ 1.180e-05, 2.260e-04, ..., 4.697e-50, 7.772e-50]])
351
+
352
+ Equally weight all octaves
353
+
354
+ >>> librosa.filters.chroma(sr=22050, n_fft=4096, octwidth=None)
355
+ array([[ 3.036e-01, 2.604e-01, ..., 2.445e-16, 2.809e-16],
356
+ [ 3.084e-01, 2.283e-01, ..., 1.409e-24, 1.675e-24],
357
+ ...,
358
+ [ 2.836e-01, 3.116e-01, ..., 4.520e-05, 4.854e-05],
359
+ [ 2.953e-01, 2.888e-01, ..., 7.768e-10, 8.629e-10]])
360
+
361
+ >>> import matplotlib.pyplot as plt
362
+ >>> fig, ax = plt.subplots()
363
+ >>> img = librosa.display.specshow(chromafb, x_axis='linear', ax=ax)
364
+ >>> ax.set(ylabel='Chroma filter', title='Chroma filter bank')
365
+ >>> fig.colorbar(img, ax=ax)
366
+ """
367
+
368
+ wts = np.zeros((n_chroma, n_fft))
369
+
370
+ # Get the FFT bins, not counting the DC component
371
+ frequencies = np.linspace(0, sr, n_fft, endpoint=False)[1:]
372
+
373
+ frqbins = n_chroma * hz_to_octs(
374
+ frequencies, tuning=tuning, bins_per_octave=n_chroma
375
+ )
376
+
377
+ # make up a value for the 0 Hz bin = 1.5 octaves below bin 1
378
+ # (so chroma is 50% rotated from bin 1, and bin width is broad)
379
+ frqbins = np.concatenate(([frqbins[0] - 1.5 * n_chroma], frqbins))
380
+
381
+ binwidthbins = np.concatenate((np.maximum(frqbins[1:] - frqbins[:-1], 1.0), [1]))
382
+
383
+ D = np.subtract.outer(frqbins, np.arange(0, n_chroma, dtype="d")).T
384
+
385
+ n_chroma2 = np.round(float(n_chroma) / 2)
386
+
387
+ # Project into range -n_chroma/2 .. n_chroma/2
388
+ # add on fixed offset of 10*n_chroma to ensure all values passed to
389
+ # rem are positive
390
+ D = np.remainder(D + n_chroma2 + 10 * n_chroma, n_chroma) - n_chroma2
391
+
392
+ # Gaussian bumps - 2*D to make them narrower
393
+ wts = np.exp(-0.5 * (2 * D / np.tile(binwidthbins, (n_chroma, 1))) ** 2)
394
+
395
+ # normalize each column
396
+ wts = util.normalize(wts, norm=norm, axis=0)
397
+
398
+ # Maybe apply scaling for fft bins
399
+ if octwidth is not None:
400
+ wts *= np.tile(
401
+ np.exp(-0.5 * (((frqbins / n_chroma - ctroct) / octwidth) ** 2)),
402
+ (n_chroma, 1),
403
+ )
404
+
405
+ if base_c:
406
+ wts = np.roll(wts, -3 * (n_chroma // 12), axis=0)
407
+
408
+ # remove aliasing columns, copy to ensure row-contiguity
409
+ return np.ascontiguousarray(wts[:, : int(1 + n_fft / 2)], dtype=dtype)
410
+
411
+
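Analogously to the mel sketch above, a hedged example of applying this chroma basis to a power spectrogram (the max-normalization step mirrors the pattern used in the docstring examples):

```python
import numpy as np
import librosa

y, sr = librosa.load(librosa.ex("trumpet"))
S = np.abs(librosa.stft(y, n_fft=4096)) ** 2
chromafb = librosa.filters.chroma(sr=sr, n_fft=4096)   # (12, 1 + n_fft//2)

# Wrap spectral energy onto the 12 pitch classes, then max-normalize each frame
C = librosa.util.normalize(chromafb @ S, axis=0)
```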
412
+ def __float_window(window_spec):
413
+ """Decorator function for windows with fractional input.
414
+
415
+ This function guarantees that for fractional ``x``, the following hold:
416
+
417
+ 1. ``__float_window(window_function)(x)`` has length ``np.ceil(x)``
418
+ 2. all values from ``np.floor(x)`` are set to 0.
419
+
420
+ For integer-valued ``x``, there should be no change in behavior.
421
+ """
422
+
423
+ def _wrap(n, *args, **kwargs):
424
+ """The wrapped window"""
425
+ n_min, n_max = int(np.floor(n)), int(np.ceil(n))
426
+
427
+ window = get_window(window_spec, n_min)
428
+
429
+ if len(window) < n_max:
430
+ window = np.pad(window, [(0, n_max - len(window))], mode="constant")
431
+
432
+ window[n_min:] = 0.0
433
+
434
+ return window
435
+
436
+ return _wrap
437
+
438
+
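The fractional-window contract stated in the docstring above can be checked directly. A small sketch, re-implementing the documented behavior with the public `get_window` (the wrapper itself is module-private, so `float_window` here is a hypothetical stand-in):

```python
import numpy as np
from librosa.filters import get_window

def float_window(window_spec, x):
    # Documented contract: output length is ceil(x),
    # and all samples from index floor(x) onward are zero.
    n_min, n_max = int(np.floor(x)), int(np.ceil(x))
    w = get_window(window_spec, n_min)
    w = np.pad(w, (0, n_max - len(w)))
    w[n_min:] = 0.0
    return w

w = float_window("hann", 10.3)
assert len(w) == 11 and np.all(w[10:] == 0)
```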
439
+ @deprecated(version="0.9.0", version_removed="1.0")
440
+ def constant_q(
441
+ *,
442
+ sr: float,
443
+ fmin: Optional[_FloatLike_co] = None,
444
+ n_bins: int = 84,
445
+ bins_per_octave: int = 12,
446
+ window: _WindowSpec = "hann",
447
+ filter_scale: float = 1,
448
+ pad_fft: bool = True,
449
+ norm: Optional[float] = 1,
450
+ dtype: DTypeLike = np.complex64,
451
+ gamma: float = 0,
452
+ **kwargs: Any,
453
+ ) -> Tuple[np.ndarray, np.ndarray]:
454
+ r"""Construct a constant-Q basis.
455
+
456
+ This function constructs a filter bank similar to Morlet wavelets,
457
+ where complex exponentials are windowed to different lengths
458
+ such that the number of cycles remains fixed for all frequencies.
459
+
460
+ By default, a Hann window (rather than the Gaussian window of Morlet wavelets)
461
+ is used, but this can be controlled by the ``window`` parameter.
462
+
463
+ Frequencies are spaced geometrically, increasing by a factor of
464
+ ``(2**(1./bins_per_octave))`` at each successive band.
465
+
466
+ .. warning:: This function is deprecated as of v0.9 and will be removed in 1.0.
467
+ See `librosa.filters.wavelet`.
468
+
469
+ Parameters
470
+ ----------
471
+ sr : number > 0 [scalar]
472
+ Audio sampling rate
473
+
474
+ fmin : float > 0 [scalar]
475
+ Minimum frequency. Defaults to `C1 ~= 32.70 Hz`
476
+
477
+ n_bins : int > 0 [scalar]
478
+ Number of frequencies. Defaults to 7 octaves (84 bins).
479
+
480
+ bins_per_octave : int > 0 [scalar]
481
+ Number of bins per octave
482
+
483
+ window : string, tuple, number, or function
484
+ Windowing function to apply to filters.
485
+
486
+ filter_scale : float > 0 [scalar]
487
+ Scale of filter windows.
488
+ Small values (<1) use shorter windows for higher temporal resolution.
489
+
490
+ pad_fft : boolean
491
+ Center-pad all filters up to the nearest integral power of 2.
492
+
493
+ By default, padding is done with zeros, but this can be overridden
494
+ by setting the ``mode=`` field in *kwargs*.
495
+
496
+ norm : {inf, -inf, 0, float > 0}
497
+ Type of norm to use for basis function normalization.
498
+ See librosa.util.normalize
499
+
500
+ gamma : number >= 0
501
+ Bandwidth offset for variable-Q transforms.
502
+ ``gamma=0`` produces a constant-Q filterbank.
503
+
504
+ dtype : np.dtype
505
+ The data type of the output basis.
506
+ By default, uses 64-bit (single precision) complex floating point.
507
+
508
+ **kwargs : additional keyword arguments
509
+ Arguments to `np.pad()` when ``pad_fft=True``.
510
+
511
+ Returns
512
+ -------
513
+ filters : np.ndarray, ``len(filters) == n_bins``
514
+ ``filters[i]`` is ``i``\ th time-domain CQT basis filter
515
+ lengths : np.ndarray, ``len(lengths) == n_bins``
516
+ The (fractional) length of each filter
517
+
518
+ Notes
519
+ -----
520
+ This function caches at level 10.
521
+
522
+ See Also
523
+ --------
524
+ wavelet
525
+ constant_q_lengths
526
+ librosa.cqt
527
+ librosa.vqt
528
+ librosa.util.normalize
529
+
530
+ Examples
531
+ --------
532
+ Use a shorter window for each filter
533
+
534
+ >>> basis, lengths = librosa.filters.constant_q(sr=22050, filter_scale=0.5)
535
+
536
+ Plot one octave of filters in time and frequency
537
+
538
+ >>> import matplotlib.pyplot as plt
539
+ >>> basis, lengths = librosa.filters.constant_q(sr=22050)
540
+ >>> fig, ax = plt.subplots(nrows=2, figsize=(10, 6))
541
+ >>> notes = librosa.midi_to_note(np.arange(24, 24 + len(basis)))
542
+ >>> for i, (f, n) in enumerate(zip(basis, notes[:12])):
543
+ ... f_scale = librosa.util.normalize(f) / 2
544
+ ... ax[0].plot(i + f_scale.real)
545
+ ... ax[0].plot(i + f_scale.imag, linestyle=':')
546
+ >>> ax[0].set(yticks=np.arange(len(notes[:12])), yticklabels=notes[:12],
547
+ ... ylabel='CQ filters',
548
+ ... title='CQ filters (one octave, time domain)',
549
+ ... xlabel='Time (samples at 22050 Hz)')
550
+ >>> ax[0].legend(['Real', 'Imaginary'])
551
+ >>> F = np.abs(np.fft.fftn(basis, axes=[-1]))
552
+ >>> # Keep only the positive frequencies
553
+ >>> F = F[:, :(1 + F.shape[1] // 2)]
554
+ >>> librosa.display.specshow(F, x_axis='linear', y_axis='cqt_note', ax=ax[1])
555
+ >>> ax[1].set(ylabel='CQ filters', title='CQ filter magnitudes (frequency domain)')
556
+ """
557
+
558
+ if fmin is None:
559
+ fmin = note_to_hz("C1")
560
+
561
+ # Pass-through parameters to get the filter lengths
562
+ lengths = constant_q_lengths(
563
+ sr=sr,
564
+ fmin=fmin,
565
+ n_bins=n_bins,
566
+ bins_per_octave=bins_per_octave,
567
+ window=window,
568
+ filter_scale=filter_scale,
569
+ gamma=gamma,
570
+ )
571
+
572
+ freqs = fmin * (2.0 ** (np.arange(n_bins, dtype=float) / bins_per_octave))
573
+
574
+ # Build the filters
575
+ filters = []
576
+ for ilen, freq in zip(lengths, freqs):
577
+ # Build the filter: note, length will be ceil(ilen)
578
+ sig = util.phasor(
579
+ np.arange(-ilen // 2, ilen // 2, dtype=float) * 2 * np.pi * freq / sr
580
+ )
581
+
582
+ # Apply the windowing function
583
+ sig = sig * __float_window(window)(len(sig))
584
+
585
+ # Normalize
586
+ sig = util.normalize(sig, norm=norm)
587
+
588
+ filters.append(sig)
589
+
590
+ # Pad and stack
591
+ max_len = max(lengths)
592
+ if pad_fft:
593
+ max_len = int(2.0 ** (np.ceil(np.log2(max_len))))
594
+ else:
595
+ max_len = int(np.ceil(max_len))
596
+
597
+ filters = np.asarray(
598
+ [util.pad_center(filt, size=max_len, **kwargs) for filt in filters], dtype=dtype
599
+ )
600
+
601
+ return filters, np.asarray(lengths)
602
+
603
+
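Since `constant_q` is deprecated in favor of `wavelet`, here is a sketch of the equivalent migration; the explicit geometric frequency grid is an assumption that matches the spacing rule stated in the docstring above:

```python
import numpy as np
import librosa

sr, fmin, n_bins, bpo = 22050, librosa.note_to_hz("C1"), 84, 12

# Deprecated form:
# basis, lengths = librosa.filters.constant_q(sr=sr, fmin=fmin, n_bins=n_bins)

# Replacement: pass the frequency grid explicitly
freqs = fmin * 2.0 ** (np.arange(n_bins) / bpo)
basis, lengths = librosa.filters.wavelet(freqs=freqs, sr=sr)
```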
604
+ @deprecated(version="0.9.0", version_removed="1.0")
605
+ @cache(level=10)
606
+ def constant_q_lengths(
607
+ *,
608
+ sr: float,
609
+ fmin: _FloatLike_co,
610
+ n_bins: int = 84,
611
+ bins_per_octave: int = 12,
612
+ window: _WindowSpec = "hann",
613
+ filter_scale: float = 1,
614
+ gamma: float = 0,
615
+ ) -> np.ndarray:
616
+ r"""Return length of each filter in a constant-Q basis.
617
+
618
+ .. warning:: This function is deprecated as of v0.9 and will be removed in 1.0.
619
+ See `librosa.filters.wavelet_lengths`.
620
+
621
+ Parameters
622
+ ----------
623
+ sr : number > 0 [scalar]
624
+ Audio sampling rate
625
+ fmin : float > 0 [scalar]
626
+ Minimum frequency bin.
627
+ n_bins : int > 0 [scalar]
628
+ Number of frequencies. Defaults to 7 octaves (84 bins).
629
+ bins_per_octave : int > 0 [scalar]
630
+ Number of bins per octave
631
+ window : str or callable
632
+ Window function to use on filters
633
+ filter_scale : float > 0 [scalar]
634
+ Resolution of filter windows. Larger values use longer windows.
635
+ gamma : number >= 0
636
+ Bandwidth offset for variable-Q transforms.
637
+ ``gamma=0`` produces a constant-Q filterbank.
638
+
639
+ Returns
640
+ -------
641
+ lengths : np.ndarray
642
+ The length of each filter.
643
+
644
+ Notes
645
+ -----
646
+ This function caches at level 10.
647
+
648
+ See Also
649
+ --------
650
+ wavelet_lengths
651
+ """
652
+
653
+ if fmin <= 0:
654
+ raise ParameterError("fmin must be strictly positive")
655
+
656
+ if bins_per_octave <= 0:
657
+ raise ParameterError("bins_per_octave must be positive")
658
+
659
+ if filter_scale <= 0:
660
+ raise ParameterError("filter_scale must be positive")
661
+
662
+ if n_bins <= 0 or not isinstance(n_bins, (int, np.integer)):
663
+ raise ParameterError("n_bins must be a positive integer")
664
+
665
+ # Compute the frequencies
666
+ freq = fmin * (2.0 ** (np.arange(n_bins, dtype=float) / bins_per_octave))
667
+
668
+ # Q should be capitalized here, so we suppress the name warning
669
+ # pylint: disable=invalid-name
670
+ #
671
+ # Balance filter bandwidths
672
+ alpha = (2.0 ** (2 / bins_per_octave) - 1) / (2.0 ** (2 / bins_per_octave) + 1)
673
+ Q = float(filter_scale) / alpha
674
+
675
+ if max(freq * (1 + 0.5 * window_bandwidth(window) / Q)) > sr / 2.0:
676
+ raise ParameterError(
677
+ f"Maximum filter frequency={max(freq):.2f} would exceed Nyquist={sr/2}"
678
+ )
679
+
680
+ # Convert frequencies to filter lengths
681
+ lengths: np.ndarray = Q * sr / (freq + gamma / alpha)
682
+
683
+ return lengths
684
+
685
+
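A worked numeric check of the bandwidth-balancing arithmetic above, assuming the default 12 bins per octave and ``filter_scale=1``:

```python
import numpy as np

bins_per_octave, filter_scale = 12, 1.0
alpha = (2.0 ** (2 / bins_per_octave) - 1) / (2.0 ** (2 / bins_per_octave) + 1)
Q = filter_scale / alpha          # ~17.3 cycles per filter

# Length (in samples) of a filter centered at 440 Hz with sr=22050, gamma=0
length = Q * 22050 / 440.0        # ~868 samples
print(alpha, Q, length)
```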
686
+ @cache(level=10)
687
+ def wavelet_lengths(
688
+ *,
689
+ freqs: ArrayLike,
690
+ sr: float = 22050,
691
+ window: _WindowSpec = "hann",
692
+ filter_scale: float = 1,
693
+ gamma: Optional[float] = 0,
694
+ alpha: Optional[Union[float, np.ndarray]] = None,
695
+ ) -> Tuple[np.ndarray, float]:
696
+ """Return length of each filter in a wavelet basis.
697
+
698
+ Parameters
699
+ ----------
700
+ freqs : np.ndarray (positive)
701
+ Center frequencies of the filters (in Hz).
702
+ Must be in ascending order.
703
+
704
+ sr : number > 0 [scalar]
705
+ Audio sampling rate
706
+
707
+ window : str or callable
708
+ Window function to use on filters
709
+
710
+ filter_scale : float > 0 [scalar]
711
+ Resolution of filter windows. Larger values use longer windows.
712
+
713
+ gamma : number >= 0 [scalar, optional]
714
+ Bandwidth offset for determining filter lengths, as used in
715
+ Variable-Q transforms.
716
+
717
+ Bandwidth for the k'th filter is determined by::
718
+
719
+ B[k] = alpha[k] * freqs[k] + gamma
720
+
721
+ ``alpha[k]`` is twice the relative difference between ``freqs[k+1]`` and ``freqs[k-1]``::
722
+
723
+ alpha[k] = (freqs[k+1]-freqs[k-1]) / (freqs[k+1]+freqs[k-1])
724
+
725
+ If ``freqs`` follows a geometric progression (as in CQT and VQT), the vector
726
+ ``alpha`` is constant and such that::
727
+
728
+ (1 + alpha) * freqs[k-1] = (1 - alpha) * freqs[k+1]
729
+
730
+ Furthermore, if ``gamma=0`` (default), ``alpha`` is such that even-``k`` and
731
+ odd-``k`` filters are interleaved::
732
+
733
+ freqs[k-1] + B[k-1] = freqs[k+1] - B[k+1]
734
+
735
+ If ``gamma=None`` is specified, then ``gamma`` is computed such
736
+ that each filter has bandwidth proportional to the equivalent
737
+ rectangular bandwidth (ERB) at frequency ``freqs[k]``::
738
+
739
+ gamma[k] = 24.7 * alpha[k] / 0.108
740
+
741
+ as derived by [#]_.
742
+
743
+ .. [#] Glasberg, Brian R., and Brian CJ Moore.
744
+ "Derivation of auditory filter shapes from notched-noise data."
745
+ Hearing research 47.1-2 (1990): 103-138.
746
+
747
+ alpha : number > 0 [optional]
748
+ If only one frequency is provided (``len(freqs)==1``), then filter bandwidth
749
+ cannot be computed. In that case, the ``alpha`` parameter described above
750
+ can be explicitly specified here.
751
+
752
+ If two or more frequencies are provided, this parameter is ignored.
753
+
754
+ Returns
755
+ -------
756
+ lengths : np.ndarray
757
+ The length of each filter.
758
+ f_cutoff : float
759
+ The lowest frequency at which all filters' main lobes have decayed by
760
+ at least 3dB.
761
+
762
+ This second output serves in cqt and vqt to ensure that all wavelet
763
+ bands remain below the Nyquist frequency.
764
+
765
+ Notes
766
+ -----
767
+ This function caches at level 10.
768
+
769
+ Raises
770
+ ------
771
+ ParameterError
772
+ - If ``filter_scale`` is not strictly positive
773
+
774
+ - If ``gamma`` is a negative number
775
+
776
+ - If any frequencies are <= 0
777
+
778
+ - If the frequency array is not sorted in ascending order
779
+ """
780
+ freqs = np.asarray(freqs)
781
+ if filter_scale <= 0:
782
+ raise ParameterError(f"filter_scale={filter_scale} must be positive")
783
+
784
+ if gamma is not None and gamma < 0:
785
+ raise ParameterError(f"gamma={gamma} must be non-negative")
786
+
787
+ if np.any(freqs <= 0):
788
+ raise ParameterError("frequencies must be strictly positive")
789
+
790
+ if len(freqs) > 1 and np.any(freqs[:-1] > freqs[1:]):
791
+ raise ParameterError(
792
+ f"Frequency array={freqs} must be in strictly ascending order"
793
+ )
794
+
795
+ # We need at least 2 frequencies to infer alpha
796
+ if len(freqs) > 1:
797
+ # Approximate the local octave resolution
798
+ bpo = np.empty(len(freqs))
799
+ logf = np.log2(freqs)
800
+ bpo[0] = 1 / (logf[1] - logf[0])
801
+ bpo[-1] = 1 / (logf[-1] - logf[-2])
802
+ bpo[1:-1] = 2 / (logf[2:] - logf[:-2])
803
+
804
+ alpha = (2.0 ** (2 / bpo) - 1) / (2.0 ** (2 / bpo) + 1)
805
+ if alpha is None:
806
+ raise ParameterError(
807
+ "Cannot construct a wavelet basis for a single frequency if alpha is not provided"
808
+ )
809
+
810
+ gamma_: Union[_FloatLike_co, np.ndarray]
811
+ if gamma is None:
812
+ gamma_ = alpha * 24.7 / 0.108
813
+ else:
814
+ gamma_ = gamma
815
+ # Q should be capitalized here, so we suppress the name warning
816
+ # pylint: disable=invalid-name
817
+ Q = float(filter_scale) / alpha
818
+
819
+ # How far up does our highest frequency reach?
820
+ f_cutoff = max(freqs * (1 + 0.5 * window_bandwidth(window) / Q) + 0.5 * gamma_)
821
+
822
+ # Convert frequencies to filter lengths
823
+ lengths = Q * sr / (freqs + gamma_ / alpha)
824
+
825
+ return lengths, f_cutoff
826
+
827
+
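A short sketch of the two return values in practice, using the same CQT-style frequency grid as the `wavelet` docstring below; the monotonicity and Nyquist checks simply restate the properties documented above:

```python
import librosa

freqs = librosa.cqt_frequencies(n_bins=84, fmin=librosa.note_to_hz("C1"))
lengths, f_cutoff = librosa.filters.wavelet_lengths(freqs=freqs, sr=22050)

# Lower frequencies need longer filters; the cutoff must stay below Nyquist
assert lengths[0] == max(lengths)
assert f_cutoff <= 22050 / 2
```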
828
+ @cache(level=10)
829
+ def wavelet(
830
+ *,
831
+ freqs: np.ndarray,
832
+ sr: float = 22050,
833
+ window: _WindowSpec = "hann",
834
+ filter_scale: float = 1,
835
+ pad_fft: bool = True,
836
+ norm: Optional[float] = 1,
837
+ dtype: DTypeLike = np.complex64,
838
+ gamma: float = 0,
839
+ alpha: Optional[float] = None,
840
+ **kwargs: Any,
841
+ ) -> Tuple[np.ndarray, np.ndarray]:
842
+ """Construct a wavelet basis using windowed complex sinusoids.
843
+
844
+ This function constructs a wavelet filterbank at a specified set of center
845
+ frequencies.
846
+
847
+ Parameters
848
+ ----------
849
+ freqs : np.ndarray (positive)
850
+ Center frequencies of the filters (in Hz).
851
+ Must be in ascending order.
852
+
853
+ sr : number > 0 [scalar]
854
+ Audio sampling rate
855
+
856
+ window : string, tuple, number, or function
857
+ Windowing function to apply to filters.
858
+
859
+ filter_scale : float > 0 [scalar]
860
+ Scale of filter windows.
861
+ Small values (<1) use shorter windows for higher temporal resolution.
862
+
863
+ pad_fft : boolean
864
+ Center-pad all filters up to the nearest integral power of 2.
865
+
866
+ By default, padding is done with zeros, but this can be overridden
867
+ by setting the ``mode=`` field in *kwargs*.
868
+
869
+ norm : {inf, -inf, 0, float > 0}
870
+ Type of norm to use for basis function normalization.
871
+ See librosa.util.normalize
872
+
873
+ gamma : number >= 0
874
+ Bandwidth offset for variable-Q transforms.
875
+
876
+ dtype : np.dtype
877
+ The data type of the output basis.
878
+ By default, uses 64-bit (single precision) complex floating point.
879
+
880
+ alpha : number > 0 [optional]
881
+ If only one frequency is provided (``len(freqs)==1``), then filter bandwidth
882
+ cannot be computed. In that case, the ``alpha`` parameter described in `wavelet_lengths`
883
+ can be explicitly specified here.
884
+
885
+ If two or more frequencies are provided, this parameter is ignored.
886
+
887
+ **kwargs : additional keyword arguments
888
+ Arguments to `np.pad()` when ``pad_fft=True``.
889
+
890
+ Returns
891
+ -------
892
+ filters : np.ndarray, ``len(filters) == n_bins``
893
+ each ``filters[i]`` is a (complex) time-domain filter
894
+ lengths : np.ndarray, ``len(lengths) == n_bins``
895
+ The (fractional) length of each filter in samples
896
+
897
+ Notes
898
+ -----
899
+ This function caches at level 10.
900
+
901
+ See Also
902
+ --------
903
+ wavelet_lengths
904
+ librosa.cqt
905
+ librosa.vqt
906
+ librosa.util.normalize
907
+
908
+ Examples
909
+ --------
910
+ Create a constant-Q basis
911
+
912
+ >>> freqs = librosa.cqt_frequencies(n_bins=84, fmin=librosa.note_to_hz('C1'))
913
+ >>> basis, lengths = librosa.filters.wavelet(freqs=freqs, sr=22050)
914
+
915
+ Plot one octave of filters in time and frequency
916
+
917
+ >>> import matplotlib.pyplot as plt
918
+ >>> basis, lengths = librosa.filters.wavelet(freqs=freqs, sr=22050)
919
+ >>> fig, ax = plt.subplots(nrows=2, figsize=(10, 6))
920
+ >>> notes = librosa.midi_to_note(np.arange(24, 24 + len(basis)))
921
+ >>> for i, (f, n) in enumerate(zip(basis, notes[:12])):
922
+ ... f_scale = librosa.util.normalize(f) / 2
923
+ ... ax[0].plot(i + f_scale.real)
924
+ ... ax[0].plot(i + f_scale.imag, linestyle=':')
925
+ >>> ax[0].set(yticks=np.arange(len(notes[:12])), yticklabels=notes[:12],
926
+ ... ylabel='CQ filters',
927
+ ... title='CQ filters (one octave, time domain)',
928
+ ... xlabel='Time (samples at 22050 Hz)')
929
+ >>> ax[0].legend(['Real', 'Imaginary'])
930
+ >>> F = np.abs(np.fft.fftn(basis, axes=[-1]))
931
+ >>> # Keep only the positive frequencies
932
+ >>> F = F[:, :(1 + F.shape[1] // 2)]
933
+ >>> librosa.display.specshow(F, x_axis='linear', y_axis='cqt_note', ax=ax[1])
934
+ >>> ax[1].set(ylabel='CQ filters', title='CQ filter magnitudes (frequency domain)')
935
+ """
936
+
937
+ # Pass-through parameters to get the filter lengths
938
+ lengths, _ = wavelet_lengths(
939
+ freqs=freqs,
940
+ sr=sr,
941
+ window=window,
942
+ filter_scale=filter_scale,
943
+ gamma=gamma,
944
+ alpha=alpha,
945
+ )
946
+
947
+ # Build the filters
948
+ filters = []
949
+ for ilen, freq in zip(lengths, freqs):
950
+ # Build the filter: note, length will be ceil(ilen)
951
+ sig = util.phasor(
952
+ np.arange(-ilen // 2, ilen // 2, dtype=float) * 2 * np.pi * freq / sr
953
+ )
954
+
955
+ # Apply the windowing function
956
+ sig *= __float_window(window)(len(sig))
957
+
958
+ # Normalize
959
+ sig = util.normalize(sig, norm=norm)
960
+
961
+ filters.append(sig)
962
+
963
+ # Pad and stack
964
+ max_len = max(lengths)
965
+ if pad_fft:
966
+ max_len = int(2.0 ** (np.ceil(np.log2(max_len))))
967
+ else:
968
+ max_len = int(np.ceil(max_len))
969
+
970
+ filters = np.asarray(
971
+ [util.pad_center(filt, size=max_len, **kwargs) for filt in filters], dtype=dtype
972
+ )
973
+
974
+ return filters, lengths
975
+
976
+
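A quick sanity sketch relating the two outputs: each filter is zero-padded to a common power-of-two length (``pad_fft=True``), but its nonzero support should be roughly ``ceil(lengths[i])`` samples (roughly, because a periodic Hann window is itself zero at its first sample):

```python
import numpy as np
import librosa

freqs = librosa.cqt_frequencies(n_bins=12, fmin=librosa.note_to_hz("C3"))
basis, lengths = librosa.filters.wavelet(freqs=freqs, sr=22050)

# Compare the reported fractional lengths with the actual nonzero support
support = np.sum(np.abs(basis) > 0, axis=1)
print(np.ceil(lengths).astype(int), support)
```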
977
+ @cache(level=10)
978
+ def cq_to_chroma(
979
+ n_input: int,
980
+ *,
981
+ bins_per_octave: int = 12,
982
+ n_chroma: int = 12,
983
+ fmin: Optional[_FloatLike_co] = None,
984
+ window: Optional[np.ndarray] = None,
985
+ base_c: bool = True,
986
+ dtype: DTypeLike = np.float32,
987
+ ) -> np.ndarray:
988
+ """Construct a linear transformation matrix to map Constant-Q bins
989
+ onto chroma bins (i.e., pitch classes).
990
+
991
+ Parameters
992
+ ----------
993
+ n_input : int > 0 [scalar]
994
+ Number of input components (CQT bins)
995
+ bins_per_octave : int > 0 [scalar]
996
+ How many bins per octave in the CQT
997
+ n_chroma : int > 0 [scalar]
998
+ Number of output bins (per octave) in the chroma
999
+ fmin : None or float > 0
1000
+ Center frequency of the first constant-Q channel.
1001
+ Default: 'C1' ~= 32.7 Hz
1002
+ window : None or np.ndarray
1003
+ If provided, the cq_to_chroma filter bank will be
1004
+ convolved with ``window``.
1005
+ base_c : bool
1006
+ If True, the first chroma bin will start at 'C'
1007
+ If False, the first chroma bin will start at 'A'
1008
+ dtype : np.dtype
1009
+ The data type of the output basis.
1010
+ By default, uses 32-bit (single-precision) floating point.
1011
+
1012
+ Returns
1013
+ -------
1014
+ cq_to_chroma : np.ndarray [shape=(n_chroma, n_input)]
1015
+ Transformation matrix: ``Chroma = np.dot(cq_to_chroma, CQT)``
1016
+
1017
+ Raises
1018
+ ------
1019
+ ParameterError
1020
+ If ``n_input`` is not an integer multiple of ``n_chroma``
1021
+
1022
+ Notes
1023
+ -----
1024
+ This function caches at level 10.
1025
+
1026
+ Examples
1027
+ --------
1028
+ Get a CQT, and wrap bins to chroma
1029
+
1030
+ >>> y, sr = librosa.load(librosa.ex('trumpet'))
1031
+ >>> CQT = np.abs(librosa.cqt(y, sr=sr))
1032
+ >>> chroma_map = librosa.filters.cq_to_chroma(CQT.shape[0])
1033
+ >>> chromagram = chroma_map.dot(CQT)
1034
+ >>> # Max-normalize each time step
1035
+ >>> chromagram = librosa.util.normalize(chromagram, axis=0)
1036
+
1037
+ >>> import matplotlib.pyplot as plt
1038
+ >>> fig, ax = plt.subplots(nrows=3, sharex=True)
1039
+ >>> imgcq = librosa.display.specshow(librosa.amplitude_to_db(CQT,
1040
+ ... ref=np.max),
1041
+ ... y_axis='cqt_note', x_axis='time',
1042
+ ... ax=ax[0])
1043
+ >>> ax[0].set(title='CQT Power')
1044
+ >>> ax[0].label_outer()
1045
+ >>> librosa.display.specshow(chromagram, y_axis='chroma', x_axis='time',
1046
+ ... ax=ax[1])
1047
+ >>> ax[1].set(title='Chroma (wrapped CQT)')
1048
+ >>> ax[1].label_outer()
1049
+ >>> chroma = librosa.feature.chroma_stft(y=y, sr=sr)
1050
+ >>> imgchroma = librosa.display.specshow(chroma, y_axis='chroma', x_axis='time', ax=ax[2])
1051
+ >>> ax[2].set(title='librosa.feature.chroma_stft')
1052
+ """
1053
+
1054
+ # How many fractional bins are we merging?
1055
+ n_merge = float(bins_per_octave) / n_chroma
1056
+
1057
+ fmin_: _FloatLike_co
1058
+ if fmin is None:
1059
+ fmin_ = note_to_hz("C1")
1060
+ else:
1061
+ fmin_ = fmin
1062
+
1063
+ if np.mod(n_merge, 1) != 0:
1064
+ raise ParameterError(
1065
+ "Incompatible CQ merge: "
1066
+ "input bins must be an "
1067
+ "integer multiple of output bins."
1068
+ )
1069
+
1070
+ # Tile the identity to merge fractional bins
1071
+ cq_to_ch = np.repeat(np.eye(n_chroma), int(n_merge), axis=1)
1072
+
1073
+ # Roll it left to center on the target bin
1074
+ cq_to_ch = np.roll(cq_to_ch, -int(n_merge // 2), axis=1)
1075
+
1076
+ # How many octaves are we repeating?
1077
+ n_octaves = np.ceil(float(n_input) / bins_per_octave)
1078
+
1079
+ # Repeat and trim
1080
+ cq_to_ch = np.tile(cq_to_ch, int(n_octaves))[:, :n_input]
1081
+
1082
+ # What's the note number of the first bin in the CQT?
1083
+ # midi uses 12 bins per octave here
1084
+ midi_0 = np.mod(hz_to_midi(fmin_), 12)
1085
+
1086
+ if base_c:
1087
+ # rotate to C
1088
+ roll = midi_0
1089
+ else:
1090
+ # rotate to A
1091
+ roll = midi_0 - 9
1092
+
1093
+ # Adjust the roll in terms of how many chroma we want out
1094
+ # We need to be careful with rounding here
1095
+ roll = int(np.round(roll * (n_chroma / 12.0)))
1096
+
1097
+ # Apply the roll
1098
+ cq_to_ch = np.roll(cq_to_ch, roll, axis=0).astype(dtype)
1099
+
1100
+ if window is not None:
1101
+ cq_to_ch = scipy.signal.convolve(cq_to_ch, np.atleast_2d(window), mode="same")
1102
+
1103
+ return cq_to_ch
1104
+
1105
+
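A small sketch of the integer-multiple constraint raised above, using a 36-bins-per-octave CQT:

```python
import librosa

# 36 CQT bins per octave wrap cleanly onto 12 chroma bins (36 / 12 = 3)
fb = librosa.filters.cq_to_chroma(108, bins_per_octave=36, n_chroma=12)
print(fb.shape)   # (12, 108)

# n_chroma=24 would raise ParameterError here, since 36 / 24 = 1.5:
# librosa.filters.cq_to_chroma(108, bins_per_octave=36, n_chroma=24)
```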
1106
+ @cache(level=10)
1107
+ def window_bandwidth(window: _WindowSpec, n: int = 1000) -> float:
1108
+ """Get the equivalent noise bandwidth (ENBW) of a window function.
1109
+
1110
+ The ENBW of a window is defined by [#]_ (equation 11) as the normalized
1111
+ ratio of the sum of squares to the square of sums::
1112
+
1113
+ enbw = n * sum(window**2) / sum(window)**2
1114
+
1115
+ .. [#] Harris, F. J.
1116
+ "On the use of windows for harmonic analysis with the discrete Fourier transform."
1117
+ Proceedings of the IEEE, 66(1), 51-83. 1978.
1118
+
1119
+ Parameters
1120
+ ----------
1121
+ window : callable or string
1122
+ A window function, or the name of a window function.
1123
+ Examples:
1124
+ - scipy.signal.hann
1125
+ - 'boxcar'
1126
+ n : int > 0
1127
+ The number of coefficients to use in estimating the
1128
+ window bandwidth
1129
+
1130
+ Returns
1131
+ -------
1132
+ bandwidth : float
1133
+ The equivalent noise bandwidth (in FFT bins) of the
1134
+ given window function
1135
+
1136
+ Notes
1137
+ -----
1138
+ This function caches at level 10.
1139
+
1140
+ See Also
1141
+ --------
1142
+ get_window
1143
+ """
1144
+
1145
+ if hasattr(window, "__name__"):
1146
+ key = window.__name__
1147
+ else:
1148
+ key = window
1149
+
1150
+ if key not in WINDOW_BANDWIDTHS:
1151
+ win = get_window(window, n)
1152
+ WINDOW_BANDWIDTHS[key] = (
1153
+ n * np.sum(win**2) / (np.sum(win) ** 2 + util.tiny(win))
1154
+ )
1155
+
1156
+ return WINDOW_BANDWIDTHS[key]
1157
+
1158
+
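The ENBW formula quoted in the docstring can be verified numerically against the precomputed table; a sketch for the Hann window, whose ENBW is 1.5 bins:

```python
import numpy as np
from librosa.filters import get_window, window_bandwidth

n = 1000
win = get_window("hann", n)
enbw = n * np.sum(win**2) / np.sum(win) ** 2

# Closely agrees with the cached table entry (~1.5 for Hann)
print(enbw, window_bandwidth("hann"))
```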
1159
+ @cache(level=10)
1160
+ def get_window(
1161
+ window: _WindowSpec,
1162
+ Nx: int,
1163
+ *,
1164
+ fftbins: Optional[bool] = True,
1165
+ ) -> np.ndarray:
1166
+ """Compute a window function.
1167
+
1168
+ This is a wrapper for `scipy.signal.get_window` that additionally
1169
+ supports callable or pre-computed windows.
1170
+
1171
+ Parameters
1172
+ ----------
1173
+ window : string, tuple, number, callable, or list-like
1174
+ The window specification:
1175
+
1176
+ - If string, it's the name of the window function (e.g., `'hann'`)
1177
+ - If tuple, it's the name of the window function and any parameters
1178
+ (e.g., `('kaiser', 4.0)`)
1179
+ - If numeric, it is treated as the beta parameter of the `'kaiser'`
1180
+ window, as in `scipy.signal.get_window`.
1181
+ - If callable, it's a function that accepts one integer argument
1182
+ (the window length)
1183
+ - If list-like, it's a pre-computed window of the correct length `Nx`
1184
+
1185
+ Nx : int > 0
1186
+ The length of the window
1187
+
1188
+ fftbins : bool, optional
1189
+ If True (default), create a periodic window for use with FFT
1190
+ If False, create a symmetric window for filter design applications.
1191
+
1192
+ Returns
1193
+ -------
1194
+ get_window : np.ndarray
1195
+ A window of length `Nx` and type `window`
1196
+
1197
+ See Also
1198
+ --------
1199
+ scipy.signal.get_window
1200
+
1201
+ Notes
1202
+ -----
1203
+ This function caches at level 10.
1204
+
1205
+ Raises
1206
+ ------
1207
+ ParameterError
1208
+ If `window` is supplied as a vector of length != `Nx`,
1209
+ or is otherwise mis-specified.
1210
+ """
1211
+ if callable(window):
1212
+ return window(Nx)
1213
+
1214
+ elif isinstance(window, (str, tuple)) or np.isscalar(window):
1215
+ # TODO: if we add custom window functions in librosa, call them here
1216
+
1217
+ win: np.ndarray = scipy.signal.get_window(window, Nx, fftbins=fftbins)
1218
+ return win
1219
+
1220
+ elif isinstance(window, (np.ndarray, list)):
1221
+ if len(window) == Nx:
1222
+ return np.asarray(window)
1223
+
1224
+ raise ParameterError(f"Window size mismatch: {len(window):d} != {Nx:d}")
1225
+ else:
1226
+ raise ParameterError(f"Invalid window specification: {window!r}")
1227
+
1228
+
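A sketch exercising each of the window specifications accepted above; all five forms produce a window of the requested length:

```python
import numpy as np
import scipy.signal
from librosa.filters import get_window

w1 = get_window("hann", 1024)                      # by name
w2 = get_window(("kaiser", 4.0), 1024)             # name + parameter
w3 = get_window(4.0, 1024)                         # scalar => kaiser beta
w4 = get_window(scipy.signal.windows.hann, 1024)   # callable
w5 = get_window(np.ones(1024), 1024)               # pre-computed vector

assert all(len(w) == 1024 for w in (w1, w2, w3, w4, w5))
```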
1229
+ @cache(level=10)
1230
+ def _multirate_fb(
1231
+ center_freqs: Optional[np.ndarray] = None,
1232
+ sample_rates: Optional[np.ndarray] = None,
1233
+ Q: float = 25.0,
1234
+ passband_ripple: float = 1,
1235
+ stopband_attenuation: float = 50,
1236
+ ftype: str = "ellip",
1237
+ flayout: str = "sos",
1238
+ ) -> Tuple[List[Any], np.ndarray]:
1239
+ r"""Helper function to construct a multirate filterbank.
1240
+
1241
+ A filter bank consists of multiple band-pass filters which divide the input signal
1242
+ into subbands. In the case of a multirate filter bank, the band-pass filters
1243
+ operate with resampled versions of the input signal, e.g. to keep the length
1244
+ of a filter constant while shifting its center frequency.
1245
+
1246
+ This implementation uses `scipy.signal.iirdesign` to design the filters.
1247
+
1248
+ Parameters
1249
+ ----------
1250
+ center_freqs : np.ndarray [shape=(n,), dtype=float]
1251
+ Center frequencies of the filter kernels.
1252
+ Also defines the number of filters in the filterbank.
1253
+
1254
+ sample_rates : np.ndarray [shape=(n,), dtype=float]
1255
+ Samplerate for each filter (used for multirate filterbank).
1256
+
1257
+ Q : float
1258
+ Q factor (influences the filter bandwidth).
1259
+
1260
+ passband_ripple : float
1261
+ The maximum loss in the passband (dB)
1262
+ See `scipy.signal.iirdesign` for details.
1263
+
1264
+ stopband_attenuation : float
1265
+ The minimum attenuation in the stopband (dB)
1266
+ See `scipy.signal.iirdesign` for details.
1267
+
1268
+ ftype : str
1269
+ The type of IIR filter to design
1270
+ See `scipy.signal.iirdesign` for details.
1271
+
1272
+ flayout : string
1273
+ Valid `output` argument for `scipy.signal.iirdesign`.
1274
+
1275
+ - If `ba`, returns numerators/denominators of the transfer functions,
1276
+ used for filtering with `scipy.signal.filtfilt`.
1277
+ Can be unstable for high-order filters.
1278
+
1279
+ - If `sos`, returns a series of second-order filters,
1280
+ used for filtering with `scipy.signal.sosfiltfilt`.
1281
+ Minimizes numerical precision errors for high-order filters, but is slower.
1282
+
1283
+ - If `zpk`, returns zeros, poles, and system gains of the transfer functions.
1284
+
1285
+ Returns
1286
+ -------
1287
+ filterbank : list [shape=(n,), dtype=float]
1288
+ Each list entry comprises the filter coefficients for a single filter.
1289
+ sample_rates : np.ndarray [shape=(n,), dtype=float]
1290
+ Samplerate for each filter.
1291
+
1292
+ Notes
1293
+ -----
1294
+ This function caches at level 10.
1295
+
1296
+ See Also
1297
+ --------
1298
+ scipy.signal.iirdesign
1299
+
1300
+ Raises
1301
+ ------
1302
+ ParameterError
1303
+ If ``center_freqs`` is ``None``.
1304
+ If ``sample_rates`` is ``None``.
1305
+ If ``center_freqs.shape`` does not match ``sample_rates.shape``.
1306
+ """
1307
+
1308
+ if center_freqs is None:
1309
+ raise ParameterError("center_freqs must be provided.")
1310
+
1311
+ if sample_rates is None:
1312
+ raise ParameterError("sample_rates must be provided.")
1313
+
1314
+ if center_freqs.shape != sample_rates.shape:
1315
+ raise ParameterError(
1316
+ "Number of provided center_freqs and sample_rates must be equal."
1317
+ )
1318
+
1319
+ nyquist = 0.5 * sample_rates
1320
+ filter_bandwidths = center_freqs / float(Q)
1321
+
1322
+ filterbank = []
1323
+
1324
+ for cur_center_freq, cur_nyquist, cur_bw in zip(
1325
+ center_freqs, nyquist, filter_bandwidths
1326
+ ):
1327
+ passband_freqs = [
1328
+ cur_center_freq - 0.5 * cur_bw,
1329
+ cur_center_freq + 0.5 * cur_bw,
1330
+ ] / cur_nyquist
1331
+ stopband_freqs = [
1332
+ cur_center_freq - cur_bw,
1333
+ cur_center_freq + cur_bw,
1334
+ ] / cur_nyquist
1335
+
1336
+ cur_filter = scipy.signal.iirdesign(
1337
+ passband_freqs,
1338
+ stopband_freqs,
1339
+ passband_ripple,
1340
+ stopband_attenuation,
1341
+ analog=False,
1342
+ ftype=ftype,
1343
+ output=flayout,
1344
+ )
1345
+
1346
+ filterbank.append(cur_filter)
1347
+
1348
+ return filterbank, sample_rates
1349
+
1350
+
1351
+ @cache(level=10)
1352
+ def mr_frequencies(tuning: float) -> Tuple[np.ndarray, np.ndarray]:
1353
+ r"""Helper function for generating center frequency and sample rate pairs.
1354
+
1355
+ This function will return center frequency and corresponding sample rates
1356
+ to obtain similar pitch filterbank settings as described in [#]_.
1357
+ Instead of starting with MIDI pitch `A0`, we start with `C0`.
1358
+
1359
+ .. [#] Müller, Meinard.
1360
+ "Information Retrieval for Music and Motion."
1361
+ Springer Verlag. 2007.
1362
+
1363
+ Parameters
1364
+ ----------
1365
+ tuning : float [scalar]
1366
+ Tuning deviation from A440, measured as a fraction of the equally
1367
+ tempered semitone (1/12 of an octave).
1368
+
1369
+ Returns
1370
+ -------
1371
+ center_freqs : np.ndarray [shape=(n,), dtype=float]
1372
+ Center frequencies of the filter kernels.
1373
+ Also defines the number of filters in the filterbank.
1374
+ sample_rates : np.ndarray [shape=(n,), dtype=float]
1375
+ Sample rate for each filter, used for multirate filterbank.
1376
+
1377
+ Notes
1378
+ -----
1379
+ This function caches at level 10.
1380
+
1381
+ See Also
1382
+ --------
1383
+ librosa.filters.semitone_filterbank
1384
+ """
1385
+
1386
+ center_freqs = midi_to_hz(np.arange(24 + tuning, 109 + tuning))
1387
+
1388
+ sample_rates = np.asarray(
1389
+ len(np.arange(0, 36))
1390
+ * [
1391
+ 882.0,
1392
+ ]
1393
+ + len(np.arange(36, 70))
1394
+ * [
1395
+ 4410.0,
1396
+ ]
1397
+ + len(np.arange(70, 85))
1398
+ * [
1399
+ 22050.0,
1400
+ ]
1401
+ )
1402
+
1403
+ return center_freqs, sample_rates
1404
+
1405
+
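The construction above yields 85 center frequencies (MIDI 24 through 108) split across three sample-rate groups, which can be checked directly:

```python
import numpy as np
import librosa

center_freqs, sample_rates = librosa.filters.mr_frequencies(tuning=0.0)

print(len(center_freqs))        # 85 filters spanning MIDI 24..108
print(np.unique(sample_rates))  # [  882.  4410. 22050.]
```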
1406
+ def semitone_filterbank(
1407
+ *,
1408
+ center_freqs: Optional[np.ndarray] = None,
1409
+ tuning: float = 0.0,
1410
+ sample_rates: Optional[np.ndarray] = None,
1411
+ flayout: str = "ba",
1412
+ **kwargs: Any,
1413
+ ) -> Tuple[List[Any], np.ndarray]:
1414
+ r"""Construct a multi-rate bank of infinite-impulse response (IIR)
1415
+ band-pass filters at user-defined center frequencies and sample rates.
1416
+
1417
+ By default, these center frequencies are set equal to the 88 fundamental
1418
+ frequencies of the grand piano keyboard, according to a pitch tuning standard
1419
+ of A440, that is, note A above middle C set to 440 Hz. The center frequencies
1420
+ are tuned to the twelve-tone equal temperament, which means that they grow
1421
+ exponentially at a rate of 2**(1/12), that is, twelve notes per octave.
1422
+
1423
+ The A440 tuning can be changed by the user while keeping twelve-tone equal
1424
+ temperament. While A440 is currently the international standard in the music
1425
+ industry (ISO 16), some orchestras tune to A441-A445, whereas baroque musicians
1426
+ tune to A415.
1427
+
1428
+ See [#]_ for details.
1429
+
1430
+ .. [#] Müller, Meinard.
1431
+ "Information Retrieval for Music and Motion."
1432
+ Springer Verlag. 2007.
1433
+
1434
+ Parameters
1435
+ ----------
1436
+ center_freqs : np.ndarray [shape=(n,), dtype=float]
1437
+ Center frequencies of the filter kernels.
1438
+ Also defines the number of filters in the filterbank.
1439
+ tuning : float [scalar]
1440
+ Tuning deviation from A440 as a fraction of a semitone (1/12 of an octave
1441
+ in equal temperament).
1442
+ sample_rates : np.ndarray [shape=(n,), dtype=float]
1443
+ Sample rates of each filter in the multirate filterbank.
1444
+ flayout : string
1445
+ - If `ba`, the standard difference equation is used for filtering with `scipy.signal.filtfilt`.
1446
+ Can be unstable for high-order filters.
1447
+ - If `sos`, a series of second-order filters is used for filtering with `scipy.signal.sosfiltfilt`.
1448
+ Minimizes numerical precision errors for high-order filters, but is slower.
1449
+ **kwargs : additional keyword arguments
1450
+ Additional arguments to the private function `_multirate_fb()`.
1451
+
1452
+ Returns
1453
+ -------
1454
+ filterbank : list [shape=(n,), dtype=float]
1455
+ Each list entry contains the filter coefficients for a single filter.
1456
+ fb_sample_rates : np.ndarray [shape=(n,), dtype=float]
1457
+ Sample rate for each filter.
1458
+
1459
+ See Also
1460
+ --------
1461
+ librosa.cqt
1462
+ librosa.iirt
1463
+ librosa.filters.mr_frequencies
1464
+ scipy.signal.iirdesign
1465
+
1466
+ Examples
1467
+ --------
1468
+ >>> import matplotlib.pyplot as plt
1469
+ >>> import numpy as np
1470
+ >>> import scipy.signal
1471
+ >>> semitone_filterbank, sample_rates = librosa.filters.semitone_filterbank(
1472
+ ... center_freqs=librosa.midi_to_hz(np.arange(60, 72)),
1473
+ ... sample_rates=np.repeat(4410.0, 12),
1474
+ ... flayout='sos'
1475
+ ... )
1476
+ >>> magnitudes = []
1477
+ >>> for cur_sr, cur_filter in zip(sample_rates, semitone_filterbank):
1478
+ ... w, h = scipy.signal.sosfreqz(cur_filter, fs=cur_sr, worN=1025)
1479
+ ... magnitudes.append(20 * np.log10(np.abs(h)))
1480
+ >>> fig, ax = plt.subplots(figsize=(12,6))
1481
+ >>> img = librosa.display.specshow(
1482
+ ... np.array(magnitudes),
1483
+ ... x_axis="hz",
1484
+ ... sr=4410,
1485
+ ... y_coords=librosa.midi_to_hz(np.arange(60, 72)),
1486
+ ... vmin=-60,
1487
+ ... vmax=3,
1488
+ ... ax=ax
1489
+ ... )
1490
+ >>> fig.colorbar(img, ax=ax, format="%+2.f dB", label="Magnitude (dB)")
1491
+ >>> ax.set(
1492
+ ... xlim=[200, 600],
1493
+ ... yticks=librosa.midi_to_hz(np.arange(60, 72)),
1494
+ ... title='Magnitude Responses of the Pitch Filterbank',
1495
+ ... xlabel='Frequency (Hz)',
1496
+ ... ylabel='Semitone filter center frequency (Hz)'
1497
+ ... )
1498
+ """
1499
+
1500
+ if (center_freqs is None) and (sample_rates is None):
1501
+ center_freqs, sample_rates = mr_frequencies(tuning)
1502
+
1503
+ filterbank, fb_sample_rates = _multirate_fb(
1504
+ center_freqs=center_freqs, sample_rates=sample_rates, flayout=flayout, **kwargs
1505
+ )
1506
+
1507
+ return filterbank, fb_sample_rates
1508
+
1509
+
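Besides the explicit example in the docstring, the default call (a sketch; with no arguments, the center frequencies and sample rates fall back to `mr_frequencies`) produces the 85-filter piano-range bank:

```python
import librosa

# Default settings: 85 band-pass filters from mr_frequencies(), 'ba' layout
fb, srs = librosa.filters.semitone_filterbank()
print(len(fb), len(srs))    # 85 85
```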
1510
+ @jit(nopython=True, cache=False)
1511
+ def __window_ss_fill(x, win_sq, n_frames, hop_length): # pragma: no cover
1512
+ """Helper function for window sum-square calculation."""
1513
+
1514
+ n = len(x)
1515
+ n_fft = len(win_sq)
1516
+ for i in range(n_frames):
1517
+ sample = i * hop_length
1518
+ x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
1519
+
1520
+
1521
+ def window_sumsquare(
1522
+ *,
1523
+ window: _WindowSpec,
1524
+ n_frames: int,
1525
+ hop_length: int = 512,
1526
+ win_length: Optional[int] = None,
1527
+ n_fft: int = 2048,
1528
+ dtype: DTypeLike = np.float32,
1529
+ norm: Optional[float] = None,
1530
+ ) -> np.ndarray:
1531
+ """Compute the sum-square envelope of a window function at a given hop length.
1532
+
1533
+ This is used to estimate modulation effects induced by windowing observations
1534
+ in short-time Fourier transforms.
1535
+
1536
+ Parameters
1537
+ ----------
1538
+ window : string, tuple, number, callable, or list-like
1539
+ Window specification, as in `get_window`
1540
+ n_frames : int > 0
1541
+ The number of analysis frames
1542
+ hop_length : int > 0
1543
+ The number of samples to advance between frames
1544
+ win_length : [optional]
1545
+ The length of the window function. By default, this matches ``n_fft``.
1546
+ n_fft : int > 0
1547
+ The length of each analysis frame.
1548
+ dtype : np.dtype
1549
+ The data type of the output
1550
+ norm : {np.inf, -np.inf, 0, float > 0, None}
1551
+ Normalization mode used in window construction.
1552
+ Note that this does not affect the squaring operation.
1553
+
1554
+ Returns
1555
+ -------
1556
+ wss : np.ndarray, shape=``(n_fft + hop_length * (n_frames - 1))``
1557
+ The sum-squared envelope of the window function
1558
+
1559
+ Examples
1560
+ --------
1561
+ For a fixed frame length (2048), compare modulation effects for a Hann window
1562
+ at different hop lengths:
1563
+
1564
+ >>> n_frames = 50
1565
+ >>> wss_256 = librosa.filters.window_sumsquare(window='hann', n_frames=n_frames, hop_length=256)
1566
+ >>> wss_512 = librosa.filters.window_sumsquare(window='hann', n_frames=n_frames, hop_length=512)
1567
+ >>> wss_1024 = librosa.filters.window_sumsquare(window='hann', n_frames=n_frames, hop_length=1024)
1568
+
1569
+ >>> import matplotlib.pyplot as plt
1570
+ >>> fig, ax = plt.subplots(nrows=3, sharey=True)
1571
+ >>> ax[0].plot(wss_256)
1572
+ >>> ax[0].set(title='hop_length=256')
1573
+ >>> ax[1].plot(wss_512)
1574
+ >>> ax[1].set(title='hop_length=512')
1575
+ >>> ax[2].plot(wss_1024)
1576
+ >>> ax[2].set(title='hop_length=1024')
1577
+ """
1578
+ if win_length is None:
1579
+ win_length = n_fft
1580
+
1581
+ n = n_fft + hop_length * (n_frames - 1)
1582
+ x = np.zeros(n, dtype=dtype)
1583
+
1584
+ # Compute the squared window at the desired length
1585
+ win_sq = get_window(window, win_length)
1586
+ win_sq = util.normalize(win_sq, norm=norm) ** 2
1587
+ win_sq = util.pad_center(win_sq, size=n_fft)
1588
+
1589
+ # Fill the envelope
1590
+ __window_ss_fill(x, win_sq, n_frames, hop_length)
1591
+
1592
+ return x
1593
+
1594
+
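For a Hann window at 75% overlap the interior of the sum-square envelope is nearly constant at 1.5, which is what makes inverse-STFT normalization well behaved; a quick sketch:

```python
import numpy as np
import librosa

n_fft, hop = 2048, 512   # 75% overlap
wss = librosa.filters.window_sumsquare(
    window="hann", n_frames=50, hop_length=hop, n_fft=n_fft
)

# Away from the edges, sum(w^2) over overlapping frames is ~1.5 for Hann
interior = wss[n_fft:-n_fft]
print(interior.min(), interior.max())   # both ~1.5
```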
1595
+ @cache(level=10)
1596
+ def diagonal_filter(
1597
+ window: _WindowSpec,
1598
+ n: int,
1599
+ *,
1600
+ slope: float = 1.0,
1601
+ angle: Optional[float] = None,
1602
+ zero_mean: bool = False,
1603
+ ) -> np.ndarray:
1604
+ """Build a two-dimensional diagonal filter.
1605
+
1606
+ This is primarily used for smoothing recurrence or self-similarity matrices.
1607
+
1608
+ Parameters
1609
+ ----------
1610
+ window : string, tuple, number, callable, or list-like
1611
+ The window function to use for the filter.
1612
+
1613
+ See `get_window` for details.
1614
+
1615
+ Note that the window used here should be non-negative.
1616
+
1617
+ n : int > 0
1618
+ the length of the filter
1619
+
1620
+ slope : float
1621
+ The slope of the diagonal filter to produce
1622
+
1623
+ angle : float or None
1624
+ If given, the slope parameter is ignored,
1625
+ and angle directly sets the orientation of the filter (in radians).
1626
+ Otherwise, angle is inferred as `arctan(slope)`.
1627
+
1628
+ zero_mean : bool
1629
+ If True, a zero-mean filter is used.
1630
+ Otherwise, a non-negative averaging filter is used.
1631
+
1632
+ This should be enabled if you want to enhance paths and suppress
1633
+ blocks.
1634
+
1635
+ Returns
1636
+ -------
1637
+ kernel : np.ndarray, shape=[(m, m)]
1638
+ The 2-dimensional filter kernel
1639
+
1640
+ Notes
1641
+ -----
1642
+ This function caches at level 10.
1643
+ """
1644
+
1645
+ if angle is None:
1646
+ angle = np.arctan(slope)
1647
+
1648
+ win = np.diag(get_window(window, n, fftbins=False))
1649
+
1650
+ if not np.isclose(angle, np.pi / 4):
1651
+ win = scipy.ndimage.rotate(
1652
+ win, 45 - angle * 180 / np.pi, order=5, prefilter=False
1653
+ )
1654
+
1655
+ np.clip(win, 0, None, out=win)
1656
+ win /= win.sum()
1657
+
1658
+ if zero_mean:
1659
+ win -= win.mean()
1660
+
1661
+ return win
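A hedged end-to-end sketch of the smoothing use case named in the docstring: build a recurrence matrix and convolve it with a diagonal kernel to enhance repeated paths (the chroma features and kernel size here are illustrative choices, not prescribed by this function):

```python
import numpy as np
import scipy.ndimage
import librosa

y, sr = librosa.load(librosa.ex("trumpet"))
chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
R = librosa.segment.recurrence_matrix(chroma, mode="affinity")

# Smooth along diagonals (slope=1) to enhance repetition paths
kernel = librosa.filters.diagonal_filter("hann", 31, slope=1.0)
R_smooth = scipy.ndimage.convolve(R, kernel, mode="constant")
```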
sequence.py ADDED
@@ -0,0 +1,2059 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+ """
4
+ Sequential modeling
5
+ ===================
6
+
7
+ Sequence alignment
8
+ ------------------
9
+ .. autosummary::
10
+ :toctree: generated/
11
+
12
+ dtw
13
+ rqa
14
+
15
+ Viterbi decoding
16
+ ----------------
17
+ .. autosummary::
18
+ :toctree: generated/
19
+
20
+ viterbi
21
+ viterbi_discriminative
22
+ viterbi_binary
23
+
24
+ Transition matrices
25
+ -------------------
26
+ .. autosummary::
27
+ :toctree: generated/
28
+
29
+ transition_uniform
30
+ transition_loop
31
+ transition_cycle
32
+ transition_local
33
+ """
34
+ from __future__ import annotations
35
+
36
+ import numpy as np
37
+ from scipy.spatial.distance import cdist
38
+ from numba import jit
39
+ from .util import pad_center, fill_off_diagonal, is_positive_int, tiny, expand_to
40
+ from .util.exceptions import ParameterError
41
+ from .filters import get_window
42
+ from typing import Any, Iterable, List, Optional, Tuple, Union, overload
43
+ from typing_extensions import Literal
44
+ from ._typing import _WindowSpec, _IntLike_co
45
+
46
+ __all__ = [
47
+ "dtw",
48
+ "dtw_backtracking",
49
+ "rqa",
50
+ "viterbi",
51
+ "viterbi_discriminative",
52
+ "viterbi_binary",
53
+ "transition_uniform",
54
+ "transition_loop",
55
+ "transition_cycle",
56
+ "transition_local",
57
+ ]
58
+
59
+
60
+ @overload
61
+ def dtw(
62
+ X: np.ndarray,
63
+ Y: np.ndarray,
64
+ *,
65
+ metric: str = ...,
66
+ step_sizes_sigma: Optional[np.ndarray] = ...,
67
+ weights_add: Optional[np.ndarray] = ...,
68
+ weights_mul: Optional[np.ndarray] = ...,
69
+ subseq: bool = ...,
70
+ backtrack: Literal[False],
71
+ global_constraints: bool = ...,
72
+ band_rad: float = ...,
73
+ return_steps: Literal[False] = ...,
74
+ ) -> np.ndarray:
75
+ ...
76
+
77
+
78
+ @overload
79
+ def dtw(
80
+ *,
81
+ C: np.ndarray,
82
+ metric: str = ...,
83
+ step_sizes_sigma: Optional[np.ndarray] = ...,
84
+ weights_add: Optional[np.ndarray] = ...,
85
+ weights_mul: Optional[np.ndarray] = ...,
86
+ subseq: bool = ...,
87
+ backtrack: Literal[False],
88
+ global_constraints: bool = ...,
89
+ band_rad: float = ...,
90
+ return_steps: Literal[False] = ...,
91
+ ) -> np.ndarray:
92
+ ...
93
+
94
+
95
+ @overload
96
+ def dtw(
97
+ X: np.ndarray,
98
+ Y: np.ndarray,
99
+ *,
100
+ metric: str = ...,
101
+ step_sizes_sigma: Optional[np.ndarray] = ...,
102
+ weights_add: Optional[np.ndarray] = ...,
103
+ weights_mul: Optional[np.ndarray] = ...,
104
+ subseq: bool = ...,
105
+ backtrack: Literal[False],
106
+ global_constraints: bool = ...,
107
+ band_rad: float = ...,
108
+ return_steps: Literal[True],
109
+ ) -> Tuple[np.ndarray, np.ndarray]:
110
+ ...
111
+
112
+
113
+ @overload
114
+ def dtw(
115
+ *,
116
+ C: np.ndarray,
117
+ metric: str = ...,
118
+ step_sizes_sigma: Optional[np.ndarray] = ...,
119
+ weights_add: Optional[np.ndarray] = ...,
120
+ weights_mul: Optional[np.ndarray] = ...,
121
+ subseq: bool = ...,
122
+ backtrack: Literal[False],
123
+ global_constraints: bool = ...,
124
+ band_rad: float = ...,
125
+ return_steps: Literal[True],
126
+ ) -> Tuple[np.ndarray, np.ndarray]:
127
+ ...
128
+
129
+
130
+ @overload
131
+ def dtw(
132
+ X: np.ndarray,
133
+ Y: np.ndarray,
134
+ *,
135
+ metric: str = ...,
136
+ step_sizes_sigma: Optional[np.ndarray] = ...,
137
+ weights_add: Optional[np.ndarray] = ...,
138
+ weights_mul: Optional[np.ndarray] = ...,
139
+ subseq: bool = ...,
140
+ backtrack: Literal[True] = ...,
141
+ global_constraints: bool = ...,
142
+ band_rad: float = ...,
143
+ return_steps: Literal[False] = ...,
144
+ ) -> Tuple[np.ndarray, np.ndarray]:
145
+ ...
146
+
147
+
148
+ @overload
149
+ def dtw(
150
+ *,
151
+ C: np.ndarray,
152
+ metric: str = ...,
153
+ step_sizes_sigma: Optional[np.ndarray] = ...,
154
+ weights_add: Optional[np.ndarray] = ...,
155
+ weights_mul: Optional[np.ndarray] = ...,
156
+ subseq: bool = ...,
157
+ backtrack: Literal[True] = ...,
158
+ global_constraints: bool = ...,
159
+ band_rad: float = ...,
160
+ return_steps: Literal[False] = ...,
161
+ ) -> Tuple[np.ndarray, np.ndarray]:
162
+ ...
163
+
164
+
165
+ @overload
166
+ def dtw(
167
+ X: np.ndarray,
168
+ Y: np.ndarray,
169
+ *,
170
+ metric: str = ...,
171
+ step_sizes_sigma: Optional[np.ndarray] = ...,
172
+ weights_add: Optional[np.ndarray] = ...,
173
+ weights_mul: Optional[np.ndarray] = ...,
174
+ subseq: bool = ...,
175
+ backtrack: Literal[True] = ...,
176
+ global_constraints: bool = ...,
177
+ band_rad: float = ...,
178
+ return_steps: Literal[True],
179
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
180
+ ...
181
+
182
+
183
+ @overload
184
+ def dtw(
185
+ *,
186
+ C: np.ndarray,
187
+ metric: str = ...,
188
+ step_sizes_sigma: Optional[np.ndarray] = ...,
189
+ weights_add: Optional[np.ndarray] = ...,
190
+ weights_mul: Optional[np.ndarray] = ...,
191
+ subseq: bool = ...,
192
+ backtrack: Literal[True] = ...,
193
+ global_constraints: bool = ...,
194
+ band_rad: float = ...,
195
+ return_steps: Literal[True],
196
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
197
+ ...
198
+
199
+
200
+ def dtw(
201
+ X: Optional[np.ndarray] = None,
202
+ Y: Optional[np.ndarray] = None,
203
+ *,
204
+ C: Optional[np.ndarray] = None,
205
+ metric: str = "euclidean",
206
+ step_sizes_sigma: Optional[np.ndarray] = None,
207
+ weights_add: Optional[np.ndarray] = None,
208
+ weights_mul: Optional[np.ndarray] = None,
209
+ subseq: bool = False,
210
+ backtrack: bool = True,
211
+ global_constraints: bool = False,
212
+ band_rad: float = 0.25,
213
+ return_steps: bool = False,
214
+ ) -> Union[
215
+ np.ndarray, Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray, np.ndarray]
216
+ ]:
217
+ """Dynamic time warping (DTW).
218
+
219
+ This function performs a DTW and path backtracking on two sequences.
220
+ We follow the nomenclature and algorithmic approach as described in [#]_.
221
+
222
+ .. [#] Meinard Mueller
223
+ Fundamentals of Music Processing — Audio, Analysis, Algorithms, Applications
224
+ Springer Verlag, ISBN: 978-3-319-21944-8, 2015.
225
+
226
+ Parameters
227
+ ----------
228
+ X : np.ndarray [shape=(..., K, N)]
229
+ audio feature matrix (e.g., chroma features)
230
+
231
+ If ``X`` has more than two dimensions (e.g., for multi-channel inputs), all leading
232
+ dimensions are used when computing distance to ``Y``.
233
+
234
+ Y : np.ndarray [shape=(..., K, M)]
235
+ audio feature matrix (e.g., chroma features)
236
+
237
+ C : np.ndarray [shape=(N, M)]
238
+ Precomputed distance matrix. If supplied, X and Y must not be supplied and
239
+ ``metric`` will be ignored.
240
+
241
+ metric : str
242
+ Identifier for the cost-function as documented
243
+ in `scipy.spatial.distance.cdist()`
244
+
245
+ step_sizes_sigma : np.ndarray [shape=[n, 2]]
246
+ Specifies allowed step sizes as used by the dtw.
247
+
248
+ weights_add : np.ndarray [shape=[n, ]]
249
+ Additive weights to penalize certain step sizes.
250
+
251
+ weights_mul : np.ndarray [shape=[n, ]]
252
+ Multiplicative weights to penalize certain step sizes.
253
+
254
+ subseq : bool
255
+ Enable subsequence DTW, e.g., for retrieval tasks.
256
+
257
+ backtrack : bool
258
+ Enable backtracking in accumulated cost matrix.
259
+
260
+ global_constraints : bool
261
+ Applies global constraints to the cost matrix ``C`` (Sakoe-Chiba band).
262
+
263
+ band_rad : float
264
+ The Sakoe-Chiba band radius (1/2 of the width) will be
265
+ ``int(radius*min(C.shape))``.
266
+
267
+ return_steps : bool
268
+ If true, the function returns ``steps``, the step matrix, containing
269
+ the indices of the used steps from the cost accumulation step.
270
+
271
+ Returns
272
+ -------
273
+ D : np.ndarray [shape=(N, M)]
274
+ accumulated cost matrix.
275
+ D[N, M] is the total alignment cost.
276
+ When doing subsequence DTW, D[N,:] indicates a matching function.
277
+ wp : np.ndarray [shape=(N, 2)]
278
+ Warping path with index pairs.
279
+ Each row of the array contains an index pair (n, m).
280
+ Only returned when ``backtrack`` is True.
281
+ steps : np.ndarray [shape=(N, M)]
282
+ Step matrix, containing the indices of the used steps from the cost
283
+ accumulation step.
284
+ Only returned when ``return_steps`` is True.
285
+
286
+ Raises
287
+ ------
288
+ ParameterError
289
+ If you are doing diagonal matching and Y is shorter than X or if an
290
+ incompatible combination of X, Y, and C is supplied.
291
+
292
+ If your input dimensions are incompatible.
293
+
294
+ If the cost matrix has NaN values.
295
+
296
+ Examples
297
+ --------
298
+ >>> import numpy as np
299
+ >>> import matplotlib.pyplot as plt
300
+ >>> y, sr = librosa.load(librosa.ex('brahms'), offset=10, duration=15)
301
+ >>> X = librosa.feature.chroma_cens(y=y, sr=sr)
302
+ >>> noise = np.random.rand(X.shape[0], 200)
303
+ >>> Y = np.concatenate((noise, noise, X, noise), axis=1)
304
+ >>> D, wp = librosa.sequence.dtw(X, Y, subseq=True)
305
+ >>> fig, ax = plt.subplots(nrows=2, sharex=True)
306
+ >>> img = librosa.display.specshow(D, x_axis='frames', y_axis='frames',
307
+ ... ax=ax[0])
308
+ >>> ax[0].set(title='DTW cost', xlabel='Noisy sequence', ylabel='Target')
309
+ >>> ax[0].plot(wp[:, 1], wp[:, 0], label='Optimal path', color='y')
310
+ >>> ax[0].legend()
311
+ >>> fig.colorbar(img, ax=ax[0])
312
+ >>> ax[1].plot(D[-1, :] / wp.shape[0])
313
+ >>> ax[1].set(xlim=[0, Y.shape[1]], ylim=[0, 2],
314
+ ... title='Matching cost function')
315
+ """
316
+ # Default Parameters
317
+ default_steps = np.array([[1, 1], [0, 1], [1, 0]], dtype=np.uint32)
318
+ default_weights_add = np.zeros(3, dtype=np.float64)
319
+ default_weights_mul = np.ones(3, dtype=np.float64)
320
+
321
+ if step_sizes_sigma is None:
322
+ # Use the default steps
323
+ step_sizes_sigma = default_steps
324
+
325
+ # Use default weights if none are provided
326
+ if weights_add is None:
327
+ weights_add = default_weights_add
328
+
329
+ if weights_mul is None:
330
+ weights_mul = default_weights_mul
331
+ else:
332
+ # If we have custom steps but no weights, construct them here
333
+ if weights_add is None:
334
+ weights_add = np.zeros(len(step_sizes_sigma), dtype=np.float64)
335
+
336
+ if weights_mul is None:
337
+ weights_mul = np.ones(len(step_sizes_sigma), dtype=np.float64)
338
+
339
+ # Make the default step weights infinite so that they are never
340
+ # preferred over custom steps
341
+ default_weights_add.fill(np.inf)
342
+ default_weights_mul.fill(np.inf)
343
+
344
+ # Append custom steps and weights to our defaults
345
+ step_sizes_sigma = np.concatenate((default_steps, step_sizes_sigma))
346
+ weights_add = np.concatenate((default_weights_add, weights_add))
347
+ weights_mul = np.concatenate((default_weights_mul, weights_mul))
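+ # Illustrative note: with a single custom step [[2, 1]], the combined
+ # table becomes [[1, 1], [0, 1], [1, 0], [2, 1]], and the three default
+ # rows carry infinite weights, so the accumulation step below can only
+ # ever select the custom steps.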
348
+
349
+ # These asserts are bad, but mypy cannot trace the code paths properly
350
+ assert step_sizes_sigma is not None
351
+ assert weights_add is not None
352
+ assert weights_mul is not None
353
+
354
+ if np.any(step_sizes_sigma < 0):
355
+ raise ParameterError("step_sizes_sigma cannot contain negative values")
356
+
357
+ if len(step_sizes_sigma) != len(weights_add):
358
+ raise ParameterError("len(weights_add) must be equal to len(step_sizes_sigma)")
359
+ if len(step_sizes_sigma) != len(weights_mul):
360
+ raise ParameterError("len(weights_mul) must be equal to len(step_sizes_sigma)")
361
+
362
+ if C is None and (X is None or Y is None):
363
+ raise ParameterError("If C is not supplied, both X and Y must be supplied")
364
+ if C is not None and (X is not None or Y is not None):
365
+ raise ParameterError("If C is supplied, both X and Y must not be supplied")
366
+
367
+ c_is_transposed = False
368
+
369
+ # calculate pair-wise distances, unless already supplied.
370
+ # C_local will keep track of whether the distance matrix was supplied
371
+ # by the user (False) or constructed locally (True)
372
+ C_local = False
373
+ if C is None:
374
+ C_local = True
375
+ # mypy can't figure out that this case does not happen
376
+ assert X is not None and Y is not None
377
+ # take care of dimensions
378
+ X = np.atleast_2d(X)
379
+ Y = np.atleast_2d(Y)
380
+
381
+ # Perform some shape-squashing here
382
+ # Put the time axes around front
383
+ # Suppress types because mypy doesn't know these are ndarrays
384
+ X = np.swapaxes(X, -1, 0) # type: ignore
385
+ Y = np.swapaxes(Y, -1, 0) # type: ignore
386
+
387
+ # Flatten the remaining dimensions
388
+ # Use F-ordering to preserve columns
389
+ X = X.reshape((X.shape[0], -1), order="F")
390
+ Y = Y.reshape((Y.shape[0], -1), order="F")
391
+
392
+ try:
393
+ C = cdist(X, Y, metric=metric)
394
+ except ValueError as exc:
395
+ raise ParameterError(
396
+ "scipy.spatial.distance.cdist returned an error.\n"
397
+ "Please provide your input in the form X.shape=(K, N) "
398
+ "and Y.shape=(K, M).\n 1-dimensional sequences should "
399
+ "be reshaped to X.shape=(1, N) and Y.shape=(1, M)."
400
+ ) from exc
401
+
402
+ # for subsequence matching:
403
+ # if N > M, Y can be a subsequence of X
404
+ if subseq and (X.shape[0] > Y.shape[0]):
405
+ C = C.T
406
+ c_is_transposed = True
407
+
408
+ C = np.atleast_2d(C)
409
+
410
+ # if diagonal matching, Y has to be longer than X
411
+ # (X simply cannot be contained in Y)
412
+ if np.array_equal(step_sizes_sigma, np.array([[1, 1]])) and (
413
+ C.shape[0] > C.shape[1]
414
+ ):
415
+ raise ParameterError(
416
+ "For diagonal matching: Y.shape[-1] >= X.shape[-11] "
417
+ "(C.shape[1] >= C.shape[0])"
418
+ )
419
+
420
+ max_0 = step_sizes_sigma[:, 0].max()
421
+ max_1 = step_sizes_sigma[:, 1].max()
422
+
423
+ # check C here for nans before building global constraints
424
+ if np.any(np.isnan(C)):
425
+ raise ParameterError("DTW cost matrix C has NaN values. ")
426
+
427
+ if global_constraints:
428
+ # Apply global constraints to the cost matrix
429
+ if not C_local:
430
+ # If C was provided as input, make a copy here
431
+ C = np.copy(C)
432
+ fill_off_diagonal(C, radius=band_rad, value=np.inf)
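+ # For example, under the band_rad contract documented above,
+ # band_rad=0.25 on a 100x120 cost matrix masks every cell farther than
+ # int(0.25 * 100) = 25 frames from the (stretched) main diagonal.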
433
+
434
+ # initialize whole matrix with infinity values
435
+ D = np.ones(C.shape + np.array([max_0, max_1])) * np.inf
436
+
437
+ # set starting point to C[0, 0]
438
+ D[max_0, max_1] = C[0, 0]
439
+
440
+ if subseq:
441
+ D[max_0, max_1:] = C[0, :]
442
+
443
+ # initialize step matrix with -1
444
+ # will be filled in calc_accu_cost() with indices from step_sizes_sigma
445
+ steps = np.zeros(D.shape, dtype=np.int32)
446
+
447
+ # these steps correspond to left- (first row) and up-(first column) moves
448
+ steps[0, :] = 1
449
+ steps[:, 0] = 2
450
+
451
+ # calculate accumulated cost matrix
452
+ D: np.ndarray
453
+ steps: np.ndarray
454
+ D, steps = __dtw_calc_accu_cost(
455
+ C, D, steps, step_sizes_sigma, weights_mul, weights_add, max_0, max_1
456
+ )
457
+
458
+ # delete infinity rows and columns
459
+ D = D[max_0:, max_1:]
460
+ steps = steps[max_0:, max_1:]
461
+
462
+ return_values: List[np.ndarray]
463
+ if backtrack:
464
+ wp: np.ndarray
465
+ if subseq:
466
+ if np.all(np.isinf(D[-1])):
467
+ raise ParameterError(
468
+ "No valid sub-sequence warping path could "
469
+ "be constructed with the given step sizes."
470
+ )
471
+ start = np.argmin(D[-1, :])
472
+ _wp = __dtw_backtracking(steps, step_sizes_sigma, subseq, start)
473
+ else:
474
+ # perform warping path backtracking
475
+ if np.isinf(D[-1, -1]):
476
+ raise ParameterError(
477
+ "No valid sub-sequence warping path could "
478
+ "be constructed with the given step sizes."
479
+ )
480
+
481
+ _wp = __dtw_backtracking(steps, step_sizes_sigma, subseq)
482
+ if _wp[-1] != (0, 0):
483
+ raise ParameterError(
484
+ "Unable to compute a full DTW warping path. "
485
+ "You may want to try again with subseq=True."
486
+ )
487
+
488
+ wp = np.asarray(_wp, dtype=int)
489
+
490
+ # since we transposed in the beginning, we have to adjust the index pairs back
491
+ if subseq and (
492
+ (X is not None and Y is not None and X.shape[0] > Y.shape[0])
493
+ or c_is_transposed
494
+ or C.shape[0] > C.shape[1]
495
+ ):
496
+ wp = np.fliplr(wp)
497
+ return_values = [D, wp]
498
+ else:
499
+ return_values = [D]
500
+
501
+ if return_steps:
502
+ return_values.append(steps)
503
+
504
+ if len(return_values) > 1:
505
+ # Suppressing type check here because mypy can't
506
+ # infer the exact length of the tuple
507
+ return tuple(return_values) # type: ignore
508
+ else:
509
+ return return_values[0]
510
+
511
+
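+ # A minimal usage sketch for ``dtw`` with custom step sizes. This is
+ # illustrative only; the arrays below are hypothetical, not part of this
+ # module:
+ #
+ #   X = np.random.rand(12, 100)   # e.g., chroma-like features
+ #   Y = np.random.rand(12, 120)
+ #   sigma = np.array([[1, 1], [1, 2], [2, 1]])
+ #   D, wp = dtw(X, Y, metric="cosine", step_sizes_sigma=sigma)
+ #   total_cost = D[-1, -1]        # cost of the full alignment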
512
+ @jit(nopython=True, cache=False) # type: ignore
513
+ def __dtw_calc_accu_cost(
514
+ C: np.ndarray,
515
+ D: np.ndarray,
516
+ steps: np.ndarray,
517
+ step_sizes_sigma: np.ndarray,
518
+ weights_mul: np.ndarray,
519
+ weights_add: np.ndarray,
520
+ max_0: int,
521
+ max_1: int,
522
+ ) -> Tuple[np.ndarray, np.ndarray]: # pragma: no cover
523
+ """Calculate the accumulated cost matrix D.
524
+
525
+ Use dynamic programming to calculate the accumulated costs.
526
+
527
+ Parameters
528
+ ----------
529
+ C : np.ndarray [shape=(N, M)]
530
+ pre-computed cost matrix
531
+ D : np.ndarray [shape=(N, M)]
532
+ accumulated cost matrix
533
+ steps : np.ndarray [shape=(N, M)]
534
+ Step matrix, containing the indices of the used steps from the cost
535
+ accumulation step.
536
+ step_sizes_sigma : np.ndarray [shape=[n, 2]]
537
+ Specifies allowed step sizes as used by the dtw.
538
+ weights_add : np.ndarray [shape=[n, ]]
539
+ Additive weights to penalize certain step sizes.
540
+ weights_mul : np.ndarray [shape=[n, ]]
541
+ Multiplicative weights to penalize certain step sizes.
542
+ max_0 : int
543
+ maximum number of steps in step_sizes_sigma in dim 0.
544
+ max_1 : int
545
+ maximum number of steps in step_sizes_sigma in dim 1.
546
+
547
+ Returns
548
+ -------
549
+ D : np.ndarray [shape=(N, M)]
550
+ accumulated cost matrix.
551
+ D[N, M] is the total alignment cost.
552
+ When doing subsequence DTW, D[N,:] indicates a matching function.
553
+ steps : np.ndarray [shape=(N, M)]
554
+ Step matrix, containing the indices of the used steps from the cost
555
+ accumulation step.
556
+
557
+ See Also
558
+ --------
559
+ dtw
560
+ """
561
+ for cur_n in range(max_0, D.shape[0]):
562
+ for cur_m in range(max_1, D.shape[1]):
563
+ # accumulate costs
564
+ for cur_step_idx, cur_w_add, cur_w_mul in zip(
565
+ range(step_sizes_sigma.shape[0]), weights_add, weights_mul
566
+ ):
567
+ cur_D = D[
568
+ cur_n - step_sizes_sigma[cur_step_idx, 0],
569
+ cur_m - step_sizes_sigma[cur_step_idx, 1],
570
+ ]
571
+ cur_C = cur_w_mul * C[cur_n - max_0, cur_m - max_1]
572
+ cur_C += cur_w_add
573
+ cur_cost = cur_D + cur_C
574
+
575
+ # check if cur_cost is smaller than the one stored in D
576
+ if cur_cost < D[cur_n, cur_m]:
577
+ D[cur_n, cur_m] = cur_cost
578
+
579
+ # save step-index
580
+ steps[cur_n, cur_m] = cur_step_idx
581
+
582
+ return D, steps
583
+
584
+
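+ # The recurrence implemented above, written out: in the padded coordinate
+ # system (C is indexed at (n - max_0, m - max_1)), each cell takes
+ #
+ #   D[n, m] = min_k  D[n - sigma[k, 0], m - sigma[k, 1]]
+ #                    + weights_mul[k] * C[n, m] + weights_add[k]
+ #
+ # with steps[n, m] recording the argmin index k.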
585
+ @jit(nopython=True, cache=False) # type: ignore
586
+ def __dtw_backtracking(
587
+ steps: np.ndarray,
588
+ step_sizes_sigma: np.ndarray,
589
+ subseq: bool,
590
+ start: Optional[int] = None,
591
+ ) -> List[Tuple[int, int]]: # pragma: no cover
592
+ """Backtrack optimal warping path.
593
+
594
+ Uses the saved step sizes from the cost accumulation
595
+ step to backtrack the index pairs for an optimal
596
+ warping path.
597
+
598
+ Parameters
599
+ ----------
600
+ steps : np.ndarray [shape=(N, M)]
601
+ Step matrix, containing the indices of the used steps from the cost
602
+ accumulation step.
603
+ step_sizes_sigma : np.ndarray [shape=[n, 2]]
604
+ Specifies allowed step sizes as used by the dtw.
605
+ subseq : bool
606
+ Enable subsequence DTW, e.g., for retrieval tasks.
607
+ start : int
608
+ Start column index for backtracking (only allowed for ``subseq=True``)
609
+
610
+ Returns
611
+ -------
612
+ wp : list [shape=(N,)]
613
+ Warping path with index pairs.
614
+ Each list entry contains an index pair
615
+ (n, m) as a tuple
616
+
617
+ See Also
618
+ --------
619
+ dtw
620
+ """
621
+ if start is None:
622
+ cur_idx = (steps.shape[0] - 1, steps.shape[1] - 1)
623
+ else:
624
+ cur_idx = (steps.shape[0] - 1, start)
625
+
626
+ wp = []
627
+ # Set starting point D(N, M) and append it to the path
628
+ wp.append((cur_idx[0], cur_idx[1]))
629
+
630
+ # Loop backwards.
631
+ # Stop criteria:
632
+ # Setting it to (0, 0) does not work for the subsequence dtw,
633
+ # so we only ask to reach the first row of the matrix.
634
+
635
+ while (subseq and cur_idx[0] > 0) or (not subseq and cur_idx != (0, 0)):
636
+ cur_step_idx = steps[(cur_idx[0], cur_idx[1])]
637
+
638
+ # save tuple with minimal acc. cost in path
639
+ cur_idx = (
640
+ cur_idx[0] - step_sizes_sigma[cur_step_idx][0],
641
+ cur_idx[1] - step_sizes_sigma[cur_step_idx][1],
642
+ )
643
+
644
+ # If we run off the side of the cost matrix, break here
645
+ if min(cur_idx) < 0:
646
+ break
647
+
648
+ # append to warping path
649
+ wp.append((cur_idx[0], cur_idx[1]))
650
+
651
+ return wp
652
+
653
+
654
+ def dtw_backtracking(
655
+ steps: np.ndarray,
656
+ *,
657
+ step_sizes_sigma: Optional[np.ndarray] = None,
658
+ subseq: bool = False,
659
+ start: Optional[Union[int, np.integer[Any]]] = None,
660
+ ) -> np.ndarray:
661
+ """Backtrack a warping path.
662
+
663
+ Uses the saved step sizes from the cost accumulation
664
+ step to backtrack the index pairs for a warping path.
665
+
666
+ Parameters
667
+ ----------
668
+ steps : np.ndarray [shape=(N, M)]
669
+ Step matrix, containing the indices of the used steps from the cost
670
+ accumulation step.
671
+ step_sizes_sigma : np.ndarray [shape=[n, 2]]
672
+ Specifies allowed step sizes as used by the dtw.
673
+ subseq : bool
674
+ Enable subsequence DTW, e.g., for retrieval tasks.
675
+ start : int
676
+ Start column index for backtracking (only allowed for ``subseq=True``)
677
+
678
+ Returns
679
+ -------
680
+ wp : np.ndarray [shape=(N, 2)]
681
+ Warping path with index pairs.
682
+ Each row of the array contains an index pair (n, m).
684
+
685
+ See Also
686
+ --------
687
+ dtw
688
+ """
689
+ if subseq is False and start is not None:
690
+ raise ParameterError(
691
+ f"start is only allowed to be set if subseq is True (start={start}, subseq={subseq})"
692
+ )
693
+
694
+ # Default Parameters
695
+ default_steps = np.array([[1, 1], [0, 1], [1, 0]], dtype=np.uint32)
696
+
697
+ if step_sizes_sigma is None:
698
+ # Use the default steps
699
+ step_sizes_sigma = default_steps
700
+ else:
701
+ # Append custom steps and weights to our defaults
702
+ step_sizes_sigma = np.concatenate((default_steps, step_sizes_sigma))
703
+
704
+ wp = __dtw_backtracking(steps, step_sizes_sigma, subseq, start)
705
+ return np.asarray(wp, dtype=int)
706
+
707
+
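+ # Hypothetical usage sketch: recover a warping path later from a saved
+ # step matrix (variable names are illustrative):
+ #
+ #   D, steps = dtw(X, Y, backtrack=False, return_steps=True)
+ #   wp = dtw_backtracking(steps)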
708
+ @overload
709
+ def rqa(
710
+ sim: np.ndarray,
711
+ *,
712
+ gap_onset: float = ...,
713
+ gap_extend: float = ...,
714
+ knight_moves: bool = ...,
715
+ backtrack: Literal[False],
716
+ ) -> np.ndarray:
717
+ ...
718
+
719
+
720
+ @overload
721
+ def rqa(
722
+ sim: np.ndarray,
723
+ *,
724
+ gap_onset: float = ...,
725
+ gap_extend: float = ...,
726
+ knight_moves: bool = ...,
727
+ backtrack: Literal[True] = ...,
728
+ ) -> Tuple[np.ndarray, np.ndarray]:
729
+ ...
730
+
731
+
732
+ @overload
733
+ def rqa(
734
+ sim: np.ndarray,
735
+ *,
736
+ gap_onset: float = ...,
737
+ gap_extend: float = ...,
738
+ knight_moves: bool = ...,
739
+ backtrack: bool = ...,
740
+ ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
741
+ ...
742
+
743
+
744
+ def rqa(
745
+ sim: np.ndarray,
746
+ *,
747
+ gap_onset: float = 1,
748
+ gap_extend: float = 1,
749
+ knight_moves: bool = True,
750
+ backtrack: bool = True,
751
+ ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
752
+ """Recurrence quantification analysis (RQA)
753
+
754
+ This function implements different forms of RQA as described by
755
+ Serra, Serra, and Andrzejak (SSA). [#]_ These methods take as input
756
+ a self- or cross-similarity matrix ``sim``, and calculate the value
757
+ of path alignments by dynamic programming.
758
+
759
+ Note that unlike dynamic time warping (`dtw`), alignment paths here are
760
+ maximized, not minimized, so the input should measure similarity rather
761
+ than distance.
762
+
763
+ The simplest RQA method, denoted as `L` (SSA equation 3) and equivalent
764
+ to the method described by Eckmann, Kamphorst, and Ruelle [#]_, accumulates
765
+ the length of diagonal paths with positive values in the input:
766
+
767
+ - ``score[i, j] = score[i-1, j-1] + 1`` if ``sim[i, j] > 0``
768
+ - ``score[i, j] = 0`` otherwise.
769
+
770
+ The second method, denoted as `S` (SSA equation 4), is similar to the first,
771
+ but allows for "knight moves" (as in the chess piece) in addition to strict
772
+ diagonal moves:
773
+
774
+ - ``score[i, j] = max(score[i-1, j-1], score[i-2, j-1], score[i-1, j-2]) + 1`` if ``sim[i, j] >
775
+ 0``
776
+ - ``score[i, j] = 0`` otherwise.
777
+
778
+ The third method, denoted as `Q` (SSA equations 5 and 6) extends this by
779
+ allowing gaps in the alignment that incur some cost, rather than a hard
780
+ reset to 0 whenever ``sim[i, j] == 0``.
781
+ Gaps are penalized by two additional parameters, ``gap_onset`` and ``gap_extend``,
782
+ which are subtracted from the value of the alignment path every time a gap
783
+ is introduced or extended (respectively).
784
+
785
+ Note that setting ``gap_onset`` and ``gap_extend`` to `np.inf` recovers the second
786
+ method, and disabling knight moves recovers the first.
787
+
788
+ .. [#] Serrà, Joan, Xavier Serra, and Ralph G. Andrzejak.
789
+ "Cross recurrence quantification for cover song identification."
790
+ New Journal of Physics 11, no. 9 (2009): 093017.
791
+
792
+ .. [#] Eckmann, J. P., S. Oliffson Kamphorst, and D. Ruelle.
793
+ "Recurrence plots of dynamical systems."
794
+ World Scientific Series on Nonlinear Science Series A 16 (1995): 441-446.
795
+
796
+ Parameters
797
+ ----------
798
+ sim : np.ndarray [shape=(N, M), non-negative]
799
+ The similarity matrix to use as input.
800
+
801
+ This can either be a recurrence matrix (self-similarity)
802
+ or a cross-similarity matrix between two sequences.
803
+
804
+ gap_onset : float > 0
805
+ Penalty for introducing a gap to an alignment sequence
806
+
807
+ gap_extend : float > 0
808
+ Penalty for extending a gap in an alignment sequence
809
+
810
+ knight_moves : bool
811
+ If ``True`` (default), allow for "knight moves" in the alignment,
812
+ e.g., ``(n, m) => (n + 1, m + 2)`` or ``(n + 2, m + 1)``.
813
+
814
+ If ``False``, only allow for diagonal moves ``(n, m) => (n + 1, m + 1)``.
815
+
816
+ backtrack : bool
817
+ If ``True``, return the alignment path.
818
+
819
+ If ``False``, only return the score matrix.
820
+
821
+ Returns
822
+ -------
823
+ score : np.ndarray [shape=(N, M)]
824
+ The alignment score matrix. ``score[n, m]`` is the cumulative value of
825
+ the best alignment sequence ending in frames ``n`` and ``m``.
826
+ path : np.ndarray [shape=(k, 2)] (optional)
827
+ If ``backtrack=True``, ``path`` contains a list of pairs of aligned frames
828
+ in the best alignment sequence.
829
+
830
+ ``path[i] = [n, m]`` indicates that row ``n`` aligns to column ``m``.
831
+
832
+ See Also
833
+ --------
834
+ librosa.segment.recurrence_matrix
835
+ librosa.segment.cross_similarity
836
+ dtw
837
+
838
+ Examples
839
+ --------
840
+ Simple diagonal path enhancement (L-mode)
841
+
842
+ >>> import numpy as np
843
+ >>> import matplotlib.pyplot as plt
844
+ >>> y, sr = librosa.load(librosa.ex('nutcracker'), duration=30)
845
+ >>> chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
846
+ >>> # Use time-delay embedding to reduce noise
847
+ >>> chroma_stack = librosa.feature.stack_memory(chroma, n_steps=10, delay=3)
848
+ >>> # Build recurrence, suppress self-loops within 1 second
849
+ >>> rec = librosa.segment.recurrence_matrix(chroma_stack, width=43,
850
+ ... mode='affinity',
851
+ ... metric='cosine')
852
+ >>> # using infinite cost for gaps enforces strict path continuation
853
+ >>> L_score, L_path = librosa.sequence.rqa(rec,
854
+ ... gap_onset=np.inf,
855
+ ... gap_extend=np.inf,
856
+ ... knight_moves=False)
857
+ >>> fig, ax = plt.subplots(ncols=2)
858
+ >>> librosa.display.specshow(rec, x_axis='frames', y_axis='frames', ax=ax[0])
859
+ >>> ax[0].set(title='Recurrence matrix')
860
+ >>> librosa.display.specshow(L_score, x_axis='frames', y_axis='frames', ax=ax[1])
861
+ >>> ax[1].set(title='Alignment score matrix')
862
+ >>> ax[1].plot(L_path[:, 1], L_path[:, 0], label='Optimal path', color='c')
863
+ >>> ax[1].legend()
864
+ >>> ax[1].label_outer()
865
+
866
+ Full alignment using gaps and knight moves
867
+
868
+ >>> # New gaps cost 5, extending old gaps cost 10 for each step
869
+ >>> score, path = librosa.sequence.rqa(rec, gap_onset=5, gap_extend=10)
870
+ >>> fig, ax = plt.subplots(ncols=2, sharex=True, sharey=True)
871
+ >>> librosa.display.specshow(rec, x_axis='frames', y_axis='frames', ax=ax[0])
872
+ >>> ax[0].set(title='Recurrence matrix')
873
+ >>> librosa.display.specshow(score, x_axis='frames', y_axis='frames', ax=ax[1])
874
+ >>> ax[1].set(title='Alignment score matrix')
875
+ >>> ax[1].plot(path[:, 1], path[:, 0], label='Optimal path', color='c')
876
+ >>> ax[1].legend()
877
+ >>> ax[1].label_outer()
878
+ """
879
+
880
+ if gap_onset < 0:
881
+ raise ParameterError("gap_onset={} must be strictly positive")
882
+ if gap_extend < 0:
883
+ raise ParameterError("gap_extend={} must be strictly positive")
884
+
885
+ score: np.ndarray
886
+ pointers: np.ndarray
887
+ score, pointers = __rqa_dp(sim, gap_onset, gap_extend, knight_moves)
888
+ if backtrack:
889
+ path = __rqa_backtrack(score, pointers)
890
+ return score, path
891
+
892
+ return score
893
+
894
+
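+ # Worked sketch of the simplest (L-mode) recurrence on a toy input,
+ # for illustration only:
+ #
+ #   sim = np.eye(4)     # a single perfect diagonal of matches
+ #   score = rqa(sim, gap_onset=np.inf, gap_extend=np.inf,
+ #               knight_moves=False, backtrack=False)
+ #   # the diagonal of ``score`` accumulates 1, 2, 3, 4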
895
+ @jit(nopython=True, cache=False) # type: ignore
896
+ def __rqa_dp(
897
+ sim: np.ndarray, gap_onset: float, gap_extend: float, knight: bool
898
+ ) -> Tuple[np.ndarray, np.ndarray]: # pragma: no cover
899
+ """RQA dynamic programming implementation"""
900
+
901
+ # The output array
902
+ score = np.zeros(sim.shape, dtype=sim.dtype)
903
+
904
+ # The backtracking array
905
+ backtrack = np.zeros(sim.shape, dtype=np.int8)
906
+
907
+ # These are place-holder arrays to limit the points being considered
908
+ # at each step of the DP
909
+ #
910
+ # If knight moves are enabled, values are indexed according to
911
+ # [(-1,-1), (-1, -2), (-2, -1)]
912
+ #
913
+ # If knight moves are disabled, then only the first entry is used.
914
+ #
915
+ # Using dummy vectors here makes the code a bit cleaner down below.
916
+ sim_values = np.zeros(3)
917
+ score_values = np.zeros(3)
918
+ vec = np.zeros(3)
919
+
920
+ if knight:
921
+ # Initial limit is for the base case: diagonal + one knight
922
+ init_limit = 2
923
+
924
+ # Otherwise, we have 3 positions
925
+ limit = 3
926
+ else:
927
+ init_limit = 1
928
+ limit = 1
929
+
930
+ # backtracking rubric:
931
+ # 0 ==> diagonal move
932
+ # 1 ==> knight move up
933
+ # 2 ==> knight move left
934
+ # -1 ==> reset without inclusion
935
+ # -2 ==> reset with inclusion (ie positive value at init)
936
+
937
+ # Initialize the first row and column with the data
938
+ score[0, :] = sim[0, :]
939
+ score[:, 0] = sim[:, 0]
940
+
941
+ # backtracking initialization: the first row and column are all resets
942
+ # if there's a positive link here, it's an inclusive reset
943
+ for i in range(sim.shape[0]):
944
+ if sim[i, 0]:
945
+ backtrack[i, 0] = -2
946
+ else:
947
+ backtrack[i, 0] = -1
948
+
949
+ for j in range(sim.shape[1]):
950
+ if sim[0, j]:
951
+ backtrack[0, j] = -2
952
+ else:
953
+ backtrack[0, j] = -1
954
+
955
+ # Initialize the 1-1 case using only the diagonal
956
+ if sim[1, 1] > 0:
957
+ score[1, 1] = score[0, 0] + sim[1, 1]
958
+ backtrack[1, 1] = 0
959
+ else:
960
+ link = sim[0, 0] > 0
961
+ score[1, 1] = max(0, score[0, 0] - (link) * gap_onset - (~link) * gap_extend)
962
+ if score[1, 1] > 0:
963
+ backtrack[1, 1] = 0
964
+ else:
965
+ backtrack[1, 1] = -1
966
+
967
+ # Initialize the second row with diagonal and left-knight moves
968
+ i = 1
969
+ for j in range(2, sim.shape[1]):
970
+ score_values[:-1] = (score[i - 1, j - 1], score[i - 1, j - 2])
971
+ sim_values[:-1] = (sim[i - 1, j - 1], sim[i - 1, j - 2])
972
+ t_values = sim_values > 0
973
+ if sim[i, j] > 0:
974
+ backtrack[i, j] = np.argmax(score_values[:init_limit])
975
+ score[i, j] = score_values[backtrack[i, j]] + sim[i, j] # or + 1 for binary
976
+ else:
977
+ vec[:init_limit] = (
978
+ score_values[:init_limit]
979
+ - t_values[:init_limit] * gap_onset
980
+ - (~t_values[:init_limit]) * gap_extend
981
+ )
982
+
983
+ backtrack[i, j] = np.argmax(vec[:init_limit])
984
+ score[i, j] = max(0, vec[backtrack[i, j]])
985
+ # Is it a reset?
986
+ if score[i, j] == 0:
987
+ backtrack[i, j] = -1
988
+
989
+ # Initialize the second column with diagonal and up-knight moves
990
+ j = 1
991
+ for i in range(2, sim.shape[0]):
992
+ score_values[:-1] = (score[i - 1, j - 1], score[i - 2, j - 1])
993
+ sim_values[:-1] = (sim[i - 1, j - 1], sim[i - 2, j - 1])
994
+ t_values = sim_values > 0
995
+ if sim[i, j] > 0:
996
+ backtrack[i, j] = np.argmax(score_values[:init_limit])
997
+ score[i, j] = score_values[backtrack[i, j]] + sim[i, j] # or + 1 for binary
998
+
999
+ else:
1000
+ vec[:init_limit] = (
1001
+ score_values[:init_limit]
1002
+ - t_values[:init_limit] * gap_onset
1003
+ - (~t_values[:init_limit]) * gap_extend
1004
+ )
1005
+
1006
+ backtrack[i, j] = np.argmax(vec[:init_limit])
1007
+ score[i, j] = max(0, vec[backtrack[i, j]])
1008
+ # Is it a reset?
1009
+ if score[i, j] == 0:
1010
+ backtrack[i, j] = -1
1011
+
1012
+ # Now fill in the rest of the table
1013
+ for i in range(2, sim.shape[0]):
1014
+ for j in range(2, sim.shape[1]):
1015
+ score_values[:] = (
1016
+ score[i - 1, j - 1],
1017
+ score[i - 1, j - 2],
1018
+ score[i - 2, j - 1],
1019
+ )
1020
+ sim_values[:] = (sim[i - 1, j - 1], sim[i - 1, j - 2], sim[i - 2, j - 1])
1021
+ t_values = sim_values > 0
1022
+ if sim[i, j] > 0:
1023
+ # if knight is true, it's max of (-1,-1), (-1, -2), (-2, -1)
1024
+ # otherwise, it's just the diagonal move (-1, -1)
1025
+ # for backtracking purposes, if the max is 0 then it's the start of a new sequence
1026
+ # if the max is non-zero, then we extend the existing sequence
1027
+ backtrack[i, j] = np.argmax(score_values[:limit])
1028
+ score[i, j] = (
1029
+ score_values[backtrack[i, j]] + sim[i, j]
1030
+ ) # or + 1 for binary
1031
+
1032
+ else:
1033
+ # if the max of our options is negative, then it's a hard reset
1034
+ # otherwise, it's a skip move
1035
+ vec[:limit] = (
1036
+ score_values[:limit]
1037
+ - t_values[:limit] * gap_onset
1038
+ - (~t_values[:limit]) * gap_extend
1039
+ )
1040
+
1041
+ backtrack[i, j] = np.argmax(vec[:limit])
1042
+ score[i, j] = max(0, vec[backtrack[i, j]])
1043
+ # Is it a reset?
1044
+ if score[i, j] == 0:
1045
+ backtrack[i, j] = -1
1046
+
1047
+ return score, backtrack
1048
+
1049
+
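+ # In the gap branches of __rqa_dp above, the candidate scores reduce to
+ # (SSA eqs. 5-6):
+ #
+ #   vec[k] = score_prev[k] - gap_onset    if sim_prev[k] > 0 (opening a gap)
+ #   vec[k] = score_prev[k] - gap_extend   otherwise (extending a gap)
+ #
+ # and a cell resets to zero whenever the best candidate is negative.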
1050
+ def __rqa_backtrack(score, pointers):
1051
+ """RQA path backtracking
1052
+
1053
+ Given the score matrix and backtracking index array,
1054
+ reconstruct the optimal path.
1055
+ """
1056
+
1057
+ # backtracking rubric:
1058
+ # 0 ==> diagonal move
1059
+ # 1 ==> knight move up
1060
+ # 2 ==> knight move left
1061
+ # -1 ==> reset (sim = 0)
1062
+ # -2 ==> start of sequence (sim > 0)
1063
+
1064
+ # This array maps the backtracking values to the
1065
+ # relative index offsets
1066
+ offsets = [(-1, -1), (-1, -2), (-2, -1)]
1067
+
1068
+ # Find the maximum to end the path
1069
+ idx = list(np.unravel_index(np.argmax(score), score.shape))
1070
+
1071
+ # Construct the path
1072
+ path: List = []
1073
+ while True:
1074
+ bt_index = pointers[tuple(idx)]
1075
+
1076
+ # A -1 indicates a non-inclusive reset
1077
+ # this can only happen when sim[idx] == 0,
1078
+ # and a reset with zero score should not be included
1079
+ # in the path. In this case, we're done.
1080
+ if bt_index == -1:
1081
+ break
1082
+
1083
+ # Other bt_index values are okay for inclusion
1084
+ path.insert(0, idx)
1085
+
1086
+ # -2 indicates beginning of sequence,
1087
+ # so we can't backtrack any further
1088
+ if bt_index == -2:
1089
+ break
1090
+
1091
+ # Otherwise, prepend this index and continue
1092
+ idx = [idx[_] + offsets[bt_index][_] for _ in range(len(idx))]
1093
+
1094
+ # If there's no alignment path at all, eg an empty cross-similarity
1095
+ # matrix, return a properly shaped and typed array
1096
+ if not path:
1097
+ return np.empty((0, 2), dtype=np.uint)
1098
+
1099
+ return np.asarray(path, dtype=np.uint)
1100
+
1101
+
1102
+ @jit(nopython=True, cache=False) # type: ignore
1103
+ def _viterbi(
1104
+ log_prob: np.ndarray, log_trans: np.ndarray, log_p_init: np.ndarray
1105
+ ) -> Tuple[np.ndarray, np.ndarray]: # pragma: no cover
1106
+ """Core Viterbi algorithm.
1107
+
1108
+ This is intended for internal use only.
1109
+
1110
+ Parameters
1111
+ ----------
1112
+ log_prob : np.ndarray [shape=(T, m)]
1113
+ ``log_prob[t, s]`` is the conditional log-likelihood
1114
+ ``log P[X = X(t) | State(t) = s]``
1115
+ log_trans : np.ndarray [shape=(m, m)]
1116
+ The log transition matrix
1117
+ ``log_trans[i, j] = log P[State(t+1) = j | State(t) = i]``
1118
+ log_p_init : np.ndarray [shape=(m,)]
1119
+ log of the initial state distribution
1120
+
1121
+ Returns
1122
+ -------
1123
+ state : np.ndarray [shape=(T,)]
1125
+ The most likely state sequence.
+ logp : np.ndarray [shape=(1,)]
+ The log probability of the decoded state sequence.
1125
+ """
1126
+ n_steps, n_states = log_prob.shape
1127
+
1128
+ state = np.zeros(n_steps, dtype=np.uint16)
1129
+ value = np.zeros((n_steps, n_states), dtype=np.float64)
1130
+ ptr = np.zeros((n_steps, n_states), dtype=np.uint16)
1131
+
1132
+ # factor in initial state distribution
1133
+ value[0] = log_prob[0] + log_p_init
1134
+
1135
+ for t in range(1, n_steps):
1136
+ # Want V[t, j] <- p[t, j] * max_k V[t-1, k] * A[k, j]
1137
+ # assume at time t-1 we were in state k
1138
+ # transition k -> j
1139
+
1140
+ # Broadcast over rows:
1141
+ # Tout[k, j] = V[t-1, k] * A[k, j]
1142
+ # then take the max over columns
1143
+ # We'll do this in log-space for stability
1144
+
1145
+ trans_out = value[t - 1] + log_trans.T
1146
+
1147
+ # Unroll the max/argmax loop to enable numba support
1148
+ for j in range(n_states):
1149
+ ptr[t, j] = np.argmax(trans_out[j])
1150
+ # value[t, j] = log_prob[t, j] + np.max(trans_out[j])
1151
+ value[t, j] = log_prob[t, j] + trans_out[j, ptr[t][j]]
1152
+
1153
+ # Now roll backward
1154
+
1155
+ # Get the last state
1156
+ state[-1] = np.argmax(value[-1])
1157
+
1158
+ for t in range(n_steps - 2, -1, -1):
1159
+ state[t] = ptr[t + 1, state[t + 1]]
1160
+
1161
+ logp = value[-1:, state[-1]]
1162
+
1163
+ return state, logp
1164
+
1165
+
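+ # The core recurrence above, in log space:
+ #
+ #   value[t, j] = log_prob[t, j] + max_k (value[t-1, k] + log_trans[k, j])
+ #   ptr[t, j]   = argmax_k (value[t-1, k] + log_trans[k, j])
+ #
+ # followed by a backward pass that reads the state sequence out of ``ptr``.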
1166
+ @overload
1167
+ def viterbi(
1168
+ prob: np.ndarray,
1169
+ transition: np.ndarray,
1170
+ *,
1171
+ p_init: Optional[np.ndarray] = ...,
1172
+ return_logp: Literal[True],
1173
+ ) -> Tuple[np.ndarray, np.ndarray]:
1174
+ ...
1175
+
1176
+
1177
+ @overload
1178
+ def viterbi(
1179
+ prob: np.ndarray,
1180
+ transition: np.ndarray,
1181
+ *,
1182
+ p_init: Optional[np.ndarray] = ...,
1183
+ return_logp: Literal[False] = ...,
1184
+ ) -> np.ndarray:
1185
+ ...
1186
+
1187
+
1188
+ def viterbi(
1189
+ prob: np.ndarray,
1190
+ transition: np.ndarray,
1191
+ *,
1192
+ p_init: Optional[np.ndarray] = None,
1193
+ return_logp: bool = False,
1194
+ ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
1195
+ """Viterbi decoding from observation likelihoods.
1196
+
1197
+ Given a sequence of observation likelihoods ``prob[s, t]``,
1198
+ indicating the conditional likelihood of seeing the observation
1199
+ at time ``t`` from state ``s``, and a transition matrix
1200
+ ``transition[i, j]`` which encodes the conditional probability of
1201
+ moving from state ``i`` to state ``j``, the Viterbi algorithm [#]_ computes
1202
+ the most likely sequence of states from the observations.
1203
+
1204
+ .. [#] Viterbi, Andrew. "Error bounds for convolutional codes and an
1205
+ asymptotically optimum decoding algorithm."
1206
+ IEEE transactions on Information Theory 13.2 (1967): 260-269.
1207
+
1208
+ Parameters
1209
+ ----------
1210
+ prob : np.ndarray [shape=(..., n_states, n_steps), non-negative]
1211
+ ``prob[..., s, t]`` is the probability of observation at time ``t``
1212
+ being generated by state ``s``.
1213
+ transition : np.ndarray [shape=(n_states, n_states), non-negative]
1214
+ ``transition[i, j]`` is the probability of a transition from i->j.
1215
+ Each row must sum to 1.
1216
+ p_init : np.ndarray [shape=(n_states,)]
1217
+ Optional: initial state distribution.
1218
+ If not provided, a uniform distribution is assumed.
1219
+ return_logp : bool
1220
+ If ``True``, return the log-likelihood of the state sequence.
1221
+
1222
+ Returns
1223
+ -------
1224
+ Either ``states`` or ``(states, logp)``:
1225
+ states : np.ndarray [shape=(..., n_steps,)]
1226
+ The most likely state sequence.
1227
+ If ``prob`` contains multiple channels of input, then each channel is
1228
+ decoded independently.
1229
+ logp : scalar [float] or np.ndarray
1230
+ If ``return_logp=True``, the log probability of ``states`` given
1231
+ the observations.
1232
+
1233
+ See Also
1234
+ --------
1235
+ viterbi_discriminative : Viterbi decoding from state likelihoods
1236
+
1237
+ Examples
1238
+ --------
1239
+ Example from https://en.wikipedia.org/wiki/Viterbi_algorithm#Example
1240
+
1241
+ In this example, we have two states ``healthy`` and ``fever``, with
1242
+ initial probabilities 60% and 40%.
1243
+
1244
+ We have three observation possibilities: ``normal``, ``cold``, and
1245
+ ``dizzy``, whose probabilities given each state are:
1246
+
1247
+ ``healthy => {normal: 50%, cold: 40%, dizzy: 10%}`` and
1248
+ ``fever => {normal: 10%, cold: 30%, dizzy: 60%}``
1249
+
1250
+ Finally, we have transition probabilities:
1251
+
1252
+ ``healthy => healthy (70%)`` and
1253
+ ``fever => fever (60%)``.
1254
+
1255
+ Over three days, we observe the sequence ``[normal, cold, dizzy]``,
1256
+ and wish to know the maximum likelihood assignment of states for the
1257
+ corresponding days, which we compute with the Viterbi algorithm below.
1258
+
1259
+ >>> p_init = np.array([0.6, 0.4])
1260
+ >>> p_emit = np.array([[0.5, 0.4, 0.1],
1261
+ ... [0.1, 0.3, 0.6]])
1262
+ >>> p_trans = np.array([[0.7, 0.3], [0.4, 0.6]])
1263
+ >>> path, logp = librosa.sequence.viterbi(p_emit, p_trans, p_init=p_init,
1264
+ ... return_logp=True)
1265
+ >>> print(logp, path)
1266
+ -4.19173690823075 [0 0 1]
1267
+ """
1268
+
1269
+ n_states, n_steps = prob.shape[-2:]
1270
+
1271
+ if transition.shape != (n_states, n_states):
1272
+ raise ParameterError(
1273
+ f"transition.shape={transition.shape}, must be "
1274
+ f"(n_states, n_states)={n_states, n_states}"
1275
+ )
1276
+
1277
+ if np.any(transition < 0) or not np.allclose(transition.sum(axis=1), 1):
1278
+ raise ParameterError(
1279
+ "Invalid transition matrix: must be non-negative "
1280
+ "and sum to 1 on each row."
1281
+ )
1282
+
1283
+ if np.any(prob < 0) or np.any(prob > 1):
1284
+ raise ParameterError("Invalid probability values: must be between 0 and 1.")
1285
+
1286
+ # Compute log-likelihoods while avoiding log-underflow
1287
+ epsilon = tiny(prob)
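+ # (tiny() yields the smallest positive normal value for the dtype of
+ # ``prob``, so np.log(0 + epsilon) below stays finite.)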
1288
+
1289
+ if p_init is None:
1290
+ p_init = np.empty(n_states)
1291
+ p_init.fill(1.0 / n_states)
1292
+ elif (
1293
+ np.any(p_init < 0)
1294
+ or not np.allclose(p_init.sum(), 1)
1295
+ or p_init.shape != (n_states,)
1296
+ ):
1297
+ raise ParameterError(f"Invalid initial state distribution: p_init={p_init}")
1298
+
1299
+ log_trans = np.log(transition + epsilon)
1300
+ log_prob = np.log(prob + epsilon)
1301
+ log_p_init = np.log(p_init + epsilon)
1302
+
1303
+ def _helper(lp):
1304
+ # Transpose input
1305
+ _state, logp = _viterbi(lp.T, log_trans, log_p_init)
1306
+ # Transpose outputs for return
1307
+ return _state.T, logp
1308
+
1309
+ states: np.ndarray
1310
+ logp: np.ndarray
1311
+
1312
+ if log_prob.ndim == 2:
1313
+ states, logp = _helper(log_prob)
1314
+ else:
1315
+ # Vectorize the helper
1316
+ __viterbi = np.vectorize(
1317
+ _helper, otypes=[np.uint16, np.float64], signature="(s,t)->(t),(1)"
1318
+ )
1319
+
1320
+ states, logp = __viterbi(log_prob)
1321
+
1322
+ # Flatten out the trailing dimension introduced by vectorization
1323
+ logp = logp[..., 0]
1324
+
1325
+ if return_logp:
1326
+ return states, logp
1327
+
1328
+ return states
1329
+
1330
+
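+ # Multichannel sketch (illustrative, reusing the names from the docstring
+ # example above): stacking along a new leading axis decodes each channel
+ # independently.
+ #
+ #   p_stack = np.stack([p_emit, p_emit])   # shape (2, n_states, n_steps)
+ #   paths = viterbi(p_stack, p_trans)      # shape (2, n_steps)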
1331
+ @overload
1332
+ def viterbi_discriminative(
1333
+ prob: np.ndarray,
1334
+ transition: np.ndarray,
1335
+ *,
1336
+ p_state: Optional[np.ndarray] = ...,
1337
+ p_init: Optional[np.ndarray] = ...,
1338
+ return_logp: Literal[False] = ...,
1339
+ ) -> np.ndarray:
1340
+ ...
1341
+
1342
+
1343
+ @overload
1344
+ def viterbi_discriminative(
1345
+ prob: np.ndarray,
1346
+ transition: np.ndarray,
1347
+ *,
1348
+ p_state: Optional[np.ndarray] = ...,
1349
+ p_init: Optional[np.ndarray] = ...,
1350
+ return_logp: Literal[True],
1351
+ ) -> Tuple[np.ndarray, np.ndarray]:
1352
+ ...
1353
+
1354
+
1355
+ @overload
1356
+ def viterbi_discriminative(
1357
+ prob: np.ndarray,
1358
+ transition: np.ndarray,
1359
+ *,
1360
+ p_state: Optional[np.ndarray] = ...,
1361
+ p_init: Optional[np.ndarray] = ...,
1362
+ return_logp: bool,
1363
+ ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
1364
+ ...
1365
+
1366
+
1367
+ def viterbi_discriminative(
1368
+ prob: np.ndarray,
1369
+ transition: np.ndarray,
1370
+ *,
1371
+ p_state: Optional[np.ndarray] = None,
1372
+ p_init: Optional[np.ndarray] = None,
1373
+ return_logp: bool = False,
1374
+ ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
1375
+ """Viterbi decoding from discriminative state predictions.
1376
+
1377
+ Given a sequence of conditional state predictions ``prob[s, t]``,
1378
+ indicating the conditional likelihood of state ``s`` given the
1379
+ observation at time ``t``, and a transition matrix ``transition[i, j]``
1380
+ which encodes the conditional probability of moving from state ``i``
1381
+ to state ``j``, the Viterbi algorithm computes the most likely sequence
1382
+ of states from the observations.
1383
+
1384
+ This implementation uses the standard Viterbi decoding algorithm
1385
+ for observation likelihood sequences, under the assumption that
1386
+ ``P[Obs(t) | State(t) = s]`` is proportional to
1387
+ ``P[State(t) = s | Obs(t)] / P[State(t) = s]``, where the denominator
1388
+ is the marginal probability of state ``s`` occurring as given by ``p_state``.
1389
+
1390
+ Note that because the constant factor ``P[Obs(t)]`` is dropped from this
1391
+ calculation, the resulting probabilities (or log-probabilities) are not
1392
+ normalized. If using the `return_logp=True` option (see below),
1393
+ be aware that the "probabilities" may not sum to (and may exceed) 1.
1394
+
1395
+ Parameters
1396
+ ----------
1397
+ prob : np.ndarray [shape=(..., n_states, n_steps), non-negative]
1398
+ ``prob[s, t]`` is the probability of state ``s`` conditional on
1399
+ the observation at time ``t``.
1400
+ Must be non-negative and sum to 1 along each column.
1401
+ transition : np.ndarray [shape=(n_states, n_states), non-negative]
1402
+ ``transition[i, j]`` is the probability of a transition from i->j.
1403
+ Each row must sum to 1.
1404
+ p_state : np.ndarray [shape=(n_states,)]
1405
+ Optional: marginal probability distribution over states,
1406
+ must be non-negative and sum to 1.
1407
+ If not provided, a uniform distribution is assumed.
1408
+ p_init : np.ndarray [shape=(n_states,)]
1409
+ Optional: initial state distribution.
1410
+ If not provided, it is assumed to be uniform.
1411
+ return_logp : bool
1412
+ If ``True``, return the log-likelihood of the state sequence.
1413
+
1414
+ Returns
1415
+ -------
1416
+ Either ``states`` or ``(states, logp)``:
1417
+ states : np.ndarray [shape=(..., n_steps,)]
1418
+ The most likely state sequence.
1419
+ If ``prob`` contains multiple input channels,
1420
+ then each channel is decoded independently.
1421
+ logp : scalar [float] or np.ndarray
1422
+ If ``return_logp=True``, the (unnormalized) log probability
1423
+ of ``states`` given the observations.
1424
+
1425
+ See Also
1426
+ --------
1427
+ viterbi :
1428
+ Viterbi decoding from observation likelihoods
1429
+ viterbi_binary :
1430
+ Viterbi decoding for multi-label, conditional state likelihoods
1431
+
1432
+ Examples
1433
+ --------
1434
+ This example constructs a simple, template-based discriminative chord estimator,
1435
+ using CENS chroma as input features.
1436
+
1437
+ .. note:: this chord model is not accurate enough to use in practice. It is only
1438
+ intended to demonstrate how to use discriminative Viterbi decoding.
1439
+
1440
+ >>> # Create templates for major, minor, and no-chord qualities
1441
+ >>> maj_template = np.array([1,0,0, 0,1,0, 0,1,0, 0,0,0])
1442
+ >>> min_template = np.array([1,0,0, 1,0,0, 0,1,0, 0,0,0])
1443
+ >>> N_template = np.array([1,1,1, 1,1,1, 1,1,1, 1,1,1.]) / 4.
1444
+ >>> # Generate the weighting matrix that maps chroma to labels
1445
+ >>> weights = np.zeros((25, 12), dtype=float)
1446
+ >>> labels = ['C:maj', 'C#:maj', 'D:maj', 'D#:maj', 'E:maj', 'F:maj',
1447
+ ... 'F#:maj', 'G:maj', 'G#:maj', 'A:maj', 'A#:maj', 'B:maj',
1448
+ ... 'C:min', 'C#:min', 'D:min', 'D#:min', 'E:min', 'F:min',
1449
+ ... 'F#:min', 'G:min', 'G#:min', 'A:min', 'A#:min', 'B:min',
1450
+ ... 'N']
1451
+ >>> for c in range(12):
1452
+ ... weights[c, :] = np.roll(maj_template, c) # c:maj
1453
+ ... weights[c + 12, :] = np.roll(min_template, c) # c:min
1454
+ >>> weights[-1] = N_template # the last row is the no-chord class
1455
+ >>> # Make a self-loop transition matrix over 25 states
1456
+ >>> trans = librosa.sequence.transition_loop(25, 0.9)
1457
+
1458
+ >>> # Load in audio and make features
1459
+ >>> y, sr = librosa.load(librosa.ex('nutcracker'), duration=15)
1460
+ >>> # Suppress percussive elements
1461
+ >>> y = librosa.effects.harmonic(y, margin=4)
1462
+ >>> chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
1463
+ >>> # Map chroma (observations) to class (state) likelihoods
1464
+ >>> probs = np.exp(weights.dot(chroma)) # P[class | chroma] ~= exp(template' chroma)
1465
+ >>> probs /= probs.sum(axis=0, keepdims=True) # probabilities must sum to 1 in each column
1466
+ >>> # Compute independent frame-wise estimates
1467
+ >>> chords_ind = np.argmax(probs, axis=0)
1468
+ >>> # And viterbi estimates
1469
+ >>> chords_vit = librosa.sequence.viterbi_discriminative(probs, trans)
1470
+
1471
+ >>> # Plot the features and prediction map
1472
+ >>> import matplotlib.pyplot as plt
1473
+ >>> fig, ax = plt.subplots(nrows=2)
1474
+ >>> librosa.display.specshow(chroma, x_axis='time', y_axis='chroma', ax=ax[0])
1475
+ >>> librosa.display.specshow(weights, x_axis='chroma', ax=ax[1])
1476
+ >>> ax[1].set(yticks=np.arange(25) + 0.5, yticklabels=labels, ylabel='Chord')
1477
+
1478
+ >>> # And plot the results
1479
+ >>> fig, ax = plt.subplots()
1480
+ >>> librosa.display.specshow(probs, x_axis='time', cmap='gray', ax=ax)
1481
+ >>> times = librosa.times_like(chords_vit)
1482
+ >>> ax.scatter(times, chords_ind + 0.25, color='lime', alpha=0.5, marker='+',
1483
+ ... s=15, label='Independent')
1484
+ >>> ax.scatter(times, chords_vit - 0.25, color='deeppink', alpha=0.5, marker='o',
1485
+ ... s=15, label='Viterbi')
1486
+ >>> ax.set(yticks=np.unique(chords_vit),
1487
+ ... yticklabels=[labels[i] for i in np.unique(chords_vit)])
1488
+ >>> ax.legend()
1489
+ """
1490
+
1491
+ n_states, n_steps = prob.shape[-2:]
1492
+
1493
+ if transition.shape != (n_states, n_states):
1494
+ raise ParameterError(
1495
+ f"transition.shape={transition.shape}, must be "
1496
+ f"(n_states, n_states)={n_states, n_states}"
1497
+ )
1498
+
1499
+ if np.any(transition < 0) or not np.allclose(transition.sum(axis=1), 1):
1500
+ raise ParameterError(
1501
+ "Invalid transition matrix: must be non-negative "
1502
+ "and sum to 1 on each row."
1503
+ )
1504
+
1505
+ if np.any(prob < 0) or not np.allclose(prob.sum(axis=-2), 1):
1506
+ raise ParameterError(
1507
+ "Invalid probability values: each column must "
1508
+ "sum to 1 and be non-negative"
1509
+ )
1510
+
1511
+ # Compute log-likelihoods while avoiding log-underflow
1512
+ epsilon = tiny(prob)
1513
+
1514
+ # Compute marginal log probabilities while avoiding underflow
1515
+ if p_state is None:
1516
+ p_state = np.empty(n_states)
1517
+ p_state.fill(1.0 / n_states)
1518
+ elif p_state.shape != (n_states,):
1519
+ raise ParameterError(
1520
+ "Marginal distribution p_state must have shape (n_states,). "
1521
+ f"Got p_state.shape={p_state.shape}"
1522
+ )
1523
+ elif np.any(p_state < 0) or not np.allclose(p_state.sum(axis=-1), 1):
1524
+ raise ParameterError(f"Invalid marginal state distribution: p_state={p_state}")
1525
+
1526
+ if p_init is None:
1527
+ p_init = np.empty(n_states)
1528
+ p_init.fill(1.0 / n_states)
1529
+ elif (
1530
+ np.any(p_init < 0)
1531
+ or not np.allclose(p_init.sum(), 1)
1532
+ or p_init.shape != (n_states,)
1533
+ ):
1534
+ raise ParameterError(f"Invalid initial state distribution: p_init={p_init}")
1535
+
1536
+ # By Bayes' rule, P[X | Y] * P[Y] = P[Y | X] * P[X]
1537
+ # P[X] is constant for the sake of maximum likelihood inference
1538
+ # and P[Y] is given by the marginal distribution p_state.
1539
+ #
1540
+ # So we have P[X | y] \propto P[Y | x] / P[Y]
1541
+ # if X = observation and Y = states, this can be done in log space as
1542
+ # log P[X | y] \propto \log P[Y | x] - \log P[Y]
1543
+ log_p_init = np.log(p_init + epsilon)
1544
+ log_trans = np.log(transition + epsilon)
1545
+ log_marginal = np.log(p_state + epsilon)
1546
+
1547
+ # reshape to broadcast against prob
1548
+ log_marginal = expand_to(log_marginal, ndim=prob.ndim, axes=-2)
1549
+
1550
+ log_prob = np.log(prob + epsilon) - log_marginal
1551
+
1552
+ def _helper(lp):
1553
+ # Transpose input
1554
+ _state, logp = _viterbi(lp.T, log_trans, log_p_init)
1555
+ # Transpose outputs for return
1556
+ return _state.T, logp
1557
+
1558
+ states: np.ndarray
1559
+ logp: np.ndarray
1560
+ if log_prob.ndim == 2:
1561
+ states, logp = _helper(log_prob)
1562
+ else:
1563
+ # Vectorize the helper
1564
+ __viterbi = np.vectorize(
1565
+ _helper, otypes=[np.uint16, np.float64], signature="(s,t)->(t),(1)"
1566
+ )
1567
+
1568
+ states, logp = __viterbi(log_prob)
1569
+
1570
+ # Flatten out the trailing dimension
1571
+ logp = logp[..., 0]
1572
+
1573
+ if return_logp:
1574
+ return states, logp
1575
+
1576
+ return states
1577
+
1578
+
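+ # Minimal sketch with hypothetical values: two states over three frames,
+ # each column of ``prob`` summing to 1 as required:
+ #
+ #   prob = np.array([[0.8, 0.6, 0.2],
+ #                    [0.2, 0.4, 0.8]])
+ #   trans = transition_loop(2, 0.9)
+ #   states = viterbi_discriminative(prob, trans)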
1579
+ @overload
1580
+ def viterbi_binary(
1581
+ prob: np.ndarray,
1582
+ transition: np.ndarray,
1583
+ *,
1584
+ p_state: Optional[np.ndarray] = ...,
1585
+ p_init: Optional[np.ndarray] = ...,
1586
+ return_logp: Literal[False] = ...,
1587
+ ) -> np.ndarray:
1588
+ ...
1589
+
1590
+
1591
+ @overload
1592
+ def viterbi_binary(
1593
+ prob: np.ndarray,
1594
+ transition: np.ndarray,
1595
+ *,
1596
+ p_state: Optional[np.ndarray] = ...,
1597
+ p_init: Optional[np.ndarray] = ...,
1598
+ return_logp: Literal[True],
1599
+ ) -> Tuple[np.ndarray, np.ndarray]:
1600
+ ...
1601
+
1602
+
1603
+ @overload
1604
+ def viterbi_binary(
1605
+ prob: np.ndarray,
1606
+ transition: np.ndarray,
1607
+ *,
1608
+ p_state: Optional[np.ndarray] = ...,
1609
+ p_init: Optional[np.ndarray] = ...,
1610
+ return_logp: bool = ...,
1611
+ ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
1612
+ ...
1613
+
1614
+
1615
+ def viterbi_binary(
1616
+ prob: np.ndarray,
1617
+ transition: np.ndarray,
1618
+ *,
1619
+ p_state: Optional[np.ndarray] = None,
1620
+ p_init: Optional[np.ndarray] = None,
1621
+ return_logp: bool = False,
1622
+ ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
1623
+ """Viterbi decoding from binary (multi-label), discriminative state predictions.
1624
+
1625
+ Given a sequence of conditional state predictions ``prob[s, t]``,
1626
+ indicating the conditional likelihood of state ``s`` being active
1627
+ conditional on observation at time ``t``, and a 2x2 transition matrix
1628
+ ``transition`` which encodes the conditional probability of moving from
1629
+ state ``s`` to state ``~s`` (not-``s``), the Viterbi algorithm computes the
1630
+ most likely sequence of states from the observations.
1631
+
1632
+ This function differs from `viterbi_discriminative` in that it does not assume the
1633
+ states to be mutually exclusive. `viterbi_binary` is implemented by
1634
+ transforming the multi-label decoding problem to a collection
1635
+ of binary Viterbi problems (one for each *state* or label).
1636
+
1637
+ The output is a binary matrix ``states[s, t]`` indicating whether each
1638
+ state ``s`` is active at time ``t``.
1639
+
1640
+ Like `viterbi_discriminative`, the probabilities of the optimal state sequences
1641
+ are not normalized here. If using the `return_logp=True` option (see below),
1642
+ be aware that the "probabilities" may not sum to (and may exceed) 1.
1643
+
1644
+ Parameters
1645
+ ----------
1646
+ prob : np.ndarray [shape=(..., n_steps,) or (..., n_states, n_steps)], non-negative
1647
+ ``prob[s, t]`` is the probability of state ``s`` being active
1648
+ conditional on the observation at time ``t``.
1649
+ Must be non-negative and no greater than 1.
1650
+
1651
+ If ``prob`` is 1-dimensional, it is expanded to shape ``(1, n_steps)``.
1652
+
1653
+ If ``prob`` contains multiple input channels, then each channel is decoded independently.
1654
+
1655
+ transition : np.ndarray [shape=(2, 2) or (n_states, 2, 2)], non-negative
1656
+ If 2-dimensional, the same transition matrix is applied to each sub-problem.
1657
+ ``transition[0, i]`` is the probability of the state going from inactive to ``i``,
1658
+ ``transition[1, i]`` is the probability of the state going from active to ``i``.
1659
+ Each row must sum to 1.
1660
+
1661
+ If 3-dimensional, ``transition[s]`` is interpreted as the 2x2 transition matrix
1662
+ for state label ``s``.
1663
+
1664
+ p_state : np.ndarray [shape=(n_states,)]
1665
+ Optional: marginal probability for each state (between [0,1]).
1666
+ If not provided, a uniform distribution (0.5 for each state)
1667
+ is assumed.
1668
+
1669
+ p_init : np.ndarray [shape=(n_states,)]
1670
+ Optional: initial state distribution.
1671
+ If not provided, it is assumed to be uniform.
1672
+
1673
+ return_logp : bool
1674
+ If ``True``, return the (unnormalized) log-likelihood of the state sequences.
1675
+
1676
+ Returns
1677
+ -------
1678
+ Either ``states`` or ``(states, logp)``:
1679
+ states : np.ndarray [shape=(..., n_states, n_steps)]
1680
+ The most likely state sequence.
1681
+ logp : np.ndarray [shape=(..., n_states,)]
1682
+ If ``return_logp=True``, the (unnormalized) log probability of each
1683
+ state activation sequence ``states``
1684
+
1685
+ See Also
1686
+ --------
1687
+ viterbi :
1688
+ Viterbi decoding from observation likelihoods
1689
+ viterbi_discriminative :
1690
+ Viterbi decoding for discriminative (mutually exclusive) state predictions
1691
+
1692
+ Examples
1693
+ --------
1694
+ In this example, we have a sequence of binary state likelihoods that we want to de-noise
1695
+ under the assumption that state changes are relatively uncommon. Positive predictions
1696
+ should only be retained if they persist for multiple steps, and any transient predictions
1697
+ should be considered as errors. This use case arises frequently in problems such as
1698
+ instrument recognition, where state activations tend to be stable over time, but subject
1699
+ to abrupt changes (e.g., when an instrument joins the mix).
1700
+
1701
+ We assume that the 0 state has a self-transition probability of 90%, and the 1 state
1702
+ has a self-transition probability of 70%. We assume the marginal and initial
1703
+ probability of either state is 50%.
1704
+
1705
+ >>> trans = np.array([[0.9, 0.1], [0.3, 0.7]])
1706
+ >>> prob = np.array([0.1, 0.7, 0.4, 0.3, 0.8, 0.9, 0.8, 0.2, 0.6, 0.3])
1707
+ >>> librosa.sequence.viterbi_binary(prob, trans, p_state=0.5, p_init=0.5)
1708
+ array([[0, 0, 0, 0, 1, 1, 1, 0, 0, 0]])
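+
+ The multi-label case decodes each label independently. As a small
+ sketch (the second label here is a hypothetical complement of the
+ first), the output gains one row per label:
+
+ >>> prob2 = np.vstack([prob, 1 - prob])
+ >>> states, logp = librosa.sequence.viterbi_binary(prob2, trans,
+ ... return_logp=True)
+ >>> states.shape
+ (2, 10)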
1709
+ """
1710
+
1711
+ prob = np.atleast_2d(prob)
1712
+
1713
+ n_states, n_steps = prob.shape[-2:]
1714
+
1715
+ if transition.shape == (2, 2):
1716
+ transition = np.tile(transition, (n_states, 1, 1))
1717
+ elif transition.shape != (n_states, 2, 2):
1718
+ raise ParameterError(
1719
+ f"transition.shape={transition.shape}, must be (2, 2) or "
1720
+ f"(n_states, 2, 2)={n_states}"
1721
+ )
1722
+
1723
+ if np.any(transition < 0) or not np.allclose(transition.sum(axis=-1), 1):
1724
+ raise ParameterError(
1725
+ "Invalid transition matrix: must be non-negative "
1726
+ "and sum to 1 on each row."
1727
+ )
1728
+
1729
+ if np.any(prob < 0) or np.any(prob > 1):
1730
+ raise ParameterError("Invalid probability values: prob must be between [0, 1]")
1731
+
1732
+ if p_state is None:
1733
+ p_state = np.empty(n_states)
1734
+ p_state.fill(0.5)
1735
+ else:
1736
+ p_state = np.atleast_1d(p_state)
1737
+
1738
+ assert p_state is not None
1739
+
1740
+ if p_state.shape != (n_states,) or np.any(p_state < 0) or np.any(p_state > 1):
1741
+ raise ParameterError(f"Invalid marginal state distributions: p_state={p_state}")
1742
+
1743
+ if p_init is None:
1744
+ p_init = np.empty(n_states)
1745
+ p_init.fill(0.5)
1746
+ else:
1747
+ p_init = np.atleast_1d(p_init)
1748
+
1749
+ assert p_init is not None
1750
+
1751
+ if p_init.shape != (n_states,) or np.any(p_init < 0) or np.any(p_init > 1):
1752
+ raise ParameterError(f"Invalid initial state distributions: p_init={p_init}")
1753
+
1754
+ shape_prefix = list(prob.shape[:-2])
1755
+ states = np.empty(shape_prefix + [n_states, n_steps], dtype=np.uint16)
1756
+ logp = np.empty(shape_prefix + [n_states])
1757
+
1758
+ prob_binary = np.empty(shape_prefix + [2, n_steps])
1759
+ p_state_binary = np.empty(2)
1760
+ p_init_binary = np.empty(2)
1761
+
1762
+ for state in range(n_states):
1763
+ prob_binary[..., 0, :] = 1 - prob[..., state, :]
1764
+ prob_binary[..., 1, :] = prob[..., state, :]
1765
+
1766
+ p_state_binary[0] = 1 - p_state[state]
1767
+ p_state_binary[1] = p_state[state]
1768
+
1769
+ p_init_binary[0] = 1 - p_init[state]
1770
+ p_init_binary[1] = p_init[state]
1771
+
1772
+ states[..., state, :], logp[..., state] = viterbi_discriminative(
1773
+ prob_binary,
1774
+ transition[state],
1775
+ p_state=p_state_binary,
1776
+ p_init=p_init_binary,
1777
+ return_logp=True,
1778
+ )
1779
+
1780
+ if return_logp:
1781
+ return states, logp
1782
+
1783
+ return states
1784
+
1785
+
1786
+ def transition_uniform(n_states: int) -> np.ndarray:
1787
+ """Construct a uniform transition matrix over ``n_states``.
1788
+
1789
+ Parameters
1790
+ ----------
1791
+ n_states : int > 0
1792
+ The number of states
1793
+
1794
+ Returns
1795
+ -------
1796
+ transition : np.ndarray [shape=(n_states, n_states)]
1797
+ ``transition[i, j] = 1./n_states``
1798
+
1799
+ Examples
1800
+ --------
1801
+ >>> librosa.sequence.transition_uniform(3)
1802
+ array([[0.333, 0.333, 0.333],
1803
+ [0.333, 0.333, 0.333],
1804
+ [0.333, 0.333, 0.333]])
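+
+ Because every transition is equally likely, Viterbi decoding with a
+ uniform matrix reduces to frame-wise maximum-likelihood decoding.
+ A minimal sketch (``prob`` is a hypothetical likelihood matrix):
+
+ >>> prob = np.array([[0.8, 0.2, 0.6], [0.2, 0.8, 0.4]])
+ >>> librosa.sequence.viterbi(prob, librosa.sequence.transition_uniform(2))
+ array([0, 1, 0])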
1805
+ """
1806
+
1807
+ if not is_positive_int(n_states):
1808
+ raise ParameterError(f"n_states={n_states} must be a positive integer")
1809
+
1810
+ transition = np.empty((n_states, n_states), dtype=np.float64)
1811
+ transition.fill(1.0 / n_states)
1812
+ return transition
1813
+
1814
+
1815
+ def transition_loop(n_states: int, prob: Union[float, Iterable[float]]) -> np.ndarray:
1816
+ """Construct a self-loop transition matrix over ``n_states``.
1817
+
1818
+ The transition matrix will have the following properties:
1819
+
1820
+ - ``transition[i, i] = p`` for all ``i``
1821
+ - ``transition[i, j] = (1 - p) / (n_states - 1)`` for all ``j != i``
1822
+
1823
+ This type of transition matrix is appropriate when states tend to be
1824
+ locally stable, and there is no additional structure between different
1825
+ states. This is primarily useful for de-noising frame-wise predictions.
1826
+
1827
+ Parameters
1828
+ ----------
1829
+ n_states : int > 1
1830
+ The number of states
1831
+
1832
+ prob : float in [0, 1] or iterable, length=n_states
1833
+ If a scalar, this is the probability of a self-transition.
1834
+
1835
+ If a vector of length ``n_states``, ``p[i]`` is the probability of self-transition in state ``i``
1836
+
1837
+ Returns
1838
+ -------
1839
+ transition : np.ndarray [shape=(n_states, n_states)]
1840
+ The transition matrix
1841
+
1842
+ Examples
1843
+ --------
1844
+ >>> librosa.sequence.transition_loop(3, 0.5)
1845
+ array([[0.5 , 0.25, 0.25],
1846
+ [0.25, 0.5 , 0.25],
1847
+ [0.25, 0.25, 0.5 ]])
1848
+
1849
+ >>> librosa.sequence.transition_loop(3, [0.8, 0.5, 0.25])
1850
+ array([[0.8 , 0.1 , 0.1 ],
1851
+ [0.25 , 0.5 , 0.25 ],
1852
+ [0.375, 0.375, 0.25 ]])
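+
+ A sketch of the de-noising use case: with sticky self-loops, Viterbi
+ decoding suppresses the transient detections that a frame-wise argmax
+ (here ``[0, 1, 0, 1]``) would keep (``prob`` is a hypothetical
+ likelihood matrix):
+
+ >>> prob = np.array([[0.9, 0.4, 0.8, 0.2], [0.1, 0.6, 0.2, 0.8]])
+ >>> librosa.sequence.viterbi(prob, librosa.sequence.transition_loop(2, 0.9))
+ array([0, 0, 0, 0])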
1853
+ """
1854
+
1855
+ if not (is_positive_int(n_states) and (n_states > 1)):
1856
+ raise ParameterError(f"n_states={n_states} must be a positive integer > 1")
1857
+
1858
+ transition = np.empty((n_states, n_states), dtype=np.float64)
1859
+
1860
+ # if it's a float, make it a vector
1861
+ prob = np.asarray(prob, dtype=np.float64)
1862
+
1863
+ if prob.ndim == 0:
1864
+ prob = np.tile(prob, n_states)
1865
+
1866
+ if prob.shape != (n_states,):
1867
+ raise ParameterError(
1868
+ f"prob={prob} must have length equal to n_states={n_states}"
1869
+ )
1870
+
1871
+ if np.any(prob < 0) or np.any(prob > 1):
1872
+ raise ParameterError(f"prob={prob} must have values in the range [0, 1]")
1873
+
1874
+ for i, prob_i in enumerate(prob):
1875
+ transition[i] = (1.0 - prob_i) / (n_states - 1)
1876
+ transition[i, i] = prob_i
1877
+
1878
+ return transition
1879
+
1880
+
1881
+ def transition_cycle(n_states: int, prob: Union[float, Iterable[float]]) -> np.ndarray:
1882
+ """Construct a cyclic transition matrix over ``n_states``.
1883
+
1884
+ The transition matrix will have the following properties:
1885
+
1886
+ - ``transition[i, i] = p``
1887
+ - ``transition[i, i + 1] = (1 - p)``
1888
+
1889
+ This type of transition matrix is appropriate for state spaces
1890
+ with cyclical structure, such as metrical position within a bar.
1891
+ For example, a song in 4/4 time has state transitions of the form
1892
+
1893
+ 1->{1, 2}, 2->{2, 3}, 3->{3, 4}, 4->{4, 1}.
1894
+
1895
+ Parameters
1896
+ ----------
1897
+ n_states : int > 1
1898
+ The number of states
1899
+
1900
+ prob : float in [0, 1] or iterable, length=n_states
1901
+ If a scalar, this is the probability of a self-transition.
1902
+
1903
+ If a vector of length ``n_states``, ``p[i]`` is the probability of
1904
+ self-transition in state ``i``
1905
+
1906
+ Returns
1907
+ -------
1908
+ transition : np.ndarray [shape=(n_states, n_states)]
1909
+ The transition matrix
1910
+
1911
+ Examples
1912
+ --------
1913
+ >>> librosa.sequence.transition_cycle(4, 0.9)
1914
+ array([[0.9, 0.1, 0. , 0. ],
1915
+ [0. , 0.9, 0.1, 0. ],
1916
+ [0. , 0. , 0.9, 0.1],
1917
+ [0.1, 0. , 0. , 0.9]])
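+
+ Per-state self-loop probabilities are also supported; for example,
+ with arbitrarily chosen values:
+
+ >>> librosa.sequence.transition_cycle(3, [0.6, 0.7, 0.8])
+ array([[0.6, 0.4, 0. ],
+ [0. , 0.7, 0.3],
+ [0.2, 0. , 0.8]])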
1918
+ """
1919
+
1920
+ if not (is_positive_int(n_states) and n_states > 1):
1921
+ raise ParameterError(f"n_states={n_states} must be a positive integer > 1")
1922
+
1923
+ transition = np.zeros((n_states, n_states), dtype=np.float64)
1924
+
1925
+ # if it's a float, make it a vector
1926
+ prob = np.asarray(prob, dtype=np.float64)
1927
+
1928
+ if prob.ndim == 0:
1929
+ prob = np.tile(prob, n_states)
1930
+
1931
+ if prob.shape != (n_states,):
1932
+ raise ParameterError(
1933
+ f"prob={prob} must have length equal to n_states={n_states}"
1934
+ )
1935
+
1936
+ if np.any(prob < 0) or np.any(prob > 1):
1937
+ raise ParameterError(f"prob={prob} must have values in the range [0, 1]")
1938
+
1939
+ for i, prob_i in enumerate(prob):
1940
+ transition[i, np.mod(i + 1, n_states)] = 1.0 - prob_i
1941
+ transition[i, i] = prob_i
1942
+
1943
+ return transition
1944
+
1945
+
1946
+ def transition_local(
1947
+ n_states: int,
1948
+ width: Union[int, Iterable[int]],
1949
+ *,
1950
+ window: _WindowSpec = "triangle",
1951
+ wrap: bool = False,
1952
+ ) -> np.ndarray:
1953
+ """Construct a localized transition matrix.
1954
+
1955
+ The transition matrix will have the following properties:
1956
+
1957
+ - ``transition[i, j] = 0`` if ``|i - j| > width``
1958
+ - ``transition[i, i]`` is maximal
1959
+ - ``transition[i, i - width//2 : i + width//2]`` follows the shape of ``window``
1960
+
1961
+ This type of transition matrix is appropriate for state spaces
1962
+ that discretely approximate continuous variables, such as in fundamental
1963
+ frequency estimation.
1964
+
1965
+ Parameters
1966
+ ----------
1967
+ n_states : int > 1
1968
+ The number of states
1969
+
1970
+ width : int >= 1 or iterable
1971
+ The maximum number of states to treat as "local".
1972
+ If iterable, it should have length equal to ``n_states``,
1973
+ and specify the width independently for each state.
1974
+
1975
+ window : str, callable, or window specification
1976
+ The window function to determine the shape of the "local" distribution.
1977
+
1978
+ Any window specification supported by `filters.get_window` will work here.
1979
+
1980
+ .. note:: Certain windows (e.g., 'hann') are identically 0 at the boundaries,
1981
+ and so effectively have ``width-2`` non-zero values. You may have to expand
1982
+ ``width`` to get the desired behavior.
1983
+
1984
+ wrap : bool
1985
+ If ``True``, then state locality ``|i - j|`` is computed modulo ``n_states``.
1986
+ If ``False`` (default), then locality is absolute.
1987
+
1988
+ See Also
1989
+ --------
1990
+ librosa.filters.get_window
1991
+
1992
+ Returns
1993
+ -------
1994
+ transition : np.ndarray [shape=(n_states, n_states)]
1995
+ The transition matrix
1996
+
1997
+ Examples
1998
+ --------
1999
+ Triangular distributions with and without wrapping
2000
+
2001
+ >>> librosa.sequence.transition_local(5, 3, window='triangle', wrap=False)
2002
+ array([[0.667, 0.333, 0. , 0. , 0. ],
2003
+ [0.25 , 0.5 , 0.25 , 0. , 0. ],
2004
+ [0. , 0.25 , 0.5 , 0.25 , 0. ],
2005
+ [0. , 0. , 0.25 , 0.5 , 0.25 ],
2006
+ [0. , 0. , 0. , 0.333, 0.667]])
2007
+
2008
+ >>> librosa.sequence.transition_local(5, 3, window='triangle', wrap=True)
2009
+ array([[0.5 , 0.25, 0. , 0. , 0.25],
2010
+ [0.25, 0.5 , 0.25, 0. , 0. ],
2011
+ [0. , 0.25, 0.5 , 0.25, 0. ],
2012
+ [0. , 0. , 0.25, 0.5 , 0.25],
2013
+ [0.25, 0. , 0. , 0.25, 0.5 ]])
2014
+
2015
+ Uniform local distributions with variable widths and no wrapping
2016
+
2017
+ >>> librosa.sequence.transition_local(5, [1, 2, 3, 3, 1], window='ones', wrap=False)
2018
+ array([[1. , 0. , 0. , 0. , 0. ],
2019
+ [0.5 , 0.5 , 0. , 0. , 0. ],
2020
+ [0. , 0.333, 0.333, 0.333, 0. ],
2021
+ [0. , 0. , 0.333, 0.333, 0.333],
2022
+ [0. , 0. , 0. , 0. , 1. ]])
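+
+ As noted above, a window that vanishes at its boundaries effectively
+ narrows the neighborhood: with ``width=3``, a 'hann' window degenerates
+ to a pure self-loop (a cautionary sketch):
+
+ >>> librosa.sequence.transition_local(5, 3, window='hann', wrap=False)
+ array([[1., 0., 0., 0., 0.],
+ [0., 1., 0., 0., 0.],
+ [0., 0., 1., 0., 0.],
+ [0., 0., 0., 1., 0.],
+ [0., 0., 0., 0., 1.]])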
2023
+ """
2024
+
2025
+ if not (is_positive_int(n_states) and n_states > 1):
2026
+ raise ParameterError(f"n_states={n_states} must be a positive integer > 1")
2027
+
2028
+ width = np.asarray(width, dtype=int)
2029
+ if width.ndim == 0:
2030
+ width = np.tile(width, n_states)
2031
+
2032
+ if width.shape != (n_states,):
2033
+ raise ParameterError(
2034
+ f"width={width} must have length equal to n_states={n_states}"
2035
+ )
2036
+
2037
+ if np.any(width < 1):
2038
+ raise ParameterError(f"width={width} must be at least 1")
2039
+
2040
+ transition = np.zeros((n_states, n_states), dtype=np.float64)
2041
+
2042
+ # Fill in the widths. This is inefficient, but simple
2043
+ for i, width_i in enumerate(width):
2044
+ trans_row = pad_center(
2045
+ get_window(window, width_i, fftbins=False), size=n_states
2046
+ )
2047
+ trans_row = np.roll(trans_row, n_states // 2 + i + 1)
2048
+
2049
+ if not wrap:
2050
+ # Knock out the off-diagonal-band elements
2051
+ trans_row[min(n_states, i + width_i // 2 + 1) :] = 0
2052
+ trans_row[: max(0, i - width_i // 2)] = 0
2053
+
2054
+ transition[i] = trans_row
2055
+
2056
+ # Row-normalize
2057
+ transition /= transition.sum(axis=1, keepdims=True)
2058
+
2059
+ return transition
utils.py ADDED
@@ -0,0 +1,316 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """Feature manipulation utilities"""
4
+
5
+ import numpy as np
6
+ import scipy.signal
7
+ from numba import jit
8
+
9
+ from .._cache import cache
10
+ from ..util.exceptions import ParameterError
11
+ from typing import Any
12
+
13
+ __all__ = ["delta", "stack_memory"]
14
+
15
+
16
+ @cache(level=40)
17
+ def delta(
18
+ data: np.ndarray,
19
+ *,
20
+ width: int = 9,
21
+ order: int = 1,
22
+ axis: int = -1,
23
+ mode: str = "interp",
24
+ **kwargs: Any,
25
+ ) -> np.ndarray:
26
+ r"""Compute delta features: local estimate of the derivative
27
+ of the input data along the selected axis.
28
+
29
+ Delta features are computed by Savitzky-Golay filtering.
30
+
31
+ Parameters
32
+ ----------
33
+ data : np.ndarray
34
+ the input data matrix (eg, spectrogram)
35
+
36
+ width : int, positive, odd [scalar]
37
+ Number of frames over which to compute the delta features.
38
+ Cannot exceed the length of ``data`` along the specified axis.
39
+
40
+ If ``mode='interp'``, then ``width`` must not exceed ``data.shape[axis]``.
41
+
42
+ order : int > 0 [scalar]
43
+ the order of the difference operator.
44
+ 1 for first derivative, 2 for second, etc.
45
+
46
+ axis : int [scalar]
47
+ the axis along which to compute deltas.
48
+ Default is -1 (columns).
49
+
50
+ mode : str, {'interp', 'nearest', 'mirror', 'constant', 'wrap'}
51
+ Padding mode for estimating differences at the boundaries.
52
+
53
+ **kwargs : additional keyword arguments
54
+ See `scipy.signal.savgol_filter`
55
+
56
+ Returns
57
+ -------
58
+ delta_data : np.ndarray [shape=(..., t)]
59
+ delta matrix of ``data`` at specified order
60
+
61
+ Notes
62
+ -----
63
+ This function caches at level 40.
64
+
65
+ See Also
66
+ --------
67
+ scipy.signal.savgol_filter
68
+
69
+ Examples
70
+ --------
71
+ Compute MFCC deltas, delta-deltas
72
+
73
+ >>> y, sr = librosa.load(librosa.ex('libri1'), duration=5)
74
+ >>> mfcc = librosa.feature.mfcc(y=y, sr=sr)
75
+ >>> mfcc_delta = librosa.feature.delta(mfcc)
76
+ >>> mfcc_delta
77
+ array([[-5.713e+02, -5.697e+02, ..., -1.522e+02, -1.224e+02],
78
+ [ 1.104e+01, 1.330e+01, ..., 2.089e+02, 1.698e+02],
79
+ ...,
80
+ [ 2.829e+00, 1.933e+00, ..., -3.149e+00, 2.294e-01],
81
+ [ 2.890e+00, 2.187e+00, ..., 6.959e+00, -1.039e+00]],
82
+ dtype=float32)
83
+
84
+ >>> mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
85
+ >>> mfcc_delta2
86
+ array([[-1.195, -1.195, ..., -4.328, -4.328],
87
+ [-1.566, -1.566, ..., -9.949, -9.949],
88
+ ...,
89
+ [ 0.707, 0.707, ..., 2.287, 2.287],
90
+ [ 0.655, 0.655, ..., -1.719, -1.719]], dtype=float32)
91
+
92
+ >>> import matplotlib.pyplot as plt
93
+ >>> fig, ax = plt.subplots(nrows=3, sharex=True, sharey=True)
94
+ >>> img1 = librosa.display.specshow(mfcc, ax=ax[0], x_axis='time')
95
+ >>> ax[0].set(title='MFCC')
96
+ >>> ax[0].label_outer()
97
+ >>> img2 = librosa.display.specshow(mfcc_delta, ax=ax[1], x_axis='time')
98
+ >>> ax[1].set(title=r'MFCC-$\Delta$')
99
+ >>> ax[1].label_outer()
100
+ >>> img3 = librosa.display.specshow(mfcc_delta2, ax=ax[2], x_axis='time')
101
+ >>> ax[2].set(title=r'MFCC-$\Delta^2$')
102
+ >>> fig.colorbar(img1, ax=[ax[0]])
103
+ >>> fig.colorbar(img2, ax=[ax[1]])
104
+ >>> fig.colorbar(img3, ax=[ax[2]])
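+
+ As a quick sanity check on synthetic input, the delta of a linear ramp
+ is constant (a minimal sketch):
+
+ >>> librosa.feature.delta(np.arange(10, dtype=float), width=3)
+ array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])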
105
+ """
106
+
107
+ data = np.atleast_1d(data)
108
+
109
+ if mode == "interp" and width > data.shape[axis]:
110
+ raise ParameterError(
111
+ f"when mode='interp', width={width} "
112
+ f"cannot exceed data.shape[axis]={data.shape[axis]}"
113
+ )
114
+
115
+ if width < 3 or np.mod(width, 2) != 1:
116
+ raise ParameterError("width must be an odd integer >= 3")
117
+
118
+ if order <= 0 or not isinstance(order, (int, np.integer)):
119
+ raise ParameterError("order must be a positive integer")
120
+
121
+ kwargs.pop("deriv", None)
122
+ kwargs.setdefault("polyorder", order)
123
+ result: np.ndarray = scipy.signal.savgol_filter(
124
+ data, width, deriv=order, axis=axis, mode=mode, **kwargs
125
+ )
126
+ return result
127
+
128
+
129
+ @cache(level=40)
130
+ def stack_memory(
131
+ data: np.ndarray, *, n_steps: int = 2, delay: int = 1, **kwargs: Any
132
+ ) -> np.ndarray:
133
+ """Short-term history embedding: vertically concatenate a data
134
+ vector or matrix with delayed copies of itself.
135
+
136
+ Each column ``data[:, i]`` is mapped to::
137
+
138
+ data[..., i] -> [data[..., i],
139
+ data[..., i - delay],
140
+ ...
141
+ data[..., i - (n_steps-1)*delay]]
142
+
143
+ For columns ``i < (n_steps - 1) * delay``, the data will be padded.
144
+ By default, the data is padded with zeros, but this behavior can be
145
+ overridden by supplying additional keyword arguments which are passed
146
+ to `np.pad()`.
147
+
148
+ Parameters
149
+ ----------
150
+ data : np.ndarray [shape=(..., d, t)]
151
+ Input data matrix. If ``data`` is a vector (``data.ndim == 1``),
152
+ it will be interpreted as a row matrix and reshaped to ``(1, t)``.
153
+
154
+ n_steps : int > 0 [scalar]
155
+ embedding dimension, the number of steps back in time to stack
156
+
157
+ delay : int != 0 [scalar]
158
+ the number of columns to step.
159
+
160
+ Positive values embed from the past (previous columns).
161
+
162
+ Negative values embed from the future (subsequent columns).
163
+
164
+ **kwargs : additional keyword arguments
165
+ Additional arguments to pass to `numpy.pad`
166
+
167
+ Returns
168
+ -------
169
+ data_history : np.ndarray [shape=(..., m * d, t)]
170
+ data augmented with lagged copies of itself,
171
+ where ``m == n_steps``.
172
+
173
+ Notes
174
+ -----
175
+ This function caches at level 40.
176
+
177
+ Examples
178
+ --------
179
+ Keep two steps (current and previous)
180
+
181
+ >>> data = np.arange(-3, 3)
182
+ >>> librosa.feature.stack_memory(data)
183
+ array([[-3, -2, -1, 0, 1, 2],
184
+ [ 0, -3, -2, -1, 0, 1]])
185
+
186
+ Or three steps
187
+
188
+ >>> librosa.feature.stack_memory(data, n_steps=3)
189
+ array([[-3, -2, -1, 0, 1, 2],
190
+ [ 0, -3, -2, -1, 0, 1],
191
+ [ 0, 0, -3, -2, -1, 0]])
192
+
193
+ Use reflection padding instead of zero-padding
194
+
195
+ >>> librosa.feature.stack_memory(data, n_steps=3, mode='reflect')
196
+ array([[-3, -2, -1, 0, 1, 2],
197
+ [-2, -3, -2, -1, 0, 1],
198
+ [-1, -2, -3, -2, -1, 0]])
199
+
200
+ Or pad with edge-values, and delay by 2
201
+
202
+ >>> librosa.feature.stack_memory(data, n_steps=3, delay=2, mode='edge')
203
+ array([[-3, -2, -1, 0, 1, 2],
204
+ [-3, -3, -3, -2, -1, 0],
205
+ [-3, -3, -3, -3, -3, -2]])
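+
+ A negative ``delay`` embeds from the future instead, so the padding
+ appears at the end (a small illustrative sketch):
+
+ >>> librosa.feature.stack_memory(data, delay=-1)
+ array([[-3, -2, -1, 0, 1, 2],
+ [-2, -1, 0, 1, 2, 0]])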
206
+
207
+ Stack time-lagged beat-synchronous chroma with edge padding
208
+
209
+ >>> y, sr = librosa.load(librosa.ex('sweetwaltz'), duration=10)
210
+ >>> chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
211
+ >>> tempo, beats = librosa.beat.beat_track(y=y, sr=sr, hop_length=512)
212
+ >>> beats = librosa.util.fix_frames(beats, x_min=0)
213
+ >>> chroma_sync = librosa.util.sync(chroma, beats)
214
+ >>> chroma_lag = librosa.feature.stack_memory(chroma_sync, n_steps=3,
215
+ ... mode='edge')
216
+
217
+ Plot the result
218
+
219
+ >>> import matplotlib.pyplot as plt
220
+ >>> fig, ax = plt.subplots()
221
+ >>> beat_times = librosa.frames_to_time(beats, sr=sr, hop_length=512)
222
+ >>> librosa.display.specshow(chroma_lag, y_axis='chroma', x_axis='time',
223
+ ... x_coords=beat_times, ax=ax)
224
+ >>> ax.text(1.0, 1/6, "Lag=0", transform=ax.transAxes, rotation=-90, ha="left", va="center")
225
+ >>> ax.text(1.0, 3/6, "Lag=1", transform=ax.transAxes, rotation=-90, ha="left", va="center")
226
+ >>> ax.text(1.0, 5/6, "Lag=2", transform=ax.transAxes, rotation=-90, ha="left", va="center")
227
+ >>> ax.set(title='Time-lagged chroma', ylabel="")
228
+ """
229
+
230
+ if n_steps < 1:
231
+ raise ParameterError("n_steps must be a positive integer")
232
+
233
+ if delay == 0:
234
+ raise ParameterError("delay must be a non-zero integer")
235
+
236
+ data = np.atleast_2d(data)
237
+ t = data.shape[-1]
238
+
239
+ if t < 1:
240
+ raise ParameterError(
241
+ "Cannot stack memory when input data has "
242
+ f"no columns. Given data.shape={data.shape}"
243
+ )
244
+ kwargs.setdefault("mode", "constant")
245
+
246
+ if kwargs["mode"] == "constant":
247
+ kwargs.setdefault("constant_values", [0])
248
+
249
+ padding = [(0, 0) for _ in range(data.ndim)]
250
+
251
+ # Pad the front (delay > 0) or the end (delay < 0) of the time axis
252
+ if delay > 0:
253
+ padding[-1] = (int((n_steps - 1) * delay), 0)
254
+ else:
255
+ padding[-1] = (0, int((n_steps - 1) * -delay))
256
+
257
+ data = np.pad(data, padding, **kwargs)
258
+
259
+ # Construct the shape of the target array
260
+ shape = list(data.shape)
261
+ shape[-2] = shape[-2] * n_steps
262
+ shape[-1] = t
263
+ shape = tuple(shape)
264
+
265
+ # Construct the output array to match layout and dtype of input
266
+ history = np.empty_like(data, shape=shape)
267
+
268
+ # Populate the output array
269
+ __stack(history, data, n_steps, delay)
270
+
271
+ return history
272
+
273
+
274
+ @jit(nopython=True, cache=False)
275
+ def __stack(history, data, n_steps, delay):
276
+ """Memory-stacking helper function.
277
+
278
+ Parameters
279
+ ----------
280
+ history : output array (at least 2-dimensional)
281
+ data : pre-padded input array (at least 2-dimensional)
282
+ n_steps : int > 0, the number of steps to stack
283
+ delay : int != 0, the amount of delay between steps
284
+
285
+ Returns
286
+ -------
287
+ None
288
+ Output is stored directly in the history array
289
+ """
290
+ # Dimension of each copy of the data
291
+ d = data.shape[-2]
292
+
293
+ # Total number of time-steps to output
294
+ t = history.shape[-1]
295
+
296
+ if delay > 0:
297
+ for step in range(n_steps):
298
+ q = n_steps - 1 - step
299
+ # nth block is original shifted left by n*delay steps
300
+ history[..., step * d : (step + 1) * d, :] = data[
301
+ ..., q * delay : q * delay + t
302
+ ]
303
+ else:
304
+ # Handle the last block separately to avoid -t:0 empty slices
305
+ history[..., -d:, :] = data[..., -t:]
306
+
307
+ for step in range(n_steps - 1):
308
+ # nth block is original shifted right by n*delay steps
309
+ q = n_steps - 1 - step
310
+ history[..., step * d : (step + 1) * d, :] = data[
311
+ ..., -t + q * delay : q * delay
312
+ ]
313
+
314
+
315
+
316
+