@cache(level=20)
def stft(
    y: np.ndarray,
    *,
    n_fft: int = 2048,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    dtype: Optional[DTypeLike] = None,
    pad_mode: _PadModeSTFT = "constant",
    out: Optional[np.ndarray] = None,
) -> np.ndarray:
    """Short-time Fourier transform (STFT).

    The STFT represents a signal in the time-frequency domain by
    computing discrete Fourier transforms (DFT) over short overlapping
    windows.

    This function returns a complex-valued matrix D such that

    - ``np.abs(D[..., f, t])`` is the magnitude of frequency bin ``f``
      at frame ``t``, and

    - ``np.angle(D[..., f, t])`` is the phase of frequency bin ``f``
      at frame ``t``.

    The integers ``t`` and ``f`` can be converted to physical units by means
    of the utility functions `frames_to_samples` and `fft_frequencies`.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)], real-valued
        input signal. Multi-channel is supported.

    n_fft : int > 0 [scalar]
        length of the windowed signal after padding with zeros.
        The number of rows in the STFT matrix ``D`` is ``(1 + n_fft/2)``.
        The default value, ``n_fft=2048`` samples, corresponds to a physical
        duration of 93 milliseconds at a sample rate of 22050 Hz, i.e. the
        default sample rate in librosa. This value is well adapted for music
        signals. However, in speech processing, the recommended value is 512,
        corresponding to 23 milliseconds at a sample rate of 22050 Hz.
        In any case, we recommend setting ``n_fft`` to a power of two for
        optimizing the speed of the fast Fourier transform (FFT) algorithm.

    hop_length : int > 0 [scalar]
        number of audio samples between adjacent STFT columns.

        Smaller values increase the number of columns in ``D`` without
        affecting the frequency resolution of the STFT.

        If unspecified, defaults to ``win_length // 4`` (see below).

    win_length : int <= n_fft [scalar]
        Each frame of audio is windowed by ``window`` of length ``win_length``
        and then padded with zeros to match ``n_fft``.

        Smaller values improve the temporal resolution of the STFT (i.e. the
        ability to discriminate impulses that are closely spaced in time)
        at the expense of frequency resolution (i.e. the ability to discriminate
        pure tones that are closely spaced in frequency). This effect is known
        as the time-frequency localization trade-off and needs to be adjusted
        according to the properties of the input signal ``y``.

        If unspecified, defaults to ``win_length = n_fft``.

    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        Either:

        - a window specification (string, tuple, or number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.windows.hann`
        - a vector or array of length ``n_fft``

        Defaults to a raised cosine window (`'hann'`), which is adequate for
        most applications in audio signal processing.

        .. see also:: `filters.get_window`

    center : boolean
        If ``True``, the signal ``y`` is padded so that frame
        ``D[:, t]`` is centered at ``y[t * hop_length]``.

        If ``False``, then ``D[:, t]`` begins at ``y[t * hop_length]``.

        Defaults to ``True``,  which simplifies the alignment of ``D`` onto a
        time grid by means of `librosa.frames_to_samples`.
        Note, however, that ``center`` must be set to `False` when analyzing
        signals with `librosa.stream`.

        .. see also:: `librosa.stream`

    dtype : np.dtype, optional
        Complex numeric type for ``D``.  Default is inferred to match the
        precision of the input signal.

    pad_mode : string or function
        If ``center=True``, this argument is passed to `np.pad` for padding
        the edges of the signal ``y``. By default (``pad_mode="constant"``),
        ``y`` is padded on both sides with zeros.

        .. note:: Not all padding modes supported by `numpy.pad` are supported here.
            `wrap`, `mean`, `maximum`, `median`, and `minimum` are not supported.

            Other modes that depend at most on input values at the edges of the
            signal (e.g., `constant`, `edge`, `linear_ramp`) are supported.

        If ``center=False``,  this argument is ignored.

        .. see also:: `numpy.pad`

    out : np.ndarray or None
        A pre-allocated, complex-valued array to store the STFT results.
        This must be of compatible shape and dtype for the given input parameters.

        If `out` is larger than necessary for the provided input signal, then only
        a prefix slice of `out` will be used.

        If not provided, a new array is allocated and returned.

    Returns
    -------
    D : np.ndarray [shape=(..., 1 + n_fft/2, n_frames), dtype=dtype]
        Complex-valued matrix of short-term Fourier transform
        coefficients.

        If a pre-allocated `out` array is provided, then `D` will be
        a reference to `out`.

        If `out` is larger than necessary, then `D` will be a sliced
        view: `D = out[..., :n_frames]`.

    Raises
    ------
    ParameterError
        If ``hop_length`` is provided but is not a positive integer.

        If ``center=True`` and ``pad_mode`` is one of the unsupported modes
        (`wrap`, `maximum`, `mean`, `median`, `minimum`).

        If ``center=False`` and ``n_fft`` exceeds the length of ``y``.

        If ``out`` is provided but has the wrong number of dimensions, does
        not match the target shape on all leading axes, has too few frames,
        or is not of complex type.

    Warns
    -----
    UserWarning
        If ``center=True`` and ``n_fft`` exceeds the length of ``y``.

    See Also
    --------
    istft : Inverse STFT
    reassigned_spectrogram : Time-frequency reassigned spectrogram

    Notes
    -----
    This function caches at level 20.

    Examples
    --------
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> S = np.abs(librosa.stft(y))
    >>> S
    array([[5.395e-03, 3.332e-03, ..., 9.862e-07, 1.201e-05],
           [3.244e-03, 2.690e-03, ..., 9.536e-07, 1.201e-05],
           ...,
           [7.523e-05, 3.722e-05, ..., 1.188e-04, 1.031e-03],
           [7.640e-05, 3.944e-05, ..., 5.180e-04, 1.346e-03]],
          dtype=float32)

    Use left-aligned frames, instead of centered frames

    >>> S_left = librosa.stft(y, center=False)

    Use a shorter hop length

    >>> D_short = librosa.stft(y, hop_length=64)

    Display a spectrogram

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots()
    >>> img = librosa.display.specshow(librosa.amplitude_to_db(S,
    ...                                                        ref=np.max),
    ...                                y_axis='log', x_axis='time', ax=ax)
    >>> ax.set_title('Power spectrogram')
    >>> fig.colorbar(img, ax=ax, format="%+2.0f dB")
    """

    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)
    elif not util.is_positive_int(hop_length):
        raise ParameterError(f"hop_length={hop_length} must be a positive integer")

    # Check audio is valid
    util.valid_audio(y, mono=False)

    fft_window = get_window(window, win_length, fftbins=True)

    # Pad the window out to n_fft size
    fft_window = util.pad_center(fft_window, size=n_fft)

    # Reshape so that the window can be broadcast
    fft_window = util.expand_to(fft_window, ndim=1 + y.ndim, axes=-2)

    # Pad the time series so that frames are centered
    if center:
        if pad_mode in ("wrap", "maximum", "mean", "median", "minimum"):
            # Note: padding with a user-provided function "works", but
            # use at your own risk.
            # Since we don't pass-through kwargs here, any arguments
            # to a user-provided pad function should be encapsulated
            # by using functools.partial:
            #
            # >>> my_pad_func = functools.partial(pad_func, foo=x, bar=y)
            # >>> librosa.stft(..., pad_mode=my_pad_func)

            raise ParameterError(
                f"pad_mode='{pad_mode}' is not supported by librosa.stft"
            )

        if n_fft > y.shape[-1]:
            warnings.warn(
                f"n_fft={n_fft} is too large for input signal of length={y.shape[-1]}"
            )

        # Set up the padding array to be empty, and we'll fix the target dimension later
        padding = [(0, 0) for _ in range(y.ndim)]

        # How many frames depend on left padding?
        start_k = int(np.ceil(n_fft // 2 / hop_length))

        # What's the first frame that depends on extra right-padding?
        tail_k = (y.shape[-1] + n_fft // 2 - n_fft) // hop_length + 1

        if tail_k <= start_k:
            # If tail and head overlap, then just copy-pad the signal and carry on
            start = 0
            extra = 0
            padding[-1] = (n_fft // 2, n_fft // 2)
            y = np.pad(y, padding, mode=pad_mode)
        else:
            # If tail and head do not overlap, then we can implement padding on each part separately
            # and avoid a full copy-pad

            # "Middle" of the signal starts here, and does not depend on head padding
            start = start_k * hop_length - n_fft // 2
            padding[-1] = (n_fft // 2, 0)

            # +1 here is to ensure enough samples to fill the window
            # fixes bug #1567
            y_pre = np.pad(
                y[..., : (start_k - 1) * hop_length - n_fft // 2 + n_fft + 1],
                padding,
                mode=pad_mode,
            )
            y_frames_pre = util.frame(y_pre, frame_length=n_fft, hop_length=hop_length)
            # Trim this down to the exact number of frames we should have
            y_frames_pre = y_frames_pre[..., :start_k]

            # How many extra frames do we have from the head?
            extra = y_frames_pre.shape[-1]

            # Determine if we have any frames that will fit inside the tail pad
            if tail_k * hop_length - n_fft // 2 + n_fft <= y.shape[-1] + n_fft // 2:
                padding[-1] = (0, n_fft // 2)
                y_post = np.pad(
                    y[..., (tail_k) * hop_length - n_fft // 2 :], padding, mode=pad_mode
                )
                y_frames_post = util.frame(
                    y_post, frame_length=n_fft, hop_length=hop_length
                )
                # How many extra frames do we have from the tail?
                extra += y_frames_post.shape[-1]
            else:
                # In this event, the first frame that touches tail padding would run off
                # the end of the padded array
                # We'll circumvent this by allocating an empty frame buffer for the tail
                # this keeps the subsequent logic simple
                post_shape = list(y_frames_pre.shape)
                post_shape[-1] = 0
                y_frames_post = np.empty_like(y_frames_pre, shape=post_shape)
    else:
        if n_fft > y.shape[-1]:
            raise ParameterError(
                f"n_fft={n_fft} is too large for uncentered analysis of input signal of length={y.shape[-1]}"
            )

        # "Middle" of the signal starts at sample 0
        start = 0
        # We have no extra frames
        extra = 0

    fft = get_fftlib()

    if dtype is None:
        dtype = util.dtype_r2c(y.dtype)

    # Window the time series.
    y_frames = util.frame(y[..., start:], frame_length=n_fft, hop_length=hop_length)

    # Pre-allocate the STFT matrix
    shape = list(y_frames.shape)

    # This is our frequency dimension
    shape[-2] = 1 + n_fft // 2

    # If there's padding, there will be extra head and tail frames
    shape[-1] += extra

    if out is None:
        stft_matrix = np.zeros(shape, dtype=dtype, order="F")
    elif not (
        # Compare rank and leading axes exactly.  A numeric closeness test on
        # the shape tuples would broadcast, letting an `out` of the wrong rank
        # slip through or fail with an unrelated broadcasting error.
        out.ndim == len(shape)
        and tuple(out.shape[:-1]) == tuple(shape[:-1])
        and out.shape[-1] >= shape[-1]
    ):
        raise ParameterError(
            f"Shape mismatch for provided output array out.shape={out.shape} and target shape={shape}"
        )
    elif not np.iscomplexobj(out):
        raise ParameterError(f"output with dtype={out.dtype} is not of complex type")
    else:
        if out.shape[-1] == shape[-1]:
            stft_matrix = out
        else:
            # `out` has room to spare: only use the leading frames
            stft_matrix = out[..., : shape[-1]]

    # Fill in the warm-up
    if center and extra > 0:
        off_start = y_frames_pre.shape[-1]
        stft_matrix[..., :off_start] = fft.rfft(fft_window * y_frames_pre, axis=-2)

        off_end = y_frames_post.shape[-1]
        if off_end > 0:
            stft_matrix[..., -off_end:] = fft.rfft(fft_window * y_frames_post, axis=-2)
    else:
        off_start = 0

    # Process the remaining frames in blocks of bounded memory footprint
    n_columns = int(
        util.MAX_MEM_BLOCK // (np.prod(y_frames.shape[:-1]) * y_frames.itemsize)
    )
    n_columns = max(n_columns, 1)

    for bl_s in range(0, y_frames.shape[-1], n_columns):
        bl_t = min(bl_s + n_columns, y_frames.shape[-1])

        stft_matrix[..., bl_s + off_start : bl_t + off_start] = fft.rfft(
            fft_window * y_frames[..., bl_s:bl_t], axis=-2
        )
    return stft_matrix
@cache(level=30)
def istft(
    stft_matrix: np.ndarray,
    *,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    n_fft: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    dtype: Optional[DTypeLike] = None,
    length: Optional[int] = None,
    out: Optional[np.ndarray] = None,
) -> np.ndarray:
    """
    Inverse short-time Fourier transform (ISTFT).

    Converts a complex-valued spectrogram ``stft_matrix`` to time-series ``y``
    by minimizing the mean squared error between ``stft_matrix`` and STFT of
    ``y`` as described in [#]_ up to Section 2 (reconstruction from MSTFT).

    In general, window function, hop length and other parameters should be same
    as in stft, which mostly leads to perfect reconstruction of a signal from
    unmodified ``stft_matrix``.

    .. [#] D. W. Griffin and J. S. Lim,
        "Signal estimation from modified short-time Fourier transform,"
        IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984.

    Parameters
    ----------
    stft_matrix : np.ndarray [shape=(..., 1 + n_fft//2, t)]
        STFT matrix from ``stft``

    hop_length : int > 0 [scalar]
        Number of audio samples between adjacent STFT columns.
        If unspecified, defaults to ``win_length // 4``.

    win_length : int <= n_fft = 2 * (stft_matrix.shape[-2] - 1)
        When reconstructing the time series, each frame is windowed
        and each sample is normalized by the sum of squared window
        according to the ``window`` function (see below).

        If unspecified, defaults to ``n_fft``.

    n_fft : int > 0 or None
        The number of samples per frame in the input spectrogram.
        By default, this will be inferred from the shape of ``stft_matrix``.
        However, if an odd frame length was used, you can specify the correct
        length by setting ``n_fft``.

    window : string, tuple, number, function, np.ndarray [shape=(n_fft,)]
        - a window specification (string, tuple, or number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.windows.hann`
        - a user-specified window vector of length ``n_fft``

        .. see also:: `filters.get_window`

    center : boolean
        - If ``True``, ``D`` is assumed to have centered frames.
        - If ``False``, ``D`` is assumed to have left-aligned frames.

    dtype : numeric type
        Real numeric type for ``y``.  Default is to match the numerical
        precision of the input spectrogram.

    length : int > 0, optional
        If provided, the output ``y`` is zero-padded or clipped to exactly
        ``length`` samples.

    out : np.ndarray or None
        A pre-allocated, real-valued array to store the reconstructed signal
        ``y``.  This must be of the correct shape for the given input parameters.
        Its contents are overwritten.

        If not provided, a new array is allocated and returned.

    Returns
    -------
    y : np.ndarray [shape=(..., n)]
        time domain signal reconstructed from ``stft_matrix``.
        If ``stft_matrix`` contains more than two axes
        (e.g., from a stereo input signal), then ``y`` will match shape on the leading dimensions.

    Raises
    ------
    ParameterError
        If ``hop_length`` is provided but is not a positive integer.

        If ``out`` is provided and its shape is not exactly the shape of the
        reconstructed signal.

    See Also
    --------
    stft : Short-time Fourier Transform

    Notes
    -----
    This function caches at level 30.

    Examples
    --------
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> D = librosa.stft(y)
    >>> y_hat = librosa.istft(D)
    >>> y_hat
    array([-1.407e-03, -4.461e-04, ...,  5.131e-06, -1.417e-05],
          dtype=float32)

    Exactly preserving length of the input signal requires explicit padding.
    Otherwise, a partial frame at the end of ``y`` will not be represented.

    >>> n = len(y)
    >>> n_fft = 2048
    >>> y_pad = librosa.util.fix_length(y, size=n + n_fft // 2)
    >>> D = librosa.stft(y_pad, n_fft=n_fft)
    >>> y_out = librosa.istft(D, length=n)
    >>> np.max(np.abs(y - y_out))
    8.940697e-08
    """

    if n_fft is None:
        n_fft = 2 * (stft_matrix.shape[-2] - 1)

    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)
    elif not util.is_positive_int(hop_length):
        # Same validation as the forward transform: a zero or negative hop
        # would otherwise surface later as a division error or a bad slice.
        raise ParameterError(f"hop_length={hop_length} must be a positive integer")

    ifft_window = get_window(window, win_length, fftbins=True)

    # Pad out to match n_fft, and add broadcasting axes
    ifft_window = util.pad_center(ifft_window, size=n_fft)
    ifft_window = util.expand_to(ifft_window, ndim=stft_matrix.ndim, axes=-2)

    # For efficiency, trim STFT frames according to signal length if available
    if length:
        if center:
            padded_length = length + 2 * (n_fft // 2)
        else:
            padded_length = length
        n_frames = min(stft_matrix.shape[-1], int(np.ceil(padded_length / hop_length)))
    else:
        n_frames = stft_matrix.shape[-1]

    if dtype is None:
        dtype = util.dtype_c2r(stft_matrix.dtype)

    shape = list(stft_matrix.shape[:-2])
    expected_signal_len = n_fft + hop_length * (n_frames - 1)

    if length:
        expected_signal_len = length
    elif center:
        expected_signal_len -= 2 * (n_fft // 2)

    shape.append(expected_signal_len)

    if out is None:
        y = np.zeros(shape, dtype=dtype)
    elif tuple(out.shape) != tuple(shape):
        # Exact tuple comparison: a numeric closeness test on shapes would
        # broadcast and could accept (or choke on) arrays of the wrong rank.
        raise ParameterError(
            f"Shape mismatch for provided output array out.shape={out.shape} != {shape}"
        )
    else:
        y = out
        # Since we'll be doing overlap-add here, this needs to be initialized to zero.
        y.fill(0.0)

    fft = get_fftlib()

    if center:
        # First frame that does not depend on padding
        #  k * hop_length - n_fft//2 >= 0
        # k * hop_length >= n_fft // 2
        # k >= (n_fft//2 / hop_length)

        start_frame = int(np.ceil((n_fft // 2) / hop_length))

        # Do overlap-add on the head block
        ytmp = ifft_window * fft.irfft(stft_matrix[..., :start_frame], n=n_fft, axis=-2)

        shape[-1] = n_fft + hop_length * (start_frame - 1)
        head_buffer = np.zeros(shape, dtype=dtype)

        __overlap_add(head_buffer, ytmp, hop_length)

        # If y is smaller than the head buffer, take everything
        if y.shape[-1] < shape[-1] - n_fft // 2:
            y[..., :] = head_buffer[..., n_fft // 2 : y.shape[-1] + n_fft // 2]
        else:
            # Trim off the first n_fft//2 samples from the head and copy into target buffer
            y[..., : shape[-1] - n_fft // 2] = head_buffer[..., n_fft // 2 :]

        # This offset compensates for any differences between frame alignment
        # and padding truncation
        offset = start_frame * hop_length - n_fft // 2

    else:
        start_frame = 0
        offset = 0

    # Invert the remaining frames in blocks of bounded memory footprint
    n_columns = int(
        util.MAX_MEM_BLOCK // (np.prod(stft_matrix.shape[:-1]) * stft_matrix.itemsize)
    )
    n_columns = max(n_columns, 1)

    frame = 0
    for bl_s in range(start_frame, n_frames, n_columns):
        bl_t = min(bl_s + n_columns, n_frames)

        # invert the block and apply the window function
        ytmp = ifft_window * fft.irfft(stft_matrix[..., bl_s:bl_t], n=n_fft, axis=-2)

        # Overlap-add the istft block starting at the i'th frame
        __overlap_add(y[..., frame * hop_length + offset :], ytmp, hop_length)

        frame += bl_t - bl_s

    # Normalize by sum of squared window
    ifft_window_sum = window_sumsquare(
        window=window,
        n_frames=n_frames,
        win_length=win_length,
        n_fft=n_fft,
        hop_length=hop_length,
        dtype=dtype,
    )

    if center:
        start = n_fft // 2
    else:
        start = 0

    ifft_window_sum = util.fix_length(ifft_window_sum[..., start:], size=y.shape[-1])

    # Only divide where the window envelope is numerically non-zero
    approx_nonzero_indices = ifft_window_sum > util.tiny(ifft_window_sum)

    y[..., approx_nonzero_indices] /= ifft_window_sum[approx_nonzero_indices]

    return y
@jit(nopython=True, cache=False)
def __overlap_add(y, ytmp, hop_length):
    # numba-accelerated overlap add for inverse stft
    #
    # y is the pre-allocated output buffer; it is modified in place
    # ytmp is the windowed inverse-stft frames, indexed as [..., sample, frame]
    # hop_length is the hop-length of the STFT analysis
    #
    # Frame k is accumulated into y starting at sample k * hop_length.
    # Frames that extend past the end of y are truncated, and frames that
    # start at or beyond the end of y are skipped entirely.

    n_fft = ytmp.shape[-2]
    N = n_fft
    for frame in range(ytmp.shape[-1]):
        sample = frame * hop_length
        if N > y.shape[-1] - sample:
            N = y.shape[-1] - sample

        # Nothing left to write.  Without this guard a negative N would make
        # the destination slice empty while `:N` still selects samples from
        # the source frame, producing a shape mismatch.  N never grows again,
        # so all later frames would be skipped as well.
        if N <= 0:
            break

        y[..., sample : (sample + N)] += ytmp[..., :N, frame]
def __reassign_frequencies(
    y: np.ndarray,
    sr: float = 22050,
    S: Optional[np.ndarray] = None,
    n_fft: int = 2048,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    dtype: Optional[DTypeLike] = None,
    pad_mode: _PadModeSTFT = "constant",
) -> Tuple[np.ndarray, np.ndarray]:
    """Estimate the instantaneous frequency of every spectrogram bin.

    The estimate follows equation 5.20 of Flandrin, Auger, &
    Chassande-Mottin 2002::

        omega_reassigned = omega - np.imag(S_dh/S_h)

    Here ``S_h`` is the complex STFT computed with the analysis window, and
    ``S_dh`` is the complex STFT computed with the derivative of that window.

    See `reassigned_spectrogram` for references.

    Frequency reassignment assumes that the energy in each FFT bin comes from
    exactly one signal component.  Padding applied at the signal edges when
    ``center=True`` may violate that assumption in the boundary frames, so
    ``center=False`` is recommended.  Note that `stft` does not accept
    ``pad_mode="wrap"``.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n,)], real-valued
        audio time series. Multi-channel is supported.
        This is required even when ``S`` is given, because the
        derivative-window STFT is always computed from ``y``.

    sr : number > 0 [scalar]
        sampling rate of ``y``

    S : np.ndarray [shape=(..., d, t)] or None
        (optional) complex STFT of ``y`` computed with the same arguments as
        those passed to this function.  If ``None``, it is computed here.

    n_fft : int > 0 [scalar]
        FFT window size. Defaults to 2048.

    hop_length : int > 0 [scalar]
        hop length, number samples between subsequent frames.
        Passed through to `stft`, which applies its own default when ``None``.

    win_length : int > 0, <= n_fft
        Window length. Defaults to ``n_fft``.
        The window is built at this length and then centre-padded to ``n_fft``.

    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        - a window specification (string, tuple, number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.windows.hann`
        - a user-specified window vector

        See `stft` for details.

        .. see also:: `filters.get_window`

    center : boolean
        - If ``True``, the signal ``y`` is padded so that frame
          ``S[:, t]`` is centered at ``y[t * hop_length]``.
        - If ``False``, then ``S[:, t]`` begins at ``y[t * hop_length]``.

    dtype : numeric type
        Complex numeric type for the STFTs.  When ``None``, it is taken from
        ``S`` if that is provided, and otherwise inferred from ``y``.

    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, STFT uses zero padding.

    Returns
    -------
    freqs : np.ndarray [shape=(..., 1 + n_fft/2, t), dtype=real]
        Instantaneous frequencies:
        ``freqs[f, t]`` is the frequency for bin ``f``, frame ``t``.
    S : np.ndarray [shape=(..., 1 + n_fft/2, t), dtype=complex]
        Short-time Fourier transform (the provided ``S``, if any)

    Warns
    -----
    RuntimeWarning
        Bins where ``S_h`` is zero are divided by zero; these produce a
        NumPy warning and a non-finite (`np.nan`) frequency estimate.

    See Also
    --------
    stft : Short-time Fourier Transform
    reassigned_spectrogram : Time-frequency reassigned spectrogram

    Examples
    --------
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> frequencies, S = librosa.core.spectrum.__reassign_frequencies(y, sr=sr)
    >>> frequencies
    array([[0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00],
           [3.628e+00, 4.698e+00, ..., 1.239e+01, 1.072e+01],
           ...,
           [1.101e+04, 1.102e+04, ..., 1.105e+04, 1.102e+04],
           [1.102e+04, 1.102e+04, ..., 1.102e+04, 1.102e+04]])

    """

    # Materialize the window samples (padded to the frame length) so that
    # its derivative can be taken below
    frame_window = util.pad_center(
        get_window(
            window, n_fft if win_length is None else win_length, fftbins=True
        ),
        size=n_fft,
    )

    if dtype is None:
        dtype = util.dtype_r2c(y.dtype) if S is None else S.dtype

    def _analyze(win: np.ndarray) -> np.ndarray:
        # STFT of y with the given (already padded) window vector
        return stft(
            y=y,
            n_fft=n_fft,
            hop_length=hop_length,
            window=win,
            center=center,
            dtype=dtype,
            pad_mode=pad_mode,
        )

    S_h = _analyze(frame_window) if S is None else S

    # cyclic gradient to correctly handle edges of a periodic window
    S_dh = _analyze(util.cyclic_gradient(frame_window))

    # equation 5.20 of Flandrin, Auger, & Chassande-Mottin 2002
    # the sign of the correction is reversed in some papers - see Plante,
    # Meyer, & Ainsworth 1998 pp. 283-284
    correction = -np.imag(S_dh / S_h)

    # Bin centre frequencies, broadcast along the frame axis, plus the
    # correction converted from radians per sample to Hz
    bin_freqs = convert.fft_frequencies(sr=sr, n_fft=n_fft)
    radians_to_hz = 0.5 * sr / np.pi
    freqs = (
        util.expand_to(bin_freqs, ndim=correction.ndim, axes=-2)
        + correction * radians_to_hz
    )

    return freqs, S_h
def __reassign_times(
    y: np.ndarray,
    sr: float = 22050,
    S: Optional[np.ndarray] = None,
    n_fft: int = 2048,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    dtype: Optional[DTypeLike] = None,
    pad_mode: _PadModeSTFT = "constant",
) -> Tuple[np.ndarray, np.ndarray]:
    """Estimate the reassigned time of every spectrogram bin.

    The estimate follows equation 5.23 of Flandrin, Auger, &
    Chassande-Mottin 2002::

        t_reassigned = t + np.real(S_th/S_h)

    Here ``S_h`` is the complex STFT computed with the analysis window, and
    ``S_th`` is the complex STFT computed with that window multiplied by the
    time offset (in samples) from the window center.

    See `reassigned_spectrogram` for references.

    Time reassignment assumes that the energy in each FFT bin comes from
    exactly one impulse event.  Use either zero padding
    (``pad_mode="constant"``, the default here) or ``center=False``: padding
    modes that copy signal content into the margins may invalidate the
    estimates in the boundary frames.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n,)], real-valued
        audio time series. Multi-channel is supported.
        This is required even when ``S`` is given, because the
        time-weighted STFT is always computed from ``y``.

    sr : number > 0 [scalar]
        sampling rate of ``y``

    S : np.ndarray [shape=(..., d, t)] or None
        (optional) complex STFT of ``y`` computed with the same arguments as
        those passed to this function.  If ``None``, it is computed here.

    n_fft : int > 0 [scalar]
        FFT window size. Defaults to 2048.

    hop_length : int > 0 [scalar]
        hop length, number samples between subsequent frames.
        If not supplied, defaults to ``win_length // 4``.

    win_length : int > 0, <= n_fft
        Window length. Defaults to ``n_fft``.
        The window is built at this length and then centre-padded to ``n_fft``.

    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        - a window specification (string, tuple, number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.windows.hann`
        - a user-specified window vector

        See `stft` for details.

        .. see also:: `filters.get_window`

    center : boolean
        - If ``True``, the signal ``y`` is padded so that frame
          ``S[:, t]`` is centered at ``y[t * hop_length]``.
        - If ``False``, then ``S[:, t]`` begins at ``y[t * hop_length]``.

    dtype : numeric type
        Complex numeric type for the STFTs.  When ``None``, it is taken from
        ``S`` if that is provided, and otherwise inferred from ``y``.

    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, STFT uses zero padding.

    Returns
    -------
    times : np.ndarray [shape=(..., 1 + n_fft/2, t), dtype=real]
        Reassigned times:
        ``times[f, t]`` is the time for bin ``f``, frame ``t``.
    S : np.ndarray [shape=(..., 1 + n_fft/2, t), dtype=complex]
        Short-time Fourier transform (the provided ``S``, if any)

    Warns
    -----
    RuntimeWarning
        Bins where ``S_h`` is zero are divided by zero; these produce a
        NumPy warning and a non-finite (`np.nan`) time estimate.

    See Also
    --------
    stft : Short-time Fourier Transform
    reassigned_spectrogram : Time-frequency reassigned spectrogram

    Examples
    --------
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> times, S = librosa.core.spectrum.__reassign_times(y, sr=sr)
    >>> times
    array([[ 2.268e-05,  1.144e-02, ...,  5.332e+00,  5.333e+00],
           [ 2.268e-05,  1.451e-02, ...,  5.334e+00,  5.333e+00],
           ...,
           [ 2.268e-05, -6.177e-04, ...,  5.368e+00,  5.327e+00],
           [ 2.268e-05,  1.420e-03, ...,  5.307e+00,  5.328e+00]])

    """

    # Resolve the window and hop lengths up front: the window samples are
    # needed for the time weighting, and the hop for the frame time grid
    if win_length is None:
        win_length = n_fft
    if hop_length is None:
        hop_length = int(win_length // 4)

    frame_window = util.pad_center(
        get_window(window, win_length, fftbins=True), size=n_fft
    )

    if dtype is None:
        dtype = util.dtype_r2c(y.dtype) if S is None else S.dtype

    def _analyze(win: np.ndarray) -> np.ndarray:
        # STFT of y with the given (already padded) window vector
        return stft(
            y=y,
            n_fft=n_fft,
            hop_length=hop_length,
            window=win,
            center=center,
            dtype=dtype,
            pad_mode=pad_mode,
        )

    S_h = _analyze(frame_window) if S is None else S

    # Sample offsets relative to the window center: integers for an odd
    # frame length, half-integers for an even one
    half_width = n_fft // 2
    window_times: np.ndarray = (
        np.arange(-half_width, half_width + 1)
        if n_fft % 2
        else np.arange(0.5 - half_width, half_width)
    )

    S_th = _analyze(frame_window * window_times)

    # equation 5.23 of Flandrin, Auger, & Chassande-Mottin 2002
    # the sign of the correction is reversed in some papers - see Plante,
    # Meyer, & Ainsworth 1998 pp. 283-284
    correction = np.real(S_th / S_h)

    # For left-aligned frames, pass the frame length on to the time
    # conversion; centered frames need no such adjustment
    pad_length = None if center else n_fft

    frame_times = convert.frames_to_time(
        np.arange(S_h.shape[-1]), sr=sr, hop_length=hop_length, n_fft=pad_length
    )

    # Frame times broadcast along the frequency axis, plus the correction
    # converted from samples to seconds
    times = util.expand_to(frame_times, ndim=correction.ndim, axes=-1) + correction / sr

    return times, S_h
def reassigned_spectrogram(
    y: np.ndarray,
    *,
    sr: float = 22050,
    S: Optional[np.ndarray] = None,
    n_fft: int = 2048,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    reassign_frequencies: bool = True,
    reassign_times: bool = True,
    ref_power: Union[float, Callable] = 1e-6,
    fill_nan: bool = False,
    clip: bool = True,
    dtype: Optional[DTypeLike] = None,
    pad_mode: _PadModeSTFT = "constant",
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    r"""Time-frequency reassigned spectrogram.

    The reassignment vectors are calculated using equations 5.20 and 5.23 in
    [#]_::

        t_reassigned = t + np.real(S_th/S_h)
        omega_reassigned = omega - np.imag(S_dh/S_h)

    where ``S_h`` is the complex STFT calculated using the original window,
    ``S_dh`` is the complex STFT calculated using the derivative of the original
    window, and ``S_th`` is the complex STFT calculated using the original window
    multiplied by the time offset from the window center. See [#]_ for
    additional algorithms, and [#]_ and [#]_ for history and discussion of the
    method.

    .. [#] Flandrin, P., Auger, F., & Chassande-Mottin, E. (2002).
        Time-Frequency reassignment: From principles to algorithms. In
        Applications in Time-Frequency Signal Processing (Vol. 10, pp.
        179-204). CRC Press.

    .. [#] Fulop, S. A., & Fitz, K. (2006). Algorithms for computing the
        time-corrected instantaneous frequency (reassigned) spectrogram, with
        applications. The Journal of the Acoustical Society of America, 119(1),
        360. doi:10.1121/1.2133000

    .. [#] Auger, F., Flandrin, P., Lin, Y.-T., McLaughlin, S., Meignen, S.,
        Oberlin, T., & Wu, H.-T. (2013). Time-Frequency Reassignment and
        Synchrosqueezing: An Overview. IEEE Signal Processing Magazine, 30(6),
        32-41. doi:10.1109/MSP.2013.2265316

    .. [#] Hainsworth, S., Macleod, M. (2003). Time-frequency reassignment: a
        review and analysis. Tech. Rep. CUED/FINFENG/TR.459, Cambridge
        University Engineering Department

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)], real-valued
        audio time series. Multi-channel is supported.

    sr : number > 0 [scalar]
        sampling rate of ``y``

    S : np.ndarray [shape=(..., d, t)] or None
        (optional) complex STFT calculated using the other arguments provided
        to ``reassigned_spectrogram``

    n_fft : int > 0 [scalar]
        FFT window size. Defaults to 2048.

    hop_length : int > 0 [scalar]
        hop length, number samples between subsequent frames.
        If not supplied, defaults to ``win_length // 4``.

    win_length : int > 0, <= n_fft
        Window length. Defaults to ``n_fft``.
        See `stft` for details.

    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        - a window specification (string, tuple, number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.windows.hann`
        - a user-specified window vector of length ``n_fft``

        See `stft` for details.

        .. see also:: `filters.get_window`

    center : boolean
        - If ``True`` (default), the signal ``y`` is padded so that frame
          ``S[:, t]`` is centered at ``y[t * hop_length]``. See `Notes` for
          recommended usage in this function.
        - If ``False``, then ``S[:, t]`` begins at ``y[t * hop_length]``.

    reassign_frequencies : boolean
        - If ``True`` (default), the returned frequencies will be instantaneous
          frequency estimates.
        - If ``False``, the returned frequencies will be a read-only view of the
          STFT bin frequencies for all frames.

    reassign_times : boolean
        - If ``True`` (default), the returned times will be corrected
          (reassigned) time estimates for each bin.
        - If ``False``, the returned times will be a read-only view of the STFT
          frame times for all bins.

    ref_power : float >= 0 or callable
        Minimum power threshold for estimating time-frequency reassignments.
        Any bin with ``np.abs(S[f, t])**2 < ref_power`` will be returned as
        `np.nan` in both frequency and time, unless ``fill_nan`` is ``True``. If 0
        is provided, then only bins with zero power will be returned as
        `np.nan` (unless ``fill_nan=True``).
        If callable, the threshold is computed as ``ref_power(np.abs(S)**2)``.

    fill_nan : boolean
        - If ``False`` (default), the frequency and time reassignments for bins
          below the power threshold provided in ``ref_power`` will be returned as
          `np.nan`.
        - If ``True``, the frequency and time reassignments for these bins will
          be returned as the bin center frequencies and frame times.

    clip : boolean
        - If ``True`` (default), estimated frequencies outside the range
          `[0, 0.5 * sr]` or times outside the range `[0, len(y) / sr]` will be
          clipped to those ranges.
        - If ``False``, estimated frequencies and times beyond the bounds of the
          spectrogram may be returned.

    dtype : numeric type
        Complex numeric type for STFT calculation. Default is inferred to match
        the precision of the input signal.

    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, STFT uses zero padding.

    Returns
    -------
    freqs, times, mags : np.ndarray [shape=(..., 1 + n_fft/2, t), dtype=real]
        Instantaneous frequencies:
            ``freqs[..., f, t]`` is the frequency for bin ``f``, frame ``t``.
            If ``reassign_frequencies=False``, this will instead be a read-only array
            of the same shape containing the bin center frequencies for all frames.

        Reassigned times:
            ``times[..., f, t]`` is the time for bin ``f``, frame ``t``.
            If ``reassign_times=False``, this will instead be a read-only array of
            the same shape containing the frame times for all bins.

        Magnitudes from short-time Fourier transform:
            ``mags[..., f, t]`` is the magnitude for bin ``f``, frame ``t``.

    Raises
    ------
    ParameterError
        If ``ref_power`` is a negative number, or if both
        ``reassign_frequencies`` and ``reassign_times`` are ``False``.

    Warns
    -----
    RuntimeWarning
        Frequency or time estimates with zero support will produce a
        divide-by-zero warning, and will be returned as `np.nan` unless
        ``fill_nan=True``.

    See Also
    --------
    stft : Short-time Fourier Transform

    Notes
    -----
    It is recommended to use ``center=False`` with this function rather than the
    librosa default ``True``. Unlike ``stft``, reassigned times are not aligned to
    the left or center of each frame, so padding the signal does not affect the
    meaning of the reassigned times. However, reassignment assumes that the
    energy in each FFT bin is associated with exactly one signal component and
    impulse event.

    If ``reassign_times`` is ``False``, the frame times that are returned will be
    aligned to the left or center of the frame, depending on the value of
    ``center``. In this case, if ``center`` is ``True``, then ``pad_mode="wrap"`` is
    recommended for valid estimation of the instantaneous frequencies in the
    boundary frames.

    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> amin = 1e-10
    >>> n_fft = 64
    >>> sr = 4000
    >>> y = 1e-3 * librosa.clicks(times=[0.3], sr=sr, click_duration=1.0,
    ...                           click_freq=1200.0, length=8000) +\
    ...     1e-3 * librosa.clicks(times=[1.5], sr=sr, click_duration=0.5,
    ...                           click_freq=400.0, length=8000) +\
    ...     1e-3 * librosa.chirp(fmin=200, fmax=1600, sr=sr, duration=2.0) +\
    ...     1e-6 * np.random.randn(2*sr)
    >>> freqs, times, mags = librosa.reassigned_spectrogram(y=y, sr=sr,
    ...                                                     n_fft=n_fft)
    >>> mags_db = librosa.amplitude_to_db(mags, ref=np.max)

    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> img = librosa.display.specshow(mags_db, x_axis="s", y_axis="linear", sr=sr,
    ...                          hop_length=n_fft//4, ax=ax[0])
    >>> ax[0].set(title="Spectrogram", xlabel=None)
    >>> ax[0].label_outer()
    >>> ax[1].scatter(times, freqs, c=mags_db, cmap="magma", alpha=0.1, s=5)
    >>> ax[1].set_title("Reassigned spectrogram")
    >>> fig.colorbar(img, ax=ax, format="%+2.f dB")
    """

    if not callable(ref_power) and ref_power < 0:
        raise ParameterError("ref_power must be non-negative or callable.")

    if not reassign_frequencies and not reassign_times:
        raise ParameterError("reassign_frequencies or reassign_times must be True.")

    if win_length is None:
        win_length = n_fft

    if hop_length is None:
        hop_length = int(win_length // 4)

    # frequency and time reassignment if requested
    if reassign_frequencies:
        freqs, S = __reassign_frequencies(
            y=y,
            sr=sr,
            S=S,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            dtype=dtype,
            pad_mode=pad_mode,
        )

    if reassign_times:
        times, S = __reassign_times(
            y=y,
            sr=sr,
            S=S,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            dtype=dtype,
            pad_mode=pad_mode,
        )

    # At least one of the two helpers ran (checked above), so S is populated.
    assert S is not None

    mags: np.ndarray = np.abs(S)

    # clean up reassignment issues: divide-by-zero, bins with near-zero power,
    # and estimates outside the spectrogram bounds

    # retrieve bin frequencies and frame times to replace missing estimates
    if fill_nan or not reassign_frequencies or not reassign_times:
        if center:
            pad_length = None

        else:
            pad_length = n_fft

        bin_freqs = convert.fft_frequencies(sr=sr, n_fft=n_fft)

        frame_times = convert.frames_to_time(
            frames=np.arange(S.shape[-1]),
            sr=sr,
            hop_length=hop_length,
            n_fft=pad_length,
        )

    # find bins below the power threshold
    # reassigned bins with zero power will already be NaN
    if callable(ref_power):
        ref_p = ref_power(mags**2)
    else:
        ref_p = ref_power

    # NaN magnitudes are excluded from the comparison through ``where``.
    # Without an explicit ``out`` buffer those positions would be left as
    # uninitialized memory, so the mask is pre-filled with False to keep the
    # result deterministic.
    mags_low = np.zeros_like(mags, dtype=bool)
    np.less(mags, ref_p**0.5, out=mags_low, where=~np.isnan(mags))

    # for reassigned estimates, optionally set thresholded bins to NaN, return
    # bin frequencies and frame times in place of NaN generated by
    # divide-by-zero and power threshold, and clip to spectrogram bounds
    if reassign_frequencies:
        if ref_p > 0:
            freqs[mags_low] = np.nan

        if fill_nan:
            freqs = np.where(np.isnan(freqs), bin_freqs[:, np.newaxis], freqs)

        if clip:
            np.clip(freqs, 0, sr / 2.0, out=freqs)

    # or if reassignment was not requested, return bin frequencies and frame
    # times for every cell in the spectrogram
    else:
        freqs = np.broadcast_to(bin_freqs[:, np.newaxis], S.shape)

    if reassign_times:
        if ref_p > 0:
            times[mags_low] = np.nan

        if fill_nan:
            times = np.where(np.isnan(times), frame_times[np.newaxis, :], times)

        if clip:
            np.clip(times, 0, y.shape[-1] / float(sr), out=times)

    else:
        times = np.broadcast_to(frame_times[np.newaxis, :], S.shape)

    return freqs, times, mags
def magphase(D: np.ndarray, *, power: float = 1) -> Tuple[np.ndarray, np.ndarray]:
    """Split a complex-valued spectrogram into magnitude and unit-phase factors.

    The two returned arrays satisfy ``D = S * P`` when ``power=1``.

    Parameters
    ----------
    D : np.ndarray [shape=(..., d, t), dtype=complex]
        complex-valued spectrogram
    power : float > 0
        Exponent applied to the magnitude spectrogram,
        e.g., 1 for magnitude, 2 for power, etc.

    Returns
    -------
    D_mag : np.ndarray [shape=(..., d, t), dtype=real]
        magnitude of ``D``, raised to ``power``
    D_phase : np.ndarray [shape=(..., d, t), dtype=complex]
        ``exp(1.j * phi)`` where ``phi`` is the phase of ``D``.
        Bins where ``D`` is exactly zero are given phase ``1 + 0j``.

    Examples
    --------
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> D = librosa.stft(y)
    >>> magnitude, phase = librosa.magphase(D)
    >>> magnitude
    array([[5.395e-03, 3.332e-03, ..., 9.862e-07, 1.201e-05],
           [3.244e-03, 2.690e-03, ..., 9.536e-07, 1.201e-05],
           ...,
           [7.523e-05, 3.722e-05, ..., 1.188e-04, 1.031e-03],
           [7.640e-05, 3.944e-05, ..., 5.180e-04, 1.346e-03]],
          dtype=float32)

    Or get the phase angle (in radians)

    >>> np.angle(phase)
    array([[ 0.000e+00,  0.000e+00, ..., -3.142e+00, -3.142e+00],
           [-3.142e+00, -2.458e+00, ..., -2.658e+00, -1.079e+00],
           ...,
           [-4.342e-12, -1.543e+00, ..., -1.794e+00, -2.419e+00],
           [-3.142e+00, -3.142e+00, ...,  0.000e+00,  0.000e+00]],
          dtype=float32)
    """
    magnitude = np.abs(D)

    # Silent bins would divide by zero; bump their divisor to 1 so the
    # quotient is 0, then add the mask back to the real part to get 1 + 0j.
    silent = np.equal(magnitude, 0)
    divisor = np.add(magnitude, silent)

    # Real and imaginary parts are divided separately: a direct complex
    # division can yield NaN when denormalized numbers are involved
    # (< ~2e-39 for complex64, ~5e-309 for complex128).
    unit_phase = np.empty_like(D, dtype=util.dtype_r2c(D.dtype))
    unit_phase.real = D.real / divisor + silent
    unit_phase.imag = D.imag / divisor

    # Exponentiate in place, exactly as ``magnitude **= power`` would.
    magnitude **= power

    return magnitude, unit_phase
def phase_vocoder(
    D: np.ndarray,
    *,
    rate: float,
    hop_length: Optional[int] = None,
    n_fft: Optional[int] = None,
) -> np.ndarray:
    """Phase vocoder: time-stretch an STFT matrix by a factor of ``rate``.

    Based on the implementation provided by [#]_.

    This is a simplified implementation, intended primarily for
    reference and pedagogical purposes.  It makes no attempt to
    handle transients, and is likely to produce many audible
    artifacts.  For a higher quality implementation, we recommend
    the RubberBand library [#]_ and its Python wrapper `pyrubberband`.

    .. [#] Ellis, D. P. W. "A phase vocoder in Matlab."
        Columbia University, 2002.
        http://www.ee.columbia.edu/~dpwe/resources/matlab/pvoc/

    .. [#] https://breakfastquay.com/rubberband/

    Examples
    --------
    >>> # Play at double speed
    >>> y, sr   = librosa.load(librosa.ex('trumpet'))
    >>> D       = librosa.stft(y, n_fft=2048, hop_length=512)
    >>> D_fast  = librosa.phase_vocoder(D, rate=2.0, hop_length=512)
    >>> y_fast  = librosa.istft(D_fast, hop_length=512)

    >>> # Or play at 1/3 speed
    >>> y, sr   = librosa.load(librosa.ex('trumpet'))
    >>> D       = librosa.stft(y, n_fft=2048, hop_length=512)
    >>> D_slow  = librosa.phase_vocoder(D, rate=1./3, hop_length=512)
    >>> y_slow  = librosa.istft(D_slow, hop_length=512)

    Parameters
    ----------
    D : np.ndarray [shape=(..., d, t), dtype=complex]
        STFT matrix

    rate : float > 0 [scalar]
        Speed-up factor: ``rate > 1`` is faster, ``rate < 1`` is slower.

    hop_length : int > 0 [scalar] or None
        The number of samples between successive columns of ``D``.

        If None, defaults to ``n_fft // 4``.

    n_fft : int > 0 or None
        The number of samples per frame in D.
        By default (None), this is inferred from the shape of D as
        ``2 * (D.shape[-2] - 1)``.
        However, if D was constructed using an odd-length window, the correct
        frame length can be specified here.

    Returns
    -------
    D_stretched : np.ndarray [shape=(..., d, t / rate), dtype=complex]
        time-stretched STFT

    See Also
    --------
    pyrubberband
    """
    n_bins = D.shape[-2]
    n_frames = D.shape[-1]

    if n_fft is None:
        n_fft = 2 * (n_bins - 1)

    if hop_length is None:
        hop_length = int(n_fft // 4)

    # Fractional positions, in input frames, of every output frame
    time_steps = np.arange(0, n_frames, rate, dtype=np.float64)

    # Output buffer: same layout as the input, with a new frame count
    out_shape = D.shape[:-1] + (len(time_steps),)
    d_stretch = np.zeros_like(D, shape=out_shape)

    # Phase advance expected in each bin over one hop
    phi_advance = np.linspace(0, np.pi * hop_length, n_bins)

    # Running phase, seeded from the first input frame
    phase_acc = np.angle(D[..., 0])

    # Two trailing zero frames so that the "next frame" lookup never falls
    # off the end of the array
    pad_widths = [(0, 0)] * (D.ndim - 1) + [(0, 2)]
    D = np.pad(D, pad_widths, mode="constant")

    two_pi = 2.0 * np.pi

    for t, step in enumerate(time_steps):
        frame = int(step)
        current = D[..., frame]
        following = D[..., frame + 1]

        # Linear interpolation of the magnitude between the two frames
        alpha = np.mod(step, 1.0)
        mag = (1.0 - alpha) * np.abs(current) + alpha * np.abs(following)

        # Emit the output frame using the phase accumulated so far
        d_stretch[..., t] = util.phasor(phase_acc, mag=mag)

        # Deviation of the measured phase advance from the expected one
        dphase = np.angle(following) - np.angle(current) - phi_advance

        # Wrap to the -pi:pi range
        dphase = dphase - two_pi * np.round(dphase / two_pi)

        # Accumulate phase for the next output frame
        phase_acc += phi_advance + dphase

    return d_stretch
@cache(level=20)
def iirt(
    y: np.ndarray,
    *,
    sr: float = 22050,
    win_length: int = 2048,
    hop_length: Optional[int] = None,
    center: bool = True,
    tuning: float = 0.0,
    pad_mode: _PadMode = "constant",
    flayout: str = "sos",
    res_type: str = "soxr_hq",
    **kwargs: Any,
) -> np.ndarray:
    r"""Time-frequency representation using IIR filters

    This function will return a time-frequency representation
    using a multirate filter bank consisting of IIR filters. [#]_

    First, ``y`` is resampled as needed according to the provided ``sample_rates``.

    Then, a filterbank with ``n`` band-pass filters is designed.

    The resampled input signals are processed by the filterbank as a whole.
    (`scipy.signal.filtfilt` resp. `sosfiltfilt` is used to make the phase linear.)
    The output of the filterbank is cut into frames.
    For each band, the short-time mean-square power (STMSP) is calculated by
    summing ``win_length`` subsequent filtered time samples.

    When called with the default set of parameters, it will generate the TF-representation
    (pitch filterbank):

        * 85 filters with MIDI pitches [24, 108] as ``center_freqs``.
        * each filter having a bandwidth of one semitone.

    .. [#] Müller, Meinard.
           "Information Retrieval for Music and Motion."
           Springer Verlag. 2007.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time series. Multi-channel is supported.
    sr : number > 0 [scalar]
        sampling rate of ``y``
    win_length : int > 0
        Window length, in samples at rate ``sr``.
    hop_length : int > 0 [scalar]
        Hop length, number samples between subsequent frames.
        If not supplied, defaults to ``win_length // 4``.
    center : boolean
        - If ``True``, the signal ``y`` is padded so that frame
          ``D[..., :, t]`` is centered at ``y[t * hop_length]``.
        - If ``False``, then ``D[..., :, t]`` begins at ``y[t * hop_length]``
    tuning : float [scalar]
        Tuning deviation from A440 in fractions of a bin.
    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, this function uses zero padding.
    flayout : string
        - If `sos` (default), a series of second-order filters is used for filtering with `scipy.signal.sosfiltfilt`.
          Minimizes numerical precision errors for high-order filters, but is slower.
        - If `ba`, the standard difference equation is used for filtering with `scipy.signal.filtfilt`.
          Can be unstable for high-order filters.
    res_type : string
        The resampling mode.  See `librosa.resample` for details.
    **kwargs : additional keyword arguments
        Additional arguments for `librosa.filters.semitone_filterbank`
        (e.g., could be used to provide another set of ``center_freqs`` and ``sample_rates``).

    Returns
    -------
    bands_power : np.ndarray [shape=(..., n, t), dtype=dtype]
        Short-time mean-square power for the input signal.

    Raises
    ------
    ParameterError
        If ``flayout`` is neither `ba` nor `sos`.

    See Also
    --------
    librosa.filters.semitone_filterbank
    librosa.filters.mr_frequencies
    librosa.cqt
    scipy.signal.filtfilt
    scipy.signal.sosfiltfilt

    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.ex('trumpet'), duration=3)
    >>> D = np.abs(librosa.iirt(y))
    >>> C = np.abs(librosa.cqt(y=y, sr=sr))
    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> img = librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max),
    ...                                y_axis='cqt_hz', x_axis='time', ax=ax[0])
    >>> ax[0].set(title='Constant-Q transform')
    >>> ax[0].label_outer()
    >>> img = librosa.display.specshow(librosa.amplitude_to_db(D, ref=np.max),
    ...                                y_axis='cqt_hz', x_axis='time', ax=ax[1])
    >>> ax[1].set_title('Semitone spectrogram (iirt)')
    >>> fig.colorbar(img, ax=ax, format="%+2.0f dB")
    """
    if flayout not in ("ba", "sos"):
        raise ParameterError(f"Unsupported flayout={flayout}")

    # Validate the audio buffer (multi-channel allowed)
    util.valid_audio(y, mono=False)

    # Default hop: a quarter of the window
    if hop_length is None:
        hop_length = win_length // 4

    # Center the frames by padding half a window on each side of the time axis
    if center:
        half = win_length // 2
        pad_widths = [(0, 0)] * (y.ndim - 1) + [(half, half)]
        y = np.pad(y, pad_widths, mode=pad_mode)

    # Design the semitone filterbank; each filter comes with its own sample rate
    filterbank_ct, sample_rates = semitone_filterbank(
        tuning=tuning, flayout=flayout, **kwargs
    )

    # One resampled copy of the signal per distinct filter sample rate
    y_srs = np.unique(sample_rates)
    y_resampled = [
        resample(y, orig_sr=sr, target_sr=target, res_type=res_type)
        for target in y_srs
    ]

    # Number of whole frames that fit; a trailing partial frame is dropped
    n_frames = int(1 + (y.shape[-1] - win_length) // hop_length)

    # Output layout: leading axes of y, then one row per filter, then frames
    out_shape = y.shape[:-1] + (len(filterbank_ct), n_frames)
    bands_power = np.empty_like(y, shape=out_shape)

    for band, (band_sr, band_filter) in enumerate(zip(sample_rates, filterbank_ct)):
        # Select the copy of the signal resampled to this band's rate
        signal = y_resampled[np.flatnonzero(y_srs == band_sr)[0]]

        # Zero-phase filtering; flayout was validated above, so anything that
        # is not "ba" is "sos"
        if flayout == "ba":
            filtered = scipy.signal.filtfilt(
                band_filter[0], band_filter[1], signal, axis=-1
            )
        else:
            filtered = scipy.signal.sosfiltfilt(band_filter, signal, axis=-1)

        # Express the hop and window in samples at this band's rate
        factor = sr / band_sr
        hop_band = hop_length / factor
        win_band = int(round(win_length / factor))

        # The hop stays fractional while the frame starts are generated;
        # rounding happens once at the end to avoid accumulated rounding errors.
        starts = np.arange(0, filtered.shape[-1] - win_band, hop_band)
        if len(starts) < n_frames:
            # Too short after resampling: extend so that n_frames frames fit
            min_length = int(np.ceil(n_frames * hop_band)) + win_band
            filtered = util.fix_length(filtered, size=min_length)
            starts = np.arange(0, filtered.shape[-1] - win_band, hop_band)
        starts = np.round(starts).astype(int)[:n_frames]

        # Sample indices of every frame: shape (n_frames, win_band)
        frame_idx = starts[:, np.newaxis] + np.arange(win_band)

        bands_power[..., band, :] = factor * np.sum(
            filtered[..., frame_idx] ** 2, axis=-1
        )

    return bands_power
@cache(level=30)
def power_to_db(
    S: np.ndarray,
    *,
    ref: Union[float, Callable] = 1.0,
    amin: float = 1e-10,
    top_db: Optional[float] = 80.0,
) -> np.ndarray:
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units

    This computes the scaling ``10 * log10(S / ref)`` in a numerically
    stable way.

    Parameters
    ----------
    S : np.ndarray
        input power

    ref : scalar or callable
        If scalar, the amplitude ``abs(S)`` is scaled relative to ``ref``::

            10 * log10(S / ref)

        Zeros in the output correspond to positions where ``S == ref``.

        If callable, the reference value is computed as ``ref(S)``.

    amin : float > 0 [scalar]
        minimum threshold for ``abs(S)`` and ``ref``

    top_db : float >= 0 [scalar]
        threshold the output at ``top_db`` below the peak:
        ``max(10 * log10(S/ref)) - top_db``.
        If ``None``, no thresholding is applied.

    Returns
    -------
    S_db : np.ndarray
        ``S_db ~= 10 * log10(S) - 10 * log10(ref)``

    Raises
    ------
    ParameterError
        If ``amin <= 0`` or ``top_db < 0``.

    Warns
    -----
    UserWarning
        If ``S`` is complex-valued; only its magnitude is used.

    See Also
    --------
    perceptual_weighting
    db_to_power
    amplitude_to_db
    db_to_amplitude

    Notes
    -----
    This function caches at level 30.

    Examples
    --------
    Get a power spectrogram from a waveform ``y``

    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> S = np.abs(librosa.stft(y))
    >>> librosa.power_to_db(S**2)

    Compute dB relative to peak power

    >>> librosa.power_to_db(S**2, ref=np.max)

    Or compare to median power

    >>> librosa.power_to_db(S**2, ref=np.median)

    And plot the results

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> imgpow = librosa.display.specshow(S**2, sr=sr, y_axis='log', x_axis='time',
    ...                                   ax=ax[0])
    >>> ax[0].set(title='Power spectrogram')
    >>> ax[0].label_outer()
    >>> imgdb = librosa.display.specshow(librosa.power_to_db(S**2, ref=np.max),
    ...                                  sr=sr, y_axis='log', x_axis='time', ax=ax[1])
    >>> ax[1].set(title='Log-Power spectrogram')
    >>> fig.colorbar(imgpow, ax=ax[0])
    >>> fig.colorbar(imgdb, ax=ax[1], format="%+2.0f dB")
    """
    S = np.asarray(S)

    if amin <= 0:
        raise ParameterError("amin must be strictly positive")

    # Real input is used as given; complex input is reduced to its modulus
    magnitude = S
    if np.issubdtype(S.dtype, np.complexfloating):
        warnings.warn(
            "power_to_db was called on complex input so phase "
            "information will be discarded. To suppress this warning, "
            "call power_to_db(np.abs(D)**2) instead.",
            stacklevel=2,
        )
        magnitude = np.abs(S)

    # A callable reference is evaluated on the (real) input power
    ref_value = ref(magnitude) if callable(ref) else np.abs(ref)

    # Both terms are floored at amin so the logarithms stay finite
    floored_power = np.maximum(amin, magnitude)
    floored_ref = np.maximum(amin, ref_value)

    log_spec: np.ndarray = 10.0 * np.log10(floored_power)
    log_spec -= 10.0 * np.log10(floored_ref)

    if top_db is None:
        return log_spec

    if top_db < 0:
        raise ParameterError("top_db must be non-negative")

    # Limit the dynamic range to top_db below the peak
    return np.maximum(log_spec, log_spec.max() - top_db)
@cache(level=30)
def db_to_power(S_db: np.ndarray, *, ref: float = 1.0) -> np.ndarray:
    """Convert a dB-scale spectrogram to a power spectrogram.

    This effectively inverts ``power_to_db``::

        db_to_power(S_db) ~= ref * 10.0**(S_db / 10)

    Parameters
    ----------
    S_db : np.ndarray
        dB-scaled spectrogram
    ref : number > 0
        Reference power: output will be scaled by this value

    Returns
    -------
    S : np.ndarray
        Power spectrogram

    Notes
    -----
    This function caches at level 30.
    """
    return ref * np.power(10.0, 0.1 * S_db)


@cache(level=30)
def amplitude_to_db(
    S: np.ndarray,
    *,
    ref: Union[float, Callable] = 1.0,
    amin: float = 1e-5,
    top_db: Optional[float] = 80.0,
) -> np.ndarray:
    """Convert an amplitude spectrogram to dB-scaled spectrogram.

    This is equivalent to ``power_to_db(S**2, ref=ref**2, amin=amin**2, top_db=top_db)``,
    but is provided for convenience.

    Parameters
    ----------
    S : np.ndarray
        input amplitude. Scalar (0-dimensional) input is also accepted.

    ref : scalar or callable
        If scalar, the amplitude ``abs(S)`` is scaled relative to ``ref``:
        ``20 * log10(S / ref)``.
        Zeros in the output correspond to positions where ``S == ref``.

        If callable, the reference value is computed as ``ref(S)``.

    amin : float > 0 [scalar]
        minimum threshold for ``S`` and ``ref``

    top_db : float >= 0 [scalar]
        threshold the output at ``top_db`` below the peak:
        ``max(20 * log10(S/ref)) - top_db``

    Returns
    -------
    S_db : np.ndarray
        ``S`` measured in dB

    Warns
    -----
    UserWarning
        If ``S`` is complex-valued; only its magnitude is used.

    See Also
    --------
    power_to_db, db_to_amplitude

    Notes
    -----
    This function caches at level 30.
    """

    S = np.asarray(S)

    if np.issubdtype(S.dtype, np.complexfloating):
        warnings.warn(
            "amplitude_to_db was called on complex input so phase "
            "information will be discarded. To suppress this warning, "
            "call amplitude_to_db(np.abs(S)) instead.",
            stacklevel=2,
        )

    # np.abs always allocates, so the caller's array is never modified below
    magnitude = np.abs(S)

    if callable(ref):
        # User supplied a function to calculate the reference amplitude
        ref_value = ref(magnitude)
    else:
        ref_value = np.abs(ref)

    # Square in place to avoid a second full-size allocation.  For 0-d input
    # np.abs returns a NumPy scalar rather than an ndarray, and a scalar cannot
    # be used as an ``out`` buffer, so fall back to a fresh result there.
    out_array = magnitude if isinstance(magnitude, np.ndarray) else None
    power = np.square(magnitude, out=out_array)

    return power_to_db(power, ref=ref_value**2, amin=amin**2, top_db=top_db)
@cache(level=30)
def db_to_amplitude(S_db: np.ndarray, *, ref: float = 1.0) -> np.ndarray:
    """Convert a dB-scaled spectrogram to an amplitude spectrogram.

    This effectively inverts `amplitude_to_db`::

        db_to_amplitude(S_db) ~= 10.0**(0.5 * S_db/10 + log10(ref))

    Parameters
    ----------
    S_db : np.ndarray
        dB-scaled spectrogram
    ref : number > 0
        Optional reference amplitude: the output is scaled by this value.

    Returns
    -------
    S : np.ndarray
        Linear magnitude spectrogram

    Notes
    -----
    This function caches at level 30.
    """
    # Amplitude is the square root of power, so the reference is squared
    # before the power-domain conversion.
    power = db_to_power(S_db, ref=ref**2)
    return power**0.5


@cache(level=30)
def perceptual_weighting(
    S: np.ndarray, frequencies: np.ndarray, *, kind: str = "A", **kwargs: Any
) -> np.ndarray:
    """Perceptual weighting of a power spectrogram::

        S_p[..., f, :] = frequency_weighting(f, 'A') + 10*log10(S[..., f, :] / ref)

    Parameters
    ----------
    S : np.ndarray [shape=(..., d, t)]
        Power spectrogram
    frequencies : np.ndarray [shape=(d,)]
        Center frequency for each row of ``S``
    kind : str
        The frequency weighting curve to use.
        e.g. `'A'`, `'B'`, `'C'`, `'D'`, `None or 'Z'`
    **kwargs : additional keyword arguments
        Additional keyword arguments to `power_to_db`.

    Returns
    -------
    S_p : np.ndarray [shape=(..., d, t)]
        perceptually weighted version of ``S``

    See Also
    --------
    power_to_db

    Notes
    -----
    This function caches at level 30.

    Examples
    --------
    Re-weight a CQT power spectrum, using peak power as reference

    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('A1')))
    >>> freqs = librosa.cqt_frequencies(C.shape[0],
    ...                                 fmin=librosa.note_to_hz('A1'))
    >>> perceptual_CQT = librosa.perceptual_weighting(C**2,
    ...                                               freqs,
    ...                                               ref=np.max)
    >>> perceptual_CQT
    array([[ -96.528,  -97.101, ..., -108.561, -108.561],
           [ -95.88 ,  -96.479, ..., -107.551, -107.551],
           ...,
           [ -65.142,  -53.256, ...,  -80.098,  -80.098],
           [ -71.542,  -53.197, ...,  -80.311,  -80.311]])

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> img = librosa.display.specshow(librosa.amplitude_to_db(C,
    ...                                                        ref=np.max),
    ...                                fmin=librosa.note_to_hz('A1'),
    ...                                y_axis='cqt_hz', x_axis='time',
    ...                                ax=ax[0])
    >>> ax[0].set(title='Log CQT power')
    >>> ax[0].label_outer()
    >>> imgp = librosa.display.specshow(perceptual_CQT, y_axis='cqt_hz',
    ...                                 fmin=librosa.note_to_hz('A1'),
    ...                                 x_axis='time', ax=ax[1])
    >>> ax[1].set(title='Perceptually weighted log CQT')
    >>> fig.colorbar(img, ax=ax[0], format="%+2.0f dB")
    >>> fig.colorbar(imgp, ax=ax[1], format="%+2.0f dB")
    """
    # Per-frequency weights (in dB), shaped as a column so that they broadcast
    # across the time axis of S
    weights = convert.frequency_weighting(frequencies, kind=kind)
    weight_column = weights.reshape((-1, 1))

    spectrum_db = power_to_db(S, **kwargs)

    result: np.ndarray = weight_column + spectrum_db
    return result
@cache(level=30)
def fmt(
    y: np.ndarray,
    *,
    t_min: float = 0.5,
    n_fmt: Optional[int] = None,
    kind: str = "cubic",
    beta: float = 0.5,
    over_sample: float = 1,
    axis: int = -1,
) -> np.ndarray:
    """The fast Mellin transform (FMT)

    The Mellin transform of a signal `y` is performed by interpolating `y` on an
    exponential time axis, applying a polynomial window, and then taking the
    discrete Fourier transform.

    When the Mellin parameter (beta) is 1/2, it is also known as the scale transform. [#]_
    The scale transform can be useful for audio analysis because its magnitude is invariant
    to scaling of the domain (e.g., time stretching or compression).  This is analogous
    to the magnitude of the Fourier transform being invariant to shifts in the input domain.

    .. [#] De Sena, Antonio, and Davide Rocchesso.
        "A fast Mellin and scale transform."
        EURASIP Journal on Applied Signal Processing 2007.1 (2007): 75-75.

    .. [#] Cohen, L.
        "The scale representation."
        IEEE Transactions on Signal Processing 41, no. 12 (1993): 3275-3292.

    Parameters
    ----------
    y : np.ndarray, real-valued
        The input signal(s).  Can be multidimensional.
        The target axis must contain at least 3 samples.

    t_min : float > 0
        The minimum time spacing (in samples).
        This value should generally be less than 1 to preserve as much information as
        possible.

    n_fmt : int > 2 or None
        The number of scale transform bins to use.
        If None, then ``n_bins = over_sample * ceil(n * log((n-1)/t_min))`` is taken,
        where ``n = y.shape[axis]``

    kind : str
        The type of interpolation to use when re-sampling the input.
        See `scipy.interpolate.interp1d` for possible values.

        Note that the default is to use high-precision (cubic) interpolation.
        This can be slow in practice; if speed is preferred over accuracy,
        then consider using ``kind='linear'``.

    beta : float
        The Mellin parameter.  ``beta=0.5`` provides the scale transform.

    over_sample : float >= 1
        Over-sampling factor for exponential resampling.

    axis : int
        The axis along which to transform ``y``

    Returns
    -------
    x_scale : np.ndarray [dtype=complex]
        The scale transform of ``y`` along the ``axis`` dimension.

    Raises
    ------
    ParameterError
        if ``n_fmt < 3`` or ``t_min <= 0``
        or if ``n_fmt`` is None and ``over_sample < 1``
        or if ``y`` is not finite
        or if ``y.shape[axis] < 3``.

    Notes
    -----
    This function caches at level 30.

    Examples
    --------
    >>> # Generate a signal and time-stretch it (with energy normalization)
    >>> scale = 1.25
    >>> freq = 3.0
    >>> x1 = np.linspace(0, 1, num=1024, endpoint=False)
    >>> x2 = np.linspace(0, 1, num=int(scale * len(x1)), endpoint=False)
    >>> y1 = np.sin(2 * np.pi * freq * x1)
    >>> y2 = np.sin(2 * np.pi * freq * x2) / np.sqrt(scale)
    >>> # Verify that the two signals have the same energy
    >>> np.sum(np.abs(y1)**2), np.sum(np.abs(y2)**2)
        (255.99999999999997, 255.99999999999969)
    >>> scale1 = librosa.fmt(y1, n_fmt=512)
    >>> scale2 = librosa.fmt(y2, n_fmt=512)

    >>> # And plot the results
    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=2)
    >>> ax[0].plot(y1, label='Original')
    >>> ax[0].plot(y2, linestyle='--', label='Stretched')
    >>> ax[0].set(xlabel='time (samples)', title='Input signals')
    >>> ax[0].legend()
    >>> ax[1].semilogy(np.abs(scale1), label='Original')
    >>> ax[1].semilogy(np.abs(scale2), linestyle='--', label='Stretched')
    >>> ax[1].set(xlabel='scale coefficients', title='Scale transform magnitude')
    >>> ax[1].legend()

    >>> # Plot the scale transform of an onset strength autocorrelation
    >>> y, sr = librosa.load(librosa.ex('choice'))
    >>> odf = librosa.onset.onset_strength(y=y, sr=sr)
    >>> # Auto-correlate with up to 10 seconds lag
    >>> odf_ac = librosa.autocorrelate(odf, max_size=10 * sr // 512)
    >>> # Normalize
    >>> odf_ac = librosa.util.normalize(odf_ac, norm=np.inf)
    >>> # Compute the scale transform
    >>> odf_ac_scale = librosa.fmt(librosa.util.normalize(odf_ac), n_fmt=512)
    >>> # Plot the results
    >>> fig, ax = plt.subplots(nrows=3)
    >>> ax[0].plot(odf, label='Onset strength')
    >>> ax[0].set(xlabel='Time (frames)', title='Onset strength')
    >>> ax[1].plot(odf_ac, label='Onset autocorrelation')
    >>> ax[1].set(xlabel='Lag (frames)', title='Onset autocorrelation')
    >>> ax[2].semilogy(np.abs(odf_ac_scale), label='Scale transform magnitude')
    >>> ax[2].set(xlabel='scale coefficients')
    """

    n = y.shape[axis]

    if n < 3:
        raise ParameterError(f"y.shape[{axis}]=={n} < 3")

    if t_min <= 0:
        raise ParameterError(f"t_min={t_min} must be a positive number")

    if n_fmt is None:
        if over_sample < 1:
            raise ParameterError(f"over_sample={over_sample} must be >= 1")

        # The base is the maximum ratio between adjacent samples
        # Since the sample spacing is increasing, this is simply the
        # ratio between the positions of the last two samples: (n-1)/(n-2)
        log_base = np.log(n - 1) - np.log(n - 2)

        n_fmt = int(np.ceil(over_sample * (np.log(n - 1) - np.log(t_min)) / log_base))

    elif n_fmt < 3:
        raise ParameterError(f"n_fmt=={n_fmt} < 3")
    else:
        log_base = (np.log(n_fmt - 1) - np.log(n_fmt - 2)) / over_sample

    if not np.all(np.isfinite(y)):
        raise ParameterError("y must be finite everywhere")

    base = np.exp(log_base)
    # original grid: signal covers [0, 1).  This range is arbitrary, but convenient.
    # The final sample is positioned at (n-1)/n, so we omit the endpoint
    x = np.linspace(0, 1, num=n, endpoint=False)

    # build the interpolator
    f_interp = scipy.interpolate.interp1d(x, y, kind=kind, axis=axis)

    # build the new sampling grid
    # exponentially spaced between t_min/n and 1 (exclusive)
    # we'll go one past where we need, and drop the last sample
    # When over-sampling, the last input sample contributes n_over samples.
    # To keep the spacing consistent, we over-sample by n_over, and then
    # trim the final samples.
    n_over = int(np.ceil(over_sample))
    x_exp = np.logspace(
        (np.log(t_min) - np.log(n)) / log_base,
        0,
        num=n_fmt + n_over,
        endpoint=False,
        base=base,
    )[:-n_over]

    # Clean up any rounding errors at the boundaries of the interpolation.
    # The interpolator refuses to extrapolate, so the grid must lie within
    # [t_min / n, x[-1]].  Clipping is a no-op for points already in range, so
    # it is applied unconditionally.  (An earlier guard compared x_exp[0],
    # which is in normalized units, against t_min, which is in samples; that
    # test was always true, so the clip always ran anyway.)
    x_exp = np.clip(x_exp, float(t_min) / n, x[-1])

    # Make sure that all sample points are unique
    # This should never happen!
    if len(np.unique(x_exp)) != len(x_exp):
        raise ParameterError("Redundant sample positions in Mellin transform")

    # Resample the signal
    y_res = f_interp(x_exp)

    # Broadcast the window correctly
    shape = [1] * y_res.ndim
    shape[axis] = -1

    # Apply the window and fft
    # Normalization is absorbed into the window here for expedience
    fft = get_fftlib()
    result: np.ndarray = fft.rfft(
        y_res * ((x_exp**beta).reshape(shape) * np.sqrt(n) / n_fmt), axis=axis
    )
    return result


@overload
def pcen(
    S: np.ndarray,
    *,
    sr: float = ...,
    hop_length: int = ...,
    gain: float = ...,
    bias: float = ...,
    power: float = ...,
    time_constant: float = ...,
    eps: float = ...,
    b: Optional[float] = ...,
    max_size: int = ...,
    ref: Optional[np.ndarray] = ...,
    axis: int = ...,
    max_axis: Optional[int] = ...,
    zi: Optional[np.ndarray] = ...,
    return_zf: Literal[False] = ...,
) -> np.ndarray:
    ...
@overload
def pcen(
    S: np.ndarray,
    *,
    sr: float = ...,
    hop_length: int = ...,
    gain: float = ...,
    bias: float = ...,
    power: float = ...,
    time_constant: float = ...,
    eps: float = ...,
    b: Optional[float] = ...,
    max_size: int = ...,
    ref: Optional[np.ndarray] = ...,
    axis: int = ...,
    max_axis: Optional[int] = ...,
    zi: Optional[np.ndarray] = ...,
    return_zf: Literal[True],
) -> Tuple[np.ndarray, np.ndarray]:
    ...


@overload
def pcen(
    S: np.ndarray,
    *,
    sr: float = ...,
    hop_length: int = ...,
    gain: float = ...,
    bias: float = ...,
    power: float = ...,
    time_constant: float = ...,
    eps: float = ...,
    b: Optional[float] = ...,
    max_size: int = ...,
    ref: Optional[np.ndarray] = ...,
    axis: int = ...,
    max_axis: Optional[int] = ...,
    zi: Optional[np.ndarray] = ...,
    return_zf: bool = ...,
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    ...


@cache(level=30)
def pcen(
    S: np.ndarray,
    *,
    sr: float = 22050,
    hop_length: int = 512,
    gain: float = 0.98,
    bias: float = 2,
    power: float = 0.5,
    time_constant: float = 0.400,
    eps: float = 1e-6,
    b: Optional[float] = None,
    max_size: int = 1,
    ref: Optional[np.ndarray] = None,
    axis: int = -1,
    max_axis: Optional[int] = None,
    zi: Optional[np.ndarray] = None,
    return_zf: bool = False,
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    """Per-channel energy normalization (PCEN)

    This function normalizes a time-frequency representation ``S`` by
    performing automatic gain control, followed by nonlinear compression [#]_ ::

        P[f, t] = (S / (eps + M[f, t])**gain + bias)**power - bias**power

    IMPORTANT: the default values of eps, gain, bias, and power match the
    original publication, in which ``S`` is a 40-band mel-frequency
    spectrogram with 25 ms windowing, 10 ms frame shift, and raw audio values
    in the interval [-2**31; 2**31-1[. If you use these default values, we
    recommend to make sure that the raw audio is properly scaled to this
    interval, and not to [-1, 1[ as is most often the case.

    The matrix ``M`` is the result of applying a low-pass, temporal IIR filter
    to ``S``::

        M[f, t] = (1 - b) * M[f, t - 1] + b * S[f, t]

    If ``b`` is not provided, it is calculated as::

        b = (sqrt(1 + 4* T**2) - 1) / (2 * T**2)

    where ``T = time_constant * sr / hop_length``. [#]_

    This normalization is designed to suppress background noise and
    emphasize foreground signals, and can be used as an alternative to
    decibel scaling (`amplitude_to_db`).

    This implementation also supports smoothing across frequency bins
    by specifying ``max_size > 1``. If this option is used, the filtered
    spectrogram ``M`` is computed as::

        M[f, t] = (1 - b) * M[f, t - 1] + b * R[f, t]

    where ``R`` has been max-filtered along the frequency axis, similar to
    the SuperFlux algorithm implemented in `onset.onset_strength`::

        R[f, t] = max(S[f - max_size//2: f + max_size//2, t])

    This can be used to perform automatic gain control on signals that cross
    or span multiple frequency bands, which may be desirable for spectrograms
    with high frequency resolution.

    .. [#] Wang, Y., Getreuer, P., Hughes, T., Lyon, R. F., & Saurous, R. A.
       (2017, March). Trainable frontend for robust and far-field keyword spotting.
       In Acoustics, Speech and Signal Processing (ICASSP), 2017
       IEEE International Conference on (pp. 5670-5674). IEEE.

    .. [#] Lostanlen, V., Salamon, J., McFee, B., Cartwright, M., Farnsworth, A.,
       Kelling, S., and Bello, J. P. Per-Channel Energy Normalization: Why and How.
       IEEE Signal Processing Letters, 26(1), 39-43.

    Parameters
    ----------
    S : np.ndarray (non-negative)
        The input (magnitude) spectrogram

    sr : number > 0 [scalar]
        The audio sampling rate

    hop_length : int > 0 [scalar]
        The hop length of ``S``, expressed in samples

    gain : number >= 0 [scalar]
        The gain factor.  Typical values should be slightly less than 1.

    bias : number >= 0 [scalar]
        The bias point of the nonlinear compression (default: 2)

    power : number >= 0 [scalar]
        The compression exponent.  Typical values should be between 0 and 0.5.
        Smaller values of ``power`` result in stronger compression.
        At the limit ``power=0``, polynomial compression becomes logarithmic.

    time_constant : number > 0 [scalar]
        The time constant for IIR filtering, measured in seconds.

    eps : number > 0 [scalar]
        A small constant used to ensure numerical stability of the filter.

    b : number in [0, 1]  [scalar]
        The filter coefficient for the low-pass filter.
        If not provided, it will be inferred from ``time_constant``.

    max_size : int > 0 [scalar]
        The width of the max filter applied to the frequency axis.
        If left as `1`, no filtering is performed.

    ref : None or np.ndarray (shape=S.shape)
        An optional pre-computed reference spectrum (``R`` in the above).
        If not provided it will be computed from ``S``.

    axis : int [scalar]
        The (time) axis of the input spectrogram.

    max_axis : None or int [scalar]
        The frequency axis of the input spectrogram.
        If `None`, and ``S`` is two-dimensional, it will be inferred
        as the opposite from ``axis``.
        If ``S`` is not two-dimensional, and ``max_size > 1``, an error
        will be raised.

    zi : np.ndarray
        The initial filter delay values.

        This may be the ``zf`` (final delay values) of a previous call to ``pcen``, or
        computed by `scipy.signal.lfilter_zi`.

    return_zf : bool
        If ``True``, return the final filter delay values along with the PCEN output ``P``.
        This is primarily useful in streaming contexts, where the final state of one
        block of processing should be used to initialize the next block.

        If ``False`` (default) only the PCEN values ``P`` are returned.

    Returns
    -------
    P : np.ndarray, non-negative [shape=(n, m)]
        The per-channel energy normalized version of ``S``.
    zf : np.ndarray (optional)
        The final filter delay values.  Only returned if ``return_zf=True``.

    Raises
    ------
    ParameterError
        If ``power``, ``gain``, or ``bias`` is negative;
        if ``eps`` or ``time_constant`` is not strictly positive;
        if ``max_size`` is not a positive integer;
        if ``b`` lies outside ``[0, 1]``;
        or if max-filtering is requested (``max_size > 1`` and no ``ref``)
        on 1-dimensional input, or on input that is not 2-dimensional
        without specifying ``max_axis``.

    Warns
    -----
    UserWarning
        If ``S`` is complex-valued. The magnitude ``np.abs(S)`` is used and
        phase information is discarded.

    See Also
    --------
    amplitude_to_db
    librosa.onset.onset_strength

    Examples
    --------
    Compare PCEN to log amplitude (dB) scaling on Mel spectra

    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.ex('robin'))

    >>> # We recommend scaling y to the range [-2**31, 2**31[ before applying
    >>> # PCEN's default parameters. Furthermore, we use power=1 to get a
    >>> # magnitude spectrum instead of a power spectrum.
    >>> S = librosa.feature.melspectrogram(y=y, sr=sr, power=1)
    >>> log_S = librosa.amplitude_to_db(S, ref=np.max)
    >>> pcen_S = librosa.pcen(S * (2**31))
    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> img = librosa.display.specshow(log_S, x_axis='time', y_axis='mel', ax=ax[0])
    >>> ax[0].set(title='log amplitude (dB)', xlabel=None)
    >>> ax[0].label_outer()
    >>> imgpcen = librosa.display.specshow(pcen_S, x_axis='time', y_axis='mel', ax=ax[1])
    >>> ax[1].set(title='Per-channel energy normalization')
    >>> fig.colorbar(img, ax=ax[0], format="%+2.0f dB")
    >>> fig.colorbar(imgpcen, ax=ax[1])

    Compare PCEN with and without max-filtering

    >>> pcen_max = librosa.pcen(S * (2**31), max_size=3)
    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> librosa.display.specshow(pcen_S, x_axis='time', y_axis='mel', ax=ax[0])
    >>> ax[0].set(title='Per-channel energy normalization (no max-filter)')
    >>> ax[0].label_outer()
    >>> img = librosa.display.specshow(pcen_max, x_axis='time', y_axis='mel', ax=ax[1])
    >>> ax[1].set(title='Per-channel energy normalization (max_size=3)')
    >>> fig.colorbar(img, ax=ax)
    """

    # --- Parameter validation ---
    if power < 0:
        raise ParameterError(f"power={power} must be nonnegative")

    if gain < 0:
        raise ParameterError(f"gain={gain} must be non-negative")

    if bias < 0:
        raise ParameterError(f"bias={bias} must be non-negative")

    if eps <= 0:
        raise ParameterError(f"eps={eps} must be strictly positive")

    if time_constant <= 0:
        raise ParameterError(f"time_constant={time_constant} must be strictly positive")

    if not util.is_positive_int(max_size):
        raise ParameterError(f"max_size={max_size} must be a positive integer")

    if b is None:
        # Express the time constant in units of frames
        t_frames = time_constant * sr / float(hop_length)
        # By default, b is the positive root of the quadratic
        #   t_frames**2 * b**2 + b - 1 = 0
        # which approximates the full-width half-max of the
        # squared frequency response of the IIR low-pass filter

        b = (np.sqrt(1 + 4 * t_frames**2) - 1) / (2 * t_frames**2)

    if not 0 <= b <= 1:
        raise ParameterError(f"b={b} must be between 0 and 1")

    if np.issubdtype(S.dtype, np.complexfloating):
        warnings.warn(
            "pcen was called on complex input so phase "
            "information will be discarded. To suppress this warning, "
            "call pcen(np.abs(D)) instead.",
            stacklevel=2,
        )
        S = np.abs(S)

    # --- Reference spectrum (R in the docstring) ---
    if ref is None:
        if max_size == 1:
            # No max-filtering: the reference is the input itself
            ref = S
        elif S.ndim == 1:
            raise ParameterError(
                "Max-filtering cannot be applied to 1-dimensional input"
            )
        else:
            if max_axis is None:
                if S.ndim != 2:
                    raise ParameterError(
                        f"Max-filtering a {S.ndim:d}-dimensional spectrogram "
                        "requires you to specify max_axis"
                    )
                # For 2-dimensional input the frequency axis is whichever
                # axis is not the time axis:
                # if axis = 0, max_axis=1
                # if axis = +- 1, max_axis = 0
                max_axis = np.mod(1 - axis, 2)

            ref = scipy.ndimage.maximum_filter1d(S, max_size, axis=max_axis)

    if zi is None:
        # Make sure zi matches dimension to input:
        # one singleton entry per dimension of ref, filled with the
        # steady-state initial condition of the first-order filter
        shape = tuple([1] * ref.ndim)
        zi = np.empty(shape)
        zi[:] = scipy.signal.lfilter_zi([b], [1, b - 1])[:]

    # Temporal integration: M[t] = (1 - b) * M[t - 1] + b * R[t]
    S_smooth: np.ndarray
    zf: np.ndarray
    S_smooth, zf = scipy.signal.lfilter([b], [1, b - 1], ref, zi=zi, axis=axis)

    # Adaptive gain control: smooth = (eps + S_smooth) ** (-gain)
    # Working in log-space gives us some stability, and a slight speedup
    smooth = np.exp(-gain * (np.log(eps) + np.log1p(S_smooth / eps)))

    # Dynamic range compression
    S_out: np.ndarray
    if power == 0:
        # Logarithmic compression in the limit power -> 0
        S_out = np.log1p(S * smooth)
    elif bias == 0:
        # (S * smooth) ** power, evaluated in log-space
        S_out = np.exp(power * (np.log(S) + np.log(smooth)))
    else:
        # (S * smooth + bias) ** power - bias ** power, evaluated via
        # expm1 / log1p for accuracy when S * smooth is small next to bias
        S_out = (bias**power) * np.expm1(power * np.log1p(S * smooth / bias))

    if return_zf:
        return S_out, zf
    else:
        return S_out


def griffinlim(
    S: np.ndarray,
    *,
    n_iter: int = 32,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    n_fft: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    dtype: Optional[DTypeLike] = None,
    length: Optional[int] = None,
    pad_mode: _PadModeSTFT = "constant",
    momentum: float = 0.99,
    init: Optional[str] = "random",
    random_state: Optional[
        Union[int, np.random.RandomState, np.random.Generator]
    ] = None,
) -> np.ndarray:
    """Approximate magnitude spectrogram inversion using the "fast" Griffin-Lim algorithm.

    Given a short-time Fourier transform magnitude matrix (``S``), the algorithm randomly
    initializes phase estimates, and then alternates forward- and inverse-STFT
    operations. [#]_

    Note that this assumes reconstruction of a real-valued time-domain signal, and
    that ``S`` contains only the non-negative frequencies (as computed by
    `stft`).

    The "fast" GL method [#]_ uses a momentum parameter to accelerate convergence.

    .. [#] D. W. Griffin and J. S. Lim,
        "Signal estimation from modified short-time Fourier transform,"
        IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984.

    .. [#] Perraudin, N., Balazs, P., & Søndergaard, P. L.
        "A fast Griffin-Lim algorithm,"
        IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (pp. 1-4),
        Oct. 2013.

    Parameters
    ----------
    S : np.ndarray [shape=(..., n_fft // 2 + 1, t), non-negative]
        An array of short-time Fourier transform magnitudes as produced by
        `stft`.

    n_iter : int > 0
        The number of iterations to run

    hop_length : None or int > 0
        The hop length of the STFT.  If not provided, it will default to ``n_fft // 4``

    win_length : None or int > 0
        The window length of the STFT.  By default, it will equal ``n_fft``

    n_fft : None or int > 0
        The number of samples per frame.
        By default, this will be inferred from the shape of ``S`` as an even number.
        However, if an odd frame length was used, you can explicitly set ``n_fft``.

    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        A window specification as supported by `stft` or `istft`

    center : boolean
        If ``True``, the STFT is assumed to use centered frames.
        If ``False``, the STFT is assumed to use left-aligned frames.

    dtype : np.dtype
        Real numeric type for the time-domain signal.  Default is inferred
        to match the precision of the input spectrogram.

    length : None or int > 0
        If provided, the output ``y`` is zero-padded or clipped to exactly ``length``
        samples.

    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, STFT uses zero padding.

    momentum : number >= 0
        The momentum parameter for fast Griffin-Lim.
        Setting this to 0 recovers the original Griffin-Lim method [1]_.
        Values near 1 can lead to faster convergence, but above 1 may not converge.

    init : None or 'random' [default]
        If 'random' (the default), then phase values are initialized randomly
        according to ``random_state``.  This is recommended when the input ``S`` is
        a magnitude spectrogram with no initial phase estimates.

        If `None`, then the phase is initialized from ``S``.  This is useful when
        an initial guess for phase can be provided, or when you want to resume
        Griffin-Lim from a previous output.

    random_state : None, int, np.random.RandomState, or np.random.Generator
        If int, random_state is the seed used by the random number generator
        for phase initialization.  NumPy integer scalars (e.g. ``np.int64``)
        are accepted as well.

        If `np.random.RandomState` or `np.random.Generator` instance, the random number
        generator itself.

        If `None`, defaults to the `np.random.default_rng()` object.

    Returns
    -------
    y : np.ndarray [shape=(..., n)]
        time-domain signal reconstructed from ``S``

    Raises
    ------
    ParameterError
        If ``random_state`` is of an unsupported type,
        if ``momentum < 0``,
        or if ``init`` is neither `None` nor ``'random'``.

    Warns
    -----
    UserWarning
        If ``momentum > 1``, since the iteration can become unstable.

    See Also
    --------
    stft
    istft
    magphase
    filters.get_window

    Examples
    --------
    A basic STFT inverse example

    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> # Get the magnitude spectrogram
    >>> S = np.abs(librosa.stft(y))
    >>> # Invert using Griffin-Lim
    >>> y_inv = librosa.griffinlim(S)
    >>> # Invert without estimating phase
    >>> y_istft = librosa.istft(S)

    Wave-plot the results

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=3, sharex=True, sharey=True)
    >>> librosa.display.waveshow(y, sr=sr, color='b', ax=ax[0])
    >>> ax[0].set(title='Original', xlabel=None)
    >>> ax[0].label_outer()
    >>> librosa.display.waveshow(y_inv, sr=sr, color='g', ax=ax[1])
    >>> ax[1].set(title='Griffin-Lim reconstruction', xlabel=None)
    >>> ax[1].label_outer()
    >>> librosa.display.waveshow(y_istft, sr=sr, color='r', ax=ax[2])
    >>> ax[2].set_title('Magnitude-only istft reconstruction')
    """

    if random_state is None:
        rng = np.random.default_rng()
    elif isinstance(random_state, (int, np.integer)):
        # NumPy integer scalars are not instances of the builtin int,
        # but they are equally valid seeds.
        rng = np.random.RandomState(seed=random_state)  # type: ignore
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state  # type: ignore
    else:
        raise ParameterError(f"Unsupported random_state={random_state!r}")

    if momentum > 1:
        warnings.warn(
            f"Griffin-Lim with momentum={momentum} > 1 can be unstable. "
            "Proceed with caution!",
            stacklevel=2,
        )
    elif momentum < 0:
        raise ParameterError(f"griffinlim() called with momentum={momentum} < 0")

    # Infer n_fft from the spectrogram shape
    if n_fft is None:
        n_fft = 2 * (S.shape[-2] - 1)

    # Infer the dtype from S
    angles = np.empty(S.shape, dtype=util.dtype_r2c(S.dtype))
    # Small constant guarding the phase normalization against division by zero
    eps = util.tiny(angles)

    if init == "random":
        # randomly initialize the phase
        angles[:] = util.phasor((2 * np.pi * rng.random(size=S.shape)))
    elif init is None:
        # Initialize an all ones complex matrix
        # (multiplying by S below then keeps whatever phase S carries)
        angles[:] = 1.0
    else:
        raise ParameterError(f"init={init} must be either None or 'random'")

    # Place-holders for temporary data and reconstructed buffer
    rebuilt = None
    tprev = None
    inverse = None

    # Absorb magnitudes into angles
    angles *= S
    for _ in range(n_iter):
        # Invert with our current estimate of the phases
        inverse = istft(
            angles,
            hop_length=hop_length,
            win_length=win_length,
            n_fft=n_fft,
            window=window,
            center=center,
            dtype=dtype,
            length=length,
            out=inverse,
        )

        # Rebuild the spectrogram
        rebuilt = stft(
            inverse,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            pad_mode=pad_mode,
            out=rebuilt,
        )

        # Update our phase estimates
        angles[:] = rebuilt
        if tprev is not None:
            # Momentum term of the fast Griffin-Lim update
            angles -= (momentum / (1 + momentum)) * tprev
        # Normalize to (approximately) unit modulus, then re-apply magnitudes
        angles /= np.abs(angles) + eps
        angles *= S
        # Store: swap the two buffers so that the next stft call writes into
        # the older one while the current result is kept as tprev
        rebuilt, tprev = tprev, rebuilt

    # Return the final phase estimates
    return istft(
        angles,
        hop_length=hop_length,
        win_length=win_length,
        n_fft=n_fft,
        window=window,
        center=center,
        dtype=dtype,
        length=length,
        out=inverse,
    )


def _spectrogram(
    *,
    y: Optional[np.ndarray] = None,
    S: Optional[np.ndarray] = None,
    n_fft: Optional[int] = 2048,
    hop_length: Optional[int] = 512,
    power: float = 1,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    pad_mode: _PadModeSTFT = "constant",
) -> Tuple[np.ndarray, int]:
    """Helper function to retrieve a magnitude spectrogram.

    This is primarily used in feature extraction functions that can operate on
    either audio time-series or spectrogram input.

    Parameters
    ----------
    y : None or np.ndarray
        If provided, an audio time series

    S : None or np.ndarray
        Spectrogram input, optional.
        If provided, it takes precedence over ``y`` and is returned unchanged
        (``power`` is not applied to it).

    n_fft : int > 0
        STFT window size

    hop_length : int > 0
        STFT hop length

    power : float > 0
        Exponent for the magnitude spectrogram,
        e.g., 1 for magnitude (amplitude), 2 for power, etc.

    win_length : int <= n_fft [scalar]
        Each frame of audio is windowed by ``window``.
        The window will be of length ``win_length`` and then padded
        with zeros to match ``n_fft``.

        If unspecified, defaults to ``win_length = n_fft``.

    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        - a window specification (string, tuple, or number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.windows.hann`
        - a vector or array of length ``n_fft``

        .. see also:: `filters.get_window`

    center : boolean
        - If ``True``, the signal ``y`` is padded so that frame
          ``t`` is centered at ``y[t * hop_length]``.
        - If ``False``, then frame ``t`` begins at ``y[t * hop_length]``

    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, STFT uses zero padding.

    Returns
    -------
    S_out : np.ndarray
        - If ``S`` is provided as input, then ``S_out == S``
        - Else, ``S_out = |stft(y, ...)|**power``
    n_fft : int > 0
        - If ``S`` is provided, then ``n_fft`` is inferred from ``S``
          whenever the given value is `None` or inconsistent with the
          number of rows ``S.shape[-2]``
        - Else, copied from input

    Raises
    ------
    ParameterError
        If ``S`` is not provided and either ``n_fft`` or ``y`` is `None`.
    """

    if S is not None:
        # Infer n_fft from spectrogram shape, but only if it mismatches
        if n_fft is None or n_fft // 2 + 1 != S.shape[-2]:
            n_fft = 2 * (S.shape[-2] - 1)
    else:
        # Otherwise, compute a magnitude spectrogram from input
        if n_fft is None:
            raise ParameterError(f"Unable to compute spectrogram with n_fft={n_fft}")
        if y is None:
            raise ParameterError(
                "Input signal must be provided to compute a spectrogram"
            )
        S = (
            np.abs(
                stft(
                    y,
                    n_fft=n_fft,
                    hop_length=hop_length,
                    win_length=win_length,
                    center=center,
                    window=window,
                    pad_mode=pad_mode,
                )
            )
            ** power
        )

    return S, n_fft