Commit 4a367ac · Parent(s): d257d3d
renator committed: fixed environment issues

Files changed (5)
  1. Dockerfile +4 -0
  2. constantq.py +1497 -0
  3. filters.py +1661 -0
  4. sequence.py +2059 -0
  5. utils.py +316 -0
Dockerfile CHANGED
@@ -26,6 +26,10 @@ COPY . /app/
 # Replace the librosa notation.py with notation.py from your project
 COPY notation.py /usr/local/lib/python3.10/site-packages/librosa/core/notation.py
 COPY audio.py /usr/local/lib/python3.10/site-packages/librosa/core/audio.py
+COPY constantq.py /usr/local/lib/python3.10/site-packages/librosa/core/constantq.py
+COPY filters.py /usr/local/lib/python3.10/site-packages/librosa/filters.py
+COPY sequence.py /usr/local/lib/python3.10/site-packages/librosa/sequence.py
+COPY utils.py /usr/local/lib/python3.10/site-packages/librosa/feature/utils.py

 # RUN cd /tmp && mkdir cache1
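The four new COPY lines overwrite modules inside the installed librosa package, so a quick way to confirm that the patched files are the ones actually loaded is to print the module paths at container runtime. A minimal sketch (the check script name and image tag are hypothetical; run it inside the built container, e.g. `docker run <image> python check.py`):

    import librosa.core.constantq
    import librosa.filters
    import librosa.sequence
    import librosa.feature.utils

    for mod in (librosa.core.constantq, librosa.filters,
                librosa.sequence, librosa.feature.utils):
        # Each path should point at the site-packages target of its COPY line.
        print(mod.__name__, "->", mod.__file__)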
constantq.py ADDED
@@ -0,0 +1,1497 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Constant-Q transforms"""
import warnings
import numpy as np
from numba import jit

from . import audio
from .intervals import interval_frequencies
from .fft import get_fftlib
from .convert import cqt_frequencies, note_to_hz
from .spectrum import stft, istft
from .pitch import estimate_tuning
from .._cache import cache
from .. import filters
from .. import util
from ..util.exceptions import ParameterError
from numpy.typing import DTypeLike
from typing import Optional, Union, Collection, List
from .._typing import _WindowSpec, _PadMode, _FloatLike_co, _ensure_not_reachable

__all__ = ["cqt", "hybrid_cqt", "pseudo_cqt", "icqt", "griffinlim_cqt", "vqt"]

# TODO: ivqt, griffinlim_vqt


@cache(level=20)
def cqt(
    y: np.ndarray,
    *,
    sr: float = 22050,
    hop_length: int = 512,
    fmin: Optional[_FloatLike_co] = None,
    n_bins: int = 84,
    bins_per_octave: int = 12,
    tuning: Optional[float] = 0.0,
    filter_scale: float = 1,
    norm: Optional[float] = 1,
    sparsity: float = 0.01,
    window: _WindowSpec = "hann",
    scale: bool = True,
    pad_mode: _PadMode = "constant",
    res_type: Optional[str] = "soxr_hq",
    dtype: Optional[DTypeLike] = None,
) -> np.ndarray:
    """Compute the constant-Q transform of an audio signal.

    This implementation is based on the recursive sub-sampling method
    described by [#]_.

    .. [#] Schoerkhuber, Christian, and Anssi Klapuri.
        "Constant-Q transform toolbox for music processing."
        7th Sound and Music Computing Conference, Barcelona, Spain. 2010.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time series. Multi-channel is supported.

    sr : number > 0 [scalar]
        sampling rate of ``y``

    hop_length : int > 0 [scalar]
        number of samples between successive CQT columns.

    fmin : float > 0 [scalar]
        Minimum frequency. Defaults to `C1 ~= 32.70 Hz`

    n_bins : int > 0 [scalar]
        Number of frequency bins, starting at ``fmin``

    bins_per_octave : int > 0 [scalar]
        Number of bins per octave

    tuning : None or float
        Tuning offset in fractions of a bin.

        If ``None``, tuning will be automatically estimated from the signal.

        The minimum frequency of the resulting CQT will be modified to
        ``fmin * 2**(tuning / bins_per_octave)``.

    filter_scale : float > 0
        Filter scale factor. Small values (<1) use shorter windows
        for improved time resolution.

    norm : {inf, -inf, 0, float > 0}
        Type of norm to use for basis function normalization.
        See `librosa.util.normalize`.

    sparsity : float in [0, 1)
        Sparsify the CQT basis by discarding up to ``sparsity``
        fraction of the energy in each basis.

        Set ``sparsity=0`` to disable sparsification.

    window : str, tuple, number, or function
        Window specification for the basis filters.
        See `filters.get_window` for details.

    scale : bool
        If ``True``, scale the CQT response by square-root the length of
        each channel's filter. This is analogous to ``norm='ortho'`` in FFT.

        If ``False``, do not scale the CQT. This is analogous to
        ``norm=None`` in FFT.

    pad_mode : string
        Padding mode for centered frame analysis.

        See also: `librosa.stft` and `numpy.pad`.

    res_type : string
        The resampling mode for recursive downsampling.

    dtype : np.dtype
        The (complex) data type of the output array. By default, this is inferred to match
        the numerical precision of the input signal.

    Returns
    -------
    CQT : np.ndarray [shape=(..., n_bins, t)]
        Constant-Q value for each frequency at each time.

    See Also
    --------
    vqt
    librosa.resample
    librosa.util.normalize

    Notes
    -----
    This function caches at level 20.

    Examples
    --------
    Generate and plot a constant-Q power spectrum

    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> C = np.abs(librosa.cqt(y, sr=sr))
    >>> fig, ax = plt.subplots()
    >>> img = librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max),
    ...                                sr=sr, x_axis='time', y_axis='cqt_note', ax=ax)
    >>> ax.set_title('Constant-Q power spectrum')
    >>> fig.colorbar(img, ax=ax, format="%+2.0f dB")

    Limit the frequency range

    >>> C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'),
    ...                        n_bins=60))
    >>> C
    array([[6.830e-04, 6.361e-04, ..., 7.362e-09, 9.102e-09],
           [5.366e-04, 4.818e-04, ..., 8.953e-09, 1.067e-08],
           ...,
           [4.288e-02, 4.580e-01, ..., 1.529e-05, 5.572e-06],
           [2.965e-03, 1.508e-01, ..., 8.965e-06, 1.455e-05]])

    Using a higher frequency resolution

    >>> C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'),
    ...                        n_bins=60 * 2, bins_per_octave=12 * 2))
    >>> C
    array([[5.468e-04, 5.382e-04, ..., 5.911e-09, 6.105e-09],
           [4.118e-04, 4.014e-04, ..., 7.788e-09, 8.160e-09],
           ...,
           [2.780e-03, 1.424e-01, ..., 4.225e-06, 2.388e-05],
           [5.147e-02, 6.959e-02, ..., 1.694e-05, 5.811e-06]])
    """

    # CQT is the special case of VQT with gamma=0
    return vqt(
        y=y,
        sr=sr,
        hop_length=hop_length,
        fmin=fmin,
        n_bins=n_bins,
        intervals="equal",
        gamma=0,
        bins_per_octave=bins_per_octave,
        tuning=tuning,
        filter_scale=filter_scale,
        norm=norm,
        sparsity=sparsity,
        window=window,
        scale=scale,
        pad_mode=pad_mode,
        res_type=res_type,
        dtype=dtype,
    )

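# --- Editor's illustrative sketch (hypothetical helper, not part of librosa).
# As the comment in the body above notes, `cqt` is just `vqt` pinned to
# gamma=0 with equal-tempered intervals, so the two transforms agree exactly
# under those settings:
def _cqt_equals_vqt_sketch(y: np.ndarray, sr: float) -> None:
    C_cqt = cqt(y=y, sr=sr, n_bins=84)
    C_vqt = vqt(y=y, sr=sr, n_bins=84, gamma=0, intervals="equal")
    np.testing.assert_allclose(C_cqt, C_vqt)
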
@cache(level=20)
def hybrid_cqt(
    y: np.ndarray,
    *,
    sr: float = 22050,
    hop_length: int = 512,
    fmin: Optional[_FloatLike_co] = None,
    n_bins: int = 84,
    bins_per_octave: int = 12,
    tuning: Optional[float] = 0.0,
    filter_scale: float = 1,
    norm: Optional[float] = 1,
    sparsity: float = 0.01,
    window: _WindowSpec = "hann",
    scale: bool = True,
    pad_mode: _PadMode = "constant",
    res_type: str = "soxr_hq",
    dtype: Optional[DTypeLike] = None,
) -> np.ndarray:
    """Compute the hybrid constant-Q transform of an audio signal.

    Here, the hybrid CQT uses the pseudo CQT for higher frequencies where
    the hop_length is longer than half the filter length and the full CQT
    for lower frequencies.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time series. Multi-channel is supported.

    sr : number > 0 [scalar]
        sampling rate of ``y``

    hop_length : int > 0 [scalar]
        number of samples between successive CQT columns.

    fmin : float > 0 [scalar]
        Minimum frequency. Defaults to `C1 ~= 32.70 Hz`

    n_bins : int > 0 [scalar]
        Number of frequency bins, starting at ``fmin``

    bins_per_octave : int > 0 [scalar]
        Number of bins per octave

    tuning : None or float
        Tuning offset in fractions of a bin.

        If ``None``, tuning will be automatically estimated from the signal.

        The minimum frequency of the resulting CQT will be modified to
        ``fmin * 2**(tuning / bins_per_octave)``.

    filter_scale : float > 0
        Filter scale factor. Larger values use longer windows.

    norm : {inf, -inf, 0, float > 0}
        Type of norm to use for basis function normalization.
        See `librosa.util.normalize`.

    sparsity : float in [0, 1)
        Sparsify the CQT basis by discarding up to ``sparsity``
        fraction of the energy in each basis.

        Set ``sparsity=0`` to disable sparsification.

    window : str, tuple, number, or function
        Window specification for the basis filters.
        See `filters.get_window` for details.

    scale : bool
        If ``True``, scale the CQT response by square-root the length of
        each channel's filter. This is analogous to ``norm='ortho'`` in FFT.

        If ``False``, do not scale the CQT. This is analogous to
        ``norm=None`` in FFT.

    pad_mode : string
        Padding mode for centered frame analysis.

        See also: `librosa.stft` and `numpy.pad`.

    res_type : string
        Resampling mode. See `librosa.cqt` for details.

    dtype : np.dtype, optional
        The complex dtype to use for computing the CQT.
        By default, this is inferred to match the precision of
        the input signal.

    Returns
    -------
    CQT : np.ndarray [shape=(..., n_bins, t), dtype=np.float]
        Constant-Q energy for each frequency at each time.

    See Also
    --------
    cqt
    pseudo_cqt

    Notes
    -----
    This function caches at level 20.

    """

    if fmin is None:
        # C1 by default
        fmin = note_to_hz("C1")

    if tuning is None:
        tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave)

    # Apply tuning correction
    fmin = fmin * 2.0 ** (tuning / bins_per_octave)

    # Get all CQT frequencies
    freqs = cqt_frequencies(n_bins, fmin=fmin, bins_per_octave=bins_per_octave)

    # Compute an alpha parameter, just in case we need it
    alpha = __bpo_to_alpha(bins_per_octave)

    # Compute the length of each constant-Q basis function
    lengths, _ = filters.wavelet_lengths(
        freqs=freqs, sr=sr, filter_scale=filter_scale, window=window, alpha=alpha
    )

    # Determine which filters to use with Pseudo CQT
    # These are the ones that fit within 2 hop lengths after padding
    pseudo_filters = 2.0 ** np.ceil(np.log2(lengths)) < 2 * hop_length

    n_bins_pseudo = int(np.sum(pseudo_filters))

    n_bins_full = n_bins - n_bins_pseudo
    cqt_resp = []

    if n_bins_pseudo > 0:
        fmin_pseudo = np.min(freqs[pseudo_filters])

        cqt_resp.append(
            pseudo_cqt(
                y,
                sr=sr,
                hop_length=hop_length,
                fmin=fmin_pseudo,
                n_bins=n_bins_pseudo,
                bins_per_octave=bins_per_octave,
                filter_scale=filter_scale,
                norm=norm,
                sparsity=sparsity,
                window=window,
                scale=scale,
                pad_mode=pad_mode,
                dtype=dtype,
            )
        )

    if n_bins_full > 0:
        cqt_resp.append(
            np.abs(
                cqt(
                    y,
                    sr=sr,
                    hop_length=hop_length,
                    fmin=fmin,
                    n_bins=n_bins_full,
                    bins_per_octave=bins_per_octave,
                    filter_scale=filter_scale,
                    norm=norm,
                    sparsity=sparsity,
                    window=window,
                    scale=scale,
                    pad_mode=pad_mode,
                    res_type=res_type,
                    dtype=dtype,
                )
            )
        )

    # Propagate dtype from the last component
    return __trim_stack(cqt_resp, n_bins, cqt_resp[-1].dtype)

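# --- Editor's illustrative sketch (hypothetical helper, not part of librosa).
# hybrid_cqt splits the bins using the test applied above: a bin is routed to
# the pseudo-CQT path when its filter, padded to the next power of two, fits
# within two hop lengths; all remaining (lower) bins use the full CQT.
def _hybrid_split_sketch(lengths: np.ndarray, hop_length: int = 512) -> int:
    pseudo = 2.0 ** np.ceil(np.log2(lengths)) < 2 * hop_length
    return int(np.sum(pseudo))  # number of bins handled by pseudo_cqt
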
@cache(level=20)
def pseudo_cqt(
    y: np.ndarray,
    *,
    sr: float = 22050,
    hop_length: int = 512,
    fmin: Optional[_FloatLike_co] = None,
    n_bins: int = 84,
    bins_per_octave: int = 12,
    tuning: Optional[float] = 0.0,
    filter_scale: float = 1,
    norm: Optional[float] = 1,
    sparsity: float = 0.01,
    window: _WindowSpec = "hann",
    scale: bool = True,
    pad_mode: _PadMode = "constant",
    dtype: Optional[DTypeLike] = None,
) -> np.ndarray:
    """Compute the pseudo constant-Q transform of an audio signal.

    This uses a single fft size that is the smallest power of 2 that is greater
    than or equal to the max of:

        1. The longest CQT filter
        2. 2x the hop_length

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time series. Multi-channel is supported.

    sr : number > 0 [scalar]
        sampling rate of ``y``

    hop_length : int > 0 [scalar]
        number of samples between successive CQT columns.

    fmin : float > 0 [scalar]
        Minimum frequency. Defaults to `C1 ~= 32.70 Hz`

    n_bins : int > 0 [scalar]
        Number of frequency bins, starting at ``fmin``

    bins_per_octave : int > 0 [scalar]
        Number of bins per octave

    tuning : None or float
        Tuning offset in fractions of a bin.

        If ``None``, tuning will be automatically estimated from the signal.

        The minimum frequency of the resulting CQT will be modified to
        ``fmin * 2**(tuning / bins_per_octave)``.

    filter_scale : float > 0
        Filter scale factor. Larger values use longer windows.

    norm : {inf, -inf, 0, float > 0}
        Type of norm to use for basis function normalization.
        See `librosa.util.normalize`.

    sparsity : float in [0, 1)
        Sparsify the CQT basis by discarding up to ``sparsity``
        fraction of the energy in each basis.

        Set ``sparsity=0`` to disable sparsification.

    window : str, tuple, number, or function
        Window specification for the basis filters.
        See `filters.get_window` for details.

    scale : bool
        If ``True``, scale the CQT response by square-root the length of
        each channel's filter. This is analogous to ``norm='ortho'`` in FFT.

        If ``False``, do not scale the CQT. This is analogous to
        ``norm=None`` in FFT.

    pad_mode : string
        Padding mode for centered frame analysis.

        See also: `librosa.stft` and `numpy.pad`.

    dtype : np.dtype, optional
        The complex data type for CQT calculations.
        By default, this is inferred to match the precision of the input signal.

    Returns
    -------
    CQT : np.ndarray [shape=(..., n_bins, t), dtype=np.float]
        Pseudo Constant-Q energy for each frequency at each time.

    Notes
    -----
    This function caches at level 20.

    """

    if fmin is None:
        # C1 by default
        fmin = note_to_hz("C1")

    if tuning is None:
        tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave)

    if dtype is None:
        dtype = util.dtype_r2c(y.dtype)

    # Apply tuning correction
    fmin = fmin * 2.0 ** (tuning / bins_per_octave)

    freqs = cqt_frequencies(fmin=fmin, n_bins=n_bins, bins_per_octave=bins_per_octave)

    alpha = __bpo_to_alpha(bins_per_octave)

    lengths, _ = filters.wavelet_lengths(
        freqs=freqs, sr=sr, window=window, filter_scale=filter_scale, alpha=alpha
    )

    fft_basis, n_fft, _ = __vqt_filter_fft(
        sr,
        freqs,
        filter_scale,
        norm,
        sparsity,
        hop_length=hop_length,
        window=window,
        dtype=dtype,
        alpha=alpha,
    )

    fft_basis = np.abs(fft_basis)

    # Compute the magnitude-only CQT response
    C: np.ndarray = __cqt_response(
        y,
        n_fft,
        hop_length,
        fft_basis,
        pad_mode,
        window="hann",
        dtype=dtype,
        phase=False,
    )

    if scale:
        C /= np.sqrt(n_fft)
    else:
        # reshape lengths to match dimension properly
        lengths = util.expand_to(lengths, ndim=C.ndim, axes=-2)

        C *= np.sqrt(lengths / n_fft)

    return C

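# --- Editor's illustrative sketch (hypothetical helper, not part of librosa).
# The single FFT size described in the pseudo_cqt docstring mirrors the n_fft
# growth rule in __vqt_filter_fft below: start from the longest filter padded
# to a power of two, then widen to cover twice the hop length if needed.
def _pseudo_cqt_nfft_sketch(max_filter_length: float, hop_length: int) -> int:
    n_fft = int(2.0 ** np.ceil(np.log2(max_filter_length)))
    if n_fft < 2.0 ** (1 + np.ceil(np.log2(hop_length))):
        n_fft = int(2.0 ** (1 + np.ceil(np.log2(hop_length))))
    return n_fft
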
@cache(level=40)
def icqt(
    C: np.ndarray,
    *,
    sr: float = 22050,
    hop_length: int = 512,
    fmin: Optional[_FloatLike_co] = None,
    bins_per_octave: int = 12,
    tuning: float = 0.0,
    filter_scale: float = 1,
    norm: Optional[float] = 1,
    sparsity: float = 0.01,
    window: _WindowSpec = "hann",
    scale: bool = True,
    length: Optional[int] = None,
    res_type: str = "soxr_hq",
    dtype: Optional[DTypeLike] = None,
) -> np.ndarray:
    """Compute the inverse constant-Q transform.

    Given a constant-Q transform representation ``C`` of an audio signal ``y``,
    this function produces an approximation ``y_hat``.

    Parameters
    ----------
    C : np.ndarray, [shape=(..., n_bins, n_frames)]
        Constant-Q representation as produced by `cqt`

    sr : number > 0 [scalar]
        sampling rate of the signal

    hop_length : int > 0 [scalar]
        number of samples between successive frames

    fmin : float > 0 [scalar]
        Minimum frequency. Defaults to `C1 ~= 32.70 Hz`

    bins_per_octave : int > 0 [scalar]
        Number of bins per octave

    tuning : float [scalar]
        Tuning offset in fractions of a bin.

        The minimum frequency of the CQT will be modified to
        ``fmin * 2**(tuning / bins_per_octave)``.

    filter_scale : float > 0 [scalar]
        Filter scale factor. Small values (<1) use shorter windows
        for improved time resolution.

    norm : {inf, -inf, 0, float > 0}
        Type of norm to use for basis function normalization.
        See `librosa.util.normalize`.

    sparsity : float in [0, 1)
        Sparsify the CQT basis by discarding up to ``sparsity``
        fraction of the energy in each basis.

        Set ``sparsity=0`` to disable sparsification.

    window : str, tuple, number, or function
        Window specification for the basis filters.
        See `filters.get_window` for details.

    scale : bool
        If ``True``, scale the CQT response by square-root the length
        of each channel's filter. This is analogous to ``norm='ortho'`` in FFT.

        If ``False``, do not scale the CQT. This is analogous to ``norm=None``
        in FFT.

    length : int > 0, optional
        If provided, the output ``y`` is zero-padded or clipped to exactly
        ``length`` samples.

    res_type : string
        Resampling mode.
        See `librosa.resample` for supported modes.

    dtype : numeric type
        Real numeric type for ``y``. Default is inferred to match the numerical
        precision of the input CQT.

    Returns
    -------
    y : np.ndarray, [shape=(..., n_samples), dtype=np.float]
        Audio time-series reconstructed from the CQT representation.

    See Also
    --------
    cqt
    librosa.resample

    Notes
    -----
    This function caches at level 40.

    Examples
    --------
    Using default parameters

    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> C = librosa.cqt(y=y, sr=sr)
    >>> y_hat = librosa.icqt(C=C, sr=sr)

    Or with a different hop length and frequency resolution:

    >>> hop_length = 256
    >>> bins_per_octave = 12 * 3
    >>> C = librosa.cqt(y=y, sr=sr, hop_length=256, n_bins=7*bins_per_octave,
    ...                 bins_per_octave=bins_per_octave)
    >>> y_hat = librosa.icqt(C=C, sr=sr, hop_length=hop_length,
    ...                      bins_per_octave=bins_per_octave)
    """
    if fmin is None:
        fmin = note_to_hz("C1")

    # Apply tuning correction
    fmin = fmin * 2.0 ** (tuning / bins_per_octave)

    # Get the top octave of frequencies
    n_bins = C.shape[-2]

    n_octaves = int(np.ceil(float(n_bins) / bins_per_octave))

    # truncate the cqt to max frames if helpful
    freqs = cqt_frequencies(fmin=fmin, n_bins=n_bins, bins_per_octave=bins_per_octave)
    alpha = __bpo_to_alpha(bins_per_octave)

    lengths, f_cutoff = filters.wavelet_lengths(
        freqs=freqs, sr=sr, window=window, filter_scale=filter_scale, alpha=alpha
    )

    # Trim the CQT to only what's necessary for reconstruction
    if length is not None:
        n_frames = int(np.ceil((length + max(lengths)) / hop_length))
        C = C[..., :n_frames]

    C_scale = np.sqrt(lengths)

    # This shape array will be used for broadcasting the basis scale
    # we'll have to adapt this per octave within the loop
    y: Optional[np.ndarray] = None

    # Assume the top octave is at the full rate
    srs = [sr]
    hops = [hop_length]

    for i in range(n_octaves - 1):
        if hops[0] % 2 == 0:
            # We can downsample:
            srs.insert(0, srs[0] * 0.5)
            hops.insert(0, hops[0] // 2)
        else:
            # We're out of downsamplings, carry forward
            srs.insert(0, srs[0])
            hops.insert(0, hops[0])

    for i, (my_sr, my_hop) in enumerate(zip(srs, hops)):
        # How many filters are in this octave?
        n_filters = min(bins_per_octave, n_bins - bins_per_octave * i)

        # Slice out the current octave
        sl = slice(bins_per_octave * i, bins_per_octave * i + n_filters)

        fft_basis, n_fft, _ = __vqt_filter_fft(
            my_sr,
            freqs[sl],
            filter_scale,
            norm,
            sparsity,
            window=window,
            dtype=dtype,
            alpha=alpha,
        )

        # Transpose the basis
        inv_basis = fft_basis.H.todense()

        # Compute each filter's frequency-domain power
        freq_power = 1 / np.sum(util.abs2(np.asarray(inv_basis)), axis=0)

        # Compensate for length normalization in the forward transform
        freq_power *= n_fft / lengths[sl]

        # Inverse-project the basis for each octave
        if scale:
            # scale=True ==> re-scale by sqrt(lengths)
            D_oct = np.einsum(
                "fc,c,c,...ct->...ft",
                inv_basis,
                C_scale[sl],
                freq_power,
                C[..., sl, :],
                optimize=True,
            )
        else:
            D_oct = np.einsum(
                "fc,c,...ct->...ft", inv_basis, freq_power, C[..., sl, :], optimize=True
            )

        y_oct = istft(D_oct, window="ones", hop_length=my_hop, dtype=dtype)

        y_oct = audio.resample(
            y_oct,
            orig_sr=1,
            target_sr=sr // my_sr,
            res_type=res_type,
            scale=False,
            fix=False,
        )

        if y is None:
            y = y_oct
        else:
            y[..., : y_oct.shape[-1]] += y_oct

    # make mypy happy
    assert y is not None

    if length:
        y = util.fix_length(y, size=length)

    return y

@cache(level=20)
def vqt(
    y: np.ndarray,
    *,
    sr: float = 22050,
    hop_length: int = 512,
    fmin: Optional[_FloatLike_co] = None,
    n_bins: int = 84,
    intervals: Union[str, Collection[float]] = "equal",
    gamma: Optional[float] = None,
    bins_per_octave: int = 12,
    tuning: Optional[float] = 0.0,
    filter_scale: float = 1,
    norm: Optional[float] = 1,
    sparsity: float = 0.01,
    window: _WindowSpec = "hann",
    scale: bool = True,
    pad_mode: _PadMode = "constant",
    res_type: Optional[str] = "soxr_hq",
    dtype: Optional[DTypeLike] = None,
) -> np.ndarray:
    """Compute the variable-Q transform of an audio signal.

    This implementation is based on the recursive sub-sampling method
    described by [#]_.

    .. [#] Schörkhuber, Christian, Anssi Klapuri, Nicki Holighaus, and Monika Dörfler.
        "A Matlab toolbox for efficient perfect reconstruction time-frequency
        transforms with log-frequency resolution."
        In Audio Engineering Society Conference: 53rd International Conference: Semantic Audio.
        Audio Engineering Society, 2014.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time series. Multi-channel is supported.

    sr : number > 0 [scalar]
        sampling rate of ``y``

    hop_length : int > 0 [scalar]
        number of samples between successive VQT columns.

    fmin : float > 0 [scalar]
        Minimum frequency. Defaults to `C1 ~= 32.70 Hz`

    n_bins : int > 0 [scalar]
        Number of frequency bins, starting at ``fmin``

    intervals : str or array of floats in [1, 2)
        Either a string specification for an interval set, e.g.,
        `'equal'`, `'pythagorean'`, `'ji3'`, etc. or an array of
        intervals expressed as numbers between 1 and 2.

        .. see also:: librosa.interval_frequencies

    gamma : number > 0 [scalar]
        Bandwidth offset for determining filter lengths.

        If ``gamma=0``, produces the constant-Q transform.

        If ``gamma=None``, gamma will be calculated such that filter bandwidths are equal to a
        constant fraction of the equivalent rectangular bandwidths (ERB). This is accomplished
        by solving for the gamma which gives::

            B_k = alpha * f_k + gamma = C * ERB(f_k),

        where ``B_k`` is the bandwidth of filter ``k`` with center frequency ``f_k``, alpha
        is the inverse of what would be the constant Q-factor, and ``C = alpha / 0.108`` is the
        constant fraction across all filters.

        Here we use ``ERB(f_k) = 24.7 + 0.108 * f_k``, the best-fit curve derived
        from experimental data in [#]_.

        .. [#] Glasberg, Brian R., and Brian CJ Moore.
            "Derivation of auditory filter shapes from notched-noise data."
            Hearing research 47.1-2 (1990): 103-138.

    bins_per_octave : int > 0 [scalar]
        Number of bins per octave

    tuning : None or float
        Tuning offset in fractions of a bin.

        If ``None``, tuning will be automatically estimated from the signal.

        The minimum frequency of the resulting VQT will be modified to
        ``fmin * 2**(tuning / bins_per_octave)``.

    filter_scale : float > 0
        Filter scale factor. Small values (<1) use shorter windows
        for improved time resolution.

    norm : {inf, -inf, 0, float > 0}
        Type of norm to use for basis function normalization.
        See `librosa.util.normalize`.

    sparsity : float in [0, 1)
        Sparsify the VQT basis by discarding up to ``sparsity``
        fraction of the energy in each basis.

        Set ``sparsity=0`` to disable sparsification.

    window : str, tuple, number, or function
        Window specification for the basis filters.
        See `filters.get_window` for details.

    scale : bool
        If ``True``, scale the VQT response by square-root the length of
        each channel's filter. This is analogous to ``norm='ortho'`` in FFT.

        If ``False``, do not scale the VQT. This is analogous to
        ``norm=None`` in FFT.

    pad_mode : string
        Padding mode for centered frame analysis.

        See also: `librosa.stft` and `numpy.pad`.

    res_type : string
        The resampling mode for recursive downsampling.

    dtype : np.dtype
        The dtype of the output array. By default, this is inferred to match the
        numerical precision of the input signal.

    Returns
    -------
    VQT : np.ndarray [shape=(..., n_bins, t), dtype=np.complex]
        Variable-Q value for each frequency at each time.

    See Also
    --------
    cqt

    Notes
    -----
    This function caches at level 20.

    Examples
    --------
    Generate and plot a variable-Q power spectrum

    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.ex('choice'), duration=5)
    >>> C = np.abs(librosa.cqt(y, sr=sr))
    >>> V = np.abs(librosa.vqt(y, sr=sr))
    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max),
    ...                          sr=sr, x_axis='time', y_axis='cqt_note', ax=ax[0])
    >>> ax[0].set(title='Constant-Q power spectrum', xlabel=None)
    >>> ax[0].label_outer()
    >>> img = librosa.display.specshow(librosa.amplitude_to_db(V, ref=np.max),
    ...                                sr=sr, x_axis='time', y_axis='cqt_note', ax=ax[1])
    >>> ax[1].set_title('Variable-Q power spectrum')
    >>> fig.colorbar(img, ax=ax, format="%+2.0f dB")
    """

    # If intervals are provided as an array, override BPO
    if not isinstance(intervals, str):
        bins_per_octave = len(intervals)

    # How many octaves are we dealing with?
    n_octaves = int(np.ceil(float(n_bins) / bins_per_octave))
    n_filters = min(bins_per_octave, n_bins)

    if fmin is None:
        # C1 by default
        fmin = note_to_hz("C1")

    if tuning is None:
        tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave)

    if dtype is None:
        dtype = util.dtype_r2c(y.dtype)

    # Apply tuning correction
    fmin = fmin * 2.0 ** (tuning / bins_per_octave)

    # First thing, get the freqs of the top octave
    freqs = interval_frequencies(
        n_bins=n_bins,
        fmin=fmin,
        intervals=intervals,
        bins_per_octave=bins_per_octave,
        sort=True,
    )

    freqs_top = freqs[-bins_per_octave:]

    fmax_t: float = np.max(freqs_top)
    alpha = __bpo_to_alpha(bins_per_octave)

    lengths, filter_cutoff = filters.wavelet_lengths(
        freqs=freqs,
        sr=sr,
        window=window,
        filter_scale=filter_scale,
        gamma=gamma,
        alpha=alpha,
    )

    # Determine required resampling quality
    nyquist = sr / 2.0

    if filter_cutoff > nyquist:
        raise ParameterError(
            f"Wavelet basis with max frequency={fmax_t} would exceed the Nyquist frequency={nyquist}. "
            "Try reducing the number of frequency bins."
        )

    if res_type is None:
        warnings.warn(
            "Support for VQT with res_type=None is deprecated in librosa 0.10\n"
            "and will be removed in version 1.0.",
            category=FutureWarning,
            stacklevel=2,
        )
        res_type = "soxr_hq"

    y, sr, hop_length = __early_downsample(
        y, sr, hop_length, res_type, n_octaves, nyquist, filter_cutoff, scale
    )

    vqt_resp = []

    # Iterate down the octaves
    my_y, my_sr, my_hop = y, sr, hop_length

    for i in range(n_octaves):
        # Slice out the current octave of filters
        if i == 0:
            sl = slice(-n_filters, None)
        else:
            sl = slice(-n_filters * (i + 1), -n_filters * i)

        # This may be incorrect with early downsampling
        freqs_oct = freqs[sl]

        fft_basis, n_fft, _ = __vqt_filter_fft(
            my_sr,
            freqs_oct,
            filter_scale,
            norm,
            sparsity,
            window=window,
            gamma=gamma,
            dtype=dtype,
            alpha=alpha,
        )

        # Re-scale the filters to compensate for downsampling
        fft_basis[:] *= np.sqrt(sr / my_sr)

        # Compute the vqt filter response and append to the stack
        vqt_resp.append(
            __cqt_response(my_y, n_fft, my_hop, fft_basis, pad_mode, dtype=dtype)
        )

        if my_hop % 2 == 0:
            my_hop //= 2
            my_sr /= 2.0
            my_y = audio.resample(
                my_y, orig_sr=2, target_sr=1, res_type=res_type, scale=True
            )

    V = __trim_stack(vqt_resp, n_bins, dtype)

    if scale:
        # Recompute lengths here because early downsampling may have changed
        # our sampling rate
        lengths, _ = filters.wavelet_lengths(
            freqs=freqs,
            sr=sr,
            window=window,
            filter_scale=filter_scale,
            gamma=gamma,
            alpha=alpha,
        )

        # reshape lengths to match V shape
        lengths = util.expand_to(lengths, ndim=V.ndim, axes=-2)
        V /= np.sqrt(lengths)

    return V

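# --- Editor's illustrative sketch (hypothetical helper, not part of librosa).
# Working out the gamma=None default from the vqt docstring above: requiring
# B_k = alpha * f_k + gamma = C * ERB(f_k), with ERB(f) = 24.7 + 0.108 * f
# and C = alpha / 0.108, forces gamma = C * 24.7 = 24.7 * alpha / 0.108.
def _default_gamma_sketch(bins_per_octave: int = 12) -> float:
    alpha = __bpo_to_alpha(bins_per_octave)
    return 24.7 * alpha / 0.108  # ~13.2 Hz for 12 bins per octave
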
@cache(level=10)
def __vqt_filter_fft(
    sr,
    freqs,
    filter_scale,
    norm,
    sparsity,
    hop_length=None,
    window="hann",
    gamma=0.0,
    dtype=np.complex64,
    alpha=None,
):
    """Generate the frequency domain variable-Q filter basis."""

    basis, lengths = filters.wavelet(
        freqs=freqs,
        sr=sr,
        filter_scale=filter_scale,
        norm=norm,
        pad_fft=True,
        window=window,
        gamma=gamma,
        alpha=alpha,
    )

    # Filters are padded up to the nearest integral power of 2
    n_fft = basis.shape[1]

    if hop_length is not None and n_fft < 2.0 ** (1 + np.ceil(np.log2(hop_length))):
        n_fft = int(2.0 ** (1 + np.ceil(np.log2(hop_length))))

    # re-normalize bases with respect to the FFT window length
    basis *= lengths[:, np.newaxis] / float(n_fft)

    # FFT and retain only the non-negative frequencies
    fft = get_fftlib()
    fft_basis = fft.fft(basis, n=n_fft, axis=1)[:, : (n_fft // 2) + 1]

    # sparsify the basis
    fft_basis = util.sparsify_rows(fft_basis, quantile=sparsity, dtype=dtype)

    return fft_basis, n_fft, lengths


def __trim_stack(
    cqt_resp: List[np.ndarray], n_bins: int, dtype: DTypeLike
) -> np.ndarray:
    """Helper function to trim and stack a collection of CQT responses"""

    max_col = min(c_i.shape[-1] for c_i in cqt_resp)
    # Grab any leading dimensions
    shape = list(cqt_resp[0].shape)
    shape[-2] = n_bins
    shape[-1] = max_col
    cqt_out = np.empty(shape, dtype=dtype, order="F")

    # Copy per-octave data into output array
    end = n_bins
    for c_i in cqt_resp:
        # By default, take the whole octave
        n_oct = c_i.shape[-2]
        # If the whole octave is more than we can fit,
        # take the highest bins from c_i
        if end < n_oct:
            cqt_out[..., :end, :] = c_i[..., -end:, :max_col]
        else:
            cqt_out[..., end - n_oct : end, :] = c_i[..., :max_col]

        end -= n_oct

    return cqt_out


def __cqt_response(
    y, n_fft, hop_length, fft_basis, mode, window="ones", phase=True, dtype=None
):
    """Compute the filter response with a target STFT hop."""

    # Compute the STFT matrix
    D = stft(
        y, n_fft=n_fft, hop_length=hop_length, window=window, pad_mode=mode, dtype=dtype
    )

    if not phase:
        D = np.abs(D)

    # Reshape D to Dr
    Dr = D.reshape((-1, D.shape[-2], D.shape[-1]))
    output_flat = np.empty(
        (Dr.shape[0], fft_basis.shape[0], Dr.shape[-1]), dtype=D.dtype
    )

    # iterate over channels
    # project fft_basis.dot(Dr[i])
    for i in range(Dr.shape[0]):
        output_flat[i] = fft_basis.dot(Dr[i])

    # reshape Dr to match D's leading dimensions again
    shape = list(D.shape)
    shape[-2] = fft_basis.shape[0]
    return output_flat.reshape(shape)


def __early_downsample_count(nyquist, filter_cutoff, hop_length, n_octaves):
    """Compute the number of early downsampling operations"""

    downsample_count1 = max(0, int(np.ceil(np.log2(nyquist / filter_cutoff)) - 1) - 1)

    num_twos = __num_two_factors(hop_length)
    downsample_count2 = max(0, num_twos - n_octaves + 1)

    return min(downsample_count1, downsample_count2)


def __early_downsample(
    y, sr, hop_length, res_type, n_octaves, nyquist, filter_cutoff, scale
):
    """Perform early downsampling on an audio signal, if it applies."""

    downsample_count = __early_downsample_count(
        nyquist, filter_cutoff, hop_length, n_octaves
    )

    if downsample_count > 0:
        downsample_factor = 2 ** (downsample_count)

        hop_length //= downsample_factor

        if y.shape[-1] < downsample_factor:
            raise ParameterError(
                f"Input signal length={len(y):d} is too short for "
                f"{n_octaves:d}-octave CQT"
            )

        new_sr = sr / float(downsample_factor)
        y = audio.resample(
            y, orig_sr=downsample_factor, target_sr=1, res_type=res_type, scale=True
        )

        # If we're not going to length-scale after CQT, we
        # need to compensate for the downsampling factor here
        if not scale:
            y *= np.sqrt(downsample_factor)

        sr = new_sr

    return y, sr, hop_length


@jit(nopython=True, cache=False)
def __num_two_factors(x):
    """Return how many times integer x can be evenly divided by 2.

    Returns 0 for non-positive integers.
    """
    if x <= 0:
        return 0
    num_twos = 0
    while x % 2 == 0:
        num_twos += 1
        x //= 2

    return num_twos

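# --- Editor's illustrative check (hypothetical helper, not part of librosa).
# __num_two_factors counts the factors of two in the hop length, which bounds
# how many octaves of early downsampling __early_downsample_count will allow:
def _num_two_factors_sketch() -> None:
    assert __num_two_factors(512) == 9  # 512 = 2**9
    assert __num_two_factors(96) == 5   # 96 = 2**5 * 3
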
def griffinlim_cqt(
    C: np.ndarray,
    *,
    n_iter: int = 32,
    sr: float = 22050,
    hop_length: int = 512,
    fmin: Optional[_FloatLike_co] = None,
    bins_per_octave: int = 12,
    tuning: float = 0.0,
    filter_scale: float = 1,
    norm: Optional[float] = 1,
    sparsity: float = 0.01,
    window: _WindowSpec = "hann",
    scale: bool = True,
    pad_mode: _PadMode = "constant",
    res_type: str = "soxr_hq",
    dtype: Optional[DTypeLike] = None,
    length: Optional[int] = None,
    momentum: float = 0.99,
    init: Optional[str] = "random",
    random_state: Optional[
        Union[int, np.random.RandomState, np.random.Generator]
    ] = None,
) -> np.ndarray:
    """Approximate constant-Q magnitude spectrogram inversion using the "fast" Griffin-Lim
    algorithm.

    Given the magnitude of a constant-Q spectrogram (``C``), the algorithm randomly initializes
    phase estimates, and then alternates forward- and inverse-CQT operations. [#]_

    This implementation is based on the (fast) Griffin-Lim method for Short-time Fourier Transforms, [#]_
    but adapted for use with constant-Q spectrograms.

    .. [#] D. W. Griffin and J. S. Lim,
        "Signal estimation from modified short-time Fourier transform,"
        IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984.

    .. [#] Perraudin, N., Balazs, P., & Søndergaard, P. L.
        "A fast Griffin-Lim algorithm,"
        IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (pp. 1-4),
        Oct. 2013.

    Parameters
    ----------
    C : np.ndarray [shape=(..., n_bins, n_frames)]
        The constant-Q magnitude spectrogram

    n_iter : int > 0
        The number of iterations to run

    sr : number > 0
        Audio sampling rate

    hop_length : int > 0
        The hop length of the CQT

    fmin : number > 0
        Minimum frequency for the CQT.

        If not provided, it defaults to `C1`.

    bins_per_octave : int > 0
        Number of bins per octave

    tuning : float
        Tuning deviation from A440, in fractions of a bin

    filter_scale : float > 0
        Filter scale factor. Small values (<1) use shorter windows
        for improved time resolution.

    norm : {inf, -inf, 0, float > 0}
        Type of norm to use for basis function normalization.
        See `librosa.util.normalize`.

    sparsity : float in [0, 1)
        Sparsify the CQT basis by discarding up to ``sparsity``
        fraction of the energy in each basis.

        Set ``sparsity=0`` to disable sparsification.

    window : str, tuple, or function
        Window specification for the basis filters.
        See `filters.get_window` for details.

    scale : bool
        If ``True``, scale the CQT response by square-root the length
        of each channel's filter. This is analogous to ``norm='ortho'``
        in FFT.

        If ``False``, do not scale the CQT. This is analogous to ``norm=None``
        in FFT.

    pad_mode : string
        Padding mode for centered frame analysis.

        See also: `librosa.stft` and `numpy.pad`.

    res_type : string
        The resampling mode for recursive downsampling.

        See ``librosa.resample`` for a list of available options.

    dtype : numeric type
        Real numeric type for ``y``. Default is inferred to match the precision
        of the input CQT.

    length : int > 0, optional
        If provided, the output ``y`` is zero-padded or clipped to exactly
        ``length`` samples.

    momentum : float > 0
        The momentum parameter for fast Griffin-Lim.
        Setting this to 0 recovers the original Griffin-Lim method.
        Values near 1 can lead to faster convergence, but above 1 may not converge.

    init : None or 'random' [default]
        If 'random' (the default), then phase values are initialized randomly
        according to ``random_state``. This is recommended when the input ``C`` is
        a magnitude spectrogram with no initial phase estimates.

        If ``None``, then the phase is initialized from ``C``. This is useful when
        an initial guess for phase can be provided, or when you want to resume
        Griffin-Lim from a previous output.

    random_state : None, int, np.random.RandomState, or np.random.Generator
        If int, random_state is the seed used by the random number generator
        for phase initialization.

        If `np.random.RandomState` or `np.random.Generator` instance, the random number generator itself.

        If ``None``, defaults to the `np.random.default_rng()` object.

    Returns
    -------
    y : np.ndarray [shape=(..., n)]
        time-domain signal reconstructed from ``C``

    See Also
    --------
    cqt
    icqt
    griffinlim
    filters.get_window
    resample

    Examples
    --------
    A basic CQT inverse example

    >>> y, sr = librosa.load(librosa.ex('trumpet', hq=True), sr=None)
    >>> # Get the CQT magnitude, 7 octaves at 36 bins per octave
    >>> C = np.abs(librosa.cqt(y=y, sr=sr, bins_per_octave=36, n_bins=7*36))
    >>> # Invert using Griffin-Lim
    >>> y_inv = librosa.griffinlim_cqt(C, sr=sr, bins_per_octave=36)
    >>> # And invert without estimating phase
    >>> y_icqt = librosa.icqt(C, sr=sr, bins_per_octave=36)

    Wave-plot the results

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=3, sharex=True, sharey=True)
    >>> librosa.display.waveshow(y, sr=sr, color='b', ax=ax[0])
    >>> ax[0].set(title='Original', xlabel=None)
    >>> ax[0].label_outer()
    >>> librosa.display.waveshow(y_inv, sr=sr, color='g', ax=ax[1])
    >>> ax[1].set(title='Griffin-Lim reconstruction', xlabel=None)
    >>> ax[1].label_outer()
    >>> librosa.display.waveshow(y_icqt, sr=sr, color='r', ax=ax[2])
    >>> ax[2].set(title='Magnitude-only icqt reconstruction')
    """
    if fmin is None:
        fmin = note_to_hz("C1")

    if random_state is None:
        rng = np.random.default_rng()
    elif isinstance(random_state, int):
        rng = np.random.RandomState(seed=random_state)  # type: ignore
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state  # type: ignore
    else:
        _ensure_not_reachable(random_state)
        raise ParameterError(f"Unsupported random_state={random_state!r}")

    if momentum > 1:
        warnings.warn(
            f"Griffin-Lim with momentum={momentum} > 1 can be unstable. "
            "Proceed with caution!",
            stacklevel=2,
        )
    elif momentum < 0:
        raise ParameterError(f"griffinlim_cqt() called with momentum={momentum} < 0")

    # using complex64 will keep the result to minimal necessary precision
    angles = np.empty(C.shape, dtype=np.complex64)
    eps = util.tiny(angles)

    if init == "random":
        # randomly initialize the phase
        angles[:] = util.phasor(2 * np.pi * rng.random(size=C.shape))
    elif init is None:
        # Initialize an all ones complex matrix
        angles[:] = 1.0
    else:
        raise ParameterError(f"init={init} must be either None or 'random'")

    # And initialize the previous iterate to 0
    rebuilt: np.ndarray = np.array(0.0)

    for _ in range(n_iter):
        # Store the previous iterate
        tprev = rebuilt

        # Invert with our current estimate of the phases
        inverse = icqt(
            C * angles,
            sr=sr,
            hop_length=hop_length,
            bins_per_octave=bins_per_octave,
            fmin=fmin,
            tuning=tuning,
            filter_scale=filter_scale,
            window=window,
            length=length,
            res_type=res_type,
            norm=norm,
            scale=scale,
            sparsity=sparsity,
            dtype=dtype,
        )

        # Rebuild the spectrogram
        rebuilt = cqt(
            inverse,
            sr=sr,
            bins_per_octave=bins_per_octave,
            n_bins=C.shape[-2],
            hop_length=hop_length,
            fmin=fmin,
            tuning=tuning,
            filter_scale=filter_scale,
            window=window,
            norm=norm,
            scale=scale,
            sparsity=sparsity,
            pad_mode=pad_mode,
            res_type=res_type,
        )

        # Update our phase estimates
        angles[:] = rebuilt - (momentum / (1 + momentum)) * tprev
        angles[:] /= np.abs(angles) + eps

    # Return the final phase estimates
    return icqt(
        C * angles,
        sr=sr,
        hop_length=hop_length,
        bins_per_octave=bins_per_octave,
        tuning=tuning,
        filter_scale=filter_scale,
        fmin=fmin,
        window=window,
        length=length,
        res_type=res_type,
        norm=norm,
        scale=scale,
        sparsity=sparsity,
        dtype=dtype,
    )


def __bpo_to_alpha(bins_per_octave: int) -> float:
    """Compute the alpha coefficient for a given number of bins per octave

    Parameters
    ----------
    bins_per_octave : int

    Returns
    -------
    alpha : number > 0
    """

    r = 2 ** (1 / bins_per_octave)
    return (r**2 - 1) / (r**2 + 1)
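For a concrete sense of the alpha coefficient computed by `__bpo_to_alpha` above: with the default 12 bins per octave, r = 2^(1/12), so alpha = (r² − 1)/(r² + 1) ≈ 0.0577, i.e., each filter's bandwidth is roughly 5.8% of its center frequency. A quick check (illustrative only, not part of the commit):

    r = 2 ** (1 / 12)
    alpha = (r**2 - 1) / (r**2 + 1)
    print(round(alpha, 4))  # 0.0577 -> ~5.8% relative bandwidth per filter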
filters.py ADDED
@@ -0,0 +1,1661 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Filters
=======

Filter bank construction
------------------------
.. autosummary::
    :toctree: generated/

    mel
    chroma
    wavelet
    semitone_filterbank

Window functions
----------------
.. autosummary::
    :toctree: generated/

    window_bandwidth
    get_window

Miscellaneous
-------------
.. autosummary::
    :toctree: generated/

    wavelet_lengths
    cq_to_chroma
    mr_frequencies
    window_sumsquare
    diagonal_filter

Deprecated
----------
.. autosummary::
    :toctree: generated/

    constant_q
    constant_q_lengths

"""
import warnings

import numpy as np
import scipy
import scipy.signal
import scipy.ndimage

from numba import jit

from ._cache import cache
from . import util
from .util.exceptions import ParameterError
from .util.decorators import deprecated

from .core.convert import note_to_hz, hz_to_midi, midi_to_hz, hz_to_octs
from .core.convert import fft_frequencies, mel_frequencies
from numpy.typing import ArrayLike, DTypeLike
from typing import Any, List, Optional, Tuple, Union
from typing_extensions import Literal
from ._typing import _WindowSpec, _FloatLike_co

__all__ = [
    "mel",
    "chroma",
    "constant_q",
    "constant_q_lengths",
    "cq_to_chroma",
    "window_bandwidth",
    "get_window",
    "mr_frequencies",
    "semitone_filterbank",
    "window_sumsquare",
    "diagonal_filter",
    "wavelet",
    "wavelet_lengths",
]

# Dictionary of window function bandwidths

WINDOW_BANDWIDTHS = {
    "bart": 1.3334961334912805,
    "barthann": 1.4560255965133932,
    "bartlett": 1.3334961334912805,
    "bkh": 2.0045975283585014,
    "black": 1.7269681554262326,
    "blackharr": 2.0045975283585014,
    "blackman": 1.7269681554262326,
    "blackmanharris": 2.0045975283585014,
    "blk": 1.7269681554262326,
    "bman": 1.7859588613860062,
    "bmn": 1.7859588613860062,
    "bohman": 1.7859588613860062,
    "box": 1.0,
    "boxcar": 1.0,
    "brt": 1.3334961334912805,
    "brthan": 1.4560255965133932,
    "bth": 1.4560255965133932,
    "cosine": 1.2337005350199792,
    "flat": 2.7762255046484143,
    "flattop": 2.7762255046484143,
    "flt": 2.7762255046484143,
    "halfcosine": 1.2337005350199792,
    "ham": 1.3629455320350348,
    "hamm": 1.3629455320350348,
    "hamming": 1.3629455320350348,
    "han": 1.50018310546875,
    "hann": 1.50018310546875,
    "nut": 1.9763500280946082,
    "nutl": 1.9763500280946082,
    "nuttall": 1.9763500280946082,
    "ones": 1.0,
    "par": 1.9174603174603191,
    "parz": 1.9174603174603191,
    "parzen": 1.9174603174603191,
    "rect": 1.0,
    "rectangular": 1.0,
    "tri": 1.3331706523555851,
    "triang": 1.3331706523555851,
    "triangle": 1.3331706523555851,
}

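# --- Editor's illustrative sketch (hypothetical helper, not part of librosa).
# The table above stores each window's bandwidth in FFT bins under every
# accepted alias; window_bandwidth() reads from it, and the wavelet filter
# sizing functions use it. For example, a Hann window is ~1.5 bins wide:
def _bandwidth_lookup_sketch() -> float:
    return WINDOW_BANDWIDTHS["hann"]  # 1.50018310546875
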
127
+ @cache(level=10)
128
+ def mel(
129
+ *,
130
+ sr: float,
131
+ n_fft: int,
132
+ n_mels: int = 128,
133
+ fmin: float = 0.0,
134
+ fmax: Optional[float] = None,
135
+ htk: bool = False,
136
+ norm: Optional[Union[Literal["slaney"], float]] = "slaney",
137
+ dtype: DTypeLike = np.float32,
138
+ ) -> np.ndarray:
139
+ """Create a Mel filter-bank.
140
+
141
+ This produces a linear transformation matrix to project
142
+ FFT bins onto Mel-frequency bins.
143
+
144
+ Parameters
145
+ ----------
146
+ sr : number > 0 [scalar]
147
+ sampling rate of the incoming signal
148
+
149
+ n_fft : int > 0 [scalar]
150
+ number of FFT components
151
+
152
+ n_mels : int > 0 [scalar]
153
+ number of Mel bands to generate
154
+
155
+ fmin : float >= 0 [scalar]
156
+ lowest frequency (in Hz)
157
+
158
+ fmax : float >= 0 [scalar]
159
+ highest frequency (in Hz).
160
+ If `None`, use ``fmax = sr / 2.0``
161
+
162
+ htk : bool [scalar]
163
+ use HTK formula instead of Slaney
164
+
165
+ norm : {None, 'slaney', or number} [scalar]
166
+ If 'slaney', divide the triangular mel weights by the width of the mel band
167
+ (area normalization).
168
+
169
+ If numeric, use `librosa.util.normalize` to normalize each filter to unit l_p norm.
170
+ See `librosa.util.normalize` for a full description of supported norm values
171
+ (including `+-np.inf`).
172
+
173
+ Otherwise, leave all the triangles aiming for a peak value of 1.0
174
+
175
+ dtype : np.dtype
176
+ The data type of the output basis.
177
+ By default, uses 32-bit (single-precision) floating point.
178
+
179
+ Returns
180
+ -------
181
+ M : np.ndarray [shape=(n_mels, 1 + n_fft/2)]
182
+ Mel transform matrix
183
+
184
+ See Also
185
+ --------
186
+ librosa.util.normalize
187
+
188
+ Notes
189
+ -----
190
+ This function caches at level 10.
191
+
192
+ Examples
193
+ --------
194
+ >>> melfb = librosa.filters.mel(sr=22050, n_fft=2048)
195
+ >>> melfb
196
+ array([[ 0. , 0.016, ..., 0. , 0. ],
197
+ [ 0. , 0. , ..., 0. , 0. ],
198
+ ...,
199
+ [ 0. , 0. , ..., 0. , 0. ],
200
+ [ 0. , 0. , ..., 0. , 0. ]])
201
+
202
+ Clip the maximum frequency to 8 kHz
203
+
204
+ >>> librosa.filters.mel(sr=22050, n_fft=2048, fmax=8000)
205
+ array([[ 0. , 0.02, ..., 0. , 0. ],
206
+ [ 0. , 0. , ..., 0. , 0. ],
207
+ ...,
208
+ [ 0. , 0. , ..., 0. , 0. ],
209
+ [ 0. , 0. , ..., 0. , 0. ]])
210
+
211
+ >>> import matplotlib.pyplot as plt
212
+ >>> fig, ax = plt.subplots()
213
+ >>> img = librosa.display.specshow(melfb, x_axis='linear', ax=ax)
214
+ >>> ax.set(ylabel='Mel filter', title='Mel filter bank')
215
+ >>> fig.colorbar(img, ax=ax)
216
+ """
217
+
218
+ if fmax is None:
219
+ fmax = float(sr) / 2
220
+
221
+ # Initialize the weights
222
+ n_mels = int(n_mels)
223
+ weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
224
+
225
+ # Center freqs of each FFT bin
226
+ fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft)
227
+
228
+ # 'Center freqs' of mel bands - uniformly spaced between limits
229
+ mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk)
230
+
231
+ fdiff = np.diff(mel_f)
232
+ ramps = np.subtract.outer(mel_f, fftfreqs)
233
+
234
+ for i in range(n_mels):
235
+ # lower and upper slopes for all bins
236
+ lower = -ramps[i] / fdiff[i]
237
+ upper = ramps[i + 2] / fdiff[i + 1]
238
+
239
+ # .. then intersect them with each other and zero
240
+ weights[i] = np.maximum(0, np.minimum(lower, upper))
241
+
242
+ if isinstance(norm, str):
243
+ if norm == "slaney":
244
+ # Slaney-style mel is scaled to be approx constant energy per channel
245
+ enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels])
246
+ weights *= enorm[:, np.newaxis]
247
+ else:
248
+ raise ParameterError(f"Unsupported norm={norm}")
249
+ else:
250
+ weights = util.normalize(weights, norm=norm, axis=-1)
251
+
252
+ # Only check weights if f_mel[0] is positive
253
+ if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)):
254
+ # This means we have an empty channel somewhere
255
+ warnings.warn(
256
+ "Empty filters detected in mel frequency basis. "
257
+ "Some channels will produce empty responses. "
258
+ "Try increasing your sampling rate (and fmax) or "
259
+ "reducing n_mels.",
260
+ stacklevel=2,
261
+ )
262
+
263
+ return weights
264
+
265
+
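A minimal usage sketch (not part of the committed file) showing how a mel basis from this function is typically applied to a power spectrogram; it assumes only the public librosa calls documented above:

```python
import numpy as np
import librosa

y, sr = librosa.load(librosa.ex("trumpet"))
S = np.abs(librosa.stft(y, n_fft=2048)) ** 2                 # (1 + n_fft//2, n_frames)
melfb = librosa.filters.mel(sr=sr, n_fft=2048, n_mels=128)   # (n_mels, 1 + n_fft//2)

# Project FFT bins onto mel bins: result is (n_mels, n_frames)
M = melfb @ S
```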
266
+ @cache(level=10)
267
+ def chroma(
268
+ *,
269
+ sr: float,
270
+ n_fft: int,
271
+ n_chroma: int = 12,
272
+ tuning: float = 0.0,
273
+ ctroct: float = 5.0,
274
+ octwidth: Union[float, None] = 2,
275
+ norm: Optional[float] = 2,
276
+ base_c: bool = True,
277
+ dtype: DTypeLike = np.float32,
278
+ ) -> np.ndarray:
279
+ """Create a chroma filter bank.
280
+
281
+ This creates a linear transformation matrix to project
282
+ FFT bins onto chroma bins (i.e. pitch classes).
283
+
284
+ Parameters
285
+ ----------
286
+ sr : number > 0 [scalar]
287
+ audio sampling rate
288
+
289
+ n_fft : int > 0 [scalar]
290
+ number of FFT bins
291
+
292
+ n_chroma : int > 0 [scalar]
293
+ number of chroma bins
294
+
295
+ tuning : float
296
+ Tuning deviation from A440 in fractions of a chroma bin.
297
+
298
+ ctroct : float > 0 [scalar]
299
+
300
+ octwidth : float > 0 or None [scalar]
301
+ ``ctroct`` and ``octwidth`` specify a dominance window:
302
+ a Gaussian weighting centered on ``ctroct`` (in octs, A0 = 27.5Hz)
303
+ and with a gaussian half-width of ``octwidth``.
304
+
305
+ Set ``octwidth`` to `None` to use a flat weighting.
306
+
307
+ norm : float > 0 or np.inf
308
+ Normalization factor for each filter
309
+
310
+ base_c : bool
311
+ If True, the filter bank will start at 'C'.
312
+ If False, the filter bank will start at 'A'.
313
+
314
+ dtype : np.dtype
315
+ The data type of the output basis.
316
+ By default, uses 32-bit (single-precision) floating point.
317
+
318
+ Returns
319
+ -------
320
+ wts : ndarray [shape=(n_chroma, 1 + n_fft / 2)]
321
+ Chroma filter matrix
322
+
323
+ See Also
324
+ --------
325
+ librosa.util.normalize
326
+ librosa.feature.chroma_stft
327
+
328
+ Notes
329
+ -----
330
+ This function caches at level 10.
331
+
332
+ Examples
333
+ --------
334
+ Build a simple chroma filter bank
335
+
336
+ >>> chromafb = librosa.filters.chroma(sr=22050, n_fft=4096)
337
+ array([[ 1.689e-05, 3.024e-04, ..., 4.639e-17, 5.327e-17],
338
+ [ 1.716e-05, 2.652e-04, ..., 2.674e-25, 3.176e-25],
339
+ ...,
340
+ [ 1.578e-05, 3.619e-04, ..., 8.577e-06, 9.205e-06],
341
+ [ 1.643e-05, 3.355e-04, ..., 1.474e-10, 1.636e-10]])
342
+
343
+ Use quarter-tones instead of semitones
344
+
345
+ >>> librosa.filters.chroma(sr=22050, n_fft=4096, n_chroma=24)
346
+ array([[ 1.194e-05, 2.138e-04, ..., 6.297e-64, 1.115e-63],
347
+ [ 1.206e-05, 2.009e-04, ..., 1.546e-79, 2.929e-79],
348
+ ...,
349
+ [ 1.162e-05, 2.372e-04, ..., 6.417e-38, 9.923e-38],
350
+ [ 1.180e-05, 2.260e-04, ..., 4.697e-50, 7.772e-50]])
351
+
352
+ Equally weight all octaves
353
+
354
+ >>> librosa.filters.chroma(sr=22050, n_fft=4096, octwidth=None)
355
+ array([[ 3.036e-01, 2.604e-01, ..., 2.445e-16, 2.809e-16],
356
+ [ 3.084e-01, 2.283e-01, ..., 1.409e-24, 1.675e-24],
357
+ ...,
358
+ [ 2.836e-01, 3.116e-01, ..., 4.520e-05, 4.854e-05],
359
+ [ 2.953e-01, 2.888e-01, ..., 7.768e-10, 8.629e-10]])
360
+
361
+ >>> import matplotlib.pyplot as plt
362
+ >>> fig, ax = plt.subplots()
363
+ >>> img = librosa.display.specshow(chromafb, x_axis='linear', ax=ax)
364
+ >>> ax.set(ylabel='Chroma filter', title='Chroma filter bank')
365
+ >>> fig.colorbar(img, ax=ax)
366
+ """
367
+
368
+ wts = np.zeros((n_chroma, n_fft))
369
+
370
+ # Get the FFT bins, not counting the DC component
371
+ frequencies = np.linspace(0, sr, n_fft, endpoint=False)[1:]
372
+
373
+ frqbins = n_chroma * hz_to_octs(
374
+ frequencies, tuning=tuning, bins_per_octave=n_chroma
375
+ )
376
+
377
+ # make up a value for the 0 Hz bin = 1.5 octaves below bin 1
378
+ # (so chroma is 50% rotated from bin 1, and bin width is broad)
379
+ frqbins = np.concatenate(([frqbins[0] - 1.5 * n_chroma], frqbins))
380
+
381
+ binwidthbins = np.concatenate((np.maximum(frqbins[1:] - frqbins[:-1], 1.0), [1]))
382
+
383
+ D = np.subtract.outer(frqbins, np.arange(0, n_chroma, dtype="d")).T
384
+
385
+ n_chroma2 = np.round(float(n_chroma) / 2)
386
+
387
+ # Project into range -n_chroma/2 .. n_chroma/2
388
+ # add on fixed offset of 10*n_chroma to ensure all values passed to
389
+ # rem are positive
390
+ D = np.remainder(D + n_chroma2 + 10 * n_chroma, n_chroma) - n_chroma2
391
+
392
+ # Gaussian bumps - 2*D to make them narrower
393
+ wts = np.exp(-0.5 * (2 * D / np.tile(binwidthbins, (n_chroma, 1))) ** 2)
394
+
395
+ # normalize each column
396
+ wts = util.normalize(wts, norm=norm, axis=0)
397
+
398
+ # Maybe apply scaling for fft bins
399
+ if octwidth is not None:
400
+ wts *= np.tile(
401
+ np.exp(-0.5 * (((frqbins / n_chroma - ctroct) / octwidth) ** 2)),
402
+ (n_chroma, 1),
403
+ )
404
+
405
+ if base_c:
406
+ wts = np.roll(wts, -3 * (n_chroma // 12), axis=0)
407
+
408
+ # remove aliasing columns, copy to ensure row-contiguity
409
+ return np.ascontiguousarray(wts[:, : int(1 + n_fft / 2)], dtype=dtype)
410
+
411
+
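Analogously to the mel sketch above, a hedged example of applying this chroma basis to a power spectrogram (the max-normalization step mirrors the pattern used in the docstring examples):

```python
import numpy as np
import librosa

y, sr = librosa.load(librosa.ex("trumpet"))
S = np.abs(librosa.stft(y, n_fft=4096)) ** 2
chromafb = librosa.filters.chroma(sr=sr, n_fft=4096)   # (12, 1 + n_fft//2)

# Wrap spectral energy onto the 12 pitch classes, then max-normalize each frame
C = librosa.util.normalize(chromafb @ S, axis=0)
```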
412
+ def __float_window(window_spec):
413
+ """Decorator function for windows with fractional input.
414
+
415
+ This function guarantees that for fractional ``x``, the following hold:
416
+
417
+ 1. ``__float_window(window_function)(x)`` has length ``np.ceil(x)``
418
+ 2. all values from ``np.floor(x)`` are set to 0.
419
+
420
+ For integer-valued ``x``, there should be no change in behavior.
421
+ """
422
+
423
+ def _wrap(n, *args, **kwargs):
424
+ """The wrapped window"""
425
+ n_min, n_max = int(np.floor(n)), int(np.ceil(n))
426
+
427
+ window = get_window(window_spec, n_min)
428
+
429
+ if len(window) < n_max:
430
+ window = np.pad(window, [(0, n_max - len(window))], mode="constant")
431
+
432
+ window[n_min:] = 0.0
433
+
434
+ return window
435
+
436
+ return _wrap
437
+
438
+
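The fractional-window contract stated in the docstring above can be checked directly. A small sketch, re-implementing the documented behavior with the public `get_window` (the wrapper itself is module-private, so `float_window` here is a hypothetical stand-in):

```python
import numpy as np
from librosa.filters import get_window

def float_window(window_spec, x):
    # Documented contract: output length is ceil(x),
    # and all samples from index floor(x) onward are zero.
    n_min, n_max = int(np.floor(x)), int(np.ceil(x))
    w = get_window(window_spec, n_min)
    w = np.pad(w, (0, n_max - len(w)))
    w[n_min:] = 0.0
    return w

w = float_window("hann", 10.3)
assert len(w) == 11 and np.all(w[10:] == 0)
```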
439
+ @deprecated(version="0.9.0", version_removed="1.0")
440
+ def constant_q(
441
+ *,
442
+ sr: float,
443
+ fmin: Optional[_FloatLike_co] = None,
444
+ n_bins: int = 84,
445
+ bins_per_octave: int = 12,
446
+ window: _WindowSpec = "hann",
447
+ filter_scale: float = 1,
448
+ pad_fft: bool = True,
449
+ norm: Optional[float] = 1,
450
+ dtype: DTypeLike = np.complex64,
451
+ gamma: float = 0,
452
+ **kwargs: Any,
453
+ ) -> Tuple[np.ndarray, np.ndarray]:
454
+ r"""Construct a constant-Q basis.
455
+
456
+ This function constructs a filter bank similar to Morlet wavelets,
457
+ where complex exponentials are windowed to different lengths
458
+ such that the number of cycles remains fixed for all frequencies.
459
+
460
+ By default, a Hann window (rather than the Gaussian window of Morlet wavelets)
461
+ is used, but this can be controlled by the ``window`` parameter.
462
+
463
+ Frequencies are spaced geometrically, increasing by a factor of
464
+ ``(2**(1./bins_per_octave))`` at each successive band.
465
+
466
+ .. warning:: This function is deprecated as of v0.9 and will be removed in 1.0.
467
+ See `librosa.filters.wavelet`.
468
+
469
+ Parameters
470
+ ----------
471
+ sr : number > 0 [scalar]
472
+ Audio sampling rate
473
+
474
+ fmin : float > 0 [scalar]
475
+ Minimum frequency. Defaults to `C1 ~= 32.70 Hz`
476
+
477
+ n_bins : int > 0 [scalar]
478
+ Number of frequencies. Defaults to 7 octaves (84 bins).
479
+
480
+ bins_per_octave : int > 0 [scalar]
481
+ Number of bins per octave
482
+
483
+ window : string, tuple, number, or function
484
+ Windowing function to apply to filters.
485
+
486
+ filter_scale : float > 0 [scalar]
487
+ Scale of filter windows.
488
+ Small values (<1) use shorter windows for higher temporal resolution.
489
+
490
+ pad_fft : boolean
491
+ Center-pad all filters up to the nearest integral power of 2.
492
+
493
+ By default, padding is done with zeros, but this can be overridden
494
+ by setting the ``mode=`` field in *kwargs*.
495
+
496
+ norm : {inf, -inf, 0, float > 0}
497
+ Type of norm to use for basis function normalization.
498
+ See librosa.util.normalize
499
+
500
+ gamma : number >= 0
501
+ Bandwidth offset for variable-Q transforms.
502
+ ``gamma=0`` produces a constant-Q filterbank.
503
+
504
+ dtype : np.dtype
505
+ The data type of the output basis.
506
+ By default, uses 64-bit (single precision) complex floating point.
507
+
508
+ **kwargs : additional keyword arguments
509
+ Arguments to `np.pad()` when ``pad_fft=True``.
510
+
511
+ Returns
512
+ -------
513
+ filters : np.ndarray, ``len(filters) == n_bins``
514
+ ``filters[i]`` is ``i``\ th time-domain CQT basis filter
515
+ lengths : np.ndarray, ``len(lengths) == n_bins``
516
+ The (fractional) length of each filter
517
+
518
+ Notes
519
+ -----
520
+ This function caches at level 10.
521
+
522
+ See Also
523
+ --------
524
+ wavelet
525
+ constant_q_lengths
526
+ librosa.cqt
527
+ librosa.vqt
528
+ librosa.util.normalize
529
+
530
+ Examples
531
+ --------
532
+ Use a shorter window for each filter
533
+
534
+ >>> basis, lengths = librosa.filters.constant_q(sr=22050, filter_scale=0.5)
535
+
536
+ Plot one octave of filters in time and frequency
537
+
538
+ >>> import matplotlib.pyplot as plt
539
+ >>> basis, lengths = librosa.filters.constant_q(sr=22050)
540
+ >>> fig, ax = plt.subplots(nrows=2, figsize=(10, 6))
541
+ >>> notes = librosa.midi_to_note(np.arange(24, 24 + len(basis)))
542
+ >>> for i, (f, n) in enumerate(zip(basis, notes[:12])):
543
+ ... f_scale = librosa.util.normalize(f) / 2
544
+ ... ax[0].plot(i + f_scale.real)
545
+ ... ax[0].plot(i + f_scale.imag, linestyle=':')
546
+ >>> ax[0].set(yticks=np.arange(len(notes[:12])), yticklabels=notes[:12],
547
+ ... ylabel='CQ filters',
548
+ ... title='CQ filters (one octave, time domain)',
549
+ ... xlabel='Time (samples at 22050 Hz)')
550
+ >>> ax[0].legend(['Real', 'Imaginary'])
551
+ >>> F = np.abs(np.fft.fftn(basis, axes=[-1]))
552
+ >>> # Keep only the positive frequencies
553
+ >>> F = F[:, :(1 + F.shape[1] // 2)]
554
+ >>> librosa.display.specshow(F, x_axis='linear', y_axis='cqt_note', ax=ax[1])
555
+ >>> ax[1].set(ylabel='CQ filters', title='CQ filter magnitudes (frequency domain)')
556
+ """
557
+
558
+ if fmin is None:
559
+ fmin = note_to_hz("C1")
560
+
561
+ # Pass-through parameters to get the filter lengths
562
+ lengths = constant_q_lengths(
563
+ sr=sr,
564
+ fmin=fmin,
565
+ n_bins=n_bins,
566
+ bins_per_octave=bins_per_octave,
567
+ window=window,
568
+ filter_scale=filter_scale,
569
+ gamma=gamma,
570
+ )
571
+
572
+ freqs = fmin * (2.0 ** (np.arange(n_bins, dtype=float) / bins_per_octave))
573
+
574
+ # Build the filters
575
+ filters = []
576
+ for ilen, freq in zip(lengths, freqs):
577
+ # Build the filter: note, length will be ceil(ilen)
578
+ sig = util.phasor(
579
+ np.arange(-ilen // 2, ilen // 2, dtype=float) * 2 * np.pi * freq / sr
580
+ )
581
+
582
+ # Apply the windowing function
583
+ sig = sig * __float_window(window)(len(sig))
584
+
585
+ # Normalize
586
+ sig = util.normalize(sig, norm=norm)
587
+
588
+ filters.append(sig)
589
+
590
+ # Pad and stack
591
+ max_len = max(lengths)
592
+ if pad_fft:
593
+ max_len = int(2.0 ** (np.ceil(np.log2(max_len))))
594
+ else:
595
+ max_len = int(np.ceil(max_len))
596
+
597
+ filters = np.asarray(
598
+ [util.pad_center(filt, size=max_len, **kwargs) for filt in filters], dtype=dtype
599
+ )
600
+
601
+ return filters, np.asarray(lengths)
602
+
603
+
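Since `constant_q` is deprecated in favor of `wavelet`, here is a sketch of the equivalent migration; the explicit geometric frequency grid is an assumption that matches the spacing rule stated in the docstring above:

```python
import numpy as np
import librosa

sr, fmin, n_bins, bpo = 22050, librosa.note_to_hz("C1"), 84, 12

# Deprecated form:
# basis, lengths = librosa.filters.constant_q(sr=sr, fmin=fmin, n_bins=n_bins)

# Replacement: pass the frequency grid explicitly
freqs = fmin * 2.0 ** (np.arange(n_bins) / bpo)
basis, lengths = librosa.filters.wavelet(freqs=freqs, sr=sr)
```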
604
+ @deprecated(version="0.9.0", version_removed="1.0")
605
+ @cache(level=10)
606
+ def constant_q_lengths(
607
+ *,
608
+ sr: float,
609
+ fmin: _FloatLike_co,
610
+ n_bins: int = 84,
611
+ bins_per_octave: int = 12,
612
+ window: _WindowSpec = "hann",
613
+ filter_scale: float = 1,
614
+ gamma: float = 0,
615
+ ) -> np.ndarray:
616
+ r"""Return length of each filter in a constant-Q basis.
617
+
618
+ .. warning:: This function is deprecated as of v0.9 and will be removed in 1.0.
619
+ See `librosa.filters.wavelet_lengths`.
620
+
621
+ Parameters
622
+ ----------
623
+ sr : number > 0 [scalar]
624
+ Audio sampling rate
625
+ fmin : float > 0 [scalar]
626
+ Minimum frequency bin.
627
+ n_bins : int > 0 [scalar]
628
+ Number of frequencies. Defaults to 7 octaves (84 bins).
629
+ bins_per_octave : int > 0 [scalar]
630
+ Number of bins per octave
631
+ window : str or callable
632
+ Window function to use on filters
633
+ filter_scale : float > 0 [scalar]
634
+ Resolution of filter windows. Larger values use longer windows.
635
+ gamma : number >= 0
636
+ Bandwidth offset for variable-Q transforms.
637
+ ``gamma=0`` produces a constant-Q filterbank.
638
+
639
+ Returns
640
+ -------
641
+ lengths : np.ndarray
642
+ The length of each filter.
643
+
644
+ Notes
645
+ -----
646
+ This function caches at level 10.
647
+
648
+ See Also
649
+ --------
650
+ wavelet_lengths
651
+ """
652
+
653
+ if fmin <= 0:
654
+ raise ParameterError("fmin must be strictly positive")
655
+
656
+ if bins_per_octave <= 0:
657
+ raise ParameterError("bins_per_octave must be positive")
658
+
659
+ if filter_scale <= 0:
660
+ raise ParameterError("filter_scale must be positive")
661
+
662
+ if n_bins <= 0 or not isinstance(n_bins, (int, np.integer)):
663
+ raise ParameterError("n_bins must be a positive integer")
664
+
665
+ # Compute the frequencies
666
+ freq = fmin * (2.0 ** (np.arange(n_bins, dtype=float) / bins_per_octave))
667
+
668
+ # Q should be capitalized here, so we suppress the name warning
669
+ # pylint: disable=invalid-name
670
+ #
671
+ # Balance filter bandwidths
672
+ alpha = (2.0 ** (2 / bins_per_octave) - 1) / (2.0 ** (2 / bins_per_octave) + 1)
673
+ Q = float(filter_scale) / alpha
674
+
675
+ if max(freq * (1 + 0.5 * window_bandwidth(window) / Q)) > sr / 2.0:
676
+ raise ParameterError(
677
+ f"Maximum filter frequency={max(freq):.2f} would exceed Nyquist={sr/2}"
678
+ )
679
+
680
+ # Convert frequencies to filter lengths
681
+ lengths: np.ndarray = Q * sr / (freq + gamma / alpha)
682
+
683
+ return lengths
684
+
685
+
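A worked numeric check of the bandwidth-balancing arithmetic above, assuming the default 12 bins per octave and ``filter_scale=1``:

```python
import numpy as np

bins_per_octave, filter_scale = 12, 1.0
alpha = (2.0 ** (2 / bins_per_octave) - 1) / (2.0 ** (2 / bins_per_octave) + 1)
Q = filter_scale / alpha          # ~17.3 cycles per filter

# Length (in samples) of a filter centered at 440 Hz with sr=22050, gamma=0
length = Q * 22050 / 440.0        # ~868 samples
print(alpha, Q, length)
```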
686
+ @cache(level=10)
687
+ def wavelet_lengths(
688
+ *,
689
+ freqs: ArrayLike,
690
+ sr: float = 22050,
691
+ window: _WindowSpec = "hann",
692
+ filter_scale: float = 1,
693
+ gamma: Optional[float] = 0,
694
+ alpha: Optional[Union[float, np.ndarray]] = None,
695
+ ) -> Tuple[np.ndarray, float]:
696
+ """Return length of each filter in a wavelet basis.
697
+
698
+ Parameters
699
+ ----------
700
+ freqs : np.ndarray (positive)
701
+ Center frequencies of the filters (in Hz).
702
+ Must be in ascending order.
703
+
704
+ sr : number > 0 [scalar]
705
+ Audio sampling rate
706
+
707
+ window : str or callable
708
+ Window function to use on filters
709
+
710
+ filter_scale : float > 0 [scalar]
711
+ Resolution of filter windows. Larger values use longer windows.
712
+
713
+ gamma : number >= 0 [scalar, optional]
714
+ Bandwidth offset for determining filter lengths, as used in
715
+ Variable-Q transforms.
716
+
717
+ Bandwidth for the k'th filter is determined by::
718
+
719
+ B[k] = alpha[k] * freqs[k] + gamma
720
+
721
+ ``alpha[k]`` is twice the relative difference between ``freqs[k+1]`` and ``freqs[k-1]``::
722
+
723
+ alpha[k] = (freqs[k+1]-freqs[k-1]) / (freqs[k+1]+freqs[k-1])
724
+
725
+ If ``freqs`` follows a geometric progression (as in CQT and VQT), the vector
726
+ ``alpha`` is constant and such that::
727
+
728
+ (1 + alpha) * freqs[k-1] = (1 - alpha) * freqs[k+1]
729
+
730
+ Furthermore, if ``gamma=0`` (default), ``alpha`` is such that even-``k`` and
731
+ odd-``k`` filters are interleaved::
732
+
733
+ freqs[k-1] + B[k-1] = freqs[k+1] - B[k+1]
734
+
735
+ If ``gamma=None`` is specified, then ``gamma`` is computed such
736
+ that each filter has bandwidth proportional to the equivalent
737
+ rectangular bandwidth (ERB) at frequency ``freqs[k]``::
738
+
739
+ gamma[k] = 24.7 * alpha[k] / 0.108
740
+
741
+ as derived by [#]_.
742
+
743
+ .. [#] Glasberg, Brian R., and Brian CJ Moore.
744
+ "Derivation of auditory filter shapes from notched-noise data."
745
+ Hearing research 47.1-2 (1990): 103-138.
746
+
747
+ alpha : number > 0 [optional]
748
+ If only one frequency is provided (``len(freqs)==1``), then filter bandwidth
749
+ cannot be computed. In that case, the ``alpha`` parameter described above
750
+ can be explicitly specified here.
751
+
752
+ If two or more frequencies are provided, this parameter is ignored.
753
+
754
+ Returns
755
+ -------
756
+ lengths : np.ndarray
757
+ The length of each filter.
758
+ f_cutoff : float
759
+ The lowest frequency at which all filters' main lobes have decayed by
760
+ at least 3dB.
761
+
762
+ This second output serves in cqt and vqt to ensure that all wavelet
763
+ bands remain below the Nyquist frequency.
764
+
765
+ Notes
766
+ -----
767
+ This function caches at level 10.
768
+
769
+ Raises
770
+ ------
771
+ ParameterError
772
+ - If ``filter_scale`` is not strictly positive
773
+
774
+ - If ``gamma`` is a negative number
775
+
776
+ - If any frequencies are <= 0
777
+
778
+ - If the frequency array is not sorted in ascending order
779
+ """
780
+ freqs = np.asarray(freqs)
781
+ if filter_scale <= 0:
782
+ raise ParameterError(f"filter_scale={filter_scale} must be positive")
783
+
784
+ if gamma is not None and gamma < 0:
785
+ raise ParameterError(f"gamma={gamma} must be non-negative")
786
+
787
+ if np.any(freqs <= 0):
788
+ raise ParameterError("frequencies must be strictly positive")
789
+
790
+ if len(freqs) > 1 and np.any(freqs[:-1] > freqs[1:]):
791
+ raise ParameterError(
792
+ f"Frequency array={freqs} must be in strictly ascending order"
793
+ )
794
+
795
+ # We need at least 2 frequencies to infer alpha
796
+ if len(freqs) > 1:
797
+ # Approximate the local octave resolution
798
+ bpo = np.empty(len(freqs))
799
+ logf = np.log2(freqs)
800
+ bpo[0] = 1 / (logf[1] - logf[0])
801
+ bpo[-1] = 1 / (logf[-1] - logf[-2])
802
+ bpo[1:-1] = 2 / (logf[2:] - logf[:-2])
803
+
804
+ alpha = (2.0 ** (2 / bpo) - 1) / (2.0 ** (2 / bpo) + 1)
805
+ if alpha is None:
806
+ raise ParameterError(
807
+ "Cannot construct a wavelet basis for a single frequency if alpha is not provided"
808
+ )
809
+
810
+ gamma_: Union[_FloatLike_co, np.ndarray]
811
+ if gamma is None:
812
+ gamma_ = alpha * 24.7 / 0.108
813
+ else:
814
+ gamma_ = gamma
815
+ # Q should be capitalized here, so we suppress the name warning
816
+ # pylint: disable=invalid-name
817
+ Q = float(filter_scale) / alpha
818
+
819
+ # How far up does our highest frequency reach?
820
+ f_cutoff = max(freqs * (1 + 0.5 * window_bandwidth(window) / Q) + 0.5 * gamma_)
821
+
822
+ # Convert frequencies to filter lengths
823
+ lengths = Q * sr / (freqs + gamma_ / alpha)
824
+
825
+ return lengths, f_cutoff
826
+
827
+
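A short sketch of the two return values in practice, using the same CQT-style frequency grid as the `wavelet` docstring below; the monotonicity and Nyquist checks simply restate the properties documented above:

```python
import librosa

freqs = librosa.cqt_frequencies(n_bins=84, fmin=librosa.note_to_hz("C1"))
lengths, f_cutoff = librosa.filters.wavelet_lengths(freqs=freqs, sr=22050)

# Lower frequencies need longer filters; the cutoff must stay below Nyquist
assert lengths[0] == max(lengths)
assert f_cutoff <= 22050 / 2
```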
828
+ @cache(level=10)
829
+ def wavelet(
830
+ *,
831
+ freqs: np.ndarray,
832
+ sr: float = 22050,
833
+ window: _WindowSpec = "hann",
834
+ filter_scale: float = 1,
835
+ pad_fft: bool = True,
836
+ norm: Optional[float] = 1,
837
+ dtype: DTypeLike = np.complex64,
838
+ gamma: float = 0,
839
+ alpha: Optional[float] = None,
840
+ **kwargs: Any,
841
+ ) -> Tuple[np.ndarray, np.ndarray]:
842
+ """Construct a wavelet basis using windowed complex sinusoids.
843
+
844
+ This function constructs a wavelet filterbank at a specified set of center
845
+ frequencies.
846
+
847
+ Parameters
848
+ ----------
849
+ freqs : np.ndarray (positive)
850
+ Center frequencies of the filters (in Hz).
851
+ Must be in ascending order.
852
+
853
+ sr : number > 0 [scalar]
854
+ Audio sampling rate
855
+
856
+ window : string, tuple, number, or function
857
+ Windowing function to apply to filters.
858
+
859
+ filter_scale : float > 0 [scalar]
860
+ Scale of filter windows.
861
+ Small values (<1) use shorter windows for higher temporal resolution.
862
+
863
+ pad_fft : boolean
864
+ Center-pad all filters up to the nearest integral power of 2.
865
+
866
+ By default, padding is done with zeros, but this can be overridden
867
+ by setting the ``mode=`` field in *kwargs*.
868
+
869
+ norm : {inf, -inf, 0, float > 0}
870
+ Type of norm to use for basis function normalization.
871
+ See librosa.util.normalize
872
+
873
+ gamma : number >= 0
874
+ Bandwidth offset for variable-Q transforms.
875
+
876
+ dtype : np.dtype
877
+ The data type of the output basis.
878
+ By default, uses 64-bit (single precision) complex floating point.
879
+
880
+ alpha : number > 0 [optional]
881
+ If only one frequency is provided (``len(freqs)==1``), then filter bandwidth
882
+ cannot be computed. In that case, the ``alpha`` parameter described in `wavelet_lengths`
883
+ can be explicitly specified here.
884
+
885
+ If two or more frequencies are provided, this parameter is ignored.
886
+
887
+ **kwargs : additional keyword arguments
888
+ Arguments to `np.pad()` when ``pad_fft=True``.
889
+
890
+ Returns
891
+ -------
892
+ filters : np.ndarray, ``len(filters) == n_bins``
893
+ each ``filters[i]`` is a (complex) time-domain filter
894
+ lengths : np.ndarray, ``len(lengths) == n_bins``
895
+ The (fractional) length of each filter in samples
896
+
897
+ Notes
898
+ -----
899
+ This function caches at level 10.
900
+
901
+ See Also
902
+ --------
903
+ wavelet_lengths
904
+ librosa.cqt
905
+ librosa.vqt
906
+ librosa.util.normalize
907
+
908
+ Examples
909
+ --------
910
+ Create a constant-Q basis
911
+
912
+ >>> freqs = librosa.cqt_frequencies(n_bins=84, fmin=librosa.note_to_hz('C1'))
913
+ >>> basis, lengths = librosa.filters.wavelet(freqs=freqs, sr=22050)
914
+
915
+ Plot one octave of filters in time and frequency
916
+
917
+ >>> import matplotlib.pyplot as plt
918
+ >>> basis, lengths = librosa.filters.wavelet(freqs=freqs, sr=22050)
919
+ >>> fig, ax = plt.subplots(nrows=2, figsize=(10, 6))
920
+ >>> notes = librosa.midi_to_note(np.arange(24, 24 + len(basis)))
921
+ >>> for i, (f, n) in enumerate(zip(basis, notes[:12])):
922
+ ... f_scale = librosa.util.normalize(f) / 2
923
+ ... ax[0].plot(i + f_scale.real)
924
+ ... ax[0].plot(i + f_scale.imag, linestyle=':')
925
+ >>> ax[0].set(yticks=np.arange(len(notes[:12])), yticklabels=notes[:12],
926
+ ... ylabel='CQ filters',
927
+ ... title='CQ filters (one octave, time domain)',
928
+ ... xlabel='Time (samples at 22050 Hz)')
929
+ >>> ax[0].legend(['Real', 'Imaginary'])
930
+ >>> F = np.abs(np.fft.fftn(basis, axes=[-1]))
931
+ >>> # Keep only the positive frequencies
932
+ >>> F = F[:, :(1 + F.shape[1] // 2)]
933
+ >>> librosa.display.specshow(F, x_axis='linear', y_axis='cqt_note', ax=ax[1])
934
+ >>> ax[1].set(ylabel='CQ filters', title='CQ filter magnitudes (frequency domain)')
935
+ """
936
+
937
+ # Pass-through parameters to get the filter lengths
938
+ lengths, _ = wavelet_lengths(
939
+ freqs=freqs,
940
+ sr=sr,
941
+ window=window,
942
+ filter_scale=filter_scale,
943
+ gamma=gamma,
944
+ alpha=alpha,
945
+ )
946
+
947
+ # Build the filters
948
+ filters = []
949
+ for ilen, freq in zip(lengths, freqs):
950
+ # Build the filter: note, length will be ceil(ilen)
951
+ sig = util.phasor(
952
+ np.arange(-ilen // 2, ilen // 2, dtype=float) * 2 * np.pi * freq / sr
953
+ )
954
+
955
+ # Apply the windowing function
956
+ sig *= __float_window(window)(len(sig))
957
+
958
+ # Normalize
959
+ sig = util.normalize(sig, norm=norm)
960
+
961
+ filters.append(sig)
962
+
963
+ # Pad and stack
964
+ max_len = max(lengths)
965
+ if pad_fft:
966
+ max_len = int(2.0 ** (np.ceil(np.log2(max_len))))
967
+ else:
968
+ max_len = int(np.ceil(max_len))
969
+
970
+ filters = np.asarray(
971
+ [util.pad_center(filt, size=max_len, **kwargs) for filt in filters], dtype=dtype
972
+ )
973
+
974
+ return filters, lengths
975
+
976
+
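A quick sanity sketch relating the two outputs: each filter is zero-padded to a common power-of-two length (``pad_fft=True``), but its nonzero support should be roughly ``ceil(lengths[i])`` samples (roughly, because a periodic Hann window is itself zero at its first sample):

```python
import numpy as np
import librosa

freqs = librosa.cqt_frequencies(n_bins=12, fmin=librosa.note_to_hz("C3"))
basis, lengths = librosa.filters.wavelet(freqs=freqs, sr=22050)

# Compare the reported fractional lengths with the actual nonzero support
support = np.sum(np.abs(basis) > 0, axis=1)
print(np.ceil(lengths).astype(int), support)
```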
977
+ @cache(level=10)
978
+ def cq_to_chroma(
979
+ n_input: int,
980
+ *,
981
+ bins_per_octave: int = 12,
982
+ n_chroma: int = 12,
983
+ fmin: Optional[_FloatLike_co] = None,
984
+ window: Optional[np.ndarray] = None,
985
+ base_c: bool = True,
986
+ dtype: DTypeLike = np.float32,
987
+ ) -> np.ndarray:
988
+ """Construct a linear transformation matrix to map Constant-Q bins
989
+ onto chroma bins (i.e., pitch classes).
990
+
991
+ Parameters
992
+ ----------
993
+ n_input : int > 0 [scalar]
994
+ Number of input components (CQT bins)
995
+ bins_per_octave : int > 0 [scalar]
996
+ How many bins per octave in the CQT
997
+ n_chroma : int > 0 [scalar]
998
+ Number of output bins (per octave) in the chroma
999
+ fmin : None or float > 0
1000
+ Center frequency of the first constant-Q channel.
1001
+ Default: 'C1' ~= 32.7 Hz
1002
+ window : None or np.ndarray
1003
+ If provided, the cq_to_chroma filter bank will be
1004
+ convolved with ``window``.
1005
+ base_c : bool
1006
+ If True, the first chroma bin will start at 'C'
1007
+ If False, the first chroma bin will start at 'A'
1008
+ dtype : np.dtype
1009
+ The data type of the output basis.
1010
+ By default, uses 32-bit (single-precision) floating point.
1011
+
1012
+ Returns
1013
+ -------
1014
+ cq_to_chroma : np.ndarray [shape=(n_chroma, n_input)]
1015
+ Transformation matrix: ``Chroma = np.dot(cq_to_chroma, CQT)``
1016
+
1017
+ Raises
1018
+ ------
1019
+ ParameterError
1020
+ If ``n_input`` is not an integer multiple of ``n_chroma``
1021
+
1022
+ Notes
1023
+ -----
1024
+ This function caches at level 10.
1025
+
1026
+ Examples
1027
+ --------
1028
+ Get a CQT, and wrap bins to chroma
1029
+
1030
+ >>> y, sr = librosa.load(librosa.ex('trumpet'))
1031
+ >>> CQT = np.abs(librosa.cqt(y, sr=sr))
1032
+ >>> chroma_map = librosa.filters.cq_to_chroma(CQT.shape[0])
1033
+ >>> chromagram = chroma_map.dot(CQT)
1034
+ >>> # Max-normalize each time step
1035
+ >>> chromagram = librosa.util.normalize(chromagram, axis=0)
1036
+
1037
+ >>> import matplotlib.pyplot as plt
1038
+ >>> fig, ax = plt.subplots(nrows=3, sharex=True)
1039
+ >>> imgcq = librosa.display.specshow(librosa.amplitude_to_db(CQT,
1040
+ ... ref=np.max),
1041
+ ... y_axis='cqt_note', x_axis='time',
1042
+ ... ax=ax[0])
1043
+ >>> ax[0].set(title='CQT Power')
1044
+ >>> ax[0].label_outer()
1045
+ >>> librosa.display.specshow(chromagram, y_axis='chroma', x_axis='time',
1046
+ ... ax=ax[1])
1047
+ >>> ax[1].set(title='Chroma (wrapped CQT)')
1048
+ >>> ax[1].label_outer()
1049
+ >>> chroma = librosa.feature.chroma_stft(y=y, sr=sr)
1050
+ >>> imgchroma = librosa.display.specshow(chroma, y_axis='chroma', x_axis='time', ax=ax[2])
1051
+ >>> ax[2].set(title='librosa.feature.chroma_stft')
1052
+ """
1053
+
1054
+ # How many fractional bins are we merging?
1055
+ n_merge = float(bins_per_octave) / n_chroma
1056
+
1057
+ fmin_: _FloatLike_co
1058
+ if fmin is None:
1059
+ fmin_ = note_to_hz("C1")
1060
+ else:
1061
+ fmin_ = fmin
1062
+
1063
+ if np.mod(n_merge, 1) != 0:
1064
+ raise ParameterError(
1065
+ "Incompatible CQ merge: "
1066
+ "input bins must be an "
1067
+ "integer multiple of output bins."
1068
+ )
1069
+
1070
+ # Tile the identity to merge fractional bins
1071
+ cq_to_ch = np.repeat(np.eye(n_chroma), int(n_merge), axis=1)
1072
+
1073
+ # Roll it left to center on the target bin
1074
+ cq_to_ch = np.roll(cq_to_ch, -int(n_merge // 2), axis=1)
1075
+
1076
+ # How many octaves are we repeating?
1077
+ n_octaves = np.ceil(float(n_input) / bins_per_octave)
1078
+
1079
+ # Repeat and trim
1080
+ cq_to_ch = np.tile(cq_to_ch, int(n_octaves))[:, :n_input]
1081
+
1082
+ # What's the note number of the first bin in the CQT?
1083
+ # midi uses 12 bins per octave here
1084
+ midi_0 = np.mod(hz_to_midi(fmin_), 12)
1085
+
1086
+ if base_c:
1087
+ # rotate to C
1088
+ roll = midi_0
1089
+ else:
1090
+ # rotate to A
1091
+ roll = midi_0 - 9
1092
+
1093
+ # Adjust the roll in terms of how many chroma we want out
1094
+ # We need to be careful with rounding here
1095
+ roll = int(np.round(roll * (n_chroma / 12.0)))
1096
+
1097
+ # Apply the roll
1098
+ cq_to_ch = np.roll(cq_to_ch, roll, axis=0).astype(dtype)
1099
+
1100
+ if window is not None:
1101
+ cq_to_ch = scipy.signal.convolve(cq_to_ch, np.atleast_2d(window), mode="same")
1102
+
1103
+ return cq_to_ch
1104
+
1105
+
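A small sketch of the integer-multiple constraint raised above, using a 36-bins-per-octave CQT:

```python
import librosa

# 36 CQT bins per octave wrap cleanly onto 12 chroma bins (36 / 12 = 3)
fb = librosa.filters.cq_to_chroma(108, bins_per_octave=36, n_chroma=12)
print(fb.shape)   # (12, 108)

# n_chroma=24 would raise ParameterError here, since 36 / 24 = 1.5:
# librosa.filters.cq_to_chroma(108, bins_per_octave=36, n_chroma=24)
```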
1106
+ @cache(level=10)
1107
+ def window_bandwidth(window: _WindowSpec, n: int = 1000) -> float:
1108
+ """Get the equivalent noise bandwidth (ENBW) of a window function.
1109
+
1110
+ The ENBW of a window is defined by [#]_ (equation 11) as the normalized
1111
+ ratio of the sum of squares to the square of sums::
1112
+
1113
+ enbw = n * sum(window**2) / sum(window)**2
1114
+
1115
+ .. [#] Harris, F. J.
1116
+ "On the use of windows for harmonic analysis with the discrete Fourier transform."
1117
+ Proceedings of the IEEE, 66(1), 51-83. 1978.
1118
+
1119
+ Parameters
1120
+ ----------
1121
+ window : callable or string
1122
+ A window function, or the name of a window function.
1123
+ Examples:
1124
+ - scipy.signal.hann
1125
+ - 'boxcar'
1126
+ n : int > 0
1127
+ The number of coefficients to use in estimating the
1128
+ window bandwidth
1129
+
1130
+ Returns
1131
+ -------
1132
+ bandwidth : float
1133
+ The equivalent noise bandwidth (in FFT bins) of the
1134
+ given window function
1135
+
1136
+ Notes
1137
+ -----
1138
+ This function caches at level 10.
1139
+
1140
+ See Also
1141
+ --------
1142
+ get_window
1143
+ """
1144
+
1145
+ if hasattr(window, "__name__"):
1146
+ key = window.__name__
1147
+ else:
1148
+ key = window
1149
+
1150
+ if key not in WINDOW_BANDWIDTHS:
1151
+ win = get_window(window, n)
1152
+ WINDOW_BANDWIDTHS[key] = (
1153
+ n * np.sum(win**2) / (np.sum(win) ** 2 + util.tiny(win))
1154
+ )
1155
+
1156
+ return WINDOW_BANDWIDTHS[key]
1157
+
1158
+
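The ENBW formula quoted in the docstring can be verified numerically against the precomputed table; a sketch for the Hann window, whose ENBW is 1.5 bins:

```python
import numpy as np
from librosa.filters import get_window, window_bandwidth

n = 1000
win = get_window("hann", n)
enbw = n * np.sum(win**2) / np.sum(win) ** 2

# Closely agrees with the cached table entry (~1.5 for Hann)
print(enbw, window_bandwidth("hann"))
```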
1159
+ @cache(level=10)
1160
+ def get_window(
1161
+ window: _WindowSpec,
1162
+ Nx: int,
1163
+ *,
1164
+ fftbins: Optional[bool] = True,
1165
+ ) -> np.ndarray:
1166
+ """Compute a window function.
1167
+
1168
+ This is a wrapper for `scipy.signal.get_window` that additionally
1169
+ supports callable or pre-computed windows.
1170
+
1171
+ Parameters
1172
+ ----------
1173
+ window : string, tuple, number, callable, or list-like
1174
+ The window specification:
1175
+
1176
+ - If string, it's the name of the window function (e.g., `'hann'`)
1177
+ - If tuple, it's the name of the window function and any parameters
1178
+ (e.g., `('kaiser', 4.0)`)
1179
+ - If numeric, it is treated as the beta parameter of the `'kaiser'`
1180
+ window, as in `scipy.signal.get_window`.
1181
+ - If callable, it's a function that accepts one integer argument
1182
+ (the window length)
1183
+ - If list-like, it's a pre-computed window of the correct length `Nx`
1184
+
1185
+ Nx : int > 0
1186
+ The length of the window
1187
+
1188
+ fftbins : bool, optional
1189
+ If True (default), create a periodic window for use with FFT
1190
+ If False, create a symmetric window for filter design applications.
1191
+
1192
+ Returns
1193
+ -------
1194
+ get_window : np.ndarray
1195
+ A window of length `Nx` and type `window`
1196
+
1197
+ See Also
1198
+ --------
1199
+ scipy.signal.get_window
1200
+
1201
+ Notes
1202
+ -----
1203
+ This function caches at level 10.
1204
+
1205
+ Raises
1206
+ ------
1207
+ ParameterError
1208
+ If `window` is supplied as a vector of length != `Nx`,
1209
+ or is otherwise mis-specified.
1210
+ """
1211
+ if callable(window):
1212
+ return window(Nx)
1213
+
1214
+ elif isinstance(window, (str, tuple)) or np.isscalar(window):
1215
+ # TODO: if we add custom window functions in librosa, call them here
1216
+
1217
+ win: np.ndarray = scipy.signal.get_window(window, Nx, fftbins=fftbins)
1218
+ return win
1219
+
1220
+ elif isinstance(window, (np.ndarray, list)):
1221
+ if len(window) == Nx:
1222
+ return np.asarray(window)
1223
+
1224
+ raise ParameterError(f"Window size mismatch: {len(window):d} != {Nx:d}")
1225
+ else:
1226
+ raise ParameterError(f"Invalid window specification: {window!r}")
1227
+
1228
+
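A sketch exercising each of the window specifications accepted above; all five forms produce a window of the requested length:

```python
import numpy as np
import scipy.signal
from librosa.filters import get_window

w1 = get_window("hann", 1024)                      # by name
w2 = get_window(("kaiser", 4.0), 1024)             # name + parameter
w3 = get_window(4.0, 1024)                         # scalar => kaiser beta
w4 = get_window(scipy.signal.windows.hann, 1024)   # callable
w5 = get_window(np.ones(1024), 1024)               # pre-computed vector

assert all(len(w) == 1024 for w in (w1, w2, w3, w4, w5))
```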
1229
+ @cache(level=10)
1230
+ def _multirate_fb(
1231
+ center_freqs: Optional[np.ndarray] = None,
1232
+ sample_rates: Optional[np.ndarray] = None,
1233
+ Q: float = 25.0,
1234
+ passband_ripple: float = 1,
1235
+ stopband_attenuation: float = 50,
1236
+ ftype: str = "ellip",
1237
+ flayout: str = "sos",
1238
+ ) -> Tuple[List[Any], np.ndarray]:
1239
+ r"""Helper function to construct a multirate filterbank.
1240
+
1241
+ A filter bank consists of multiple band-pass filters which divide the input signal
1242
+ into subbands. In the case of a multirate filter bank, the band-pass filters
1243
+ operate with resampled versions of the input signal, e.g. to keep the length
1244
+ of a filter constant while shifting its center frequency.
1245
+
1246
+ This implementation uses `scipy.signal.iirdesign` to design the filters.
1247
+
1248
+ Parameters
1249
+ ----------
1250
+ center_freqs : np.ndarray [shape=(n,), dtype=float]
1251
+ Center frequencies of the filter kernels.
1252
+ Also defines the number of filters in the filterbank.
1253
+
1254
+ sample_rates : np.ndarray [shape=(n,), dtype=float]
1255
+ Samplerate for each filter (used for multirate filterbank).
1256
+
1257
+ Q : float
1258
+ Q factor (influences the filter bandwidth).
1259
+
1260
+ passband_ripple : float
1261
+ The maximum loss in the passband (dB)
1262
+ See `scipy.signal.iirdesign` for details.
1263
+
1264
+ stopband_attenuation : float
1265
+ The minimum attenuation in the stopband (dB)
1266
+ See `scipy.signal.iirdesign` for details.
1267
+
1268
+ ftype : str
1269
+ The type of IIR filter to design
1270
+ See `scipy.signal.iirdesign` for details.
1271
+
1272
+ flayout : string
1273
+ Valid `output` argument for `scipy.signal.iirdesign`.
1274
+
1275
+ - If `ba`, returns numerators/denominators of the transfer functions,
1276
+ used for filtering with `scipy.signal.filtfilt`.
1277
+ Can be unstable for high-order filters.
1278
+
1279
+ - If `sos`, returns a series of second-order filters,
1280
+ used for filtering with `scipy.signal.sosfiltfilt`.
1281
+ Minimizes numerical precision errors for high-order filters, but is slower.
1282
+
1283
+ - If `zpk`, returns zeros, poles, and system gains of the transfer functions.
1284
+
1285
+ Returns
1286
+ -------
1287
+ filterbank : list [shape=(n,), dtype=float]
1288
+ Each list entry comprises the filter coefficients for a single filter.
1289
+ sample_rates : np.ndarray [shape=(n,), dtype=float]
1290
+ Samplerate for each filter.
1291
+
1292
+ Notes
1293
+ -----
1294
+ This function caches at level 10.
1295
+
1296
+ See Also
1297
+ --------
1298
+ scipy.signal.iirdesign
1299
+
1300
+ Raises
1301
+ ------
1302
+ ParameterError
1303
+ If ``center_freqs`` is ``None``.
1304
+ If ``sample_rates`` is ``None``.
1305
+ If ``center_freqs.shape`` does not match ``sample_rates.shape``.
1306
+ """
1307
+
1308
+ if center_freqs is None:
1309
+ raise ParameterError("center_freqs must be provided.")
1310
+
1311
+ if sample_rates is None:
1312
+ raise ParameterError("sample_rates must be provided.")
1313
+
1314
+ if center_freqs.shape != sample_rates.shape:
1315
+ raise ParameterError(
1316
+ "Number of provided center_freqs and sample_rates must be equal."
1317
+ )
1318
+
1319
+ nyquist = 0.5 * sample_rates
1320
+ filter_bandwidths = center_freqs / float(Q)
1321
+
1322
+ filterbank = []
1323
+
1324
+ for cur_center_freq, cur_nyquist, cur_bw in zip(
1325
+ center_freqs, nyquist, filter_bandwidths
1326
+ ):
1327
+ passband_freqs = [
1328
+ cur_center_freq - 0.5 * cur_bw,
1329
+ cur_center_freq + 0.5 * cur_bw,
1330
+ ] / cur_nyquist
1331
+ stopband_freqs = [
1332
+ cur_center_freq - cur_bw,
1333
+ cur_center_freq + cur_bw,
1334
+ ] / cur_nyquist
1335
+
1336
+ cur_filter = scipy.signal.iirdesign(
1337
+ passband_freqs,
1338
+ stopband_freqs,
1339
+ passband_ripple,
1340
+ stopband_attenuation,
1341
+ analog=False,
1342
+ ftype=ftype,
1343
+ output=flayout,
1344
+ )
1345
+
1346
+ filterbank.append(cur_filter)
1347
+
1348
+ return filterbank, sample_rates
1349
+
1350
+
1351
+ @cache(level=10)
1352
+ def mr_frequencies(tuning: float) -> Tuple[np.ndarray, np.ndarray]:
1353
+ r"""Helper function for generating center frequency and sample rate pairs.
1354
+
1355
+ This function will return center frequency and corresponding sample rates
1356
+ to obtain similar pitch filterbank settings as described in [#]_.
1357
+ Instead of starting with MIDI pitch `A0`, we start with `C0`.
1358
+
1359
+ .. [#] Müller, Meinard.
1360
+ "Information Retrieval for Music and Motion."
1361
+ Springer Verlag. 2007.
1362
+
1363
+ Parameters
1364
+ ----------
1365
+ tuning : float [scalar]
1366
+ Tuning deviation from A440, measured as a fraction of the equally
1367
+ tempered semitone (1/12 of an octave).
1368
+
1369
+ Returns
1370
+ -------
1371
+ center_freqs : np.ndarray [shape=(n,), dtype=float]
1372
+ Center frequencies of the filter kernels.
1373
+ Also defines the number of filters in the filterbank.
1374
+ sample_rates : np.ndarray [shape=(n,), dtype=float]
1375
+ Sample rate for each filter, used for multirate filterbank.
1376
+
1377
+ Notes
1378
+ -----
1379
+ This function caches at level 10.
1380
+
1381
+ See Also
1382
+ --------
1383
+ librosa.filters.semitone_filterbank
1384
+ """
1385
+
1386
+ center_freqs = midi_to_hz(np.arange(24 + tuning, 109 + tuning))
1387
+
1388
+ sample_rates = np.asarray(
1389
+ len(np.arange(0, 36))
1390
+ * [
1391
+ 882.0,
1392
+ ]
1393
+ + len(np.arange(36, 70))
1394
+ * [
1395
+ 4410.0,
1396
+ ]
1397
+ + len(np.arange(70, 85))
1398
+ * [
1399
+ 22050.0,
1400
+ ]
1401
+ )
1402
+
1403
+ return center_freqs, sample_rates
1404
+
1405
+
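The construction above yields 85 center frequencies (MIDI 24 through 108) split across three sample-rate groups, which can be checked directly:

```python
import numpy as np
import librosa

center_freqs, sample_rates = librosa.filters.mr_frequencies(tuning=0.0)

print(len(center_freqs))        # 85 filters spanning MIDI 24..108
print(np.unique(sample_rates))  # [  882.  4410. 22050.]
```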
1406
+ def semitone_filterbank(
1407
+ *,
1408
+ center_freqs: Optional[np.ndarray] = None,
1409
+ tuning: float = 0.0,
1410
+ sample_rates: Optional[np.ndarray] = None,
1411
+ flayout: str = "ba",
1412
+ **kwargs: Any,
1413
+ ) -> Tuple[List[Any], np.ndarray]:
1414
+ r"""Construct a multi-rate bank of infinite-impulse response (IIR)
1415
+ band-pass filters at user-defined center frequencies and sample rates.
1416
+
1417
+ By default, these center frequencies are set equal to the 88 fundamental
1418
+ frequencies of the grand piano keyboard, according to a pitch tuning standard
1419
+ of A440, that is, note A above middle C set to 440 Hz. The center frequencies
1420
+ are tuned to the twelve-tone equal temperament, which means that they grow
1421
+ exponentially at a rate of 2**(1/12), that is, twelve notes per octave.
1422
+
1423
+ The A440 tuning can be changed by the user while keeping twelve-tone equal
1424
+ temperament. While A440 is currently the international standard in the music
1425
+ industry (ISO 16), some orchestras tune to A441-A445, whereas baroque musicians
1426
+ tune to A415.
1427
+
1428
+ See [#]_ for details.
1429
+
1430
+ .. [#] Müller, Meinard.
1431
+ "Information Retrieval for Music and Motion."
1432
+ Springer Verlag. 2007.
1433
+
1434
+ Parameters
1435
+ ----------
1436
+ center_freqs : np.ndarray [shape=(n,), dtype=float]
1437
+ Center frequencies of the filter kernels.
1438
+ Also defines the number of filters in the filterbank.
1439
+ tuning : float [scalar]
1440
+ Tuning deviation from A440 as a fraction of a semitone (1/12 of an octave
1441
+ in equal temperament).
1442
+ sample_rates : np.ndarray [shape=(n,), dtype=float]
1443
+ Sample rates of each filter in the multirate filterbank.
1444
+ flayout : string
1445
+ - If `ba`, the standard difference equation is used for filtering with `scipy.signal.filtfilt`.
1446
+ Can be unstable for high-order filters.
1447
+ - If `sos`, a series of second-order filters is used for filtering with `scipy.signal.sosfiltfilt`.
1448
+ Minimizes numerical precision errors for high-order filters, but is slower.
1449
+ **kwargs : additional keyword arguments
1450
+ Additional arguments to the private function `_multirate_fb()`.
1451
+
1452
+ Returns
1453
+ -------
1454
+ filterbank : list [shape=(n,), dtype=float]
1455
+ Each list entry contains the filter coefficients for a single filter.
1456
+ fb_sample_rates : np.ndarray [shape=(n,), dtype=float]
1457
+ Sample rate for each filter.
1458
+
1459
+ See Also
1460
+ --------
1461
+ librosa.cqt
1462
+ librosa.iirt
1463
+ librosa.filters.mr_frequencies
1464
+ scipy.signal.iirdesign
1465
+
1466
+ Examples
1467
+ --------
1468
+ >>> import matplotlib.pyplot as plt
1469
+ >>> import numpy as np
1470
+ >>> import scipy.signal
1471
+ >>> semitone_filterbank, sample_rates = librosa.filters.semitone_filterbank(
1472
+ ... center_freqs=librosa.midi_to_hz(np.arange(60, 72)),
1473
+ ... sample_rates=np.repeat(4410.0, 12),
1474
+ ... flayout='sos'
1475
+ ... )
1476
+ >>> magnitudes = []
1477
+ >>> for cur_sr, cur_filter in zip(sample_rates, semitone_filterbank):
1478
+ ... w, h = scipy.signal.sosfreqz(cur_filter, fs=cur_sr, worN=1025)
1479
+ ... magnitudes.append(20 * np.log10(np.abs(h)))
1480
+ >>> fig, ax = plt.subplots(figsize=(12,6))
1481
+ >>> img = librosa.display.specshow(
1482
+ ... np.array(magnitudes),
1483
+ ... x_axis="hz",
1484
+ ... sr=4410,
1485
+ ... y_coords=librosa.midi_to_hz(np.arange(60, 72)),
1486
+ ... vmin=-60,
1487
+ ... vmax=3,
1488
+ ... ax=ax
1489
+ ... )
1490
+ >>> fig.colorbar(img, ax=ax, format="%+2.f dB", label="Magnitude (dB)")
1491
+ >>> ax.set(
1492
+ ... xlim=[200, 600],
1493
+ ... yticks=librosa.midi_to_hz(np.arange(60, 72)),
1494
+ ... title='Magnitude Responses of the Pitch Filterbank',
1495
+ ... xlabel='Frequency (Hz)',
1496
+ ... ylabel='Semitone filter center frequency (Hz)'
1497
+ ... )
1498
+ """
1499
+
1500
+ if (center_freqs is None) and (sample_rates is None):
1501
+ center_freqs, sample_rates = mr_frequencies(tuning)
1502
+
1503
+ filterbank, fb_sample_rates = _multirate_fb(
1504
+ center_freqs=center_freqs, sample_rates=sample_rates, flayout=flayout, **kwargs
1505
+ )
1506
+
1507
+ return filterbank, fb_sample_rates
1508
+
1509
+
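Besides the explicit example in the docstring, the default call (a sketch; with no arguments, the center frequencies and sample rates fall back to `mr_frequencies`) produces the 85-filter piano-range bank:

```python
import librosa

# Default settings: 85 band-pass filters from mr_frequencies(), 'ba' layout
fb, srs = librosa.filters.semitone_filterbank()
print(len(fb), len(srs))    # 85 85
```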
1510
+ @jit(nopython=True, cache=False)
1511
+ def __window_ss_fill(x, win_sq, n_frames, hop_length): # pragma: no cover
1512
+ """Helper function for window sum-square calculation."""
1513
+
1514
+ n = len(x)
1515
+ n_fft = len(win_sq)
1516
+ for i in range(n_frames):
1517
+ sample = i * hop_length
1518
+ x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
1519
+
1520
+
1521
+ def window_sumsquare(
1522
+ *,
1523
+ window: _WindowSpec,
1524
+ n_frames: int,
1525
+ hop_length: int = 512,
1526
+ win_length: Optional[int] = None,
1527
+ n_fft: int = 2048,
1528
+ dtype: DTypeLike = np.float32,
1529
+ norm: Optional[float] = None,
1530
+ ) -> np.ndarray:
1531
+ """Compute the sum-square envelope of a window function at a given hop length.
1532
+
1533
+ This is used to estimate modulation effects induced by windowing observations
1534
+ in short-time Fourier transforms.
1535
+
1536
+ Parameters
1537
+ ----------
1538
+ window : string, tuple, number, callable, or list-like
1539
+ Window specification, as in `get_window`
1540
+ n_frames : int > 0
1541
+ The number of analysis frames
1542
+ hop_length : int > 0
1543
+ The number of samples to advance between frames
1544
+ win_length : [optional]
1545
+ The length of the window function. By default, this matches ``n_fft``.
1546
+ n_fft : int > 0
1547
+ The length of each analysis frame.
1548
+ dtype : np.dtype
1549
+ The data type of the output
1550
+ norm : {np.inf, -np.inf, 0, float > 0, None}
1551
+ Normalization mode used in window construction.
1552
+ Note that this does not affect the squaring operation.
1553
+
1554
+ Returns
1555
+ -------
1556
+ wss : np.ndarray, shape=``(n_fft + hop_length * (n_frames - 1))``
1557
+ The sum-squared envelope of the window function
1558
+
1559
+ Examples
1560
+ --------
1561
+ For a fixed frame length (2048), compare modulation effects for a Hann window
1562
+ at different hop lengths:
1563
+
1564
+ >>> n_frames = 50
1565
+ >>> wss_256 = librosa.filters.window_sumsquare(window='hann', n_frames=n_frames, hop_length=256)
1566
+ >>> wss_512 = librosa.filters.window_sumsquare(window='hann', n_frames=n_frames, hop_length=512)
1567
+ >>> wss_1024 = librosa.filters.window_sumsquare(window='hann', n_frames=n_frames, hop_length=1024)
1568
+
1569
+ >>> import matplotlib.pyplot as plt
1570
+ >>> fig, ax = plt.subplots(nrows=3, sharey=True)
1571
+ >>> ax[0].plot(wss_256)
1572
+ >>> ax[0].set(title='hop_length=256')
1573
+ >>> ax[1].plot(wss_512)
1574
+ >>> ax[1].set(title='hop_length=512')
1575
+ >>> ax[2].plot(wss_1024)
1576
+ >>> ax[2].set(title='hop_length=1024')
1577
+ """
1578
+ if win_length is None:
1579
+ win_length = n_fft
1580
+
1581
+ n = n_fft + hop_length * (n_frames - 1)
1582
+ x = np.zeros(n, dtype=dtype)
1583
+
1584
+ # Compute the squared window at the desired length
1585
+ win_sq = get_window(window, win_length)
1586
+ win_sq = util.normalize(win_sq, norm=norm) ** 2
1587
+ win_sq = util.pad_center(win_sq, size=n_fft)
1588
+
1589
+ # Fill the envelope
1590
+ __window_ss_fill(x, win_sq, n_frames, hop_length)
1591
+
1592
+ return x
1593
+
1594
+
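For a Hann window at 75% overlap the interior of the sum-square envelope is nearly constant at 1.5, which is what makes inverse-STFT normalization well behaved; a quick sketch:

```python
import numpy as np
import librosa

n_fft, hop = 2048, 512   # 75% overlap
wss = librosa.filters.window_sumsquare(
    window="hann", n_frames=50, hop_length=hop, n_fft=n_fft
)

# Away from the edges, sum(w^2) over overlapping frames is ~1.5 for Hann
interior = wss[n_fft:-n_fft]
print(interior.min(), interior.max())   # both ~1.5
```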
1595
+ @cache(level=10)
1596
+ def diagonal_filter(
1597
+ window: _WindowSpec,
1598
+ n: int,
1599
+ *,
1600
+ slope: float = 1.0,
1601
+ angle: Optional[float] = None,
1602
+ zero_mean: bool = False,
1603
+ ) -> np.ndarray:
1604
+ """Build a two-dimensional diagonal filter.
1605
+
1606
+ This is primarily used for smoothing recurrence or self-similarity matrices.
1607
+
1608
+ Parameters
1609
+ ----------
1610
+ window : string, tuple, number, callable, or list-like
1611
+ The window function to use for the filter.
1612
+
1613
+ See `get_window` for details.
1614
+
1615
+ Note that the window used here should be non-negative.
1616
+
1617
+ n : int > 0
1618
+ the length of the filter
1619
+
1620
+ slope : float
1621
+ The slope of the diagonal filter to produce
1622
+
1623
+ angle : float or None
1624
+ If given, the slope parameter is ignored,
1625
+ and angle directly sets the orientation of the filter (in radians).
1626
+ Otherwise, angle is inferred as `arctan(slope)`.
1627
+
1628
+ zero_mean : bool
1629
+ If True, a zero-mean filter is used.
1630
+ Otherwise, a non-negative averaging filter is used.
1631
+
1632
+ This should be enabled if you want to enhance paths and suppress
1633
+ blocks.
1634
+
1635
+ Returns
1636
+ -------
1637
+ kernel : np.ndarray, shape=[(m, m)]
1638
+ The 2-dimensional filter kernel
1639
+
1640
+ Notes
1641
+ -----
1642
+ This function caches at level 10.
1643
+ """
1644
+
1645
+ if angle is None:
1646
+ angle = np.arctan(slope)
1647
+
1648
+ win = np.diag(get_window(window, n, fftbins=False))
1649
+
1650
+ if not np.isclose(angle, np.pi / 4):
1651
+ win = scipy.ndimage.rotate(
1652
+ win, 45 - angle * 180 / np.pi, order=5, prefilter=False
1653
+ )
1654
+
1655
+ np.clip(win, 0, None, out=win)
1656
+ win /= win.sum()
1657
+
1658
+ if zero_mean:
1659
+ win -= win.mean()
1660
+
1661
+ return win
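A hedged end-to-end sketch of the smoothing use case named in the docstring: build a recurrence matrix and convolve it with a diagonal kernel to enhance repeated paths (the chroma features and kernel size here are illustrative choices, not prescribed by this function):

```python
import numpy as np
import scipy.ndimage
import librosa

y, sr = librosa.load(librosa.ex("trumpet"))
chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
R = librosa.segment.recurrence_matrix(chroma, mode="affinity")

# Smooth along diagonals (slope=1) to enhance repetition paths
kernel = librosa.filters.diagonal_filter("hann", 31, slope=1.0)
R_smooth = scipy.ndimage.convolve(R, kernel, mode="constant")
```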
sequence.py ADDED
@@ -0,0 +1,2059 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+ """
4
+ Sequential modeling
5
+ ===================
6
+
7
+ Sequence alignment
8
+ ------------------
9
+ .. autosummary::
10
+ :toctree: generated/
11
+
12
+ dtw
13
+ rqa
14
+
15
+ Viterbi decoding
16
+ ----------------
17
+ .. autosummary::
18
+ :toctree: generated/
19
+
20
+ viterbi
21
+ viterbi_discriminative
22
+ viterbi_binary
23
+
24
+ Transition matrices
25
+ -------------------
26
+ .. autosummary::
27
+ :toctree: generated/
28
+
29
+ transition_uniform
30
+ transition_loop
31
+ transition_cycle
32
+ transition_local
33
+ """
34
+ from __future__ import annotations
35
+
36
+ import numpy as np
37
+ from scipy.spatial.distance import cdist
38
+ from numba import jit
39
+ from .util import pad_center, fill_off_diagonal, is_positive_int, tiny, expand_to
40
+ from .util.exceptions import ParameterError
41
+ from .filters import get_window
42
+ from typing import Any, Iterable, List, Optional, Tuple, Union, overload
43
+ from typing_extensions import Literal
44
+ from ._typing import _WindowSpec, _IntLike_co
45
+
46
+ __all__ = [
47
+ "dtw",
48
+ "dtw_backtracking",
49
+ "rqa",
50
+ "viterbi",
51
+ "viterbi_discriminative",
52
+ "viterbi_binary",
53
+ "transition_uniform",
54
+ "transition_loop",
55
+ "transition_cycle",
56
+ "transition_local",
57
+ ]
58
+
59
+
60
+ @overload
61
+ def dtw(
62
+ X: np.ndarray,
63
+ Y: np.ndarray,
64
+ *,
65
+ metric: str = ...,
66
+ step_sizes_sigma: Optional[np.ndarray] = ...,
67
+ weights_add: Optional[np.ndarray] = ...,
68
+ weights_mul: Optional[np.ndarray] = ...,
69
+ subseq: bool = ...,
70
+ backtrack: Literal[False],
71
+ global_constraints: bool = ...,
72
+ band_rad: float = ...,
73
+ return_steps: Literal[False] = ...,
74
+ ) -> np.ndarray:
75
+ ...
76
+
77
+
78
+ @overload
79
+ def dtw(
80
+ *,
81
+ C: np.ndarray,
82
+ metric: str = ...,
83
+ step_sizes_sigma: Optional[np.ndarray] = ...,
84
+ weights_add: Optional[np.ndarray] = ...,
85
+ weights_mul: Optional[np.ndarray] = ...,
86
+ subseq: bool = ...,
87
+ backtrack: Literal[False],
88
+ global_constraints: bool = ...,
89
+ band_rad: float = ...,
90
+ return_steps: Literal[False] = ...,
91
+ ) -> np.ndarray:
92
+ ...
93
+
94
+
95
+ @overload
96
+ def dtw(
97
+ X: np.ndarray,
98
+ Y: np.ndarray,
99
+ *,
100
+ metric: str = ...,
101
+ step_sizes_sigma: Optional[np.ndarray] = ...,
102
+ weights_add: Optional[np.ndarray] = ...,
103
+ weights_mul: Optional[np.ndarray] = ...,
104
+ subseq: bool = ...,
105
+ backtrack: Literal[False],
106
+ global_constraints: bool = ...,
107
+ band_rad: float = ...,
108
+ return_steps: Literal[True],
109
+ ) -> Tuple[np.ndarray, np.ndarray]:
110
+ ...
111
+
112
+
113
+ @overload
114
+ def dtw(
115
+ *,
116
+ C: np.ndarray,
117
+ metric: str = ...,
118
+ step_sizes_sigma: Optional[np.ndarray] = ...,
119
+ weights_add: Optional[np.ndarray] = ...,
120
+ weights_mul: Optional[np.ndarray] = ...,
121
+ subseq: bool = ...,
122
+ backtrack: Literal[False],
123
+ global_constraints: bool = ...,
124
+ band_rad: float = ...,
125
+ return_steps: Literal[True],
126
+ ) -> Tuple[np.ndarray, np.ndarray]:
127
+ ...
128
+
129
+
130
+ @overload
131
+ def dtw(
132
+ X: np.ndarray,
133
+ Y: np.ndarray,
134
+ *,
135
+ metric: str = ...,
136
+ step_sizes_sigma: Optional[np.ndarray] = ...,
137
+ weights_add: Optional[np.ndarray] = ...,
138
+ weights_mul: Optional[np.ndarray] = ...,
139
+ subseq: bool = ...,
140
+ backtrack: Literal[True] = ...,
141
+ global_constraints: bool = ...,
142
+ band_rad: float = ...,
143
+ return_steps: Literal[False] = ...,
144
+ ) -> Tuple[np.ndarray, np.ndarray]:
145
+ ...
146
+
147
+
148
+ @overload
149
+ def dtw(
150
+ *,
151
+ C: np.ndarray,
152
+ metric: str = ...,
153
+ step_sizes_sigma: Optional[np.ndarray] = ...,
154
+ weights_add: Optional[np.ndarray] = ...,
155
+ weights_mul: Optional[np.ndarray] = ...,
156
+ subseq: bool = ...,
157
+ backtrack: Literal[True] = ...,
158
+ global_constraints: bool = ...,
159
+ band_rad: float = ...,
160
+ return_steps: Literal[False] = ...,
161
+ ) -> Tuple[np.ndarray, np.ndarray]:
162
+ ...
163
+
164
+
165
+ @overload
166
+ def dtw(
167
+ X: np.ndarray,
168
+ Y: np.ndarray,
169
+ *,
170
+ metric: str = ...,
171
+ step_sizes_sigma: Optional[np.ndarray] = ...,
172
+ weights_add: Optional[np.ndarray] = ...,
173
+ weights_mul: Optional[np.ndarray] = ...,
174
+ subseq: bool = ...,
175
+ backtrack: Literal[True] = ...,
176
+ global_constraints: bool = ...,
177
+ band_rad: float = ...,
178
+ return_steps: Literal[True],
179
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
180
+ ...
181
+
182
+
183
+ @overload
184
+ def dtw(
185
+ *,
186
+ C: np.ndarray,
187
+ metric: str = ...,
188
+ step_sizes_sigma: Optional[np.ndarray] = ...,
189
+ weights_add: Optional[np.ndarray] = ...,
190
+ weights_mul: Optional[np.ndarray] = ...,
191
+ subseq: bool = ...,
192
+ backtrack: Literal[True] = ...,
193
+ global_constraints: bool = ...,
194
+ band_rad: float = ...,
195
+ return_steps: Literal[True],
196
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
197
+ ...
198
+
199
+
200
+ def dtw(
201
+ X: Optional[np.ndarray] = None,
202
+ Y: Optional[np.ndarray] = None,
203
+ *,
204
+ C: Optional[np.ndarray] = None,
205
+ metric: str = "euclidean",
206
+ step_sizes_sigma: Optional[np.ndarray] = None,
207
+ weights_add: Optional[np.ndarray] = None,
208
+ weights_mul: Optional[np.ndarray] = None,
209
+ subseq: bool = False,
210
+ backtrack: bool = True,
211
+ global_constraints: bool = False,
212
+ band_rad: float = 0.25,
213
+ return_steps: bool = False,
214
+ ) -> Union[
215
+ np.ndarray, Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray, np.ndarray]
216
+ ]:
217
+ """Dynamic time warping (DTW).
218
+
219
+ This function performs a DTW and path backtracking on two sequences.
220
+ We follow the nomenclature and algorithmic approach as described in [#]_.
221
+
222
+ .. [#] Meinard Mueller
223
+ Fundamentals of Music Processing — Audio, Analysis, Algorithms, Applications
224
+ Springer Verlag, ISBN: 978-3-319-21944-8, 2015.
225
+
226
+ Parameters
227
+ ----------
228
+ X : np.ndarray [shape=(..., K, N)]
229
+ audio feature matrix (e.g., chroma features)
230
+
231
+ If ``X`` has more than two dimensions (e.g., for multi-channel inputs), all leading
232
+ dimensions are used when computing distance to ``Y``.
233
+
234
+ Y : np.ndarray [shape=(..., K, M)]
235
+ audio feature matrix (e.g., chroma features)
236
+
237
+ C : np.ndarray [shape=(N, M)]
238
+ Precomputed distance matrix. If supplied, X and Y must not be supplied and
239
+ ``metric`` will be ignored.
240
+
241
+ metric : str
242
+ Identifier for the cost-function as documented
243
+ in `scipy.spatial.distance.cdist()`
244
+
245
+ step_sizes_sigma : np.ndarray [shape=[n, 2]]
246
+ Specifies allowed step sizes as used by the dtw.
247
+
248
+ weights_add : np.ndarray [shape=[n, ]]
249
+ Additive weights to penalize certain step sizes.
250
+
251
+ weights_mul : np.ndarray [shape=[n, ]]
252
+ Multiplicative weights to penalize certain step sizes.
253
+
254
+ subseq : bool
255
+ Enable subsequence DTW, e.g., for retrieval tasks.
256
+
257
+ backtrack : bool
258
+ Enable backtracking in accumulated cost matrix.
259
+
260
+ global_constraints : bool
261
+ Applies global constraints to the cost matrix ``C`` (Sakoe-Chiba band).
262
+
263
+ band_rad : float
264
+ The Sakoe-Chiba band radius (1/2 of the width) will be
265
+ ``int(radius*min(C.shape))``.
266
+
267
+ return_steps : bool
268
+ If true, the function returns ``steps``, the step matrix, containing
269
+ the indices of the used steps from the cost accumulation step.
270
+
271
+ Returns
272
+ -------
273
+ D : np.ndarray [shape=(N, M)]
274
+ accumulated cost matrix.
275
+ D[N, M] is the total alignment cost.
276
+ When doing subsequence DTW, D[N,:] indicates a matching function.
277
+ wp : np.ndarray [shape=(N, 2)]
278
+ Warping path with index pairs.
279
+ Each row of the array contains an index pair (n, m).
280
+ Only returned when ``backtrack`` is True.
281
+ steps : np.ndarray [shape=(N, M)]
282
+ Step matrix, containing the indices of the used steps from the cost
283
+ accumulation step.
284
+ Only returned when ``return_steps`` is True.
285
+
286
+ Raises
287
+ ------
288
+ ParameterError
289
+ If you are doing diagonal matching and Y is shorter than X or if an
290
+ incompatible combination of X, Y, and C is supplied.
291
+
292
+ If your input dimensions are incompatible.
293
+
294
+ If the cost matrix has NaN values.
295
+
296
+ Examples
297
+ --------
298
+ >>> import numpy as np
299
+ >>> import matplotlib.pyplot as plt
300
+ >>> y, sr = librosa.load(librosa.ex('brahms'), offset=10, duration=15)
301
+ >>> X = librosa.feature.chroma_cens(y=y, sr=sr)
302
+ >>> noise = np.random.rand(X.shape[0], 200)
303
+ >>> Y = np.concatenate((noise, noise, X, noise), axis=1)
304
+ >>> D, wp = librosa.sequence.dtw(X, Y, subseq=True)
305
+ >>> fig, ax = plt.subplots(nrows=2, sharex=True)
306
+ >>> img = librosa.display.specshow(D, x_axis='frames', y_axis='frames',
307
+ ... ax=ax[0])
308
+ >>> ax[0].set(title='DTW cost', xlabel='Noisy sequence', ylabel='Target')
309
+ >>> ax[0].plot(wp[:, 1], wp[:, 0], label='Optimal path', color='y')
310
+ >>> ax[0].legend()
311
+ >>> fig.colorbar(img, ax=ax[0])
312
+ >>> ax[1].plot(D[-1, :] / wp.shape[0])
313
+ >>> ax[1].set(xlim=[0, Y.shape[1]], ylim=[0, 2],
314
+ ... title='Matching cost function')
315
+ """
316
+ # Default Parameters
317
+ default_steps = np.array([[1, 1], [0, 1], [1, 0]], dtype=np.uint32)
318
+ default_weights_add = np.zeros(3, dtype=np.float64)
319
+ default_weights_mul = np.ones(3, dtype=np.float64)
320
+
321
+ if step_sizes_sigma is None:
322
+ # Use the default steps
323
+ step_sizes_sigma = default_steps
324
+
325
+ # Use default weights if none are provided
326
+ if weights_add is None:
327
+ weights_add = default_weights_add
328
+
329
+ if weights_mul is None:
330
+ weights_mul = default_weights_mul
331
+ else:
332
+ # If we have custom steps but no weights, construct them here
333
+ if weights_add is None:
334
+ weights_add = np.zeros(len(step_sizes_sigma), dtype=np.float64)
335
+
336
+ if weights_mul is None:
337
+ weights_mul = np.ones(len(step_sizes_sigma), dtype=np.float64)
338
+
339
+ # Make the default step weights infinite so that they are never
340
+ # preferred over custom steps
341
+ default_weights_add.fill(np.inf)
342
+ default_weights_mul.fill(np.inf)
343
+
344
+ # Append custom steps and weights to our defaults
345
+ step_sizes_sigma = np.concatenate((default_steps, step_sizes_sigma))
346
+ weights_add = np.concatenate((default_weights_add, weights_add))
347
+ weights_mul = np.concatenate((default_weights_mul, weights_mul))
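+ # Illustrative note: with a single custom step [[2, 1]], the combined
+ # table becomes [[1, 1], [0, 1], [1, 0], [2, 1]], and the three default
+ # rows carry infinite weights, so the accumulation step below can only
+ # ever select the custom steps.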
348
+
349
+ # These asserts are bad, but mypy cannot trace the code paths properly
350
+ assert step_sizes_sigma is not None
351
+ assert weights_add is not None
352
+ assert weights_mul is not None
353
+
354
+ if np.any(step_sizes_sigma < 0):
355
+ raise ParameterError("step_sizes_sigma cannot contain negative values")
356
+
357
+ if len(step_sizes_sigma) != len(weights_add):
358
+ raise ParameterError("len(weights_add) must be equal to len(step_sizes_sigma)")
359
+ if len(step_sizes_sigma) != len(weights_mul):
360
+ raise ParameterError("len(weights_mul) must be equal to len(step_sizes_sigma)")
361
+
362
+ if C is None and (X is None or Y is None):
363
+ raise ParameterError("If C is not supplied, both X and Y must be supplied")
364
+ if C is not None and (X is not None or Y is not None):
365
+ raise ParameterError("If C is supplied, both X and Y must not be supplied")
366
+
367
+ c_is_transposed = False
368
+
369
+ # calculate pair-wise distances, unless already supplied.
370
+ # C_local will keep track of whether the distance matrix was supplied
371
+ # by the user (False) or constructed locally (True)
372
+ C_local = False
373
+ if C is None:
374
+ C_local = True
375
+ # mypy can't figure out that this case does not happen
376
+ assert X is not None and Y is not None
377
+ # take care of dimensions
378
+ X = np.atleast_2d(X)
379
+ Y = np.atleast_2d(Y)
380
+
381
+ # Perform some shape-squashing here
382
+ # Put the time axes around front
383
+ # Suppress types because mypy doesn't know these are ndarrays
384
+ X = np.swapaxes(X, -1, 0) # type: ignore
385
+ Y = np.swapaxes(Y, -1, 0) # type: ignore
386
+
387
+ # Flatten the remaining dimensions
388
+ # Use F-ordering to preserve columns
389
+ X = X.reshape((X.shape[0], -1), order="F")
390
+ Y = Y.reshape((Y.shape[0], -1), order="F")
391
+
392
+ try:
393
+ C = cdist(X, Y, metric=metric)
394
+ except ValueError as exc:
395
+ raise ParameterError(
396
+ "scipy.spatial.distance.cdist returned an error.\n"
397
+ "Please provide your input in the form X.shape=(K, N) "
398
+ "and Y.shape=(K, M).\n 1-dimensional sequences should "
399
+ "be reshaped to X.shape=(1, N) and Y.shape=(1, M)."
400
+ ) from exc
401
+
402
+ # for subsequence matching:
403
+ # if N > M, Y can be a subsequence of X
404
+ if subseq and (X.shape[0] > Y.shape[0]):
405
+ C = C.T
406
+ c_is_transposed = True
407
+
408
+ C = np.atleast_2d(C)
409
+
410
+ # if diagonal matching, Y has to be longer than X
411
+ # (X simply cannot be contained in Y)
412
+ if np.array_equal(step_sizes_sigma, np.array([[1, 1]])) and (
413
+ C.shape[0] > C.shape[1]
414
+ ):
415
+ raise ParameterError(
416
+ "For diagonal matching: Y.shape[-1] >= X.shape[-11] "
417
+ "(C.shape[1] >= C.shape[0])"
418
+ )
419
+
420
+ max_0 = step_sizes_sigma[:, 0].max()
421
+ max_1 = step_sizes_sigma[:, 1].max()
422
+
423
+ # check C here for nans before building global constraints
424
+ if np.any(np.isnan(C)):
425
+ raise ParameterError("DTW cost matrix C has NaN values. ")
426
+
427
+ if global_constraints:
428
+ # Apply global constraints to the cost matrix
429
+ if not C_local:
430
+ # If C was provided as input, make a copy here
431
+ C = np.copy(C)
432
+ fill_off_diagonal(C, radius=band_rad, value=np.inf)
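+ # For example, under the band_rad contract documented above,
+ # band_rad=0.25 on a 100x120 cost matrix masks every cell farther than
+ # int(0.25 * 100) = 25 frames from the (stretched) main diagonal.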
433
+
434
+ # initialize whole matrix with infinity values
435
+ D = np.ones(C.shape + np.array([max_0, max_1])) * np.inf
436
+
437
+ # set starting point to C[0, 0]
438
+ D[max_0, max_1] = C[0, 0]
439
+
440
+ if subseq:
441
+ D[max_0, max_1:] = C[0, :]
442
+
443
+ # initialize step matrix with -1
444
+ # will be filled in calc_accu_cost() with indices from step_sizes_sigma
445
+ steps = np.zeros(D.shape, dtype=np.int32)
446
+
447
+ # these steps correspond to left- (first row) and up-(first column) moves
448
+ steps[0, :] = 1
449
+ steps[:, 0] = 2
450
+
451
+ # calculate accumulated cost matrix
452
+ D: np.ndarray
453
+ steps: np.ndarray
454
+ D, steps = __dtw_calc_accu_cost(
455
+ C, D, steps, step_sizes_sigma, weights_mul, weights_add, max_0, max_1
456
+ )
457
+
458
+ # delete infinity rows and columns
459
+ D = D[max_0:, max_1:]
460
+ steps = steps[max_0:, max_1:]
461
+
462
+ return_values: List[np.ndarray]
463
+ if backtrack:
464
+ wp: np.ndarray
465
+ if subseq:
466
+ if np.all(np.isinf(D[-1])):
467
+ raise ParameterError(
468
+ "No valid sub-sequence warping path could "
469
+ "be constructed with the given step sizes."
470
+ )
471
+ start = np.argmin(D[-1, :])
472
+ _wp = __dtw_backtracking(steps, step_sizes_sigma, subseq, start)
473
+ else:
474
+ # perform warping path backtracking
475
+ if np.isinf(D[-1, -1]):
476
+ raise ParameterError(
477
+ "No valid sub-sequence warping path could "
478
+ "be constructed with the given step sizes."
479
+ )
480
+
481
+ _wp = __dtw_backtracking(steps, step_sizes_sigma, subseq)
482
+ if _wp[-1] != (0, 0):
483
+ raise ParameterError(
484
+ "Unable to compute a full DTW warping path. "
485
+ "You may want to try again with subseq=True."
486
+ )
487
+
488
+ wp = np.asarray(_wp, dtype=int)
489
+
490
+ # since we transposed in the beginning, we have to adjust the index pairs back
491
+ if subseq and (
492
+ (X is not None and Y is not None and X.shape[0] > Y.shape[0])
493
+ or c_is_transposed
494
+ or C.shape[0] > C.shape[1]
495
+ ):
496
+ wp = np.fliplr(wp)
497
+ return_values = [D, wp]
498
+ else:
499
+ return_values = [D]
500
+
501
+ if return_steps:
502
+ return_values.append(steps)
503
+
504
+ if len(return_values) > 1:
505
+ # Suppressing type check here because mypy can't
506
+ # infer the exact length of the tuple
507
+ return tuple(return_values) # type: ignore
508
+ else:
509
+ return return_values[0]
510
+
511
+
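+ # A minimal usage sketch for ``dtw`` with custom step sizes. This is
+ # illustrative only; the arrays below are hypothetical, not part of this
+ # module:
+ #
+ #   X = np.random.rand(12, 100)   # e.g., chroma-like features
+ #   Y = np.random.rand(12, 120)
+ #   sigma = np.array([[1, 1], [1, 2], [2, 1]])
+ #   D, wp = dtw(X, Y, metric="cosine", step_sizes_sigma=sigma)
+ #   total_cost = D[-1, -1]        # cost of the full alignment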
512
+ @jit(nopython=True, cache=False) # type: ignore
513
+ def __dtw_calc_accu_cost(
514
+ C: np.ndarray,
515
+ D: np.ndarray,
516
+ steps: np.ndarray,
517
+ step_sizes_sigma: np.ndarray,
518
+ weights_mul: np.ndarray,
519
+ weights_add: np.ndarray,
520
+ max_0: int,
521
+ max_1: int,
522
+ ) -> Tuple[np.ndarray, np.ndarray]: # pragma: no cover
523
+ """Calculate the accumulated cost matrix D.
524
+
525
+ Use dynamic programming to calculate the accumulated costs.
526
+
527
+ Parameters
528
+ ----------
529
+ C : np.ndarray [shape=(N, M)]
530
+ pre-computed cost matrix
531
+ D : np.ndarray [shape=(N, M)]
532
+ accumulated cost matrix
533
+ steps : np.ndarray [shape=(N, M)]
534
+ Step matrix, containing the indices of the used steps from the cost
535
+ accumulation step.
536
+ step_sizes_sigma : np.ndarray [shape=[n, 2]]
537
+ Specifies allowed step sizes as used by the dtw.
538
+ weights_add : np.ndarray [shape=[n, ]]
539
+ Additive weights to penalize certain step sizes.
540
+ weights_mul : np.ndarray [shape=[n, ]]
541
+ Multiplicative weights to penalize certain step sizes.
542
+ max_0 : int
543
+ maximum number of steps in step_sizes_sigma in dim 0.
544
+ max_1 : int
545
+ maximum number of steps in step_sizes_sigma in dim 1.
546
+
547
+ Returns
548
+ -------
549
+ D : np.ndarray [shape=(N, M)]
550
+ accumulated cost matrix.
551
+ D[N, M] is the total alignment cost.
552
+ When doing subsequence DTW, D[N,:] indicates a matching function.
553
+ steps : np.ndarray [shape=(N, M)]
554
+ Step matrix, containing the indices of the used steps from the cost
555
+ accumulation step.
556
+
557
+ See Also
558
+ --------
559
+ dtw
560
+ """
561
+ for cur_n in range(max_0, D.shape[0]):
562
+ for cur_m in range(max_1, D.shape[1]):
563
+ # accumulate costs
564
+ for cur_step_idx, cur_w_add, cur_w_mul in zip(
565
+ range(step_sizes_sigma.shape[0]), weights_add, weights_mul
566
+ ):
567
+ cur_D = D[
568
+ cur_n - step_sizes_sigma[cur_step_idx, 0],
569
+ cur_m - step_sizes_sigma[cur_step_idx, 1],
570
+ ]
571
+ cur_C = cur_w_mul * C[cur_n - max_0, cur_m - max_1]
572
+ cur_C += cur_w_add
573
+ cur_cost = cur_D + cur_C
574
+
575
+ # check if cur_cost is smaller than the one stored in D
576
+ if cur_cost < D[cur_n, cur_m]:
577
+ D[cur_n, cur_m] = cur_cost
578
+
579
+ # save step-index
580
+ steps[cur_n, cur_m] = cur_step_idx
581
+
582
+ return D, steps
583
+
584
+
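+ # The recurrence implemented above, written out: in the padded coordinate
+ # system (C is indexed at (n - max_0, m - max_1)), each cell takes
+ #
+ #   D[n, m] = min_k  D[n - sigma[k, 0], m - sigma[k, 1]]
+ #                    + weights_mul[k] * C[n, m] + weights_add[k]
+ #
+ # with steps[n, m] recording the argmin index k.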
585
+ @jit(nopython=True, cache=False) # type: ignore
586
+ def __dtw_backtracking(
587
+ steps: np.ndarray,
588
+ step_sizes_sigma: np.ndarray,
589
+ subseq: bool,
590
+ start: Optional[int] = None,
591
+ ) -> List[Tuple[int, int]]: # pragma: no cover
592
+ """Backtrack optimal warping path.
593
+
594
+ Uses the saved step sizes from the cost accumulation
595
+ step to backtrack the index pairs for an optimal
596
+ warping path.
597
+
598
+ Parameters
599
+ ----------
600
+ steps : np.ndarray [shape=(N, M)]
601
+ Step matrix, containing the indices of the used steps from the cost
602
+ accumulation step.
603
+ step_sizes_sigma : np.ndarray [shape=[n, 2]]
604
+ Specifies allowed step sizes as used by the dtw.
605
+ subseq : bool
606
+ Enable subsequence DTW, e.g., for retrieval tasks.
607
+ start : int
608
+ Start column index for backtracking (only allowed for ``subseq=True``)
609
+
610
+ Returns
611
+ -------
612
+ wp : list [shape=(N,)]
613
+ Warping path with index pairs.
614
+ Each list entry contains an index pair
615
+ (n, m) as a tuple
616
+
617
+ See Also
618
+ --------
619
+ dtw
620
+ """
621
+ if start is None:
622
+ cur_idx = (steps.shape[0] - 1, steps.shape[1] - 1)
623
+ else:
624
+ cur_idx = (steps.shape[0] - 1, start)
625
+
626
+ wp = []
627
+ # Set starting point D(N, M) and append it to the path
628
+ wp.append((cur_idx[0], cur_idx[1]))
629
+
630
+ # Loop backwards.
631
+ # Stop criteria:
632
+ # Setting it to (0, 0) does not work for the subsequence dtw,
633
+ # so we only ask to reach the first row of the matrix.
634
+
635
+ while (subseq and cur_idx[0] > 0) or (not subseq and cur_idx != (0, 0)):
636
+ cur_step_idx = steps[(cur_idx[0], cur_idx[1])]
637
+
638
+ # save tuple with minimal acc. cost in path
639
+ cur_idx = (
640
+ cur_idx[0] - step_sizes_sigma[cur_step_idx][0],
641
+ cur_idx[1] - step_sizes_sigma[cur_step_idx][1],
642
+ )
643
+
644
+ # If we run off the side of the cost matrix, break here
645
+ if min(cur_idx) < 0:
646
+ break
647
+
648
+ # append to warping path
649
+ wp.append((cur_idx[0], cur_idx[1]))
650
+
651
+ return wp
652
+
653
+
654
+ def dtw_backtracking(
655
+ steps: np.ndarray,
656
+ *,
657
+ step_sizes_sigma: Optional[np.ndarray] = None,
658
+ subseq: bool = False,
659
+ start: Optional[Union[int, np.integer[Any]]] = None,
660
+ ) -> np.ndarray:
661
+ """Backtrack a warping path.
662
+
663
+ Uses the saved step sizes from the cost accumulation
664
+ step to backtrack the index pairs for a warping path.
665
+
666
+ Parameters
667
+ ----------
668
+ steps : np.ndarray [shape=(N, M)]
669
+ Step matrix, containing the indices of the used steps from the cost
670
+ accumulation step.
671
+ step_sizes_sigma : np.ndarray [shape=[n, 2]]
672
+ Specifies allowed step sizes as used by the dtw.
673
+ subseq : bool
674
+ Enable subsequence DTW, e.g., for retrieval tasks.
675
+ start : int
676
+ Start column index for backtracking (only allowed for ``subseq=True``)
677
+
678
+ Returns
679
+ -------
680
+ wp : np.ndarray [shape=(N, 2)]
681
+ Warping path with index pairs.
682
+ Each row of the array contains an index pair (n, m).
684
+
685
+ See Also
686
+ --------
687
+ dtw
688
+ """
689
+ if subseq is False and start is not None:
690
+ raise ParameterError(
691
+ f"start is only allowed to be set if subseq is True (start={start}, subseq={subseq})"
692
+ )
693
+
694
+ # Default Parameters
695
+ default_steps = np.array([[1, 1], [0, 1], [1, 0]], dtype=np.uint32)
696
+
697
+ if step_sizes_sigma is None:
698
+ # Use the default steps
699
+ step_sizes_sigma = default_steps
700
+ else:
701
+ # Append custom steps and weights to our defaults
702
+ step_sizes_sigma = np.concatenate((default_steps, step_sizes_sigma))
703
+
704
+ wp = __dtw_backtracking(steps, step_sizes_sigma, subseq, start)
705
+ return np.asarray(wp, dtype=int)
706
+
707
+
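+ # Hypothetical usage sketch: recover a warping path later from a saved
+ # step matrix (variable names are illustrative):
+ #
+ #   D, steps = dtw(X, Y, backtrack=False, return_steps=True)
+ #   wp = dtw_backtracking(steps)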
708
+ @overload
709
+ def rqa(
710
+ sim: np.ndarray,
711
+ *,
712
+ gap_onset: float = ...,
713
+ gap_extend: float = ...,
714
+ knight_moves: bool = ...,
715
+ backtrack: Literal[False],
716
+ ) -> np.ndarray:
717
+ ...
718
+
719
+
720
+ @overload
721
+ def rqa(
722
+ sim: np.ndarray,
723
+ *,
724
+ gap_onset: float = ...,
725
+ gap_extend: float = ...,
726
+ knight_moves: bool = ...,
727
+ backtrack: Literal[True] = ...,
728
+ ) -> Tuple[np.ndarray, np.ndarray]:
729
+ ...
730
+
731
+
732
+ @overload
733
+ def rqa(
734
+ sim: np.ndarray,
735
+ *,
736
+ gap_onset: float = ...,
737
+ gap_extend: float = ...,
738
+ knight_moves: bool = ...,
739
+ backtrack: bool = ...,
740
+ ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
741
+ ...
742
+
743
+
744
+ def rqa(
745
+ sim: np.ndarray,
746
+ *,
747
+ gap_onset: float = 1,
748
+ gap_extend: float = 1,
749
+ knight_moves: bool = True,
750
+ backtrack: bool = True,
751
+ ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
752
+ """Recurrence quantification analysis (RQA)
753
+
754
+ This function implements different forms of RQA as described by
755
+ Serra, Serra, and Andrzejak (SSA). [#]_ These methods take as input
756
+ a self- or cross-similarity matrix ``sim``, and calculate the value
757
+ of path alignments by dynamic programming.
758
+
759
+ Note that unlike dynamic time warping (`dtw`), alignment paths here are
760
+ maximized, not minimized, so the input should measure similarity rather
761
+ than distance.
762
+
763
+ The simplest RQA method, denoted as `L` (SSA equation 3) and equivalent
764
+ to the method described by Eckmann, Kamphorst, and Ruelle [#]_, accumulates
765
+ the length of diagonal paths with positive values in the input:
766
+
767
+ - ``score[i, j] = score[i-1, j-1] + 1`` if ``sim[i, j] > 0``
768
+ - ``score[i, j] = 0`` otherwise.
769
+
770
+ The second method, denoted as `S` (SSA equation 4), is similar to the first,
771
+ but allows for "knight moves" (as in the chess piece) in addition to strict
772
+ diagonal moves:
773
+
774
+ - ``score[i, j] = max(score[i-1, j-1], score[i-2, j-1], score[i-1, j-2]) + 1`` if ``sim[i, j] >
775
+ 0``
776
+ - ``score[i, j] = 0`` otherwise.
777
+
778
+ The third method, denoted as `Q` (SSA equations 5 and 6) extends this by
779
+ allowing gaps in the alignment that incur some cost, rather than a hard
780
+ reset to 0 whenever ``sim[i, j] == 0``.
781
+ Gaps are penalized by two additional parameters, ``gap_onset`` and ``gap_extend``,
782
+ which are subtracted from the value of the alignment path every time a gap
783
+ is introduced or extended (respectively).
784
+
785
+ Note that setting ``gap_onset`` and ``gap_extend`` to `np.inf` recovers the second
786
+ method, and disabling knight moves recovers the first.
787
+
788
+ .. [#] Serrà, Joan, Xavier Serra, and Ralph G. Andrzejak.
789
+ "Cross recurrence quantification for cover song identification."
790
+ New Journal of Physics 11, no. 9 (2009): 093017.
791
+
792
+ .. [#] Eckmann, J. P., S. Oliffson Kamphorst, and D. Ruelle.
793
+ "Recurrence plots of dynamical systems."
794
+ World Scientific Series on Nonlinear Science Series A 16 (1995): 441-446.
795
+
796
+ Parameters
797
+ ----------
798
+ sim : np.ndarray [shape=(N, M), non-negative]
799
+ The similarity matrix to use as input.
800
+
801
+ This can either be a recurrence matrix (self-similarity)
802
+ or a cross-similarity matrix between two sequences.
803
+
804
+ gap_onset : float > 0
805
+ Penalty for introducing a gap to an alignment sequence
806
+
807
+ gap_extend : float > 0
808
+ Penalty for extending a gap in an alignment sequence
809
+
810
+ knight_moves : bool
811
+ If ``True`` (default), allow for "knight moves" in the alignment,
812
+ e.g., ``(n, m) => (n + 1, m + 2)`` or ``(n + 2, m + 1)``.
813
+
814
+ If ``False``, only allow for diagonal moves ``(n, m) => (n + 1, m + 1)``.
815
+
816
+ backtrack : bool
817
+ If ``True``, return the alignment path.
818
+
819
+ If ``False``, only return the score matrix.
820
+
821
+ Returns
822
+ -------
823
+ score : np.ndarray [shape=(N, M)]
824
+ The alignment score matrix. ``score[n, m]`` is the cumulative value of
825
+ the best alignment sequence ending in frames ``n`` and ``m``.
826
+ path : np.ndarray [shape=(k, 2)] (optional)
827
+ If ``backtrack=True``, ``path`` contains a list of pairs of aligned frames
828
+ in the best alignment sequence.
829
+
830
+ ``path[i] = [n, m]`` indicates that row ``n`` aligns to column ``m``.
831
+
832
+ See Also
833
+ --------
834
+ librosa.segment.recurrence_matrix
835
+ librosa.segment.cross_similarity
836
+ dtw
837
+
838
+ Examples
839
+ --------
840
+ Simple diagonal path enhancement (L-mode)
841
+
842
+ >>> import numpy as np
843
+ >>> import matplotlib.pyplot as plt
844
+ >>> y, sr = librosa.load(librosa.ex('nutcracker'), duration=30)
845
+ >>> chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
846
+ >>> # Use time-delay embedding to reduce noise
847
+ >>> chroma_stack = librosa.feature.stack_memory(chroma, n_steps=10, delay=3)
848
+ >>> # Build recurrence, suppress self-loops within 1 second
849
+ >>> rec = librosa.segment.recurrence_matrix(chroma_stack, width=43,
850
+ ... mode='affinity',
851
+ ... metric='cosine')
852
+ >>> # using infinite cost for gaps enforces strict path continuation
853
+ >>> L_score, L_path = librosa.sequence.rqa(rec,
854
+ ... gap_onset=np.inf,
855
+ ... gap_extend=np.inf,
856
+ ... knight_moves=False)
857
+ >>> fig, ax = plt.subplots(ncols=2)
858
+ >>> librosa.display.specshow(rec, x_axis='frames', y_axis='frames', ax=ax[0])
859
+ >>> ax[0].set(title='Recurrence matrix')
860
+ >>> librosa.display.specshow(L_score, x_axis='frames', y_axis='frames', ax=ax[1])
861
+ >>> ax[1].set(title='Alignment score matrix')
862
+ >>> ax[1].plot(L_path[:, 1], L_path[:, 0], label='Optimal path', color='c')
863
+ >>> ax[1].legend()
864
+ >>> ax[1].label_outer()
865
+
866
+ Full alignment using gaps and knight moves
867
+
868
+ >>> # New gaps cost 5, extending old gaps cost 10 for each step
869
+ >>> score, path = librosa.sequence.rqa(rec, gap_onset=5, gap_extend=10)
870
+ >>> fig, ax = plt.subplots(ncols=2, sharex=True, sharey=True)
871
+ >>> librosa.display.specshow(rec, x_axis='frames', y_axis='frames', ax=ax[0])
872
+ >>> ax[0].set(title='Recurrence matrix')
873
+ >>> librosa.display.specshow(score, x_axis='frames', y_axis='frames', ax=ax[1])
874
+ >>> ax[1].set(title='Alignment score matrix')
875
+ >>> ax[1].plot(path[:, 1], path[:, 0], label='Optimal path', color='c')
876
+ >>> ax[1].legend()
877
+ >>> ax[1].label_outer()
878
+ """
879
+
880
+ if gap_onset < 0:
881
+ raise ParameterError("gap_onset={} must be strictly positive")
882
+ if gap_extend < 0:
883
+ raise ParameterError("gap_extend={} must be strictly positive")
884
+
885
+ score: np.ndarray
886
+ pointers: np.ndarray
887
+ score, pointers = __rqa_dp(sim, gap_onset, gap_extend, knight_moves)
888
+ if backtrack:
889
+ path = __rqa_backtrack(score, pointers)
890
+ return score, path
891
+
892
+ return score
893
+
894
+
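+ # Worked sketch of the simplest (L-mode) recurrence on a toy input,
+ # for illustration only:
+ #
+ #   sim = np.eye(4)     # a single perfect diagonal of matches
+ #   score = rqa(sim, gap_onset=np.inf, gap_extend=np.inf,
+ #               knight_moves=False, backtrack=False)
+ #   # the diagonal of ``score`` accumulates 1, 2, 3, 4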
895
+ @jit(nopython=True, cache=False) # type: ignore
896
+ def __rqa_dp(
897
+ sim: np.ndarray, gap_onset: float, gap_extend: float, knight: bool
898
+ ) -> Tuple[np.ndarray, np.ndarray]: # pragma: no cover
899
+ """RQA dynamic programming implementation"""
900
+
901
+ # The output array
902
+ score = np.zeros(sim.shape, dtype=sim.dtype)
903
+
904
+ # The backtracking array
905
+ backtrack = np.zeros(sim.shape, dtype=np.int8)
906
+
907
+ # These are place-holder arrays to limit the points being considered
908
+ # at each step of the DP
909
+ #
910
+ # If knight moves are enabled, values are indexed according to
911
+ # [(-1,-1), (-1, -2), (-2, -1)]
912
+ #
913
+ # If knight moves are disabled, then only the first entry is used.
914
+ #
915
+ # Using dummy vectors here makes the code a bit cleaner down below.
916
+ sim_values = np.zeros(3)
917
+ score_values = np.zeros(3)
918
+ vec = np.zeros(3)
919
+
920
+ if knight:
921
+ # Initial limit is for the base case: diagonal + one knight
922
+ init_limit = 2
923
+
924
+ # Otherwise, we have 3 positions
925
+ limit = 3
926
+ else:
927
+ init_limit = 1
928
+ limit = 1
929
+
930
+ # backtracking rubric:
931
+ # 0 ==> diagonal move
932
+ # 1 ==> knight move up
933
+ # 2 ==> knight move left
934
+ # -1 ==> reset without inclusion
935
+ # -2 ==> reset with inclusion (ie positive value at init)
936
+
937
+ # Initialize the first row and column with the data
938
+ score[0, :] = sim[0, :]
939
+ score[:, 0] = sim[:, 0]
940
+
941
+ # backtracking initialization: the first row and column are all resets
942
+ # if there's a positive link here, it's an inclusive reset
943
+ for i in range(sim.shape[0]):
944
+ if sim[i, 0]:
945
+ backtrack[i, 0] = -2
946
+ else:
947
+ backtrack[i, 0] = -1
948
+
949
+ for j in range(sim.shape[1]):
950
+ if sim[0, j]:
951
+ backtrack[0, j] = -2
952
+ else:
953
+ backtrack[0, j] = -1
954
+
955
+ # Initialize the 1-1 case using only the diagonal
956
+ if sim[1, 1] > 0:
957
+ score[1, 1] = score[0, 0] + sim[1, 1]
958
+ backtrack[1, 1] = 0
959
+ else:
960
+ link = sim[0, 0] > 0
961
+ score[1, 1] = max(0, score[0, 0] - (link) * gap_onset - (~link) * gap_extend)
962
+ if score[1, 1] > 0:
963
+ backtrack[1, 1] = 0
964
+ else:
965
+ backtrack[1, 1] = -1
966
+
967
+ # Initialize the second row with diagonal and left-knight moves
968
+ i = 1
969
+ for j in range(2, sim.shape[1]):
970
+ score_values[:-1] = (score[i - 1, j - 1], score[i - 1, j - 2])
971
+ sim_values[:-1] = (sim[i - 1, j - 1], sim[i - 1, j - 2])
972
+ t_values = sim_values > 0
973
+ if sim[i, j] > 0:
974
+ backtrack[i, j] = np.argmax(score_values[:init_limit])
975
+ score[i, j] = score_values[backtrack[i, j]] + sim[i, j] # or + 1 for binary
976
+ else:
977
+ vec[:init_limit] = (
978
+ score_values[:init_limit]
979
+ - t_values[:init_limit] * gap_onset
980
+ - (~t_values[:init_limit]) * gap_extend
981
+ )
982
+
983
+ backtrack[i, j] = np.argmax(vec[:init_limit])
984
+ score[i, j] = max(0, vec[backtrack[i, j]])
985
+ # Is it a reset?
986
+ if score[i, j] == 0:
987
+ backtrack[i, j] = -1
988
+
989
+ # Initialize the second column with diagonal and up-knight moves
990
+ j = 1
991
+ for i in range(2, sim.shape[0]):
992
+ score_values[:-1] = (score[i - 1, j - 1], score[i - 2, j - 1])
993
+ sim_values[:-1] = (sim[i - 1, j - 1], sim[i - 2, j - 1])
994
+ t_values = sim_values > 0
995
+ if sim[i, j] > 0:
996
+ backtrack[i, j] = np.argmax(score_values[:init_limit])
997
+ score[i, j] = score_values[backtrack[i, j]] + sim[i, j] # or + 1 for binary
998
+
999
+ else:
1000
+ vec[:init_limit] = (
1001
+ score_values[:init_limit]
1002
+ - t_values[:init_limit] * gap_onset
1003
+ - (~t_values[:init_limit]) * gap_extend
1004
+ )
1005
+
1006
+ backtrack[i, j] = np.argmax(vec[:init_limit])
1007
+ score[i, j] = max(0, vec[backtrack[i, j]])
1008
+ # Is it a reset?
1009
+ if score[i, j] == 0:
1010
+ backtrack[i, j] = -1
1011
+
1012
+ # Now fill in the rest of the table
1013
+ for i in range(2, sim.shape[0]):
1014
+ for j in range(2, sim.shape[1]):
1015
+ score_values[:] = (
1016
+ score[i - 1, j - 1],
1017
+ score[i - 1, j - 2],
1018
+ score[i - 2, j - 1],
1019
+ )
1020
+ sim_values[:] = (sim[i - 1, j - 1], sim[i - 1, j - 2], sim[i - 2, j - 1])
1021
+ t_values = sim_values > 0
1022
+ if sim[i, j] > 0:
1023
+ # if knight is true, it's max of (-1,-1), (-1, -2), (-2, -1)
1024
+ # otherwise, it's just the diagonal move (-1, -1)
1025
+ # for backtracking purposes, if the max is 0 then it's the start of a new sequence
1026
+ # if the max is non-zero, then we extend the existing sequence
1027
+ backtrack[i, j] = np.argmax(score_values[:limit])
1028
+ score[i, j] = (
1029
+ score_values[backtrack[i, j]] + sim[i, j]
1030
+ ) # or + 1 for binary
1031
+
1032
+ else:
1033
+ # if the max of our options is negative, then it's a hard reset
1034
+ # otherwise, it's a skip move
1035
+ vec[:limit] = (
1036
+ score_values[:limit]
1037
+ - t_values[:limit] * gap_onset
1038
+ - (~t_values[:limit]) * gap_extend
1039
+ )
1040
+
1041
+ backtrack[i, j] = np.argmax(vec[:limit])
1042
+ score[i, j] = max(0, vec[backtrack[i, j]])
1043
+ # Is it a reset?
1044
+ if score[i, j] == 0:
1045
+ backtrack[i, j] = -1
1046
+
1047
+ return score, backtrack
1048
+
1049
+
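+ # In the gap branches of __rqa_dp above, the candidate scores reduce to
+ # (SSA eqs. 5-6):
+ #
+ #   vec[k] = score_prev[k] - gap_onset    if sim_prev[k] > 0 (opening a gap)
+ #   vec[k] = score_prev[k] - gap_extend   otherwise (extending a gap)
+ #
+ # and a cell resets to zero whenever the best candidate is negative.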
1050
+ def __rqa_backtrack(score, pointers):
1051
+ """RQA path backtracking
1052
+
1053
+ Given the score matrix and backtracking index array,
1054
+ reconstruct the optimal path.
1055
+ """
1056
+
1057
+ # backtracking rubric:
1058
+ # 0 ==> diagonal move
1059
+ # 1 ==> knight move up
1060
+ # 2 ==> knight move left
1061
+ # -1 ==> reset (sim = 0)
1062
+ # -2 ==> start of sequence (sim > 0)
1063
+
1064
+ # This array maps the backtracking values to the
1065
+ # relative index offsets
1066
+ offsets = [(-1, -1), (-1, -2), (-2, -1)]
1067
+
1068
+ # Find the maximum to end the path
1069
+ idx = list(np.unravel_index(np.argmax(score), score.shape))
1070
+
1071
+ # Construct the path
1072
+ path: List = []
1073
+ while True:
1074
+ bt_index = pointers[tuple(idx)]
1075
+
1076
+ # A -1 indicates a non-inclusive reset
1077
+ # this can only happen when sim[idx] == 0,
1078
+ # and a reset with zero score should not be included
1079
+ # in the path. In this case, we're done.
1080
+ if bt_index == -1:
1081
+ break
1082
+
1083
+ # Other bt_index values are okay for inclusion
1084
+ path.insert(0, idx)
1085
+
1086
+ # -2 indicates beginning of sequence,
1087
+ # so we can't backtrack any further
1088
+ if bt_index == -2:
1089
+ break
1090
+
1091
+ # Otherwise, prepend this index and continue
1092
+ idx = [idx[_] + offsets[bt_index][_] for _ in range(len(idx))]
1093
+
1094
+ # If there's no alignment path at all, eg an empty cross-similarity
1095
+ # matrix, return a properly shaped and typed array
1096
+ if not path:
1097
+ return np.empty((0, 2), dtype=np.uint)
1098
+
1099
+ return np.asarray(path, dtype=np.uint)
1100
+
1101
+
1102
+ @jit(nopython=True, cache=False) # type: ignore
1103
+ def _viterbi(
1104
+ log_prob: np.ndarray, log_trans: np.ndarray, log_p_init: np.ndarray
1105
+ ) -> Tuple[np.ndarray, np.ndarray]: # pragma: no cover
1106
+ """Core Viterbi algorithm.
1107
+
1108
+ This is intended for internal use only.
1109
+
1110
+ Parameters
1111
+ ----------
1112
+ log_prob : np.ndarray [shape=(T, m)]
1113
+ ``log_prob[t, s]`` is the conditional log-likelihood
1114
+ ``log P[X = X(t) | State(t) = s]``
1115
+ log_trans : np.ndarray [shape=(m, m)]
1116
+ The log transition matrix
1117
+ ``log_trans[i, j] = log P[State(t+1) = j | State(t) = i]``
1118
+ log_p_init : np.ndarray [shape=(m,)]
1119
+ log of the initial state distribution
1120
+
1121
+ Returns
1122
+ -------
1123
+ state : np.ndarray [shape=(T,)]
1125
+ The most likely state sequence.
+ logp : np.ndarray [shape=(1,)]
+ The log probability of the decoded state sequence.
1125
+ """
1126
+ n_steps, n_states = log_prob.shape
1127
+
1128
+ state = np.zeros(n_steps, dtype=np.uint16)
1129
+ value = np.zeros((n_steps, n_states), dtype=np.float64)
1130
+ ptr = np.zeros((n_steps, n_states), dtype=np.uint16)
1131
+
1132
+ # factor in initial state distribution
1133
+ value[0] = log_prob[0] + log_p_init
1134
+
1135
+ for t in range(1, n_steps):
1136
+ # Want V[t, j] <- p[t, j] * max_k V[t-1, k] * A[k, j]
1137
+ # assume at time t-1 we were in state k
1138
+ # transition k -> j
1139
+
1140
+ # Broadcast over rows:
1141
+ # Tout[k, j] = V[t-1, k] * A[k, j]
1142
+ # then take the max over columns
1143
+ # We'll do this in log-space for stability
1144
+
1145
+ trans_out = value[t - 1] + log_trans.T
1146
+
1147
+ # Unroll the max/argmax loop to enable numba support
1148
+ for j in range(n_states):
1149
+ ptr[t, j] = np.argmax(trans_out[j])
1150
+ # value[t, j] = log_prob[t, j] + np.max(trans_out[j])
1151
+ value[t, j] = log_prob[t, j] + trans_out[j, ptr[t][j]]
1152
+
1153
+ # Now roll backward
1154
+
1155
+ # Get the last state
1156
+ state[-1] = np.argmax(value[-1])
1157
+
1158
+ for t in range(n_steps - 2, -1, -1):
1159
+ state[t] = ptr[t + 1, state[t + 1]]
1160
+
1161
+ logp = value[-1:, state[-1]]
1162
+
1163
+ return state, logp
1164
+
1165
+
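+ # The core recurrence above, in log space:
+ #
+ #   value[t, j] = log_prob[t, j] + max_k (value[t-1, k] + log_trans[k, j])
+ #   ptr[t, j]   = argmax_k (value[t-1, k] + log_trans[k, j])
+ #
+ # followed by a backward pass that reads the state sequence out of ``ptr``.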
1166
+ @overload
1167
+ def viterbi(
1168
+ prob: np.ndarray,
1169
+ transition: np.ndarray,
1170
+ *,
1171
+ p_init: Optional[np.ndarray] = ...,
1172
+ return_logp: Literal[True],
1173
+ ) -> Tuple[np.ndarray, np.ndarray]:
1174
+ ...
1175
+
1176
+
1177
+ @overload
1178
+ def viterbi(
1179
+ prob: np.ndarray,
1180
+ transition: np.ndarray,
1181
+ *,
1182
+ p_init: Optional[np.ndarray] = ...,
1183
+ return_logp: Literal[False] = ...,
1184
+ ) -> np.ndarray:
1185
+ ...
1186
+
1187
+
1188
+ def viterbi(
1189
+ prob: np.ndarray,
1190
+ transition: np.ndarray,
1191
+ *,
1192
+ p_init: Optional[np.ndarray] = None,
1193
+ return_logp: bool = False,
1194
+ ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
1195
+ """Viterbi decoding from observation likelihoods.
1196
+
1197
+ Given a sequence of observation likelihoods ``prob[s, t]``,
1198
+ indicating the conditional likelihood of seeing the observation
1199
+ at time ``t`` from state ``s``, and a transition matrix
1200
+ ``transition[i, j]`` which encodes the conditional probability of
1201
+ moving from state ``i`` to state ``j``, the Viterbi algorithm [#]_ computes
1202
+ the most likely sequence of states from the observations.
1203
+
1204
+ .. [#] Viterbi, Andrew. "Error bounds for convolutional codes and an
1205
+ asymptotically optimum decoding algorithm."
1206
+ IEEE transactions on Information Theory 13.2 (1967): 260-269.
1207
+
1208
+ Parameters
1209
+ ----------
1210
+ prob : np.ndarray [shape=(..., n_states, n_steps), non-negative]
1211
+ ``prob[..., s, t]`` is the probability of observation at time ``t``
1212
+ being generated by state ``s``.
1213
+ transition : np.ndarray [shape=(n_states, n_states), non-negative]
1214
+ ``transition[i, j]`` is the probability of a transition from i->j.
1215
+ Each row must sum to 1.
1216
+ p_init : np.ndarray [shape=(n_states,)]
1217
+ Optional: initial state distribution.
1218
+ If not provided, a uniform distribution is assumed.
1219
+ return_logp : bool
1220
+ If ``True``, return the log-likelihood of the state sequence.
1221
+
1222
+ Returns
1223
+ -------
1224
+ Either ``states`` or ``(states, logp)``:
1225
+ states : np.ndarray [shape=(..., n_steps,)]
1226
+ The most likely state sequence.
1227
+ If ``prob`` contains multiple channels of input, then each channel is
1228
+ decoded independently.
1229
+ logp : scalar [float] or np.ndarray
1230
+ If ``return_logp=True``, the log probability of ``states`` given
1231
+ the observations.
1232
+
1233
+ See Also
1234
+ --------
1235
+ viterbi_discriminative : Viterbi decoding from state likelihoods
1236
+
1237
+ Examples
1238
+ --------
1239
+ Example from https://en.wikipedia.org/wiki/Viterbi_algorithm#Example
1240
+
1241
+ In this example, we have two states ``healthy`` and ``fever``, with
1242
+ initial probabilities 60% and 40%.
1243
+
1244
+ We have three observation possibilities: ``normal``, ``cold``, and
1245
+ ``dizzy``, whose probabilities given each state are:
1246
+
1247
+ ``healthy => {normal: 50%, cold: 40%, dizzy: 10%}`` and
1248
+ ``fever => {normal: 10%, cold: 30%, dizzy: 60%}``
1249
+
1250
+ Finally, we have transition probabilities:
1251
+
1252
+ ``healthy => healthy (70%)`` and
1253
+ ``fever => fever (60%)``.
1254
+
1255
+ Over three days, we observe the sequence ``[normal, cold, dizzy]``,
1256
+ and wish to know the maximum likelihood assignment of states for the
1257
+ corresponding days, which we compute with the Viterbi algorithm below.
1258
+
1259
+ >>> p_init = np.array([0.6, 0.4])
1260
+ >>> p_emit = np.array([[0.5, 0.4, 0.1],
1261
+ ... [0.1, 0.3, 0.6]])
1262
+ >>> p_trans = np.array([[0.7, 0.3], [0.4, 0.6]])
1263
+ >>> path, logp = librosa.sequence.viterbi(p_emit, p_trans, p_init=p_init,
1264
+ ... return_logp=True)
1265
+ >>> print(logp, path)
1266
+ -4.19173690823075 [0 0 1]
1267
+ """
1268
+
1269
+ n_states, n_steps = prob.shape[-2:]
1270
+
1271
+ if transition.shape != (n_states, n_states):
1272
+ raise ParameterError(
1273
+ f"transition.shape={transition.shape}, must be "
1274
+ f"(n_states, n_states)={n_states, n_states}"
1275
+ )
1276
+
1277
+ if np.any(transition < 0) or not np.allclose(transition.sum(axis=1), 1):
1278
+ raise ParameterError(
1279
+ "Invalid transition matrix: must be non-negative "
1280
+ "and sum to 1 on each row."
1281
+ )
1282
+
1283
+ if np.any(prob < 0) or np.any(prob > 1):
1284
+ raise ParameterError("Invalid probability values: must be between 0 and 1.")
1285
+
1286
+ # Compute log-likelihoods while avoiding log-underflow
1287
+ epsilon = tiny(prob)
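+ # (tiny() yields the smallest positive normal value for the dtype of
+ # ``prob``, so np.log(0 + epsilon) below stays finite.)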
1288
+
1289
+ if p_init is None:
1290
+ p_init = np.empty(n_states)
1291
+ p_init.fill(1.0 / n_states)
1292
+ elif (
1293
+ np.any(p_init < 0)
1294
+ or not np.allclose(p_init.sum(), 1)
1295
+ or p_init.shape != (n_states,)
1296
+ ):
1297
+ raise ParameterError(f"Invalid initial state distribution: p_init={p_init}")
1298
+
1299
+ log_trans = np.log(transition + epsilon)
1300
+ log_prob = np.log(prob + epsilon)
1301
+ log_p_init = np.log(p_init + epsilon)
1302
+
1303
+ def _helper(lp):
1304
+ # Transpose input
1305
+ _state, logp = _viterbi(lp.T, log_trans, log_p_init)
1306
+ # Transpose outputs for return
1307
+ return _state.T, logp
1308
+
1309
+ states: np.ndarray
1310
+ logp: np.ndarray
1311
+
1312
+ if log_prob.ndim == 2:
1313
+ states, logp = _helper(log_prob)
1314
+ else:
1315
+ # Vectorize the helper
1316
+ __viterbi = np.vectorize(
1317
+ _helper, otypes=[np.uint16, np.float64], signature="(s,t)->(t),(1)"
1318
+ )
1319
+
1320
+ states, logp = __viterbi(log_prob)
1321
+
1322
+ # Flatten out the trailing dimension introduced by vectorization
1323
+ logp = logp[..., 0]
1324
+
1325
+ if return_logp:
1326
+ return states, logp
1327
+
1328
+ return states
1329
+
1330
+
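+ # Multichannel sketch (illustrative, reusing the names from the docstring
+ # example above): stacking along a new leading axis decodes each channel
+ # independently.
+ #
+ #   p_stack = np.stack([p_emit, p_emit])   # shape (2, n_states, n_steps)
+ #   paths = viterbi(p_stack, p_trans)      # shape (2, n_steps)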
1331
+ @overload
1332
+ def viterbi_discriminative(
1333
+ prob: np.ndarray,
1334
+ transition: np.ndarray,
1335
+ *,
1336
+ p_state: Optional[np.ndarray] = ...,
1337
+ p_init: Optional[np.ndarray] = ...,
1338
+ return_logp: Literal[False] = ...,
1339
+ ) -> np.ndarray:
1340
+ ...
1341
+
1342
+
1343
+ @overload
1344
+ def viterbi_discriminative(
1345
+ prob: np.ndarray,
1346
+ transition: np.ndarray,
1347
+ *,
1348
+ p_state: Optional[np.ndarray] = ...,
1349
+ p_init: Optional[np.ndarray] = ...,
1350
+ return_logp: Literal[True],
1351
+ ) -> Tuple[np.ndarray, np.ndarray]:
1352
+ ...
1353
+
1354
+
1355
+ @overload
1356
+ def viterbi_discriminative(
1357
+ prob: np.ndarray,
1358
+ transition: np.ndarray,
1359
+ *,
1360
+ p_state: Optional[np.ndarray] = ...,
1361
+ p_init: Optional[np.ndarray] = ...,
1362
+ return_logp: bool,
1363
+ ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
1364
+ ...
1365
+
1366
+
1367
+ def viterbi_discriminative(
1368
+ prob: np.ndarray,
1369
+ transition: np.ndarray,
1370
+ *,
1371
+ p_state: Optional[np.ndarray] = None,
1372
+ p_init: Optional[np.ndarray] = None,
1373
+ return_logp: bool = False,
1374
+ ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
1375
+ """Viterbi decoding from discriminative state predictions.
1376
+
1377
+ Given a sequence of conditional state predictions ``prob[s, t]``,
1378
+ indicating the conditional likelihood of state ``s`` given the
1379
+ observation at time ``t``, and a transition matrix ``transition[i, j]``
1380
+ which encodes the conditional probability of moving from state ``i``
1381
+ to state ``j``, the Viterbi algorithm computes the most likely sequence
1382
+ of states from the observations.
1383
+
1384
+ This implementation uses the standard Viterbi decoding algorithm
1385
+ for observation likelihood sequences, under the assumption that
1386
+ ``P[Obs(t) | State(t) = s]`` is proportional to
1387
+ ``P[State(t) = s | Obs(t)] / P[State(t) = s]``, where the denominator
1388
+ is the marginal probability of state ``s`` occurring as given by ``p_state``.
1389
+
1390
+ Note that because the constant factor ``P[Obs(t)]`` is dropped from this
1391
+ calculation, the resulting probabilities (or log-probabilities) are not
1392
+ normalized. If using the `return_logp=True` option (see below),
1393
+ be aware that the "probabilities" may not sum to (and may exceed) 1.
1394
+
1395
+ Parameters
1396
+ ----------
1397
+ prob : np.ndarray [shape=(..., n_states, n_steps), non-negative]
1398
+ ``prob[s, t]`` is the probability of state ``s`` conditional on
1399
+ the observation at time ``t``.
1400
+ Must be non-negative and sum to 1 along each column.
1401
+ transition : np.ndarray [shape=(n_states, n_states), non-negative]
1402
+ ``transition[i, j]`` is the probability of a transition from i->j.
1403
+ Each row must sum to 1.
1404
+ p_state : np.ndarray [shape=(n_states,)]
1405
+ Optional: marginal probability distribution over states,
1406
+ must be non-negative and sum to 1.
1407
+ If not provided, a uniform distribution is assumed.
1408
+ p_init : np.ndarray [shape=(n_states,)]
1409
+ Optional: initial state distribution.
1410
+ If not provided, it is assumed to be uniform.
1411
+ return_logp : bool
1412
+ If ``True``, return the log-likelihood of the state sequence.
1413
+
1414
+ Returns
1415
+ -------
1416
+ Either ``states`` or ``(states, logp)``:
1417
+ states : np.ndarray [shape=(..., n_steps,)]
1418
+ The most likely state sequence.
1419
+ If ``prob`` contains multiple input channels,
1420
+ then each channel is decoded independently.
1421
+ logp : scalar [float] or np.ndarray
1422
+ If ``return_logp=True``, the (unnormalized) log probability
1423
+ of ``states`` given the observations.
1424
+
1425
+ See Also
1426
+ --------
1427
+ viterbi :
1428
+ Viterbi decoding from observation likelihoods
1429
+ viterbi_binary :
1430
+ Viterbi decoding for multi-label, conditional state likelihoods
1431
+
1432
+ Examples
1433
+ --------
1434
+ This example constructs a simple, template-based discriminative chord estimator,
1435
+ using CENS chroma as input features.
1436
+
1437
+ .. note:: this chord model is not accurate enough to use in practice. It is only
1438
+ intended to demonstrate how to use discriminative Viterbi decoding.
1439
+
1440
+ >>> # Create templates for major, minor, and no-chord qualities
1441
+ >>> maj_template = np.array([1,0,0, 0,1,0, 0,1,0, 0,0,0])
1442
+ >>> min_template = np.array([1,0,0, 1,0,0, 0,1,0, 0,0,0])
1443
+ >>> N_template = np.array([1,1,1, 1,1,1, 1,1,1, 1,1,1.]) / 4.
1444
+ >>> # Generate the weighting matrix that maps chroma to labels
1445
+ >>> weights = np.zeros((25, 12), dtype=float)
1446
+ >>> labels = ['C:maj', 'C#:maj', 'D:maj', 'D#:maj', 'E:maj', 'F:maj',
1447
+ ... 'F#:maj', 'G:maj', 'G#:maj', 'A:maj', 'A#:maj', 'B:maj',
1448
+ ... 'C:min', 'C#:min', 'D:min', 'D#:min', 'E:min', 'F:min',
1449
+ ... 'F#:min', 'G:min', 'G#:min', 'A:min', 'A#:min', 'B:min',
1450
+ ... 'N']
1451
+ >>> for c in range(12):
1452
+ ... weights[c, :] = np.roll(maj_template, c) # c:maj
1453
+ ... weights[c + 12, :] = np.roll(min_template, c) # c:min
1454
+ >>> weights[-1] = N_template # the last row is the no-chord class
1455
+ >>> # Make a self-loop transition matrix over 25 states
1456
+ >>> trans = librosa.sequence.transition_loop(25, 0.9)
1457
+
1458
+ >>> # Load in audio and make features
1459
+ >>> y, sr = librosa.load(librosa.ex('nutcracker'), duration=15)
1460
+ >>> # Suppress percussive elements
1461
+ >>> y = librosa.effects.harmonic(y, margin=4)
1462
+ >>> chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
1463
+ >>> # Map chroma (observations) to class (state) likelihoods
1464
+ >>> probs = np.exp(weights.dot(chroma)) # P[class | chroma] ~= exp(template' chroma)
1465
+ >>> probs /= probs.sum(axis=0, keepdims=True) # probabilities must sum to 1 in each column
1466
+ >>> # Compute independent frame-wise estimates
1467
+ >>> chords_ind = np.argmax(probs, axis=0)
1468
+ >>> # And viterbi estimates
1469
+ >>> chords_vit = librosa.sequence.viterbi_discriminative(probs, trans)
1470
+
1471
+ >>> # Plot the features and prediction map
1472
+ >>> import matplotlib.pyplot as plt
1473
+ >>> fig, ax = plt.subplots(nrows=2)
1474
+ >>> librosa.display.specshow(chroma, x_axis='time', y_axis='chroma', ax=ax[0])
1475
+ >>> librosa.display.specshow(weights, x_axis='chroma', ax=ax[1])
1476
+ >>> ax[1].set(yticks=np.arange(25) + 0.5, yticklabels=labels, ylabel='Chord')
1477
+
1478
+ >>> # And plot the results
1479
+ >>> fig, ax = plt.subplots()
1480
+ >>> librosa.display.specshow(probs, x_axis='time', cmap='gray', ax=ax)
1481
+ >>> times = librosa.times_like(chords_vit)
1482
+ >>> ax.scatter(times, chords_ind + 0.25, color='lime', alpha=0.5, marker='+',
1483
+ ... s=15, label='Independent')
1484
+ >>> ax.scatter(times, chords_vit - 0.25, color='deeppink', alpha=0.5, marker='o',
1485
+ ... s=15, label='Viterbi')
1486
+ >>> ax.set(yticks=np.unique(chords_vit),
1487
+ ... yticklabels=[labels[i] for i in np.unique(chords_vit)])
1488
+ >>> ax.legend()
1489
+ """
1490
+
1491
+ n_states, n_steps = prob.shape[-2:]
1492
+
1493
+ if transition.shape != (n_states, n_states):
1494
+ raise ParameterError(
1495
+ f"transition.shape={transition.shape}, must be "
1496
+ f"(n_states, n_states)={n_states, n_states}"
1497
+ )
1498
+
1499
+ if np.any(transition < 0) or not np.allclose(transition.sum(axis=1), 1):
1500
+ raise ParameterError(
1501
+ "Invalid transition matrix: must be non-negative "
1502
+ "and sum to 1 on each row."
1503
+ )
1504
+
1505
+ if np.any(prob < 0) or not np.allclose(prob.sum(axis=-2), 1):
1506
+ raise ParameterError(
1507
+ "Invalid probability values: each column must "
1508
+ "sum to 1 and be non-negative"
1509
+ )
1510
+
1511
+ # Compute log-likelihoods while avoiding log-underflow
1512
+ epsilon = tiny(prob)
1513
+
1514
+ # Compute marginal log probabilities while avoiding underflow
1515
+ if p_state is None:
1516
+ p_state = np.empty(n_states)
1517
+ p_state.fill(1.0 / n_states)
1518
+ elif p_state.shape != (n_states,):
1519
+ raise ParameterError(
1520
+ "Marginal distribution p_state must have shape (n_states,). "
1521
+ f"Got p_state.shape={p_state.shape}"
1522
+ )
1523
+ elif np.any(p_state < 0) or not np.allclose(p_state.sum(axis=-1), 1):
1524
+ raise ParameterError(f"Invalid marginal state distribution: p_state={p_state}")
1525
+
1526
+ if p_init is None:
1527
+ p_init = np.empty(n_states)
1528
+ p_init.fill(1.0 / n_states)
1529
+ elif (
1530
+ np.any(p_init < 0)
1531
+ or not np.allclose(p_init.sum(), 1)
1532
+ or p_init.shape != (n_states,)
1533
+ ):
1534
+ raise ParameterError(f"Invalid initial state distribution: p_init={p_init}")
1535
+
1536
+ # By Bayes' rule, P[X | Y] * P[Y] = P[Y | X] * P[X]
1537
+ # P[X] is constant for the sake of maximum likelihood inference
1538
+ # and P[Y] is given by the marginal distribution p_state.
1539
+ #
1540
+ # So we have P[X | y] \propto P[Y | x] / P[Y]
1541
+ # if X = observation and Y = states, this can be done in log space as
1542
+ # log P[X | y] \propto \log P[Y | x] - \log P[Y]
1543
+ log_p_init = np.log(p_init + epsilon)
1544
+ log_trans = np.log(transition + epsilon)
1545
+ log_marginal = np.log(p_state + epsilon)
1546
+
1547
+ # reshape to broadcast against prob
1548
+ log_marginal = expand_to(log_marginal, ndim=prob.ndim, axes=-2)
1549
+
1550
+ log_prob = np.log(prob + epsilon) - log_marginal
1551
+
1552
+ def _helper(lp):
1553
+ # Transpose input
1554
+ _state, logp = _viterbi(lp.T, log_trans, log_p_init)
1555
+ # Transpose outputs for return
1556
+ return _state.T, logp
1557
+
1558
+ states: np.ndarray
1559
+ logp: np.ndarray
1560
+ if log_prob.ndim == 2:
1561
+ states, logp = _helper(log_prob)
1562
+ else:
1563
+ # Vectorize the helper
1564
+ __viterbi = np.vectorize(
1565
+ _helper, otypes=[np.uint16, np.float64], signature="(s,t)->(t),(1)"
1566
+ )
1567
+
1568
+ states, logp = __viterbi(log_prob)
1569
+
1570
+ # Flatten out the trailing dimension
1571
+ logp = logp[..., 0]
1572
+
1573
+ if return_logp:
1574
+ return states, logp
1575
+
1576
+ return states
1577
+
1578
+
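+ # Minimal sketch with hypothetical values: two states over three frames,
+ # each column of ``prob`` summing to 1 as required:
+ #
+ #   prob = np.array([[0.8, 0.6, 0.2],
+ #                    [0.2, 0.4, 0.8]])
+ #   trans = transition_loop(2, 0.9)
+ #   states = viterbi_discriminative(prob, trans)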
1579
+ @overload
1580
+ def viterbi_binary(
1581
+ prob: np.ndarray,
1582
+ transition: np.ndarray,
1583
+ *,
1584
+ p_state: Optional[np.ndarray] = ...,
1585
+ p_init: Optional[np.ndarray] = ...,
1586
+ return_logp: Literal[False] = ...,
1587
+ ) -> np.ndarray:
1588
+ ...
1589
+
1590
+
1591
+ @overload
1592
+ def viterbi_binary(
1593
+ prob: np.ndarray,
1594
+ transition: np.ndarray,
1595
+ *,
1596
+ p_state: Optional[np.ndarray] = ...,
1597
+ p_init: Optional[np.ndarray] = ...,
1598
+ return_logp: Literal[True],
1599
+ ) -> Tuple[np.ndarray, np.ndarray]:
1600
+ ...
1601
+
1602
+
1603
+ @overload
1604
+ def viterbi_binary(
1605
+ prob: np.ndarray,
1606
+ transition: np.ndarray,
1607
+ *,
1608
+ p_state: Optional[np.ndarray] = ...,
1609
+ p_init: Optional[np.ndarray] = ...,
1610
+ return_logp: bool = ...,
1611
+ ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
1612
+ ...
1613
+
1614
+
1615
+ def viterbi_binary(
1616
+ prob: np.ndarray,
1617
+ transition: np.ndarray,
1618
+ *,
1619
+ p_state: Optional[np.ndarray] = None,
1620
+ p_init: Optional[np.ndarray] = None,
1621
+ return_logp: bool = False,
1622
+ ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
1623
+ """Viterbi decoding from binary (multi-label), discriminative state predictions.
1624
+
1625
+ Given a sequence of conditional state predictions ``prob[s, t]``,
1626
+ indicating the conditional likelihood of state ``s`` being active
1627
+ conditional on observation at time ``t``, and a 2x2 transition matrix
1628
+ ``transition`` which encodes the conditional probability of moving from
1629
+ state ``s`` to state ``~s`` (not-``s``), the Viterbi algorithm computes the
1630
+ most likely sequence of states from the observations.
1631
+
1632
+ This function differs from `viterbi_discriminative` in that it does not assume the
1633
+ states to be mutually exclusive. `viterbi_binary` is implemented by
1634
+ transforming the multi-label decoding problem to a collection
1635
+ of binary Viterbi problems (one for each *state* or label).
1636
+
1637
+ The output is a binary matrix ``states[s, t]`` indicating whether each
1638
+ state ``s`` is active at time ``t``.
1639
+
1640
+ Like `viterbi_discriminative`, the probabilities of the optimal state sequences
1641
+ are not normalized here. If using the `return_logp=True` option (see below),
1642
+ be aware that the "probabilities" may not sum to (and may exceed) 1.
1643
+
1644
+ Parameters
1645
+ ----------
1646
+ prob : np.ndarray [shape=(..., n_steps,) or (..., n_states, n_steps)], non-negative
1647
+ ``prob[s, t]`` is the probability of state ``s`` being active
1648
+ conditional on the observation at time ``t``.
1649
+ Must be non-negative and no greater than 1.
1650
+
1651
+ If ``prob`` is 1-dimensional, it is expanded to shape ``(1, n_steps)``.
1652
+
1653
+ If ``prob`` contains multiple input channels, then each channel is decoded independently.
1654
+
1655
+ transition : np.ndarray [shape=(2, 2) or (n_states, 2, 2)], non-negative
1656
+ If 2-dimensional, the same transition matrix is applied to each sub-problem.
1657
+ ``transition[0, i]`` is the probability of the state going from inactive to ``i``,
1658
+ ``transition[1, i]`` is the probability of the state going from active to ``i``.
1659
+ Each row must sum to 1.
1660
+
1661
+ If 3-dimensional, ``transition[s]`` is interpreted as the 2x2 transition matrix
1662
+ for state label ``s``.
1663
+
1664
+ p_state : np.ndarray [shape=(n_states,)]
1665
+ Optional: marginal probability for each state (between [0,1]).
1666
+ If not provided, a uniform distribution (0.5 for each state)
1667
+ is assumed.
1668
+
1669
+ p_init : np.ndarray [shape=(n_states,)]
1670
+ Optional: initial state distribution.
1671
+ If not provided, it is assumed to be uniform.
1672
+
1673
+ return_logp : bool
1674
+ If ``True``, return the (unnormalized) log-likelihood of the state sequences.
1675
+
1676
+ Returns
1677
+ -------
1678
+ Either ``states`` or ``(states, logp)``:
1679
+ states : np.ndarray [shape=(..., n_states, n_steps)]
1680
+ The most likely state sequence.
1681
+ logp : np.ndarray [shape=(..., n_states,)]
1682
+ If ``return_logp=True``, the (unnormalized) log probability of each
1683
+ state activation sequence ``states``
1684
+
1685
+ See Also
1686
+ --------
1687
+ viterbi :
1688
+ Viterbi decoding from observation likelihoods
1689
+ viterbi_discriminative :
1690
+ Viterbi decoding for discriminative (mutually exclusive) state predictions
1691
+
1692
+ Examples
1693
+ --------
1694
+ In this example, we have a sequence of binary state likelihoods that we want to de-noise
1695
+ under the assumption that state changes are relatively uncommon. Positive predictions
1696
+ should only be retained if they persist for multiple steps, and any transient predictions
1697
+ should be considered as errors. This use case arises frequently in problems such as
1698
+ instrument recognition, where state activations tend to be stable over time, but subject
1699
+ to abrupt changes (e.g., when an instrument joins the mix).
1700
+
1701
+ We assume that the 0 state has a self-transition probability of 90%, and the 1 state
1702
+ has a self-transition probability of 70%. We assume the marginal and initial
1703
+ probability of either state is 50%.
1704
+
1705
+ >>> trans = np.array([[0.9, 0.1], [0.3, 0.7]])
1706
+ >>> prob = np.array([0.1, 0.7, 0.4, 0.3, 0.8, 0.9, 0.8, 0.2, 0.6, 0.3])
1707
+ >>> librosa.sequence.viterbi_binary(prob, trans, p_state=0.5, p_init=0.5)
1708
+ array([[0, 0, 0, 0, 1, 1, 1, 0, 0, 0]])
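+
+ The multi-label case decodes each label independently. As a small
+ sketch (the second label here is a hypothetical complement of the
+ first), the output gains one row per label:
+
+ >>> prob2 = np.vstack([prob, 1 - prob])
+ >>> states, logp = librosa.sequence.viterbi_binary(prob2, trans,
+ ... return_logp=True)
+ >>> states.shape
+ (2, 10)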
1709
+ """
1710
+
1711
+ prob = np.atleast_2d(prob)
1712
+
1713
+ n_states, n_steps = prob.shape[-2:]
1714
+
1715
+ if transition.shape == (2, 2):
1716
+ transition = np.tile(transition, (n_states, 1, 1))
1717
+ elif transition.shape != (n_states, 2, 2):
1718
+ raise ParameterError(
1719
+ f"transition.shape={transition.shape}, must be (2, 2) or "
1720
+ f"(n_states, 2, 2)={n_states}"
1721
+ )
1722
+
1723
+ if np.any(transition < 0) or not np.allclose(transition.sum(axis=-1), 1):
1724
+ raise ParameterError(
1725
+ "Invalid transition matrix: must be non-negative "
1726
+ "and sum to 1 on each row."
1727
+ )
1728
+
1729
+ if np.any(prob < 0) or np.any(prob > 1):
1730
+ raise ParameterError("Invalid probability values: prob must be between [0, 1]")
1731
+
1732
+ if p_state is None:
1733
+ p_state = np.empty(n_states)
1734
+ p_state.fill(0.5)
1735
+ else:
1736
+ p_state = np.atleast_1d(p_state)
1737
+
1738
+ assert p_state is not None
1739
+
1740
+ if p_state.shape != (n_states,) or np.any(p_state < 0) or np.any(p_state > 1):
1741
+ raise ParameterError(f"Invalid marginal state distributions: p_state={p_state}")
1742
+
1743
+ if p_init is None:
1744
+ p_init = np.empty(n_states)
1745
+ p_init.fill(0.5)
1746
+ else:
1747
+ p_init = np.atleast_1d(p_init)
1748
+
1749
+ assert p_init is not None
1750
+
1751
+ if p_init.shape != (n_states,) or np.any(p_init < 0) or np.any(p_init > 1):
1752
+ raise ParameterError(f"Invalid initial state distributions: p_init={p_init}")
1753
+
1754
+ shape_prefix = list(prob.shape[:-2])
1755
+ states = np.empty(shape_prefix + [n_states, n_steps], dtype=np.uint16)
1756
+ logp = np.empty(shape_prefix + [n_states])
1757
+
1758
+ prob_binary = np.empty(shape_prefix + [2, n_steps])
1759
+ p_state_binary = np.empty(2)
1760
+ p_init_binary = np.empty(2)
1761
+
1762
+ for state in range(n_states):
1763
+ prob_binary[..., 0, :] = 1 - prob[..., state, :]
1764
+ prob_binary[..., 1, :] = prob[..., state, :]
1765
+
1766
+ p_state_binary[0] = 1 - p_state[state]
1767
+ p_state_binary[1] = p_state[state]
1768
+
1769
+ p_init_binary[0] = 1 - p_init[state]
1770
+ p_init_binary[1] = p_init[state]
1771
+
1772
+ states[..., state, :], logp[..., state] = viterbi_discriminative(
1773
+ prob_binary,
1774
+ transition[state],
1775
+ p_state=p_state_binary,
1776
+ p_init=p_init_binary,
1777
+ return_logp=True,
1778
+ )
1779
+
1780
+ if return_logp:
1781
+ return states, logp
1782
+
1783
+ return states
1784
+
1785
+
1786
+ def transition_uniform(n_states: int) -> np.ndarray:
1787
+ """Construct a uniform transition matrix over ``n_states``.
1788
+
1789
+ Parameters
1790
+ ----------
1791
+ n_states : int > 0
1792
+ The number of states
1793
+
1794
+ Returns
1795
+ -------
1796
+ transition : np.ndarray [shape=(n_states, n_states)]
1797
+ ``transition[i, j] = 1./n_states``
1798
+
1799
+ Examples
1800
+ --------
1801
+ >>> librosa.sequence.transition_uniform(3)
1802
+ array([[0.333, 0.333, 0.333],
1803
+ [0.333, 0.333, 0.333],
1804
+ [0.333, 0.333, 0.333]])
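+
+ Because every transition is equally likely, Viterbi decoding with a
+ uniform matrix reduces to frame-wise maximum-likelihood decoding.
+ A minimal sketch (``prob`` is a hypothetical likelihood matrix):
+
+ >>> prob = np.array([[0.8, 0.2, 0.6], [0.2, 0.8, 0.4]])
+ >>> librosa.sequence.viterbi(prob, librosa.sequence.transition_uniform(2))
+ array([0, 1, 0])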
1805
+ """
1806
+
1807
+ if not is_positive_int(n_states):
1808
+ raise ParameterError(f"n_states={n_states} must be a positive integer")
1809
+
1810
+ transition = np.empty((n_states, n_states), dtype=np.float64)
1811
+ transition.fill(1.0 / n_states)
1812
+ return transition
1813
+
1814
+
1815
+ def transition_loop(n_states: int, prob: Union[float, Iterable[float]]) -> np.ndarray:
1816
+ """Construct a self-loop transition matrix over ``n_states``.
1817
+
1818
+ The transition matrix will have the following properties:
1819
+
1820
+ - ``transition[i, i] = p`` for all ``i``
1821
+ - ``transition[i, j] = (1 - p) / (n_states - 1)`` for all ``j != i``
1822
+
1823
+ This type of transition matrix is appropriate when states tend to be
1824
+ locally stable, and there is no additional structure between different
1825
+ states. This is primarily useful for de-noising frame-wise predictions.
1826
+
1827
+ Parameters
1828
+ ----------
1829
+ n_states : int > 1
1830
+ The number of states
1831
+
1832
+ prob : float in [0, 1] or iterable, length=n_states
1833
+ If a scalar, this is the probability of a self-transition.
1834
+
1835
+ If a vector of length ``n_states``, ``p[i]`` is the probability of self-transition in state ``i``
1836
+
1837
+ Returns
1838
+ -------
1839
+ transition : np.ndarray [shape=(n_states, n_states)]
1840
+ The transition matrix
1841
+
1842
+ Examples
1843
+ --------
1844
+ >>> librosa.sequence.transition_loop(3, 0.5)
1845
+ array([[0.5 , 0.25, 0.25],
1846
+ [0.25, 0.5 , 0.25],
1847
+ [0.25, 0.25, 0.5 ]])
1848
+
1849
+ >>> librosa.sequence.transition_loop(3, [0.8, 0.5, 0.25])
1850
+ array([[0.8 , 0.1 , 0.1 ],
1851
+ [0.25 , 0.5 , 0.25 ],
1852
+ [0.375, 0.375, 0.25 ]])
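+
+ A sketch of the de-noising use case: with sticky self-loops, Viterbi
+ decoding suppresses the transient detections that a frame-wise argmax
+ (here ``[0, 1, 0, 1]``) would keep (``prob`` is a hypothetical
+ likelihood matrix):
+
+ >>> prob = np.array([[0.9, 0.4, 0.8, 0.2], [0.1, 0.6, 0.2, 0.8]])
+ >>> librosa.sequence.viterbi(prob, librosa.sequence.transition_loop(2, 0.9))
+ array([0, 0, 0, 0])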
1853
+ """
1854
+
1855
+ if not (is_positive_int(n_states) and (n_states > 1)):
1856
+ raise ParameterError(f"n_states={n_states} must be a positive integer > 1")
1857
+
1858
+ transition = np.empty((n_states, n_states), dtype=np.float64)
1859
+
1860
+ # if it's a float, make it a vector
1861
+ prob = np.asarray(prob, dtype=np.float64)
1862
+
1863
+ if prob.ndim == 0:
1864
+ prob = np.tile(prob, n_states)
1865
+
1866
+ if prob.shape != (n_states,):
1867
+ raise ParameterError(
1868
+ f"prob={prob} must have length equal to n_states={n_states}"
1869
+ )
1870
+
1871
+ if np.any(prob < 0) or np.any(prob > 1):
1872
+ raise ParameterError(f"prob={prob} must have values in the range [0, 1]")
1873
+
1874
+ for i, prob_i in enumerate(prob):
1875
+ transition[i] = (1.0 - prob_i) / (n_states - 1)
1876
+ transition[i, i] = prob_i
1877
+
1878
+ return transition
1879
+
1880
+
1881
+ def transition_cycle(n_states: int, prob: Union[float, Iterable[float]]) -> np.ndarray:
1882
+ """Construct a cyclic transition matrix over ``n_states``.
1883
+
1884
+ The transition matrix will have the following properties:
1885
+
1886
+ - ``transition[i, i] = p``
1887
+ - ``transition[i, i + 1] = (1 - p)``
1888
+
1889
+ This type of transition matrix is appropriate for state spaces
1890
+ with cyclical structure, such as metrical position within a bar.
1891
+ For example, a song in 4/4 time has state transitions of the form
1892
+
1893
+ 1->{1, 2}, 2->{2, 3}, 3->{3, 4}, 4->{4, 1}.
1894
+
1895
+ Parameters
1896
+ ----------
1897
+ n_states : int > 1
1898
+ The number of states
1899
+
1900
+ prob : float in [0, 1] or iterable, length=n_states
1901
+ If a scalar, this is the probability of a self-transition.
1902
+
1903
+ If a vector of length ``n_states``, ``p[i]`` is the probability of
1904
+ self-transition in state ``i``
1905
+
1906
+ Returns
1907
+ -------
1908
+ transition : np.ndarray [shape=(n_states, n_states)]
1909
+ The transition matrix
1910
+
1911
+ Examples
1912
+ --------
1913
+ >>> librosa.sequence.transition_cycle(4, 0.9)
1914
+ array([[0.9, 0.1, 0. , 0. ],
1915
+ [0. , 0.9, 0.1, 0. ],
1916
+ [0. , 0. , 0.9, 0.1],
1917
+ [0.1, 0. , 0. , 0.9]])
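+
+ Per-state self-loop probabilities are also supported; for example,
+ with arbitrarily chosen values:
+
+ >>> librosa.sequence.transition_cycle(3, [0.6, 0.7, 0.8])
+ array([[0.6, 0.4, 0. ],
+ [0. , 0.7, 0.3],
+ [0.2, 0. , 0.8]])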
1918
+ """
1919
+
1920
+ if not (is_positive_int(n_states) and n_states > 1):
1921
+ raise ParameterError(f"n_states={n_states} must be a positive integer > 1")
1922
+
1923
+ transition = np.zeros((n_states, n_states), dtype=np.float64)
1924
+
1925
+ # if it's a float, make it a vector
1926
+ prob = np.asarray(prob, dtype=np.float64)
1927
+
1928
+ if prob.ndim == 0:
1929
+ prob = np.tile(prob, n_states)
1930
+
1931
+ if prob.shape != (n_states,):
1932
+ raise ParameterError(
1933
+ f"prob={prob} must have length equal to n_states={n_states}"
1934
+ )
1935
+
1936
+ if np.any(prob < 0) or np.any(prob > 1):
1937
+ raise ParameterError(f"prob={prob} must have values in the range [0, 1]")
1938
+
1939
+ for i, prob_i in enumerate(prob):
1940
+ transition[i, np.mod(i + 1, n_states)] = 1.0 - prob_i
1941
+ transition[i, i] = prob_i
1942
+
1943
+ return transition
1944
+
1945
+
1946
+ def transition_local(
1947
+ n_states: int,
1948
+ width: Union[int, Iterable[int]],
1949
+ *,
1950
+ window: _WindowSpec = "triangle",
1951
+ wrap: bool = False,
1952
+ ) -> np.ndarray:
1953
+ """Construct a localized transition matrix.
1954
+
1955
+ The transition matrix will have the following properties:
1956
+
1957
+ - ``transition[i, j] = 0`` if ``|i - j| > width``
1958
+ - ``transition[i, i]`` is maximal
1959
+ - ``transition[i, i - width//2 : i + width//2]`` follows the shape of ``window``
1960
+
1961
+ This type of transition matrix is appropriate for state spaces
1962
+ that discretely approximate continuous variables, such as in fundamental
1963
+ frequency estimation.
1964
+
1965
+ Parameters
1966
+ ----------
1967
+ n_states : int > 1
1968
+ The number of states
1969
+
1970
+ width : int >= 1 or iterable
1971
+ The maximum number of states to treat as "local".
1972
+ If iterable, it should have length equal to ``n_states``,
1973
+ and specify the width independently for each state.
1974
+
1975
+ window : str, callable, or window specification
1976
+ The window function to determine the shape of the "local" distribution.
1977
+
1978
+ Any window specification supported by `filters.get_window` will work here.
1979
+
1980
+ .. note:: Certain windows (e.g., 'hann') are identically 0 at the boundaries,
1981
+ and so effectively have ``width-2`` non-zero values. You may have to expand
1982
+ ``width`` to get the desired behavior.
1983
+
1984
+ wrap : bool
1985
+ If ``True``, then state locality ``|i - j|`` is computed modulo ``n_states``.
1986
+ If ``False`` (default), then locality is absolute.
1987
+
1988
+ See Also
1989
+ --------
1990
+ librosa.filters.get_window
1991
+
1992
+ Returns
1993
+ -------
1994
+ transition : np.ndarray [shape=(n_states, n_states)]
1995
+ The transition matrix
1996
+
1997
+ Examples
1998
+ --------
1999
+ Triangular distributions with and without wrapping
2000
+
2001
+ >>> librosa.sequence.transition_local(5, 3, window='triangle', wrap=False)
2002
+ array([[0.667, 0.333, 0. , 0. , 0. ],
2003
+ [0.25 , 0.5 , 0.25 , 0. , 0. ],
2004
+ [0. , 0.25 , 0.5 , 0.25 , 0. ],
2005
+ [0. , 0. , 0.25 , 0.5 , 0.25 ],
2006
+ [0. , 0. , 0. , 0.333, 0.667]])
2007
+
2008
+ >>> librosa.sequence.transition_local(5, 3, window='triangle', wrap=True)
2009
+ array([[0.5 , 0.25, 0. , 0. , 0.25],
2010
+ [0.25, 0.5 , 0.25, 0. , 0. ],
2011
+ [0. , 0.25, 0.5 , 0.25, 0. ],
2012
+ [0. , 0. , 0.25, 0.5 , 0.25],
2013
+ [0.25, 0. , 0. , 0.25, 0.5 ]])
2014
+
2015
+ Uniform local distributions with variable widths and no wrapping
2016
+
2017
+ >>> librosa.sequence.transition_local(5, [1, 2, 3, 3, 1], window='ones', wrap=False)
2018
+ array([[1. , 0. , 0. , 0. , 0. ],
2019
+ [0.5 , 0.5 , 0. , 0. , 0. ],
2020
+ [0. , 0.333, 0.333, 0.333, 0. ],
2021
+ [0. , 0. , 0.333, 0.333, 0.333],
2022
+ [0. , 0. , 0. , 0. , 1. ]])
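+
+ As noted above, a window that vanishes at its boundaries effectively
+ narrows the neighborhood: with ``width=3``, a 'hann' window degenerates
+ to a pure self-loop (a cautionary sketch):
+
+ >>> librosa.sequence.transition_local(5, 3, window='hann', wrap=False)
+ array([[1., 0., 0., 0., 0.],
+ [0., 1., 0., 0., 0.],
+ [0., 0., 1., 0., 0.],
+ [0., 0., 0., 1., 0.],
+ [0., 0., 0., 0., 1.]])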
2023
+ """
2024
+
2025
+ if not (is_positive_int(n_states) and n_states > 1):
2026
+ raise ParameterError(f"n_states={n_states} must be a positive integer > 1")
2027
+
2028
+ width = np.asarray(width, dtype=int)
2029
+ if width.ndim == 0:
2030
+ width = np.tile(width, n_states)
2031
+
2032
+ if width.shape != (n_states,):
2033
+ raise ParameterError(
2034
+ f"width={width} must have length equal to n_states={n_states}"
2035
+ )
2036
+
2037
+ if np.any(width < 1):
2038
+ raise ParameterError(f"width={width} must be at least 1")
2039
+
2040
+ transition = np.zeros((n_states, n_states), dtype=np.float64)
2041
+
2042
+ # Fill in the widths. This is inefficient, but simple
2043
+ for i, width_i in enumerate(width):
2044
+ trans_row = pad_center(
2045
+ get_window(window, width_i, fftbins=False), size=n_states
2046
+ )
2047
+ trans_row = np.roll(trans_row, n_states // 2 + i + 1)
2048
+
2049
+ if not wrap:
2050
+ # Knock out the off-diagonal-band elements
2051
+ trans_row[min(n_states, i + width_i // 2 + 1) :] = 0
2052
+ trans_row[: max(0, i - width_i // 2)] = 0
2053
+
2054
+ transition[i] = trans_row
2055
+
2056
+ # Row-normalize
2057
+ transition /= transition.sum(axis=1, keepdims=True)
2058
+
2059
+ return transition
utils.py ADDED
@@ -0,0 +1,316 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """Feature manipulation utilities"""
4
+
5
+ import numpy as np
6
+ import scipy.signal
7
+ from numba import jit
8
+
9
+ from .._cache import cache
10
+ from ..util.exceptions import ParameterError
11
+ from typing import Any
12
+
13
+ __all__ = ["delta", "stack_memory"]
14
+
15
+
16
+ @cache(level=40)
17
+ def delta(
18
+ data: np.ndarray,
19
+ *,
20
+ width: int = 9,
21
+ order: int = 1,
22
+ axis: int = -1,
23
+ mode: str = "interp",
24
+ **kwargs: Any,
25
+ ) -> np.ndarray:
26
+ r"""Compute delta features: local estimate of the derivative
27
+ of the input data along the selected axis.
28
+
29
+ Delta features are computed by Savitzky-Golay filtering.
30
+
31
+ Parameters
32
+ ----------
33
+ data : np.ndarray
34
+ the input data matrix (eg, spectrogram)
35
+
36
+ width : int, positive, odd [scalar]
37
+ Number of frames over which to compute the delta features.
38
+ Cannot exceed the length of ``data`` along the specified axis.
39
+
40
+ If ``mode='interp'``, then ``width`` must not exceed ``data.shape[axis]``.
41
+
42
+ order : int > 0 [scalar]
43
+ the order of the difference operator.
44
+ 1 for first derivative, 2 for second, etc.
45
+
46
+ axis : int [scalar]
47
+ the axis along which to compute deltas.
48
+ Default is -1 (columns).
49
+
50
+ mode : str, {'interp', 'nearest', 'mirror', 'constant', 'wrap'}
51
+ Padding mode for estimating differences at the boundaries.
52
+
53
+ **kwargs : additional keyword arguments
54
+ See `scipy.signal.savgol_filter`
55
+
56
+ Returns
57
+ -------
58
+ delta_data : np.ndarray [shape=(..., t)]
59
+ delta matrix of ``data`` at specified order
60
+
61
+ Notes
62
+ -----
63
+ This function caches at level 40.
64
+
65
+ See Also
66
+ --------
67
+ scipy.signal.savgol_filter
68
+
69
+ Examples
70
+ --------
71
+ Compute MFCC deltas, delta-deltas
72
+
73
+ >>> y, sr = librosa.load(librosa.ex('libri1'), duration=5)
74
+ >>> mfcc = librosa.feature.mfcc(y=y, sr=sr)
75
+ >>> mfcc_delta = librosa.feature.delta(mfcc)
76
+ >>> mfcc_delta
77
+ array([[-5.713e+02, -5.697e+02, ..., -1.522e+02, -1.224e+02],
78
+ [ 1.104e+01, 1.330e+01, ..., 2.089e+02, 1.698e+02],
79
+ ...,
80
+ [ 2.829e+00, 1.933e+00, ..., -3.149e+00, 2.294e-01],
81
+ [ 2.890e+00, 2.187e+00, ..., 6.959e+00, -1.039e+00]],
82
+ dtype=float32)
83
+
84
+ >>> mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
85
+ >>> mfcc_delta2
86
+ array([[-1.195, -1.195, ..., -4.328, -4.328],
87
+ [-1.566, -1.566, ..., -9.949, -9.949],
88
+ ...,
89
+ [ 0.707, 0.707, ..., 2.287, 2.287],
90
+ [ 0.655, 0.655, ..., -1.719, -1.719]], dtype=float32)
91
+
92
+ >>> import matplotlib.pyplot as plt
93
+ >>> fig, ax = plt.subplots(nrows=3, sharex=True, sharey=True)
94
+ >>> img1 = librosa.display.specshow(mfcc, ax=ax[0], x_axis='time')
95
+ >>> ax[0].set(title='MFCC')
96
+ >>> ax[0].label_outer()
97
+ >>> img2 = librosa.display.specshow(mfcc_delta, ax=ax[1], x_axis='time')
98
+ >>> ax[1].set(title=r'MFCC-$\Delta$')
99
+ >>> ax[1].label_outer()
100
+ >>> img3 = librosa.display.specshow(mfcc_delta2, ax=ax[2], x_axis='time')
101
+ >>> ax[2].set(title=r'MFCC-$\Delta^2$')
102
+ >>> fig.colorbar(img1, ax=[ax[0]])
103
+ >>> fig.colorbar(img2, ax=[ax[1]])
104
+ >>> fig.colorbar(img3, ax=[ax[2]])
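+
+ As a quick sanity check on synthetic input, the delta of a linear ramp
+ is constant (a minimal sketch):
+
+ >>> librosa.feature.delta(np.arange(10, dtype=float), width=3)
+ array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])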
105
+ """
106
+
107
+ data = np.atleast_1d(data)
108
+
109
+ if mode == "interp" and width > data.shape[axis]:
110
+ raise ParameterError(
111
+ f"when mode='interp', width={width} "
112
+ f"cannot exceed data.shape[axis]={data.shape[axis]}"
113
+ )
114
+
115
+ if width < 3 or np.mod(width, 2) != 1:
116
+ raise ParameterError("width must be an odd integer >= 3")
117
+
118
+ if order <= 0 or not isinstance(order, (int, np.integer)):
119
+ raise ParameterError("order must be a positive integer")
120
+
121
+ kwargs.pop("deriv", None)
122
+ kwargs.setdefault("polyorder", order)
123
+ result: np.ndarray = scipy.signal.savgol_filter(
124
+ data, width, deriv=order, axis=axis, mode=mode, **kwargs
125
+ )
126
+ return result
127
+
128
+
129
+ @cache(level=40)
130
+ def stack_memory(
131
+ data: np.ndarray, *, n_steps: int = 2, delay: int = 1, **kwargs: Any
132
+ ) -> np.ndarray:
133
+ """Short-term history embedding: vertically concatenate a data
134
+ vector or matrix with delayed copies of itself.
135
+
136
+ Each column ``data[:, i]`` is mapped to::
137
+
138
+ data[..., i] -> [data[..., i],
139
+ data[..., i - delay],
140
+ ...
141
+ data[..., i - (n_steps-1)*delay]]
142
+
143
+ For columns ``i < (n_steps - 1) * delay``, the data will be padded.
144
+ By default, the data is padded with zeros, but this behavior can be
145
+ overridden by supplying additional keyword arguments which are passed
146
+ to `np.pad()`.
147
+
148
+ Parameters
149
+ ----------
150
+ data : np.ndarray [shape=(..., d, t)]
151
+ Input data matrix. If ``data`` is a vector (``data.ndim == 1``),
152
+ it will be interpreted as a row matrix and reshaped to ``(1, t)``.
153
+
154
+ n_steps : int > 0 [scalar]
155
+ embedding dimension, the number of steps back in time to stack
156
+
157
+ delay : int != 0 [scalar]
158
+ the number of columns to step.
159
+
160
+ Positive values embed from the past (previous columns).
161
+
162
+ Negative values embed from the future (subsequent columns).
163
+
164
+ **kwargs : additional keyword arguments
165
+ Additional arguments to pass to `numpy.pad`
166
+
167
+ Returns
168
+ -------
169
+ data_history : np.ndarray [shape=(..., m * d, t)]
170
+ data augmented with lagged copies of itself,
171
+ where ``m == n_steps``.
172
+
173
+ Notes
174
+ -----
175
+ This function caches at level 40.
176
+
177
+ Examples
178
+ --------
179
+ Keep two steps (current and previous)
180
+
181
+ >>> data = np.arange(-3, 3)
182
+ >>> librosa.feature.stack_memory(data)
183
+ array([[-3, -2, -1, 0, 1, 2],
184
+ [ 0, -3, -2, -1, 0, 1]])
185
+
186
+ Or three steps
187
+
188
+ >>> librosa.feature.stack_memory(data, n_steps=3)
189
+ array([[-3, -2, -1, 0, 1, 2],
190
+ [ 0, -3, -2, -1, 0, 1],
191
+ [ 0, 0, -3, -2, -1, 0]])
192
+
193
+ Use reflection padding instead of zero-padding
194
+
195
+ >>> librosa.feature.stack_memory(data, n_steps=3, mode='reflect')
196
+ array([[-3, -2, -1, 0, 1, 2],
197
+ [-2, -3, -2, -1, 0, 1],
198
+ [-1, -2, -3, -2, -1, 0]])
199
+
200
+ Or pad with edge-values, and delay by 2
201
+
202
+ >>> librosa.feature.stack_memory(data, n_steps=3, delay=2, mode='edge')
203
+ array([[-3, -2, -1, 0, 1, 2],
204
+ [-3, -3, -3, -2, -1, 0],
205
+ [-3, -3, -3, -3, -3, -2]])
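+
+ A negative ``delay`` embeds from the future instead, so the padding
+ appears at the end (a small illustrative sketch):
+
+ >>> librosa.feature.stack_memory(data, delay=-1)
+ array([[-3, -2, -1, 0, 1, 2],
+ [-2, -1, 0, 1, 2, 0]])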
206
+
207
+ Stack time-lagged beat-synchronous chroma with edge padding
208
+
209
+ >>> y, sr = librosa.load(librosa.ex('sweetwaltz'), duration=10)
210
+ >>> chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
211
+ >>> tempo, beats = librosa.beat.beat_track(y=y, sr=sr, hop_length=512)
212
+ >>> beats = librosa.util.fix_frames(beats, x_min=0)
213
+ >>> chroma_sync = librosa.util.sync(chroma, beats)
214
+ >>> chroma_lag = librosa.feature.stack_memory(chroma_sync, n_steps=3,
215
+ ... mode='edge')
216
+
217
+ Plot the result
218
+
219
+ >>> import matplotlib.pyplot as plt
220
+ >>> fig, ax = plt.subplots()
221
+ >>> beat_times = librosa.frames_to_time(beats, sr=sr, hop_length=512)
222
+ >>> librosa.display.specshow(chroma_lag, y_axis='chroma', x_axis='time',
223
+ ... x_coords=beat_times, ax=ax)
224
+ >>> ax.text(1.0, 1/6, "Lag=0", transform=ax.transAxes, rotation=-90, ha="left", va="center")
225
+ >>> ax.text(1.0, 3/6, "Lag=1", transform=ax.transAxes, rotation=-90, ha="left", va="center")
226
+ >>> ax.text(1.0, 5/6, "Lag=2", transform=ax.transAxes, rotation=-90, ha="left", va="center")
227
+ >>> ax.set(title='Time-lagged chroma', ylabel="")
228
+ """
229
+
230
+ if n_steps < 1:
231
+ raise ParameterError("n_steps must be a positive integer")
232
+
233
+ if delay == 0:
234
+ raise ParameterError("delay must be a non-zero integer")
235
+
236
+ data = np.atleast_2d(data)
237
+ t = data.shape[-1]
238
+
239
+ if t < 1:
240
+ raise ParameterError(
241
+ "Cannot stack memory when input data has "
242
+ f"no columns. Given data.shape={data.shape}"
243
+ )
244
+ kwargs.setdefault("mode", "constant")
245
+
246
+ if kwargs["mode"] == "constant":
247
+ kwargs.setdefault("constant_values", [0])
248
+
249
+ padding = [(0, 0) for _ in range(data.ndim)]
250
+
251
+ # Pad the front (delay > 0) or the end (delay < 0) of the time axis
252
+ if delay > 0:
253
+ padding[-1] = (int((n_steps - 1) * delay), 0)
254
+ else:
255
+ padding[-1] = (0, int((n_steps - 1) * -delay))
256
+
257
+ data = np.pad(data, padding, **kwargs)
258
+
259
+ # Construct the shape of the target array
260
+ shape = list(data.shape)
261
+ shape[-2] = shape[-2] * n_steps
262
+ shape[-1] = t
263
+ shape = tuple(shape)
264
+
265
+ # Construct the output array to match layout and dtype of input
266
+ history = np.empty_like(data, shape=shape)
267
+
268
+ # Populate the output array
269
+ __stack(history, data, n_steps, delay)
270
+
271
+ return history
272
+
273
+
274
+ @jit(nopython=True, cache=False)
275
+ def __stack(history, data, n_steps, delay):
276
+ """Memory-stacking helper function.
277
+
278
+ Parameters
279
+ ----------
280
+ history : output array (at least 2-dimensional)
281
+ data : pre-padded input array (at least 2-dimensional)
282
+ n_steps : int > 0, the number of steps to stack
283
+ delay : int != 0, the amount of delay between steps
284
+
285
+ Returns
286
+ -------
287
+ None
288
+ Output is stored directly in the history array
289
+ """
290
+ # Dimension of each copy of the data
291
+ d = data.shape[-2]
292
+
293
+ # Total number of time-steps to output
294
+ t = history.shape[-1]
295
+
296
+ if delay > 0:
297
+ for step in range(n_steps):
298
+ q = n_steps - 1 - step
299
+ # nth block is original shifted left by n*delay steps
300
+ history[..., step * d : (step + 1) * d, :] = data[
301
+ ..., q * delay : q * delay + t
302
+ ]
303
+ else:
304
+ # Handle the last block separately to avoid -t:0 empty slices
305
+ history[..., -d:, :] = data[..., -t:]
306
+
307
+ for step in range(n_steps - 1):
308
+ # nth block is original shifted right by n*delay steps
309
+ q = n_steps - 1 - step
310
+ history[..., step * d : (step + 1) * d, :] = data[
311
+ ..., -t + q * delay : q * delay
312
+ ]
313
+
314
+
315
+
316
+