renator committed on
Commit
8d0c7f8
1 Parent(s): aaa69e0

fix build issue and env

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. Dockerfile +1 -0
  3. pitch.py +952 -0
  4. utils/utils.py +3 -5
.gitignore CHANGED
@@ -1,3 +1,4 @@
1
  venv
2
  env
 
3
  __pycache__
 
1
  venv
2
  env
3
+ accounts
4
  __pycache__
Dockerfile CHANGED
@@ -33,6 +33,7 @@ COPY utils.py /usr/local/lib/python3.10/site-packages/librosa/feature/utils.py
33
  COPY utils/utils.py /usr/local/lib/python3.10/site-packages/librosa/util/utils.py
34
  COPY matching.py /usr/local/lib/python3.10/site-packages/librosa/util/matching.py
35
  COPY spectrum.py /usr/local/lib/python3.10/site-packages/librosa/core/spectrum.py
 
36
  # RUN cd /tmp && mkdir cache1
37
 
38
  ENV NUMBA_CACHE_DIR=/tmp
 
33
  COPY utils/utils.py /usr/local/lib/python3.10/site-packages/librosa/util/utils.py
34
  COPY matching.py /usr/local/lib/python3.10/site-packages/librosa/util/matching.py
35
  COPY spectrum.py /usr/local/lib/python3.10/site-packages/librosa/core/spectrum.py
36
+ COPY pitch.py /usr/local/lib/python3.10/site-packages/librosa/core/pitch.py
37
  # RUN cd /tmp && mkdir cache1
38
 
39
  ENV NUMBA_CACHE_DIR=/tmp
pitch.py ADDED
@@ -0,0 +1,952 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """Pitch-tracking and tuning estimation"""
4
+
5
+ import warnings
6
+ import numpy as np
7
+ import scipy
8
+ import numba
9
+
10
+
11
+ from .spectrum import _spectrogram
12
+ from . import convert
13
+ from .._cache import cache
14
+ from .. import util
15
+ from .. import sequence
16
+ from ..util.exceptions import ParameterError
17
+ from numpy.typing import ArrayLike
18
+ from typing import Any, Callable, Optional, Tuple, Union
19
+ from .._typing import _WindowSpec, _PadMode, _PadModeSTFT
20
+
21
+ __all__ = ["estimate_tuning", "pitch_tuning", "piptrack", "yin", "pyin"]
22
+
23
+
24
def estimate_tuning(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    S: Optional[np.ndarray] = None,
    n_fft: Optional[int] = 2048,
    resolution: float = 0.01,
    bins_per_octave: int = 12,
    **kwargs: Any,
) -> float:
    """Estimate the tuning deviation of an audio signal or a spectrogram.

    Pitch candidates are extracted with `piptrack`.  Candidates with a
    positive frequency whose magnitude is at least the median magnitude of
    all positive-frequency candidates are then handed to `pitch_tuning`.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)] or None
        audio signal. Multi-channel is supported.
    sr : number > 0 [scalar]
        audio sampling rate of ``y``
    S : np.ndarray [shape=(..., d, t)] or None
        magnitude or power spectrogram
    n_fft : int > 0 [scalar] or None
        number of FFT bins to use, if ``y`` is provided.
    resolution : float in `(0, 1)`
        Resolution of the tuning as a fraction of a bin.
        0.01 corresponds to measurements in cents.
    bins_per_octave : int > 0 [scalar]
        How many frequency bins per octave
    **kwargs : additional keyword arguments
        Additional arguments passed to `piptrack`

    Returns
    -------
    tuning: float in `[-0.5, 0.5)`
        estimated tuning deviation (fractions of a bin).

        For multichannel input, one single estimate is produced from the
        pitch candidates of all channels together.

    See Also
    --------
    piptrack : Pitch tracking by parabolic interpolation

    Examples
    --------
    With time-series input

    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> librosa.estimate_tuning(y=y, sr=sr)
    -0.08000000000000002

    In tenths of a cent

    >>> librosa.estimate_tuning(y=y, sr=sr, resolution=1e-3)
    -0.016000000000000014

    Using spectrogram input

    >>> S = np.abs(librosa.stft(y))
    >>> librosa.estimate_tuning(S=S, sr=sr)
    -0.08000000000000002

    Using pass-through arguments to `librosa.piptrack`

    >>> librosa.estimate_tuning(y=y, sr=sr, n_fft=8192,
    ...                         fmax=librosa.note_to_hz('G#9'))
    -0.08000000000000002
    """
    freqs, magnitudes = piptrack(y=y, sr=sr, S=S, n_fft=n_fft, **kwargs)

    # Bins without a detected pitch carry a frequency of 0; ignore them
    detected = freqs > 0

    # Median magnitude over the detected pitches, or 0 if there are none
    mag_floor = np.median(magnitudes[detected]) if detected.any() else 0.0

    # Keep only the detected pitches that are at least as strong as the median
    strong = (magnitudes >= mag_floor) & detected

    return pitch_tuning(
        freqs[strong],
        resolution=resolution,
        bins_per_octave=bins_per_octave,
    )
107
+
108
+
109
def pitch_tuning(
    frequencies: ArrayLike, *, resolution: float = 0.01, bins_per_octave: int = 12
) -> float:
    """Estimate the tuning offset of a collection of pitches.

    The offset is expressed in fractions of a bin, relative to A440=440.0Hz.

    Parameters
    ----------
    frequencies : array-like, float
        A collection of frequencies detected in the signal.
        See `piptrack`
    resolution : float in `(0, 1)`
        Resolution of the tuning as a fraction of a bin.
        0.01 corresponds to cents.
    bins_per_octave : int > 0 [scalar]
        How many frequency bins per octave

    Returns
    -------
    tuning: float in `[-0.5, 0.5)`
        estimated tuning deviation (fractions of a bin).
        If no positive frequency is supplied, a warning is issued and
        ``0.0`` is returned.

    See Also
    --------
    estimate_tuning : Estimating tuning from time-series or spectrogram input

    Examples
    --------
    >>> # Generate notes at +25 cents
    >>> freqs = librosa.cqt_frequencies(n_bins=24, fmin=55, tuning=0.25)
    >>> librosa.pitch_tuning(freqs)
    0.25

    >>> # Track frequencies from a real spectrogram
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> freqs, times, mags = librosa.reassigned_spectrogram(y, sr=sr,
    ...                                                     fill_nan=True)
    >>> # Select out pitches with high energy
    >>> freqs = freqs[mags > np.median(mags)]
    >>> librosa.pitch_tuning(freqs)
    -0.07

    """
    freqs = np.atleast_1d(frequencies)

    # Drop DC (and any other non-positive) entries
    freqs = freqs[freqs > 0]

    if not np.any(freqs):
        warnings.warn(
            "Trying to estimate tuning from empty frequency set.", stacklevel=2
        )
        return 0.0

    # Fractional part of each frequency's position on the bin grid
    bin_positions = bins_per_octave * convert.hz_to_octs(freqs)
    deviation = np.mod(bin_positions, 1.0)

    # A deviation of 0.95 is better read as -0.05 from the next bin up,
    # so fold the upper half of the range down by one bin.
    deviation[deviation >= 0.5] -= 1.0

    # Histogram the deviations over [-0.5, 0.5] at the requested resolution
    n_edges = int(np.ceil(1.0 / resolution)) + 1
    edges = np.linspace(-0.5, 0.5, n_edges)
    hist, edges = np.histogram(deviation, edges)

    # The estimate is the left edge of the most populated histogram bin
    best: float = edges[np.argmax(hist)]
    return best
179
+
180
+
181
@cache(level=30)
def piptrack(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    S: Optional[np.ndarray] = None,
    n_fft: Optional[int] = 2048,
    hop_length: Optional[int] = None,
    fmin: float = 150.0,
    fmax: float = 4000.0,
    threshold: float = 0.1,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    pad_mode: _PadModeSTFT = "constant",
    ref: Optional[Union[float, Callable]] = None,
) -> Tuple[np.ndarray, np.ndarray]:
    """Pitch tracking on thresholded parabolically-interpolated STFT.

    This implementation uses the parabolic interpolation method described by [#]_.

    .. [#] https://ccrma.stanford.edu/~jos/sasp/Sinusoidal_Peak_Interpolation.html

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)] or None
        audio signal. Multi-channel is supported.

    sr : number > 0 [scalar]
        audio sampling rate of ``y``

    S : np.ndarray [shape=(..., d, t)] or None
        magnitude or power spectrogram

    n_fft : int > 0 [scalar] or None
        number of FFT bins to use, if ``y`` is provided.

    hop_length : int > 0 [scalar] or None
        number of samples to hop

    threshold : float in `(0, 1)`
        A bin in spectrum ``S`` is considered a pitch when it is greater than
        ``threshold * ref(S)``.

        By default, ``ref(S)`` is taken to be ``np.max(S, axis=-2)`` (the maximum
        value along the frequency axis).

        The threshold is only applied when ``ref`` is callable (or ``None``);
        a scalar ``ref`` is used as the reference value directly.

    fmin : float > 0 [scalar]
        lower frequency cutoff.

    fmax : float > 0 [scalar]
        upper frequency cutoff.

    win_length : int <= n_fft [scalar]
        Each frame of audio is windowed by ``window``.
        The window will be of length `win_length` and then padded
        with zeros to match ``n_fft``.

        If unspecified, defaults to ``win_length = n_fft``.

    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        - a window specification (string, tuple, or number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.windows.hann`
        - a vector or array of length ``n_fft``

        .. see also:: `filters.get_window`

    center : boolean
        - If ``True``, the signal ``y`` is padded so that frame
          ``t`` is centered at ``y[t * hop_length]``.
        - If ``False``, then frame ``t`` begins at ``y[t * hop_length]``

    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, STFT uses zero-padding.

        See also: `np.pad`.

    ref : scalar or callable [default=np.max]
        If scalar, the reference value against which ``S`` is compared for determining
        pitches (its absolute value is used).

        If callable, the reference value is computed as
        ``threshold * ref(S, axis=-2)``.

    Returns
    -------
    pitches, magnitudes : np.ndarray [shape=(..., d, t)]
        Both arrays have the same shape as the magnitude spectrogram ``S``.

        ``pitches[..., f, t]`` contains instantaneous frequency at bin
        ``f``, time ``t``

        ``magnitudes[..., f, t]`` contains the corresponding magnitudes.

        Both ``pitches`` and ``magnitudes`` take value 0 at bins
        of non-maximal magnitude, and at bins whose FFT frequency lies
        outside ``[fmin, fmax)``.

    Notes
    -----
    This function caches at level 30.

    One of ``S`` or ``y`` must be provided.
    The spectrogram is obtained from ``_spectrogram``, which is given ``y``,
    ``S``, ``n_fft``, ``hop_length``, ``win_length``, ``window``, ``center``
    and ``pad_mode``.

    Examples
    --------
    Computing pitches from a waveform input

    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> pitches, magnitudes = librosa.piptrack(y=y, sr=sr)

    Or from a spectrogram input

    >>> S = np.abs(librosa.stft(y))
    >>> pitches, magnitudes = librosa.piptrack(S=S, sr=sr)

    Or with an alternate reference value for pitch detection, where
    values above the mean spectral energy in each frame are counted as pitches

    >>> pitches, magnitudes = librosa.piptrack(S=S, sr=sr, threshold=1,
    ...                                        ref=np.mean)

    """

    # Check that we received an audio time series or STFT
    S, n_fft = _spectrogram(
        y=y,
        S=S,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        center=center,
        pad_mode=pad_mode,
    )

    # Make sure we're dealing with magnitudes
    S = np.abs(S)

    # Truncate to feasible region: no negative frequencies, nothing above sr / 2
    fmin = np.maximum(fmin, 0)
    fmax = np.minimum(fmax, float(sr) / 2)

    fft_freqs = convert.fft_frequencies(sr=sr, n_fft=n_fft)

    # Do the parabolic interpolation everywhere,
    # then figure out where the peaks are
    # then restrict to the feasible range (fmin:fmax)
    # `avg` is the gradient of S along the frequency axis
    avg = np.gradient(S, axis=-2)
    shift = _parabolic_interpolation(S, axis=-2)
    # this will get us the interpolated peak value
    dskew = 0.5 * avg * shift

    # Pre-allocate output
    pitches = np.zeros_like(S)
    mags = np.zeros_like(S)

    # Clip to the viable frequency range (half-open: fmin <= f < fmax)
    freq_mask = (fmin <= fft_freqs) & (fft_freqs < fmax)
    freq_mask = util.expand_to(freq_mask, ndim=S.ndim, axes=-2)

    # Compute the column-wise local max of S after thresholding
    # Find the argmax coordinates
    if ref is None:
        ref = np.max

    if callable(ref):
        ref_value = threshold * ref(S, axis=-2)
        # Reinsert the frequency axis here, in case the callable doesn't
        # support keepdims=True
        ref_value = np.expand_dims(ref_value, -2)
    else:
        # A scalar reference is used as-is; `threshold` is not applied to it
        ref_value = np.abs(ref)

    # Store pitch and magnitude
    # Values not exceeding the reference are zeroed before the local-max search
    idx = np.nonzero(freq_mask & util.localmax(S * (S > ref_value), axis=-2))
    # Fractional bin index (bin + interpolated shift) converted to Hz
    pitches[idx] = (idx[-2] + shift[idx]) * float(sr) / n_fft
    mags[idx] = S[idx] + dskew[idx]

    return pitches, mags
364
+
365
+
366
def _cumulative_mean_normalized_difference(
    y_frames: np.ndarray,
    frame_length: int,
    win_length: int,
    min_period: int,
    max_period: int,
) -> np.ndarray:
    """Cumulative mean normalized difference function (equation 8 in [#]_)

    .. [#] De Cheveigné, Alain, and Hideki Kawahara.
        "YIN, a fundamental frequency estimator for speech and music."
        The Journal of the Acoustical Society of America 111.4 (2002): 1917-1930.

    Parameters
    ----------
    y_frames : np.ndarray [shape=(..., frame_length, n_frames)]
        framed audio time series. The frame axis is the second-to-last
        axis; any leading axes are carried through.
    frame_length : int > 0 [scalar]
        length of the frames in samples.
    win_length : int > 0 [scalar]
        length of the window for calculating autocorrelation in samples.
    min_period : int > 0 [scalar]
        minimum period.
    max_period : int > 0 [scalar]
        maximum period.

    Returns
    -------
    yin_frames : np.ndarray [shape=(..., max_period-min_period+1, n_frames)]
        Cumulative mean normalized difference function for each frame.
    """
    # Autocorrelation.
    # Computed in the frequency domain: the spectrum of each frame is
    # multiplied by the spectrum of its reversed leading window.
    a = np.fft.rfft(y_frames, frame_length, axis=-2)
    b = np.fft.rfft(y_frames[..., win_length:0:-1, :], frame_length, axis=-2)
    acf_frames = np.fft.irfft(a * b, frame_length, axis=-2)[..., win_length:, :]
    # Values smaller than 1e-6 in magnitude are set to exactly zero
    acf_frames[np.abs(acf_frames) < 1e-6] = 0

    # Energy terms.
    # Difference of cumulative sums win_length samples apart,
    # i.e. the energy of a sliding window of win_length samples.
    energy_frames = np.cumsum(y_frames**2, axis=-2)
    energy_frames = (
        energy_frames[..., win_length:, :] - energy_frames[..., :-win_length, :]
    )
    energy_frames[np.abs(energy_frames) < 1e-6] = 0

    # Difference function.
    yin_frames = energy_frames[..., :1, :] + energy_frames - 2 * acf_frames

    # Cumulative mean normalized difference function.
    yin_numerator = yin_frames[..., min_period : max_period + 1, :]
    # broadcast this shape to have leading ones
    tau_range = util.expand_to(
        np.arange(1, max_period + 1), ndim=yin_frames.ndim, axes=-2
    )

    # Running mean of the difference function over lags 1..max_period
    cumulative_mean = (
        np.cumsum(yin_frames[..., 1 : max_period + 1, :], axis=-2) / tau_range
    )
    # cumulative_mean starts at lag 1, hence the offset of one in the slice
    yin_denominator = cumulative_mean[..., min_period - 1 : max_period, :]
    # util.tiny(...) is added to the denominator before dividing
    # (presumably to guard against division by zero -- confirm in util)
    yin_frames: np.ndarray = yin_numerator / (
        yin_denominator + util.tiny(yin_denominator)
    )
    return yin_frames
428
+
429
+
430
@numba.stencil  # type: ignore
def _pi_stencil(x: np.ndarray) -> np.ndarray:
    """Stencil to compute local parabolic interpolation

    Indices are relative to the current element: ``x[0]`` is the element
    itself, ``x[-1]`` and ``x[1]`` are its neighbours.  The result is
    ``-b / a`` for the quantities ``a`` and ``b`` computed below, or 0 when
    ``|b| >= |a|``.
    """

    # a: second difference around the current element
    a = x[1] + x[-1] - 2 * x[0]
    # b: half the difference between the two neighbours
    b = (x[1] - x[-1]) / 2

    if np.abs(b) >= np.abs(a):
        # If this happens, we'll shift by more than 1 bin
        # Suppressing types because mypy has no idea about stencils
        return 0  # type: ignore

    return -b / a  # type: ignore
443
+
444
+
445
# Compiled for float32 and float64 1-d inputs, with an output of the same
# length as the input ("(n)->(n)").  Caching of the compiled code is disabled.
@numba.guvectorize(
    ["void(float32[:], float32[:])", "void(float64[:], float64[:])"],
    "(n)->(n)",
    cache=False,
    nopython=True,
)  # type: ignore
def _pi_wrapper(x: np.ndarray, y: np.ndarray) -> None:  # pragma: no cover
    """Vectorized wrapper for the parabolic interpolation stencil

    Applies ``_pi_stencil`` to ``x`` and writes the result into ``y`` in place.
    """
    y[:] = _pi_stencil(x)
454
+
455
+
456
def _parabolic_interpolation(x: np.ndarray, *, axis: int = -2) -> np.ndarray:
    """Compute piecewise parabolic interpolation shifts along one axis.

    Parameters
    ----------
    x : np.ndarray
        array to interpolate
    axis : int
        axis along which to interpolate

    Returns
    -------
    parabolic_shifts : np.ndarray [shape=x.shape]
        position of the parabola optima (relative to bin indices)

        Note: the shift at bin `n` is determined as 0 if the estimated
        optimum is outside the range `[n-1, n+1]`.  The first and last
        positions along ``axis`` are always 0.
    """
    # Result buffer, same shape as the input
    result = np.empty_like(x)

    # Views of input and output with the target axis in last position;
    # writing through the output view fills `result`.
    x_last = x.swapaxes(-1, axis)
    result_last = result.swapaxes(-1, axis)

    # Run the vectorized stencil along the last axis
    _pi_wrapper(x_last, result_last)

    # The stencil needs both neighbours, so the two edge positions
    # are set to zero explicitly.
    result_last[..., 0] = 0
    result_last[..., -1] = 0

    return result
489
+
490
+
491
def yin(
    y: np.ndarray,
    *,
    fmin: float,
    fmax: float,
    sr: float = 22050,
    frame_length: int = 2048,
    win_length: Optional[int] = None,
    hop_length: Optional[int] = None,
    trough_threshold: float = 0.1,
    center: bool = True,
    pad_mode: _PadMode = "constant",
) -> np.ndarray:
    """Fundamental frequency (F0) estimation using the YIN algorithm.

    YIN is an autocorrelation based method for fundamental frequency estimation [#]_.
    First, a normalized difference function is computed over short (overlapping) frames of audio.
    Next, the first minimum in the difference function below ``trough_threshold`` is selected as
    an estimate of the signal's period.
    Finally, the estimated period is refined using parabolic interpolation before converting
    into the corresponding frequency.

    .. [#] De Cheveigné, Alain, and Hideki Kawahara.
        "YIN, a fundamental frequency estimator for speech and music."
        The Journal of the Acoustical Society of America 111.4 (2002): 1917-1930.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time series. Multi-channel is supported.
    fmin : number > 0 [scalar]
        minimum frequency in Hertz.
        The recommended minimum is ``librosa.note_to_hz('C2')`` (~65 Hz)
        though lower values may be feasible.
    fmax : number > 0 [scalar]
        maximum frequency in Hertz.
        The recommended maximum is ``librosa.note_to_hz('C7')`` (~2093 Hz)
        though higher values may be feasible.
    sr : number > 0 [scalar]
        sampling rate of ``y`` in Hertz.
    frame_length : int > 0 [scalar]
        length of the frames in samples.
        By default, ``frame_length=2048`` corresponds to a time scale of about 93 ms at
        a sampling rate of 22050 Hz.
    win_length : None or int > 0 [scalar]
        length of the window for calculating autocorrelation in samples.
        If ``None``, defaults to ``frame_length // 2``.
        Must be strictly smaller than ``frame_length``.
    hop_length : None or int > 0 [scalar]
        number of audio samples between adjacent YIN predictions.
        If ``None``, defaults to ``frame_length // 4``.
    trough_threshold : number > 0 [scalar]
        absolute threshold for peak estimation.
    center : boolean
        If ``True``, the signal ``y`` is padded by ``frame_length // 2``
        samples on both sides so that frame ``t`` is centered at
        ``y[t * hop_length]``.
        If ``False``, then frame ``t`` begins at ``y[t * hop_length]``.
    pad_mode : string or function
        If ``center=True``, this argument is passed to ``np.pad`` for padding
        the edges of the signal ``y``. By default (``pad_mode="constant"``),
        ``y`` is padded on both sides with zeros.
        If ``center=False``, this argument is ignored.

        .. see also:: `np.pad`

    Returns
    -------
    f0: np.ndarray [shape=(..., n_frames)]
        time series of fundamental frequencies in Hertz.

        If multi-channel input is provided, f0 curves are estimated separately for each channel.

    Raises
    ------
    ParameterError
        If ``fmin`` or ``fmax`` is ``None``, or if
        ``win_length >= frame_length``.

    See Also
    --------
    librosa.pyin :
        Fundamental frequency (F0) estimation using probabilistic YIN (pYIN).

    Examples
    --------
    Computing a fundamental frequency (F0) curve from an audio input

    >>> y = librosa.chirp(fmin=440, fmax=880, duration=5.0)
    >>> librosa.yin(y, fmin=440, fmax=880)
    array([442.66354675, 441.95299983, 441.58010963, ...,
           871.161732  , 873.99001454, 877.04297681])
    """

    if fmin is None or fmax is None:
        raise ParameterError('both "fmin" and "fmax" must be provided')

    # Set the default window length if it is not already specified.
    if win_length is None:
        win_length = frame_length // 2

    # NOTE: equality is rejected as well, although the message says "exceed"
    if win_length >= frame_length:
        raise ParameterError(
            f"win_length={win_length} cannot exceed given frame_length={frame_length}"
        )

    # Set the default hop if it is not already specified.
    if hop_length is None:
        hop_length = frame_length // 4

    # Check that audio is valid.
    util.valid_audio(y, mono=False)

    # Pad the time series so that frames are centered
    if center:
        # Only the last (time) axis is padded
        padding = [(0, 0)] * y.ndim
        padding[-1] = (frame_length // 2, frame_length // 2)
        y = np.pad(y, padding, mode=pad_mode)

    # Frame audio.
    y_frames = util.frame(y, frame_length=frame_length, hop_length=hop_length)

    # Calculate minimum and maximum periods
    # (in samples; the maximum is capped at frame_length - win_length - 1)
    min_period = max(int(np.floor(sr / fmax)), 1)
    max_period = min(int(np.ceil(sr / fmin)), frame_length - win_length - 1)

    # Calculate cumulative mean normalized difference function.
    yin_frames = _cumulative_mean_normalized_difference(
        y_frames, frame_length, win_length, min_period, max_period
    )

    # Parabolic interpolation.
    parabolic_shifts = _parabolic_interpolation(yin_frames)

    # Find local minima.
    is_trough = util.localmin(yin_frames, axis=-2)
    # The first lag is flagged explicitly by comparing it to the second lag
    is_trough[..., 0, :] = yin_frames[..., 0, :] < yin_frames[..., 1, :]

    # Find minima below peak threshold.
    is_threshold_trough = np.logical_and(is_trough, yin_frames < trough_threshold)

    # Absolute threshold.
    # "The solution we propose is to set an absolute threshold and choose the
    # smallest value of tau that gives a minimum of d' deeper than
    # this threshold. If none is found, the global minimum is chosen instead."
    # Shape of the index arrays: like yin_frames, with a singleton lag axis
    target_shape = list(yin_frames.shape)
    target_shape[-2] = 1

    global_min = np.argmin(yin_frames, axis=-2)
    # argmax over booleans yields the index of the first True along the axis
    yin_period = np.argmax(is_threshold_trough, axis=-2)

    global_min = global_min.reshape(target_shape)
    yin_period = yin_period.reshape(target_shape)

    # Frames without any trough below the threshold fall back to the global minimum
    no_trough_below_threshold = np.all(~is_threshold_trough, axis=-2, keepdims=True)
    yin_period[no_trough_below_threshold] = global_min[no_trough_below_threshold]

    # Refine peak by parabolic interpolation.
    # Indices are relative to min_period, so it is added back here;
    # the singleton lag axis is dropped afterwards.
    yin_period = (
        min_period
        + yin_period
        + np.take_along_axis(parabolic_shifts, yin_period, axis=-2)
    )[..., 0, :]

    # Convert period to fundamental frequency.
    f0: np.ndarray = sr / yin_period
    return f0
652
+
653
+
654
+ def pyin(
655
+ y: np.ndarray,
656
+ *,
657
+ fmin: float,
658
+ fmax: float,
659
+ sr: float = 22050,
660
+ frame_length: int = 2048,
661
+ win_length: Optional[int] = None,
662
+ hop_length: Optional[int] = None,
663
+ n_thresholds: int = 100,
664
+ beta_parameters: Tuple[float, float] = (2, 18),
665
+ boltzmann_parameter: float = 2,
666
+ resolution: float = 0.1,
667
+ max_transition_rate: float = 35.92,
668
+ switch_prob: float = 0.01,
669
+ no_trough_prob: float = 0.01,
670
+ fill_na: Optional[float] = np.nan,
671
+ center: bool = True,
672
+ pad_mode: _PadMode = "constant",
673
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
674
+ """Fundamental frequency (F0) estimation using probabilistic YIN (pYIN).
675
+
676
+ pYIN [#]_ is a modification of the YIN algorithm [#]_ for fundamental frequency (F0) estimation.
677
+ In the first step of pYIN, F0 candidates and their probabilities are computed using the YIN algorithm.
678
+ In the second step, Viterbi decoding is used to estimate the most likely F0 sequence and voicing flags.
679
+
680
+ .. [#] Mauch, Matthias, and Simon Dixon.
681
+ "pYIN: A fundamental frequency estimator using probabilistic threshold distributions."
682
+ 2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2014.
683
+
684
+ .. [#] De Cheveigné, Alain, and Hideki Kawahara.
685
+ "YIN, a fundamental frequency estimator for speech and music."
686
+ The Journal of the Acoustical Society of America 111.4 (2002): 1917-1930.
687
+
688
+ Parameters
689
+ ----------
690
+ y : np.ndarray [shape=(..., n)]
691
+ audio time series. Multi-channel is supported.
692
+ fmin : number > 0 [scalar]
693
+ minimum frequency in Hertz.
694
+ The recommended minimum is ``librosa.note_to_hz('C2')`` (~65 Hz)
695
+ though lower values may be feasible.
696
+ fmax : number > 0 [scalar]
697
+ maximum frequency in Hertz.
698
+ The recommended maximum is ``librosa.note_to_hz('C7')`` (~2093 Hz)
699
+ though higher values may be feasible.
700
+ sr : number > 0 [scalar]
701
+ sampling rate of ``y`` in Hertz.
702
+ frame_length : int > 0 [scalar]
703
+ length of the frames in samples.
704
+ By default, ``frame_length=2048`` corresponds to a time scale of about 93 ms at
705
+ a sampling rate of 22050 Hz.
706
+ win_length : None or int > 0 [scalar]
707
+ length of the window for calculating autocorrelation in samples.
708
+ If ``None``, defaults to ``frame_length // 2``
709
+ hop_length : None or int > 0 [scalar]
710
+ number of audio samples between adjacent pYIN predictions.
711
+ If ``None``, defaults to ``frame_length // 4``.
712
+ n_thresholds : int > 0 [scalar]
713
+ number of thresholds for peak estimation.
714
+ beta_parameters : tuple
715
+ shape parameters for the beta distribution prior over thresholds.
716
+ boltzmann_parameter : number > 0 [scalar]
717
+ shape parameter for the Boltzmann distribution prior over troughs.
718
+ Larger values will assign more mass to smaller periods.
719
+ resolution : float in `(0, 1)`
720
+ Resolution of the pitch bins.
721
+ 0.01 corresponds to cents.
722
+ max_transition_rate : float > 0
723
+ maximum pitch transition rate in octaves per second.
724
+ switch_prob : float in ``(0, 1)``
725
+ probability of switching from voiced to unvoiced or vice versa.
726
+ no_trough_prob : float in ``(0, 1)``
727
+ maximum probability to add to global minimum if no trough is below threshold.
728
+ fill_na : None, float, or ``np.nan``
729
+ default value for unvoiced frames of ``f0``.
730
+ If ``None``, the unvoiced frames will contain a best guess value.
731
+ center : boolean
732
+ If ``True``, the signal ``y`` is padded so that frame
733
+ ``D[:, t]`` is centered at ``y[t * hop_length]``.
734
+ If ``False``, then ``D[:, t]`` begins at ``y[t * hop_length]``.
735
+ Defaults to ``True``, which simplifies the alignment of ``D`` onto a
736
+ time grid by means of ``librosa.core.frames_to_samples``.
737
+ pad_mode : string or function
738
+ If ``center=True``, this argument is passed to ``np.pad`` for padding
739
+ the edges of the signal ``y``. By default (``pad_mode="constant"``),
740
+ ``y`` is padded on both sides with zeros.
741
+ If ``center=False``, this argument is ignored.
742
+ .. see also:: `np.pad`
743
+
744
+ Returns
745
+ -------
746
+ f0: np.ndarray [shape=(..., n_frames)]
747
+ time series of fundamental frequencies in Hertz.
748
+ voiced_flag: np.ndarray [shape=(..., n_frames)]
749
+ time series containing boolean flags indicating whether a frame is voiced or not.
750
+ voiced_prob: np.ndarray [shape=(..., n_frames)]
751
+ time series containing the probability that a frame is voiced.
752
+ .. note:: If multi-channel input is provided, f0 and voicing are estimated separately for each channel.
753
+
754
+ See Also
755
+ --------
756
+ librosa.yin :
757
+ Fundamental frequency (F0) estimation using the YIN algorithm.
758
+
759
+ Examples
760
+ --------
761
+ Computing a fundamental frequency (F0) curve from an audio input
762
+
763
+ >>> y, sr = librosa.load(librosa.ex('trumpet'))
764
+ >>> f0, voiced_flag, voiced_probs = librosa.pyin(y,
765
+ ... fmin=librosa.note_to_hz('C2'),
766
+ ... fmax=librosa.note_to_hz('C7'))
767
+ >>> times = librosa.times_like(f0)
768
+
769
+ Overlay F0 over a spectrogram
770
+
771
+ >>> import matplotlib.pyplot as plt
772
+ >>> D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
773
+ >>> fig, ax = plt.subplots()
774
+ >>> img = librosa.display.specshow(D, x_axis='time', y_axis='log', ax=ax)
775
+ >>> ax.set(title='pYIN fundamental frequency estimation')
776
+ >>> fig.colorbar(img, ax=ax, format="%+2.f dB")
777
+ >>> ax.plot(times, f0, label='f0', color='cyan', linewidth=3)
778
+ >>> ax.legend(loc='upper right')
779
+ """
780
+
781
+ if fmin is None or fmax is None:
782
+ raise ParameterError('both "fmin" and "fmax" must be provided')
783
+
784
+ # Set the default window length if it is not already specified.
785
+ if win_length is None:
786
+ win_length = frame_length // 2
787
+
788
+ if win_length >= frame_length:
789
+ raise ParameterError(
790
+ f"win_length={win_length} cannot exceed given frame_length={frame_length}"
791
+ )
792
+
793
+ # Set the default hop if it is not already specified.
794
+ if hop_length is None:
795
+ hop_length = frame_length // 4
796
+
797
+ # Check that audio is valid.
798
+ util.valid_audio(y, mono=False)
799
+
800
+ # Pad the time series so that frames are centered
801
+ if center:
802
+ padding = [(0, 0) for _ in y.shape]
803
+ padding[-1] = (frame_length // 2, frame_length // 2)
804
+ y = np.pad(y, padding, mode=pad_mode)
805
+
806
+ # Frame audio.
807
+ y_frames = util.frame(y, frame_length=frame_length, hop_length=hop_length)
808
+
809
+ # Calculate minimum and maximum periods
810
+ min_period = max(int(np.floor(sr / fmax)), 1)
811
+ max_period = min(int(np.ceil(sr / fmin)), frame_length - win_length - 1)
812
+
813
+ # Calculate cumulative mean normalized difference function.
814
+ yin_frames = _cumulative_mean_normalized_difference(
815
+ y_frames, frame_length, win_length, min_period, max_period
816
+ )
817
+
818
+ # Parabolic interpolation.
819
+ parabolic_shifts = _parabolic_interpolation(yin_frames)
820
+
821
+ # Find Yin candidates and probabilities.
822
+ # The implementation here follows the official pYIN software which
823
+ # differs from the method described in the paper.
824
+ # 1. Define the prior over the thresholds.
825
+ thresholds = np.linspace(0, 1, n_thresholds + 1)
826
+ beta_cdf = scipy.stats.beta.cdf(thresholds, beta_parameters[0], beta_parameters[1])
827
+ beta_probs = np.diff(beta_cdf)
828
+
829
+ n_bins_per_semitone = int(np.ceil(1.0 / resolution))
830
+ n_pitch_bins = int(np.floor(12 * n_bins_per_semitone * np.log2(fmax / fmin))) + 1
831
+
832
+ def _helper(a, b):
833
+ return __pyin_helper(
834
+ a,
835
+ b,
836
+ sr,
837
+ thresholds,
838
+ boltzmann_parameter,
839
+ beta_probs,
840
+ no_trough_prob,
841
+ min_period,
842
+ fmin,
843
+ n_pitch_bins,
844
+ n_bins_per_semitone,
845
+ )
846
+
847
+ helper = np.vectorize(_helper, signature="(f,t),(k,t)->(1,d,t),(j,t)")
848
+ observation_probs, voiced_prob = helper(yin_frames, parabolic_shifts)
849
+
850
+ # Construct transition matrix.
851
+ max_semitones_per_frame = round(max_transition_rate * 12 * hop_length / sr)
852
+ transition_width = max_semitones_per_frame * n_bins_per_semitone + 1
853
+ # Construct the within voicing transition probabilities
854
+ transition = sequence.transition_local(
855
+ n_pitch_bins, transition_width, window="triangle", wrap=False
856
+ )
857
+
858
+ # Include across voicing transition probabilities
859
+ t_switch = sequence.transition_loop(2, 1 - switch_prob)
860
+ transition = np.kron(t_switch, transition)
861
+
862
+ p_init = np.zeros(2 * n_pitch_bins)
863
+ p_init[n_pitch_bins:] = 1 / n_pitch_bins
864
+
865
+ states = sequence.viterbi(observation_probs, transition, p_init=p_init)
866
+
867
+ # Find f0 corresponding to each decoded pitch bin.
868
+ freqs = fmin * 2 ** (np.arange(n_pitch_bins) / (12 * n_bins_per_semitone))
869
+ f0 = freqs[states % n_pitch_bins]
870
+ voiced_flag = states < n_pitch_bins
871
+
872
+ if fill_na is not None:
873
+ f0[~voiced_flag] = fill_na
874
+
875
+ return f0[..., 0, :], voiced_flag[..., 0, :], voiced_prob[..., 0, :]
876
+
877
+
878
def __pyin_helper(
    yin_frames,
    parabolic_shifts,
    sr,
    thresholds,
    boltzmann_parameter,
    beta_probs,
    no_trough_prob,
    min_period,
    fmin,
    n_pitch_bins,
    n_bins_per_semitone,
):
    """Turn one 2-D block of YIN frames into pYIN observation probabilities.

    Parameters
    ----------
    yin_frames : np.ndarray
        2-D array indexed as ``[lag, frame]``; each column is processed as
        one frame.
    parabolic_shifts : np.ndarray
        Array indexed with the same ``[lag, frame]`` indices as
        ``yin_frames``; the selected entries are added to the integer
        period of each candidate.
    sr : number
        Value divided by the refined period to obtain a candidate frequency.
    thresholds : np.ndarray
        1-D array of thresholds; only ``thresholds[1:]`` is compared against
        the trough heights.  Presumably ascending (the ``no_trough_prob``
        term below sums a *prefix* of ``beta_probs``) -- confirm with caller.
    boltzmann_parameter : number
        Shape parameter handed to ``scipy.stats.boltzmann.pmf``.
    beta_probs : np.ndarray
        1-D weights, one per entry of ``thresholds[1:]``.
    no_trough_prob : number
        Scale of the extra mass given to the lowest trough of a frame for
        the thresholds that trough does not fall below.
    min_period : int
        Offset added to a row index of ``yin_frames`` to obtain a period.
    fmin : number
        Frequency that maps to pitch bin 0.
    n_pitch_bins : int
        Number of voiced pitch bins (the output has twice as many rows).
    n_bins_per_semitone : int
        Pitch-bin resolution used when converting frequency to bin index.

    Returns
    -------
    observation_probs : np.ndarray
        Shape ``(1, 2 * n_pitch_bins, n_frames)``.  Rows ``[:n_pitch_bins]``
        hold the candidate probabilities, rows ``[n_pitch_bins:]`` are all
        set to ``(1 - voiced_prob) / n_pitch_bins``.
    voiced_prob : np.ndarray
        Shape ``(1, n_frames)``; per-frame sum of the first ``n_pitch_bins``
        rows, clipped to ``[0, 1]``.
    """
    candidate_probs = np.zeros_like(yin_frames)
    # The first threshold is never compared against; slice it off once.
    upper_thresholds = thresholds[1:]

    for frame_idx, frame in enumerate(yin_frames.T):
        # Step 2: locate the troughs (local minima) of this frame.
        trough_mask = util.localmin(frame)
        # The first lag is handled explicitly: it is a trough whenever it is
        # strictly smaller than its only neighbour.
        trough_mask[0] = frame[0] < frame[1]
        (trough_rows,) = np.nonzero(trough_mask)

        if trough_rows.size == 0:
            # No candidates: this column of candidate_probs stays all-zero.
            continue

        # Step 3: below[j, k] is True when trough j lies under threshold k.
        heights = frame[trough_rows]
        below = np.less.outer(heights, upper_thresholds)

        # Step 4: prior over troughs, favouring smaller periods.  For each
        # threshold, a qualifying trough's rank is its running count along
        # the trough axis (zero-based).
        ranks = np.cumsum(below, axis=0) - 1
        n_below = np.count_nonzero(below, axis=0)
        prior = scipy.stats.boltzmann.pmf(ranks, boltzmann_parameter, n_below)
        # Troughs that are not under a threshold get no mass for it.
        prior[~below] = 0

        # Step 5: weight the per-threshold priors by the threshold
        # distribution, then give the lowest trough additional mass for the
        # thresholds it does not fall below.
        frame_probs = prior.dot(beta_probs)

        lowest = np.argmin(heights)
        n_not_below = np.count_nonzero(~below[lowest, :])
        frame_probs[lowest] += no_trough_prob * np.sum(beta_probs[:n_not_below])

        candidate_probs[trough_rows, frame_idx] = frame_probs

    # Every non-zero entry is an f0 candidate at (lag row, frame column).
    lag_rows, frame_cols = np.nonzero(candidate_probs)

    # Refine the integer period with the parabolic shift and convert to Hz.
    refined_periods = min_period + lag_rows + parabolic_shifts[lag_rows, frame_cols]
    f0_candidates = sr / refined_periods

    # Map each candidate frequency to the nearest pitch bin.
    # NOTE(review): the upper clip bound is n_pitch_bins, not
    # n_pitch_bins - 1, so a candidate rounding above the top bin is written
    # to row n_pitch_bins and then overwritten by the unvoiced fill below.
    # Kept as-is to preserve behaviour -- confirm this is intended.
    semitone_scale = 12 * n_bins_per_semitone
    pitch_bins = semitone_scale * np.log2(f0_candidates / fmin)
    pitch_bins = np.clip(np.round(pitch_bins), 0, n_pitch_bins).astype(int)

    # Scatter candidate probabilities into the (state, frame) matrix.
    n_frames = yin_frames.shape[1]
    observation_probs = np.zeros((2 * n_pitch_bins, n_frames))
    observation_probs[pitch_bins, frame_cols] = candidate_probs[lag_rows, frame_cols]

    # Voicing probability is the total voiced mass per frame; the remainder
    # is spread uniformly over the second half of the rows.
    voiced_mass = np.sum(observation_probs[:n_pitch_bins, :], axis=0, keepdims=True)
    voiced_prob = np.clip(voiced_mass, 0, 1)
    observation_probs[n_pitch_bins:, :] = (1 - voiced_prob) / n_pitch_bins

    return observation_probs[np.newaxis], voiced_prob
utils/utils.py CHANGED
@@ -1,5 +1,3 @@
1
-
2
-
3
  #!/usr/bin/env python
4
  # -*- coding: utf-8 -*-
5
  """Utility functions"""
@@ -1071,7 +1069,7 @@ def _localmin_sten(x): # pragma: no cover
1071
  "void(float64[:], bool_[:])",
1072
  ],
1073
  "(n)->(n)",
1074
- cache=True,
1075
  nopython=True,
1076
  )
1077
  def _localmax(x, y): # pragma: no cover
@@ -1088,7 +1086,7 @@ def _localmax(x, y): # pragma: no cover
1088
  "void(float64[:], bool_[:])",
1089
  ],
1090
  "(n)->(n)",
1091
- cache=True,
1092
  nopython=True,
1093
  )
1094
  def _localmin(x, y): # pragma: no cover
@@ -2472,7 +2470,7 @@ def is_unique(data: np.ndarray, *, axis: int = -1) -> np.ndarray:
2472
 
2473
 
2474
  @numba.vectorize(
2475
- ["float32(complex64)", "float64(complex128)"], nopython=True, cache=True, identity=0
2476
  ) # type: ignore
2477
  def _cabs2(x: _ComplexLike_co) -> _FloatLike_co: # pragma: no cover
2478
  """Helper function for efficiently computing abs2 on complex inputs"""
 
 
 
1
  #!/usr/bin/env python
2
  # -*- coding: utf-8 -*-
3
  """Utility functions"""
 
1069
  "void(float64[:], bool_[:])",
1070
  ],
1071
  "(n)->(n)",
1072
+ cache=False,
1073
  nopython=True,
1074
  )
1075
  def _localmax(x, y): # pragma: no cover
 
1086
  "void(float64[:], bool_[:])",
1087
  ],
1088
  "(n)->(n)",
1089
+ cache=False,
1090
  nopython=True,
1091
  )
1092
  def _localmin(x, y): # pragma: no cover
 
2470
 
2471
 
2472
  @numba.vectorize(
2473
+ ["float32(complex64)", "float64(complex128)"], nopython=True, cache=False, identity=0
2474
  ) # type: ignore
2475
  def _cabs2(x: _ComplexLike_co) -> _FloatLike_co: # pragma: no cover
2476
  """Helper function for efficiently computing abs2 on complex inputs"""