Spaces:
Running
on
Zero
Running
on
Zero
# raccoonML audio tools. | |
# MIT License | |
# Copyright (c) 2021 raccoonML (https://patreon.com/raccoonML) | |
# | |
# Permission is hereby granted, free of charge, to any person obtaining a copy | |
# of this software and associated documentation files (the "Software") to deal | |
# in the Software without restriction, including without limitation the rights | |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
# copies of the Software, and to permit persons to whom the Software is | |
# furnished to do so, subject to the following conditions: | |
# | |
# The above copyright notice and this permission notice shall be included in | |
# all copies or substantial portions of the Software. | |
# | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR ANY OTHER | |
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
# THE SOFTWARE. | |
import librosa | |
import numpy as np | |
import soundfile as sf | |
import torch | |
from scipy import signal | |
# Module-level cache for the mel filterbank matrix. It starts as None and is
# filled on first use by melspectrogram() / inv_mel_spectrogram() through
# _build_mel_basis(). Those functions only test "is None", so the matrix is
# built once from the hparams of the first call and reused afterwards.
_mel_basis = None
def load_wav(path, sr):
    """Load an audio file and return its waveform samples.

    ``path`` is converted with ``str()`` and handed to ``librosa.load``
    together with ``sr``. The second value returned by ``librosa.load``
    is discarded; only the waveform is returned.
    """
    samples, _unused_rate = librosa.load(str(path), sr=sr)
    return samples
def save_wav(wav, path, sr):
    """Write waveform data ``wav`` to the audio file ``path`` at rate ``sr``.

    Thin wrapper around ``soundfile.write``; returns nothing.
    """
    sf.write(file=path, data=wav, samplerate=sr)
def melspectrogram(wav, hparams):
    """Convert a waveform into a log-compressed mel-scale spectrogram.

    Steps, in order:
      1. optional preemphasis (when ``hparams.preemphasize`` is truthy),
      2. STFT magnitude using ``hparams.n_fft`` / ``hop_size`` / ``win_size``,
      3. projection onto the cached mel filterbank,
      4. natural log of the result, floored at 1e-5.

    Returns a float32 array; per the original notes its shape is
    (num_mels, frames).
    """
    global _mel_basis

    audio = wav
    if hparams.preemphasize:
        audio = preemphasis(audio, hparams.preemphasis)

    # Magnitude of the complex short-time Fourier transform.
    magnitudes = np.abs(
        librosa.stft(
            y=audio,
            n_fft=hparams.n_fft,
            hop_length=hparams.hop_size,
            win_length=hparams.win_size,
        )
    )

    # The filterbank is built lazily and kept at module level so that
    # computing thousands of spectrograms does not rebuild it every time.
    if _mel_basis is None:
        _mel_basis = _build_mel_basis(hparams)

    mel = np.dot(_mel_basis, magnitudes)

    # Dynamic range compression; the floor keeps log() away from zero.
    log_mel = np.log(np.clip(mel, a_min=1e-5, a_max=None))
    return log_mel.astype(np.float32)
def inv_mel_spectrogram(S, hparams):
    """Reconstruct a waveform from a log-mel spectrogram using Griffin-Lim.

    ``S`` is expected in the form produced by ``melspectrogram`` (per the
    original notes, shape (num_mels, frames)). The log compression is undone
    with ``exp``, the mel projection is approximately inverted, the result
    is raised to ``hparams.power`` and passed to Griffin-Lim, and finally
    preemphasis is undone when ``hparams.preemphasize`` is truthy.
    """
    global _mel_basis

    # Undo the natural-log compression.
    mel_amplitudes = np.exp(S)

    # Lazily build the shared filterbank (cached at module level).
    if _mel_basis is None:
        _mel_basis = _build_mel_basis(hparams)

    # Approximate inverse of the filterbank: transpose scaled by the
    # reciprocal column sums of (basis @ basis.T). Sums that are
    # (near) zero are left as they are instead of being inverted.
    gram = _mel_basis @ _mel_basis.T
    scale = [
        (1.0 / total) if np.abs(total) > 1.0e-8 else total
        for total in gram.sum(axis=0)
    ]
    inverse_basis = _mel_basis.T @ np.diag(scale)

    # Back to a linear-frequency spectrogram, then to audio.
    linear = np.dot(inverse_basis, mel_amplitudes)
    audio = _griffin_lim(linear ** hparams.power, hparams)

    if hparams.preemphasize:
        audio = inv_preemphasis(audio, hparams.preemphasis)
    return audio
def preemphasis(wav, k, preemphasize=True):
    """Apply a first-order preemphasis filter: y[n] = x[n] - k * x[n-1].

    This boosts high-frequency content. When ``preemphasize`` is falsy the
    input is returned untouched.
    """
    if not preemphasize:
        return wav
    return signal.lfilter([1, -k], [1], wav)
def inv_preemphasis(wav, k, inv_preemphasize=True):
    """Undo the preemphasis filter: y[n] = x[n] + k * y[n-1].

    When ``inv_preemphasize`` is falsy the input is returned untouched.
    """
    if not inv_preemphasize:
        return wav
    return signal.lfilter([1], [1, -k], wav)
def _build_mel_basis(hparams):
    """Create the mel filterbank matrix described by ``hparams``.

    Reads ``sample_rate``, ``n_fft``, ``num_mels``, ``fmin`` and ``fmax``
    from ``hparams`` and forwards them to ``librosa.filters.mel``.
    """
    filter_args = {
        "sr": hparams.sample_rate,
        "n_fft": hparams.n_fft,
        "n_mels": hparams.num_mels,
        "fmin": hparams.fmin,
        "fmax": hparams.fmax,
    }
    return librosa.filters.mel(**filter_args)
def _griffin_lim(S, hparams):
    """Estimate a waveform from a magnitude spectrogram with Griffin-Lim.

    Starts from uniformly random phase, then for
    ``hparams.griffin_lim_iters`` iterations re-estimates the phase from the
    STFT of the current waveform while keeping the magnitudes of ``S`` fixed.

    Args:
        S: Linear-frequency spectrogram; only its absolute value is used.
        hparams: Object providing ``n_fft``, ``hop_size``, ``win_size`` and
            ``griffin_lim_iters``.

    Returns:
        The waveform returned by the final ``librosa.istft`` call.
    """
    stft_args = {
        "n_fft": hparams.n_fft,
        "hop_length": hparams.hop_size,
        "win_length": hparams.win_size,
    }
    istft_args = {
        "hop_length": hparams.hop_size,
        "win_length": hparams.win_size,
    }

    # Random initial phase on the unit circle.
    angles = np.exp(2j * np.pi * np.random.rand(*S.shape))

    # ``np.complex`` was only an alias of the builtin ``complex`` and was
    # removed in NumPy 1.24; ``np.complex128`` is the same dtype and exists
    # in every NumPy version.
    magnitudes = np.abs(S).astype(np.complex128)

    wav = librosa.istft(magnitudes * angles, **istft_args)
    for _ in range(hparams.griffin_lim_iters):
        # Pass the signal as ``y=``: librosa >= 0.10 made it keyword-only,
        # and older versions accept the keyword as well.
        spectrum = librosa.stft(y=wav, **stft_args)
        angles = np.exp(1j * np.angle(spectrum))
        wav = librosa.istft(magnitudes * angles, **istft_args)
    return wav