# camenduru's picture
# thanks to RVC-Project ❤
# ef99749
import scipy.signal
import numpy as np
import librosa
from noisereduce.plotting import plot_reduction_steps
from tqdm.autonotebook import tqdm
import warnings
import copy
def _stft(y, n_fft, hop_length, win_length, use_tensorflow=False):
    """Short-time Fourier transform of ``y`` using the selected backend.

    Arguments:
        y {[type]} -- waveform to transform
        n_fft {[type]} -- FFT size handed to the backend
        hop_length {[type]} -- hop between frames handed to the backend
        win_length {[type]} -- window length handed to the backend
    Keyword Arguments:
        use_tensorflow {bool} -- use ``_stft_tensorflow`` instead of librosa (default: {False})
    Returns:
        the STFT produced by the selected backend
    """
    if not use_tensorflow:
        librosa_options = {
            "n_fft": n_fft,
            "hop_length": hop_length,
            "win_length": win_length,
            "center": True,
        }
        return librosa.stft(y=y, **librosa_options)
    return _stft_tensorflow(y, n_fft, hop_length, win_length)
def _istft(y, n_fft, hop_length, win_length, use_tensorflow=False):
    """Inverse short-time Fourier transform of ``y`` using the selected backend.

    Arguments:
        y {[type]} -- STFT matrix to invert
        n_fft {[type]} -- FFT size; only forwarded on the tensorflow path
        hop_length {[type]} -- hop between frames handed to the backend
        win_length {[type]} -- window length handed to the backend
    Keyword Arguments:
        use_tensorflow {bool} -- use ``_istft_tensorflow`` instead of librosa (default: {False})
    Returns:
        the waveform reconstructed by the selected backend
    """
    if use_tensorflow:
        # The tensorflow helper is given the transpose of the matrix that the
        # librosa path consumes directly.
        return _istft_tensorflow(y.T, n_fft, hop_length, win_length)
    # hop_length / win_length must be passed by name: they are keyword-only in
    # librosa >= 0.10, and the keyword form is also accepted by older releases.
    return librosa.istft(y, hop_length=hop_length, win_length=win_length)
def _stft_librosa(y, n_fft, hop_length, win_length):
    """Short-time Fourier transform of ``y`` computed with librosa (centered frames).

    Arguments:
        y {[type]} -- waveform to transform
        n_fft {[type]} -- FFT size handed to ``librosa.stft``
        hop_length {[type]} -- hop between frames handed to ``librosa.stft``
        win_length {[type]} -- window length handed to ``librosa.stft``
    Returns:
        the result of ``librosa.stft``
    """
    spectrum = librosa.stft(
        y=y,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        center=True,
    )
    return spectrum
def _istft_librosa(y, hop_length, win_length):
    """Inverse short-time Fourier transform of ``y`` computed with librosa.

    Arguments:
        y {[type]} -- STFT matrix to invert
        hop_length {[type]} -- hop between frames handed to ``librosa.istft``
        win_length {[type]} -- window length handed to ``librosa.istft``
    Returns:
        the result of ``librosa.istft``
    """
    # Keyword form is required by librosa >= 0.10 (keyword-only parameters) and
    # is also accepted by older releases.
    return librosa.istft(y, hop_length=hop_length, win_length=win_length)
def _stft_tensorflow(y, n_fft, hop_length, win_length):
    """Short-time Fourier transform of ``y`` computed with ``tf.signal.stft``.

    Relies on the module-level ``tf`` name that ``load_tensorflow`` installs.

    Arguments:
        y {[type]} -- waveform to transform
        n_fft {[type]} -- passed as the fourth positional argument of ``tf.signal.stft``
        hop_length {[type]} -- passed as the third positional argument
        win_length {[type]} -- passed as the second positional argument
    Returns:
        the transposed numpy array of the tensorflow result
    """
    frames = tf.signal.stft(
        y,
        win_length,
        hop_length,
        n_fft,
        pad_end=True,
        window_fn=tf.signal.hann_window,
    )
    # convert to numpy, then transpose the result
    as_numpy = frames.numpy()
    return as_numpy.T
def _istft_tensorflow(y, n_fft, hop_length, win_length):
    """Inverse short-time Fourier transform computed with ``tf.signal.inverse_stft``.

    Relies on the module-level ``tf`` name that ``load_tensorflow`` installs.

    Arguments:
        y {[type]} -- STFT matrix; cast to complex64 before inversion
        n_fft {[type]} -- passed as the fourth positional argument
        hop_length {[type]} -- passed as the third positional argument
        win_length {[type]} -- passed as the second positional argument
    Returns:
        numpy array holding the tensorflow result
    """
    spectrum = y.astype(np.complex64)
    waveform = tf.signal.inverse_stft(spectrum, win_length, hop_length, n_fft)
    return waveform.numpy()
def _amp_to_db(x):
    """Convert amplitudes to decibels via ``librosa.core.amplitude_to_db``.

    Uses ref=1.0, amin=1e-20 and top_db=80.0.
    """
    db_options = {"ref": 1.0, "amin": 1e-20, "top_db": 80.0}
    return librosa.core.amplitude_to_db(x, **db_options)
def _db_to_amp(x):
    """Convert decibels to amplitudes via ``librosa.core.db_to_amplitude`` (ref=1.0)."""
    amplitude = librosa.core.db_to_amplitude(x, ref=1.0)
    return amplitude
def update_pbar(pbar, message):
    """Advance a progress bar by one step and relabel it.

    Arguments:
        pbar {[type]} -- object exposing ``set_description`` and ``update``, or None
        message {[type]} -- description to show on the bar
    Does nothing when ``pbar`` is None.
    """
    if pbar is None:
        return
    pbar.set_description(message)
    pbar.update(1)
def _smoothing_filter(n_grad_freq, n_grad_time):
    """Build the normalised 2-D kernel used to smooth the spectrogram mask

    The kernel is the outer product of two triangular ramps (one per axis),
    divided by its sum so that its entries add up to 1.

    Arguments:
        n_grad_freq {[type]} -- [how many frequency channels to smooth over with the mask.]
        n_grad_time {[type]} -- [how many time channels to smooth over with the mask.]
    Returns:
        array of shape (2 * n_grad_freq + 1, 2 * n_grad_time + 1)
    """

    def _triangle(half_width):
        # rising ramp followed by falling ramp; the zero at each end is trimmed
        rising = np.linspace(0, 1, half_width + 1, endpoint=False)
        falling = np.linspace(1, 0, half_width + 2)
        return np.concatenate([rising, falling])[1:-1]

    freq_profile = _triangle(n_grad_freq)
    time_profile = _triangle(n_grad_time)
    kernel = np.outer(freq_profile, time_profile)
    return kernel / np.sum(kernel)
def mask_signal(sig_stft, sig_mask):
    """ Attenuates a spectrogram by a mask: each bin is multiplied by (1 - mask)
    Arguments:
        sig_stft {[type]} -- spectrogram of signal
        sig_mask {[type]} -- mask to apply to signal (1 removes a bin, 0 keeps it unchanged)
    Returns:
        sig_stft_amp [type] -- masked signal
    """
    keep_fraction = 1 - sig_mask
    return sig_stft * keep_fraction
def convolve_gaussian(sig_mask, smoothing_filter, use_tensorflow=False):
    """ Convolves a smoothing filter with a mask (or any 2-D image)

    Without tensorflow this is ``scipy.signal.fftconvolve`` in "same" mode.
    With tensorflow the filter is first rescaled, both arrays get two trailing
    axes and are cast to float32, and ``tf.nn.conv2d`` with "SAME" padding is
    applied; this path needs the module-level ``tf`` set by ``load_tensorflow``.

    Arguments:
        sig_mask {[type]} -- The signal mask
        smoothing_filter {[type]} -- the filter to convolve
    Keyword Arguments:
        use_tensorflow {bool} -- use tensorflow.signal or scipy.signal (default: {False})
    Returns:
        the convolved mask as a numpy array
    """
    if not use_tensorflow:
        return scipy.signal.fftconvolve(sig_mask, smoothing_filter, mode="same")

    # rescale the filter by a factor derived from its second-axis length
    filter_columns = np.shape(smoothing_filter)[1]
    scaled_filter = smoothing_filter * ((filter_columns - 1) / 2 + 1)

    # conv2d operates on 4-D float tensors, so append two axes to each operand
    kernel_4d = scaled_filter[:, :, tf.newaxis, tf.newaxis].astype("float32")
    image_4d = sig_mask[:, :, tf.newaxis, tf.newaxis].astype("float32")

    convolved = tf.nn.conv2d(
        image_4d, kernel_4d, strides=[1, 1, 1, 1], padding="SAME"
    )
    return convolved.numpy().squeeze()
def load_tensorflow(verbose=False):
    """loads tensorflow if it is available
    Used as a backend for fft and convolution

    On a successful import the module is stored in this module's globals under
    the name ``tf`` so the tensorflow helper functions can use it.

    Keyword Arguments:
        verbose {bool} -- print the GPUs tensorflow can see (default: {False})
    Returns:
        bool -- whether to use tensorflow
    """
    try:
        # equivalent of `import tensorflow as tf`, without making tensorflow a
        # hard dependency of the module
        tf_module = __import__("tensorflow")
    except Exception:
        # Deliberately broad: a broken tensorflow install can fail with errors
        # other than ImportError, and this function is a best-effort probe.
        # Unlike a bare `except:`, KeyboardInterrupt/SystemExit still propagate.
        warnings.warn(
            "Tensorflow is not installed, reverting to non-tensorflow backend"
        )
        return False
    globals()["tf"] = tf_module

    try:
        # Parse the whole major component: reading only the first character
        # would misread a future "10.x" release as version 1.
        major_version = int(str(tf_module.__version__).split(".")[0])
    except (AttributeError, ValueError):
        warnings.warn(
            "Could not determine Tensorflow version, reverting to non-tensorflow backend"
        )
        return False
    if major_version < 2:
        warnings.warn(
            "Tensorflow version is below 2.0, reverting to non-tensorflow backend"
        )
        return False

    if verbose:
        try:
            available_gpus = tf_module.config.experimental.list_physical_devices(
                "GPU"
            )
        except Exception:
            # keep the historical outcome (fall back) if the device query fails
            warnings.warn(
                "Could not query Tensorflow devices, reverting to non-tensorflow backend"
            )
            return False
        print("GPUs available: {}".format(available_gpus))
    return True
def reduce_noise(
    audio_clip,
    noise_clip=None,
    n_grad_freq=2,
    n_grad_time=4,
    n_fft=2048,
    win_length=2048,
    hop_length=512,
    n_std_thresh=1.5,
    prop_decrease=1.0,
    pad_clipping=True,
    use_tensorflow=False,
    verbose=False,
):
    """Remove noise from audio based upon a clip containing only noise

    A per-frequency dB threshold is derived from the noise spectrogram
    (mean + n_std_thresh * std). Time/frequency bins of the signal that fall
    below that threshold are marked in a mask, the mask is smoothed and scaled
    by prop_decrease, and the signal's STFT is attenuated by it before being
    inverted back to a waveform.

    Args:
        audio_clip (array): Waveform of audio
        noise_clip (array): Waveform containing only noise. If None, the noise statistics are computed from audio_clip itself.
        n_grad_freq (int): how many frequency channels to smooth over with the mask.
        n_grad_time (int): how many time channels to smooth over with the mask.
        n_fft (int): FFT size passed to the STFT; windowed frames are zero-padded to this length.
        win_length (int): Each frame of audio is windowed by `window()`. The window will be of length `win_length` and then padded with zeros to match `n_fft`.
        hop_length (int): number of audio samples between STFT columns.
        n_std_thresh (float): how many standard deviations louder than the mean dB of the noise (at each frequency level) to be considered signal
        prop_decrease (float): To what extent should you decrease noise (1 = all, 0 = none)
        pad_clipping (bool): Pad the signals with zeros to ensure that the reconstructed data is equal length to the data
        use_tensorflow (bool): Use tensorflow as a backend for convolution and fft to speed up computation
        verbose (bool): Whether to show a progress bar and plot the steps of the algorithm
    Returns:
        array: The recovered signal with noise subtracted
    """
    # load tensorflow if you are using it as a backend
    if use_tensorflow:
        use_tensorflow = load_tensorflow(verbose)

    # one tick per update_pbar call below (six in total)
    pbar = tqdm(total=6) if verbose else None

    # STFT over signal
    update_pbar(pbar, "STFT on signal")
    # pad signal with zeros to avoid extra frames being clipped if desired
    if pad_clipping:
        nsamp = len(audio_clip)
        audio_clip = np.pad(audio_clip, [0, hop_length], mode="constant")
    sig_stft = _stft(
        audio_clip, n_fft, hop_length, win_length, use_tensorflow=use_tensorflow
    )
    # spectrogram of signal in dB
    sig_stft_db = _amp_to_db(np.abs(sig_stft))

    # STFT over noise
    update_pbar(pbar, "STFT on noise")
    if noise_clip is None:
        # no dedicated noise sample: use the signal's own spectrogram
        noise_stft_db = np.copy(sig_stft_db)
    else:
        noise_stft = _stft(
            noise_clip, n_fft, hop_length, win_length, use_tensorflow=use_tensorflow
        )
        noise_stft_db = _amp_to_db(np.abs(noise_stft))  # convert to dB

    # Calculate statistics over noise (one value per frequency row)
    mean_freq_noise = np.mean(noise_stft_db, axis=1)
    std_freq_noise = np.std(noise_stft_db, axis=1)
    noise_thresh = mean_freq_noise + std_freq_noise * n_std_thresh

    update_pbar(pbar, "Generate mask")
    # Broadcast the per-frequency threshold across every time column and mark
    # the bins whose level is BELOW it; marked bins are the ones attenuated.
    db_thresh = np.reshape(noise_thresh, (-1, 1))
    sig_mask = sig_stft_db < db_thresh

    update_pbar(pbar, "Smooth mask")
    # Create a smoothing filter for the mask in time and frequency
    smoothing_filter = _smoothing_filter(n_grad_freq, n_grad_time)
    # convolve the mask with a smoothing filter
    sig_mask = convolve_gaussian(sig_mask, smoothing_filter, use_tensorflow)
    sig_mask = sig_mask * prop_decrease

    update_pbar(pbar, "Apply mask")
    # mask the signal
    sig_stft_amp = mask_signal(sig_stft, sig_mask)

    update_pbar(pbar, "Recover signal")
    # recover the signal
    recovered_signal = _istft(
        sig_stft_amp, n_fft, hop_length, win_length, use_tensorflow=use_tensorflow
    )
    # fix the recovered signal length if padding signal
    if pad_clipping:
        # `size` is keyword-only in librosa >= 0.10; the keyword form is also
        # accepted by older releases.
        recovered_signal = librosa.util.fix_length(recovered_signal, size=nsamp)

    if pbar is not None:
        pbar.close()

    if verbose:
        # the recovered spectrogram is only needed for plotting, so the extra
        # STFT is skipped otherwise
        recovered_spec = _amp_to_db(
            np.abs(
                _stft(
                    recovered_signal,
                    n_fft,
                    hop_length,
                    win_length,
                    use_tensorflow=use_tensorflow,
                )
            )
        )
        plot_reduction_steps(
            noise_stft_db,
            mean_freq_noise,
            std_freq_noise,
            noise_thresh,
            smoothing_filter,
            sig_stft_db,
            sig_mask,
            recovered_spec,
        )
    return recovered_signal