|
import scipy.signal |
|
import numpy as np |
|
import librosa |
|
from noisereduce.plotting import plot_reduction_steps |
|
from tqdm.autonotebook import tqdm |
|
import warnings |
|
import copy |
|
|
|
|
|
def _stft(y, n_fft, hop_length, win_length, use_tensorflow=False):
    """Compute the short-time Fourier transform of ``y`` with the chosen backend.

    Arguments:
        y -- waveform to transform
        n_fft -- FFT size forwarded to the backend
        hop_length -- hop between successive frames, forwarded to the backend
        win_length -- analysis window length, forwarded to the backend

    Keyword Arguments:
        use_tensorflow {bool} -- dispatch to ``_stft_tensorflow`` instead of
            ``librosa.stft`` (default: {False})

    Returns:
        The spectrogram produced by the selected backend.
    """
    if not use_tensorflow:
        spectrogram = librosa.stft(
            y=y,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            center=True,
        )
        return spectrogram

    return _stft_tensorflow(y, n_fft, hop_length, win_length)
|
|
|
|
|
def _istft(y, n_fft, hop_length, win_length, use_tensorflow=False):
    """Invert a spectrogram back to a waveform with the chosen backend.

    Arguments:
        y -- complex spectrogram to invert
        n_fft -- FFT size; only forwarded to the TensorFlow backend
        hop_length -- hop between successive frames
        win_length -- synthesis window length

    Keyword Arguments:
        use_tensorflow {bool} -- dispatch to ``_istft_tensorflow`` instead of
            ``librosa.istft`` (default: {False})

    Returns:
        The reconstructed waveform produced by the selected backend.
    """
    if use_tensorflow:
        # The TensorFlow helper is handed the transposed spectrogram.
        return _istft_tensorflow(y.T, n_fft, hop_length, win_length)

    # hop_length / win_length are keyword-only in recent librosa releases;
    # passing them by keyword works on both old and new versions.
    return librosa.istft(y, hop_length=hop_length, win_length=win_length)
|
|
|
|
|
def _stft_librosa(y, n_fft, hop_length, win_length):
    """Compute the STFT of ``y`` with ``librosa.stft`` using centered frames.

    All arguments are forwarded to librosa by keyword; ``center`` is fixed to
    ``True``.
    """
    stft_kwargs = {
        "y": y,
        "n_fft": n_fft,
        "hop_length": hop_length,
        "win_length": win_length,
        "center": True,
    }
    return librosa.stft(**stft_kwargs)
|
|
|
|
|
def _istft_librosa(y, hop_length, win_length):
    """Invert the spectrogram ``y`` to a waveform with ``librosa.istft``.

    Arguments:
        y -- complex spectrogram to invert
        hop_length -- hop between successive frames
        win_length -- synthesis window length

    Returns:
        The reconstructed waveform.
    """
    # hop_length / win_length are keyword-only in recent librosa releases;
    # passing them by keyword works on both old and new versions.
    return librosa.istft(y, hop_length=hop_length, win_length=win_length)
|
|
|
|
|
def _stft_tensorflow(y, n_fft, hop_length, win_length):
    """Compute the STFT of ``y`` with ``tf.signal.stft``.

    Uses a Hann window and ``pad_end=True``. The result is converted to a
    numpy array and transposed before being returned.

    Relies on the module-level name ``tf``, which ``load_tensorflow`` installs.
    """
    stft_tensor = tf.signal.stft(
        y,
        win_length,
        hop_length,
        n_fft,
        pad_end=True,
        window_fn=tf.signal.hann_window,
    )
    stft_array = stft_tensor.numpy()
    return stft_array.T
|
|
|
|
|
def _istft_tensorflow(y, n_fft, hop_length, win_length):
    """Invert the spectrogram ``y`` with ``tf.signal.inverse_stft``.

    ``y`` is cast to ``complex64`` before the call, and the resulting tensor
    is returned as a numpy array.

    Relies on the module-level name ``tf``, which ``load_tensorflow`` installs.
    """
    spectrogram = y.astype(np.complex64)
    signal_tensor = tf.signal.inverse_stft(
        spectrogram, win_length, hop_length, n_fft
    )
    return signal_tensor.numpy()
|
|
|
|
|
def _amp_to_db(x):
    """Convert amplitudes ``x`` to decibels via librosa.

    Uses a reference of 1.0, an amplitude floor of 1e-20 and an 80 dB
    ``top_db`` setting.
    """
    decibels = librosa.core.amplitude_to_db(
        x,
        ref=1.0,
        amin=1e-20,
        top_db=80.0,
    )
    return decibels
|
|
|
|
|
def _db_to_amp(x,):
    """Convert decibel values ``x`` back to amplitudes via librosa (ref 1.0)."""
    amplitude = librosa.core.db_to_amplitude(x, ref=1.0)
    return amplitude
|
|
|
|
|
def update_pbar(pbar, message):
    """Advance a progress bar by one step and relabel it.

    Arguments:
        pbar -- progress bar exposing ``set_description`` and ``update``,
            or None to do nothing
        message -- description to show on the bar
    """
    if pbar is None:
        return
    pbar.set_description(message)
    pbar.update(1)
|
|
|
|
|
def _smoothing_filter(n_grad_freq, n_grad_time): |
|
"""Generates a filter to smooth the mask for the spectrogram |
|
|
|
Arguments: |
|
n_grad_freq {[type]} -- [how many frequency channels to smooth over with the mask.] |
|
n_grad_time {[type]} -- [how many time channels to smooth over with the mask.] |
|
""" |
|
|
|
smoothing_filter = np.outer( |
|
np.concatenate( |
|
[ |
|
np.linspace(0, 1, n_grad_freq + 1, endpoint=False), |
|
np.linspace(1, 0, n_grad_freq + 2), |
|
] |
|
)[1:-1], |
|
np.concatenate( |
|
[ |
|
np.linspace(0, 1, n_grad_time + 1, endpoint=False), |
|
np.linspace(1, 0, n_grad_time + 2), |
|
] |
|
)[1:-1], |
|
) |
|
smoothing_filter = smoothing_filter / np.sum(smoothing_filter) |
|
return smoothing_filter |
|
|
|
|
|
def mask_signal(sig_stft, sig_mask):
    """Attenuate time/frequency regions of a spectrogram according to a mask.

    Arguments:
        sig_stft -- spectrogram of the signal
        sig_mask -- mask to apply; 0 leaves a bin untouched, 1 removes it

    Returns:
        The spectrogram scaled element-wise by ``1 - sig_mask``.
    """
    keep_fraction = 1 - sig_mask
    return sig_stft * keep_fraction
|
|
|
|
|
def convolve_gaussian(sig_mask, smoothing_filter, use_tensorflow=False):
    """Convolve a smoothing filter with a mask (or any 2-D image).

    Arguments:
        sig_mask -- the signal mask
        smoothing_filter -- the filter to convolve with the mask

    Keyword Arguments:
        use_tensorflow {bool} -- use ``tf.nn.conv2d`` instead of
            ``scipy.signal.fftconvolve`` (default: {False})

    Returns:
        The smoothed mask, with the same 2-D extent as ``sig_mask``.
    """
    if not use_tensorflow:
        return scipy.signal.fftconvolve(sig_mask, smoothing_filter, mode="same")

    # The TensorFlow path rescales the filter by a factor derived from its
    # width before convolving.
    n_filter_cols = np.shape(smoothing_filter)[1]
    scaled_filter = smoothing_filter * ((n_filter_cols - 1) / 2 + 1)

    # conv2d is given 4-D operands, so two trailing singleton axes are added.
    kernel = scaled_filter[:, :, tf.newaxis, tf.newaxis].astype("float32")
    img = sig_mask[:, :, tf.newaxis, tf.newaxis].astype("float32")

    convolved = tf.nn.conv2d(img, kernel, strides=[1, 1, 1, 1], padding="SAME")
    return convolved.numpy().squeeze()
|
|
|
|
|
def load_tensorflow(verbose=False):
    """Load tensorflow if it is available.

    Used as a backend for fft and convolution. On success the module is bound
    to the module-level name ``tf`` so the TensorFlow helpers can use it.

    Keyword Arguments:
        verbose {bool} -- print the GPUs tensorflow can see (default: {False})

    Returns:
        bool -- whether to use tensorflow
    """
    global tf
    try:
        import tensorflow as tf
    except Exception:
        # Any import failure falls back to the numpy/scipy backend, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        warnings.warn(
            "Tensorflow is not installed, reverting to non-tensorflow backend"
        )
        return False

    # Parse the whole major component: reading only the first character
    # would treat "10.x" as version 1.
    major_version = int(tf.__version__.split(".")[0])
    if major_version < 2:
        warnings.warn(
            "Tensorflow version is below 2.0, reverting to non-tensorflow backend"
        )
        return False

    if verbose:
        available_gpus = tf.config.experimental.list_physical_devices("GPU")
        print("GPUs available: {}".format(available_gpus))

    return True
|
|
|
|
|
def reduce_noise(
    audio_clip,
    noise_clip=None,
    n_grad_freq=2,
    n_grad_time=4,
    n_fft=2048,
    win_length=2048,
    hop_length=512,
    n_std_thresh=1.5,
    prop_decrease=1.0,
    pad_clipping=True,
    use_tensorflow=False,
    verbose=False,
):
    """Remove noise from audio based upon a clip containing only noise

    A per-frequency dB threshold is derived from the noise spectrogram
    (mean + ``n_std_thresh`` * std). Time/frequency bins of the signal below
    that threshold are masked, the mask is smoothed, and the masked
    spectrogram is inverted back to a waveform.

    Args:
        audio_clip (array): Waveform of audio
        noise_clip (array): Waveform containing only noise. If None, the noise
            statistics are computed from ``audio_clip`` itself.
        n_grad_freq (int): how many frequency channels to smooth over with the mask.
        n_grad_time (int): how many time channels to smooth over with the mask.
        n_fft (int): FFT size passed to the STFT backend.
        win_length (int): Each frame of audio is windowed by `window()`. The window will be of length `win_length` and then padded with zeros to match `n_fft`.
        hop_length (int): number of audio frames between STFT columns.
        n_std_thresh (float): how many standard deviations louder than the mean dB of the noise (at each frequency level) to be considered signal
        prop_decrease (float): To what extent should you decrease noise (1 = all, 0 = none)
        pad_clipping (bool): Pad the signals with zeros to ensure that the reconstructed data is equal length to the data
        use_tensorflow (bool): Use tensorflow as a backend for convolution and fft to speed up computation
        verbose (bool): Whether to show a progress bar and plot the steps of the algorithm

    Returns:
        array: The recovered signal with noise subtracted

    """
    if use_tensorflow:
        # Falls back to the librosa/scipy backend if tensorflow is unusable.
        use_tensorflow = load_tensorflow(verbose)

    pbar = tqdm(total=7) if verbose else None

    update_pbar(pbar, "STFT on signal")
    if pad_clipping:
        # Remember the original length so the output can be trimmed back.
        nsamp = len(audio_clip)
        audio_clip = np.pad(audio_clip, [0, hop_length], mode="constant")

    sig_stft = _stft(
        audio_clip, n_fft, hop_length, win_length, use_tensorflow=use_tensorflow
    )
    sig_stft_db = _amp_to_db(np.abs(sig_stft))

    update_pbar(pbar, "STFT on noise")
    if noise_clip is None:
        # No dedicated noise clip: estimate noise statistics from the signal.
        noise_stft_db = copy.deepcopy(sig_stft_db)
    else:
        noise_stft = _stft(
            noise_clip, n_fft, hop_length, win_length, use_tensorflow=use_tensorflow
        )
        noise_stft_db = _amp_to_db(np.abs(noise_stft))

    # Per-frequency noise statistics, taken over the time axis.
    mean_freq_noise = np.mean(noise_stft_db, axis=1)
    std_freq_noise = np.std(noise_stft_db, axis=1)
    noise_thresh = mean_freq_noise + std_freq_noise * n_std_thresh

    update_pbar(pbar, "Generate mask")
    # Broadcast the per-frequency threshold across every time frame.
    sig_mask = sig_stft_db < noise_thresh[:, np.newaxis]

    update_pbar(pbar, "Smooth mask")
    smoothing_filter = _smoothing_filter(n_grad_freq, n_grad_time)
    sig_mask = convolve_gaussian(sig_mask, smoothing_filter, use_tensorflow)
    sig_mask = sig_mask * prop_decrease

    update_pbar(pbar, "Apply mask")
    sig_stft_amp = mask_signal(sig_stft, sig_mask)

    update_pbar(pbar, "Recover signal")
    recovered_signal = _istft(
        sig_stft_amp, n_fft, hop_length, win_length, use_tensorflow=use_tensorflow
    )

    if pad_clipping:
        # `size` is keyword-only in recent librosa releases.
        recovered_signal = librosa.util.fix_length(recovered_signal, size=nsamp)

    if verbose:
        # The recovered spectrogram is only needed for plotting, so the extra
        # STFT is skipped otherwise.
        recovered_spec = _amp_to_db(
            np.abs(
                _stft(
                    recovered_signal,
                    n_fft,
                    hop_length,
                    win_length,
                    use_tensorflow=use_tensorflow,
                )
            )
        )
        plot_reduction_steps(
            noise_stft_db,
            mean_freq_noise,
            std_freq_noise,
            noise_thresh,
            smoothing_filter,
            sig_stft_db,
            sig_mask,
            recovered_spec,
        )

    return recovered_signal
|
|