# Spaces: Running  (web-page header residue from extraction; not part of the module)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import math

import numpy as np
def freq2erb(freq_hz: float) -> float:
    """
    Convert a frequency in Hz to the ERB (Equivalent Rectangular Bandwidth) scale.

    Computes ``9.265 * ln(1 + freq_hz / (24.7 * 9.265))``, where
    ``1 / (24.7 * 9.265)`` is approximately ``0.00436976``.

    Reference: https://www.cnblogs.com/LXP-Never/p/16011229.html

    :param freq_hz: frequency in Hz.
    :return: the ERB-scale value; ``0.0`` when ``freq_hz`` is ``0``.
    """
    return 9.265 * math.log(freq_hz / (24.7 * 9.265) + 1)
def erb2freq(n_erb: float) -> float:
    """
    Convert an ERB-scale value back to a frequency in Hz.

    Computes ``24.7 * 9.265 * (exp(n_erb / 9.265) - 1)``, the algebraic
    inverse of ``9.265 * ln(1 + f / (24.7 * 9.265))``.

    :param n_erb: value on the ERB scale.
    :return: the frequency in Hz; ``0.0`` when ``n_erb`` is ``0``.
    """
    return 24.7 * 9.265 * (math.exp(n_erb / 9.265) - 1)
def get_erb_widths(sample_rate: int, fft_size: int, erb_bins: int, min_freq_bins_for_erb: int) -> np.ndarray:
    """
    Compute how many FFT frequency bins fall into each ERB band.

    The range from 0 Hz to the Nyquist frequency is split into ``erb_bins``
    steps of equal size on the ERB scale; every band edge is mapped back to
    the nearest FFT bin and the distance between consecutive edges is the
    band width.

    Reference: https://github.com/Rikorose/DeepFilterNet/blob/main/libDF/src/lib.rs

    :param sample_rate: sample rate in Hz.
    :param fft_size: FFT size; the widths are fitted to ``fft_size // 2 + 1`` bins.
    :param erb_bins: number of ERB (Equivalent Rectangular Bandwidth) bands.
    :param min_freq_bins_for_erb: minimum number of frequency bins per ERB band.
    :return: array of shape ``(erb_bins,)`` and dtype ``uint64`` holding the width of each band.
    """
    nyq_freq = sample_rate / 2.
    freq_width: float = sample_rate / fft_size

    min_erb: float = freq2erb(0.)
    max_erb: float = freq2erb(nyq_freq)

    erb = [0] * erb_bins
    step = (max_erb - min_erb) / erb_bins

    prev_freq_bin = 0
    # Bins "borrowed" by earlier bands that had to be widened to the minimum;
    # they are subtracted from the following band.
    freq_over = 0
    for i in range(1, erb_bins + 1):
        f = erb2freq(min_erb + i * step)
        freq_bin = int(round(f / freq_width))
        freq_bins = freq_bin - prev_freq_bin - freq_over
        if freq_bins < min_freq_bins_for_erb:
            freq_over = min_freq_bins_for_erb - freq_bins
            freq_bins = min_freq_bins_for_erb
        else:
            freq_over = 0
        erb[i - 1] = freq_bins
        prev_freq_bin = freq_bin

    # The spectrum has fft_size // 2 + 1 bins, so the last band gets one more.
    erb[erb_bins - 1] += 1

    # Integer arithmetic keeps the widths integral (the former
    # ``fft_size / 2 + 1`` turned the last width into a float).
    too_large = sum(erb) - (fft_size // 2 + 1)
    if too_large > 0:
        erb[erb_bins - 1] -= too_large
    return np.array(erb, dtype=np.uint64)
def get_erb_filter_bank(erb_widths: np.ndarray,
                        sample_rate: int,
                        normalized: bool = True,
                        inverse: bool = False,
                        ):
    """
    Build a rectangular ERB filter bank matrix from per-band widths.

    Band ``i`` covers ``erb_widths[i]`` consecutive frequency bins, starting
    where band ``i - 1`` ends. Note that a zero-width band has an all-zero
    column, so normalizing it divides by zero.

    :param erb_widths: 1-D array with the number of frequency bins of each ERB band.
    :param sample_rate: not used by the computation; kept for interface compatibility.
    :param normalized: for the forward matrix, divide every band column by its
        width so that a band output is the mean of its bins. For the inverse
        matrix the flag acts the other way round: rows are divided by the
        band width only when ``normalized`` is ``False``.
    :param inverse: return the transposed matrix (ERB bands -> frequency bins).
    :return: array of shape ``(num_freq_bins, num_erb_bins)``, or
        ``(num_erb_bins, num_freq_bins)`` when ``inverse`` is ``True``.
    """
    num_freq_bins = int(np.sum(erb_widths))
    num_erb_bins = len(erb_widths)

    fb: np.ndarray = np.zeros(shape=(num_freq_bins, num_erb_bins))

    # First frequency bin of every band.
    points = np.cumsum([0] + erb_widths.tolist()).astype(int)[:-1]
    for i, (b, w) in enumerate(zip(points.tolist(), erb_widths.tolist())):
        fb[b: b + w, i] = 1

    if inverse:
        fb = fb.T
        if not normalized:
            fb /= np.sum(fb, axis=1, keepdims=True)
    else:
        if normalized:
            fb /= np.sum(fb, axis=0)
    return fb
def spec2erb(spec: np.ndarray, erb_fb: np.ndarray, db: bool = True):
    """
    Apply an ERB filter bank to a spectrum and optionally convert to decibels.

    :param spec: spectrum whose last axis is the frequency axis ``F``
        (e.g. shape ``[B, C, T, F]``); may be complex.
    :param erb_fb: ERB filter bank matrix of shape ``[F, E]``, where ``E`` is
        the number of ERB bands; it is matrix-multiplied with the power spectrum.
    :param db: whether to transform the output into decibel scale
        (``10 * log10(x + 1e-10)``). Defaults to ``True``.
    :return: ``float32`` array with the last axis replaced by the ``E`` ERB bands.
    """
    # complex spec to power spec. (real * real + imag * imag)
    power = np.abs(spec) ** 2

    # spec to erb feature.
    erb_feat = np.matmul(power, erb_fb)

    if db:
        # The small offset avoids log10(0) for silent bands.
        erb_feat = 10 * np.log10(erb_feat + 1e-10)

    erb_feat = np.array(erb_feat, dtype=np.float32)
    return erb_feat
def _calculate_norm_alpha(sample_rate: int, hop_size: int, tau: float):
    """Exponential decay factor alpha for a given tau (decay window size [s])."""
    dt = hop_size / sample_rate
    return math.exp(-dt / tau)


def get_norm_alpha(sample_rate: int, hop_size: int, norm_tau: float) -> float:
    """
    Return the exponential decay factor rounded to as few decimals as possible.

    The raw factor ``exp(-(hop_size / sample_rate) / norm_tau)`` is rounded to
    3 decimals; while the rounded value is not below 1.0 the number of
    decimals is increased by one.

    :param sample_rate: sample rate in Hz.
    :param hop_size: hop size in samples.
    :param norm_tau: decay window size in seconds.
    :return: the rounded decay factor, strictly below 1.0.
    :raises ValueError: if the raw factor is not below 1.0 (for example
        ``hop_size == 0``); the rounding loop could never terminate then.
    """
    a_ = _calculate_norm_alpha(sample_rate=sample_rate, hop_size=hop_size, tau=norm_tau)
    if a_ >= 1.0:
        raise ValueError(
            "decay factor must be below 1.0, got {} "
            "(sample_rate={}, hop_size={}, norm_tau={})".format(a_, sample_rate, hop_size, norm_tau)
        )

    precision = 3
    a = 1.0
    while a >= 1.0:
        a = round(a_, precision)
        precision += 1
    return a
MEAN_NORM_INIT = [-60., -90.]


def make_erb_norm_state(erb_bins: int, channels: int) -> np.ndarray:
    """
    Create the initial running-mean state for ERB feature normalization.

    Every channel starts with the same ramp, linearly spaced from
    ``MEAN_NORM_INIT[0]`` at the first ERB band to ``MEAN_NORM_INIT[1]`` at
    the last one.

    :param erb_bins: number of ERB bands.
    :param channels: number of rows (one per audio channel / batch item).
    :return: array of shape ``(channels, erb_bins)``.
    """
    ramp = np.linspace(MEAN_NORM_INIT[0], MEAN_NORM_INIT[1], erb_bins)
    state = np.expand_dims(ramp, axis=0)
    # np.repeat copies the data, so every channel owns an independent row.
    state = np.repeat(state, channels, axis=0)
    return state
def erb_normalize(erb_feat: np.ndarray, alpha: float, state: np.ndarray = None):
    """
    Normalize ERB features with an exponentially decaying running mean.

    For every time step the running mean is updated as
    ``state = x * (1 - alpha) + state * alpha`` and the output frame is
    ``(x - state) / 40``.

    :param erb_feat: array of shape ``(batch_size, time_steps, erb_bins)``;
        it is copied, the input is left untouched.
    :param alpha: decay factor of the running mean.
    :param state: running mean of shape ``(batch_size, erb_bins)``. It is
        updated in place, so the same array can be passed again for the next
        chunk. When ``None`` a fresh state from ``make_erb_norm_state`` is used.
    :return: normalized array with the shape and dtype of ``erb_feat``.
    """
    erb_feat = np.copy(erb_feat)
    batch_size, time_steps, erb_bins = erb_feat.shape

    if state is None:
        state = make_erb_norm_state(erb_bins, batch_size)

    # View on the part of the state that is used, so updates stay in place.
    running = state[:batch_size, :erb_bins]

    # The recursion runs along time only; batch and band axes are vectorized.
    for j in range(time_steps):
        frame = erb_feat[:, j, :]
        running[...] = frame * (1. - alpha) + running * alpha
        frame[...] = (frame - running) / 40.
    return erb_feat
UNIT_NORM_INIT = [0.001, 0.0001]


def make_spec_norm_state(df_bins: int, channels: int) -> np.ndarray:
    """
    Create the initial running-magnitude state for spectrum normalization.

    Every channel starts with the same ramp, linearly spaced from
    ``UNIT_NORM_INIT[0]`` at the first bin to ``UNIT_NORM_INIT[1]`` at the
    last one.

    :param df_bins: number of frequency bins to normalize.
    :param channels: number of rows (one per audio channel / batch item).
    :return: array of shape ``(channels, df_bins)``.
    """
    ramp = np.linspace(UNIT_NORM_INIT[0], UNIT_NORM_INIT[1], df_bins)
    state = np.expand_dims(ramp, axis=0)
    # np.repeat copies the data, so every channel owns an independent row.
    state = np.repeat(state, channels, axis=0)
    return state
def spec_normalize(spec_feat: np.ndarray, alpha: float, state: np.ndarray = None):
    """
    Normalize a spectrum by an exponentially decaying running magnitude.

    For every time step the running magnitude is updated as
    ``state = |x| * (1 - alpha) + state * alpha`` and the output frame is
    ``x / sqrt(state)``.

    :param spec_feat: array of shape ``(batch_size, time_steps, df_bins)``;
        it is copied, the input is left untouched.
    :param alpha: decay factor of the running magnitude.
    :param state: running magnitude of shape ``(batch_size, df_bins)``. It is
        updated in place, so the same array can be passed again for the next
        chunk. When ``None`` a fresh state from ``make_spec_norm_state`` is used.
    :return: normalized array with the shape and dtype of ``spec_feat``.
    """
    spec_feat = np.copy(spec_feat)
    batch_size, time_steps, df_bins = spec_feat.shape

    if state is None:
        state = make_spec_norm_state(df_bins, batch_size)

    # View on the part of the state that is used, so updates stay in place.
    running = state[:batch_size, :df_bins]

    # The recursion runs along time only; batch and bin axes are vectorized.
    for j in range(time_steps):
        frame = spec_feat[:, j, :]
        running[...] = np.abs(frame) * (1. - alpha) + running * alpha
        frame[...] = frame / np.sqrt(running)
    return spec_feat
# Nothing to run when executed as a script; the module only provides helpers.
if __name__ == '__main__':
    pass