Spaces:
Running
Running
File size: 5,735 Bytes
bd94e77 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 |
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import math
import numpy as np
def freq2erb(freq_hz: float) -> float:
    """
    Map a frequency in Hz onto the ERB-number scale.

    Computes ``9.265 * ln(1 + freq_hz / (24.7 * 9.265))``; note that
    ``1 / (24.7 * 9.265) = 0.00436976``.

    Background: https://www.cnblogs.com/LXP-Never/p/16011229.html

    :param freq_hz: frequency in Hz.
    :return: the corresponding ERB-scale value.
    """
    hz_per_unit = 24.7 * 9.265
    ratio = freq_hz / hz_per_unit
    return 9.265 * math.log(ratio + 1)
def erb2freq(n_erb: float) -> float:
    """
    Map an ERB-scale value back to a frequency in Hz.

    Computes ``24.7 * 9.265 * (exp(n_erb / 9.265) - 1)``.

    :param n_erb: value on the ERB-number scale.
    :return: the corresponding frequency in Hz.
    """
    hz_per_unit = 24.7 * 9.265
    growth = math.exp(n_erb / 9.265) - 1
    return hz_per_unit * growth
def get_erb_widths(sample_rate: int, fft_size: int, erb_bins: int, min_freq_bins_for_erb: int) -> np.ndarray:
    """
    Compute how many FFT frequency bins fall into each ERB band.

    The ERB range between 0 Hz and the Nyquist frequency is split into
    ``erb_bins`` equal steps; each step edge is converted back to Hz and
    rounded to an FFT bin index. Bands that would receive fewer than
    ``min_freq_bins_for_erb`` bins are widened to that minimum, and the bins
    borrowed this way are subtracted from the following band.

    Ported from
    https://github.com/Rikorose/DeepFilterNet/blob/main/libDF/src/lib.rs

    :param sample_rate: sample rate in Hz.
    :param fft_size: FFT size; the widths are trimmed so that they sum to at
        most ``fft_size // 2 + 1`` bins.
    :param erb_bins: number of ERB (Equivalent Rectangular Bandwidth) bands.
    :param min_freq_bins_for_erb: minimum number of frequency bins per ERB band.
    :return: array of shape ``(erb_bins,)`` and dtype ``uint64`` holding the
        width, in frequency bins, of every band.
    :raises ValueError: if trimming would leave the last band with a negative width.
    """
    nyq_freq = sample_rate / 2.
    freq_width: float = sample_rate / fft_size
    min_erb: float = freq2erb(0.)
    max_erb: float = freq2erb(nyq_freq)
    step = (max_erb - min_erb) / erb_bins

    erb = [0] * erb_bins
    prev_freq_bin = 0
    # Number of bins already handed to earlier bands to satisfy the minimum width.
    freq_over = 0
    for i in range(1, erb_bins + 1):
        f = erb2freq(min_erb + i * step)
        freq_bin = round(f / freq_width)
        freq_bins = freq_bin - prev_freq_bin - freq_over
        if freq_bins < min_freq_bins_for_erb:
            # Not enough bins in this band: enforce the minimum and remember the debt.
            freq_over = min_freq_bins_for_erb - freq_bins
            freq_bins = min_freq_bins_for_erb
        else:
            freq_over = 0
        erb[i - 1] = freq_bins
        prev_freq_bin = freq_bin

    # The last band gets one extra bin because there are fft_size // 2 + 1 bins in total.
    erb[-1] += 1

    # Integer division keeps every width an exact int; true division made the
    # last width a float (and a fractional one for odd fft_size) before the
    # uint64 cast below.
    too_large = sum(erb) - (fft_size // 2 + 1)
    if too_large > 0:
        erb[-1] -= too_large
    if erb[-1] < 0:
        raise ValueError(
            "erb_bins * min_freq_bins_for_erb does not fit into fft_size // 2 + 1 frequency bins"
        )
    return np.array(erb, dtype=np.uint64)
def get_erb_filter_bank(erb_widths: np.ndarray,
                        sample_rate: int,
                        normalized: bool = True,
                        inverse: bool = False,
                        ):
    """
    Build a rectangular ERB filter bank matrix from per-band bin counts.

    Every band occupies a contiguous run of frequency bins whose length is the
    corresponding entry of ``erb_widths``; the runs are laid out back to back.

    :param erb_widths: 1-D array with the number of frequency bins of each band.
    :param sample_rate: accepted for interface compatibility; not used here.
    :param normalized: forward bank: divide every band column by its bin count.
        Inverse bank: rows are divided by their sum only when this is ``False``.
    :param inverse: return the transposed matrix (bands x frequency bins)
        instead of (frequency bins x bands).
    :return: float matrix of shape ``(sum(erb_widths), len(erb_widths))``, or
        its transpose when ``inverse`` is set.
    """
    widths = erb_widths.tolist()
    total_bins = int(np.sum(erb_widths))
    fb: np.ndarray = np.zeros(shape=(total_bins, len(erb_widths)))

    # Walk through the bands, marking each band's contiguous slice of bins.
    start = 0
    for band, width in enumerate(widths):
        fb[start: start + width, band] = 1
        start += width

    if inverse:
        fb = fb.T
        if not normalized:
            fb /= fb.sum(axis=1, keepdims=True)
    elif normalized:
        fb /= fb.sum(axis=0)
    return fb
def spec2erb(spec: np.ndarray, erb_fb: np.ndarray, db: bool = True):
    """
    Project a spectrum onto ERB bands and optionally convert to decibels.

    :param spec: spectrum whose last axis indexes frequency bins
        (the original notes describe it as [B, C, T, F] - TODO confirm with callers).
    :param erb_fb: ERB filter bank matrix; it is right-multiplied onto the
        power spectrum, so its first axis must match the last axis of ``spec``.
    :param db: whether to transform the output into decibel scale
        (``10 * log10(x + 1e-10)``). Defaults to `True`.
    :return: ERB features as a ``float32`` array.
    """
    # Power spectrum: |spec|^2 (real * real + imag * imag for complex input).
    power = np.abs(spec) ** 2

    # Sum the power of the frequency bins belonging to each ERB band.
    banded = np.matmul(power, erb_fb)
    if not db:
        return np.array(banded, dtype=np.float32)

    # The small offset keeps log10 finite for silent bands.
    return np.array(10 * np.log10(banded + 1e-10), dtype=np.float32)
def _calculate_norm_alpha(sample_rate: int, hop_size: int, tau: float):
    """Exponential decay factor alpha for a given tau (decay window size [s])."""
    dt = hop_size / sample_rate
    return math.exp(-dt / tau)


def get_norm_alpha(sample_rate: int, hop_size: int, norm_tau: float) -> float:
    """
    Return the normalization decay factor rounded to as few decimals as possible.

    The exact factor ``exp(-(hop_size / sample_rate) / norm_tau)`` is rounded
    to 3 decimals; while the rounded value is not strictly below 1.0 the
    number of decimals is increased by one and the rounding is repeated.

    :param sample_rate: sample rate in Hz.
    :param hop_size: hop size in samples.
    :param norm_tau: decay window size in seconds.
    :return: rounded decay factor, strictly below 1.0.
    :raises ValueError: if the exact factor is not below 1.0 (for example
        ``hop_size == 0``); no amount of rounding can bring it below 1.0, so
        the search would otherwise never terminate.
    """
    exact_alpha = _calculate_norm_alpha(sample_rate=sample_rate, hop_size=hop_size, tau=norm_tau)
    if exact_alpha >= 1.0:
        raise ValueError(
            f"decay factor must be below 1.0, got {exact_alpha} "
            f"(sample_rate={sample_rate}, hop_size={hop_size}, norm_tau={norm_tau})"
        )

    precision = 3
    alpha = round(exact_alpha, precision)
    while alpha >= 1.0:
        # Rounded up to 1.0: keep one more decimal and try again.
        precision += 1
        alpha = round(exact_alpha, precision)
    return alpha
# Initial running-mean values for the first and the last ERB band.
MEAN_NORM_INIT = [-60., -90.]


def make_erb_norm_state(erb_bins: int, channels: int) -> np.ndarray:
    """
    Create the initial running-mean state used for ERB feature normalization.

    Each channel starts from the same linear ramp going from
    ``MEAN_NORM_INIT[0]`` at the first band to ``MEAN_NORM_INIT[1]`` at the last.

    :param erb_bins: number of ERB bands.
    :param channels: number of rows (one per audio channel) to create.
    :return: array of shape ``(channels, erb_bins)``.
    """
    first, last = MEAN_NORM_INIT[0], MEAN_NORM_INIT[1]
    ramp = np.linspace(first, last, erb_bins)
    # Stack one independent copy of the ramp per channel.
    return np.repeat(ramp[np.newaxis, :], channels, axis=0)
def erb_normalize(erb_feat: np.ndarray, alpha: float, state: np.ndarray = None):
    """
    Normalize ERB features with an exponentially decaying running mean.

    For every time step the running mean is updated as
    ``mean = x * (1 - alpha) + mean * alpha``; the updated mean is then
    subtracted from the frame and the result is divided by 40.

    All batch entries and bands of one time step are processed with a single
    array expression; only the recursion over time remains a Python loop.

    :param erb_feat: features of shape ``(batch, time_steps, erb_bins)``.
        The input array is not modified.
    :param alpha: decay factor of the running mean.
    :param state: running mean indexed as ``[batch, erb_bin]``. When given it
        is updated in place, so it can be carried over to the next call.
        When ``None`` a fresh state from ``make_erb_norm_state`` is used.
    :return: normalized copy of ``erb_feat``.
    """
    erb_feat = np.copy(erb_feat)
    batch_size, time_steps, erb_bins = erb_feat.shape
    if state is None:
        state = make_erb_norm_state(erb_bins, batch_size)

    # A view, so that writes go through to the caller's state array.
    running_mean = state[:batch_size, :erb_bins]
    for t in range(time_steps):
        frame = erb_feat[:, t, :]
        running_mean[...] = frame * (1. - alpha) + running_mean * alpha
        frame[...] = frame - running_mean
        frame[...] = frame / 40.
    return erb_feat
# Initial running-magnitude values for the first and the last frequency bin.
UNIT_NORM_INIT = [0.001, 0.0001]


def make_spec_norm_state(df_bins: int, channels: int) -> np.ndarray:
    """
    Create the initial running-magnitude state used for spectrum normalization.

    Each channel starts from the same linear ramp going from
    ``UNIT_NORM_INIT[0]`` at the first bin to ``UNIT_NORM_INIT[1]`` at the last.

    :param df_bins: number of frequency bins.
    :param channels: number of rows (one per audio channel) to create.
    :return: array of shape ``(channels, df_bins)``.
    """
    first, last = UNIT_NORM_INIT[0], UNIT_NORM_INIT[1]
    ramp = np.linspace(first, last, df_bins)
    # Stack one independent copy of the ramp per channel.
    return np.repeat(ramp[np.newaxis, :], channels, axis=0)
def spec_normalize(spec_feat: np.ndarray, alpha: float, state: np.ndarray = None):
    """
    Normalize a spectrum by the square root of its running mean magnitude.

    For every time step the running magnitude is updated as
    ``mag = |x| * (1 - alpha) + mag * alpha``; the frame is then divided by
    ``sqrt(mag)``.

    All batch entries and bins of one time step are processed with a single
    array expression; only the recursion over time remains a Python loop.

    :param spec_feat: spectrum of shape ``(batch, time_steps, df_bins)``.
        The input array is not modified.
    :param alpha: decay factor of the running magnitude.
    :param state: running magnitude indexed as ``[batch, df_bin]``. When given
        it is updated in place, so it can be carried over to the next call.
        When ``None`` a fresh state from ``make_spec_norm_state`` is used.
    :return: normalized copy of ``spec_feat``.
    """
    spec_feat = np.copy(spec_feat)
    batch_size, time_steps, df_bins = spec_feat.shape
    if state is None:
        state = make_spec_norm_state(df_bins, batch_size)

    # A view, so that writes go through to the caller's state array.
    running_mag = state[:batch_size, :df_bins]
    for t in range(time_steps):
        frame = spec_feat[:, t, :]
        running_mag[...] = np.abs(frame) * (1. - alpha) + running_mag * alpha
        frame[...] = frame / np.sqrt(running_mag)
    return spec_feat
# Script entry point: running this module directly performs no work.
if __name__ == '__main__':
    pass
|