#!/usr/bin/python3
# -*- coding: utf-8 -*-
import math
import numpy as np
import torch
import torch.nn as nn
class EMANumpy(object): | |
def _calculate_norm_alpha(cls, sample_rate: int, hop_size: int, tau: float): | |
"""Exponential decay factor alpha for a given tau (decay window size [s]).""" | |
dt = hop_size / sample_rate | |
result = math.exp(-dt / tau) | |
return result | |
def get_norm_alpha(cls, sample_rate: int, hop_size: int, norm_tau: float) -> float: | |
a_ = cls._calculate_norm_alpha(sample_rate=sample_rate, hop_size=hop_size, tau=norm_tau) | |
precision = 3 | |
a = 1.0 | |
while a >= 1.0: | |
a = round(a_, precision) | |
precision += 1 | |
return a | |
class ErbEMA(nn.Module, EMANumpy):
    """Streaming EMA-based normalization of ERB features.

    Each time frame is smoothed into a running state with factor ``alpha``
    (derived from ``norm_tau`` via :class:`EMANumpy`), then the frame is
    centered on that state and scaled by 1/40.
    """

    def __init__(self,
                 sample_rate: int = 8000,
                 hop_size: int = 80,
                 erb_bins: int = 32,
                 mean_norm_init_start: float = -60.,
                 mean_norm_init_end: float = -90.,
                 norm_tau: float = 1.,
                 ):
        super().__init__()
        self.sample_rate = sample_rate
        self.hop_size = hop_size
        self.erb_bins = erb_bins
        self.mean_norm_init_start = mean_norm_init_start
        self.mean_norm_init_end = mean_norm_init_end
        self.norm_tau = norm_tau
        # Smoothing coefficient for the running state (see EMANumpy).
        self.alpha = self.get_norm_alpha(sample_rate, hop_size, norm_tau)

    def make_erb_norm_state(self) -> torch.Tensor:
        """Initial state of shape [1, 1, erb_bins], ramping from start to end init values."""
        ramp = torch.linspace(start=self.mean_norm_init_start,
                              end=self.mean_norm_init_end,
                              steps=self.erb_bins)
        # [erb_bins] -> [b=1, c=1, erb_bins]
        return ramp.reshape(1, 1, self.erb_bins)

    def norm(self,
             feat_erb: torch.Tensor,
             state: torch.Tensor = None,
             ):
        """Normalize ``feat_erb`` of shape [b, c, t, f] frame by frame.

        :param feat_erb: input features; not modified (a clone is returned).
        :param state: optional running state [1, 1, erb_bins]; a fresh one is
            built when omitted. The caller's tensor is never mutated.
        :return: tuple of (normalized features, final state).
        """
        out = feat_erb.clone()
        _, _, num_frames, _ = out.shape
        if state is None:
            state = self.make_erb_norm_state().to(out.device)
        state = state.clone()
        alpha = self.alpha
        for step in range(num_frames):
            frame = out[:, :, step, :]
            state = frame * (1 - alpha) + state * alpha
            out[:, :, step, :] = (frame - state) / 40.0
        return out, state
class SpecEMA(nn.Module, EMANumpy):
    """Streaming EMA-based unit normalization of spectral features.

    https://github.com/grazder/DeepFilterNet/blob/torchDF_main/libDF/src/lib.rs
    """

    def __init__(self,
                 sample_rate: int = 8000,
                 hop_size: int = 80,
                 df_bins: int = 96,
                 unit_norm_init_start: float = 0.001,
                 unit_norm_init_end: float = 0.0001,
                 norm_tau: float = 1.,
                 ):
        super().__init__()
        self.sample_rate = sample_rate
        self.hop_size = hop_size
        self.df_bins = df_bins
        self.unit_norm_init_start = unit_norm_init_start
        self.unit_norm_init_end = unit_norm_init_end
        self.norm_tau = norm_tau
        # Smoothing coefficient for the running state (see EMANumpy).
        self.alpha = self.get_norm_alpha(sample_rate, hop_size, norm_tau)

    def make_spec_norm_state(self) -> torch.Tensor:
        """Initial state of shape [1, 1, df_bins], ramping from start to end init values."""
        ramp = torch.linspace(start=self.unit_norm_init_start,
                              end=self.unit_norm_init_end,
                              steps=self.df_bins)
        # [df_bins] -> [b=1, c=1, df_bins]
        return ramp.reshape(1, 1, self.df_bins)

    def norm(self,
             feat_spec: torch.Tensor,
             state: torch.Tensor = None,
             ):
        """Normalize ``feat_spec`` of shape [b, 2, t, df_bins] frame by frame.

        :param feat_spec: input features; not modified (a clone is returned).
        :param state: optional running state [1, 1, df_bins]; a fresh one is
            built when omitted. The caller's tensor is never mutated.
        :return: tuple of (normalized features, final state).
        """
        out = feat_spec.clone()
        _, _, num_frames, _ = out.shape
        if state is None:
            state = self.make_spec_norm_state().to(out.device)
        state = state.clone()
        alpha = self.alpha
        for step in range(num_frames):
            frame = out[:, :, step, :]
            # Per-bin power: squared channels summed over dim 1 -> [b, 1, df_bins].
            # NOTE(review): the numpy spec_normalize in this file tracks |x|
            # (not |x|^2) in its state — confirm which statistic is intended.
            power = torch.sum(torch.square(frame), dim=1, keepdim=True)
            state = power * (1 - alpha) + state * alpha
            out[:, :, step, :] = frame / torch.sqrt(state)
        return out, state
# [start, end] of the initial mean-norm ramp, in the same units as the features.
MEAN_NORM_INIT = [-60., -90.]


def make_erb_norm_state(erb_bins: int, channels: int) -> np.ndarray:
    """Initial per-channel ERB mean-norm state.

    :param erb_bins: number of ERB bins.
    :param channels: number of audio channels (rows).
    :return: array of shape (channels, erb_bins); every row is the same
        linear ramp from MEAN_NORM_INIT[0] to MEAN_NORM_INIT[1].
    """
    ramp = np.linspace(MEAN_NORM_INIT[0], MEAN_NORM_INIT[1], erb_bins)
    return np.tile(ramp, (channels, 1))
def erb_normalize(erb_feat: np.ndarray, alpha: float, state: np.ndarray = None):
    """Streaming EMA mean normalization of ERB features.

    Replaces the former per-element triple loop with per-frame vectorized
    operations; only the EMA over time stays a loop (it is inherently
    sequential). Arithmetic order per element is unchanged.

    :param erb_feat: array of shape (channels, time_steps, erb_bins).
    :param alpha: EMA smoothing coefficient; larger means slower tracking.
    :param state: optional running mean of shape (channels, erb_bins). When
        given it is updated IN PLACE so the caller can stream across chunks
        (matches the original behavior).
    :return: normalized copy of erb_feat (the input array is not modified).
    """
    erb_feat = np.copy(erb_feat)
    batch_size, time_steps, erb_bins = erb_feat.shape
    if state is None:
        state = make_erb_norm_state(erb_bins, batch_size)
    for j in range(time_steps):
        frame = erb_feat[:, j, :]
        # EMA update, written into `state` in place.
        state[...] = frame * (1. - alpha) + state * alpha
        # Center on the running mean, then scale by 1/40 (same two-step
        # in-place ops as the original scalar code).
        erb_feat[:, j, :] -= state
        erb_feat[:, j, :] /= 40.
    return erb_feat
# [start, end] of the initial unit-norm ramp.
UNIT_NORM_INIT = [0.001, 0.0001]


def make_spec_norm_state(df_bins: int, channels: int) -> np.ndarray:
    """Initial per-channel spectral unit-norm state.

    :param df_bins: number of DF bins.
    :param channels: number of audio channels (rows).
    :return: array of shape (channels, df_bins); every row is the same
        linear ramp from UNIT_NORM_INIT[0] to UNIT_NORM_INIT[1].
    """
    ramp = np.linspace(UNIT_NORM_INIT[0], UNIT_NORM_INIT[1], df_bins)
    return np.tile(ramp, (channels, 1))
def spec_normalize(spec_feat: np.ndarray, alpha: float, state: np.ndarray = None):
    """Streaming EMA unit normalization of spectral features.

    Replaces the former per-element triple loop with per-frame vectorized
    operations; only the EMA over time stays a loop (it is inherently
    sequential). The state tracks |x| per bin and each frame is divided by
    sqrt(state), exactly as before.

    :param spec_feat: array of shape (channels, time_steps, df_bins).
    :param alpha: EMA smoothing coefficient; larger means slower tracking.
    :param state: optional running magnitude of shape (channels, df_bins).
        When given it is updated IN PLACE so the caller can stream across
        chunks (matches the original behavior).
    :return: normalized copy of spec_feat (the input array is not modified).
    """
    spec_feat = np.copy(spec_feat)
    batch_size, time_steps, df_bins = spec_feat.shape
    if state is None:
        state = make_spec_norm_state(df_bins, batch_size)
    for j in range(time_steps):
        frame = spec_feat[:, j, :]
        # EMA of the per-bin magnitude, written into `state` in place.
        state[...] = np.abs(frame) * (1. - alpha) + state * alpha
        # Unit-normalize the frame by the running magnitude.
        spec_feat[:, j, :] /= np.sqrt(state)
    return spec_feat
if __name__ == "__main__":
    # Import-only module: no demo or CLI behavior is provided.
    pass