Spaces:

qgyd2021
/

cc_denoise

Running

App Files Files Community

HoneyTian committed on May 16

Commit

169d6d6

1 Parent(s): 2171fed

add ema

Browse files

Files changed (5) hide show

examples/dtln/run.sh +1 -1
examples/rnnoise/run.sh +2 -1
examples/rnnoise/step_2_train_model.py +4 -0
toolbox/torchaudio/models/dfnet2/modeling_dfnet2.py +40 -11
toolbox/torchaudio/modules/utils/ema.py +120 -18

examples/dtln/run.sh CHANGED Viewed

@@ -6,7 +6,7 @@ sh run.sh --stage 2 --stop_stage 2 --system_version windows --file_folder_name f
 --noise_dir "E:/Users/tianx/HuggingDatasets/nx_noise/data/noise" \
 --speech_dir "E:/Users/tianx/HuggingDatasets/nx_noise/data/speech"
-sh run.sh --stage 2 --stop_stage 2 --system_version centos --file_folder_name file_dir --final_model_name dtln-dns3 \
 --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise/dns3-noise" \
 --speech_dir "/data/tianxing/HuggingDatasets/nx_noise/data/speech/dns3-speech"

 --noise_dir "E:/Users/tianx/HuggingDatasets/nx_noise/data/noise" \
 --speech_dir "E:/Users/tianx/HuggingDatasets/nx_noise/data/speech"
+sh run.sh --stage 2 --stop_stage 2 --system_version centos --file_folder_name file_dir --final_model_name dtln-nx-dns3 \
 --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise/dns3-noise" \
 --speech_dir "/data/tianxing/HuggingDatasets/nx_noise/data/speech/dns3-speech"

examples/rnnoise/run.sh CHANGED Viewed

@@ -8,7 +8,8 @@ sh run.sh --stage 3 --stop_stage 3 --system_version windows --file_folder_name f
 sh run.sh --stage 1 --stop_stage 3 --system_version centos --file_folder_name file_dir \
 --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise" \
---speech_dir "/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train"
 END

 sh run.sh --stage 1 --stop_stage 3 --system_version centos --file_folder_name file_dir \
 --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise" \
+--speech_dir "/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train" \
+--sparse
 END

examples/rnnoise/step_2_train_model.py CHANGED Viewed

@@ -48,6 +48,8 @@ def get_args():
     parser.add_argument("--config_file", default="config.yaml", type=str)
     args = parser.parse_args()
     return args
@@ -289,6 +291,8 @@ def main():
             loss.backward()
             torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.clip_grad_norm)
             optimizer.step()
             lr_scheduler.step()
             total_pesq_score += pesq_score

     parser.add_argument("--config_file", default="config.yaml", type=str)
+    parser.add_argument("--sparse", action="store_true")
     args = parser.parse_args()
     return args
             loss.backward()
             torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.clip_grad_norm)
             optimizer.step()
+            if args.sparse:
+                model.sparsify()
             lr_scheduler.step()
             total_pesq_score += pesq_score

toolbox/torchaudio/models/dfnet2/modeling_dfnet2.py CHANGED Viewed

@@ -24,6 +24,7 @@ from toolbox.torchaudio.models.dfnet2.configuration_dfnet2 import DfNet2Config
 from toolbox.torchaudio.modules.conv_stft import ConvSTFT, ConviSTFT
 from toolbox.torchaudio.modules.local_snr_target import LocalSnrTarget
 from toolbox.torchaudio.modules.freq_bands.erb_bands import ErbBands
 MODEL_FILE = "model.pt"
@@ -965,13 +966,6 @@ class DfNet2(nn.Module):
         self.hop_size = config.hop_size
         self.win_type = config.win_type
-        self.erb_bands = ErbBands(
-            sample_rate=config.sample_rate,
-            nfft=config.nfft,
-            erb_bins=config.erb_bins,
-            min_freq_bins_for_erb=config.min_freq_bins_for_erb,
-        )
         self.stft = ConvSTFT(
             nfft=config.nfft,
             win_size=config.win_size,
@@ -988,6 +982,24 @@ class DfNet2(nn.Module):
             requires_grad=False
         )
         self.encoder = Encoder(config)
         self.erb_decoder = ErbDecoder(config)
@@ -1052,6 +1064,24 @@ class DfNet2(nn.Module):
         feat_spec = feat_spec.detach()
         return spec, feat_erb, feat_spec
     def forward(self,
                 noisy: torch.Tensor,
                 ):
@@ -1067,6 +1097,7 @@ class DfNet2(nn.Module):
         noisy = self.signal_prepare(noisy)
         spec, feat_erb, feat_spec = self.feature_prepare(noisy)
         e0, e1, e2, e3, emb, c0, lsnr, _ = self.encoder.forward(feat_erb, feat_spec)
@@ -1137,6 +1168,7 @@ class DfNet2(nn.Module):
         cache_dict3 = None
         cache_dict4 = None
         cache_dict5 = None
         waveform_list = list()
         for i in range(int(t)):
@@ -1148,6 +1180,7 @@ class DfNet2(nn.Module):
             # spec shape: [b, 1, t, f, 2]
             # feat_erb shape: [b, 1, t, erb_bins]
             # feat_spec shape: [b, 2, t, df_bins]
             e0, e1, e2, e3, emb, c0, lsnr, cache_dict0 = self.encoder.forward(feat_erb, feat_spec, cache_dict=cache_dict0)
@@ -1174,10 +1207,6 @@ class DfNet2(nn.Module):
             spec_f, cache_dict3 = self.df_op.forward_online(spec_, df_coefs, cache_dict=cache_dict3)
             # spec_f shape: [b, 1, t, df_bins, 2], torch.float32
-            spec_e = torch.concat(tensors=[
-                spec_f, spec_m[..., self.df_decoder.df_bins:, :]
-            ], dim=3)
             spec_e, cache_dict4 = self.spec_e_m_combine_online(spec_f, spec_m, cache_dict=cache_dict4)
             spec_e = torch.squeeze(spec_e, dim=1)

 from toolbox.torchaudio.modules.conv_stft import ConvSTFT, ConviSTFT
 from toolbox.torchaudio.modules.local_snr_target import LocalSnrTarget
 from toolbox.torchaudio.modules.freq_bands.erb_bands import ErbBands
+from toolbox.torchaudio.modules.utils.ema import ErbEMA, SpecEMA
 MODEL_FILE = "model.pt"
         self.hop_size = config.hop_size
         self.win_type = config.win_type
         self.stft = ConvSTFT(
             nfft=config.nfft,
             win_size=config.win_size,
             requires_grad=False
         )
+        self.erb_bands = ErbBands(
+            sample_rate=config.sample_rate,
+            nfft=config.nfft,
+            erb_bins=config.erb_bins,
+            min_freq_bins_for_erb=config.min_freq_bins_for_erb,
+        )
+        self.erb_ema = ErbEMA(
+            sample_rate=config.sample_rate,
+            hop_size=config.hop_size,
+            erb_bins=config.erb_bins,
+        )
+        self.spec_ema = SpecEMA(
+            sample_rate=config.sample_rate,
+            hop_size=config.hop_size,
+            df_bins=config.df_bins,
+        )
         self.encoder = Encoder(config)
         self.erb_decoder = ErbDecoder(config)
         feat_spec = feat_spec.detach()
         return spec, feat_erb, feat_spec
+    def feature_norm(self, feat_erb, feat_spec, cache_dict: dict = None):
+        if cache_dict is None:
+            cache_dict = defaultdict(lambda: None)
+        cache0 = cache_dict["cache0"]
+        cache1 = cache_dict["cache1"]
+        feat_erb, new_cache0 = self.erb_ema.norm(feat_erb, state=cache0)
+        feat_spec, new_cache1 = self.spec_ema.norm(feat_spec, state=cache1)
+        new_cache_dict = {
+            "cache0": new_cache0,
+            "cache1": new_cache1,
+        }
+        feat_erb = feat_erb.detach()
+        feat_spec = feat_spec.detach()
+        return feat_erb, feat_spec, new_cache_dict
     def forward(self,
                 noisy: torch.Tensor,
                 ):
         noisy = self.signal_prepare(noisy)
         spec, feat_erb, feat_spec = self.feature_prepare(noisy)
+        feat_erb, feat_spec, _ = self.feature_norm(feat_erb, feat_spec)
         e0, e1, e2, e3, emb, c0, lsnr, _ = self.encoder.forward(feat_erb, feat_spec)
         cache_dict3 = None
         cache_dict4 = None
         cache_dict5 = None
+        cache_dict6 = None
         waveform_list = list()
         for i in range(int(t)):
             # spec shape: [b, 1, t, f, 2]
             # feat_erb shape: [b, 1, t, erb_bins]
             # feat_spec shape: [b, 2, t, df_bins]
+            feat_erb, feat_spec, cache_dict6 = self.feature_norm(feat_erb, feat_spec, cache_dict=cache_dict6)
             e0, e1, e2, e3, emb, c0, lsnr, cache_dict0 = self.encoder.forward(feat_erb, feat_spec, cache_dict=cache_dict0)
             spec_f, cache_dict3 = self.df_op.forward_online(spec_, df_coefs, cache_dict=cache_dict3)
             # spec_f shape: [b, 1, t, df_bins, 2], torch.float32
             spec_e, cache_dict4 = self.spec_e_m_combine_online(spec_f, spec_m, cache_dict=cache_dict4)
             spec_e = torch.squeeze(spec_e, dim=1)

toolbox/torchaudio/modules/utils/ema.py CHANGED Viewed

@@ -3,26 +3,133 @@
 import math
 import numpy as np
 import torch.nn as nn
-def _calculate_norm_alpha(sample_rate: int, hop_size: int, tau: float):
-    """Exponential decay factor alpha for a given tau (decay window size [s])."""
-    dt = hop_size / sample_rate
-    result = math.exp(-dt / tau)
-    return result
-def get_norm_alpha(sample_rate: int, hop_size: int, norm_tau: float) -> float:
-    a_ = _calculate_norm_alpha(sample_rate=sample_rate, hop_size=hop_size, tau=norm_tau)
-    precision = 3
-    a = 1.0
-    while a >= 1.0:
-        a = round(a_, precision)
-        precision += 1
-    return a
 MEAN_NORM_INIT = [-60., -90.]
@@ -90,10 +197,5 @@ def spec_normalize(spec_feat: np.ndarray, alpha: float, state: np.ndarray = None
     return spec_feat
-class ExponentialMovingAverage(nn.Module):
-    def __init__(self):
-        super().__init__()
 if __name__ == "__main__":
     pass

 import math
 import numpy as np
+import torch
 import torch.nn as nn
class EMANumpy(object):
    """Helpers that derive the smoothing factor (alpha) of an exponential moving average."""

    @classmethod
    def _calculate_norm_alpha(cls, sample_rate: int, hop_size: int, tau: float) -> float:
        """
        Exponential decay factor alpha for a given tau (decay window size [s]).

        :param sample_rate: sample rate in Hz; must be positive.
        :param hop_size: hop size in samples; must be positive.
        :param tau: decay window size in seconds; must be positive.
        :return: ``exp(-(hop_size / sample_rate) / tau)``.
        :raises ValueError: if any argument is not strictly positive.
        """
        if sample_rate <= 0 or hop_size <= 0 or tau <= 0:
            raise ValueError(
                "sample_rate, hop_size and tau must be positive, got "
                "sample_rate={}, hop_size={}, tau={}".format(sample_rate, hop_size, tau)
            )
        dt = hop_size / sample_rate
        result = math.exp(-dt / tau)
        return result

    @classmethod
    def get_norm_alpha(cls, sample_rate: int, hop_size: int, norm_tau: float) -> float:
        """
        Return alpha rounded to the fewest decimals (starting at 3) that keep it below 1.0.

        :param sample_rate: sample rate in Hz; must be positive.
        :param hop_size: hop size in samples; must be positive.
        :param norm_tau: decay window size in seconds; must be positive.
        :return: the rounded decay factor, strictly less than 1.0.
        :raises ValueError: if the arguments are not positive or the unrounded
            alpha is not below 1.0 (the rounding loop could never terminate).
        """
        a_ = cls._calculate_norm_alpha(sample_rate=sample_rate, hop_size=hop_size, tau=norm_tau)

        # Also rejects NaN and an alpha that saturated to exactly 1.0 in floating point.
        if not a_ < 1.0:
            raise ValueError(
                "norm alpha must be < 1.0, got {} (sample_rate={}, hop_size={}, norm_tau={})".format(
                    a_, sample_rate, hop_size, norm_tau
                )
            )

        # Rounding may push a value close to 1.0 up to exactly 1.0, which would
        # disable the moving average; add decimals until the result stays below 1.0.
        precision = 3
        a = 1.0
        while a >= 1.0:
            a = round(a_, precision)
            precision += 1
        return a
class ErbEMA(nn.Module, EMANumpy):
    """
    Exponential-moving-average mean normalization for ERB features.

    For each frame the running mean is updated as
    ``state = frame * (1 - alpha) + state * alpha`` and the frame is replaced by
    ``(frame - state) / 40``.
    """

    def __init__(self,
                 sample_rate: int = 8000,
                 hop_size: int = 80,
                 erb_bins: int = 32,
                 mean_norm_init_start: float = -60.,
                 mean_norm_init_end: float = -90.,
                 norm_tau: float = 1.,
                 ):
        """
        :param sample_rate: sample rate used to derive alpha.
        :param hop_size: hop size (samples) used to derive alpha.
        :param erb_bins: number of ERB bins, i.e. the length of the state vector.
        :param mean_norm_init_start: initial running mean of the first bin.
        :param mean_norm_init_end: initial running mean of the last bin.
        :param norm_tau: decay window size used to derive alpha.
        """
        super().__init__()
        self.sample_rate = sample_rate
        self.hop_size = hop_size
        self.erb_bins = erb_bins
        self.mean_norm_init_start = mean_norm_init_start
        self.mean_norm_init_end = mean_norm_init_end
        self.norm_tau = norm_tau

        self.alpha = self.get_norm_alpha(sample_rate, hop_size, norm_tau)

    def make_erb_norm_state(self) -> torch.Tensor:
        """Build the initial running mean, linearly spaced over the ERB bins."""
        state = torch.linspace(start=self.mean_norm_init_start, end=self.mean_norm_init_end,
                               steps=self.erb_bins)
        state = state.unsqueeze(0).unsqueeze(0)
        # state shape: [1, 1, erb_bins]; broadcasts against [b, c, erb_bins]
        return state

    def norm(self,
             feat_erb: torch.Tensor,
             state: torch.Tensor = None,
             ):
        """
        Mean-normalize ``feat_erb`` frame by frame along the time axis.

        :param feat_erb: tensor of shape [b, c, t, f]; it is not modified (a clone is used).
        :param state: running mean from a previous call, or None to start from
            the initial state.
        :return: tuple ``(normalized_feat_erb, new_state)``.
        """
        feat_erb = feat_erb.clone()
        # feat_erb shape: [b, c, t, f]; the unpacking also enforces a 4-D input.
        _, _, t, _ = feat_erb.shape

        if state is None:
            # The fresh state is created on the default device; move it next to
            # the input so the update below does not fail on a device mismatch.
            state = self.make_erb_norm_state().to(feat_erb.device)
        # Clone so the caller's state tensor is never aliased by the returned one.
        state = state.clone()

        for j in range(t):
            current = feat_erb[:, :, j, :]
            new_state = current * (1 - self.alpha) + state * self.alpha
            feat_erb[:, :, j, :] = (current - new_state) / 40.0
            state = new_state

        return feat_erb, state
class SpecEMA(nn.Module, EMANumpy):
    """
    Exponential-moving-average unit normalization for spectral features.

    https://github.com/grazder/DeepFilterNet/blob/torchDF_main/libDF/src/lib.rs
    """

    def __init__(self,
                 sample_rate: int = 8000,
                 hop_size: int = 80,
                 df_bins: int = 96,
                 unit_norm_init_start: float = 0.001,
                 unit_norm_init_end: float = 0.0001,
                 norm_tau: float = 1.,
                 ):
        """
        :param sample_rate: sample rate used to derive alpha.
        :param hop_size: hop size (samples) used to derive alpha.
        :param df_bins: number of frequency bins, i.e. the length of the state vector.
        :param unit_norm_init_start: initial running value of the first bin.
        :param unit_norm_init_end: initial running value of the last bin.
        :param norm_tau: decay window size used to derive alpha.
        """
        super().__init__()
        self.sample_rate = sample_rate
        self.hop_size = hop_size
        self.df_bins = df_bins
        self.unit_norm_init_start = unit_norm_init_start
        self.unit_norm_init_end = unit_norm_init_end
        self.norm_tau = norm_tau

        self.alpha = self.get_norm_alpha(sample_rate, hop_size, norm_tau)

    def make_spec_norm_state(self) -> torch.Tensor:
        """Build the initial running state, linearly spaced over the frequency bins."""
        state = torch.linspace(start=self.unit_norm_init_start, end=self.unit_norm_init_end,
                               steps=self.df_bins)
        state = state.unsqueeze(0).unsqueeze(0)
        # state shape: [1, 1, df_bins]; broadcasts against [b, 1, df_bins]
        return state

    def norm(self,
             feat_spec: torch.Tensor,
             state: torch.Tensor = None,
             ):
        """
        Unit-normalize ``feat_spec`` frame by frame along the time axis.

        :param feat_spec: tensor of shape [b, 2, t, df_bins]; it is not modified
            (a clone is used).
        :param state: running state from a previous call, or None to start from
            the initial state.
        :return: tuple ``(normalized_feat_spec, new_state)``.
        """
        feat_spec = feat_spec.clone()
        # feat_spec shape: [b, 2, t, df_bins]; the unpacking also enforces a 4-D input.
        _, _, t, _ = feat_spec.shape

        if state is None:
            # The fresh state is created on the default device; move it next to
            # the input so the update below does not fail on a device mismatch.
            state = self.make_spec_norm_state().to(feat_spec.device)
        # Clone so the caller's state tensor is never aliased by the returned one.
        state = state.clone()

        for j in range(t):
            current = feat_spec[:, :, j, :]
            # NOTE(review): despite the name, this is the squared magnitude summed
            # over the channel axis (no square root is taken); the reference
            # implementation linked in the class docstring may use the plain
            # magnitude - confirm which one is intended.
            current_abs = torch.sum(torch.square(current), dim=1, keepdim=True)
            # current_abs shape: [b, 1, df_bins]
            new_state = current_abs * (1 - self.alpha) + state * self.alpha
            feat_spec[:, :, j, :] = current / torch.sqrt(new_state)
            state = new_state

        return feat_spec, state
 MEAN_NORM_INIT = [-60., -90.]
     return spec_feat
 if __name__ == "__main__":
     pass