Spaces:

qgyd2021
/

cc_denoise

Running

App Files Files Community

HoneyTian committed on May 15

Commit

35a4689

1 Parent(s): f1a5461

update

Browse files

Files changed (26) hide show

examples/lstm/step_2_train_model.py +67 -101
examples/lstm/yaml/config.yaml +32 -0
examples/rnnoise/run.sh +172 -0
examples/rnnoise/step_1_prepare_data.py +197 -0
examples/rnnoise/step_2_train_model.py +442 -0
examples/rnnoise/yaml/config.yaml +31 -0
examples/test.py +39 -0
toolbox/{torchaudio/models/dfnet3 → torch/sparsification}/__init__.py +1 -1
toolbox/torch/sparsification/common.py +131 -0
toolbox/torch/sparsification/gru_sparsifier.py +190 -0
toolbox/torchaudio/models/dfnet/modeling_dfnet.py +31 -16
toolbox/torchaudio/models/dfnet/modeling_dfnet_online.py +226 -0
toolbox/torchaudio/models/dfnet3/configuration_dfnet3.py +0 -89
toolbox/torchaudio/models/dfnet3/features.py +0 -192
toolbox/torchaudio/models/dfnet3/modeling_dfnet3.py +0 -835
toolbox/torchaudio/models/dfnet3/multiframes.py +0 -145
toolbox/torchaudio/models/dfnet3/utils.py +0 -17
toolbox/torchaudio/models/dtln/modeling_dtln.py +4 -0
toolbox/torchaudio/models/frcrn/modeling_frcrn.py +2 -0
toolbox/torchaudio/models/gtcrn/__init__.py +6 -0
toolbox/torchaudio/models/gtcrn/modeling_gtcrn.py +15 -0
toolbox/torchaudio/models/lstm/modeling_lstm.py +4 -3
toolbox/torchaudio/models/rnnoise/configuration_rnnoise.py +77 -0
toolbox/torchaudio/models/rnnoise/modeling_rnnoise.py +393 -2
toolbox/torchaudio/models/rnnoise/yaml/config.yaml +34 -0
toolbox/torchaudio/modules/freq_bands/erb_bands.py +1 -0

examples/lstm/step_2_train_model.py CHANGED Viewed

@@ -26,6 +26,8 @@ import torchaudio
 from tqdm import tqdm
 from toolbox.torch.utils.data.dataset.denoise_jsonl_dataset import DenoiseJsonlDataset
 from toolbox.torchaudio.metrics.pesq import run_pesq_score
 from toolbox.torchaudio.models.lstm.configuration_lstm import LstmConfig
 from toolbox.torchaudio.models.lstm.modeling_lstm import LstmPretrainedModel
@@ -72,95 +74,32 @@ def logging_config(file_dir: str):
 class CollateFunction(object):
-    def __init__(self,
-                 n_fft: int = 512,
-                 win_length: int = 200,
-                 hop_length: int = 80,
-                 window_fn: str = "hamming",
-                 irm_beta: float = 1.0,
-                 epsilon: float = 1e-8,
-                 ):
-        self.n_fft = n_fft
-        self.win_length = win_length
-        self.hop_length = hop_length
-        self.window_fn = window_fn
-        self.irm_beta = irm_beta
-        self.epsilon = epsilon
-        self.stft_mag = torchaudio.transforms.Spectrogram(
-            n_fft=self.n_fft,
-            win_length=self.win_length,
-            hop_length=self.hop_length,
-            power=1.0,
-            window_fn=torch.hamming_window if window_fn == "hamming" else torch.hann_window,
-        )
-        self.stft_complex = torchaudio.transforms.Spectrogram(
-            n_fft=self.n_fft,
-            win_length=self.win_length,
-            hop_length=self.hop_length,
-            power=None,
-            window_fn=torch.hamming_window if window_fn == "hamming" else torch.hann_window,
-        )
-        self.istft = torchaudio.transforms.InverseSpectrogram(
-            n_fft=self.n_fft,
-            win_length=self.win_length,
-            hop_length=self.hop_length,
-            window_fn=torch.hamming_window if window_fn == "hamming" else torch.hann_window,
-        )
     def __call__(self, batch: List[dict]):
-        mag_noisy_audios = list()
-        pha_noisy_audios = list()
-        irm_gth = list()
         clean_audios = list()
         for sample in batch:
-            noise_audio: torch.Tensor = sample["noise_wave"]
             clean_audio: torch.Tensor = sample["speech_wave"]
             noisy_audio: torch.Tensor = sample["mix_wave"]
-            snr_db: float = sample["snr_db"]
-            mag_noise = self.stft_mag.forward(noise_audio)
-            mag_clean = self.stft_mag.forward(clean_audio)
-            stft_noisy = self.stft_complex.forward(noisy_audio)
-            irm_clean = mag_clean / (mag_noise + mag_clean + self.epsilon)
-            irm_clean = torch.pow(irm_clean, self.irm_beta)
-            real = torch.real(stft_noisy)
-            imag = torch.imag(stft_noisy)
-            mag_noisy = torch.sqrt(real ** 2 + imag ** 2)
-            pha_noisy = torch.atan2(imag, real)
-            mag_noisy_audios.append(mag_noisy)
-            pha_noisy_audios.append(pha_noisy)
-            irm_gth.append(irm_clean)
             clean_audios.append(clean_audio)
-        mag_noisy_audios = torch.stack(mag_noisy_audios)
-        pha_noisy_audios = torch.stack(pha_noisy_audios)
-        irm_gth = torch.stack(irm_gth)
         clean_audios = torch.stack(clean_audios)
         # assert
-        if torch.any(torch.isnan(mag_noisy_audios)):
-            raise AssertionError("nan in mag_noisy_audios Tensor")
-        if torch.any(torch.isnan(pha_noisy_audios)):
-            raise AssertionError("nan in pha_noisy_audios Tensor")
-        if torch.any(torch.isnan(irm_gth)):
-            raise AssertionError("nan in irm_gth Tensor")
-        if torch.any(torch.isnan(clean_audios)):
-            raise AssertionError("nan in clean_audios Tensor")
-        return mag_noisy_audios, pha_noisy_audios, irm_gth, clean_audios
-    def enhance(self, mag_noisy: torch.Tensor, pha_noisy: torch.Tensor, irm_speech: torch.Tensor):
-        mag_denoise = mag_noisy * irm_speech
-        stft_denoise = mag_denoise * torch.exp((1j * pha_noisy))
-        denoise = self.istft.forward(stft_denoise)
-        return denoise
 collate_fn = CollateFunction()
@@ -282,8 +221,14 @@ def main():
     else:
         raise AssertionError(f"invalid lr_scheduler: {config.lr_scheduler}")
-    mse_loss_fn = nn.MSELoss(
-        reduction="mean",
     ).to(device)
     # training loop
@@ -291,6 +236,8 @@ def main():
     average_pesq_score = 1000000000
     average_loss = 1000000000
     model_list = list()
     best_epoch_idx = None
@@ -311,6 +258,8 @@ def main():
         total_pesq_score = 0.
         total_loss = 0.
         total_batches = 0.
         progress_bar_train = tqdm(
@@ -318,15 +267,19 @@ def main():
             desc="Training; epoch: {}".format(epoch_idx),
         )
         for train_batch in train_data_loader:
-            mag_noisy_audios, pha_noisy_audios, irm_gth, clean_audios = train_batch
-            mag_noisy_audios = mag_noisy_audios.to(device)
-            pha_noisy_audios = pha_noisy_audios.to(device)
-            irm_gth = irm_gth.to(device)
-            clean_audios = clean_audios.to(device)
-            irm = model.forward(mag_noisy_audios)
-            denoise_audios = collate_fn.enhance(mag_noisy_audios, pha_noisy_audios, irm)
-            loss = mse_loss_fn.forward(irm, irm_gth)
             denoise_audios_list_r = list(denoise_audios.detach().cpu().numpy())
             clean_audios_list_r = list(clean_audios.detach().cpu().numpy())
@@ -340,16 +293,22 @@ def main():
             total_pesq_score += pesq_score
             total_loss += loss.item()
             total_batches += 1
             average_pesq_score = round(total_pesq_score / total_batches, 4)
             average_loss = round(total_loss / total_batches, 4)
             progress_bar_train.update(1)
             progress_bar_train.set_postfix({
                 "lr": lr_scheduler.get_last_lr()[0],
                 "pesq_score": average_pesq_score,
                 "loss": average_loss,
             })
             # evaluation
@@ -360,6 +319,8 @@ def main():
                     total_pesq_score = 0.
                     total_loss = 0.
                     total_batches = 0.
                     progress_bar_train.close()
@@ -368,43 +329,48 @@ def main():
                     )
                     for eval_batch in valid_data_loader:
-                        mag_noisy_audios, pha_noisy_audios, irm_gth, clean_audios = eval_batch
-                        mag_noisy_audios = mag_noisy_audios.to(device)
-                        pha_noisy_audios = pha_noisy_audios.to(device)
-                        irm_gth = irm_gth.to(device)
-                        clean_audios = clean_audios.to(device)
-                        with torch.no_grad():
-                            irm = model.forward(mag_noisy_audios)
-                            denoise_audios = collate_fn.enhance(mag_noisy_audios, pha_noisy_audios, irm)
-                            loss = mse_loss_fn.forward(irm, irm_gth)
                         denoise_audios_list_r = list(denoise_audios.detach().cpu().numpy())
                         clean_audios_list_r = list(clean_audios.detach().cpu().numpy())
                         pesq_score = run_pesq_score(clean_audios_list_r, denoise_audios_list_r, sample_rate=config.sample_rate, mode="nb")
-                        optimizer.zero_grad()
-                        loss.backward()
-                        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.clip_grad_norm)
-                        optimizer.step()
-                        lr_scheduler.step()
                         total_pesq_score += pesq_score
                         total_loss += loss.item()
                         total_batches += 1
                         average_pesq_score = round(total_pesq_score / total_batches, 4)
                         average_loss = round(total_loss / total_batches, 4)
                         progress_bar_eval.update(1)
                         progress_bar_eval.set_postfix({
                             "lr": lr_scheduler.get_last_lr()[0],
                             "pesq_score": average_pesq_score,
                             "loss": average_loss,
                         })
                     total_pesq_score = 0.
                     total_loss = 0.
                     total_batches = 0.
                     progress_bar_eval.close()

 from tqdm import tqdm
 from toolbox.torch.utils.data.dataset.denoise_jsonl_dataset import DenoiseJsonlDataset
+from toolbox.torchaudio.losses.snr import NegativeSISNRLoss
+from toolbox.torchaudio.losses.spectral import LSDLoss, MultiResolutionSTFTLoss
 from toolbox.torchaudio.metrics.pesq import run_pesq_score
 from toolbox.torchaudio.models.lstm.configuration_lstm import LstmConfig
 from toolbox.torchaudio.models.lstm.modeling_lstm import LstmPretrainedModel
 class CollateFunction(object):
+    def __init__(self):
+        pass
     def __call__(self, batch: List[dict]):
         clean_audios = list()
+        noisy_audios = list()
+        snr_db_list = list()
         for sample in batch:
+            # noise_wave: torch.Tensor = sample["noise_wave"]
             clean_audio: torch.Tensor = sample["speech_wave"]
             noisy_audio: torch.Tensor = sample["mix_wave"]
+            # snr_db: float = sample["snr_db"]
             clean_audios.append(clean_audio)
+            noisy_audios.append(noisy_audio)
         clean_audios = torch.stack(clean_audios)
+        noisy_audios = torch.stack(noisy_audios)
         # assert
+        if torch.any(torch.isnan(clean_audios)) or torch.any(torch.isinf(clean_audios)):
+            raise AssertionError("nan or inf in clean_audios")
+        if torch.any(torch.isnan(noisy_audios)) or torch.any(torch.isinf(noisy_audios)):
+            raise AssertionError("nan or inf in noisy_audios")
+        return clean_audios, noisy_audios
 collate_fn = CollateFunction()
     else:
         raise AssertionError(f"invalid lr_scheduler: {config.lr_scheduler}")
+    neg_si_snr_loss_fn = NegativeSISNRLoss(reduction="mean").to(device)
+    mr_stft_loss_fn = MultiResolutionSTFTLoss(
+        fft_size_list=[256, 512, 1024],
+        win_size_list=[256, 512, 1024],
+        hop_size_list=[128, 256, 512],
+        factor_sc=1.5,
+        factor_mag=1.0,
+        reduction="mean"
     ).to(device)
     # training loop
     average_pesq_score = 1000000000
     average_loss = 1000000000
+    average_mr_stft_loss = 1000000000
+    average_neg_si_snr_loss = 1000000000
     model_list = list()
     best_epoch_idx = None
         total_pesq_score = 0.
         total_loss = 0.
+        total_mr_stft_loss = 0.
+        total_neg_si_snr_loss = 0.
         total_batches = 0.
         progress_bar_train = tqdm(
             desc="Training; epoch: {}".format(epoch_idx),
         )
         for train_batch in train_data_loader:
+            clean_audios, noisy_audios = train_batch
+            clean_audios: torch.Tensor = clean_audios.to(device)
+            noisy_audios: torch.Tensor = noisy_audios.to(device)
+            denoise_audios, _, _ = model.forward(noisy_audios)
+            mr_stft_loss = mr_stft_loss_fn.forward(denoise_audios, clean_audios)
+            neg_si_snr_loss = neg_si_snr_loss_fn.forward(denoise_audios, clean_audios)
+            loss = 1.0 * mr_stft_loss + 1.0 * neg_si_snr_loss
+            if torch.any(torch.isnan(loss)) or torch.any(torch.isinf(loss)):
+                logger.info(f"find nan or inf in loss.")
+                continue
             denoise_audios_list_r = list(denoise_audios.detach().cpu().numpy())
             clean_audios_list_r = list(clean_audios.detach().cpu().numpy())
             total_pesq_score += pesq_score
             total_loss += loss.item()
+            total_mr_stft_loss += mr_stft_loss.item()
+            total_neg_si_snr_loss += neg_si_snr_loss.item()
             total_batches += 1
             average_pesq_score = round(total_pesq_score / total_batches, 4)
             average_loss = round(total_loss / total_batches, 4)
+            average_mr_stft_loss = round(total_mr_stft_loss / total_batches, 4)
+            average_neg_si_snr_loss = round(total_neg_si_snr_loss / total_batches, 4)
             progress_bar_train.update(1)
             progress_bar_train.set_postfix({
                 "lr": lr_scheduler.get_last_lr()[0],
                 "pesq_score": average_pesq_score,
                 "loss": average_loss,
+                "mr_stft_loss": average_mr_stft_loss,
+                "neg_si_snr_loss": average_neg_si_snr_loss,
             })
             # evaluation
                     total_pesq_score = 0.
                     total_loss = 0.
+                    total_mr_stft_loss = 0.
+                    total_neg_si_snr_loss = 0.
                     total_batches = 0.
                     progress_bar_train.close()
                     )
                     for eval_batch in valid_data_loader:
+                        clean_audios, noisy_audios = eval_batch
+                        clean_audios: torch.Tensor = clean_audios.to(device)
+                        noisy_audios: torch.Tensor = noisy_audios.to(device)
+                        denoise_audios, _, _ = model.forward(noisy_audios)
+                        mr_stft_loss = mr_stft_loss_fn.forward(denoise_audios, clean_audios)
+                        neg_si_snr_loss = neg_si_snr_loss_fn.forward(denoise_audios, clean_audios)
+                        loss = 1.0 * mr_stft_loss + 1.0 * neg_si_snr_loss
+                        if torch.any(torch.isnan(loss)) or torch.any(torch.isinf(loss)):
+                            logger.info(f"find nan or inf in loss.")
+                            continue
                         denoise_audios_list_r = list(denoise_audios.detach().cpu().numpy())
                         clean_audios_list_r = list(clean_audios.detach().cpu().numpy())
                         pesq_score = run_pesq_score(clean_audios_list_r, denoise_audios_list_r, sample_rate=config.sample_rate, mode="nb")
                         total_pesq_score += pesq_score
                         total_loss += loss.item()
+                        total_mr_stft_loss += mr_stft_loss.item()
+                        total_neg_si_snr_loss += neg_si_snr_loss.item()
                         total_batches += 1
                         average_pesq_score = round(total_pesq_score / total_batches, 4)
                         average_loss = round(total_loss / total_batches, 4)
+                        average_mr_stft_loss = round(total_mr_stft_loss / total_batches, 4)
+                        average_neg_si_snr_loss = round(total_neg_si_snr_loss / total_batches, 4)
                         progress_bar_eval.update(1)
                         progress_bar_eval.set_postfix({
                             "lr": lr_scheduler.get_last_lr()[0],
                             "pesq_score": average_pesq_score,
                             "loss": average_loss,
+                            "mr_stft_loss": average_mr_stft_loss,
+                            "neg_si_snr_loss": average_neg_si_snr_loss,
                         })
                     total_pesq_score = 0.
                     total_loss = 0.
+                    total_mr_stft_loss = 0.
+                    total_neg_si_snr_loss = 0.
                     total_batches = 0.
                     progress_bar_eval.close()

examples/lstm/yaml/config.yaml ADDED Viewed

	@@ -0,0 +1,32 @@

+model_name: "lstm"
+# spec
+sample_rate: 8000
+segment_size: 32000
+n_fft: 320
+win_size: 320
+hop_size: 160
+win_type: hann
+# data
+max_snr_db: 20
+min_snr_db: -10
+# model
+hidden_size: 512
+num_layers: 3
+dropout: 0.1
+# train
+max_epochs: 100
+batch_size: 32
+num_workers: 4
+seed: 1234
+lr: 0.001
+lr_scheduler: CosineAnnealingLR
+lr_scheduler_kwargs: {}
+weight_decay: 0.00001
+clip_grad_norm: 10.0
+eval_steps: 25000

examples/rnnoise/run.sh ADDED Viewed

	@@ -0,0 +1,172 @@

+#!/usr/bin/env bash
+: <<'END'
+sh run.sh --stage 2 --stop_stage 2 --system_version windows --file_folder_name file_dir
+sh run.sh --stage 3 --stop_stage 3 --system_version windows --file_folder_name file_dir
+sh run.sh --stage 1 --stop_stage 3 --system_version centos --file_folder_name file_dir \
+--noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise" \
+--speech_dir "/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train"
+END
+# params
+system_version="windows";
+verbose=true;
+stage=0 # start from 0 if you need to start from data preparation
+stop_stage=9
+work_dir="$(pwd)"
+file_folder_name=file_folder_name
+final_model_name=final_model_name
+config_file="yaml/config.yaml"
+limit=10
+noise_dir=/data/tianxing/HuggingDatasets/nx_noise/data/noise
+speech_dir=/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train
+nohup_name=nohup.out
+# model params
+batch_size=64
+max_epochs=200
+save_top_k=10
+patience=5
+# parse options
# Every "--name value" pair overrides the already declared variable "name";
# dashes inside the option name are mapped to underscores.
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
  case "$1" in
    --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
      # reject options that do not match a declared variable.
      eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
      # read the current value of the variable; the "$(" is required, without it
      # old_value is a literal string and the Boolean check below never fires.
      old_value="$(eval echo \$$name)";
      if [ "${old_value}" = "true" ] || [ "${old_value}" = "false" ]; then
        was_bool=true;
      else
        was_bool=false;
      fi

      # Set the variable to the right value-- the escaped quotes make it work if
      # the option had spaces, like --cmd "queue.pl -sync y"
      eval "${name}=\"$2\"";

      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      shift 2;
      ;;

    *) break;
  esac
done
+file_dir="${work_dir}/${file_folder_name}"
+final_model_dir="${work_dir}/../../trained_models/${final_model_name}";
+evaluation_audio_dir="${file_dir}/evaluation_audio"
+dataset="${file_dir}/dataset.xlsx"
+train_dataset="${file_dir}/train.xlsx"
+valid_dataset="${file_dir}/valid.xlsx"
+$verbose && echo "system_version: ${system_version}"
+$verbose && echo "file_folder_name: ${file_folder_name}"
# Route every later "python3 ..." call to the interpreter of the selected platform.
# Shell functions are used instead of aliases: bash does not expand aliases in a
# non-interactive script unless expand_aliases is set, so the alias only took
# effect when the script was started through "sh".
if [ "${system_version}" = "windows" ]; then
  python3() { 'D:/Users/tianx/PycharmProjects/virtualenv/nx_denoise/Scripts/python.exe' "$@"; }
elif [ "${system_version}" = "centos" ] || [ "${system_version}" = "ubuntu" ]; then
  #source /data/local/bin/nx_denoise/bin/activate
  python3() { '/data/local/bin/nx_denoise/bin/python3' "$@"; }
fi
# Each stage runs only when stage <= N <= stop_stage.
# The last argument line of a command must not end with a backslash: a trailing
# continuation glues the following "fi" onto the command line.

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  $verbose && echo "stage 1: prepare data"
  cd "${work_dir}" || exit 1
  python3 step_1_prepare_data.py \
  --file_dir "${file_dir}" \
  --noise_dir "${noise_dir}" \
  --speech_dir "${speech_dir}" \
  --train_dataset "${train_dataset}" \
  --valid_dataset "${valid_dataset}"
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  $verbose && echo "stage 2: train model"
  cd "${work_dir}" || exit 1
  python3 step_2_train_model.py \
  --train_dataset "${train_dataset}" \
  --valid_dataset "${valid_dataset}" \
  --serialization_dir "${file_dir}" \
  --config_file "${config_file}"
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
  $verbose && echo "stage 3: test model"
  cd "${work_dir}" || exit 1
  python3 step_3_evaluation.py \
  --valid_dataset "${valid_dataset}" \
  --model_dir "${file_dir}/best" \
  --evaluation_audio_dir "${evaluation_audio_dir}" \
  --limit "${limit}"
fi

if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
  $verbose && echo "stage 4: export model"
  cd "${work_dir}" || exit 1
  # NOTE(review): vocabulary_dir is never assigned in this script, so an empty
  # string is passed here; confirm whether the export step still needs it.
  python3 step_5_export_models.py \
  --vocabulary_dir "${vocabulary_dir}" \
  --model_dir "${file_dir}/best" \
  --serialization_dir "${file_dir}"
fi

if [ "${stage}" -le 5 ] && [ "${stop_stage}" -ge 5 ]; then
  $verbose && echo "stage 5: collect files"
  cd "${work_dir}" || exit 1

  # quoted so that a work_dir containing spaces does not split the path.
  mkdir -p "${final_model_dir}"

  cp "${file_dir}/best"/* "${final_model_dir}"
  cp -r "${file_dir}/vocabulary" "${final_model_dir}"

  cp "${file_dir}/evaluation.xlsx" "${final_model_dir}/evaluation.xlsx"

  cp "${file_dir}/trace_model.zip" "${final_model_dir}/trace_model.zip"
  cp "${file_dir}/trace_quant_model.zip" "${final_model_dir}/trace_quant_model.zip"
  cp "${file_dir}/script_model.zip" "${final_model_dir}/script_model.zip"
  cp "${file_dir}/script_quant_model.zip" "${final_model_dir}/script_quant_model.zip"

  cd "${final_model_dir}/.." || exit 1;

  # keep exactly one backup of the previous archive.
  if [ -e "${final_model_name}.zip" ]; then
    rm -rf "${final_model_name}_backup.zip"
    mv "${final_model_name}.zip" "${final_model_name}_backup.zip"
  fi

  zip -r "${final_model_name}.zip" "${final_model_name}"
  rm -rf "${final_model_name}"
fi

if [ "${stage}" -le 6 ] && [ "${stop_stage}" -ge 6 ]; then
  $verbose && echo "stage 6: clear file_dir"
  cd "${work_dir}" || exit 1

  rm -rf "${file_dir}";
fi

examples/rnnoise/step_1_prepare_data.py ADDED Viewed

	@@ -0,0 +1,197 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import argparse
+import os
+from pathlib import Path
+import random
+import sys
+import shutil
+pwd = os.path.abspath(os.path.dirname(__file__))
+sys.path.append(os.path.join(pwd, "../../"))
+import pandas as pd
+from scipy.io import wavfile
+from tqdm import tqdm
+import librosa
+from project_settings import project_path
def get_args():
    """
    Parse the command line options of the data preparation step.

    :return: argparse.Namespace with the attributes file_dir, noise_dir, speech_dir,
        train_dataset, valid_dataset, duration, min_nsr_db, max_nsr_db and
        target_sample_rate.
    """
    # (flag, default, type) for every supported option.
    option_table = [
        ("--file_dir", "./", str),
        ("--noise_dir", r"E:\Users\tianx\HuggingDatasets\nx_noise\data\noise", str),
        ("--speech_dir", r"E:\programmer\asr_datasets\aishell\data_aishell\wav\train", str),
        ("--train_dataset", "train.xlsx", str),
        ("--valid_dataset", "valid.xlsx", str),
        ("--duration", 2.0, float),
        ("--min_nsr_db", -20, float),
        ("--max_nsr_db", 5, float),
        ("--target_sample_rate", 8000, int),
    ]

    arg_parser = argparse.ArgumentParser()
    for flag, default_value, value_type in option_table:
        arg_parser.add_argument(flag, default=default_value, type=value_type)
    return arg_parser.parse_args()
def filename_generator(data_dir: str):
    """
    Walk data_dir recursively and yield every ``*.wav`` path as a posix string.

    :param data_dir: root folder of the search.
    """
    root = Path(data_dir)
    yield from (wav_path.as_posix() for wav_path in root.glob("**/*.wav"))
def target_second_signal_generator(data_dir: str, duration: float = 2, sample_rate: int = 8000):
    """
    Yield one row per non-overlapping ``duration``-second window of every wav file under data_dir.

    Files are loaded with ``librosa.load`` at ``sample_rate``; files shorter than
    ``duration`` seconds are skipped.

    :param data_dir: root folder, searched recursively for ``*.wav``.
    :param duration: window length in seconds.
    :param sample_rate: sample rate passed to ``librosa.load``.
    :return: generator of dicts with the keys ``filename`` (posix path),
        ``raw_duration`` (length of the whole file in seconds),
        ``offset`` (window start in seconds) and ``duration``.
    :raises AssertionError: if a loaded signal is not one-dimensional.
    """
    data_dir = Path(data_dir)
    # number of samples in one window; it does not depend on the file.
    win_size = int(duration * sample_rate)

    for filename in data_dir.glob("**/*.wav"):
        signal, _ = librosa.load(filename.as_posix(), sr=sample_rate)
        raw_duration = librosa.get_duration(y=signal, sr=sample_rate)

        if raw_duration < duration:
            # too short to hold a single window.
            continue
        if signal.ndim != 1:
            raise AssertionError(f"expected ndim 1, instead of {signal.ndim}")

        signal_length = len(signal)

        # "+ 1" keeps the last full window when the signal length is an exact
        # multiple of win_size (a stop of signal_length - win_size drops it).
        for begin in range(0, signal_length - win_size + 1, win_size):
            yield {
                "filename": filename.as_posix(),
                "raw_duration": round(raw_duration, 4),
                "offset": round(begin / sample_rate, 4),
                "duration": round(duration, 4),
            }
def get_dataset(args):
    """
    Pair noise windows with speech windows and save the pairs to ``<file_dir>/dataset.xlsx``.

    The noise and the speech windows are consumed in lockstep (``zip``), so the
    number of rows is bounded by the shorter of the two streams. Every row gets
    a random ``snr_db`` drawn from ``[args.min_nsr_db, args.max_nsr_db]``, a
    ``random1`` key used to shuffle the sheet and a ``flag`` column that is
    ``TRAIN`` when ``random2 < 0.8`` and ``TEST`` otherwise.

    :param args: namespace with file_dir, noise_dir, speech_dir, duration,
        target_sample_rate, min_nsr_db and max_nsr_db.
    :raises ValueError: if not a single noise / speech pair could be built.
    """
    file_dir = Path(args.file_dir)
    # parents=True: file_dir may be nested below folders that do not exist yet.
    file_dir.mkdir(parents=True, exist_ok=True)

    noise_generator = target_second_signal_generator(
        Path(args.noise_dir).as_posix(),
        duration=args.duration,
        sample_rate=args.target_sample_rate
    )
    speech_generator = target_second_signal_generator(
        Path(args.speech_dir).as_posix(),
        duration=args.duration,
        sample_rate=args.target_sample_rate
    )

    dataset = list()

    # the context manager closes the progress bar, also when a generator raises.
    with tqdm(desc="build dataset excel") as process_bar:
        for count, (noise, speech) in enumerate(zip(noise_generator, speech_generator), start=1):
            # draw order (random1, random2, snr_db) is kept so that a seeded
            # run produces the same sheet as before.
            random1 = random.random()
            random2 = random.random()

            row = {
                "noise_filename": noise["filename"],
                "noise_raw_duration": noise["raw_duration"],
                "noise_offset": noise["offset"],
                "noise_duration": noise["duration"],

                "speech_filename": speech["filename"],
                "speech_raw_duration": speech["raw_duration"],
                "speech_offset": speech["offset"],
                "speech_duration": speech["duration"],

                "snr_db": random.uniform(args.min_nsr_db, args.max_nsr_db),

                "random1": random1,
                "random2": random2,
                "flag": "TRAIN" if random2 < 0.8 else "TEST",
            }
            dataset.append(row)

            duration_hours = count * args.duration / 3600

            process_bar.update(n=1)
            process_bar.set_postfix({
                "duration_hours": round(duration_hours, 4),
            })

    if not dataset:
        # without this guard sort_values fails with an opaque KeyError: 'random1'.
        raise ValueError(
            f"no noise / speech window pair found; "
            f"noise_dir: {args.noise_dir}, speech_dir: {args.speech_dir}"
        )

    dataset = pd.DataFrame(dataset)
    # sorting by the random key shuffles the rows.
    dataset = dataset.sort_values(by=["random1"], ascending=False)
    dataset.to_excel(
        file_dir / "dataset.xlsx",
        index=False,
    )
    return
def split_dataset(args):
    """
    Split ``<file_dir>/dataset.xlsx`` into a train sheet and a valid sheet.

    Rows whose ``flag`` column equals ``TRAIN`` are written to
    ``args.train_dataset``; all other rows are written to ``args.valid_dataset``.

    :param args: namespace with file_dir, train_dataset and valid_dataset.
    """
    file_dir = Path(args.file_dir)
    file_dir.mkdir(parents=True, exist_ok=True)

    df = pd.read_excel(file_dir / "dataset.xlsx")

    # a boolean mask replaces the per-row iterrows loop and keeps the column
    # header in the output even when one of the two splits is empty.
    is_train = df["flag"] == "TRAIN"

    df[is_train].to_excel(
        args.train_dataset,
        index=False,
    )
    df[~is_train].to_excel(
        args.valid_dataset,
        index=False,
    )
    return
def main():
    """Entry point: parse the options, build dataset.xlsx, then split it into train / valid sheets."""
    cli_args = get_args()
    # the split step reads the sheet that the first step writes, so order matters.
    for step in (get_dataset, split_dataset):
        step(cli_args)
+if __name__ == "__main__":
+    main()

examples/rnnoise/step_2_train_model.py ADDED Viewed

	@@ -0,0 +1,442 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+"""
+https://github.com/WenzheLiu-Speech/awesome-speech-enhancement
+"""
+import argparse
+import json
+import logging
+from logging.handlers import TimedRotatingFileHandler
+import os
+import platform
+from pathlib import Path
+import random
+import sys
+import shutil
+from typing import List
+pwd = os.path.abspath(os.path.dirname(__file__))
+sys.path.append(os.path.join(pwd, "../../"))
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.utils.data.dataloader import DataLoader
+import torchaudio
+from tqdm import tqdm
+from toolbox.torch.utils.data.dataset.denoise_jsonl_dataset import DenoiseJsonlDataset
+from toolbox.torchaudio.losses.snr import NegativeSISNRLoss
+from toolbox.torchaudio.losses.spectral import LSDLoss, MultiResolutionSTFTLoss
+from toolbox.torchaudio.metrics.pesq import run_pesq_score
+from toolbox.torchaudio.models.rnnoise.configuration_rnnoise import RNNoiseConfig
+from toolbox.torchaudio.models.rnnoise.modeling_rnnoise import RNNoisePretrainedModel
def get_args():
    """
    Parse the command line options of the training step.

    :return: argparse.Namespace with the attributes train_dataset, valid_dataset,
        max_epochs, batch_size, learning_rate, num_serialized_models_to_keep,
        patience, serialization_dir, seed and config_file.
    """
    # (flag, default, type) for every supported option.
    option_table = [
        ("--train_dataset", "train.jsonl", str),
        ("--valid_dataset", "valid.jsonl", str),
        ("--max_epochs", 100, int),
        ("--batch_size", 64, int),
        ("--learning_rate", 1e-3, float),
        ("--num_serialized_models_to_keep", 15, int),
        ("--patience", 10, int),
        ("--serialization_dir", "serialization_dir", str),
        ("--seed", 0, int),
        ("--config_file", "config.yaml", str),
    ]

    arg_parser = argparse.ArgumentParser()
    for flag, default_value, value_type in option_table:
        arg_parser.add_argument(flag, default=default_value, type=value_type)
    return arg_parser.parse_args()
def logging_config(file_dir: str):
    """
    Configure root logging and attach a daily rotating ``main.log`` to this module's logger.

    :param file_dir: folder in which ``main.log`` is created.
    :return: the logger named after this module, with the file handler attached.
    """
    log_format = "%(asctime)s - %(name)s - %(levelname)s  %(filename)s:%(lineno)d >  %(message)s"

    logging.basicConfig(
        level=logging.INFO,
        datefmt="%m/%d/%Y %H:%M:%S",
        format=log_format,
    )

    # rotate once per day and keep the seven most recent backups.
    rotating_handler = TimedRotatingFileHandler(
        os.path.join(file_dir, "main.log"),
        when="D",
        interval=1,
        backupCount=7,
        encoding="utf-8",
    )
    rotating_handler.setFormatter(logging.Formatter(log_format))
    rotating_handler.setLevel(logging.INFO)

    module_logger = logging.getLogger(__name__)
    module_logger.addHandler(rotating_handler)
    return module_logger
class CollateFunction(object):
    """Collate dataset samples into batched clean / noisy waveform tensors."""

    def __init__(self):
        pass

    def __call__(self, batch: List[dict]):
        """
        Stack the per-sample waveforms of one batch.

        :param batch: list of sample dicts; each one must provide the tensors
            ``speech_wave`` and ``mix_wave``. ``torch.stack`` requires all
            waveforms of a batch to share one shape.
        :return: tuple ``(clean_audios, noisy_audios)``; each is the stack of the
            corresponding per-sample tensors along a new first axis.
        :raises AssertionError: if a stacked tensor contains nan or inf.
        """
        clean_audios = torch.stack([sample["speech_wave"] for sample in batch])
        noisy_audios = torch.stack([sample["mix_wave"] for sample in batch])

        # one isfinite pass covers both the nan and the inf case.
        if not torch.all(torch.isfinite(clean_audios)):
            raise AssertionError("nan or inf in clean_audios")
        if not torch.all(torch.isfinite(noisy_audios)):
            raise AssertionError("nan or inf in noisy_audios")
        return clean_audios, noisy_audios
+collate_fn = CollateFunction()
def main():
    """Train an RNNoise denoiser with periodic validation and checkpointing.

    Steps:
        1. Load the model config from ``args.config_file`` and create
           ``args.serialization_dir`` (log file and checkpoints are written there).
        2. Seed the python, numpy and torch RNGs with ``args.seed``.
        3. Build the train / valid ``DenoiseJsonlDataset`` data loaders.
        4. Build the model and an AdamW optimizer; when ``steps-<N>`` checkpoint
           directories exist, resume from the one with the largest ``N``.
        5. Train on ``mr_stft_loss + neg_si_snr_loss``. Every ``config.eval_steps``
           training steps: run validation, write a ``steps-<N>`` checkpoint
           (model, optimizer state, metrics), refresh ``best`` when the
           validation PESQ is the best so far, and stop early once
           ``args.patience`` consecutive evaluations bring no improvement.
    """
    args = get_args()

    config = RNNoiseConfig.from_pretrained(
        pretrained_model_name_or_path=args.config_file,
    )

    serialization_dir = Path(args.serialization_dir)
    serialization_dir.mkdir(parents=True, exist_ok=True)

    logger = logging_config(serialization_dir)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    logger.info("set seed: {}".format(args.seed))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("GPU available count: {}; device: {}".format(n_gpu, device))

    # datasets
    logger.info("prepare datasets")
    train_dataset = DenoiseJsonlDataset(
        jsonl_file=args.train_dataset,
        expected_sample_rate=config.sample_rate,
        max_wave_value=32768.0,
        min_snr_db=config.min_snr_db,
        max_snr_db=config.max_snr_db,
    )
    valid_dataset = DenoiseJsonlDataset(
        jsonl_file=args.valid_dataset,
        expected_sample_rate=config.sample_rate,
        max_wave_value=32768.0,
        min_snr_db=config.min_snr_db,
        max_snr_db=config.max_snr_db,
    )

    # Worker processes are only used outside Windows.
    # os.cpu_count() may return None, and a single-core machine yields 0 workers;
    # DataLoader rejects a prefetch_factor when num_workers == 0, so derive it.
    if platform.system() == "Windows":
        num_workers = 0
    else:
        num_workers = (os.cpu_count() or 1) // 2
    prefetch_factor = 2 if num_workers > 0 else None

    train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_size=config.batch_size,
        sampler=None,
        num_workers=num_workers,
        collate_fn=collate_fn,
        pin_memory=False,
        prefetch_factor=prefetch_factor,
    )
    valid_data_loader = DataLoader(
        dataset=valid_dataset,
        batch_size=config.batch_size,
        sampler=None,
        num_workers=num_workers,
        collate_fn=collate_fn,
        pin_memory=False,
        prefetch_factor=prefetch_factor,
    )

    # models
    logger.info(f"prepare models. config_file: {args.config_file}")
    model = RNNoisePretrainedModel(
        config=config,
    )
    model.to(device)
    model.train()

    # optimizer
    logger.info("prepare optimizer, lr_scheduler, loss_fn, evaluation_metric")
    optimizer = torch.optim.AdamW(model.parameters(), config.lr)

    # resume training: pick the checkpoint directory with the largest step index
    last_step_idx = -1
    last_epoch = -1
    for checkpoint_dir in serialization_dir.glob("steps-*"):
        checkpoint_step = int(Path(checkpoint_dir).stem.split("-")[1])
        if checkpoint_step > last_step_idx:
            last_step_idx = checkpoint_step

    if last_step_idx != -1:
        logger.info(f"resume from steps-{last_step_idx}.")
        # presumably model.save_pretrained() writes model.pt into the checkpoint
        # directory - TODO confirm; optimizer.pth is written by the save step below.
        model_pt = serialization_dir / f"steps-{last_step_idx}/model.pt"
        optimizer_pth = serialization_dir / f"steps-{last_step_idx}/optimizer.pth"

        logger.info("load state dict for model.")
        with open(model_pt.as_posix(), "rb") as f:
            state_dict = torch.load(f, map_location="cpu", weights_only=True)
        model.load_state_dict(state_dict, strict=True)

        logger.info("load state dict for optimizer.")
        with open(optimizer_pth.as_posix(), "rb") as f:
            state_dict = torch.load(f, map_location="cpu", weights_only=True)
        optimizer.load_state_dict(state_dict)

    if config.lr_scheduler == "CosineAnnealingLR":
        # NOTE(review): CosineAnnealingLR has a required T_max argument, so it
        # must be provided through config.lr_scheduler_kwargs
        # (e.g. T_max=10 * config.eval_steps, eta_min=0.01 * config.lr).
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            last_epoch=last_epoch,
            **config.lr_scheduler_kwargs,
        )
    elif config.lr_scheduler == "MultiStepLR":
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            last_epoch=last_epoch,
            milestones=[10000, 20000, 30000, 40000, 50000], gamma=0.5
        )
    else:
        raise AssertionError(f"invalid lr_scheduler: {config.lr_scheduler}")

    neg_si_snr_loss_fn = NegativeSISNRLoss(reduction="mean").to(device)
    mr_stft_loss_fn = MultiResolutionSTFTLoss(
        fft_size_list=[256, 512, 1024],
        win_size_list=[256, 512, 1024],
        hop_size_list=[128, 256, 512],
        factor_sc=1.5,
        factor_mag=1.0,
        reduction="mean"
    ).to(device)

    # training loop
    logger.info("training")

    # running averages; overwritten by the first processed batch
    average_pesq_score = 1000000000
    average_loss = 1000000000
    average_mr_stft_loss = 1000000000
    average_neg_si_snr_loss = 1000000000

    # checkpoint directories written by this run, oldest first
    model_list = list()
    best_epoch_idx = None
    best_step_idx = None
    best_metric = None
    patience_count = 0

    step_idx = 0 if last_step_idx == -1 else last_step_idx

    early_stop_flag = False
    for epoch_idx in range(max(0, last_epoch+1), config.max_epochs):
        if early_stop_flag:
            break

        # train
        model.train()

        total_pesq_score = 0.
        total_loss = 0.
        total_mr_stft_loss = 0.
        total_neg_si_snr_loss = 0.
        total_batches = 0.

        progress_bar_train = tqdm(
            initial=step_idx,
            desc="Training; epoch: {}".format(epoch_idx),
        )
        for train_batch in train_data_loader:
            clean_audios, noisy_audios = train_batch
            clean_audios: torch.Tensor = clean_audios.to(device)
            noisy_audios: torch.Tensor = noisy_audios.to(device)

            denoise_audios, _, _ = model.forward(noisy_audios)

            mr_stft_loss = mr_stft_loss_fn.forward(denoise_audios, clean_audios)
            neg_si_snr_loss = neg_si_snr_loss_fn.forward(denoise_audios, clean_audios)

            loss = 1.0 * mr_stft_loss + 1.0 * neg_si_snr_loss
            if torch.any(torch.isnan(loss)) or torch.any(torch.isinf(loss)):
                # skip the batch entirely: no optimizer step, no step count
                logger.info("find nan or inf in loss.")
                continue

            denoise_audios_list_r = list(denoise_audios.detach().cpu().numpy())
            clean_audios_list_r = list(clean_audios.detach().cpu().numpy())
            pesq_score = run_pesq_score(clean_audios_list_r, denoise_audios_list_r, sample_rate=config.sample_rate, mode="nb")

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.clip_grad_norm)
            optimizer.step()
            lr_scheduler.step()

            total_pesq_score += pesq_score
            total_loss += loss.item()
            total_mr_stft_loss += mr_stft_loss.item()
            total_neg_si_snr_loss += neg_si_snr_loss.item()
            total_batches += 1

            average_pesq_score = round(total_pesq_score / total_batches, 4)
            average_loss = round(total_loss / total_batches, 4)
            average_mr_stft_loss = round(total_mr_stft_loss / total_batches, 4)
            average_neg_si_snr_loss = round(total_neg_si_snr_loss / total_batches, 4)

            progress_bar_train.update(1)
            progress_bar_train.set_postfix({
                "lr": lr_scheduler.get_last_lr()[0],
                "pesq_score": average_pesq_score,
                "loss": average_loss,
                "mr_stft_loss": average_mr_stft_loss,
                "neg_si_snr_loss": average_neg_si_snr_loss,
            })

            # evaluation
            step_idx += 1
            if step_idx % config.eval_steps == 0:
                # validate in eval mode so that train-only behaviour of the
                # model (e.g. dropout, if it has any) does not affect the metrics
                model.eval()
                with torch.no_grad():
                    torch.cuda.empty_cache()

                    total_pesq_score = 0.
                    total_loss = 0.
                    total_mr_stft_loss = 0.
                    total_neg_si_snr_loss = 0.
                    total_batches = 0.

                    progress_bar_train.close()
                    progress_bar_eval = tqdm(
                        desc="Evaluation; steps-{}k".format(int(step_idx / 1000)),
                    )
                    for eval_batch in valid_data_loader:
                        clean_audios, noisy_audios = eval_batch
                        clean_audios: torch.Tensor = clean_audios.to(device)
                        noisy_audios: torch.Tensor = noisy_audios.to(device)

                        denoise_audios, _, _ = model.forward(noisy_audios)

                        mr_stft_loss = mr_stft_loss_fn.forward(denoise_audios, clean_audios)
                        neg_si_snr_loss = neg_si_snr_loss_fn.forward(denoise_audios, clean_audios)

                        loss = 1.0 * mr_stft_loss + 1.0 * neg_si_snr_loss
                        if torch.any(torch.isnan(loss)) or torch.any(torch.isinf(loss)):
                            logger.info("find nan or inf in loss.")
                            continue

                        denoise_audios_list_r = list(denoise_audios.detach().cpu().numpy())
                        clean_audios_list_r = list(clean_audios.detach().cpu().numpy())
                        pesq_score = run_pesq_score(clean_audios_list_r, denoise_audios_list_r, sample_rate=config.sample_rate, mode="nb")

                        total_pesq_score += pesq_score
                        total_loss += loss.item()
                        total_mr_stft_loss += mr_stft_loss.item()
                        total_neg_si_snr_loss += neg_si_snr_loss.item()
                        total_batches += 1

                        average_pesq_score = round(total_pesq_score / total_batches, 4)
                        average_loss = round(total_loss / total_batches, 4)
                        average_mr_stft_loss = round(total_mr_stft_loss / total_batches, 4)
                        average_neg_si_snr_loss = round(total_neg_si_snr_loss / total_batches, 4)

                        progress_bar_eval.update(1)
                        progress_bar_eval.set_postfix({
                            "lr": lr_scheduler.get_last_lr()[0],
                            "pesq_score": average_pesq_score,
                            "loss": average_loss,
                            "mr_stft_loss": average_mr_stft_loss,
                            "neg_si_snr_loss": average_neg_si_snr_loss,
                        })

                    # back to training mode for the batches that follow
                    model.train()

                    # restart the running training averages after an evaluation
                    total_pesq_score = 0.
                    total_loss = 0.
                    total_mr_stft_loss = 0.
                    total_neg_si_snr_loss = 0.
                    total_batches = 0.

                    progress_bar_eval.close()
                    progress_bar_train = tqdm(
                        initial=progress_bar_train.n,
                        postfix=progress_bar_train.postfix,
                        desc=progress_bar_train.desc,
                    )

                    # save path
                    # One directory per evaluation, named after the step index:
                    # this is the layout the resume logic scans for ("steps-*"),
                    # and it stays unique when an epoch contains several evaluations.
                    save_dir = serialization_dir / "steps-{}".format(step_idx)
                    save_dir.mkdir(parents=True, exist_ok=False)

                    # save models
                    model.save_pretrained(save_dir.as_posix())
                    # the optimizer state is required to resume training
                    torch.save(optimizer.state_dict(), (save_dir / "optimizer.pth").as_posix())

                    model_list.append(save_dir)
                    # keep at most num_serialized_models_to_keep checkpoints and
                    # never delete the one that was just written
                    if len(model_list) > args.num_serialized_models_to_keep:
                        model_to_delete: Path = model_list.pop(0)
                        shutil.rmtree(model_to_delete.as_posix())

                    # save metric
                    if best_metric is None or average_pesq_score >= best_metric:
                        # greater is better.
                        best_epoch_idx = epoch_idx
                        best_step_idx = step_idx
                        best_metric = average_pesq_score

                    is_best = best_epoch_idx == epoch_idx and best_step_idx == step_idx

                    metrics = {
                        "epoch_idx": epoch_idx,
                        "best_epoch_idx": best_epoch_idx,
                        "best_step_idx": best_step_idx,
                        "pesq_score": average_pesq_score,
                        "loss": average_loss,
                    }
                    metrics_filename = save_dir / "metrics_epoch.json"
                    with open(metrics_filename, "w", encoding="utf-8") as f:
                        json.dump(metrics, f, indent=4, ensure_ascii=False)

                    # save best: only the evaluation that set the record may replace it
                    best_dir = serialization_dir / "best"
                    if is_best:
                        if best_dir.exists():
                            shutil.rmtree(best_dir)
                        shutil.copytree(save_dir, best_dir)

                    # early stop
                    early_stop_flag = False
                    if is_best:
                        patience_count = 0
                    else:
                        patience_count += 1

                    if patience_count >= args.patience:
                        early_stop_flag = True

                    # leave the batch loop; the epoch loop checks the flag and stops
                    if early_stop_flag:
                        break

    return
+if __name__ == '__main__':
+    main()

examples/rnnoise/yaml/config.yaml ADDED Viewed

	@@ -0,0 +1,31 @@

+model_name: "rnnoise"
+# spec
+sample_rate: 8000
+segment_size: 32000
+nfft: 512
+win_size: 512
+hop_size: 256
+win_type: hann
+# data
+max_snr_db: 20
+min_snr_db: -10
+# model
+conv_size: 256
+gru_size: 256
+# train
+max_epochs: 100
+batch_size: 32
+num_workers: 4
+seed: 1234
+lr: 0.001
+lr_scheduler: CosineAnnealingLR
+lr_scheduler_kwargs: {}
+weight_decay: 0.00001
+clip_grad_norm: 10.0
+eval_steps: 20000

examples/test.py ADDED Viewed

	@@ -0,0 +1,39 @@

#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Scratch check of a stride-2 Conv1d followed by a matching ConvTranspose1d.

With kernel_size=3, stride=2 and no padding, 16000 input samples become 7999
frames; the transposed convolution with output_padding=1 maps them back to
16000 samples. The script prints both shapes and a few output positions.
"""
import torch
import torch.nn as nn

# random signal with 1 batch entry, 1 channel and 16000 samples
inputs = torch.randn(size=(1, 1, 16000))

# down-sampling convolution
conv1d = nn.Conv1d(1, 1, kernel_size=3, stride=2, padding=0, dilation=1)

# up-sampling counterpart; output_padding=1 restores the original length
conv1dt = nn.ConvTranspose1d(
    1, 1,
    kernel_size=3, stride=2, padding=0, output_padding=1, dilation=1,
)

x = conv1d(inputs)
print(x.shape)

x = conv1dt(x)
print(x.shape)

# first, second-to-last and last output positions
for position in (0, -2, -1):
    print(x[:, :, position])

if __name__ == "__main__":
    pass

toolbox/{torchaudio/models/dfnet3 → torch/sparsification}/__init__.py RENAMED Viewed

@@ -2,5 +2,5 @@
 # -*- coding: utf-8 -*-
-if __name__ == '__main__':
     pass

 # -*- coding: utf-8 -*-
+if __name__ == "__main__":
     pass

toolbox/torch/sparsification/common.py ADDED Viewed

	@@ -0,0 +1,131 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+"""
+/* Copyright (c) 2023 Amazon
+   Written by Jan Buethe */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+import torch
+"""
+https://github.com/xiph/rnnoise/blob/main/torch/sparsification/common.py
+"""
def sparsify_matrix(matrix: torch.Tensor, density: float, block_size, keep_diagonal: bool = False, return_mask: bool = False):
    """ sparsifies matrix with specified block size

        The matrix is split into blocks of shape block_size, the energy (sum of
        squared entries) of every block is computed and only the
        round(number_of_blocks * density) blocks with the highest energy are
        kept; all other blocks are set to zero. Blocks whose energy ties with
        the weakest surviving block are kept as well. The input is not
        modified; a new tensor is returned.

        Parameters:
        -----------
        matrix : torch.Tensor
            2-dimensional matrix to sparsify

        density : float
            target density, i.e. the fraction of blocks to keep

        block_size : [int, int]
            block size dimensions; must divide the matrix dimensions

        keep_diagonal : bool
            If true, the diagonal is excluded from the block energies and added
            back unchanged to the result. This option requires a square matrix
            and defaults to False

        return_mask : bool
            If true, the element-wise 0/1 mask is returned as well. Defaults to False

        Returns:
        --------
        The sparsified matrix, or the tuple (sparsified matrix, mask) if
        return_mask is true.

        Raises:
        -------
        ValueError
            if block_size does not divide the matrix size, or if keep_diagonal
            is requested for a non-square matrix.
    """

    m, n   = matrix.shape
    m1, n1 = block_size

    if m % m1 or n % n1:
        raise ValueError(f"block size {(m1, n1)} does not divide matrix size {(m, n)}")

    # extract diagonal if keep_diagonal = True
    if keep_diagonal:
        if m != n:
            raise ValueError("Attempting to sparsify non-square matrix with keep_diagonal=True")

        to_spare = torch.diag(torch.diag(matrix))
        matrix   = matrix - to_spare
    else:
        to_spare = torch.zeros_like(matrix)

    # calculate energy in sub-blocks
    x = torch.reshape(matrix, (m // m1, m1, n // n1, n1))
    x = x ** 2
    block_energies = torch.sum(torch.sum(x, dim=3), dim=1)

    number_of_blocks = (m * n) // (m1 * n1)
    number_of_survivors = round(number_of_blocks * density)

    # create mask
    if number_of_survivors == 0:
        # No block may survive. A threshold of 0 would keep every block, since
        # energies are never negative, so the mask is zeroed explicitly.
        mask = torch.zeros_like(block_energies)
    else:
        # masking threshold: energy of the weakest surviving block
        threshold = torch.sort(torch.flatten(block_energies)).values[-number_of_survivors]
        mask = torch.ones_like(block_energies)
        mask[block_energies < threshold] = 0

    # expand the block mask to element resolution
    mask = torch.repeat_interleave(mask, m1, dim=0)
    mask = torch.repeat_interleave(mask, n1, dim=1)

    # perform masking
    masked_matrix = mask * matrix + to_spare

    if return_mask:
        return masked_matrix, mask
    else:
        return masked_matrix
def calculate_gru_flops_per_step(gru, sparsification_dict=None, drop_input=False):
    """ estimates the number of flops a GRU needs per time step

        Parameters:
        -----------
        gru : object with attributes input_size and hidden_size (e.g. torch.nn.GRU)
            GRU for which the estimate is computed

        sparsification_dict : dict, optional
            maps weight names ('W_ir', 'W_iz', 'W_in', 'W_hr', 'W_hz', 'W_hn')
            to sequences whose first entry is the density of that weight.
            Missing weights count as dense (density 1). Defaults to None,
            which means all weights are dense

        drop_input : bool
            If true, the input matrix-vector products are left out of the estimate

        Returns:
        --------
        estimated number of flops per step
    """
    # None instead of a shared mutable default argument
    if sparsification_dict is None:
        sparsification_dict = {}

    input_size = gru.input_size
    hidden_size = gru.hidden_size
    flops = 0

    # average density over the three gates of each weight matrix
    input_density = (
        sparsification_dict.get('W_ir', [1])[0]
        + sparsification_dict.get('W_in', [1])[0]
        + sparsification_dict.get('W_iz', [1])[0]
    ) / 3

    recurrent_density = (
        sparsification_dict.get('W_hr', [1])[0]
        + sparsification_dict.get('W_hn', [1])[0]
        + sparsification_dict.get('W_hz', [1])[0]
    ) / 3

    # input matrix vector multiplications
    if not drop_input:
        flops += 2 * 3 * input_size * hidden_size * input_density

    # recurrent matrix vector multiplications
    flops += 2 * 3 * hidden_size * hidden_size * recurrent_density

    # biases
    flops += 6 * hidden_size

    # activations estimated by 10 flops per activation
    flops += 30 * hidden_size

    return flops
+if __name__ == "__main__":
+    pass

toolbox/torch/sparsification/gru_sparsifier.py ADDED Viewed

	@@ -0,0 +1,190 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+"""
+/* Copyright (c) 2023 Amazon
+   Written by Jan Buethe */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+import torch
+from toolbox.torch.sparsification.common import sparsify_matrix
+"""
+https://github.com/xiph/rnnoise/blob/main/torch/sparsification/gru_sparsifier.py
+"""
class GRUSparsifier:
    def __init__(self, task_list, start, stop, interval, exponent=3):
        """ Sparsifier for torch.nn.GRUs

            Only the first layer weights (weight_ih_l0 and weight_hh_l0) of each
            GRU are sparsified.

            Parameters:
            -----------
            task_list : list
                task_list contains a list of tuples (gru, sparsify_dict), where gru is an instance
                of torch.nn.GRU and sparsify_dict is a dictionary with keys in {'W_ir', 'W_iz', 'W_in',
                'W_hr', 'W_hz', 'W_hn'} corresponding to the input and recurrent weights for the reset,
                update, and new gate. The values of sparsify_dict are tuples (density, [m, n], keep_diagonal),
                where density is the target density in [0, 1], [m, n] is the shape of the sub-blocks to which
                sparsification is applied and keep_diagonal is a bool variable indicating whether the diagonal
                should be kept.

            start : int
                training step after which sparsification will be started.

            stop : int
                training step after which sparsification will be completed.

            interval : int
                sparsification interval for steps between start and stop. After stop sparsification will be
                carried out after every call to GRUSparsifier.step()

            exponent : float
                Interpolation exponent for sparsification interval. In step i sparsification will be carried out
                with density (alpha + target_density * (1 - alpha)), where
                alpha = ((stop - i) / (stop - start)) ** exponent

            Example:
            --------
            >>> import torch
            >>> gru = torch.nn.GRU(10, 20)
            >>> sparsify_dict = {
            ...         'W_ir' : (0.5, [2, 2], False),
            ...         'W_iz' : (0.6, [2, 2], False),
            ...         'W_in' : (0.7, [2, 2], False),
            ...         'W_hr' : (0.1, [4, 4], True),
            ...         'W_hz' : (0.2, [4, 4], True),
            ...         'W_hn' : (0.3, [4, 4], True),
            ...     }
            >>> sparsifier = GRUSparsifier([(gru, sparsify_dict)], 0, 100, 50)
            >>> for i in range(100):
            ...         sparsifier.step()
        """
        # just copying parameters...
        self.start      = start
        self.stop       = stop
        self.interval   = interval
        self.exponent   = exponent
        self.task_list  = task_list

        # ... and setting counter to 0
        self.step_counter = 0

        # most recent mask per gate name (kept for backward compatibility)
        self.last_masks = {key : None for key in ['W_ir', 'W_in', 'W_iz', 'W_hr', 'W_hn', 'W_hz']}

        # most recent mask per (task index, gate name); used for the change
        # check so that masks of different GRUs are never compared to each other
        self._task_masks = {}

    def _sparsify_gates(self, task_index, gru, weight_name, keys, params, alpha, verbose):
        """ sparsifies the gate sub-matrices of one stacked GRU weight

            keys lists the gate names in the order in which their
            hidden_size-row slices are stacked inside getattr(gru, weight_name).
            Must be called inside torch.no_grad().
        """
        hidden_size = gru.hidden_size

        for i, key in enumerate(keys):
            if key not in params:
                continue

            # interpolate between dense (alpha = 1) and the target density (alpha = 0)
            density = alpha + (1 - alpha) * params[key][0]
            if verbose:
                print(f"[{self.step_counter}]: {key} density: {density}")

            weight = getattr(gru, weight_name)
            rows = slice(i * hidden_size, (i + 1) * hidden_size)
            sparse_block, new_mask = sparsify_matrix(
                weight[rows, :],
                density,
                params[key][1], # block_size
                params[key][2], # keep_diagonal (might want to set this to False)
                return_mask=True
            )
            weight[rows, :] = sparse_block

            # once the target density is reached the mask is expected to stay fixed
            previous_mask = self._task_masks.get((task_index, key))
            if previous_mask is not None:
                if not torch.all(previous_mask == new_mask) and self.step_counter > self.stop:
                    print(f"sparsification mask {key} changed for gru {gru}")

            self._task_masks[(task_index, key)] = new_mask
            self.last_masks[key] = new_mask

    def step(self, verbose=False):
        """ carries out sparsification step

            Call this function after optimizer.step in your
            training loop.

            Parameters:
            ----------
            verbose : bool
                if true, densities are printed out

            Returns:
            --------
            None
        """
        # compute current interpolation factor
        self.step_counter += 1

        if self.step_counter < self.start:
            return
        elif self.step_counter < self.stop:
            # update only every self.interval-th interval
            if self.step_counter % self.interval:
                return

            alpha = ((self.stop - self.step_counter) / (self.stop - self.start)) ** self.exponent
        else:
            alpha = 0

        with torch.no_grad():
            for task_index, (gru, params) in enumerate(self.task_list):
                # input weights
                self._sparsify_gates(
                    task_index, gru, "weight_ih_l0", ('W_ir', 'W_iz', 'W_in'), params, alpha, verbose
                )
                # recurrent weights
                self._sparsify_gates(
                    task_index, gru, "weight_hh_l0", ('W_hr', 'W_hz', 'W_hn'), params, alpha, verbose
                )
if __name__ == "__main__":
    # smoke test: sparsify a small GRU for 100 steps and print the densities
    print("Testing sparsifier")

    demo_gru = torch.nn.GRU(10, 20)

    # input weights use 2x2 blocks without the diagonal,
    # recurrent weights use 4x4 blocks and keep the diagonal
    demo_targets = dict()
    for name, target in (('W_ir', 0.5), ('W_iz', 0.6), ('W_in', 0.7)):
        demo_targets[name] = (target, [2, 2], False)
    for name, target in (('W_hr', 0.1), ('W_hz', 0.2), ('W_hn', 0.3)):
        demo_targets[name] = (target, [4, 4], True)

    demo_sparsifier = GRUSparsifier([(demo_gru, demo_targets)], 0, 100, 10)

    remaining_steps = 100
    while remaining_steps > 0:
        demo_sparsifier.step(verbose=True)
        remaining_steps -= 1

toolbox/torchaudio/models/dfnet/modeling_dfnet.py CHANGED Viewed

@@ -1,5 +1,11 @@
 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
 import os
 import math
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
@@ -8,7 +14,6 @@ import numpy as np
 import torch
 import torch.nn as nn
 from torch.nn import functional as F
-import torchaudio
 from toolbox.torchaudio.configuration_utils import CONFIG_FILE
 from toolbox.torchaudio.models.dfnet.configuration_dfnet import DfNetConfig
@@ -480,6 +485,7 @@ class Encoder(nn.Module):
 class Decoder(nn.Module):
     def __init__(self, config: DfNetConfig):
         super(Decoder, self).__init__()
@@ -800,6 +806,9 @@ class DeepFiltering(nn.Module):
 class DfNet(nn.Module):
     def __init__(self, config: DfNetConfig):
         super(DfNet, self).__init__()
         self.config = config
@@ -867,23 +876,11 @@ class DfNet(nn.Module):
         if remainder > 0:
             n_samples_pad = self.hop_size - remainder
             signal = F.pad(signal, pad=(0, n_samples_pad), mode="constant", value=0)
-        return signal, n_samples
-    def forward(self,
-                noisy: torch.Tensor,
-                ):
-        """
-        :param noisy:
-        :return:
-        est_spec: shape: [b, 257*2, t]
-        est_wav:  shape: [b, num_samples]
-        est_mask: shape: [b, 257, t]
-        lsnr:     shape: [b, 1, t]
-        """
-        noisy, n_samples = self.signal_prepare(noisy)
         # noisy shape: [b, num_samples_pad]
-        spec_cmp = self.stft.forward(noisy)
         # spec_complex shape: [b, f, t], torch.complex64
         spec_cmp = torch.transpose(spec_cmp, dim0=1, dim1=2)
         # spec_complex shape: [b, t, f], torch.complex64
@@ -906,6 +903,24 @@ class DfNet(nn.Module):
         feat_spec = feat_spec[..., :self.df_decoder.df_bins]
         # feat_spec shape: [b, 2, t, df_bins]
         e0, e1, e2, e3, emb, c0, lsnr, h = self.encoder.forward(feat_erb, feat_spec)
         mask = self.decoder.forward(emb, e3, e2, e1, e0)

 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
+"""
+DeepFilterNet 的原生实现不直接支持流式推理
+社区开发者（如 grazder）提供了基于 Torch 的流式推理实现
+https://github.com/grazder/DeepFilterNet/tree/1097015d53ced78fb234e7d7071a5dd4446e3952/torchDF
+"""
 import os
 import math
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 import torch
 import torch.nn as nn
 from torch.nn import functional as F
 from toolbox.torchaudio.configuration_utils import CONFIG_FILE
 from toolbox.torchaudio.models.dfnet.configuration_dfnet import DfNetConfig
 class Decoder(nn.Module):
+    """ErbDecoder"""
     def __init__(self, config: DfNetConfig):
         super(Decoder, self).__init__()
 class DfNet(nn.Module):
+    """
+    我感觉这个模型没办法实现完全一致的流式推理。
+    """
     def __init__(self, config: DfNetConfig):
         super(DfNet, self).__init__()
         self.config = config
         if remainder > 0:
             n_samples_pad = self.hop_size - remainder
             signal = F.pad(signal, pad=(0, n_samples_pad), mode="constant", value=0)
+        return signal
+    def feature_prepare(self, signal: torch.Tensor):
         # noisy shape: [b, num_samples_pad]
+        spec_cmp = self.stft.forward(signal)
         # spec_complex shape: [b, f, t], torch.complex64
         spec_cmp = torch.transpose(spec_cmp, dim0=1, dim1=2)
         # spec_complex shape: [b, t, f], torch.complex64
         feat_spec = feat_spec[..., :self.df_decoder.df_bins]
         # feat_spec shape: [b, 2, t, df_bins]
+        return spec, feat_erb, feat_spec
+    def forward(self,
+                noisy: torch.Tensor,
+                ):
+        """
+        :param noisy:
+        :return:
+        est_spec: shape: [b, 257*2, t]
+        est_wav:  shape: [b, num_samples]
+        est_mask: shape: [b, 257, t]
+        lsnr:     shape: [b, 1, t]
+        """
+        n_samples = noisy.shape[-1]
+        noisy = self.signal_prepare(noisy)
+        spec, feat_erb, feat_spec = self.feature_prepare(noisy)
         e0, e1, e2, e3, emb, c0, lsnr, h = self.encoder.forward(feat_erb, feat_spec)
         mask = self.decoder.forward(emb, e3, e2, e1, e0)

toolbox/torchaudio/models/dfnet/modeling_dfnet_online.py ADDED Viewed

	@@ -0,0 +1,226 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+"""
+DeepFilterNet 的原生实现不直接支持流式推理
+社区开发者（如 grazder）提供了基于 Torch 的流式推理实现
+https://github.com/grazder/DeepFilterNet/tree/1097015d53ced78fb234e7d7071a5dd4446e3952/torchDF
+此文件试图实现一个支持流式推理的 dfnet
+"""
+import os
+import math
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from toolbox.torchaudio.configuration_utils import CONFIG_FILE
+from toolbox.torchaudio.models.dfnet.configuration_dfnet import DfNetConfig
+from toolbox.torchaudio.modules.conv_stft import ConvSTFT, ConviSTFT
+from toolbox.torchaudio.modules.local_snr_target import LocalSnrTarget
+from toolbox.torchaudio.modules.freq_bands.erb_bands import ErbBands
+MODEL_FILE = "model.pt"
+norm_layer_dict = {
+    "batch_norm_2d": torch.nn.BatchNorm2d
+}
+activation_layer_dict = {
+    "relu": torch.nn.ReLU,
+    "identity": torch.nn.Identity,
+    "sigmoid": torch.nn.Sigmoid,
+}
+class CausalConv2d(nn.Module):
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: Union[int, Iterable[int]],
+                 fstride: int = 1,
+                 dilation: int = 1,
+                 pad_f_dim: bool = True,
+                 bias: bool = True,
+                 separable: bool = False,
+                 norm_layer: str = "batch_norm_2d",
+                 activation_layer: str = "relu",
+                 ):
+        super(CausalConv2d, self).__init__()
+        kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else tuple(kernel_size)
+        if pad_f_dim:
+            fpad = kernel_size[1] // 2 + dilation - 1
+        else:
+            fpad = 0
+        # for last 2 dim, pad (left, right, top, bottom).
+        self.lookback = kernel_size[0] - 1
+        if self.lookback > 0:
+            self.tpad = nn.ConstantPad2d(padding=(0, 0, self.lookback, 0), value=0.0)
+        else:
+            self.tpad = nn.Identity()
+        groups = math.gcd(in_channels, out_channels) if separable else 1
+        if groups == 1:
+            separable = False
+        if max(kernel_size) == 1:
+            separable = False
+        self.conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            padding=(0, fpad),
+            stride=(1, fstride),  # stride over time is always 1
+            dilation=(1, dilation),  # dilation over time is always 1
+            groups=groups,
+            bias=bias,
+        )
+        if separable:
+            self.convp = nn.Conv2d(
+                out_channels,
+                out_channels,
+                kernel_size=1,
+                bias=False,
+            )
+        else:
+            self.convp = nn.Identity()
+        if norm_layer is not None:
+            norm_layer = norm_layer_dict[norm_layer]
+            self.norm = norm_layer(out_channels)
+        else:
+            self.norm = nn.Identity()
+        if activation_layer is not None:
+            activation_layer = activation_layer_dict[activation_layer]
+            self.activation = activation_layer()
+        else:
+            self.activation = nn.Identity()
+        super().__init__()
+    def forward(self, inputs: torch.Tensor, cache: Tuple[torch.Tensor, torch.Tensor] = None):
+        """
+        :param inputs: shape: [b, c, t, f]
+        :param cache: shape: [b, c, lookback, f];
+        :return:
+        """
+        x = inputs
+        if cache is None:
+            x = self.tpad(x)
+        else:
+            x = torch.concat(tensors=[cache, x], dim=2)
+        new_cache = x[:, :, -self.lookback:, :]
+        x = self.conv(x)
+        x = self.convp(x)
+        x = self.norm(x)
+        x = self.activation(x)
+        return x, new_cache
+class CausalConvTranspose2d(nn.Module):
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: Union[int, Iterable[int]],
+                 fstride: int = 1,
+                 dilation: int = 1,
+                 pad_f_dim: bool = True,
+                 bias: bool = True,
+                 separable: bool = False,
+                 norm_layer: str = "batch_norm_2d",
+                 activation_layer: str = "relu",
+                 ):
+        super(CausalConvTranspose2d, self).__init__()
+        kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
+        if pad_f_dim:
+            fpad = kernel_size[1] // 2
+        else:
+            fpad = 0
+        # for last 2 dim, pad (left, right, top, bottom).
+        self.lookback = kernel_size[0] - 1
+        groups = math.gcd(in_channels, out_channels) if separable else 1
+        if groups == 1:
+            separable = False
+        self.convt = nn.ConvTranspose2d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            padding=(0, fpad),
+            output_padding=(0, 0),
+            stride=(1, fstride),  # stride over time is always 1
+            dilation=(1, dilation),  # dilation over time is always 1
+            groups=groups,
+            bias=bias,
+        )
+        if separable:
+            self.convp = nn.Conv2d(
+                out_channels,
+                out_channels,
+                kernel_size=1,
+                bias=False,
+            )
+        else:
+            self.convp = nn.Identity()
+        if norm_layer is not None:
+            norm_layer = norm_layer_dict[norm_layer]
+            self.norm = norm_layer(out_channels)
+        else:
+            self.norm = nn.Identity()
+        if activation_layer is not None:
+            activation_layer = activation_layer_dict[activation_layer]
+            self.activation = activation_layer()
+        else:
+            self.activation = nn.Identity()
+    def forward(self, inputs: torch.Tensor, cache: Tuple[torch.Tensor, torch.Tensor] = None):
+        """
+        :param inputs: shape: [b, c, t, f]
+        :param cache: shape: [b, c, lookback, f];
+        :return:
+        """
+        x = inputs
+        # x shape: [b, c, t, f]
+        x = self.convt(x)
+        # x shape: [b, c, t+lookback, f]
+        if cache is not None:
+            x = torch.concat(tensors=[
+                x[:, :, :self.lookback, :] + cache,
+                x[:, :, self.lookback:, :]
+            ], dim=2)
+        x = x[:, :, :-self.lookback, :]
+        new_cache = x[:, :, -self.lookback:, :]
+        x = self.convp(x)
+        x = self.norm(x)
+        x = self.activation(x)
+        return x, new_cache
+if __name__ == "__main__":
+    pass

toolbox/torchaudio/models/dfnet3/configuration_dfnet3.py DELETED Viewed

@@ -1,89 +0,0 @@
-#!/usr/bin/python3
-# -*- coding: utf-8 -*-
-from typing import Any, Dict, List, Tuple, Union
-from toolbox.torchaudio.configuration_utils import PretrainedConfig
-class DfNetConfig(PretrainedConfig):
-    def __init__(self,
-                 sample_rate: int,
-                 fft_size: int,
-                 hop_size: int,
-                 df_bins: int,
-                 erb_bins: int,
-                 min_freq_bins_for_erb: int,
-                 df_order: int,
-                 df_lookahead: int,
-                 norm_tau: int,
-                 lsnr_max: int,
-                 lsnr_min: int,
-                 conv_channels: int,
-                 conv_kernel_size_input: Tuple[int, int],
-                 conv_kernel_size_inner: Tuple[int, int],
-                 convt_kernel_size_inner: Tuple[int, int],
-                 conv_lookahead: int,
-                 emb_hidden_dim: int,
-                 mask_post_filter: bool,
-                 df_hidden_dim: int,
-                 df_num_layers: int,
-                 df_pathway_kernel_size_t: int,
-                 df_gru_skip: str,
-                 post_filter_beta: float,
-                 df_n_iter: float,
-                 lsnr_dropout: bool,
-                 encoder_gru_skip_op: str,
-                 encoder_linear_groups: int,
-                 encoder_squeezed_gru_linear_groups: int,
-                 encoder_concat: bool,
-                 erb_decoder_gru_skip_op: str,
-                 erb_decoder_linear_groups: int,
-                 erb_decoder_emb_num_layers: int,
-                 df_decoder_linear_groups: int,
-                 **kwargs
-                 ):
-        super(DfNetConfig, self).__init__(**kwargs)
-        if df_gru_skip not in ("none", "identity", "grouped_linear"):
-            raise AssertionError
-        self.sample_rate = sample_rate
-        self.fft_size = fft_size
-        self.hop_size = hop_size
-        self.df_bins = df_bins
-        self.erb_bins = erb_bins
-        self.min_freq_bins_for_erb = min_freq_bins_for_erb
-        self.df_order = df_order
-        self.df_lookahead = df_lookahead
-        self.norm_tau = norm_tau
-        self.lsnr_max = lsnr_max
-        self.lsnr_min = lsnr_min
-        self.conv_channels = conv_channels
-        self.conv_kernel_size_input = conv_kernel_size_input
-        self.conv_kernel_size_inner = conv_kernel_size_inner
-        self.convt_kernel_size_inner = convt_kernel_size_inner
-        self.conv_lookahead = conv_lookahead
-        self.emb_hidden_dim = emb_hidden_dim
-        self.mask_post_filter = mask_post_filter
-        self.df_hidden_dim = df_hidden_dim
-        self.df_num_layers = df_num_layers
-        self.df_pathway_kernel_size_t = df_pathway_kernel_size_t
-        self.df_gru_skip = df_gru_skip
-        self.post_filter_beta = post_filter_beta
-        self.df_n_iter = df_n_iter
-        self.lsnr_dropout = lsnr_dropout
-        self.encoder_gru_skip_op = encoder_gru_skip_op
-        self.encoder_linear_groups = encoder_linear_groups
-        self.encoder_squeezed_gru_linear_groups = encoder_squeezed_gru_linear_groups
-        self.encoder_concat = encoder_concat
-        self.erb_decoder_gru_skip_op = erb_decoder_gru_skip_op
-        self.erb_decoder_linear_groups = erb_decoder_linear_groups
-        self.erb_decoder_emb_num_layers = erb_decoder_emb_num_layers
-        self.df_decoder_linear_groups = df_decoder_linear_groups
-if __name__ == "__main__":
-    pass

toolbox/torchaudio/models/dfnet3/features.py DELETED Viewed

@@ -1,192 +0,0 @@
-#!/usr/bin/python3
-# -*- coding: utf-8 -*-
-import math
-import numpy as np
-def freq2erb(freq_hz: float) -> float:
-    """
-    https://www.cnblogs.com/LXP-Never/p/16011229.html
-    1 / (24.7 * 9.265) = 0.00436976
-    """
-    return 9.265 * math.log(freq_hz / (24.7 * 9.265) + 1)
-def erb2freq(n_erb: float) -> float:
-    return 24.7 * 9.265 * (math.exp(n_erb / 9.265) - 1)
-def get_erb_widths(sample_rate: int, fft_size: int, erb_bins: int, min_freq_bins_for_erb: int) -> np.ndarray:
-    """
-    https://github.com/Rikorose/DeepFilterNet/blob/main/libDF/src/lib.rs
-    :param sample_rate:
-    :param fft_size:
-    :param erb_bins: erb (Equivalent Rectangular Bandwidth) 等效矩形带宽的通道数.
-    :param min_freq_bins_for_erb: Minimum number of frequency bands per erb band
-    :return:
-    """
-    nyq_freq = sample_rate / 2.
-    freq_width: float = sample_rate / fft_size
-    min_erb: float = freq2erb(0.)
-    max_erb: float = freq2erb(nyq_freq)
-    erb = [0] * erb_bins
-    step = (max_erb - min_erb) / erb_bins
-    prev_freq_bin = 0
-    freq_over = 0
-    for i in range(1, erb_bins + 1):
-        f = erb2freq(min_erb + i * step)
-        freq_bin = int(round(f / freq_width))
-        freq_bins = freq_bin - prev_freq_bin - freq_over
-        if freq_bins < min_freq_bins_for_erb:
-            freq_over = min_freq_bins_for_erb - freq_bins
-            freq_bins = min_freq_bins_for_erb
-        else:
-            freq_over = 0
-        erb[i - 1] = freq_bins
-        prev_freq_bin = freq_bin
-    erb[erb_bins - 1] += 1
-    too_large = sum(erb) - (fft_size / 2 + 1)
-    if too_large > 0:
-        erb[erb_bins - 1] -= too_large
-    return np.array(erb, dtype=np.uint64)
-def get_erb_filter_bank(erb_widths: np.ndarray,
-                        sample_rate: int,
-                        normalized: bool = True,
-                        inverse: bool = False,
-                        ):
-    num_freq_bins = int(np.sum(erb_widths))
-    num_erb_bins = len(erb_widths)
-    fb: np.ndarray = np.zeros(shape=(num_freq_bins, num_erb_bins))
-    points = np.cumsum([0] + erb_widths.tolist()).astype(int)[:-1]
-    for i, (b, w) in enumerate(zip(points.tolist(), erb_widths.tolist())):
-        fb[b: b + w, i] = 1
-    if inverse:
-        fb = fb.T
-        if not normalized:
-            fb /= np.sum(fb, axis=1, keepdims=True)
-    else:
-        if normalized:
-            fb /= np.sum(fb, axis=0)
-    return fb
-def spec2erb(spec: np.ndarray, erb_fb: np.ndarray, db: bool = True):
-    """
-    ERB filterbank and transform to decibel scale.
-    :param spec: Spectrum of shape [B, C, T, F].
-    :param erb_fb: ERB filterbank array of shape [B] containing the ERB widths,
-            where B are the number of ERB bins.
-    :param db: Whether to transform the output into decibel scale. Defaults to `True`.
-    :return:
-    """
-    # complex spec to power spec. (real * real + image * image)
-    spec_ = np.abs(spec) ** 2
-    # spec to erb feature.
-    erb_feat = np.matmul(spec_, erb_fb)
-    if db:
-        erb_feat = 10 * np.log10(erb_feat + 1e-10)
-    erb_feat = np.array(erb_feat, dtype=np.float32)
-    return erb_feat
-def _calculate_norm_alpha(sample_rate: int, hop_size: int, tau: float):
-    """Exponential decay factor alpha for a given tau (decay window size [s])."""
-    dt = hop_size / sample_rate
-    result = math.exp(-dt / tau)
-    return result
-def get_norm_alpha(sample_rate: int, hop_size: int, norm_tau: float) -> float:
-    a_ = _calculate_norm_alpha(sample_rate=sample_rate, hop_size=hop_size, tau=norm_tau)
-    precision = 3
-    a = 1.0
-    while a >= 1.0:
-        a = round(a_, precision)
-        precision += 1
-    return a
-MEAN_NORM_INIT = [-60., -90.]
-def make_erb_norm_state(erb_bins: int, channels: int) -> np.ndarray:
-    state = np.linspace(MEAN_NORM_INIT[0], MEAN_NORM_INIT[1], erb_bins)
-    state = np.expand_dims(state, axis=0)
-    state = np.repeat(state, channels, axis=0)
-    # state shape: (audio_channels, erb_bins)
-    return state
-def erb_normalize(erb_feat: np.ndarray, alpha: float, state: np.ndarray = None):
-    erb_feat = np.copy(erb_feat)
-    batch_size, time_steps, erb_bins = erb_feat.shape
-    if state is None:
-        state = make_erb_norm_state(erb_bins, erb_feat.shape[0])
-        # state = np.linspace(MEAN_NORM_INIT[0], MEAN_NORM_INIT[1], erb_bins)
-        # state = np.expand_dims(state, axis=0)
-        # state = np.repeat(state, erb_feat.shape[0], axis=0)
-    for i in range(batch_size):
-        for j in range(time_steps):
-            for k in range(erb_bins):
-                x = erb_feat[i][j][k]
-                s = state[i][k]
-                state[i][k] = x * (1. - alpha) + s * alpha
-                erb_feat[i][j][k] -= state[i][k]
-                erb_feat[i][j][k] /= 40.
-    return erb_feat
-UNIT_NORM_INIT = [0.001, 0.0001]
-def make_spec_norm_state(df_bins: int, channels: int) -> np.ndarray:
-    state = np.linspace(UNIT_NORM_INIT[0], UNIT_NORM_INIT[1], df_bins)
-    state = np.expand_dims(state, axis=0)
-    state = np.repeat(state, channels, axis=0)
-    # state shape: (audio_channels, df_bins)
-    return state
-def spec_normalize(spec_feat: np.ndarray, alpha: float, state: np.ndarray = None):
-    spec_feat = np.copy(spec_feat)
-    batch_size, time_steps, df_bins = spec_feat.shape
-    if state is None:
-        state = make_spec_norm_state(df_bins, spec_feat.shape[0])
-    for i in range(batch_size):
-        for j in range(time_steps):
-            for k in range(df_bins):
-                x = spec_feat[i][j][k]
-                s = state[i][k]
-                state[i][k] = np.abs(x) * (1. - alpha) + s * alpha
-                spec_feat[i][j][k] /= np.sqrt(state[i][k])
-    return spec_feat
-if __name__ == '__main__':
-    pass

toolbox/torchaudio/models/dfnet3/modeling_dfnet3.py DELETED Viewed

@@ -1,835 +0,0 @@
-#!/usr/bin/python3
-# -*- coding: utf-8 -*-
-import logging
-import math
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
-import numpy as np
-import torch
-import torch.nn as nn
-from toolbox.torchaudio.models.dfnet3.configuration_dfnet3 import DfNetConfig
-from toolbox.torchaudio.models.dfnet3 import multiframes as MF
-from toolbox.torchaudio.models.dfnet3 import utils
-logger = logging.getLogger("toolbox")
-PI = 3.1415926535897932384626433
-norm_layer_dict = {
-    "batch_norm_2d": torch.nn.BatchNorm2d
-}
-activation_layer_dict = {
-    "relu": torch.nn.ReLU,
-    "identity": torch.nn.Identity,
-    "sigmoid": torch.nn.Sigmoid,
-}
-class CausalConv2d(nn.Sequential):
-    def __init__(self,
-                 in_channels: int,
-                 out_channels: int,
-                 kernel_size: Union[int, Iterable[int]],
-                 fstride: int = 1,
-                 dilation: int = 1,
-                 fpad: bool = True,
-                 bias: bool = True,
-                 separable: bool = False,
-                 norm_layer: str = "batch_norm_2d",
-                 activation_layer: str = "relu",
-                 ):
-        """
-        Causal Conv2d by delaying the signal for any lookahead.
-        Expected input format: [B, C, T, F]
-        :param in_channels:
-        :param out_channels:
-        :param kernel_size:
-        :param fstride:
-        :param dilation:
-        :param fpad:
-        """
-        super(CausalConv2d, self).__init__()
-        lookahead = 0
-        kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else tuple(kernel_size)
-        if fpad:
-            fpad_ = kernel_size[1] // 2 + dilation - 1
-        else:
-            fpad_ = 0
-        # for last 2 dim, pad (left, right, top, bottom).
-        pad = (0, 0, kernel_size[0] - 1 - lookahead, lookahead)
-        layers = []
-        if any(x > 0 for x in pad):
-            layers.append(nn.ConstantPad2d(pad, 0.0))
-        groups = math.gcd(in_channels, out_channels) if separable else 1
-        if groups == 1:
-            separable = False
-        if max(kernel_size) == 1:
-            separable = False
-        layers.append(
-            nn.Conv2d(
-                in_channels,
-                out_channels,
-                kernel_size=kernel_size,
-                padding=(0, fpad_),
-                stride=(1, fstride),  # stride over time is always 1
-                dilation=(1, dilation),  # dilation over time is always 1
-                groups=groups,
-                bias=bias,
-            )
-        )
-        if separable:
-            layers.append(
-                nn.Conv2d(
-                    out_channels,
-                    out_channels,
-                    kernel_size=1,
-                    bias=False,
-                )
-            )
-        if norm_layer is not None:
-            norm_layer = norm_layer_dict[norm_layer]
-            layers.append(norm_layer(out_channels))
-        if activation_layer is not None:
-            activation_layer = activation_layer_dict[activation_layer]
-            layers.append(activation_layer())
-        super().__init__(*layers)
-class CausalConvTranspose2d(nn.Sequential):
-    def __init__(self,
-                 in_channels: int,
-                 out_channels: int,
-                 kernel_size: Union[int, Iterable[int]],
-                 fstride: int = 1,
-                 dilation: int = 1,
-                 fpad: bool = True,
-                 bias: bool = True,
-                 separable: bool = False,
-                 norm_layer: str = "batch_norm_2d",
-                 activation_layer: str = "relu",
-                 ):
-        """
-        Causal ConvTranspose2d.
-        Expected input format: [B, C, T, F]
-        """
-        super(CausalConvTranspose2d, self).__init__()
-        lookahead = 0
-        kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
-        if fpad:
-            fpad_ = kernel_size[1] // 2
-        else:
-            fpad_ = 0
-        # for last 2 dim, pad (left, right, top, bottom).
-        pad = (0, 0, kernel_size[0] - 1 - lookahead, lookahead)
-        layers = []
-        if any(x > 0 for x in pad):
-            layers.append(nn.ConstantPad2d(pad, 0.0))
-        groups = math.gcd(in_channels, out_channels) if separable else 1
-        if groups == 1:
-            separable = False
-        layers.append(
-            nn.ConvTranspose2d(
-                in_channels,
-                out_channels,
-                kernel_size=kernel_size,
-                padding=(kernel_size[0] - 1, fpad_ + dilation - 1),
-                output_padding=(0, fpad_),
-                stride=(1, fstride),  # stride over time is always 1
-                dilation=(1, dilation),  # dilation over time is always 1
-                groups=groups,
-                bias=bias,
-            )
-        )
-        if separable:
-            layers.append(
-                nn.Conv2d(
-                    out_channels,
-                    out_channels,
-                    kernel_size=1,
-                    bias=False,
-                )
-            )
-        if norm_layer is not None:
-            norm_layer = norm_layer_dict[norm_layer]
-            layers.append(norm_layer(out_channels))
-        if activation_layer is not None:
-            activation_layer = activation_layer_dict[activation_layer]
-            layers.append(activation_layer())
-        super().__init__(*layers)
-class GroupedLinear(nn.Module):
-    def __init__(self, input_size: int, hidden_size: int, groups: int = 1):
-        super().__init__()
-        # self.weight: Tensor
-        self.input_size = input_size
-        self.hidden_size = hidden_size
-        self.groups = groups
-        assert input_size % groups == 0, f"Input size {input_size} not divisible by {groups}"
-        assert hidden_size % groups == 0, f"Hidden size {hidden_size} not divisible by {groups}"
-        self.ws = input_size // groups
-        self.register_parameter(
-            "weight",
-            torch.nn.Parameter(
-                torch.zeros(groups, input_size // groups, hidden_size // groups), requires_grad=True
-            ),
-        )
-        self.reset_parameters()
-    def reset_parameters(self):
-        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))  # type: ignore
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        # x: [..., I]
-        b, t, _ = x.shape
-        # new_shape = list(x.shape)[:-1] + [self.groups, self.ws]
-        new_shape = (b, t, self.groups, self.ws)
-        x = x.view(new_shape)
-        # The better way, but not supported by torchscript
-        # x = x.unflatten(-1, (self.groups, self.ws))  # [..., G, I/G]
-        x = torch.einsum("btgi,gih->btgh", x, self.weight)  # [..., G, H/G]
-        x = x.flatten(2, 3)  # [B, T, H]
-        return x
-    def __repr__(self):
-        cls = self.__class__.__name__
-        return f"{cls}(input_size: {self.input_size}, hidden_size: {self.hidden_size}, groups: {self.groups})"
-class SqueezedGRU_S(nn.Module):
-    """
-    SGE net: Video object detection with squeezed GRU and information entropy map
-    https://arxiv.org/abs/2106.07224
-    """
-    def __init__(
-        self,
-        input_size: int,
-        hidden_size: int,
-        output_size: Optional[int] = None,
-        num_layers: int = 1,
-        linear_groups: int = 8,
-        batch_first: bool = True,
-        skip_op: str = "none",
-        activation_layer: str = "identity",
-    ):
-        super().__init__()
-        self.input_size = input_size
-        self.hidden_size = hidden_size
-        self.linear_in = nn.Sequential(
-            GroupedLinear(
-                input_size=input_size,
-                hidden_size=hidden_size,
-                groups=linear_groups,
-            ),
-            activation_layer_dict[activation_layer](),
-        )
-        # gru skip operator
-        self.gru_skip_op = None
-        if skip_op == "none":
-            self.gru_skip_op = None
-        elif skip_op == "identity":
-            if not input_size != output_size:
-                raise AssertionError("Dimensions do not match")
-            self.gru_skip_op = nn.Identity()
-        elif skip_op == "grouped_linear":
-            self.gru_skip_op = GroupedLinear(
-                input_size=hidden_size,
-                hidden_size=hidden_size,
-                groups=linear_groups,
-            )
-        else:
-            raise NotImplementedError()
-        self.gru = nn.GRU(
-            input_size=hidden_size,
-            hidden_size=hidden_size,
-            num_layers=num_layers,
-            batch_first=batch_first,
-        )
-        if output_size is not None:
-            self.linear_out = nn.Sequential(
-                GroupedLinear(
-                    input_size=hidden_size,
-                    hidden_size=output_size,
-                    groups=linear_groups,
-                ),
-                activation_layer_dict[activation_layer](),
-            )
-        else:
-            self.linear_out = nn.Identity()
-    def forward(self, inputs: torch.Tensor, h=None) -> Tuple[torch.Tensor, torch.Tensor]:
-        x = self.linear_in(inputs)
-        x, h = self.gru(x, h)
-        x = self.linear_out(x)
-        if self.gru_skip_op is not None:
-            x = x + self.gru_skip_op(inputs)
-        return x, h
-class Add(nn.Module):
-    def forward(self, a, b):
-        return a + b
-class Concat(nn.Module):
-    def forward(self, a, b):
-        return torch.cat((a, b), dim=-1)
-class Encoder(nn.Module):
-    def __init__(self, config: DfNetConfig):
-        super(Encoder, self).__init__()
-        self.emb_in_dim = config.conv_channels * config.erb_bins // 4
-        self.emb_out_dim = config.conv_channels * config.erb_bins // 4
-        self.emb_hidden_dim = config.emb_hidden_dim
-        self.erb_conv0 = CausalConv2d(
-            in_channels=1,
-            out_channels=config.conv_channels,
-            kernel_size=config.conv_kernel_size_input,
-            bias=False,
-            separable=True,
-        )
-        self.erb_conv1 = CausalConv2d(
-            in_channels=config.conv_channels,
-            out_channels=config.conv_channels,
-            kernel_size=config.conv_kernel_size_inner,
-            bias=False,
-            separable=True,
-            fstride=2,
-        )
-        self.erb_conv2 = CausalConv2d(
-            in_channels=config.conv_channels,
-            out_channels=config.conv_channels,
-            kernel_size=config.conv_kernel_size_inner,
-            bias=False,
-            separable=True,
-            fstride=2,
-        )
-        self.erb_conv3 = CausalConv2d(
-            in_channels=config.conv_channels,
-            out_channels=config.conv_channels,
-            kernel_size=config.conv_kernel_size_inner,
-            bias=False,
-            separable=True,
-            fstride=1,
-        )
-        self.df_conv0 = CausalConv2d(
-            in_channels=2,
-            out_channels=config.conv_channels,
-            kernel_size=config.conv_kernel_size_input,
-            bias=False,
-            separable=True,
-        )
-        self.df_conv1 = CausalConv2d(
-            in_channels=config.conv_channels,
-            out_channels=config.conv_channels,
-            kernel_size=config.conv_kernel_size_inner,
-            bias=False,
-            separable=True,
-            fstride=2,
-        )
-        self.df_fc_emb = nn.Sequential(
-            GroupedLinear(
-                config.conv_channels * config.df_bins // 2,
-                self.emb_in_dim,
-                groups=config.encoder_linear_groups
-            ),
-            nn.ReLU(inplace=True)
-        )
-        if config.encoder_concat:
-            self.emb_in_dim *= 2
-            self.combine = Concat()
-        else:
-            self.combine = Add()
-        self.emb_gru = SqueezedGRU_S(
-            self.emb_in_dim,
-            self.emb_hidden_dim,
-            output_size=self.emb_out_dim,
-            num_layers=1,
-            batch_first=True,
-            skip_op=config.encoder_gru_skip_op,
-            linear_groups=config.encoder_squeezed_gru_linear_groups,
-            activation_layer="relu",
-        )
-        self.lsnr_fc = nn.Sequential(
-            nn.Linear(self.emb_out_dim, 1),
-            nn.Sigmoid()
-        )
-        self.lsnr_scale = config.lsnr_max - config.lsnr_min
-        self.lsnr_offset = config.lsnr_min
-    def forward(self,
-                feat_erb: torch.Tensor,
-                feat_spec: torch.Tensor,
-                h: torch.Tensor = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        # Encodes erb; erb should be in dB scale + normalized; Fe are number of erb bands.
-        # erb: [B, 1, T, Fe]
-        # spec: [B, 2, T, Fc]
-        # b, _, t, _ = feat_erb.shape
-        e0 = self.erb_conv0(feat_erb)  # [B, C, T, F]
-        e1 = self.erb_conv1(e0)  # [B, C*2, T, F/2]
-        e2 = self.erb_conv2(e1)  # [B, C*4, T, F/4]
-        e3 = self.erb_conv3(e2)  # [B, C*4, T, F/4]
-        c0 = self.df_conv0(feat_spec)  # [B, C, T, Fc]
-        c1 = self.df_conv1(c0)  # [B, C*2, T, Fc/2]
-        cemb = c1.permute(0, 2, 3, 1).flatten(2)  # [B, T, -1]
-        cemb = self.df_fc_emb(cemb)  # [B, T, C * F/4]
-        emb = e3.permute(0, 2, 3, 1).flatten(2)  # [B, T, C * F]
-        emb = self.combine(emb, cemb)
-        emb, h = self.emb_gru(emb, h)  # [B, T, -1]
-        lsnr = self.lsnr_fc(emb) * self.lsnr_scale + self.lsnr_offset
-        return e0, e1, e2, e3, emb, c0, lsnr, h
-class ErbDecoder(nn.Module):
-    def __init__(self,
-                 config: DfNetConfig,
-                 ):
-        super(ErbDecoder, self).__init__()
-        if config.erb_bins % 8 != 0:
-            raise AssertionError("erb_bins should be divisible by 8")
-        self.emb_in_dim = config.conv_channels * config.erb_bins // 4
-        self.emb_out_dim = config.conv_channels * config.erb_bins // 4
-        self.emb_hidden_dim = config.emb_hidden_dim
-        self.emb_gru = SqueezedGRU_S(
-            self.emb_in_dim,
-            self.emb_hidden_dim,
-            output_size=self.emb_out_dim,
-            num_layers=config.erb_decoder_emb_num_layers - 1,
-            batch_first=True,
-            skip_op=config.erb_decoder_gru_skip_op,
-            linear_groups=config.erb_decoder_linear_groups,
-            activation_layer="relu",
-        )
-        # convt: TransposedConvolution, convp: Pathway (encoder to decoder) convolutions
-        self.conv3p = CausalConv2d(
-            in_channels=config.conv_channels,
-            out_channels=config.conv_channels,
-            kernel_size=1,
-            bias=False,
-            separable=True,
-        )
-        self.convt3 = CausalConv2d(
-            in_channels=config.conv_channels,
-            out_channels=config.conv_channels,
-            kernel_size=config.conv_kernel_size_inner,
-            bias=False,
-            separable=True,
-        )
-        self.conv2p = CausalConv2d(
-            in_channels=config.conv_channels,
-            out_channels=config.conv_channels,
-            kernel_size=1,
-            bias=False,
-            separable=True,
-        )
-        self.convt2 = CausalConvTranspose2d(
-            in_channels=config.conv_channels,
-            out_channels=config.conv_channels,
-            fstride=2,
-            kernel_size=config.convt_kernel_size_inner,
-            bias=False,
-            separable=True,
-        )
-        self.conv1p = CausalConv2d(
-            in_channels=config.conv_channels,
-            out_channels=config.conv_channels,
-            kernel_size=1,
-            bias=False,
-            separable=True,
-        )
-        self.convt1 = CausalConvTranspose2d(
-            in_channels=config.conv_channels,
-            out_channels=config.conv_channels,
-            fstride=2,
-            kernel_size=config.convt_kernel_size_inner,
-            bias=False,
-            separable=True,
-        )
-        self.conv0p = CausalConv2d(
-            in_channels=config.conv_channels,
-            out_channels=config.conv_channels,
-            kernel_size=1,
-            bias=False,
-            separable=True,
-        )
-        self.conv0_out = CausalConv2d(
-            in_channels=config.conv_channels,
-            out_channels=1,
-            kernel_size=config.conv_kernel_size_inner,
-            activation_layer="sigmoid",
-            bias=False,
-            separable=True,
-        )
-    def forward(self, emb, e3, e2, e1, e0) -> torch.Tensor:
-        # Estimates erb mask
-        b, _, t, f8 = e3.shape
-        emb, _ = self.emb_gru(emb)
-        emb = emb.view(b, t, f8, -1).permute(0, 3, 1, 2)  # [B, C*8, T, F/8]
-        e3 = self.convt3(self.conv3p(e3) + emb)  # [B, C*4, T, F/4]
-        e2 = self.convt2(self.conv2p(e2) + e3)  # [B, C*2, T, F/2]
-        e1 = self.convt1(self.conv1p(e1) + e2)  # [B, C, T, F]
-        m = self.conv0_out(self.conv0p(e0) + e1)  # [B, 1, T, F]
-        return m
-class Mask(nn.Module):
-    def __init__(self, erb_inv_fb: torch.FloatTensor, post_filter: bool = False, eps: float = 1e-12):
-        super().__init__()
-        self.erb_inv_fb: torch.FloatTensor
-        self.register_buffer("erb_inv_fb", erb_inv_fb.float())
-        self.clamp_tensor = torch.__version__ > "1.9.0" or torch.__version__ == "1.9.0"
-        self.post_filter = post_filter
-        self.eps = eps
-    def pf(self, mask: torch.Tensor, beta: float = 0.02) -> torch.Tensor:
-        """
-        Post-Filter
-        A Perceptually-Motivated Approach for Low-Complexity, Real-Time Enhancement of Fullband Speech.
-        https://arxiv.org/abs/2008.04259
-        :param mask: Real valued mask, typically of shape [B, C, T, F].
-        :param beta: Global gain factor.
-        :return:
-        """
-        mask_sin = mask * torch.sin(np.pi * mask / 2)
-        mask_pf = (1 + beta) * mask / (1 + beta * mask.div(mask_sin.clamp_min(self.eps)).pow(2))
-        return mask_pf
-    def forward(self, spec: torch.Tensor, mask: torch.Tensor, atten_lim: Optional[torch.Tensor] = None) -> torch.Tensor:
-        # spec (real) [B, 1, T, F, 2], F: freq_bins
-        # mask (real): [B, 1, T, Fe], Fe: erb_bins
-        # atten_lim: [B]
-        if not self.training and self.post_filter:
-            mask = self.pf(mask)
-        if atten_lim is not None:
-            # dB to amplitude
-            atten_lim = 10 ** (-atten_lim / 20)
-            # Greater equal (__ge__) not implemented for TorchVersion.
-            if self.clamp_tensor:
-                # Supported by torch >= 1.9
-                mask = mask.clamp(min=atten_lim.view(-1, 1, 1, 1))
-            else:
-                m_out = []
-                for i in range(atten_lim.shape[0]):
-                    m_out.append(mask[i].clamp_min(atten_lim[i].item()))
-                mask = torch.stack(m_out, dim=0)
-        mask = mask.matmul(self.erb_inv_fb)  # [B, 1, T, F]
-        if not spec.is_complex():
-            mask = mask.unsqueeze(4)
-        return spec * mask
-class DfDecoder(nn.Module):
-    def __init__(self,
-                 config: DfNetConfig,
-                 ):
-        super().__init__()
-        layer_width = config.conv_channels
-        self.emb_in_dim = config.conv_channels * config.erb_bins // 4
-        self.emb_dim = config.df_hidden_dim
-        self.df_n_hidden = config.df_hidden_dim
-        self.df_n_layers = config.df_num_layers
-        self.df_order = config.df_order
-        self.df_bins = config.df_bins
-        self.df_out_ch = config.df_order * 2
-        self.df_convp = CausalConv2d(
-            layer_width,
-            self.df_out_ch,
-            fstride=1,
-            kernel_size=(config.df_pathway_kernel_size_t, 1),
-            separable=True,
-            bias=False,
-        )
-        self.df_gru = SqueezedGRU_S(
-            self.emb_in_dim,
-            self.emb_dim,
-            num_layers=self.df_n_layers,
-            batch_first=True,
-            skip_op="none",
-            activation_layer="relu",
-        )
-        if config.df_gru_skip == "none":
-            self.df_skip = None
-        elif config.df_gru_skip == "identity":
-            if config.emb_hidden_dim != config.df_hidden_dim:
-                raise AssertionError("Dimensions do not match")
-            self.df_skip = nn.Identity()
-        elif config.df_gru_skip == "grouped_linear":
-            self.df_skip = GroupedLinear(self.emb_in_dim, self.emb_dim, groups=config.df_decoder_linear_groups)
-        else:
-            raise NotImplementedError()
-        self.df_out: nn.Module
-        out_dim = self.df_bins * self.df_out_ch
-        self.df_out = nn.Sequential(
-            GroupedLinear(
-                input_size=self.df_n_hidden,
-                hidden_size=out_dim,
-                groups=config.df_decoder_linear_groups
-            ),
-            nn.Tanh()
-        )
-        self.df_fc_a = nn.Sequential(
-            nn.Linear(self.df_n_hidden, 1),
-            nn.Sigmoid()
-        )
-    def forward(self, emb: torch.Tensor, c0: torch.Tensor) -> torch.Tensor:
-        b, t, _ = emb.shape
-        c, _ = self.df_gru(emb)  # [B, T, H], H: df_n_hidden
-        if self.df_skip is not None:
-            c = c + self.df_skip(emb)
-        c0 = self.df_convp(c0).permute(0, 2, 3, 1)  # [B, T, F, O*2], channels_last
-        c = self.df_out(c)  # [B, T, F*O*2], O: df_order
-        c = c.view(b, t, self.df_bins, self.df_out_ch) + c0  # [B, T, F, O*2]
-        return c
-class DfOutputReshapeMF(nn.Module):
-    """Coefficients output reshape for multiframe/MultiFrameModule
-    Requires input of shape B, C, T, F, 2.
-    """
-    def __init__(self, df_order: int, df_bins: int):
-        super().__init__()
-        self.df_order = df_order
-        self.df_bins = df_bins
-    def forward(self, coefs: torch.Tensor) -> torch.Tensor:
-        # [B, T, F, O*2] -> [B, O, T, F, 2]
-        new_shape = list(coefs.shape)
-        new_shape[-1] = -1
-        new_shape.append(2)
-        coefs = coefs.view(new_shape)
-        coefs = coefs.permute(0, 3, 1, 2, 4)
-        return coefs
-class DfNet(nn.Module):
-    """
-    DeepFilterNet: Perceptually Motivated Real-Time Speech Enhancement
-    https://arxiv.org/abs/2305.08227
-    [email protected]
-    """
-    def __init__(self,
-                 config: DfNetConfig,
-                 erb_fb: torch.FloatTensor,
-                 erb_inv_fb: torch.FloatTensor,
-                 run_df: bool = True,
-                 train_mask: bool = True,
-                 ):
-        """
-        :param erb_fb: erb filter bank.
-        """
-        super(DfNet, self).__init__()
-        if config.erb_bins % 8 != 0:
-            raise AssertionError("erb_bins should be divisible by 8")
-        self.df_lookahead = config.df_lookahead
-        self.df_bins = config.df_bins
-        self.freq_bins: int = config.fft_size // 2 + 1
-        self.emb_dim: int = config.conv_channels * config.erb_bins
-        self.erb_bins: int = config.erb_bins
-        if config.conv_lookahead > 0:
-            if config.conv_lookahead < config.df_lookahead:
-                raise AssertionError
-            # for last 2 dim, pad (left, right, top, bottom).
-            self.pad_feat = nn.ConstantPad2d((0, 0, -config.conv_lookahead, config.conv_lookahead), 0.0)
-        else:
-            self.pad_feat = nn.Identity()
-        if config.df_lookahead > 0:
-            # for last 3 dim, pad (left, right, top, bottom, front, back).
-            self.pad_spec = nn.ConstantPad3d((0, 0, 0, 0, -config.df_lookahead, config.df_lookahead), 0.0)
-        else:
-            self.pad_spec = nn.Identity()
-        self.register_buffer("erb_fb", erb_fb)
-        self.enc = Encoder(config)
-        self.erb_dec = ErbDecoder(config)
-        self.mask = Mask(erb_inv_fb)
-        self.erb_inv_fb = erb_inv_fb
-        self.post_filter = config.mask_post_filter
-        self.post_filter_beta = config.post_filter_beta
-        self.df_order = config.df_order
-        self.df_op = MF.DF(num_freqs=config.df_bins, frame_size=config.df_order, lookahead=self.df_lookahead)
-        self.df_dec = DfDecoder(config)
-        self.df_out_transform = DfOutputReshapeMF(self.df_order, config.df_bins)
-        self.run_erb = config.df_bins + 1 < self.freq_bins
-        if not self.run_erb:
-            logger.warning("Running without ERB stage")
-        self.run_df = run_df
-        if not run_df:
-            logger.warning("Running without DF stage")
-        self.train_mask = train_mask
-        self.lsnr_dropout = config.lsnr_dropout
-        if config.df_n_iter != 1:
-            raise AssertionError
-    def forward1(
-        self,
-        spec: torch.Tensor,
-        feat_erb: torch.Tensor,
-        feat_spec: torch.Tensor,  # Complex spectrogram features [B, 1, T, F', 2]; permuted, padded and fed to the encoder
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Forward method of DeepFilterNet2.
-        Args:
-            spec (Tensor): Spectrum of shape [B, 1, T, F, 2]
-            feat_erb (Tensor): ERB features of shape [B, 1, T, E]
-            feat_spec (Tensor): Complex spectrogram features of shape [B, 1, T, F', 2]
-        Returns:
-            spec (Tensor): Enhanced spectrum of shape [B, 1, T, F, 2]
-            m (Tensor): ERB mask estimate of shape [B, 1, T, E]
-            lsnr (Tensor): Local SNR estimate of shape [B, T, 1]
-        """
-        # feat_spec shape: [batch_size, 1, time_steps, freq_dim, 2]
-        feat_spec = feat_spec.squeeze(1).permute(0, 3, 1, 2)
-        # feat_spec shape: [batch_size, 2, time_steps, freq_dim]
-        # feat_erb shape: [batch_size, 1, time_steps, erb_bins]
-        # assert time_steps >= conv_lookahead.
-        feat_erb = self.pad_feat(feat_erb)
-        feat_spec = self.pad_feat(feat_spec)
-        e0, e1, e2, e3, emb, c0, lsnr, h = self.enc(feat_erb, feat_spec)
-        if self.lsnr_droput:
-            idcs = lsnr.squeeze() > -10.0
-            b, t = (spec.shape[0], spec.shape[2])
-            m = torch.zeros((b, 1, t, self.erb_bins), device=spec.device)
-            df_coefs = torch.zeros((b, t, self.nb_df, self.df_order * 2))
-            spec_m = spec.clone()
-            emb = emb[:, idcs]
-            e0 = e0[:, :, idcs]
-            e1 = e1[:, :, idcs]
-            e2 = e2[:, :, idcs]
-            e3 = e3[:, :, idcs]
-            c0 = c0[:, :, idcs]
-        if self.run_erb:
-            if self.lsnr_dropout:
-                m[:, :, idcs] = self.erb_dec(emb, e3, e2, e1, e0)
-            else:
-                m = self.erb_dec(emb, e3, e2, e1, e0)
-            spec_m = self.mask(spec, m)
-        else:
-            m = torch.zeros((), device=spec.device)
-            spec_m = torch.zeros_like(spec)
-        if self.run_df:
-            if self.lsnr_dropout:
-                df_coefs[:, idcs] = self.df_dec(emb, c0)
-            else:
-                df_coefs = self.df_dec(emb, c0)
-            df_coefs = self.df_out_transform(df_coefs)
-            spec_e = self.df_op(spec.clone(), df_coefs)
-            spec_e[..., self.df_bins:, :] = spec_m[..., self.df_bins:, :]
-        else:
-            df_coefs = torch.zeros((), device=spec.device)
-            spec_e = spec_m
-        if self.post_filter:
-            beta = self.post_filter_beta
-            eps = 1e-12
-            mask = (utils.as_complex(spec_e).abs() / utils.as_complex(spec).abs().add(eps)).clamp(eps, 1)
-            mask_sin = mask * torch.sin(PI * mask / 2).clamp_min(eps)
-            pf = (1 + beta) / (1 + beta * mask.div(mask_sin).pow(2))
-            spec_e = spec_e * pf.unsqueeze(-1)
-        return spec_e, m, lsnr, df_coefs
-    def forward(
-        self,
-        spec: torch.Tensor,
-        feat_erb: torch.Tensor,
-        feat_spec: torch.Tensor,  # Complex spectrogram features [B, 1, T, F', 2]; permuted, padded and fed to the encoder
-        erb_encoder_h: torch.Tensor = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        # feat_spec shape: [batch_size, 1, time_steps, freq_dim, 2]
-        feat_spec = feat_spec.squeeze(1).permute(0, 3, 1, 2)
-        # feat_spec shape: [batch_size, 2, time_steps, freq_dim]
-        # feat_erb shape: [batch_size, 1, time_steps, erb_bins]
-        # assert time_steps >= conv_lookahead.
-        feat_erb = self.pad_feat(feat_erb)
-        feat_spec = self.pad_feat(feat_spec)
-        e0, e1, e2, e3, emb, c0, lsnr, erb_encoder_h = self.enc(feat_erb, feat_spec, erb_encoder_h)
-        m = self.erb_dec(emb, e3, e2, e1, e0)
-        spec_m = self.mask(spec, m)
-        # spec_e = spec_m
-        df_coefs = self.df_dec(emb, c0)
-        df_coefs = self.df_out_transform(df_coefs)
-        spec_e = self.df_op(spec.clone(), df_coefs)
-        spec_e[..., self.df_bins:, :] = spec_m[..., self.df_bins:, :]
-        return spec_e, m, lsnr, df_coefs, erb_encoder_h
-if __name__ == "__main__":
-    pass

toolbox/torchaudio/models/dfnet3/multiframes.py DELETED Viewed

@@ -1,145 +0,0 @@
-#!/usr/bin/python3
-# -*- coding: utf-8 -*-
-import torch
-import torch.nn as nn
-# From torchaudio
-def _compute_mat_trace(input: torch.Tensor, dim1: int = -2, dim2: int = -1) -> torch.Tensor:
-    r"""Compute the trace of a Tensor along ``dim1`` and ``dim2`` dimensions.
-    Args:
-        input (torch.Tensor): Tensor of dimension `(..., channel, channel)`
-        dim1 (int, optional): the first dimension of the diagonal matrix
-            (Default: -2)
-        dim2 (int, optional): the second dimension of the diagonal matrix
-            (Default: -1)
-    Returns:
-        Tensor: trace of the input Tensor
-    """
-    assert input.ndim >= 2, "The dimension of the tensor must be at least 2."
-    assert (
-        input.shape[dim1] == input.shape[dim2]
-    ), "The size of ``dim1`` and ``dim2`` must be the same."
-    input = torch.diagonal(input, 0, dim1=dim1, dim2=dim2)
-    return input.sum(dim=-1)
-def _tik_reg(mat: torch.Tensor, reg: float = 1e-7, eps: float = 1e-8) -> torch.Tensor:
-    """Perform Tikhonov regularization (only modifying real part).
-    Args:
-        mat (torch.Tensor): input matrix (..., channel, channel)
-        reg (float, optional): regularization factor (Default: 1e-7)
-        eps (float, optional): a value to avoid the correlation matrix is all-zero (Default: ``1e-8``)
-    Returns:
-        Tensor: regularized matrix (..., channel, channel)
-    """
-    # Add eps
-    C = mat.size(-1)
-    eye = torch.eye(C, dtype=mat.dtype, device=mat.device)
-    epsilon = _compute_mat_trace(mat).real[..., None, None] * reg
-    # in case that correlation_matrix is all-zero
-    epsilon = epsilon + eps
-    mat = mat + epsilon * eye[..., :, :]
-    return mat
-class MultiFrameModule(nn.Module):
-    """
-    Multi-frame speech enhancement modules.
-    Signal model and notation:
-        Noisy: `x = s + n`
-        Enhanced: `y = f(x)`
-        Objective: `min ||s - y||`
-        PSD: Power spectral density, notated eg. as `Rxx` for noisy PSD.
-        IFC: Inter-frame correlation vector: PSD*u, u: selection vector. Notated as `rxx`
-        RTF: Relative transfer function, also called steering vector.
-    """
-    def __init__(self, num_freqs: int, frame_size: int, lookahead: int = 0, real: bool = False):
-        """
-        Multi-Frame filtering module.
-        :param num_freqs: int. Number of frequency bins used for filtering.
-        :param frame_size: int. Frame size in FD domain.
-        :param lookahead: int. Lookahead, may be used to select the output time step.
-                Note: This module does not add additional padding according to lookahead!
-        :param real:
-        """
-        super().__init__()
-        self.num_freqs = num_freqs
-        self.frame_size = frame_size
-        self.real = real
-        if real:
-            self.pad = nn.ConstantPad3d((0, 0, 0, 0, frame_size - 1 - lookahead, lookahead), 0.0)
-        else:
-            self.pad = nn.ConstantPad2d((0, 0, frame_size - 1 - lookahead, lookahead), 0.0)
-        self.need_unfold = frame_size > 1
-        self.lookahead = lookahead
-    def spec_unfold_real(self, spec: torch.Tensor):
-        if self.need_unfold:
-            spec = self.pad(spec).unfold(-3, self.frame_size, 1)
-            return spec.permute(0, 1, 5, 2, 3, 4)
-            # return as_windowed(self.pad(spec), self.frame_size, 1, dim=-3)
-        return spec.unsqueeze(-1)
-    def spec_unfold(self, spec: torch.Tensor):
-        """Pads and unfolds the spectrogram according to frame_size.
-        Args:
-            spec (complex Tensor): Spectrogram of shape [B, C, T, F]
-        Returns:
-            spec (Tensor): Unfolded spectrogram of shape [B, C, T, F, N], where N: frame_size.
-        """
-        if self.need_unfold:
-            return self.pad(spec).unfold(2, self.frame_size, 1)
-        return spec.unsqueeze(-1)
-    @staticmethod
-    def solve(Rxx, rss, diag_eps: float = 1e-8, eps: float = 1e-7) -> torch.Tensor:
-        return torch.einsum(
-            "...nm,...m->...n", torch.inverse(_tik_reg(Rxx, diag_eps, eps)), rss
-        )  # [T, F, N]
-    @staticmethod
-    def apply_coefs(spec: torch.Tensor, coefs: torch.Tensor) -> torch.Tensor:
-        # spec: [B, C, T, F, N]
-        # coefs: [B, C, T, F, N]
-        return torch.einsum("...n,...n->...", spec, coefs)
-class DF(MultiFrameModule):
-    """Deep Filtering."""
-    def __init__(self, num_freqs: int, frame_size: int, lookahead: int = 0, conj: bool = False):
-        super().__init__(num_freqs, frame_size, lookahead)
-        self.conj: bool = conj
-    def forward(self, spec: torch.Tensor, coefs: torch.Tensor):
-        spec_u = self.spec_unfold(torch.view_as_complex(spec))
-        coefs = torch.view_as_complex(coefs)
-        spec_f = spec_u.narrow(-2, 0, self.num_freqs)
-        coefs = coefs.view(coefs.shape[0], -1, self.frame_size, *coefs.shape[2:])
-        if self.conj:
-            coefs = coefs.conj()
-        spec_f = self.df(spec_f, coefs)
-        if self.training:
-            spec = spec.clone()
-        spec[..., : self.num_freqs, :] = torch.view_as_real(spec_f)
-        return spec
-    @staticmethod
-    def df(spec: torch.Tensor, coefs: torch.Tensor) -> torch.Tensor:
-        """
-        Deep filter implementation using `torch.einsum`. Requires unfolded spectrogram.
-        :param spec: (complex Tensor). Spectrogram of shape [B, C, T, F, N].
-        :param coefs: (complex Tensor). Coefficients of shape [B, C, N, T, F].
-        :return: (complex Tensor). Spectrogram of shape [B, C, T, F].
-        """
-        return torch.einsum("...tfn,...ntf->...tf", spec, coefs)
-if __name__ == '__main__':
-    pass

toolbox/torchaudio/models/dfnet3/utils.py DELETED Viewed

@@ -1,17 +0,0 @@
-#!/usr/bin/python3
-# -*- coding: utf-8 -*-
-import torch
-def as_complex(x: torch.Tensor):
-    if torch.is_complex(x):
-        return x
-    if x.shape[-1] != 2:
-        raise ValueError(f"Last dimension need to be of length 2 (re + im), but got {x.shape}")
-    if x.stride(-1) != 1:
-        x = x.contiguous()
-    return torch.view_as_complex(x)
-if __name__ == '__main__':
-    pass

toolbox/torchaudio/models/dtln/modeling_dtln.py CHANGED Viewed

@@ -2,6 +2,10 @@
 # -*- coding: utf-8 -*-
 """
 https://github.com/AkenoSyuRi/DTLNPytorch
 """
 import os
 from typing import Optional, Union

 # -*- coding: utf-8 -*-
 """
 https://github.com/AkenoSyuRi/DTLNPytorch
+https://github.com/breizhn/DTLN
+Trained on 500 hours of DNS3 data; reaches a PESQ of about 3.04 on the DNS3 test set.
 """
 import os
 from typing import Optional, Union

toolbox/torchaudio/models/frcrn/modeling_frcrn.py CHANGED Viewed

@@ -6,6 +6,8 @@ https://arxiv.org/abs/2206.07293
 https://github.com/modelscope/modelscope/blob/master/modelscope/models/audio/ans/frcrn.py
 https://huggingface.co/spaces/alibabasglab/ClearVoice/blob/main/models/frcrn_se/frcrn.py
 """
 import os
 from typing import Optional, Union

 https://github.com/modelscope/modelscope/blob/master/modelscope/models/audio/ans/frcrn.py
 https://huggingface.co/spaces/alibabasglab/ClearVoice/blob/main/models/frcrn_se/frcrn.py
+https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice/clearvoice/models/frcrn_se
 """
 import os
 from typing import Optional, Union

toolbox/torchaudio/models/gtcrn/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+if __name__ == "__main__":
+    pass

toolbox/torchaudio/models/gtcrn/modeling_gtcrn.py ADDED Viewed

	@@ -0,0 +1,15 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+"""
+https://blog.csdn.net/gitblog_00478/article/details/141522595
+https://github.com/Xiaobin-Rong/gtcrn/blob/main/gtcrn.py
+https://github.com/Xiaobin-Rong/gtcrn/blob/main/stream/gtcrn_stream.py
+"""
+import torch
+import torch.nn as nn
+from typing import List, Tuple, Union
+if __name__ == "__main__":
+    pass

toolbox/torchaudio/models/lstm/modeling_lstm.py CHANGED Viewed

@@ -85,13 +85,14 @@ class LstmModel(nn.Module):
         if remainder > 0:
             n_samples_pad = self.hop_size - remainder
             signal = F.pad(signal, pad=(0, n_samples_pad), mode="constant", value=0)
-        return signal, n_samples
     def forward(self,
                 noisy: torch.Tensor,
                 h_state: Tuple[torch.Tensor, torch.Tensor] = None,
                 ):
-        noisy, num_samples = self.signal_prepare(noisy)
         batch_size, _, num_samples_pad = noisy.shape
         # print(f"num_samples: {num_samples}, num_samples_pad: {num_samples_pad}")
@@ -207,7 +208,7 @@ def main():
     model.eval()
     noisy = torch.randn(size=(1, 16000), dtype=torch.float32)
-    noisy, _ = model.signal_prepare(noisy)
     b, _, num_samples = noisy.shape
     t = (num_samples - config.win_size) / config.hop_size + 1

         if remainder > 0:
             n_samples_pad = self.hop_size - remainder
             signal = F.pad(signal, pad=(0, n_samples_pad), mode="constant", value=0)
+        return signal
     def forward(self,
                 noisy: torch.Tensor,
                 h_state: Tuple[torch.Tensor, torch.Tensor] = None,
                 ):
+        num_samples = noisy.shape[-1]
+        noisy = self.signal_prepare(noisy)
         batch_size, _, num_samples_pad = noisy.shape
         # print(f"num_samples: {num_samples}, num_samples_pad: {num_samples_pad}")
     model.eval()
     noisy = torch.randn(size=(1, 16000), dtype=torch.float32)
+    noisy = model.signal_prepare(noisy)
     b, _, num_samples = noisy.shape
     t = (num_samples - config.win_size) / config.hop_size + 1

toolbox/torchaudio/models/rnnoise/configuration_rnnoise.py ADDED Viewed

	@@ -0,0 +1,77 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+from toolbox.torchaudio.configuration_utils import PretrainedConfig
+class RNNoiseConfig(PretrainedConfig):
+    def __init__(self,
+                 sample_rate: int = 8000,
+                 segment_size: int = 32000,
+                 nfft: int = 512,
+                 win_size: int = 512,
+                 hop_size: int = 256,
+                 win_type: str = "hann",
+                 erb_bins: int = 32,
+                 min_freq_bins_for_erb: int = 2,
+                 conv_size: int = 128,
+                 gru_size: int = 256,
+                 min_snr_db: float = -10,
+                 max_snr_db: float = 20,
+                 max_epochs: int = 100,
+                 batch_size: int = 4,
+                 num_workers: int = 4,
+                 seed: int = 1234,
+                 lr: float = 0.001,
+                 lr_scheduler: str = "CosineAnnealingLR",
+                 lr_scheduler_kwargs: dict = None,
+                 weight_decay: float = 0.00001,
+                 clip_grad_norm: float = 10.,
+                 eval_steps: int = 25000,
+                 **kwargs
+                 ):
+        super(RNNoiseConfig, self).__init__(**kwargs)
+        self.sample_rate = sample_rate
+        self.segment_size = segment_size
+        self.nfft = nfft
+        self.win_size = win_size
+        self.hop_size = hop_size
+        self.win_type = win_type
+        self.erb_bins = erb_bins
+        self.min_freq_bins_for_erb = min_freq_bins_for_erb
+        self.conv_size = conv_size
+        self.gru_size = gru_size
+        self.min_snr_db = min_snr_db
+        self.max_snr_db = max_snr_db
+        self.max_epochs = max_epochs
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+        self.seed = seed
+        self.lr = lr
+        self.lr_scheduler = lr_scheduler
+        self.lr_scheduler_kwargs = lr_scheduler_kwargs or dict()
+        self.weight_decay = weight_decay
+        self.clip_grad_norm = clip_grad_norm
+        self.eval_steps = eval_steps
+def main():
+    config = RNNoiseConfig()
+    config.to_yaml_file("yaml/config.yaml")
+    return
+if __name__ == "__main__":
+    main()

toolbox/torchaudio/models/rnnoise/modeling_rnnoise.py CHANGED Viewed

@@ -2,10 +2,401 @@
 # -*- coding: utf-8 -*-
 """
 https://github.com/xiph/rnnoise
 https://arxiv.org/abs/1709.08243
 """
-if __name__ == '__main__':
-    pass

 # -*- coding: utf-8 -*-
 """
 https://github.com/xiph/rnnoise
+https://github.com/xiph/rnnoise/blob/main/torch/rnnoise/rnnoise.py
 https://arxiv.org/abs/1709.08243
 """
+import os
+from typing import Optional, Union, Tuple
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from toolbox.torch.sparsification.gru_sparsifier import GRUSparsifier
+from toolbox.torchaudio.models.rnnoise.configuration_rnnoise import RNNoiseConfig
+from toolbox.torchaudio.configuration_utils import CONFIG_FILE
+from toolbox.torchaudio.modules.conv_stft import ConvSTFT, ConviSTFT
+from toolbox.torchaudio.modules.freq_bands.erb_bands import ErbBands
+# Sparsification schedule shared by every GRUSparsifier created in RNNoise.__init__;
+# the four values are forwarded as its start / stop / interval / exponent arguments.
+sparsify_start = 6000
+sparsify_stop = 20000
+sparsify_interval = 100
+sparsify_exponent = 3
+# Per-weight-matrix settings handed to GRUSparsifier together with each GRU layer.
+# NOTE(review): each tuple is presumably (target density, block shape, keep-diagonal flag)
+# as in the upstream xiph/rnnoise sparsifier - confirm against GRUSparsifier.
+sparse_params1 = {
+    "W_hr": (0.3, [8, 4], True),
+    "W_hz": (0.2, [8, 4], True),
+    "W_hn": (0.5, [8, 4], True),
+    "W_ir": (0.3, [8, 4], False),
+    "W_iz": (0.2, [8, 4], False),
+    "W_in": (0.5, [8, 4], False),
+}
+def init_weights(module):
+    """Orthogonally initialise the hidden-to-hidden weights of an ``nn.GRU``.
+    Intended as a callback for ``nn.Module.apply``: any module that is not an
+    ``nn.GRU`` is left untouched, and within a GRU only parameters whose name
+    starts with ``weight_hh_`` are re-initialised (in place).
+    """
+    if not isinstance(module, nn.GRU):
+        return
+    for name, param in module.named_parameters():
+        if name.startswith("weight_hh_"):
+            nn.init.orthogonal_(param)
+class RNNoise(nn.Module):
+    """RNNoise-style denoiser that predicts a gain per ERB band.
+    The noisy waveform goes through an STFT; the magnitude is projected onto
+    ``erb_bins`` bands (in dB), two unpadded kernel-size-3 convolutions and three
+    stacked GRUs produce features, a linear layer with a sigmoid turns them into a
+    per-band mask, and the mask is mapped back to STFT bins and applied to the noisy
+    magnitude while the noisy phase is reused for the inverse STFT.
+    """
+    def __init__(self,
+                 sample_rate: int = 8000,
+                 nfft: int = 512,
+                 win_size: int = 512,
+                 hop_size: int = 256,
+                 win_type: str = "hann",
+                 erb_bins: int = 32,
+                 min_freq_bins_for_erb: int = 2,
+                 conv_size: int = 128,
+                 gru_size: int = 256,
+                 ):
+        """Build the STFT front/back end, the conv + GRU stack and one sparsifier per GRU.
+        Args:
+            sample_rate: forwarded to ``ErbBands``.
+            nfft, win_size, hop_size, win_type: forwarded to ``ConvSTFT`` / ``ConviSTFT``.
+            erb_bins, min_freq_bins_for_erb: forwarded to ``ErbBands``; ``erb_bins`` is
+                also the input width of ``conv1`` and the output width of ``dense_out``.
+            conv_size: output channels of ``conv1``.
+            gru_size: output channels of ``conv2`` and width of every GRU layer.
+        """
+        super(RNNoise, self).__init__()
+        self.sample_rate = sample_rate
+        self.nfft = nfft
+        self.win_size = win_size
+        self.hop_size = hop_size
+        self.win_type = win_type
+        self.erb_bins = erb_bins
+        self.min_freq_bins_for_erb = min_freq_bins_for_erb
+        self.conv_size = conv_size
+        self.gru_size = gru_size
+        self.input_dim = nfft // 2 + 1
+        self.eps = 1e-12
+        self.erb_bands = ErbBands(
+            sample_rate=self.sample_rate,
+            nfft=self.nfft,
+            erb_bins=self.erb_bins,
+            min_freq_bins_for_erb=self.min_freq_bins_for_erb,
+        )
+        self.stft = ConvSTFT(
+            nfft=self.nfft,
+            win_size=self.win_size,
+            hop_size=self.hop_size,
+            win_type=self.win_type,
+            power=None,
+            requires_grad=False
+        )
+        self.istft = ConviSTFT(
+            nfft=self.nfft,
+            win_size=self.win_size,
+            hop_size=self.hop_size,
+            win_type=self.win_type,
+            requires_grad=False
+        )
+        # Each "valid" kernel-size-3 convolution consumes two frames; two zero frames on
+        # either side keep the number of frames unchanged through conv1 + conv2.
+        self.pad = nn.ConstantPad1d(padding=(2, 2), value=0)
+        self.conv1 = nn.Conv1d(self.erb_bins, conv_size, kernel_size=3, padding="valid")
+        self.conv2 = nn.Conv1d(conv_size, gru_size, kernel_size=3, padding="valid")
+        self.gru1 = nn.GRU(self.gru_size, self.gru_size, batch_first=True)
+        self.gru2 = nn.GRU(self.gru_size, self.gru_size, batch_first=True)
+        self.gru3 = nn.GRU(self.gru_size, self.gru_size, batch_first=True)
+        # dense_out sees the conv features concatenated with all three GRU outputs.
+        self.dense_out = nn.Linear(4*self.gru_size, self.erb_bins)
+        nb_params = sum(p.numel() for p in self.parameters())
+        print(f"model: {nb_params} weights")
+        self.apply(init_weights)
+        # One sparsifier per GRU layer, all sharing the module-level schedule.
+        self.sparsifier = [
+            GRUSparsifier(
+                task_list=[(gru, sparse_params1)],
+                start=sparsify_start,
+                stop=sparsify_stop,
+                interval=sparsify_interval,
+                exponent=sparsify_exponent,
+            )
+            for gru in (self.gru1, self.gru2, self.gru3)
+        ]
+    def sparsify(self):
+        """Call ``step()`` on the sparsifier of every GRU layer."""
+        for sparsifier in self.sparsifier:
+            sparsifier.step()
+    def signal_prepare(self, signal: torch.Tensor) -> torch.Tensor:
+        """Return ``signal`` as (batch, 1, samples), zero-padded on the right to whole frames.
+        Args:
+            signal: waveform of shape (batch, samples) or (batch, 1, samples).
+        Returns:
+            The waveform with a channel axis, whose length is ``win_size + k * hop_size``
+            for some ``k >= 0``. Inputs that already have such a length are not padded.
+        """
+        if signal.dim() == 2:
+            signal = torch.unsqueeze(signal, dim=1)
+        _, _, n_samples = signal.shape
+        if n_samples < self.win_size:
+            # Shorter than one analysis window: the modulo rule below would leave the
+            # signal without a single complete frame, so pad up to exactly one window.
+            n_samples_pad = self.win_size - n_samples
+        else:
+            remainder = (n_samples - self.win_size) % self.hop_size
+            n_samples_pad = self.hop_size - remainder if remainder > 0 else 0
+        if n_samples_pad > 0:
+            signal = F.pad(signal, pad=(0, n_samples_pad), mode="constant", value=0)
+        return signal
+    def forward(self,
+                noisy: torch.Tensor,
+                states: Optional[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]] = None,
+                ):
+        """Denoise a whole utterance at once.
+        Args:
+            noisy: waveform of shape (batch, samples) or (batch, 1, samples).
+            states: optional initial hidden states for ``gru1``, ``gru2`` and ``gru3``.
+        Returns:
+            ``(denoise, mask, states)``: the enhanced waveform cut back to the input
+            length, the mask applied to the STFT magnitude (b, f, t), and the list of
+            final GRU states.
+        """
+        # Remember the un-padded length so the output can be trimmed to it.
+        num_samples = noisy.shape[-1]
+        noisy = self.signal_prepare(noisy)
+        mag_noisy, pha_noisy = self.mag_pha_stft(noisy)
+        # shape: (b, f, t)
+        # t = (num_samples - win_size) / hop_size + 1
+        mag_noisy_t = torch.transpose(mag_noisy, dim0=1, dim1=2)
+        # shape: (b, t, f)
+        mag_noisy_t_erb = self.erb_bands.erb_scale(mag_noisy_t, db=True)
+        # shape: (b, t, erb_bins)
+        mag_noisy_t_erb = torch.transpose(mag_noisy_t_erb, dim0=1, dim1=2)
+        # shape: (b, erb_bins, t)
+        mag_noisy_t_erb = self.pad(mag_noisy_t_erb)
+        mag_noisy_t_erb = self.forward_conv(mag_noisy_t_erb)
+        gru_out, states = self.forward_gru(mag_noisy_t_erb, states)
+        # gru_out shape: [b, t, 4 * gru_size]
+        mask_erb = torch.sigmoid(self.dense_out(gru_out))
+        # mask_erb shape: (b, t, erb_bins)
+        mask = self.erb_bands.erb_scale_inv(mask_erb)
+        # mask shape: (b, t, f)
+        mask = torch.transpose(mask, dim0=1, dim1=2)
+        # mask shape: (b, f, t)
+        stft_denoise = self.do_mask(mag_noisy, pha_noisy, mask)
+        denoise = self.istft.forward(stft_denoise)
+        # denoise shape: [b, 1, num_samples_pad]
+        denoise = denoise[:, :, :num_samples]
+        # denoise shape: [b, 1, num_samples]
+        return denoise, mask, states
+    def forward_conv(self, mag_noisy: torch.Tensor):
+        """Apply ``conv1`` and ``conv2`` with tanh activations; each one drops two frames.
+        Args:
+            mag_noisy: ERB features of shape [b, erb_bins, t].
+        Returns:
+            Tensor of shape [b, gru_size, t - 4].
+        """
+        hidden = torch.tanh(self.conv1(mag_noisy))
+        hidden = torch.tanh(self.conv2(hidden))
+        return hidden
+    def forward_gru(self,
+                      mag_noisy: torch.Tensor,
+                      states: Optional[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]] = None,
+                      ):
+        """Run the three stacked GRUs and concatenate their outputs with the input features.
+        Args:
+            mag_noisy: conv features of shape [b, gru_size, t].
+            states: optional hidden states for ``gru1``, ``gru2``, ``gru3`` (indexed 0..2).
+        Returns:
+            ``(gru_out, new_states)`` where ``gru_out`` has shape [b, t, 4 * gru_size]
+            and ``new_states`` is the list of the three updated hidden states.
+        """
+        if states is None:
+            gru1_state = gru2_state = gru3_state = None
+        else:
+            gru1_state = states[0]
+            gru2_state = states[1]
+            gru3_state = states[2]
+        # mag_noisy shape: [b, f, t]
+        tmp = mag_noisy.permute(0, 2, 1)
+        # tmp shape: [b, t, f]
+        gru1_out, gru1_state = self.gru1(tmp, gru1_state)
+        gru2_out, gru2_state = self.gru2(gru1_out, gru2_state)
+        gru3_out, gru3_state = self.gru3(gru2_out, gru3_state)
+        new_states = [gru1_state, gru2_state, gru3_state]
+        gru_out = torch.cat(tensors=[tmp, gru1_out, gru2_out, gru3_out], dim=-1)
+        return gru_out, new_states
+    def forward_chunk_by_chunk(self,
+                               noisy: torch.Tensor,
+                               ):
+        """Denoise frame by frame, carrying GRU states and the iSTFT overlap between frames.
+        A sliding window of five ERB frames (two past, the emitted one, two look-ahead)
+        feeds the convolutions; the window starts with two all-zero frames, matching the
+        left half of ``self.pad`` in ``forward``.
+        Args:
+            noisy: waveform of shape (batch, samples) or (batch, 1, samples).
+        Returns:
+            The concatenation of every chunk returned by ``self.istft.forward_chunk``;
+            an empty (batch, 1, 0) tensor when fewer than three frames are available.
+        """
+        # NOTE(review): unlike forward(), no zero frames are appended after the last
+        # frame, so the final two frames are never emitted - confirm this is intended.
+        noisy = self.signal_prepare(noisy)
+        b, _, num_samples = noisy.shape
+        # signal_prepare guarantees num_samples == win_size + k * hop_size.
+        num_frames = (num_samples - self.win_size) // self.hop_size + 1
+        states = None
+        waveform_cache = None
+        coff_cache = None
+        cache_list = list()
+        # Chunks are gathered and concatenated once instead of growing a tensor per frame.
+        chunks = list()
+        for i in range(num_frames):
+            begin = i * self.hop_size
+            end = begin + self.win_size
+            sub_noisy = noisy[:, :, begin:end]
+            mag_noisy, pha_noisy = self.mag_pha_stft(sub_noisy)
+            mag_noisy_t = torch.transpose(mag_noisy, dim0=1, dim1=2)
+            mag_noisy_t_erb = self.erb_bands.erb_scale(mag_noisy_t, db=True)
+            mag_noisy_t_erb = torch.transpose(mag_noisy_t_erb, dim0=1, dim1=2)
+            # mag_noisy_t_erb shape: (b, erb_bins, 1)
+            if len(cache_list) == 0:
+                cache_list.extend(
+                    {
+                        "mag_noisy": torch.zeros_like(mag_noisy),
+                        "pha_noisy": torch.zeros_like(pha_noisy),
+                        "mag_noisy_t_erb": torch.zeros_like(mag_noisy_t_erb),
+                    }
+                    for _ in range(2)
+                )
+            cache_list.append({
+                "mag_noisy": mag_noisy,
+                "pha_noisy": pha_noisy,
+                "mag_noisy_t_erb": mag_noisy_t_erb,
+            })
+            if len(cache_list) < 5:
+                # Still waiting for the two look-ahead frames.
+                continue
+            mag_noisy_t_erb = torch.concat(
+                tensors=[c["mag_noisy_t_erb"] for c in cache_list],
+                dim=-1
+            )
+            # The centre of the five-frame window is the frame being emitted.
+            mag_noisy = cache_list[2]["mag_noisy"]
+            pha_noisy = cache_list[2]["pha_noisy"]
+            cache_list.pop(0)
+            # mag_noisy_t_erb shape: [b, erb_bins, 5]
+            mag_noisy_t_erb = self.forward_conv(mag_noisy_t_erb)
+            # mag_noisy_t_erb shape: [b, gru_size, 1]
+            gru_out, states = self.forward_gru(mag_noisy_t_erb, states)
+            mask_erb = torch.sigmoid(self.dense_out(gru_out))
+            mask = self.erb_bands.erb_scale_inv(mask_erb)
+            mask = torch.transpose(mask, dim0=1, dim1=2)
+            stft_denoise = self.do_mask(mag_noisy, pha_noisy, mask)
+            sub_waveform, waveform_cache, coff_cache = self.istft.forward_chunk(stft_denoise, waveform_cache, coff_cache)
+            chunks.append(sub_waveform)
+        if not chunks:
+            # Created on the input's device so callers can combine it with other outputs.
+            return torch.zeros(size=(b, 1, 0), dtype=torch.float32, device=noisy.device)
+        return torch.concat(tensors=chunks, dim=-1)
+    def do_mask(self,
+                mag_noisy: torch.Tensor,
+                pha_noisy: torch.Tensor,
+                mask: torch.Tensor,
+                ):
+        """Scale the noisy magnitude by ``mask`` and recombine it with the noisy phase.
+        All three arguments are (b, f, t); the result is the complex spectrum
+        ``mag_noisy * mask * exp(1j * pha_noisy)``.
+        """
+        mag_denoise = mag_noisy * mask
+        stft_denoise = mag_denoise * torch.exp((1j * pha_noisy))
+        return stft_denoise
+    def mag_pha_stft(self, noisy: torch.Tensor):
+        """Return the magnitude and phase (both (b, f, t)) of the STFT of ``noisy``."""
+        stft_noisy = self.stft.forward(noisy)
+        # stft_noisy shape: [b, f, t], torch.complex64
+        real = torch.real(stft_noisy)
+        imag = torch.imag(stft_noisy)
+        mag_noisy = torch.sqrt(real ** 2 + imag ** 2)
+        pha_noisy = torch.atan2(imag, real)
+        return mag_noisy, pha_noisy
+MODEL_FILE = "model.pt"
+class RNNoisePretrainedModel(RNNoise):
+    """``RNNoise`` configured from an ``RNNoiseConfig``, with load / save helpers."""
+    def __init__(self,
+                 config: RNNoiseConfig,
+                 ):
+        """Build the network from the model-related fields of ``config`` and keep ``config``."""
+        model_fields = (
+            "sample_rate",
+            "nfft",
+            "win_size",
+            "hop_size",
+            "win_type",
+            "erb_bins",
+            "min_freq_bins_for_erb",
+            "conv_size",
+            "gru_size",
+        )
+        super().__init__(**{field: getattr(config, field) for field in model_fields})
+        self.config = config
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        """Create a model from a saved config and load its weights onto the CPU.
+        When ``pretrained_model_name_or_path`` is a directory the weights are read from
+        ``MODEL_FILE`` inside it; otherwise the path itself is opened as the weight file.
+        The state dict must match the model exactly (``strict=True``).
+        """
+        config = RNNoiseConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        model = cls(config)
+        ckpt_file = pretrained_model_name_or_path
+        if os.path.isdir(ckpt_file):
+            ckpt_file = os.path.join(ckpt_file, MODEL_FILE)
+        with open(ckpt_file, "rb") as f:
+            weights = torch.load(f, map_location="cpu", weights_only=True)
+        model.load_state_dict(weights, strict=True)
+        return model
+    def save_pretrained(self,
+                        save_directory: Union[str, os.PathLike],
+                        state_dict: Optional[dict] = None,
+                        ):
+        """Write the weights (``MODEL_FILE``) and the config (``CONFIG_FILE``) to a directory.
+        Args:
+            save_directory: target directory, created if it does not exist.
+            state_dict: weights to store; defaults to this model's own ``state_dict()``.
+        Returns:
+            ``save_directory`` as given.
+        """
+        if state_dict is None:
+            state_dict = self.state_dict()
+        os.makedirs(save_directory, exist_ok=True)
+        torch.save(state_dict, os.path.join(save_directory, MODEL_FILE))
+        self.config.to_yaml_file(os.path.join(save_directory, CONFIG_FILE))
+        return save_directory
+def main1():
+    """Smoke test: run the offline ``forward`` on 16000 random samples and print a slice."""
+    config = RNNoiseConfig()
+    model = RNNoisePretrainedModel(config)
+    model.eval()
+    noisy = torch.randn(size=(1, 16000), dtype=torch.float32)
+    noisy = model.signal_prepare(noisy)
+    # Inference only: skip building an autograd graph for the printout.
+    with torch.no_grad():
+        waveform, _, _ = model.forward(noisy)
+    print(f"waveform.shape: {waveform.shape}, waveform.dtype: {waveform.dtype}")
+    print(waveform[:, :, 300: 302])
+    return
+def main2():
+    """Smoke test: print the same output slice from the offline and the streaming path.
+    Both ``forward`` and ``forward_chunk_by_chunk`` are run on the same 16000 random
+    samples so their printed values can be compared by eye.
+    """
+    config = RNNoiseConfig()
+    model = RNNoisePretrainedModel(config)
+    model.eval()
+    noisy = torch.randn(size=(1, 16000), dtype=torch.float32)
+    noisy = model.signal_prepare(noisy)
+    # Inference only: skip building an autograd graph, which the per-frame loop of the
+    # streaming path would otherwise keep extending.
+    with torch.no_grad():
+        waveform, _, _ = model.forward(noisy)
+        print(f"waveform.shape: {waveform.shape}, waveform.dtype: {waveform.dtype}")
+        print(waveform[:, :, 300: 302])
+        waveform = model.forward_chunk_by_chunk(noisy)
+        print(f"waveform.shape: {waveform.shape}, waveform.dtype: {waveform.dtype}")
+        print(waveform[:, :, 300: 302])
+    return
+if __name__ == "__main__":
+    main2()

toolbox/torchaudio/models/rnnoise/yaml/config.yaml ADDED Viewed

	@@ -0,0 +1,34 @@

+model_name: "rnnoise"
+# spec: sampling and STFT / ERB feature settings. nfft, win_size, hop_size and win_type
+# are passed to the STFT modules; erb_bins and min_freq_bins_for_erb to ErbBands.
+# NOTE(review): segment_size is presumably the training-segment length in samples -
+# confirm against the training script.
+sample_rate: 8000
+segment_size: 32000
+nfft: 512
+win_size: 512
+hop_size: 256
+win_type: hann
+erb_bins: 32
+min_freq_bins_for_erb: 2
+# data: SNR bounds in dB.
+# NOTE(review): presumably the range used when mixing noise into speech - confirm.
+max_snr_db: 20
+min_snr_db: -10
+# model: conv_size is the output width of the first convolution; gru_size is the output
+# width of the second convolution and the size of every GRU layer.
+# conv_size here overrides the RNNoiseConfig default of 128.
+conv_size: 256
+gru_size: 256
+# train: training-loop settings. batch_size and eval_steps override the RNNoiseConfig
+# defaults (4 and 25000).
+max_epochs: 100
+batch_size: 32
+num_workers: 4
+seed: 1234
+lr: 0.001
+lr_scheduler: CosineAnnealingLR
+lr_scheduler_kwargs: {}
+weight_decay: 0.00001
+clip_grad_norm: 10.0
+eval_steps: 20000

toolbox/torchaudio/modules/freq_bands/erb_bands.py CHANGED Viewed

@@ -147,6 +147,7 @@ class ErbBands(nn.Module):
         return erb_fb, erb_fb_inv
     def erb_scale(self, spec: torch.Tensor, db: bool = True):
         spec_erb = torch.matmul(spec, self.erb_fb)
         if db:
             spec_erb = 10 * torch.log10(spec_erb + 1e-10)

         return erb_fb, erb_fb_inv
     def erb_scale(self, spec: torch.Tensor, db: bool = True):
+        # spec shape: (b, t, f)
         spec_erb = torch.matmul(spec, self.erb_fb)
         if db:
             spec_erb = 10 * torch.log10(spec_erb + 1e-10)