HoneyTian committed
Commit ed91efa · 1 Parent(s): c255825

add dfnet2

examples/dfnet2/run.sh ADDED
@@ -0,0 +1,156 @@
1
+ #!/usr/bin/env bash
2
+
3
+ : <<'END'
4
+
5
+ sh run.sh --stage 2 --stop_stage 2 --system_version windows --file_folder_name file_dir --final_model_name dfnet-nx-speech \
6
+ --noise_dir "E:/Users/tianx/HuggingDatasets/nx_noise/data/noise" \
7
+ --speech_dir "E:/Users/tianx/HuggingDatasets/nx_noise/data/speech"
8
+
9
+ sh run.sh --stage 2 --stop_stage 2 --system_version centos --file_folder_name file_dir --final_model_name dfnet-nx-dns3 \
10
+ --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise" \
11
+ --speech_dir "/data/tianxing/HuggingDatasets/nx_noise/data/speech/dns3-speech"
12
+
13
+
14
+ END
15
+
16
+
17
+ # params
18
+ system_version="windows";
19
+ verbose=true;
20
+ stage=0 # start from 0 if you need to start from data preparation
21
+ stop_stage=9
22
+
23
+ work_dir="$(pwd)"
24
+ file_folder_name=file_folder_name
25
+ final_model_name=final_model_name
26
+ config_file="yaml/config.yaml"
27
+ limit=10
28
+
29
+ noise_dir=/data/tianxing/HuggingDatasets/nx_noise/data/noise
30
+ speech_dir=/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train
31
+
32
+ max_count=10000000
33
+
34
+ nohup_name=nohup.out
35
+
36
+ # model params
37
+ batch_size=64
38
+ max_epochs=200
39
+ save_top_k=10
40
+ patience=5
41
+
42
+
43
+ # parse options
44
+ while true; do
45
+ [ -z "${1:-}" ] && break; # break if there are no arguments
46
+ case "$1" in
47
+ --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
48
+ eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
49
+ old_value="$(eval echo \$$name)";
50
+ if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
51
+ was_bool=true;
52
+ else
53
+ was_bool=false;
54
+ fi
55
+
56
+ # Set the variable to the right value-- the escaped quotes make it work if
57
+ # the option had spaces, like --cmd "queue.pl -sync y"
58
+ eval "${name}=\"$2\"";
59
+
60
+ # Check that Boolean-valued arguments are really Boolean.
61
+ if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
62
+ echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
63
+ exit 1;
64
+ fi
65
+ shift 2;
66
+ ;;
67
+
68
+ *) break;
69
+ esac
70
+ done
71
+
72
+ file_dir="${work_dir}/${file_folder_name}"
73
+ final_model_dir="${work_dir}/../../trained_models/${final_model_name}";
74
+ evaluation_audio_dir="${file_dir}/evaluation_audio"
75
+
76
+ train_dataset="${file_dir}/train.jsonl"
77
+ valid_dataset="${file_dir}/valid.jsonl"
78
+
79
+ $verbose && echo "system_version: ${system_version}"
80
+ $verbose && echo "file_folder_name: ${file_folder_name}"
81
+
82
+ if [ $system_version == "windows" ]; then
83
+ alias python3='D:/Users/tianx/PycharmProjects/virtualenv/nx_denoise/Scripts/python.exe'
84
+ elif [ $system_version == "centos" ] || [ $system_version == "ubuntu" ]; then
85
+ #source /data/local/bin/nx_denoise/bin/activate
86
+ alias python3='/data/local/bin/nx_denoise/bin/python3'
87
+ fi
88
+
89
+
90
+ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
91
+ $verbose && echo "stage 1: prepare data"
92
+ cd "${work_dir}" || exit 1
93
+ python3 step_1_prepare_data.py \
94
+ --file_dir "${file_dir}" \
95
+ --noise_dir "${noise_dir}" \
96
+ --speech_dir "${speech_dir}" \
97
+ --train_dataset "${train_dataset}" \
98
+ --valid_dataset "${valid_dataset}" \
99
+ --max_count "${max_count}" \
100
+
101
+ fi
102
+
103
+
104
+ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
105
+ $verbose && echo "stage 2: train model"
106
+ cd "${work_dir}" || exit 1
107
+ python3 step_2_train_model.py \
108
+ --train_dataset "${train_dataset}" \
109
+ --valid_dataset "${valid_dataset}" \
110
+ --serialization_dir "${file_dir}" \
111
+ --config_file "${config_file}" \
112
+
113
+ fi
114
+
115
+
116
+ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
117
+ $verbose && echo "stage 3: test model"
118
+ cd "${work_dir}" || exit 1
119
+ python3 step_3_evaluation.py \
120
+ --valid_dataset "${valid_dataset}" \
121
+ --model_dir "${file_dir}/best" \
122
+ --evaluation_audio_dir "${evaluation_audio_dir}" \
123
+ --limit "${limit}" \
124
+
125
+ fi
126
+
127
+
128
+ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
129
+ $verbose && echo "stage 4: collect files"
130
+ cd "${work_dir}" || exit 1
131
+
132
+ mkdir -p ${final_model_dir}
133
+
134
+ cp "${file_dir}/best"/* "${final_model_dir}"
135
+ cp -r "${file_dir}/evaluation_audio" "${final_model_dir}"
136
+
137
+ cd "${final_model_dir}/.." || exit 1;
138
+
139
+ if [ -e "${final_model_name}.zip" ]; then
140
+ rm -rf "${final_model_name}_backup.zip"
141
+ mv "${final_model_name}.zip" "${final_model_name}_backup.zip"
142
+ fi
143
+
144
+ zip -r "${final_model_name}.zip" "${final_model_name}"
145
+ rm -rf "${final_model_name}"
146
+
147
+ fi
148
+
149
+
150
+ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
151
+ $verbose && echo "stage 5: clear file_dir"
152
+ cd "${work_dir}" || exit 1
153
+
154
+ rm -rf "${file_dir}";
155
+
156
+ fi
examples/dfnet2/step_1_prepare_data.py ADDED
@@ -0,0 +1,164 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import json
5
+ import os
6
+ from pathlib import Path
7
+ import random
8
+ import sys
9
+
10
+ pwd = os.path.abspath(os.path.dirname(__file__))
11
+ sys.path.append(os.path.join(pwd, "../../"))
12
+
13
+ import librosa
14
+ import numpy as np
15
+ from tqdm import tqdm
16
+
17
+
18
+ def get_args():
19
+ parser = argparse.ArgumentParser()
20
+ parser.add_argument("--file_dir", default="./", type=str)
21
+
22
+ parser.add_argument(
23
+ "--noise_dir",
24
+ default=r"E:\Users\tianx\HuggingDatasets\nx_noise\data\noise",
25
+ type=str
26
+ )
27
+ parser.add_argument(
28
+ "--speech_dir",
29
+ default=r"E:\programmer\asr_datasets\aishell\data_aishell\wav\train",
30
+ type=str
31
+ )
32
+
33
+ parser.add_argument("--train_dataset", default="train.jsonl", type=str)
34
+ parser.add_argument("--valid_dataset", default="valid.jsonl", type=str)
35
+
36
+ parser.add_argument("--duration", default=4.0, type=float)
37
+ parser.add_argument("--min_snr_db", default=-10, type=float)
38
+ parser.add_argument("--max_snr_db", default=20, type=float)
39
+
40
+ parser.add_argument("--target_sample_rate", default=8000, type=int)
41
+
42
+ parser.add_argument("--max_count", default=10000, type=int)
43
+
44
+ args = parser.parse_args()
45
+ return args
46
+
47
+
48
+ def filename_generator(data_dir: str):
49
+ data_dir = Path(data_dir)
50
+ for filename in data_dir.glob("**/*.wav"):
51
+ yield filename.as_posix()
52
+
53
+
54
+ def target_second_signal_generator(data_dir: str, duration: int = 2, sample_rate: int = 8000, max_epoch: int = 20000):
55
+ data_dir = Path(data_dir)
56
+ for epoch_idx in range(max_epoch):
57
+ for filename in data_dir.glob("**/*.wav"):
58
+ signal, _ = librosa.load(filename.as_posix(), sr=sample_rate)
59
+ raw_duration = librosa.get_duration(y=signal, sr=sample_rate)
60
+
61
+ if raw_duration < duration:
62
+ # print(f"duration less than {duration} s. skip filename: {filename.as_posix()}")
63
+ continue
64
+ if signal.ndim != 1:
65
+ raise AssertionError(f"expected ndim 1, instead of {signal.ndim}")
66
+
67
+ signal_length = len(signal)
68
+ win_size = int(duration * sample_rate)
69
+ for begin in range(0, signal_length - win_size, win_size):
70
+ if np.sum(signal[begin: begin+win_size]) == 0:
71
+ continue
72
+ row = {
73
+ "epoch_idx": epoch_idx,
74
+ "filename": filename.as_posix(),
75
+ "raw_duration": round(raw_duration, 4),
76
+ "offset": round(begin / sample_rate, 4),
77
+ "duration": round(duration, 4),
78
+ }
79
+ yield row
80
+
81
+
82
+ def main():
83
+ args = get_args()
84
+
85
+ file_dir = Path(args.file_dir)
86
+ file_dir.mkdir(exist_ok=True)
87
+
88
+ noise_dir = Path(args.noise_dir)
89
+ speech_dir = Path(args.speech_dir)
90
+
91
+ noise_generator = target_second_signal_generator(
92
+ noise_dir.as_posix(),
93
+ duration=args.duration,
94
+ sample_rate=args.target_sample_rate,
95
+ max_epoch=100000,
96
+ )
97
+ speech_generator = target_second_signal_generator(
98
+ speech_dir.as_posix(),
99
+ duration=args.duration,
100
+ sample_rate=args.target_sample_rate,
101
+ max_epoch=1,
102
+ )
103
+
104
+ dataset = list()
105
+
106
+ count = 0
107
+ process_bar = tqdm(desc="build dataset jsonl")
108
+ with open(args.train_dataset, "w", encoding="utf-8") as ftrain, open(args.valid_dataset, "w", encoding="utf-8") as fvalid:
109
+ for noise, speech in zip(noise_generator, speech_generator):
110
+ if count >= args.max_count > 0:
111
+ break
112
+
113
+ noise_filename = noise["filename"]
114
+ noise_raw_duration = noise["raw_duration"]
115
+ noise_offset = noise["offset"]
116
+ noise_duration = noise["duration"]
117
+
118
+ speech_filename = speech["filename"]
119
+ speech_raw_duration = speech["raw_duration"]
120
+ speech_offset = speech["offset"]
121
+ speech_duration = speech["duration"]
122
+
123
+ random1 = random.random()
124
+ random2 = random.random()
125
+
126
+ row = {
127
+ "count": count,
128
+
129
+ "noise_filename": noise_filename,
130
+ "noise_raw_duration": noise_raw_duration,
131
+ "noise_offset": noise_offset,
132
+ "noise_duration": noise_duration,
133
+
134
+ "speech_filename": speech_filename,
135
+ "speech_raw_duration": speech_raw_duration,
136
+ "speech_offset": speech_offset,
137
+ "speech_duration": speech_duration,
138
+
139
+ "snr_db": random.uniform(args.min_snr_db, args.max_snr_db),
140
+
141
+ "random1": random1,
142
+ }
143
+ row = json.dumps(row, ensure_ascii=False)
144
+ if random2 < (1 / 300 / 1):
145
+ fvalid.write(f"{row}\n")
146
+ else:
147
+ ftrain.write(f"{row}\n")
148
+
149
+ count += 1
150
+ duration_seconds = count * args.duration
151
+ duration_hours = duration_seconds / 3600
152
+
153
+ process_bar.update(n=1)
154
+ process_bar.set_postfix({
155
+ # "duration_seconds": round(duration_seconds, 4),
156
+ "duration_hours": round(duration_hours, 4),
157
+
158
+ })
159
+
160
+ return
161
+
162
+
163
+ if __name__ == "__main__":
164
+ main()
examples/dfnet2/step_2_train_model.py ADDED
@@ -0,0 +1,459 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://github.com/Rikorose/DeepFilterNet
5
+ """
6
+ import argparse
7
+ import json
8
+ import logging
9
+ from logging.handlers import TimedRotatingFileHandler
10
+ import os
11
+ import platform
12
+ from pathlib import Path
13
+ import random
14
+ import sys
15
+ import shutil
16
+ from typing import List
17
+
18
19
+
20
+ pwd = os.path.abspath(os.path.dirname(__file__))
21
+ sys.path.append(os.path.join(pwd, "../../"))
22
+
23
+ import numpy as np
24
+ import torch
25
+ import torch.nn as nn
26
+ from torch.nn import functional as F
27
+ from torch.utils.data.dataloader import DataLoader
28
+ from tqdm import tqdm
29
+
30
+ from toolbox.torch.utils.data.dataset.denoise_jsonl_dataset import DenoiseJsonlDataset
31
+ from toolbox.torchaudio.losses.snr import NegativeSISNRLoss
32
+ from toolbox.torchaudio.losses.spectral import LSDLoss, MultiResolutionSTFTLoss
33
+ from toolbox.torchaudio.metrics.pesq import run_pesq_score
34
+ from toolbox.torchaudio.models.dfnet2.configuration_dfnet2 import DfNet2Config
35
+ from toolbox.torchaudio.models.dfnet2.modeling_dfnet2 import DfNet2, DfNet2PretrainedModel
36
+
37
+
38
+ def get_args():
39
+ parser = argparse.ArgumentParser()
40
+ parser.add_argument("--train_dataset", default="train.jsonl", type=str)
41
+ parser.add_argument("--valid_dataset", default="valid.jsonl", type=str)
42
+
43
+ parser.add_argument("--num_serialized_models_to_keep", default=15, type=int)
44
+ parser.add_argument("--patience", default=10, type=int)
45
+ parser.add_argument("--serialization_dir", default="serialization_dir", type=str)
46
+
47
+ parser.add_argument("--config_file", default="config.yaml", type=str)
48
+
49
+ args = parser.parse_args()
50
+ return args
51
+
52
+
53
+ def logging_config(file_dir: str):
54
+ fmt = "%(asctime)s - %(name)s - %(levelname)s %(filename)s:%(lineno)d > %(message)s"
55
+
56
+ logging.basicConfig(format=fmt,
57
+ datefmt="%m/%d/%Y %H:%M:%S",
58
+ level=logging.INFO)
59
+ file_handler = TimedRotatingFileHandler(
60
+ filename=os.path.join(file_dir, "main.log"),
61
+ encoding="utf-8",
62
+ when="D",
63
+ interval=1,
64
+ backupCount=7
65
+ )
66
+ file_handler.setLevel(logging.INFO)
67
+ file_handler.setFormatter(logging.Formatter(fmt))
68
+ logger = logging.getLogger(__name__)
69
+ logger.addHandler(file_handler)
70
+
71
+ return logger
72
+
73
+
74
+ class CollateFunction(object):
75
+ def __init__(self):
76
+ pass
77
+
78
+ def __call__(self, batch: List[dict]):
79
+ clean_audios = list()
80
+ noisy_audios = list()
81
+ snr_db_list = list()
82
+
83
+ for sample in batch:
84
+ # noise_wave: torch.Tensor = sample["noise_wave"]
85
+ clean_audio: torch.Tensor = sample["speech_wave"]
86
+ noisy_audio: torch.Tensor = sample["mix_wave"]
87
+ # snr_db: float = sample["snr_db"]
88
+
89
+ clean_audios.append(clean_audio)
90
+ noisy_audios.append(noisy_audio)
91
+
92
+ clean_audios = torch.stack(clean_audios)
93
+ noisy_audios = torch.stack(noisy_audios)
94
+
95
+ # assert
96
+ if torch.any(torch.isnan(clean_audios)) or torch.any(torch.isinf(clean_audios)):
97
+ raise AssertionError("nan or inf in clean_audios")
98
+ if torch.any(torch.isnan(noisy_audios)) or torch.any(torch.isinf(noisy_audios)):
99
+ raise AssertionError("nan or inf in noisy_audios")
100
+ return clean_audios, noisy_audios
101
+
102
+
103
+ collate_fn = CollateFunction()
104
+
105
+
106
+ def main():
107
+ args = get_args()
108
+
109
+ config = DfNet2Config.from_pretrained(
110
+ pretrained_model_name_or_path=args.config_file,
111
+ )
112
+
113
+ serialization_dir = Path(args.serialization_dir)
114
+ serialization_dir.mkdir(parents=True, exist_ok=True)
115
+
116
+ logger = logging_config(serialization_dir)
117
+
118
+ random.seed(config.seed)
119
+ np.random.seed(config.seed)
120
+ torch.manual_seed(config.seed)
121
+ logger.info(f"set seed: {config.seed}")
122
+
123
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
124
+ n_gpu = torch.cuda.device_count()
125
+ logger.info(f"GPU available count: {n_gpu}; device: {device}")
126
+
127
+ # datasets
128
+ train_dataset = DenoiseJsonlDataset(
129
+ jsonl_file=args.train_dataset,
130
+ expected_sample_rate=config.sample_rate,
131
+ max_wave_value=32768.0,
132
+ min_snr_db=config.min_snr_db,
133
+ max_snr_db=config.max_snr_db,
134
+ # skip=225000,
135
+ )
136
+ valid_dataset = DenoiseJsonlDataset(
137
+ jsonl_file=args.valid_dataset,
138
+ expected_sample_rate=config.sample_rate,
139
+ max_wave_value=32768.0,
140
+ min_snr_db=config.min_snr_db,
141
+ max_snr_db=config.max_snr_db,
142
+ )
143
+ train_data_loader = DataLoader(
144
+ dataset=train_dataset,
145
+ batch_size=config.batch_size,
146
+ # shuffle=True,
147
+ sampler=None,
148
+ # On Linux, data loading can use multiple worker subprocesses; on Windows it cannot.
149
+ num_workers=0 if platform.system() == "Windows" else os.cpu_count() // 2,
150
+ collate_fn=collate_fn,
151
+ pin_memory=False,
152
+ prefetch_factor=None if platform.system() == "Windows" else 2,
153
+ )
154
+ valid_data_loader = DataLoader(
155
+ dataset=valid_dataset,
156
+ batch_size=config.batch_size,
157
+ # shuffle=True,
158
+ sampler=None,
159
+ # On Linux, data loading can use multiple worker subprocesses; on Windows it cannot.
160
+ num_workers=0 if platform.system() == "Windows" else os.cpu_count() // 2,
161
+ collate_fn=collate_fn,
162
+ pin_memory=False,
163
+ prefetch_factor=None if platform.system() == "Windows" else 2,
164
+ )
165
+
166
+ # models
167
+ logger.info(f"prepare models. config_file: {args.config_file}")
168
+ model = DfNet2PretrainedModel(config).to(device)
169
+ model.to(device)
170
+ model.train()
171
+
172
+ # optimizer
173
+ logger.info("prepare optimizer, lr_scheduler, loss_fn, evaluation_metric")
174
+ optimizer = torch.optim.AdamW(model.parameters(), config.lr)
175
+
176
+ # resume training
177
+ last_step_idx = -1
178
+ last_epoch = -1
179
+ for step_idx_str in serialization_dir.glob("steps-*"):
180
+ step_idx_str = Path(step_idx_str)
181
+ step_idx = step_idx_str.stem.split("-")[1]
182
+ step_idx = int(step_idx)
183
+ if step_idx > last_step_idx:
184
+ last_step_idx = step_idx
185
+ # last_epoch = 1
186
+
187
+ if last_step_idx != -1:
188
+ logger.info(f"resume from steps-{last_step_idx}.")
189
+ model_pt = serialization_dir / f"steps-{last_step_idx}/model.pt"
190
+
191
+ logger.info(f"load state dict for model.")
192
+ with open(model_pt.as_posix(), "rb") as f:
193
+ state_dict = torch.load(f, map_location="cpu", weights_only=True)
194
+ model.load_state_dict(state_dict, strict=True)
195
+
196
+ if config.lr_scheduler == "CosineAnnealingLR":
197
+ lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
198
+ optimizer,
199
+ last_epoch=last_epoch,
200
+ # T_max=10 * config.eval_steps,
201
+ # eta_min=0.01 * config.lr,
202
+ **config.lr_scheduler_kwargs,
203
+ )
204
+ elif config.lr_scheduler == "MultiStepLR":
205
+ lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
206
+ optimizer,
207
+ last_epoch=last_epoch,
208
+ milestones=[10000, 20000, 30000, 40000, 50000], gamma=0.5
209
+ )
210
+ else:
211
+ raise AssertionError(f"invalid lr_scheduler: {config.lr_scheduler}")
212
+
213
+ neg_si_snr_loss_fn = NegativeSISNRLoss(reduction="mean").to(device)
214
+ mr_stft_loss_fn = MultiResolutionSTFTLoss(
215
+ fft_size_list=[256, 512, 1024],
216
+ win_size_list=[256, 512, 1024],
217
+ hop_size_list=[128, 256, 512],
218
+ factor_sc=1.5,
219
+ factor_mag=1.0,
220
+ reduction="mean"
221
+ ).to(device)
222
+
223
+ # training loop
224
+
225
+ # state
226
+ average_pesq_score = 1000000000
227
+ average_loss = 1000000000
228
+ average_mr_stft_loss = 1000000000
229
+ average_neg_si_snr_loss = 1000000000
230
+ average_mask_loss = 1000000000
231
+ average_lsnr_loss = 1000000000
232
+
233
+ model_list = list()
234
+ best_epoch_idx = None
235
+ best_step_idx = None
236
+ best_metric = None
237
+ patience_count = 0
238
+
239
+ step_idx = 0 if last_step_idx == -1 else last_step_idx
240
+
241
+ logger.info("training")
242
+ early_stop_flag = False
243
+ for epoch_idx in range(max(0, last_epoch+1), config.max_epochs):
244
+ if early_stop_flag:
245
+ break
246
+
247
+ # train
248
+ model.train()
249
+
250
+ total_pesq_score = 0.
251
+ total_loss = 0.
252
+ total_mr_stft_loss = 0.
253
+ total_neg_si_snr_loss = 0.
254
+ total_mask_loss = 0.
255
+ total_lsnr_loss = 0.
256
+ total_batches = 0.
257
+
258
+ progress_bar_train = tqdm(
259
+ initial=step_idx,
260
+ desc="Training; epoch-{}".format(epoch_idx),
261
+ )
262
+ for train_batch in train_data_loader:
263
+ clean_audios, noisy_audios = train_batch
264
+ clean_audios: torch.Tensor = clean_audios.to(device)
265
+ noisy_audios: torch.Tensor = noisy_audios.to(device)
266
+
267
+ est_spec, est_wav, est_mask, lsnr = model.forward(noisy_audios)
268
+
269
+ mr_stft_loss = mr_stft_loss_fn.forward(est_wav, clean_audios)
270
+ neg_si_snr_loss = neg_si_snr_loss_fn.forward(est_wav, clean_audios)
271
+ mask_loss = model.mask_loss_fn(est_mask, clean_audios, noisy_audios)
272
+ lsnr_loss = model.lsnr_loss_fn(lsnr, clean_audios, noisy_audios)
273
+
274
+ loss = 1.0 * mr_stft_loss + 1.0 * neg_si_snr_loss + 1.0 * mask_loss + 0.3 * lsnr_loss
275
+ if torch.any(torch.isnan(loss)) or torch.any(torch.isinf(loss)):
276
+ logger.info(f"find nan or inf in loss.")
277
+ continue
278
+
279
+ denoise_audios_list_r = list(est_wav.detach().cpu().numpy())
280
+ clean_audios_list_r = list(clean_audios.detach().cpu().numpy())
281
+ pesq_score = run_pesq_score(clean_audios_list_r, denoise_audios_list_r, sample_rate=config.sample_rate, mode="nb")
282
+
283
+ optimizer.zero_grad()
284
+ loss.backward()
285
+ torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.clip_grad_norm)
286
+ optimizer.step()
287
+ lr_scheduler.step()
288
+
289
+ total_pesq_score += pesq_score
290
+ total_loss += loss.item()
291
+ total_mr_stft_loss += mr_stft_loss.item()
292
+ total_neg_si_snr_loss += neg_si_snr_loss.item()
293
+ total_mask_loss += mask_loss.item()
294
+ total_lsnr_loss += lsnr_loss.item()
295
+ total_batches += 1
296
+
297
+ average_pesq_score = round(total_pesq_score / total_batches, 4)
298
+ average_loss = round(total_loss / total_batches, 4)
299
+ average_mr_stft_loss = round(total_mr_stft_loss / total_batches, 4)
300
+ average_neg_si_snr_loss = round(total_neg_si_snr_loss / total_batches, 4)
301
+ average_mask_loss = round(total_mask_loss / total_batches, 4)
302
+ average_lsnr_loss = round(total_lsnr_loss / total_batches, 4)
303
+
304
+ progress_bar_train.update(1)
305
+ progress_bar_train.set_postfix({
306
+ "lr": lr_scheduler.get_last_lr()[0],
307
+ "pesq_score": average_pesq_score,
308
+ "loss": average_loss,
309
+ "mr_stft_loss": average_mr_stft_loss,
310
+ "neg_si_snr_loss": average_neg_si_snr_loss,
311
+ "mask_loss": average_mask_loss,
312
+ "lsnr_loss": average_lsnr_loss,
313
+ })
314
+
315
+ # evaluation
316
+ step_idx += 1
317
+ if step_idx % config.eval_steps == 0:
318
+ with torch.no_grad():
319
+ torch.cuda.empty_cache()
320
+
321
+ total_pesq_score = 0.
322
+ total_loss = 0.
323
+ total_mr_stft_loss = 0.
324
+ total_neg_si_snr_loss = 0.
325
+ total_mask_loss = 0.
326
+ total_lsnr_loss = 0.
327
+ total_batches = 0.
328
+
329
+ progress_bar_train.close()
330
+ progress_bar_eval = tqdm(
331
+ desc="Evaluation; steps-{}k".format(int(step_idx/1000)),
332
+ )
333
+ for eval_batch in valid_data_loader:
334
+ clean_audios, noisy_audios = eval_batch
335
+ clean_audios: torch.Tensor = clean_audios.to(device)
336
+ noisy_audios: torch.Tensor = noisy_audios.to(device)
337
+
338
+ est_spec, est_wav, est_mask, lsnr = model.forward(noisy_audios)
339
+
340
+ mr_stft_loss = mr_stft_loss_fn.forward(est_wav, clean_audios)
341
+ neg_si_snr_loss = neg_si_snr_loss_fn.forward(est_wav, clean_audios)
342
+ mask_loss = model.mask_loss_fn(est_mask, clean_audios, noisy_audios)
343
+ lsnr_loss = model.lsnr_loss_fn(lsnr, clean_audios, noisy_audios)
344
+
345
+ loss = 1.0 * mr_stft_loss + 1.0 * neg_si_snr_loss + 1.0 * mask_loss + 0.3 * lsnr_loss
346
+ if torch.any(torch.isnan(loss)) or torch.any(torch.isinf(loss)):
347
+ logger.info(f"find nan or inf in loss.")
348
+ continue
349
+
350
+ denoise_audios_list_r = list(est_wav.detach().cpu().numpy())
351
+ clean_audios_list_r = list(clean_audios.detach().cpu().numpy())
352
+ pesq_score = run_pesq_score(clean_audios_list_r, denoise_audios_list_r, sample_rate=config.sample_rate, mode="nb")
353
+
354
+ total_pesq_score += pesq_score
355
+ total_loss += loss.item()
356
+ total_mr_stft_loss += mr_stft_loss.item()
357
+ total_neg_si_snr_loss += neg_si_snr_loss.item()
358
+ total_mask_loss += mask_loss.item()
359
+ total_lsnr_loss += lsnr_loss.item()
360
+ total_batches += 1
361
+
362
+ average_pesq_score = round(total_pesq_score / total_batches, 4)
363
+ average_loss = round(total_loss / total_batches, 4)
364
+ average_mr_stft_loss = round(total_mr_stft_loss / total_batches, 4)
365
+ average_neg_si_snr_loss = round(total_neg_si_snr_loss / total_batches, 4)
366
+ average_mask_loss = round(total_mask_loss / total_batches, 4)
367
+ average_lsnr_loss = round(total_lsnr_loss / total_batches, 4)
368
+
369
+ progress_bar_eval.update(1)
370
+ progress_bar_eval.set_postfix({
371
+ "lr": lr_scheduler.get_last_lr()[0],
372
+ "pesq_score": average_pesq_score,
373
+ "loss": average_loss,
374
+ "mr_stft_loss": average_mr_stft_loss,
375
+ "neg_si_snr_loss": average_neg_si_snr_loss,
376
+ "mask_loss": average_mask_loss,
377
+ "lsnr_loss": average_lsnr_loss,
378
+ })
379
+
380
+ total_pesq_score = 0.
381
+ total_loss = 0.
382
+ total_mr_stft_loss = 0.
383
+ total_neg_si_snr_loss = 0.
384
+ total_mask_loss = 0.
385
+ total_lsnr_loss = 0.
386
+ total_batches = 0.
387
+
388
+ progress_bar_eval.close()
389
+ progress_bar_train = tqdm(
390
+ initial=progress_bar_train.n,
391
+ postfix=progress_bar_train.postfix,
392
+ desc=progress_bar_train.desc,
393
+ )
394
+
395
+ # save path
396
+ save_dir = serialization_dir / "steps-{}".format(step_idx)
397
+ save_dir.mkdir(parents=True, exist_ok=False)
398
+
399
+ # save models
400
+ model.save_pretrained(save_dir.as_posix())
401
+
402
+ model_list.append(save_dir)
403
+ if len(model_list) >= args.num_serialized_models_to_keep:
404
+ model_to_delete: Path = model_list.pop(0)
405
+ shutil.rmtree(model_to_delete.as_posix())
406
+
407
+ # save metric
408
+ if best_metric is None:
409
+ best_epoch_idx = epoch_idx
410
+ best_step_idx = step_idx
411
+ best_metric = average_pesq_score
412
+ elif average_pesq_score >= best_metric:
413
+ # greater is better.
414
+ best_epoch_idx = epoch_idx
415
+ best_step_idx = step_idx
416
+ best_metric = average_pesq_score
417
+ else:
418
+ pass
419
+
420
+ metrics = {
421
+ "epoch_idx": epoch_idx,
422
+ "best_epoch_idx": best_epoch_idx,
423
+ "best_step_idx": best_step_idx,
424
+ "pesq_score": average_pesq_score,
425
+ "loss": average_loss,
426
+ "mr_stft_loss": average_mr_stft_loss,
427
+ "neg_si_snr_loss": average_neg_si_snr_loss,
428
+ "mask_loss": average_mask_loss,
429
+ "lsnr_loss": average_lsnr_loss,
430
+ }
431
+ metrics_filename = save_dir / "metrics_epoch.json"
432
+ with open(metrics_filename, "w", encoding="utf-8") as f:
433
+ json.dump(metrics, f, indent=4, ensure_ascii=False)
434
+
435
+ # save best
436
+ best_dir = serialization_dir / "best"
437
+ if best_epoch_idx == epoch_idx and best_step_idx == step_idx:
438
+ if best_dir.exists():
439
+ shutil.rmtree(best_dir)
440
+ shutil.copytree(save_dir, best_dir)
441
+
442
+ # early stop
443
+ early_stop_flag = False
444
+ if best_epoch_idx == epoch_idx and best_step_idx == step_idx:
445
+ patience_count = 0
446
+ else:
447
+ patience_count += 1
448
+ if patience_count >= args.patience:
449
+ early_stop_flag = True
450
+
451
+ # early stop
452
+ if early_stop_flag:
453
+ break
454
+
455
+ return
456
+
457
+
458
+ if __name__ == "__main__":
459
+ main()
examples/dfnet2/yaml/config.yaml ADDED
@@ -0,0 +1,72 @@
+ model_name: "dfnet2"
+
+ # spec
+ sample_rate: 8000
+ nfft: 512
+ win_size: 200
+ hop_size: 80
+
+ spec_bins: 256
+
+ # model
+ conv_channels: 64
+ conv_kernel_size_input:
+   - 3
+   - 3
+ conv_kernel_size_inner:
+   - 1
+   - 3
+ convt_kernel_size_inner:
+   - 1
+   - 3
+
+ embedding_hidden_size: 256
+ encoder_combine_op: "concat"
+
+ encoder_emb_skip_op: "none"
+ encoder_emb_linear_groups: 16
+ encoder_emb_hidden_size: 256
+
+ encoder_linear_groups: 32
+
+ decoder_emb_num_layers: 3
+ decoder_emb_skip_op: "none"
+ decoder_emb_linear_groups: 16
+ decoder_emb_hidden_size: 256
+
+ df_decoder_hidden_size: 256
+ df_num_layers: 2
+ df_order: 5
+ df_bins: 96
+ df_gru_skip: "grouped_linear"
+ df_decoder_linear_groups: 16
+ df_pathway_kernel_size_t: 5
+ df_lookahead: 2
+
+ # lsnr
+ n_frame: 3
+ lsnr_max: 30
+ lsnr_min: -15
+ norm_tau: 1.
+
+ # data
+ min_snr_db: -10
+ max_snr_db: 20
+
+ # train
+ lr: 0.001
+ lr_scheduler: "CosineAnnealingLR"
+ lr_scheduler_kwargs:
+   T_max: 250000
+   eta_min: 0.0001
+
+ max_epochs: 100
+ clip_grad_norm: 10.0
+ seed: 1234
+
+ num_workers: 8
+ batch_size: 64
+ eval_steps: 10000
+
+ # runtime
+ use_post_filter: true
examples/test.py DELETED
@@ -1,39 +0,0 @@
1
- #!/usr/bin/python3
2
- # -*- coding: utf-8 -*-
3
- import torch
4
- import torch.nn as nn
5
-
6
-
7
- inputs = torch.randn(size=(1, 1, 16000))
8
-
9
- conv1d = nn.Conv1d(
10
- in_channels=1,
11
- out_channels=1,
12
- kernel_size=3,
13
- stride=2,
14
- padding=0,
15
- dilation=1,
16
- )
17
- conv1dt = nn.ConvTranspose1d(
18
- in_channels=1,
19
- out_channels=1,
20
- kernel_size=3,
21
- stride=2,
22
- padding=0,
23
- output_padding=1,
24
- dilation=1,
25
- )
26
-
27
- x = conv1d.forward(inputs)
28
-
29
- print(x.shape)
30
-
31
- x = conv1dt.forward(x)
32
- print(x.shape)
33
- print(x[:, :, 0])
34
- print(x[:, :, -2])
35
- print(x[:, :, -1])
36
-
37
-
38
- if __name__ == "__main__":
39
- pass
toolbox/torchaudio/models/dfnet/modeling_dfnet_online.py DELETED
@@ -1,226 +0,0 @@
1
- #!/usr/bin/python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- The native DeepFilterNet implementation does not directly support streaming inference.
5
-
6
- Community developers (such as Rikorose) have provided a Torch-based streaming-inference implementation:
7
- https://github.com/grazder/DeepFilterNet/tree/1097015d53ced78fb234e7d7071a5dd4446e3952/torchDF
8
-
9
- This file attempts to implement a dfnet that supports streaming inference.
10
-
11
- """
12
- import os
13
- import math
14
- from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
15
-
16
- import numpy as np
17
- import torch
18
- import torch.nn as nn
19
- from torch.nn import functional as F
20
-
21
- from toolbox.torchaudio.configuration_utils import CONFIG_FILE
22
- from toolbox.torchaudio.models.dfnet.configuration_dfnet import DfNetConfig
23
- from toolbox.torchaudio.modules.conv_stft import ConvSTFT, ConviSTFT
24
- from toolbox.torchaudio.modules.local_snr_target import LocalSnrTarget
25
- from toolbox.torchaudio.modules.freq_bands.erb_bands import ErbBands
26
-
27
-
28
- MODEL_FILE = "model.pt"
29
-
30
-
31
- norm_layer_dict = {
32
- "batch_norm_2d": torch.nn.BatchNorm2d
33
- }
34
-
35
-
36
- activation_layer_dict = {
37
- "relu": torch.nn.ReLU,
38
- "identity": torch.nn.Identity,
39
- "sigmoid": torch.nn.Sigmoid,
40
- }
41
-
42
-
43
- class CausalConv2d(nn.Module):
44
- def __init__(self,
45
- in_channels: int,
46
- out_channels: int,
47
- kernel_size: Union[int, Iterable[int]],
48
- fstride: int = 1,
49
- dilation: int = 1,
50
- pad_f_dim: bool = True,
51
- bias: bool = True,
52
- separable: bool = False,
53
- norm_layer: str = "batch_norm_2d",
54
- activation_layer: str = "relu",
55
- ):
56
- super(CausalConv2d, self).__init__()
57
- kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else tuple(kernel_size)
58
-
59
- if pad_f_dim:
60
- fpad = kernel_size[1] // 2 + dilation - 1
61
- else:
62
- fpad = 0
63
-
64
- # for last 2 dim, pad (left, right, top, bottom).
65
- self.lookback = kernel_size[0] - 1
66
- if self.lookback > 0:
67
- self.tpad = nn.ConstantPad2d(padding=(0, 0, self.lookback, 0), value=0.0)
68
- else:
69
- self.tpad = nn.Identity()
70
-
71
- groups = math.gcd(in_channels, out_channels) if separable else 1
72
- if groups == 1:
73
- separable = False
74
- if max(kernel_size) == 1:
75
- separable = False
76
-
77
- self.conv = nn.Conv2d(
78
- in_channels,
79
- out_channels,
80
- kernel_size=kernel_size,
81
- padding=(0, fpad),
82
- stride=(1, fstride), # stride over time is always 1
83
- dilation=(1, dilation), # dilation over time is always 1
84
- groups=groups,
85
- bias=bias,
86
- )
87
-
88
- if separable:
89
- self.convp = nn.Conv2d(
90
- out_channels,
91
- out_channels,
92
- kernel_size=1,
93
- bias=False,
94
- )
95
- else:
96
- self.convp = nn.Identity()
97
-
98
- if norm_layer is not None:
99
- norm_layer = norm_layer_dict[norm_layer]
100
- self.norm = norm_layer(out_channels)
101
- else:
102
- self.norm = nn.Identity()
103
-
104
- if activation_layer is not None:
105
- activation_layer = activation_layer_dict[activation_layer]
106
- self.activation = activation_layer()
107
- else:
108
- self.activation = nn.Identity()
109
-
110
- super().__init__()
111
-
112
- def forward(self, inputs: torch.Tensor, cache: Tuple[torch.Tensor, torch.Tensor] = None):
113
- """
114
- :param inputs: shape: [b, c, t, f]
115
- :param cache: shape: [b, c, lookback, f];
116
- :return:
117
- """
118
- x = inputs
119
-
120
- if cache is None:
121
- x = self.tpad(x)
122
- else:
123
- x = torch.concat(tensors=[cache, x], dim=2)
124
- new_cache = x[:, :, -self.lookback:, :]
125
-
126
- x = self.conv(x)
127
-
128
- x = self.convp(x)
129
- x = self.norm(x)
130
- x = self.activation(x)
131
-
132
- return x, new_cache
133
-
134
-
135
- class CausalConvTranspose2d(nn.Module):
136
- def __init__(self,
137
- in_channels: int,
138
- out_channels: int,
139
- kernel_size: Union[int, Iterable[int]],
140
- fstride: int = 1,
141
- dilation: int = 1,
142
- pad_f_dim: bool = True,
143
- bias: bool = True,
144
- separable: bool = False,
145
- norm_layer: str = "batch_norm_2d",
146
- activation_layer: str = "relu",
147
- ):
148
- super(CausalConvTranspose2d, self).__init__()
149
-
150
- kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
151
-
152
- if pad_f_dim:
153
- fpad = kernel_size[1] // 2
154
- else:
155
- fpad = 0
156
-
157
- # for last 2 dim, pad (left, right, top, bottom).
158
- self.lookback = kernel_size[0] - 1
159
-
160
- groups = math.gcd(in_channels, out_channels) if separable else 1
161
- if groups == 1:
162
- separable = False
163
-
164
- self.convt = nn.ConvTranspose2d(
165
- in_channels,
166
- out_channels,
167
- kernel_size=kernel_size,
168
- padding=(0, fpad),
169
- output_padding=(0, 0),
170
- stride=(1, fstride), # stride over time is always 1
171
- dilation=(1, dilation), # dilation over time is always 1
172
- groups=groups,
173
- bias=bias,
174
- )
175
-
176
- if separable:
177
- self.convp = nn.Conv2d(
178
- out_channels,
179
- out_channels,
180
- kernel_size=1,
181
- bias=False,
182
- )
183
- else:
184
- self.convp = nn.Identity()
185
-
186
- if norm_layer is not None:
187
- norm_layer = norm_layer_dict[norm_layer]
188
- self.norm = norm_layer(out_channels)
189
- else:
190
- self.norm = nn.Identity()
191
-
192
- if activation_layer is not None:
193
- activation_layer = activation_layer_dict[activation_layer]
194
- self.activation = activation_layer()
195
- else:
196
- self.activation = nn.Identity()
197
-
198
- def forward(self, inputs: torch.Tensor, cache: Tuple[torch.Tensor, torch.Tensor] = None):
199
- """
200
- :param inputs: shape: [b, c, t, f]
201
- :param cache: shape: [b, c, lookback, f];
202
- :return:
203
- """
204
- x = inputs
205
-
206
- # x shape: [b, c, t, f]
207
- x = self.convt(x)
208
- # x shape: [b, c, t+lookback, f]
209
-
210
- if cache is not None:
211
- x = torch.concat(tensors=[
212
- x[:, :, :self.lookback, :] + cache,
213
- x[:, :, self.lookback:, :]
214
- ], dim=2)
215
- x = x[:, :, :-self.lookback, :]
216
- new_cache = x[:, :, -self.lookback:, :]
217
-
218
- x = self.convp(x)
219
- x = self.norm(x)
220
- x = self.activation(x)
221
-
222
- return x, new_cache
223
-
224
-
225
- if __name__ == "__main__":
226
- pass
toolbox/torchaudio/models/dfnet2/__init__.py ADDED
@@ -0,0 +1,6 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+
+
+ if __name__ == '__main__':
+     pass
toolbox/torchaudio/models/dfnet2/configuration_dfnet2.py ADDED
@@ -0,0 +1,147 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ from typing import Tuple
4
+
5
+ from toolbox.torchaudio.configuration_utils import PretrainedConfig
6
+
7
+
8
+ class DfNet2Config(PretrainedConfig):
9
+ def __init__(self,
10
+ sample_rate: int = 8000,
11
+ nfft: int = 512,
12
+ win_size: int = 200,
13
+ hop_size: int = 80,
14
+ win_type: str = "hann",
15
+
16
+ spec_bins: int = 256,
17
+ erb_bins: int = 32,
18
+ min_freq_bins_for_erb: int = 2,
19
+
20
+ conv_channels: int = 64,
21
+ conv_kernel_size_input: Tuple[int, int] = (3, 3),
22
+ conv_kernel_size_inner: Tuple[int, int] = (1, 3),
23
+
24
+ convt_kernel_size_inner: Tuple[int, int] = (1, 3),
25
+
26
+ embedding_hidden_size: int = 256,
27
+ encoder_combine_op: str = "concat",
28
+
29
+ encoder_emb_skip_op: str = "none",
30
+ encoder_emb_linear_groups: int = 16,
31
+ encoder_emb_hidden_size: int = 256,
32
+
33
+ encoder_linear_groups: int = 32,
34
+
35
+ decoder_emb_num_layers: int = 3,
36
+ decoder_emb_skip_op: str = "none",
37
+ decoder_emb_linear_groups: int = 16,
38
+ decoder_emb_hidden_size: int = 256,
39
+
40
+ df_decoder_hidden_size: int = 256,
41
+ df_num_layers: int = 2,
42
+ df_order: int = 5,
43
+ df_bins: int = 96,
44
+ df_gru_skip: str = "grouped_linear",
45
+ df_decoder_linear_groups: int = 16,
46
+ df_pathway_kernel_size_t: int = 5,
47
+ df_lookahead: int = 2,
48
+
49
+ n_frame: int = 3,
50
+ max_local_snr: int = 30,
51
+ min_local_snr: int = -15,
52
+ norm_tau: float = 1.,
53
+
54
+ min_snr_db: float = -10,
55
+ max_snr_db: float = 20,
56
+
57
+ lr: float = 0.001,
58
+ lr_scheduler: str = "CosineAnnealingLR",
59
+ lr_scheduler_kwargs: dict = None,
60
+
61
+ max_epochs: int = 100,
62
+ clip_grad_norm: float = 10.,
63
+ seed: int = 1234,
64
+
65
+ num_workers: int = 4,
66
+ batch_size: int = 4,
67
+ eval_steps: int = 25000,
68
+
69
+ use_post_filter: bool = False,
70
+
71
+ **kwargs
72
+ ):
73
+ super(DfNet2Config, self).__init__(**kwargs)
74
+ # transform
75
+ self.sample_rate = sample_rate
76
+ self.nfft = nfft
77
+ self.win_size = win_size
78
+ self.hop_size = hop_size
79
+ self.win_type = win_type
80
+
81
+ # spectrum
82
+ self.spec_bins = spec_bins
83
+ self.erb_bins = erb_bins
84
+ self.min_freq_bins_for_erb = min_freq_bins_for_erb
85
+
86
+ # conv
87
+ self.conv_channels = conv_channels
88
+ self.conv_kernel_size_input = conv_kernel_size_input
89
+ self.conv_kernel_size_inner = conv_kernel_size_inner
90
+
91
+ self.convt_kernel_size_inner = convt_kernel_size_inner
92
+
93
+ self.embedding_hidden_size = embedding_hidden_size
94
+
95
+ # encoder
96
+ self.encoder_emb_skip_op = encoder_emb_skip_op
97
+ self.encoder_emb_linear_groups = encoder_emb_linear_groups
98
+ self.encoder_emb_hidden_size = encoder_emb_hidden_size
99
+
100
+ self.encoder_linear_groups = encoder_linear_groups
101
+ self.encoder_combine_op = encoder_combine_op
102
+
103
+ # decoder
104
+ self.decoder_emb_num_layers = decoder_emb_num_layers
105
+ self.decoder_emb_skip_op = decoder_emb_skip_op
106
+ self.decoder_emb_linear_groups = decoder_emb_linear_groups
107
+ self.decoder_emb_hidden_size = decoder_emb_hidden_size
108
+
109
+ # df decoder
110
+ self.df_decoder_hidden_size = df_decoder_hidden_size
111
+ self.df_num_layers = df_num_layers
112
+ self.df_order = df_order
113
+ self.df_bins = df_bins
114
+ self.df_gru_skip = df_gru_skip
115
+ self.df_decoder_linear_groups = df_decoder_linear_groups
116
+ self.df_pathway_kernel_size_t = df_pathway_kernel_size_t
117
+ self.df_lookahead = df_lookahead
118
+
119
+ # lsnr
120
+ self.n_frame = n_frame
121
+ self.max_local_snr = max_local_snr
122
+ self.min_local_snr = min_local_snr
123
+ self.norm_tau = norm_tau
124
+
125
+ # data snr
126
+ self.min_snr_db = min_snr_db
127
+ self.max_snr_db = max_snr_db
128
+
129
+ # train
130
+ self.lr = lr
131
+ self.lr_scheduler = lr_scheduler
132
+ self.lr_scheduler_kwargs = lr_scheduler_kwargs or dict()
133
+
134
+ self.max_epochs = max_epochs
135
+ self.clip_grad_norm = clip_grad_norm
136
+ self.seed = seed
137
+
138
+ self.num_workers = num_workers
139
+ self.batch_size = batch_size
140
+ self.eval_steps = eval_steps
141
+
142
+ # runtime
143
+ self.use_post_filter = use_post_filter
144
+
145
+
146
+ if __name__ == "__main__":
147
+ pass
toolbox/torchaudio/models/dfnet2/inference_dfnet2.py ADDED
@@ -0,0 +1,115 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import logging
4
+ from pathlib import Path
5
+ import shutil
6
+ import tempfile, time
7
+ import zipfile
8
+
9
+ import librosa
10
+ import numpy as np
11
+ import torch
12
+ import torchaudio
13
+
14
+ torch.set_num_threads(1)
15
+
16
+ from project_settings import project_path
17
+ from toolbox.torchaudio.models.dfnet.configuration_dfnet import DfNetConfig
18
+ from toolbox.torchaudio.models.dfnet.modeling_dfnet import DfNetPretrainedModel, MODEL_FILE
19
+
20
+ logger = logging.getLogger("toolbox")
21
+
22
+
23
+ class InferenceDfNet(object):
24
+ def __init__(self, pretrained_model_path_or_zip_file: str, device: str = "cpu"):
25
+ self.pretrained_model_path_or_zip_file = pretrained_model_path_or_zip_file
26
+ self.device = torch.device(device)
27
+
28
+ logger.info(f"loading model; model_file: {self.pretrained_model_path_or_zip_file}")
29
+ config, model = self.load_models(self.pretrained_model_path_or_zip_file)
30
+ logger.info(f"model loading completed; model_file: {self.pretrained_model_path_or_zip_file}")
31
+
32
+ self.config = config
33
+ self.model = model
34
+ self.model.to(device)
35
+ self.model.eval()
36
+
37
+ def load_models(self, model_path: str):
38
+ model_path = Path(model_path)
39
+ if model_path.name.endswith(".zip"):
40
+ with zipfile.ZipFile(model_path.as_posix(), "r") as f_zip:
41
+ out_root = Path(tempfile.gettempdir()) / "nx_denoise"
42
+ out_root.mkdir(parents=True, exist_ok=True)
43
+ f_zip.extractall(path=out_root)
44
+ model_path = out_root / model_path.stem
45
+
46
+ config = DfNetConfig.from_pretrained(
47
+ pretrained_model_name_or_path=model_path.as_posix(),
48
+ )
49
+ model = DfNetPretrainedModel.from_pretrained(
50
+ pretrained_model_name_or_path=model_path.as_posix(),
51
+ )
52
+ model.to(self.device)
53
+ model.eval()
54
+
55
+ shutil.rmtree(model_path)
56
+ return config, model
57
+
58
+ def enhancement_by_ndarray(self, noisy_audio: np.ndarray) -> np.ndarray:
59
+ noisy_audio = torch.tensor(noisy_audio, dtype=torch.float32)
60
+ noisy_audio = noisy_audio.unsqueeze(dim=0)
61
+
62
+ # noisy_audio shape: [batch_size, n_samples]
63
+ enhanced_audio = self.enhancement_by_tensor(noisy_audio)
64
+ # enhanced_audio shape: [channels, num_samples]
65
+ enhanced_audio = enhanced_audio[0]
66
+ # enhanced_audio shape: [num_samples]
67
+ return enhanced_audio.cpu().numpy()
68
+
69
+ def enhancement_by_tensor(self, noisy_audio: torch.Tensor) -> torch.Tensor:
70
+ if torch.max(noisy_audio) > 1 or torch.min(noisy_audio) < -1:
71
+ raise AssertionError(f"The value range of audio samples should be between -1 and 1.")
72
+
73
+ # noisy_audio shape: [batch_size, num_samples]
74
+ noisy_audios = noisy_audio.to(self.device)
75
+
76
+ with torch.no_grad():
77
+ est_spec, est_wav, est_mask, lsnr = self.model.forward(noisy_audios)
78
+
79
+ # shape: [batch_size, num_samples]
80
+ enhanced_audio = torch.unsqueeze(est_wav, dim=1)
81
+ # shape: [batch_size, 1, num_samples]
82
+
83
+ enhanced_audio = enhanced_audio[0]
84
+ # shape: [channels, num_samples]
85
+ return enhanced_audio
86
+
87
+
88
+ def main():
89
+ model_zip_file = project_path / "trained_models/dfnet-nx-dns3.zip"
90
+ infer_model = InferenceDfNet(model_zip_file)
91
+
92
+ sample_rate = 8000
93
+ noisy_audio_file = project_path / "data/examples/ai_agent/dfaaf264-b5e3-4ca2-b5cb-5b6d637d962d_section_3.wav"
94
+ noisy_audio, sample_rate = librosa.load(
95
+ noisy_audio_file.as_posix(),
96
+ sr=sample_rate,
97
+ )
98
+ duration = librosa.get_duration(y=noisy_audio, sr=sample_rate)
99
+ # noisy_audio = noisy_audio[int(7*sample_rate):int(9*sample_rate)]
100
+ noisy_audio = torch.tensor(noisy_audio, dtype=torch.float32)
101
+ noisy_audio = noisy_audio.unsqueeze(dim=0)
102
+
103
+ begin = time.time()
104
+ enhanced_audio = infer_model.enhancement_by_tensor(noisy_audio)
105
+ time_cost = time.time() - begin
106
+ print(f"enhanced_audio.shape: {enhanced_audio.shape}, time_cost: {time_cost:.4f}, audio_duration: {duration:.4f}, fpr: {time_cost / duration:.4f}")
107
+
108
+ filename = "enhanced_audio.wav"
109
+ torchaudio.save(filename, enhanced_audio.detach().cpu(), sample_rate)
110
+
111
+ return
112
+
113
+
114
+ if __name__ == "__main__":
115
+ main()
toolbox/torchaudio/models/dfnet2/modeling_dfnet2.py ADDED
@@ -0,0 +1,1364 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ The native DeepFilterNet implementation does not directly support streaming inference.
5
+
6
+ Community developers (such as Rikorose) have provided a Torch-based streaming-inference implementation:
7
+ https://github.com/grazder/DeepFilterNet/tree/1097015d53ced78fb234e7d7071a5dd4446e3952/torchDF
8
+
9
+ This file attempts to implement a dfnet that supports streaming inference.
10
+
11
+ """
12
+ import os
13
+ import math
14
+ from collections import defaultdict
15
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
16
+
17
+ import numpy as np
18
+ import torch
19
+ import torch.nn as nn
20
+ from torch.nn import functional as F
21
+
22
+ from toolbox.torchaudio.configuration_utils import CONFIG_FILE
23
+ from toolbox.torchaudio.models.dfnet2.configuration_dfnet2 import DfNet2Config
24
+ from toolbox.torchaudio.modules.conv_stft import ConvSTFT, ConviSTFT
25
+ from toolbox.torchaudio.modules.local_snr_target import LocalSnrTarget
26
+ from toolbox.torchaudio.modules.freq_bands.erb_bands import ErbBands
27
+
28
+
29
+ MODEL_FILE = "model.pt"
30
+
31
+
32
+ norm_layer_dict = {
33
+ "batch_norm_2d": torch.nn.BatchNorm2d
34
+ }
35
+
36
+
37
+ activation_layer_dict = {
38
+ "relu": torch.nn.ReLU,
39
+ "identity": torch.nn.Identity,
40
+ "sigmoid": torch.nn.Sigmoid,
41
+ }
42
+
43
+
44
+ class CausalConv2d(nn.Module):
45
+ def __init__(self,
46
+ in_channels: int,
47
+ out_channels: int,
48
+ kernel_size: Union[int, Iterable[int]],
49
+ fstride: int = 1,
50
+ dilation: int = 1,
51
+ pad_f_dim: bool = True,
52
+ bias: bool = True,
53
+ separable: bool = False,
54
+ norm_layer: str = "batch_norm_2d",
55
+ activation_layer: str = "relu",
56
+ ):
57
+ super(CausalConv2d, self).__init__()
58
+ kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else tuple(kernel_size)
59
+
60
+ if pad_f_dim:
61
+ fpad = kernel_size[1] // 2 + dilation - 1
62
+ else:
63
+ fpad = 0
64
+
65
+ # for last 2 dim, pad (left, right, top, bottom).
66
+ self.lookback = kernel_size[0] - 1
67
+ if self.lookback > 0:
68
+ self.tpad = nn.ConstantPad2d(padding=(0, 0, self.lookback, 0), value=0.0)
69
+ else:
70
+ self.tpad = nn.Identity()
71
+
72
+ groups = math.gcd(in_channels, out_channels) if separable else 1
73
+ if groups == 1:
74
+ separable = False
75
+ if max(kernel_size) == 1:
76
+ separable = False
77
+
78
+ self.conv = nn.Conv2d(
79
+ in_channels,
80
+ out_channels,
81
+ kernel_size=kernel_size,
82
+ padding=(0, fpad),
83
+ stride=(1, fstride), # stride over time is always 1
84
+ dilation=(1, dilation), # dilation over time is always 1
85
+ groups=groups,
86
+ bias=bias,
87
+ )
88
+
89
+ if separable:
90
+ self.convp = nn.Conv2d(
91
+ out_channels,
92
+ out_channels,
93
+ kernel_size=1,
94
+ bias=False,
95
+ )
96
+ else:
97
+ self.convp = nn.Identity()
98
+
99
+ if norm_layer is not None:
100
+ norm_layer = norm_layer_dict[norm_layer]
101
+ self.norm = norm_layer(out_channels)
102
+ else:
103
+ self.norm = nn.Identity()
104
+
105
+ if activation_layer is not None:
106
+ activation_layer = activation_layer_dict[activation_layer]
107
+ self.activation = activation_layer()
108
+ else:
109
+ self.activation = nn.Identity()
110
+
111
+ def forward(self, inputs: torch.Tensor, cache: Tuple[torch.Tensor, torch.Tensor] = None):
112
+ """
113
+ :param inputs: shape: [b, c, t, f]
114
+ :param cache: shape: [b, c, lookback, f];
115
+ :return:
116
+ """
117
+ x = inputs
118
+
119
+ if cache is None:
120
+ x = self.tpad(x)
121
+ else:
122
+ x = torch.concat(tensors=[cache, x], dim=2)
123
+
124
+ new_cache = None
125
+ if self.lookback > 0:
126
+ new_cache = x[:, :, -self.lookback:, :]
127
+
128
+ x = self.conv(x)
129
+
130
+ x = self.convp(x)
131
+ x = self.norm(x)
132
+ x = self.activation(x)
133
+
134
+ return x, new_cache
135
+
136
+
137
+ class CausalConvTranspose2d(nn.Module):
138
+ def __init__(self,
139
+ in_channels: int,
140
+ out_channels: int,
141
+ kernel_size: Union[int, Iterable[int]],
142
+ fstride: int = 1,
143
+ dilation: int = 1,
144
+ pad_f_dim: bool = True,
145
+ bias: bool = True,
146
+ separable: bool = False,
147
+ norm_layer: str = "batch_norm_2d",
148
+ activation_layer: str = "relu",
149
+ ):
150
+ super(CausalConvTranspose2d, self).__init__()
151
+
152
+ kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
153
+
154
+ if pad_f_dim:
155
+ fpad = kernel_size[1] // 2
156
+ else:
157
+ fpad = 0
158
+
159
+ # for last 2 dim, pad (left, right, top, bottom).
160
+ self.lookback = kernel_size[0] - 1
161
+
162
+ groups = math.gcd(in_channels, out_channels) if separable else 1
163
+ if groups == 1:
164
+ separable = False
165
+
166
+ self.convt = nn.ConvTranspose2d(
167
+ in_channels,
168
+ out_channels,
169
+ kernel_size=kernel_size,
170
+ padding=(0, fpad),
171
+ output_padding=(0, fpad),
172
+ stride=(1, fstride), # stride over time is always 1
173
+ dilation=(1, dilation), # dilation over time is always 1
174
+ groups=groups,
175
+ bias=bias,
176
+ )
177
+
178
+ if separable:
179
+ self.convp = nn.Conv2d(
180
+ out_channels,
181
+ out_channels,
182
+ kernel_size=1,
183
+ bias=False,
184
+ )
185
+ else:
186
+ self.convp = nn.Identity()
187
+
188
+ if norm_layer is not None:
189
+ norm_layer = norm_layer_dict[norm_layer]
190
+ self.norm = norm_layer(out_channels)
191
+ else:
192
+ self.norm = nn.Identity()
193
+
194
+ if activation_layer is not None:
195
+ activation_layer = activation_layer_dict[activation_layer]
196
+ self.activation = activation_layer()
197
+ else:
198
+ self.activation = nn.Identity()
199
+
200
+ def forward(self, inputs: torch.Tensor, cache: Tuple[torch.Tensor, torch.Tensor] = None):
201
+ """
202
+ :param inputs: shape: [b, c, t, f]
203
+ :param cache: shape: [b, c, lookback, f];
204
+ :return:
205
+ """
206
+ x = inputs
207
+
208
+ # x shape: [b, c, t, f]
209
+ x = self.convt(x)
210
+ # x shape: [b, c, t+lookback, f]
211
+
212
+ new_cache = None
213
+ if self.lookback > 0:
214
+ if cache is not None:
215
+ x = torch.concat(tensors=[
216
+ x[:, :, :self.lookback, :] + cache,
217
+ x[:, :, self.lookback:, :]
218
+ ], dim=2)
219
+
220
+ x = x[:, :, :-self.lookback, :]
221
+ new_cache = x[:, :, -self.lookback:, :]
222
+
223
+ x = self.convp(x)
224
+ x = self.norm(x)
225
+ x = self.activation(x)
226
+
227
+ return x, new_cache
228
+
229
+
230
+ class GroupedLinear(nn.Module):
231
+
232
+ def __init__(self, input_size: int, hidden_size: int, groups: int = 1):
233
+ super().__init__()
234
+ # self.weight: Tensor
235
+ self.input_size = input_size
236
+ self.hidden_size = hidden_size
237
+ self.groups = groups
238
+ assert input_size % groups == 0, f"Input size {input_size} not divisible by {groups}"
239
+ assert hidden_size % groups == 0, f"Hidden size {hidden_size} not divisible by {groups}"
240
+ self.ws = input_size // groups
241
+ self.register_parameter(
242
+ "weight",
243
+ torch.nn.Parameter(
244
+ torch.zeros(groups, input_size // groups, hidden_size // groups), requires_grad=True
245
+ ),
246
+ )
247
+ self.reset_parameters()
248
+
249
+ def reset_parameters(self):
250
+ nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5)) # type: ignore
251
+
252
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
253
+ # x: [..., I]
254
+ b, t, f = x.shape
255
+ if f != self.input_size:
256
+ raise AssertionError
257
+
258
+ # new_shape = list(x.shape)[:-1] + [self.groups, self.ws]
259
+ new_shape = (b, t, self.groups, self.ws)
260
+ x = x.view(new_shape)
261
+ # The better way, but not supported by torchscript
262
+ # x = x.unflatten(-1, (self.groups, self.ws)) # [..., G, I/G]
263
+ x = torch.einsum("btgi,gih->btgh", x, self.weight) # [..., G, H/G]
264
+ x = x.flatten(2, 3)
265
+ # x: [b, t, h]
266
+ return x
267
+
268
+ def __repr__(self):
269
+ cls = self.__class__.__name__
270
+ return f"{cls}(input_size: {self.input_size}, hidden_size: {self.hidden_size}, groups: {self.groups})"
271
+
272
+
273
+ class SqueezedGRU_S(nn.Module):
274
+ """
275
+ SGE net: Video object detection with squeezed GRU and information entropy map
276
+ https://arxiv.org/abs/2106.07224
277
+ """
278
+
279
+ def __init__(
280
+ self,
281
+ input_size: int,
282
+ hidden_size: int,
283
+ output_size: Optional[int] = None,
284
+ num_layers: int = 1,
285
+ linear_groups: int = 8,
286
+ batch_first: bool = True,
287
+ skip_op: str = "none",
288
+ activation_layer: str = "identity",
289
+ ):
290
+ super().__init__()
291
+ self.input_size = input_size
292
+ self.hidden_size = hidden_size
293
+
294
+ self.linear_in = nn.Sequential(
295
+ GroupedLinear(
296
+ input_size=input_size,
297
+ hidden_size=hidden_size,
298
+ groups=linear_groups,
299
+ ),
300
+ activation_layer_dict[activation_layer](),
301
+ )
302
+
303
+ # gru skip operator
304
+ self.gru_skip_op = None
305
+
306
+ if skip_op == "none":
307
+ self.gru_skip_op = None
308
+ elif skip_op == "identity":
309
+ if input_size != output_size:
310
+ raise AssertionError("Dimensions do not match")
311
+ self.gru_skip_op = nn.Identity()
312
+ elif skip_op == "grouped_linear":
313
+ self.gru_skip_op = GroupedLinear(
314
+ input_size=hidden_size,
315
+ hidden_size=hidden_size,
316
+ groups=linear_groups,
317
+ )
318
+ else:
319
+ raise NotImplementedError()
320
+
321
+ self.gru = nn.GRU(
322
+ input_size=hidden_size,
323
+ hidden_size=hidden_size,
324
+ num_layers=num_layers,
325
+ batch_first=batch_first,
326
+ bidirectional=False,
327
+ )
328
+
329
+ if output_size is not None:
330
+ self.linear_out = nn.Sequential(
331
+ GroupedLinear(
332
+ input_size=hidden_size,
333
+ hidden_size=output_size,
334
+ groups=linear_groups,
335
+ ),
336
+ activation_layer_dict[activation_layer](),
337
+ )
338
+ else:
339
+ self.linear_out = nn.Identity()
340
+
341
+ def forward(self, inputs: torch.Tensor, hx: torch.Tensor = None) -> Tuple[torch.Tensor, torch.Tensor]:
342
+ # inputs: shape: [b, t, h]
343
+ x = self.linear_in.forward(inputs)
344
+
345
+ x, hx = self.gru.forward(x, hx)
346
+
347
+ x = self.linear_out(x)
348
+
349
+ if self.gru_skip_op is not None:
350
+ x = x + self.gru_skip_op(inputs)
351
+
352
+ return x, hx
353
+
354
+
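A small usage sketch of the squeezed GRU (hypothetical sizes): the grouped linear layers compress the features before and after a regular nn.GRU, so the expensive recurrence runs at hidden_size rather than input_size.

import torch

gru = SqueezedGRU_S(
    input_size=1024, hidden_size=256, output_size=512,
    num_layers=1, linear_groups=8, batch_first=True,
    skip_op="none", activation_layer="relu",
)
x = torch.randn(2, 10, 1024)       # [b, t, input_size]
y, h = gru.forward(x)              # y: [b, t, output_size], h: [num_layers, b, hidden_size]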
355
+ class Add(nn.Module):
356
+ def forward(self, a, b):
357
+ return a + b
358
+
359
+
360
+ class Concat(nn.Module):
361
+ def forward(self, a, b):
362
+ return torch.cat((a, b), dim=-1)
363
+
364
+
365
+ class Encoder(nn.Module):
366
+ def __init__(self, config: DfNet2Config):
367
+ super(Encoder, self).__init__()
368
+ self.embedding_input_size = config.conv_channels * config.erb_bins // 4
369
+ self.embedding_output_size = config.conv_channels * config.erb_bins // 4
370
+ self.embedding_hidden_size = config.embedding_hidden_size
371
+
372
+ self.spec_conv0 = CausalConv2d(
373
+ in_channels=1,
374
+ out_channels=config.conv_channels,
375
+ kernel_size=config.conv_kernel_size_input,
376
+ bias=False,
377
+ separable=True,
378
+ fstride=1,
379
+ )
380
+ self.spec_conv1 = CausalConv2d(
381
+ in_channels=config.conv_channels,
382
+ out_channels=config.conv_channels,
383
+ kernel_size=config.conv_kernel_size_inner,
384
+ bias=False,
385
+ separable=True,
386
+ fstride=2,
387
+ )
388
+ self.spec_conv2 = CausalConv2d(
389
+ in_channels=config.conv_channels,
390
+ out_channels=config.conv_channels,
391
+ kernel_size=config.conv_kernel_size_inner,
392
+ bias=False,
393
+ separable=True,
394
+ fstride=2,
395
+ )
396
+ self.spec_conv3 = CausalConv2d(
397
+ in_channels=config.conv_channels,
398
+ out_channels=config.conv_channels,
399
+ kernel_size=config.conv_kernel_size_inner,
400
+ bias=False,
401
+ separable=True,
402
+ fstride=1,
403
+ )
404
+
405
+ self.df_conv0 = CausalConv2d(
406
+ in_channels=2,
407
+ out_channels=config.conv_channels,
408
+ kernel_size=config.conv_kernel_size_input,
409
+ bias=False,
410
+ separable=True,
411
+ fstride=1,
412
+ )
413
+ self.df_conv1 = CausalConv2d(
414
+ in_channels=config.conv_channels,
415
+ out_channels=config.conv_channels,
416
+ kernel_size=config.conv_kernel_size_inner,
417
+ bias=False,
418
+ separable=True,
419
+ fstride=2,
420
+ )
421
+ self.df_fc_emb = nn.Sequential(
422
+ GroupedLinear(
423
+ config.conv_channels * config.df_bins // 2,
424
+ self.embedding_input_size,
425
+ groups=config.encoder_linear_groups
426
+ ),
427
+ nn.ReLU(inplace=True)
428
+ )
429
+
430
+ if config.encoder_combine_op == "concat":
431
+ self.embedding_input_size *= 2
432
+ self.combine = Concat()
433
+ else:
434
+ self.combine = Add()
435
+
436
+ # emb_gru
437
+ if config.spec_bins % 8 != 0:
438
+ raise AssertionError("spec_bins should be divisible by 8")
439
+
440
+ self.emb_gru = SqueezedGRU_S(
441
+ self.embedding_input_size,
442
+ self.embedding_hidden_size,
443
+ output_size=self.embedding_output_size,
444
+ num_layers=1,
445
+ batch_first=True,
446
+ skip_op=config.encoder_emb_skip_op,
447
+ linear_groups=config.encoder_emb_linear_groups,
448
+ activation_layer="relu",
449
+ )
450
+
451
+ # lsnr
452
+ self.lsnr_fc = nn.Sequential(
453
+ nn.Linear(self.embedding_output_size, 1),
454
+ nn.Sigmoid()
455
+ )
456
+ self.lsnr_scale = config.max_local_snr - config.min_local_snr
457
+ self.lsnr_offset = config.min_local_snr
458
+
459
+ def forward(self,
460
+ feat_erb: torch.Tensor,
461
+ feat_spec: torch.Tensor,
462
+ cache_dict: dict = None,
463
+ ):
464
+ if cache_dict is None:
465
+ cache_dict = defaultdict(lambda: None)
466
+ cache0 = cache_dict["cache0"]
467
+ cache1 = cache_dict["cache1"]
468
+ cache2 = cache_dict["cache2"]
469
+ cache3 = cache_dict["cache3"]
470
+ cache4 = cache_dict["cache4"]
471
+ cache5 = cache_dict["cache5"]
472
+ cache6 = cache_dict["cache6"]
473
+
474
+ # feat_erb shape: (b, 1, t, erb_bins)
475
+ e0, new_cache0 = self.spec_conv0.forward(feat_erb, cache=cache0)
476
+ e1, new_cache1 = self.spec_conv1.forward(e0, cache=cache1)
477
+ e2, new_cache2 = self.spec_conv2.forward(e1, cache=cache2)
478
+ e3, new_cache3 = self.spec_conv3.forward(e2, cache=cache3)
479
+ # e0 shape: [b, c, t, erb_bins]
480
+ # e1 shape: [b, c, t, erb_bins // 2]
481
+ # e2 shape: [b, c, t, erb_bins // 4]
482
+ # e3 shape: [b, c, t, erb_bins // 4]
483
+ # e3 shape: [b, 64, t, 32/4=8]
484
+
485
+ # feat_spec, shape: (b, 2, t, df_bins)
486
+ c0, new_cache4 = self.df_conv0.forward(feat_spec, cache=cache4)
487
+ c1, new_cache5 = self.df_conv1.forward(c0, cache=cache5)
488
+ # c0 shape: [b, c, t, df_bins]
489
+ # c1 shape: [b, c, t, df_bins // 2]
490
+ # c1 shape: [b, 64, t, 96/2=48]
491
+
492
+ cemb = c1.permute(0, 2, 3, 1)
493
+ # cemb shape: [b, t, df_bins // 2, c]
494
+ cemb = cemb.flatten(2)
495
+ # cemb shape: [b, t, df_bins // 2 * c]
496
+ # cemb shape: [b, t, 96/2*64=3072]
497
+ cemb = self.df_fc_emb.forward(cemb)
498
+ # cemb shape: [b, t, erb_bins // 4 * c]
499
+ # cemb shape: [b, t, 32/4*64=512]
500
+
501
+ # e3 shape: [b, c, t, erb_bins // 4]
502
+ emb = e3.permute(0, 2, 3, 1)
503
+ # emb shape: [b, t, erb_bins // 4, c]
504
+ emb = emb.flatten(2)
505
+ # emb shape: [b, t, erb_bins // 4 * c]
506
+ # emb shape: [b, t, 32/4*64=512]
507
+
508
+ emb = self.combine(emb, cemb)
509
+ # if concat; emb shape: [b, t, spec_bins // 4 * c * 2]
510
+ # if add; emb shape: [b, t, spec_bins // 4 * c]
511
+
512
+ emb, new_cache6 = self.emb_gru.forward(emb, hx=cache6)
513
+
514
+ # emb shape: [b, t, spec_dim // 4 * c]
515
+ # h shape: [b, 1, spec_dim]
516
+
517
+ lsnr = self.lsnr_fc(emb) * self.lsnr_scale + self.lsnr_offset
518
+ # lsnr shape: [b, t, 1]
519
+
520
+ new_cache_dict = {
521
+ "cache0": new_cache0,
522
+ "cache1": new_cache1,
523
+ "cache2": new_cache2,
524
+ "cache3": new_cache3,
525
+ "cache4": new_cache4,
526
+ "cache5": new_cache5,
527
+ "cache6": new_cache6,
528
+ }
529
+ return e0, e1, e2, e3, emb, c0, lsnr, new_cache_dict
530
+
531
+
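For orientation, the embedding sizes implied by the shape comments in Encoder.forward above (conv_channels=64, erb_bins=32, df_bins=96 are assumed values, not read from this file):

# erb branch : e3 is [b, 64, t, 32 // 4] -> flattened to [b, t, 64 * 8]  = [b, t, 512]
# spec branch: c1 is [b, 64, t, 96 // 2] -> flattened to [b, t, 64 * 48] = [b, t, 3072]
#              df_fc_emb then projects 3072 -> 512
# encoder_combine_op="concat" doubles the embedding to [b, t, 1024] before the squeezed GRU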
532
+ class ErbDecoder(nn.Module):
533
+ def __init__(self, config: DfNet2Config):
534
+ super(ErbDecoder, self).__init__()
535
+
536
+ if config.spec_bins % 8 != 0:
537
+ raise AssertionError("spec_bins should be divisible by 8")
538
+
539
+ self.emb_in_dim = config.conv_channels * config.erb_bins // 4
540
+ self.emb_out_dim = config.conv_channels * config.erb_bins // 4
541
+ self.emb_hidden_dim = config.decoder_emb_hidden_size
542
+
543
+ self.emb_gru = SqueezedGRU_S(
544
+ self.emb_in_dim,
545
+ self.emb_hidden_dim,
546
+ output_size=self.emb_out_dim,
547
+ num_layers=config.decoder_emb_num_layers - 1,
548
+ batch_first=True,
549
+ skip_op=config.decoder_emb_skip_op,
550
+ linear_groups=config.decoder_emb_linear_groups,
551
+ activation_layer="relu",
552
+ )
553
+ self.conv3p = CausalConv2d(
554
+ in_channels=config.conv_channels,
555
+ out_channels=config.conv_channels,
556
+ kernel_size=1,
557
+ bias=False,
558
+ separable=True,
559
+ fstride=1,
560
+ )
561
+ self.convt3 = CausalConv2d(
562
+ in_channels=config.conv_channels,
563
+ out_channels=config.conv_channels,
564
+ kernel_size=config.conv_kernel_size_inner,
565
+ bias=False,
566
+ separable=True,
567
+ fstride=1,
568
+ )
569
+ self.conv2p = CausalConv2d(
570
+ in_channels=config.conv_channels,
571
+ out_channels=config.conv_channels,
572
+ kernel_size=1,
573
+ bias=False,
574
+ separable=True,
575
+ fstride=1,
576
+ )
577
+ self.convt2 = CausalConvTranspose2d(
578
+ in_channels=config.conv_channels,
579
+ out_channels=config.conv_channels,
580
+ kernel_size=config.convt_kernel_size_inner,
581
+ bias=False,
582
+ separable=True,
583
+ fstride=2,
584
+ )
585
+ self.conv1p = CausalConv2d(
586
+ in_channels=config.conv_channels,
587
+ out_channels=config.conv_channels,
588
+ kernel_size=1,
589
+ bias=False,
590
+ separable=True,
591
+ fstride=1,
592
+ )
593
+ self.convt1 = CausalConvTranspose2d(
594
+ in_channels=config.conv_channels,
595
+ out_channels=config.conv_channels,
596
+ kernel_size=config.convt_kernel_size_inner,
597
+ bias=False,
598
+ separable=True,
599
+ fstride=2,
600
+ )
601
+ self.conv0p = CausalConv2d(
602
+ in_channels=config.conv_channels,
603
+ out_channels=config.conv_channels,
604
+ kernel_size=1,
605
+ bias=False,
606
+ separable=True,
607
+ fstride=1,
608
+ )
609
+ self.conv0_out = CausalConv2d(
610
+ in_channels=config.conv_channels,
611
+ out_channels=1,
612
+ kernel_size=config.conv_kernel_size_inner,
613
+ activation_layer="sigmoid",
614
+ bias=False,
615
+ separable=True,
616
+ fstride=1,
617
+ )
618
+
619
+ def forward(self, emb, e3, e2, e1, e0, cache_dict: dict = None) -> Tuple[torch.Tensor, dict]:
620
+ if cache_dict is None:
621
+ cache_dict = defaultdict(lambda: None)
622
+ cache0 = cache_dict["cache0"]
623
+ cache1 = cache_dict["cache1"]
624
+ cache2 = cache_dict["cache2"]
625
+ cache3 = cache_dict["cache3"]
626
+ cache4 = cache_dict["cache4"]
627
+
628
+ # Estimates erb mask
629
+ b, _, t, f8 = e3.shape
630
+
631
+ # emb shape: [batch_size, time_steps, (freq_dim // 4) * conv_channels]
632
+ emb, new_cache0 = self.emb_gru.forward(emb, hx=cache0)
633
+ # emb shape: [batch_size, conv_channels, time_steps, freq_dim // 4]
634
+ emb = emb.view(b, t, f8, -1).permute(0, 3, 1, 2)
635
+
636
+ e3, new_cache1 = self.convt3.forward(self.conv3p(e3)[0] + emb, cache=cache1)
637
+ # e3 shape: [batch_size, conv_channels, time_steps, freq_dim // 4]
638
+ e2, new_cache2 = self.convt2.forward(self.conv2p(e2)[0] + e3, cache=cache2)
639
+ # e2 shape: [batch_size, conv_channels, time_steps, freq_dim // 2]
640
+ e1, new_cache3 = self.convt1.forward(self.conv1p(e1)[0] + e2, cache=cache3)
641
+ # e1 shape: [batch_size, conv_channels, time_steps, freq_dim]
642
+ mask, new_cache4 = self.conv0_out.forward(self.conv0p(e0)[0] + e1, cache=cache4)
643
+ # mask shape: [batch_size, 1, time_steps, freq_dim]
644
+
645
+ new_cache_dict = {
646
+ "cache0": new_cache0,
647
+ "cache1": new_cache1,
648
+ "cache2": new_cache2,
649
+ "cache3": new_cache3,
650
+ "cache4": new_cache4,
651
+ }
652
+ return mask, new_cache_dict
653
+
654
+
655
+ class DfDecoder(nn.Module):
656
+ def __init__(self, config: DfNet2Config):
657
+ super(DfDecoder, self).__init__()
658
+
659
+ self.embedding_input_size = config.conv_channels * config.erb_bins // 4
660
+ self.df_decoder_hidden_size = config.df_decoder_hidden_size
661
+ self.df_num_layers = config.df_num_layers
662
+
663
+ self.df_order = config.df_order
664
+
665
+ self.df_bins = config.df_bins
666
+ self.df_out_ch = config.df_order * 2
667
+
668
+ self.df_convp = CausalConv2d(
669
+ config.conv_channels,
670
+ self.df_out_ch,
671
+ fstride=1,
672
+ kernel_size=(config.df_pathway_kernel_size_t, 1),
673
+ separable=True,
674
+ bias=False,
675
+ )
676
+ self.df_gru = SqueezedGRU_S(
677
+ self.embedding_input_size,
678
+ self.df_decoder_hidden_size,
679
+ num_layers=self.df_num_layers,
680
+ batch_first=True,
681
+ skip_op="none",
682
+ activation_layer="relu",
683
+ )
684
+
685
+ if config.df_gru_skip == "none":
686
+ self.df_skip = None
687
+ elif config.df_gru_skip == "identity":
688
+ if config.embedding_hidden_size != config.df_decoder_hidden_size:
689
+ raise AssertionError("Dimensions do not match")
690
+ self.df_skip = nn.Identity()
691
+ elif config.df_gru_skip == "grouped_linear":
692
+ self.df_skip = GroupedLinear(
693
+ self.embedding_input_size,
694
+ self.df_decoder_hidden_size,
695
+ groups=config.df_decoder_linear_groups
696
+ )
697
+ else:
698
+ raise NotImplementedError()
699
+
700
+ self.df_out: nn.Module
701
+ out_dim = self.df_bins * self.df_out_ch
702
+
703
+ self.df_out = nn.Sequential(
704
+ GroupedLinear(
705
+ input_size=self.df_decoder_hidden_size,
706
+ hidden_size=out_dim,
707
+ groups=config.df_decoder_linear_groups,
708
+ # groups = self.df_bins // 5,
709
+ ),
710
+ nn.Tanh()
711
+ )
712
+ self.df_fc_a = nn.Sequential(
713
+ nn.Linear(self.df_decoder_hidden_size, 1),
714
+ nn.Sigmoid()
715
+ )
716
+
717
+ def forward(self, emb: torch.Tensor, c0: torch.Tensor, cache_dict: dict = None) -> Tuple[torch.Tensor, dict]:
718
+ if cache_dict is None:
719
+ cache_dict = defaultdict(lambda: None)
720
+ cache0 = cache_dict["cache0"]
721
+ cache1 = cache_dict["cache1"]
722
+
723
+ # emb shape: [batch_size, time_steps, df_bins // 4 * channels]
724
+ b, t, _ = emb.shape
725
+ df_coefs, new_cache0 = self.df_gru.forward(emb, hx=cache0)
726
+ if self.df_skip is not None:
727
+ df_coefs = df_coefs + self.df_skip(emb)
728
+ # df_coefs shape: [batch_size, time_steps, df_decoder_hidden_size]
729
+
730
+ # c0 shape: [batch_size, channels, time_steps, df_bins]
731
+ c0, new_cache1 = self.df_convp.forward(c0, cache=cache1)
732
+ # c0 shape: [batch_size, df_order * 2, time_steps, df_bins]
733
+ c0 = c0.permute(0, 2, 3, 1)
734
+ # c0 shape: [batch_size, time_steps, df_bins, df_order * 2]
735
+
736
+ df_coefs = self.df_out(df_coefs) # [B, T, F*O*2], O: df_order
737
+ # df_coefs shape: [batch_size, time_steps, df_bins * df_order * 2]
738
+ df_coefs = df_coefs.view(b, t, self.df_bins, self.df_out_ch)
739
+ # df_coefs shape: [batch_size, time_steps, df_bins, df_order * 2]
740
+ df_coefs = df_coefs + c0
741
+ # df_coefs shape: [batch_size, time_steps, df_bins, df_order * 2]
742
+
743
+ new_cache_dict = {
744
+ "cache0": new_cache0,
745
+ "cache1": new_cache1,
746
+ }
747
+ return df_coefs, new_cache_dict
748
+
749
+
750
+ class DfOutputReshapeMF(nn.Module):
751
+ """Coefficients output reshape for multiframe/MultiFrameModule
752
+
753
+ Reshapes the deep-filter coefficients from [B, T, F, O*2] to the [B, O, T, F, 2] layout required by the multiframe module.
754
+ """
755
+
756
+ def __init__(self, df_order: int, df_bins: int):
757
+ super().__init__()
758
+ self.df_order = df_order
759
+ self.df_bins = df_bins
760
+
761
+ def forward(self, coefs: torch.Tensor) -> torch.Tensor:
762
+ # [B, T, F, O*2] -> [B, O, T, F, 2]
763
+ new_shape = list(coefs.shape)
764
+ new_shape[-1] = -1
765
+ new_shape.append(2)
766
+ coefs = coefs.view(new_shape)
767
+ coefs = coefs.permute(0, 3, 1, 2, 4)
768
+ return coefs
769
+
770
+
771
+ class Mask(nn.Module):
772
+ def __init__(self, use_post_filter: bool = False, eps: float = 1e-12):
773
+ super().__init__()
774
+ self.use_post_filter = use_post_filter
775
+ self.eps = eps
776
+
777
+ def post_filter(self, mask: torch.Tensor, beta: float = 0.02) -> torch.Tensor:
778
+ """
779
+ Post-Filter
780
+
781
+ A Perceptually-Motivated Approach for Low-Complexity, Real-Time Enhancement of Fullband Speech.
782
+ https://arxiv.org/abs/2008.04259
783
+
784
+ :param mask: Real valued mask, typically of shape [B, C, T, F].
785
+ :param beta: Global gain factor.
786
+ :return:
787
+ """
788
+ mask_sin = mask * torch.sin(np.pi * mask / 2)
789
+ mask_pf = (1 + beta) * mask / (1 + beta * mask.div(mask_sin.clamp_min(self.eps)).pow(2))
790
+ return mask_pf
791
+
792
+ def forward(self, spec: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
793
+ # spec shape: [b, 1, t, spec_bins, 2]
794
+
795
+ if not self.training and self.use_post_filter:
796
+ mask = self.post_filter(mask)
797
+
798
+ # mask shape: [b, 1, t, spec_bins]
799
+ mask = mask.unsqueeze(4)
800
+ # mask shape: [b, 1, t, spec_bins, 1]
801
+ return spec * mask
802
+
803
+
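A quick numeric look at the post filter above (toy values, beta at its default): it pushes small mask values further toward zero, i.e. more aggressive suppression in noise-dominated bins, while leaving values near 1 essentially untouched.

import torch

pf = Mask()
m = torch.tensor([[[[0.1, 0.5, 0.9, 1.0]]]])       # [b, c, t, f]
print(pf.post_filter(m, beta=0.02))
# approximately: 0.1 -> 0.06, 0.5 -> 0.49, 0.9 -> 0.90, 1.0 -> 1.00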
804
+ class DeepFiltering(nn.Module):
805
+ def __init__(self,
806
+ df_bins: int,
807
+ df_order: int,
808
+ lookahead: int = 0,
809
+ ):
810
+ super(DeepFiltering, self).__init__()
811
+ self.df_bins = df_bins
812
+ self.df_order = df_order
813
+ self.lookahead = lookahead
814
+
815
+ self.pad = nn.ConstantPad2d((0, 0, df_order - 1 - lookahead, lookahead), 0.0)
816
+
817
+ def forward(self, *args, **kwargs):
818
+ raise AssertionError("use `forward_offline` or `forward_online` instead.")
819
+
820
+ def spec_unfold_offline(self, spec: torch.Tensor) -> torch.Tensor:
821
+ """
822
+ Pads and unfolds the spectrogram according to frame_size.
823
+ :param spec: shape: [b, c, t, f], dtype: torch.complex64
824
+ :return: shape: [b, c, t, f, df_order]
825
+ """
826
+ if self.df_order <= 1:
827
+ return spec.unsqueeze(-1)
828
+
829
+ # spec shape: [b, 1, t, f], dtype: torch.complex64
830
+ spec = self.pad(spec)
831
+ # spec_pad shape: [b, 1, t+df_order-1, f], dtype: torch.complex64
832
+ spec_unfold = spec.unfold(dimension=2, size=self.df_order, step=1)
833
+ # spec_unfold shape: [b, 1, t, f, df_order], dtype: torch.complex64
834
+ return spec_unfold
835
+
836
+ def forward_offline(self,
837
+ spec: torch.Tensor,
838
+ coefs: torch.Tensor,
839
+ ):
840
+ # spec shape: [b, 1, t, spec_bins, 2]
841
+ spec_c = torch.view_as_complex(spec.contiguous())
842
+ # spec_c shape: [b, 1, t, spec_bins]
843
+ spec_u = self.spec_unfold_offline(spec_c)
844
+ # spec_u shape: [b, 1, t, spec_bins, df_order]
845
+ spec_f = spec_u.narrow(dim=-2, start=0, length=self.df_bins)
846
+ # spec_f shape: [b, 1, t, df_bins, df_order]
847
+
848
+ # coefs shape: [b, df_order, t, df_bins, 2]
849
+ coefs = torch.view_as_complex(coefs.contiguous())
850
+ # coefs shape: [b, df_order, t, df_bins]
851
+ coefs = coefs.unsqueeze(dim=1)
852
+ # coefs shape: [b, 1, df_order, t, df_bins]
853
+
854
+ spec_f = self.df_offline(spec_f, coefs)
855
+ # spec_f shape: [b, 1, t, df_bins]
856
+
857
+ spec_f = torch.view_as_real(spec_f)
858
+ # spec_f shape: [b, 1, t, df_bins, 2]
859
+ return spec_f
860
+
861
+ def df_offline(self, spec: torch.Tensor, coefs: torch.Tensor):
862
+ """
863
+ Deep filter implementation using `torch.einsum`. Requires unfolded spectrogram.
864
+ :param spec: [b, 1, t, df_bins, df_order] complex.
865
+ :param coefs: [b, 1, df_order, t, df_bins] complex.
866
+ :return: [b, 1, t, df_bins] complex.
867
+ """
868
+ spec_f = torch.einsum("...tfn,...ntf->...tf", spec, coefs)
869
+ return spec_f
870
+
871
+ def spec_unfold_online(self, spec: torch.Tensor, cache_spec: torch.Tensor = None):
872
+ """
873
+ Pads and unfolds the spectrogram according to frame_size.
874
+ :param spec: shape: [b, c, t, f], dtype: torch.complex64
875
+ :param cache_spec: shape: [b, c, df_order-1, f], dtype: torch.complex64
876
+ :return: shape: [b, c, t, f, df_order]
877
+ """
878
+ if self.df_order <= 1:
879
+ return spec.unsqueeze(-1)
880
+
881
+ if cache_spec is None:
882
+ b, c, _, f = spec.shape
883
+ cache_spec = spec.new_zeros(size=(b, c, self.df_order-1, f))
884
+ spec_pad = torch.concat(tensors=[
885
+ cache_spec, spec
886
+ ], dim=2)
887
+ new_cache_spec = spec_pad[:, :, -(self.df_order-1):, :]
888
+
889
+ # spec_pad shape: [b, 1, t+df_order-1, f], dtype: torch.complex64
890
+ spec_unfold = spec_pad.unfold(dimension=2, size=self.df_order, step=1)
891
+ # spec_unfold shape: [b, 1, t, f, df_order], dtype: torch.complex64
892
+ return spec_unfold, new_cache_spec
893
+
894
+ def forward_online(self,
895
+ spec: torch.Tensor,
896
+ coefs: torch.Tensor,
897
+ cache_dict: dict = None,
898
+ ):
899
+ if cache_dict is None:
900
+ cache_dict = defaultdict(lambda: None)
901
+ cache0 = cache_dict["cache0"]
902
+ cache1 = cache_dict["cache1"]
903
+
904
+ # spec shape: [b, 1, t, spec_bins, 2]
905
+ spec_c = torch.view_as_complex(spec.contiguous())
906
+ # spec_c shape: [b, 1, t, spec_bins]
907
+ spec_u, new_cache0 = self.spec_unfold_online(spec_c, cache_spec=cache0)
908
+ # spec_u shape: [b, 1, t, spec_bins, df_order]
909
+ spec_f = spec_u.narrow(dim=-2, start=0, length=self.df_bins)
910
+ # spec_f shape: [b, 1, t, df_bins, df_order]
911
+
912
+ # coefs shape: [b, df_order, t, df_bins, 2]
913
+ coefs = torch.view_as_complex(coefs.contiguous())
914
+ # coefs shape: [b, df_order, t, df_bins]
915
+ coefs = coefs.unsqueeze(dim=1)
916
+ # coefs shape: [b, 1, df_order, t, df_bins]
917
+
918
+ spec_f, new_cache1 = self.df_online(spec_f, coefs, cache_coefs=cache1)
919
+ # spec_f shape: [b, 1, t, df_bins]
920
+
921
+ spec_f = torch.view_as_real(spec_f)
922
+ # spec_f shape: [b, 1, t, df_bins, 2]
923
+
924
+ new_cache_dict = {
925
+ "cache0": new_cache0,
926
+ "cache1": new_cache1,
927
+ }
928
+ return spec_f, new_cache_dict
929
+
930
+ def df_online(self, spec: torch.Tensor, coefs: torch.Tensor, cache_coefs: torch.Tensor = None) -> Tuple[torch.Tensor, torch.Tensor]:
931
+ """
932
+ Deep filter implementation using `torch.einsum`. Requires unfolded spectrogram.
933
+ :param spec: [b, 1, 1, df_bins, df_order] complex.
934
+ :param coefs: [b, 1, df_order, 1, df_bins] complex.
935
+ :param cache_coefs: [b, 1, df_order, lookahead, df_bins] complex.
936
+ :return: [b, 1, 1, df_bins] complex.
937
+ """
938
+
939
+ if cache_coefs is None:
940
+ b, c, _, _, f = coefs.shape
941
+ cache_coefs = coefs.new_zeros(size=(b, c, self.df_order, self.lookahead, f))
942
+ coefs_pad = torch.concat(tensors=[
943
+ cache_coefs, coefs
944
+ ], dim=3)
945
+
946
+ # coefs_pad shape: [b, 1, df_order, 1+lookahead, df_bins], torch.complex64.
947
+ coefs = coefs_pad[:, :, :, :-self.lookahead, :]
948
+ # coefs shape: [b, 1, df_order, 1, df_bins], torch.complex64.
949
+ new_cache_coefs = coefs_pad[:, :, :, -self.lookahead:, :]
950
+ # new_cache_coefs shape: [b, 1, df_order, lookahead, df_bins], torch.complex64.
951
+ spec_f = torch.einsum("...tfn,...ntf->...tf", spec, coefs)
952
+ return spec_f, new_cache_coefs
953
+
954
+
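The einsum used by df_offline/df_online is a per-bin complex FIR filter of length df_order over the unfolded frames; a small equivalence check with toy shapes:

import torch

b, t, f, order = 1, 4, 6, 3
spec_u = torch.randn(b, 1, t, f, order, dtype=torch.complex64)    # unfolded spectrogram
coefs = torch.randn(b, 1, order, t, f, dtype=torch.complex64)     # filter taps

out = torch.einsum("...tfn,...ntf->...tf", spec_u, coefs)
ref = sum(spec_u[..., n] * coefs[:, :, n] for n in range(order))  # explicit sum over taps
assert torch.allclose(out, ref)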
955
+ class DfNet2(nn.Module):
956
+ def __init__(self, config: DfNet2Config):
957
+ super(DfNet2, self).__init__()
958
+ self.config = config
959
+ self.eps = 1e-12
960
+
961
+ self.freq_bins = self.config.nfft // 2 + 1
962
+
963
+ self.nfft = config.nfft
964
+ self.win_size = config.win_size
965
+ self.hop_size = config.hop_size
966
+ self.win_type = config.win_type
967
+
968
+ self.erb_bands = ErbBands(
969
+ sample_rate=config.sample_rate,
970
+ nfft=config.nfft,
971
+ erb_bins=config.erb_bins,
972
+ min_freq_bins_for_erb=config.min_freq_bins_for_erb,
973
+ )
974
+
975
+ self.stft = ConvSTFT(
976
+ nfft=config.nfft,
977
+ win_size=config.win_size,
978
+ hop_size=config.hop_size,
979
+ win_type=config.win_type,
980
+ power=None,
981
+ requires_grad=False
982
+ )
983
+ self.istft = ConviSTFT(
984
+ nfft=config.nfft,
985
+ win_size=config.win_size,
986
+ hop_size=config.hop_size,
987
+ win_type=config.win_type,
988
+ requires_grad=False
989
+ )
990
+
991
+ self.encoder = Encoder(config)
992
+ self.erb_decoder = ErbDecoder(config)
993
+
994
+ self.df_decoder = DfDecoder(config)
995
+ self.df_out_transform = DfOutputReshapeMF(config.df_order, config.df_bins)
996
+ self.df_op = DeepFiltering(
997
+ df_bins=config.df_bins,
998
+ df_order=config.df_order,
999
+ lookahead=config.df_lookahead,
1000
+ )
1001
+
1002
+ self.mask = Mask(use_post_filter=config.use_post_filter)
1003
+
1004
+ self.lsnr_fn = LocalSnrTarget(
1005
+ sample_rate=config.sample_rate,
1006
+ nfft=config.nfft,
1007
+ win_size=config.win_size,
1008
+ hop_size=config.hop_size,
1009
+ n_frame=config.n_frame,
1010
+ min_local_snr=config.min_local_snr,
1011
+ max_local_snr=config.max_local_snr,
1012
+ db=True,
1013
+ )
1014
+
1015
+ def signal_prepare(self, signal: torch.Tensor) -> torch.Tensor:
1016
+ if signal.dim() == 2:
1017
+ signal = torch.unsqueeze(signal, dim=1)
1018
+ _, _, n_samples = signal.shape
1019
+ remainder = (n_samples - self.win_size) % self.hop_size
1020
+ if remainder > 0:
1021
+ n_samples_pad = self.hop_size - remainder
1022
+ signal = F.pad(signal, pad=(0, n_samples_pad), mode="constant", value=0)
1023
+ return signal
1024
+
1025
+ def feature_prepare(self, signal: torch.Tensor):
1026
+ # noisy shape: [b, num_samples_pad]
1027
+ spec_cmp = self.stft.forward(signal)
1028
+ # spec_complex shape: [b, f, t], torch.complex64
1029
+ spec_cmp = torch.transpose(spec_cmp, dim0=1, dim1=2)
1030
+ # spec_complex shape: [b, t, f], torch.complex64
1031
+ spec_cmp_real = torch.view_as_real(spec_cmp)
1032
+ # spec_cmp_real shape: [b, t, f, 2]
1033
+ spec_mag = torch.abs(spec_cmp)
1034
+ spec_pow = torch.square(spec_mag)
1035
+ # shape: [b, t, f]
1036
+
1037
+ spec = torch.unsqueeze(spec_cmp_real, dim=1)
1038
+ # spec shape: [b, 1, t, f, 2]
1039
+
1040
+ feat_erb = self.erb_bands.erb_scale(spec_pow, db=True)
1041
+ # feat_erb shape: [b, t, erb_bins]
1042
+ feat_erb = torch.unsqueeze(feat_erb, dim=1)
1043
+ # feat_erb shape: [b, 1, t, erb_bins]
1044
+
1045
+ feat_spec = spec_cmp_real.permute(0, 3, 1, 2)
1046
+ # feat_spec shape: [b, 2, t, f]
1047
+ feat_spec = feat_spec[..., :self.df_decoder.df_bins]
1048
+ # feat_spec shape: [b, 2, t, df_bins]
1049
+
1050
+ return spec, feat_erb, feat_spec
1051
+
1052
+ def forward(self,
1053
+ noisy: torch.Tensor,
1054
+ ):
1055
+ """
1056
+ :param noisy:
1057
+ :return:
1058
+ est_spec: shape: [b, 257*2, t]
1059
+ est_wav: shape: [b, num_samples]
1060
+ est_mask: shape: [b, 257, t]
1061
+ lsnr: shape: [b, 1, t]
1062
+ """
1063
+ n_samples = noisy.shape[-1]
1064
+ noisy = self.signal_prepare(noisy)
1065
+
1066
+ spec, feat_erb, feat_spec = self.feature_prepare(noisy)
1067
+
1068
+ e0, e1, e2, e3, emb, c0, lsnr, _ = self.encoder.forward(feat_erb, feat_spec)
1069
+
1070
+ mask, _ = self.erb_decoder.forward(emb, e3, e2, e1, e0)
1071
+ # mask shape: [b, 1, t, erb_bins]
1072
+ mask = self.erb_bands.erb_scale_inv(mask)
1073
+ # mask shape: [b, 1, t, f]
1074
+ if torch.any(mask > 1) or torch.any(mask < 0):
1075
+ raise AssertionError("mask values must lie in [0, 1]")
1076
+
1077
+ spec_m = self.mask.forward(spec, mask)
1078
+ # spec_m shape: [b, 1, t, f, 2]
1079
+ spec_m = spec_m[:, :, :, :self.config.spec_bins, :]
1080
+ # spec_m shape: [b, 1, t, spec_bins, 2]
1081
+
1082
+ # lsnr shape: [b, t, 1]
1083
+ lsnr = torch.transpose(lsnr, dim0=2, dim1=1)
1084
+ # lsnr shape: [b, 1, t]
1085
+
1086
+ df_coefs, _ = self.df_decoder.forward(emb, c0)
1087
+ df_coefs = self.df_out_transform(df_coefs)
1088
+ # df_coefs shape: [b, df_order, t, df_bins, 2]
1089
+
1090
+ spec_ = spec[:, :, :, :self.config.spec_bins, :]
1091
+ # spec shape: [b, 1, t, spec_bins, 2]
1092
+ spec_f = self.df_op.forward_offline(spec_, df_coefs)
1093
+ # spec_f shape: [b, 1, t, df_bins, 2], torch.float32
1094
+
1095
+ spec_e = torch.concat(tensors=[
1096
+ spec_f, spec_m[..., self.df_decoder.df_bins:, :]
1097
+ ], dim=3)
1098
+
1099
+ spec_e = torch.squeeze(spec_e, dim=1)
1100
+ spec_e = spec_e.permute(0, 2, 1, 3)
1101
+ # spec_e shape: [b, spec_bins, t, 2]
1102
+
1103
+ # spec_e shape: [b, spec_bins, t, 2]
1104
+ est_spec = torch.view_as_complex(spec_e.contiguous())
1105
+ # est_spec shape: [b, spec_bins, t], torch.complex64
1106
+ est_spec = torch.concat(tensors=[est_spec, est_spec[:, -1:, :]], dim=1)
1107
+ # est_spec shape: [b, f, t], torch.complex64
1108
+
1109
+ est_wav = self.istft.forward(est_spec)
1110
+ est_wav = est_wav[:, :, :n_samples]
1111
+ # est_wav shape: [b, 1, n_samples]
1112
+
1113
+ est_mask = torch.squeeze(mask, dim=1)
1114
+ est_mask = est_mask.permute(0, 2, 1)
1115
+ # est_mask shape: [b, f, t]
1116
+
1117
+ return est_spec, est_wav, est_mask, lsnr
1118
+
1119
+ def forward_chunk_by_chunk(self,
1120
+ noisy: torch.Tensor,
1121
+ ):
1122
+ noisy = self.signal_prepare(noisy)
1123
+ b, _, _ = noisy.shape
1124
+ noisy = torch.concat(tensors=[
1125
+ noisy, noisy.new_zeros(size=(b, 1, (self.config.df_lookahead+1)*self.hop_size))
1126
+ ], dim=2)
1127
+ b, _, num_samples = noisy.shape
1128
+
1129
+ t = (num_samples - self.win_size) // self.hop_size + 1
1130
+
1131
+ cache_dict0 = None
1132
+ cache_dict1 = None
1133
+ cache_dict2 = None
1134
+ cache_dict3 = None
1135
+ cache_dict4 = None
1136
+ cache_dict5 = None
1137
+
1138
+ waveform_list = list()
1139
+ for i in range(int(t)):
1140
+ begin = i * self.hop_size
1141
+ end = begin + self.win_size
1142
+ sub_noisy = noisy[:, :, begin: end]
1143
+
1144
+ spec, feat_erb, feat_spec = self.feature_prepare(sub_noisy)
1145
+ # spec shape: [b, 1, t, f, 2]
1146
+ # feat_erb shape: [b, 1, t, erb_bins]
1147
+ # feat_spec shape: [b, 2, t, df_bins]
1148
+
1149
+ e0, e1, e2, e3, emb, c0, lsnr, cache_dict0 = self.encoder.forward(feat_erb, feat_spec, cache_dict=cache_dict0)
1150
+
1151
+ mask, cache_dict1 = self.erb_decoder.forward(emb, e3, e2, e1, e0, cache_dict=cache_dict1)
1152
+ # mask shape: [b, 1, t, erb_bins]
1153
+ mask = self.erb_bands.erb_scale_inv(mask)
1154
+ # mask shape: [b, 1, t, f]
1155
+
1156
+ spec_m = self.mask.forward(spec, mask)
1157
+ # spec_m shape: [b, 1, t, f, 2]
1158
+ spec_m = spec_m[:, :, :, :self.config.spec_bins, :]
1159
+ # spec_m shape: [b, 1, t, spec_bins, 2]
1160
+
1161
+ # lsnr shape: [b, t, 1]
1162
+ lsnr = torch.transpose(lsnr, dim0=2, dim1=1)
1163
+ # lsnr shape: [b, 1, t]
1164
+
1165
+ df_coefs, cache_dict2 = self.df_decoder.forward(emb, c0, cache_dict=cache_dict2)
1166
+ df_coefs = self.df_out_transform(df_coefs)
1167
+ # df_coefs shape: [b, df_order, t, df_bins, 2]
1168
+
1169
+ spec_ = spec[:, :, :, :self.config.spec_bins, :]
1170
+ # spec shape: [b, 1, t, spec_bins, 2]
1171
+ spec_f, cache_dict3 = self.df_op.forward_online(spec_, df_coefs, cache_dict=cache_dict3)
1172
+ # spec_f shape: [b, 1, t, df_bins, 2], torch.float32
1173
+
1174
+ spec_e = torch.concat(tensors=[
1175
+ spec_f, spec_m[..., self.df_decoder.df_bins:, :]
1176
+ ], dim=3)
1177
+
1178
+ spec_e, cache_dict4 = self.spec_e_m_combine_online(spec_f, spec_m, cache_dict=cache_dict4)
1179
+
1180
+ spec_e = torch.squeeze(spec_e, dim=1)
1181
+ spec_e = spec_e.permute(0, 2, 1, 3)
1182
+ # spec_e shape: [b, spec_bins, t, 2]
1183
+
1184
+ # spec_e shape: [b, spec_bins, t, 2]
1185
+ est_spec = torch.view_as_complex(spec_e.contiguous())
1186
+ # est_spec shape: [b, spec_bins, t], torch.complex64
1187
+ est_spec = torch.concat(tensors=[est_spec, est_spec[:, -1:, :]], dim=1)
1188
+ # est_spec shape: [b, f, t], torch.complex64
1189
+
1190
+ est_wav, cache_dict5 = self.istft.forward_chunk(est_spec, cache_dict=cache_dict5)
1191
+ # est_wav shape: [b, 1, hop_size]
1192
+
1193
+ waveform_list.append(est_wav)
1194
+
1195
+ waveform = torch.concat(tensors=waveform_list, dim=-1)
1196
+ # waveform shape: [b, 1, n]
1197
+ return waveform
1198
+
1199
+ def spec_e_m_combine_online(self, spec_f: torch.Tensor, spec_m: torch.Tensor, cache_dict: dict = None):
1200
+ """
1201
+ :param spec_f: shape: [b, 1, t, df_bins, 2], torch.float32
1202
+ :param spec_m: shape: [b, 1, t, spec_bins, 2]
1203
+ :param cache_dict:
1204
+ :return:
1205
+ """
1206
+ if cache_dict is None:
1207
+ cache_dict = defaultdict(lambda: None)
1208
+ cache_spec_m = cache_dict["cache_spec_m"]
1209
+
1210
+ if cache_spec_m is None:
1211
+ b, c, t, f, _ = spec_m.shape
1212
+ cache_spec_m = spec_m.new_zeros(size=(b, c, self.config.df_lookahead, f, 2))
1213
+ # cache0 shape: [b, 1, lookahead, f, 2]
1214
+ spec_m_cat = torch.concat(tensors=[
1215
+ cache_spec_m, spec_m,
1216
+ ], dim=2)
1217
+
1218
+ spec_m = spec_m_cat[:, :, :-self.config.df_lookahead, :, :]
1219
+ new_cache_spec_m = spec_m_cat[:, :, -self.config.df_lookahead:, :, :]
1220
+
1221
+ spec_e = torch.concat(tensors=[
1222
+ spec_f, spec_m[..., self.df_decoder.df_bins:, :]
1223
+ ], dim=3)
1224
+
1225
+ new_cache_dict = {
1226
+ "cache_spec_m": new_cache_spec_m,
1227
+ }
1228
+ return spec_e, new_cache_dict
1229
+
1230
+ def mask_loss_fn(self, est_mask: torch.Tensor, clean: torch.Tensor, noisy: torch.Tensor):
1231
+ """
1232
+ :param est_mask: torch.Tensor, shape: [b, 257, t]
1233
+ :param clean:
1234
+ :param noisy:
1235
+ :return:
1236
+ """
1237
+ if noisy.shape != clean.shape:
1238
+ raise AssertionError("Input signals must have the same shape")
1239
+ noise = noisy - clean
1240
+
1241
+ clean = self.signal_prepare(clean)
1242
+ noise = self.signal_prepare(noise)
1243
+
1244
+ stft_clean = self.stft.forward(clean)
1245
+ mag_clean = torch.abs(stft_clean)
1246
+
1247
+ stft_noise = self.stft.forward(noise)
1248
+ mag_noise = torch.abs(stft_noise)
1249
+
1250
+ gth_irm_mask = (mag_clean / (mag_clean + mag_noise + self.eps)).clamp(0, 1)
1251
+
1252
+ loss = F.l1_loss(gth_irm_mask, est_mask, reduction="mean")
1253
+
1254
+ return loss
1255
+
1256
+ def lsnr_loss_fn(self, lsnr: torch.Tensor, clean: torch.Tensor, noisy: torch.Tensor):
1257
+ if noisy.shape != clean.shape:
1258
+ raise AssertionError("Input signals must have the same shape")
1259
+ noise = noisy - clean
1260
+
1261
+ clean = self.signal_prepare(clean)
1262
+ noise = self.signal_prepare(noise)
1263
+
1264
+ stft_clean = self.stft.forward(clean)
1265
+ stft_noise = self.stft.forward(noise)
1266
+ # shape: [b, f, t]
1267
+ stft_clean = torch.transpose(stft_clean, dim0=1, dim1=2)
1268
+ stft_noise = torch.transpose(stft_noise, dim0=1, dim1=2)
1269
+ # shape: [b, t, f]
1270
+ stft_clean = torch.unsqueeze(stft_clean, dim=1)
1271
+ stft_noise = torch.unsqueeze(stft_noise, dim=1)
1272
+ # shape: [b, 1, t, f]
1273
+
1274
+ # lsnr shape: [b, 1, t]
1275
+ lsnr = lsnr.squeeze(1)
1276
+ # lsnr shape: [b, t]
1277
+
1278
+ lsnr_gth = self.lsnr_fn.forward(stft_clean, stft_noise)
1279
+ # lsnr_gth shape: [b, t]
1280
+
1281
+ loss = F.mse_loss(lsnr, lsnr_gth)
1282
+ return loss
1283
+
1284
+
1285
+ class DfNet2PretrainedModel(DfNet2):
1286
+ def __init__(self,
1287
+ config: DfNet2Config,
1288
+ ):
1289
+ super(DfNet2PretrainedModel, self).__init__(
1290
+ config=config,
1291
+ )
1292
+
1293
+ @classmethod
1294
+ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
1295
+ config = DfNet2Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
1296
+
1297
+ model = cls(config)
1298
+
1299
+ if os.path.isdir(pretrained_model_name_or_path):
1300
+ ckpt_file = os.path.join(pretrained_model_name_or_path, MODEL_FILE)
1301
+ else:
1302
+ ckpt_file = pretrained_model_name_or_path
1303
+
1304
+ with open(ckpt_file, "rb") as f:
1305
+ state_dict = torch.load(f, map_location="cpu", weights_only=True)
1306
+ model.load_state_dict(state_dict, strict=True)
1307
+ return model
1308
+
1309
+ def save_pretrained(self,
1310
+ save_directory: Union[str, os.PathLike],
1311
+ state_dict: Optional[dict] = None,
1312
+ ):
1313
+
1314
+ model = self
1315
+
1316
+ if state_dict is None:
1317
+ state_dict = model.state_dict()
1318
+
1319
+ os.makedirs(save_directory, exist_ok=True)
1320
+
1321
+ # save state dict
1322
+ model_file = os.path.join(save_directory, MODEL_FILE)
1323
+ torch.save(state_dict, model_file)
1324
+
1325
+ # save config
1326
+ config_file = os.path.join(save_directory, CONFIG_FILE)
1327
+ self.config.to_yaml_file(config_file)
1328
+ return save_directory
1329
+
1330
+
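A hedged round-trip sketch for the pretrained wrapper above ("trained_models/dfnet2-demo" is a hypothetical path; this assumes DfNet2Config.from_pretrained can read the yaml written by to_yaml_file, as the methods suggest):

model = DfNet2PretrainedModel(config=DfNet2Config())
save_dir = model.save_pretrained("trained_models/dfnet2-demo")
restored = DfNet2PretrainedModel.from_pretrained(save_dir)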
1331
+ def main():
1332
+
1333
+ config = DfNet2Config()
1334
+ model = DfNet2PretrainedModel(config=config)
1335
+ model.eval()
1336
+
1337
+ noisy = torch.randn(size=(1, 16000), dtype=torch.float32)
1338
+
1339
+ est_spec, est_wav, est_mask, lsnr = model.forward(noisy)
1340
+ # print(f"est_spec.shape: {est_spec.shape}")
1341
+ # print(f"est_wav.shape: {est_wav.shape}")
1342
+ # print(f"est_mask.shape: {est_mask.shape}")
1343
+ # print(f"lsnr.shape: {lsnr.shape}")
1344
+
1345
+ waveform = est_wav
1346
+ print(f"waveform.shape: {waveform.shape}, waveform.dtype: {waveform.dtype}")
1347
+ print(waveform[:, :, 300: 302])
1348
+ print(waveform[:, :, 15680: 15682])
1349
+ print(waveform[:, :, 15760: 15762])
1350
+ print(waveform[:, :, 15840: 15842])
1351
+
1352
+ waveform = model.forward_chunk_by_chunk(noisy)
1353
+ waveform = waveform[:, :, (config.df_lookahead*config.hop_size):]
1354
+ print(f"waveform.shape: {waveform.shape}, waveform.dtype: {waveform.dtype}")
1355
+ print(waveform[:, :, 300: 302])
1356
+ print(waveform[:, :, 15680: 15682])
1357
+ print(waveform[:, :, 15760: 15762])
1358
+ print(waveform[:, :, 15840: 15842])
1359
+
1360
+ return
1361
+
1362
+
1363
+ if __name__ == "__main__":
1364
+ main()
toolbox/torchaudio/models/dfnet2/yaml/config.yaml ADDED
@@ -0,0 +1,72 @@
1
+ model_name: "dfnet2"
2
+
3
+ # spec
4
+ sample_rate: 8000
5
+ nfft: 512
6
+ win_size: 200
7
+ hop_size: 80
8
+
9
+ spec_bins: 256
10
+
11
+ # model
12
+ conv_channels: 64
13
+ conv_kernel_size_input:
14
+ - 3
15
+ - 3
16
+ conv_kernel_size_inner:
17
+ - 1
18
+ - 3
19
+ convt_kernel_size_inner:
20
+ - 1
21
+ - 3
22
+
23
+ embedding_hidden_size: 256
24
+ encoder_combine_op: "concat"
25
+
26
+ encoder_emb_skip_op: "none"
27
+ encoder_emb_linear_groups: 16
28
+ encoder_emb_hidden_size: 256
29
+
30
+ encoder_linear_groups: 32
31
+
32
+ decoder_emb_num_layers: 3
33
+ decoder_emb_skip_op: "none"
34
+ decoder_emb_linear_groups: 16
35
+ decoder_emb_hidden_size: 256
36
+
37
+ df_decoder_hidden_size: 256
38
+ df_num_layers: 2
39
+ df_order: 5
40
+ df_bins: 96
41
+ df_gru_skip: "grouped_linear"
42
+ df_decoder_linear_groups: 16
43
+ df_pathway_kernel_size_t: 5
44
+ df_lookahead: 2
45
+
46
+ # lsnr
47
+ n_frame: 3
48
+ lsnr_max: 30
49
+ lsnr_min: -15
50
+ norm_tau: 1.
51
+
52
+ # data
53
+ min_snr_db: -10
54
+ max_snr_db: 20
55
+
56
+ # train
57
+ lr: 0.001
58
+ lr_scheduler: "CosineAnnealingLR"
59
+ lr_scheduler_kwargs:
60
+ T_max: 250000
61
+ eta_min: 0.0001
62
+
63
+ max_epochs: 100
64
+ clip_grad_norm: 10.0
65
+ seed: 1234
66
+
67
+ num_workers: 8
68
+ batch_size: 64
69
+ eval_steps: 10000
70
+
71
+ # runtime
72
+ use_post_filter: true
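For reference, the frame timing implied by the settings above (8000 Hz sample rate, win_size 200, hop_size 80, df_lookahead 2): 25 ms analysis windows with a 10 ms hop, and roughly 20 ms of extra algorithmic latency in the streaming path, assuming one hop per chunk as in forward_chunk_by_chunk.

sample_rate, win_size, hop_size, df_lookahead = 8000, 200, 80, 2
print(win_size / sample_rate * 1000)                    # 25.0 ms window
print(hop_size / sample_rate * 1000)                    # 10.0 ms hop
print(df_lookahead * hop_size / sample_rate * 1000)     # 20.0 ms lookahead latency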
toolbox/torchaudio/models/lstm/modeling_lstm.py CHANGED
@@ -238,14 +238,13 @@ def main():
238
  print(waveform[:, :, 300: 302])
239
 
240
  # 2
241
- waveform_cache = None
242
- coff_cache = None
243
  waveform = torch.zeros(size=(b, 1, num_samples), dtype=torch.float32)
244
  for i in range(int(t)):
245
  sub_spec = spec[:, :, i:i+1]
246
  begin = i * config.hop_size
247
  end = begin + config.win_size - config.hop_size
248
- sub_waveform, waveform_cache, coff_cache = model.istft.forward_chunk(sub_spec, waveform_cache, coff_cache)
249
  # end = begin + config.win_size
250
  # sub_waveform = model.istft.forward(sub_spec)
251
 
 
238
  print(waveform[:, :, 300: 302])
239
 
240
  # 2
241
+ cache_dict = None
 
242
  waveform = torch.zeros(size=(b, 1, num_samples), dtype=torch.float32)
243
  for i in range(int(t)):
244
  sub_spec = spec[:, :, i:i+1]
245
  begin = i * config.hop_size
246
  end = begin + config.win_size - config.hop_size
247
+ sub_waveform, cache_dict = model.istft.forward_chunk(sub_spec, cache_dict=cache_dict)
248
  # end = begin + config.win_size
249
  # sub_waveform = model.istft.forward(sub_spec)
250
 
toolbox/torchaudio/models/rnnoise/modeling_rnnoise.py CHANGED
@@ -232,8 +232,7 @@ class RNNoise(nn.Module):
232
  waveform = torch.zeros(size=(b, 1, 0), dtype=torch.float32)
233
 
234
  states = None
235
- waveform_cache = None
236
- coff_cache = None
237
 
238
  cache_list = list()
239
  for i in range(int(t)):
@@ -274,7 +273,7 @@ class RNNoise(nn.Module):
274
  mask = self.erb_bands.erb_scale_inv(mask_erb)
275
  mask = torch.transpose(mask, dim0=1, dim1=2)
276
  stft_denoise = self.do_mask(mag_noisy, pha_noisy, mask)
277
- sub_waveform, waveform_cache, coff_cache = self.istft.forward_chunk(stft_denoise, waveform_cache, coff_cache)
278
  waveform = torch.concat(tensors=[waveform, sub_waveform], dim=-1)
279
 
280
  return waveform
 
232
  waveform = torch.zeros(size=(b, 1, 0), dtype=torch.float32)
233
 
234
  states = None
235
+ cache_dict = None
 
236
 
237
  cache_list = list()
238
  for i in range(int(t)):
 
273
  mask = self.erb_bands.erb_scale_inv(mask_erb)
274
  mask = torch.transpose(mask, dim0=1, dim1=2)
275
  stft_denoise = self.do_mask(mag_noisy, pha_noisy, mask)
276
+ sub_waveform, cache_dict = self.istft.forward_chunk(stft_denoise, cache_dict=cache_dict)
277
  waveform = torch.concat(tensors=[waveform, sub_waveform], dim=-1)
278
 
279
  return waveform
toolbox/torchaudio/modules/conv_stft.py CHANGED
@@ -3,6 +3,7 @@
3
  """
4
  https://github.com/modelscope/modelscope/blob/master/modelscope/models/audio/ans/conv_stft.py
5
  """
 
6
  import numpy as np
7
  import torch
8
  import torch.nn as nn
@@ -144,15 +145,20 @@ class ConviSTFT(nn.Module):
144
  @torch.no_grad()
145
  def forward_chunk(self,
146
  spec: torch.Tensor,
147
- waveform_cache: torch.Tensor = None,
148
- coff_cache: torch.Tensor = None,
149
  ):
150
  """
151
  :param spec: shape: [b, f, t]
152
- :param waveform_cache: shape: [b, 1, win_size - hop_size]
153
- :param coff_cache: shape: [b, 1, win_size - hop_size]
 
154
  :return:
155
  """
 
 
 
 
 
156
  spec = torch.view_as_real(spec)
157
  matrix = torch.concat(tensors=[spec[..., 0], spec[..., 1]], dim=1)
158
 
@@ -174,7 +180,12 @@ class ConviSTFT(nn.Module):
174
  new_coff_cache = coff_current[:, :, self.hop_size:]
175
 
176
  waveform_output = waveform_output / (coff_output + 1e-8)
177
- return waveform_output, new_waveform_cache, new_coff_cache
 
 
 
 
 
178
 
179
 
180
  def main():
@@ -238,15 +249,14 @@ def main2():
238
  print(f"waveform.shape: {waveform.shape}, waveform.dtype: {waveform.dtype}")
239
  print(waveform[:, :, 300: 302])
240
 
241
- waveform_cache = None
242
- coff_cache = None
243
  waveform = torch.zeros(size=(b, 1, num_samples), dtype=torch.float32)
244
  for i in range(int(t)):
245
  sub_spec = spec[:, :, i:i+1]
246
  begin = i * hop_size
247
 
248
  end = begin + win_size - hop_size
249
- sub_waveform, waveform_cache, coff_cache = istft.forward_chunk(sub_spec, waveform_cache, coff_cache)
250
  # end = begin + win_size
251
  # sub_waveform = istft.forward(sub_spec)
252
 
 
3
  """
4
  https://github.com/modelscope/modelscope/blob/master/modelscope/models/audio/ans/conv_stft.py
5
  """
6
+ from collections import defaultdict
7
  import numpy as np
8
  import torch
9
  import torch.nn as nn
 
145
  @torch.no_grad()
146
  def forward_chunk(self,
147
  spec: torch.Tensor,
148
+ cache_dict: dict = None
 
149
  ):
150
  """
151
  :param spec: shape: [b, f, t]
152
+ :param cache_dict: dict,
153
+ waveform_cache shape: [b, 1, win_size - hop_size]
154
+ coff_cache shape: [b, 1, win_size - hop_size]
155
  :return:
156
  """
157
+ if cache_dict is None:
158
+ cache_dict = defaultdict(lambda: None)
159
+ waveform_cache = cache_dict["waveform_cache"]
160
+ coff_cache = cache_dict["coff_cache"]
161
+
162
  spec = torch.view_as_real(spec)
163
  matrix = torch.concat(tensors=[spec[..., 0], spec[..., 1]], dim=1)
164
 
 
180
  new_coff_cache = coff_current[:, :, self.hop_size:]
181
 
182
  waveform_output = waveform_output / (coff_output + 1e-8)
183
+
184
+ new_cache_dict = {
185
+ "waveform_cache": new_waveform_cache,
186
+ "coff_cache": new_coff_cache,
187
+ }
188
+ return waveform_output, new_cache_dict
189
 
190
 
191
  def main():
 
249
  print(f"waveform.shape: {waveform.shape}, waveform.dtype: {waveform.dtype}")
250
  print(waveform[:, :, 300: 302])
251
 
252
+ cache_dict = None
 
253
  waveform = torch.zeros(size=(b, 1, num_samples), dtype=torch.float32)
254
  for i in range(int(t)):
255
  sub_spec = spec[:, :, i:i+1]
256
  begin = i * hop_size
257
 
258
  end = begin + win_size - hop_size
259
+ sub_waveform, cache_dict = istft.forward_chunk(sub_spec, cache_dict=cache_dict)
260
  # end = begin + win_size
261
  # sub_waveform = istft.forward(sub_spec)
262
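A minimal end-to-end sketch of the refactored cache interface: the first call passes cache_dict=None and every later call passes the dict returned by the previous one (win_type="hann" is an assumed value, not taken from this diff).

import torch

stft = ConvSTFT(nfft=512, win_size=200, hop_size=80, win_type="hann", power=None, requires_grad=False)
istft = ConviSTFT(nfft=512, win_size=200, hop_size=80, win_type="hann", requires_grad=False)

signal = torch.randn(1, 1, 16000)
spec = stft.forward(signal)                     # [b, f, t], torch.complex64
cache_dict = None
chunks = []
for i in range(spec.shape[-1]):
    sub_waveform, cache_dict = istft.forward_chunk(spec[:, :, i:i + 1], cache_dict=cache_dict)
    chunks.append(sub_waveform)
waveform = torch.concat(chunks, dim=-1)         # [b, 1, n]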