Spaces:

qgyd2021
/

cc_denoise

Running

App Files Files Community

HoneyTian committed on Apr 18

Commit

cba47e4

1 Parent(s): 1b032b9

add frcrn model

Browse files

Files changed (13) hide show

examples/conv_tasnet/step_2_train_model.py +3 -2
examples/frcrn/run.sh +154 -0
examples/frcrn/step_1_prepare_data.py +162 -0
examples/frcrn/step_2_train_model.py +436 -0
examples/frcrn/yaml/config.yaml +24 -0
toolbox/torchaudio/losses/irm.py +111 -0
toolbox/torchaudio/losses/spectral.py +72 -1
toolbox/torchaudio/models/frcrn/complex_nn.py +258 -0
toolbox/torchaudio/models/frcrn/configuration_frcrn.py +67 -0
toolbox/torchaudio/models/frcrn/conv_stft.py +147 -0
toolbox/torchaudio/models/frcrn/modeling_frcrn.py +317 -2
toolbox/torchaudio/models/frcrn/unet.py +359 -0
toolbox/torchaudio/models/frcrn/uni_deep_fsmn.py +71 -0

examples/conv_tasnet/step_2_train_model.py CHANGED Viewed

@@ -300,7 +300,7 @@ def main():
             # loss = 1.0 * ae_loss + 0.8 * neg_si_snr_loss + 0.7 * mr_stft_loss + 0.5 * neg_stoi_loss
             # loss = 2.0 * mr_stft_loss + 0.8 * ae_loss + 0.7 * neg_si_snr_loss + 0.5 * neg_stoi_loss
             # loss = 1.0 * ae_loss + 0.8 * neg_si_snr_loss + 0.7 * mr_stft_loss + 0.5 * neg_stoi_loss + 0.5 * pesq_loss
-            loss = 0.2 * ae_loss + 0.2 * neg_si_snr_loss + 1.0 * mr_stft_loss + 0.3 * neg_stoi_loss + 0.5 * pesq_loss
             if torch.any(torch.isnan(loss)) or torch.any(torch.isinf(loss)):
                 logger.info(f"find nan or inf in loss.")
                 continue
@@ -381,7 +381,8 @@ def main():
                         # loss = 1.0 * ae_loss + 0.8 * neg_si_snr_loss + 0.5 * mr_stft_loss + 0.3 * neg_stoi_loss
                         # loss = 1.0 * ae_loss + 0.8 * neg_si_snr_loss + 0.7 * mr_stft_loss + 0.5 * neg_stoi_loss
                         # loss = 2.0 * mr_stft_loss + 0.8 * ae_loss + 0.7 * neg_si_snr_loss + 0.5 * neg_stoi_loss
-                        loss = 1.0 * ae_loss + 0.8 * neg_si_snr_loss + 0.7 * mr_stft_loss + 0.5 * neg_stoi_loss + 0.5 * pesq_loss
                         if torch.any(torch.isnan(loss)) or torch.any(torch.isinf(loss)):
                             logger.info(f"find nan or inf in loss.")
                             continue

             # loss = 1.0 * ae_loss + 0.8 * neg_si_snr_loss + 0.7 * mr_stft_loss + 0.5 * neg_stoi_loss
             # loss = 2.0 * mr_stft_loss + 0.8 * ae_loss + 0.7 * neg_si_snr_loss + 0.5 * neg_stoi_loss
             # loss = 1.0 * ae_loss + 0.8 * neg_si_snr_loss + 0.7 * mr_stft_loss + 0.5 * neg_stoi_loss + 0.5 * pesq_loss
+            loss = 0.1 * ae_loss + 0.1 * neg_si_snr_loss + 1.0 * mr_stft_loss + 0.2 * neg_stoi_loss + 0.2 * pesq_loss
             if torch.any(torch.isnan(loss)) or torch.any(torch.isinf(loss)):
                 logger.info(f"find nan or inf in loss.")
                 continue
                         # loss = 1.0 * ae_loss + 0.8 * neg_si_snr_loss + 0.5 * mr_stft_loss + 0.3 * neg_stoi_loss
                         # loss = 1.0 * ae_loss + 0.8 * neg_si_snr_loss + 0.7 * mr_stft_loss + 0.5 * neg_stoi_loss
                         # loss = 2.0 * mr_stft_loss + 0.8 * ae_loss + 0.7 * neg_si_snr_loss + 0.5 * neg_stoi_loss
+                        # loss = 1.0 * ae_loss + 0.8 * neg_si_snr_loss + 0.7 * mr_stft_loss + 0.5 * neg_stoi_loss + 0.5 * pesq_loss
+                        loss = 0.1 * ae_loss + 0.1 * neg_si_snr_loss + 1.0 * mr_stft_loss + 0.2 * neg_stoi_loss + 0.2 * pesq_loss
                         if torch.any(torch.isnan(loss)) or torch.any(torch.isinf(loss)):
                             logger.info(f"find nan or inf in loss.")
                             continue

examples/frcrn/run.sh ADDED Viewed

	@@ -0,0 +1,154 @@

+#!/usr/bin/env bash
+: <<'END'
+sh run.sh --stage 2 --stop_stage 2 --system_version centos --file_folder_name file_dir --final_model_name conv-tasnet-dns3-20250319 \
+--noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise/dns3-noise" \
+--speech_dir "/data/tianxing/HuggingDatasets/nx_noise/data/speech/dns3-speech" \
+--max_epochs 400
+END
+# params
+system_version="windows";
+verbose=true;
+stage=0 # start from 0 if you need to start from data preparation
+stop_stage=9
+work_dir="$(pwd)"
+file_folder_name=file_folder_name
+final_model_name=final_model_name
+config_file="yaml/config.yaml"
+limit=10
+noise_dir=/data/tianxing/HuggingDatasets/nx_noise/data/noise
+speech_dir=/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train
+max_count=10000000
+nohup_name=nohup.out
+# model params
+batch_size=64
+max_epochs=200
+save_top_k=10
+patience=5
+# parse options
+# Consumes leading `--name value` pairs. `--some-name` maps to the shell
+# variable `some_name`, which must already have a default assigned above;
+# unknown names are rejected. Parsing stops at the first non-option argument.
+while true; do
+  [ -z "${1:-}" ] && break;  # break if there are no arguments
+  case "$1" in
+    --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
+      # Only variables that are already set may be overridden.
+      eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+      # Every option takes exactly one value; fail clearly instead of letting
+      # `shift 2` fail on a dangling option.
+      if [ $# -lt 2 ]; then
+        echo "$0: option $1 requires an argument" 1>&2
+        exit 1;
+      fi
+      # Read the current value of the target variable. This must be a command
+      # substitution: without the leading `$` the text is a plain string and
+      # the Boolean check below can never trigger.
+      old_value="$(eval echo \$$name)";
+      if [ "${old_value}" = "true" ] || [ "${old_value}" = "false" ]; then
+        was_bool=true;
+      else
+        was_bool=false;
+      fi
+      # Set the variable to the right value-- the escaped quotes make it work if
+      # the option had spaces, like --cmd "queue.pl -sync y"
+      eval "${name}=\"$2\"";
+      # Check that Boolean-valued arguments are really Boolean.
+      if $was_bool && [ "$2" != "true" ] && [ "$2" != "false" ]; then
+        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
+        exit 1;
+      fi
+      shift 2;
+      ;;
+    *) break;
+  esac
+done
+file_dir="${work_dir}/${file_folder_name}"
+final_model_dir="${work_dir}/../../trained_models/${final_model_name}";
+evaluation_audio_dir="${file_dir}/evaluation_audio"
+train_dataset="${file_dir}/train.jsonl"
+valid_dataset="${file_dir}/valid.jsonl"
+$verbose && echo "system_version: ${system_version}"
+$verbose && echo "file_folder_name: ${file_folder_name}"
+if [ $system_version == "windows" ]; then
+  alias python3='D:/Users/tianx/PycharmProjects/virtualenv/nx_denoise/Scripts/python.exe'
+elif [ $system_version == "centos" ] || [ $system_version == "ubuntu" ]; then
+  #source /data/local/bin/nx_denoise/bin/activate
+  alias python3='/data/local/bin/nx_denoise/bin/python3'
+fi
+# Stage 1: build the train/valid jsonl manifests from the noise and speech folders.
+if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
+  $verbose && echo "stage 1: prepare data"
+  cd "${work_dir}" || exit 1
+  # No trailing backslash on the last argument: a continuation there would
+  # join the following `fi` onto the command line and leave the `if` unclosed.
+  python3 step_1_prepare_data.py \
+  --file_dir "${file_dir}" \
+  --noise_dir "${noise_dir}" \
+  --speech_dir "${speech_dir}" \
+  --train_dataset "${train_dataset}" \
+  --valid_dataset "${valid_dataset}" \
+  --max_count "${max_count}"
+fi
+# Stage 2: train the model; checkpoints and logs are written into file_dir.
+if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
+  $verbose && echo "stage 2: train model"
+  cd "${work_dir}" || exit 1
+  # No trailing backslash on the last argument: a continuation there would
+  # join the following `fi` onto the command line and leave the `if` unclosed.
+  python3 step_2_train_model.py \
+  --train_dataset "${train_dataset}" \
+  --valid_dataset "${valid_dataset}" \
+  --serialization_dir "${file_dir}" \
+  --config_file "${config_file}"
+fi
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+  $verbose && echo "stage 3: test model"
+  cd "${work_dir}" || exit 1
+  python3 step_3_evaluation.py \
+  --valid_dataset "${valid_dataset}" \
+  --model_dir "${file_dir}/best" \
+  --evaluation_audio_dir "${evaluation_audio_dir}" \
+  --limit "${limit}" \
+fi
+# Stage 4: copy the best checkpoint and the evaluation audio into
+# trained_models/<final_model_name>, zip that folder, then remove it.
+if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
+  $verbose && echo "stage 4: collect files"
+  cd "${work_dir}" || exit 1
+  # Quoted so that a path containing spaces is not split into several arguments.
+  mkdir -p "${final_model_dir}"
+  cp "${file_dir}/best"/* "${final_model_dir}"
+  cp -r "${file_dir}/evaluation_audio" "${final_model_dir}"
+  cd "${final_model_dir}/.." || exit 1;
+  # Keep exactly one backup of a previously built archive.
+  if [ -e "${final_model_name}.zip" ]; then
+    rm -rf "${final_model_name}_backup.zip"
+    mv "${final_model_name}.zip" "${final_model_name}_backup.zip"
+  fi
+  zip -r "${final_model_name}.zip" "${final_model_name}"
+  rm -rf "${final_model_name}"
+fi
+# Stage 5: delete the whole working folder (manifests, checkpoints, logs).
+# This is destructive; it runs by default because stop_stage defaults to 9.
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+  $verbose && echo "stage 5: clear file_dir"
+  cd "${work_dir}" || exit 1
+  rm -rf "${file_dir}";
+fi

examples/frcrn/step_1_prepare_data.py ADDED Viewed

	@@ -0,0 +1,162 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import argparse
+import json
+import os
+from pathlib import Path
+import random
+import sys
+pwd = os.path.abspath(os.path.dirname(__file__))
+sys.path.append(os.path.join(pwd, "../../"))
+import librosa
+import numpy as np
+from tqdm import tqdm
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--file_dir", default="./", type=str)
+    parser.add_argument(
+        "--noise_dir",
+        default=r"E:\Users\tianx\HuggingDatasets\nx_noise\data\noise",
+        type=str
+    )
+    parser.add_argument(
+        "--speech_dir",
+        default=r"E:\programmer\asr_datasets\aishell\data_aishell\wav\train",
+        type=str
+    )
+    parser.add_argument("--train_dataset", default="train.jsonl", type=str)
+    parser.add_argument("--valid_dataset", default="valid.jsonl", type=str)
+    parser.add_argument("--duration", default=4.0, type=float)
+    parser.add_argument("--min_snr_db", default=-10, type=float)
+    parser.add_argument("--max_snr_db", default=20, type=float)
+    parser.add_argument("--target_sample_rate", default=8000, type=int)
+    parser.add_argument("--max_count", default=10000, type=int)
+    args = parser.parse_args()
+    return args
+def filename_generator(data_dir: str):
+    data_dir = Path(data_dir)
+    for filename in data_dir.glob("**/*.wav"):
+        yield filename.as_posix()
+def target_second_signal_generator(data_dir: str, duration: int = 2, sample_rate: int = 8000, max_epoch: int = 20000):
+    data_dir = Path(data_dir)
+    for epoch_idx in range(max_epoch):
+        for filename in data_dir.glob("**/*.wav"):
+            signal, _ = librosa.load(filename.as_posix(), sr=sample_rate)
+            raw_duration = librosa.get_duration(y=signal, sr=sample_rate)
+            if raw_duration < duration:
+                # print(f"duration less than {duration} s. skip filename: {filename.as_posix()}")
+                continue
+            if signal.ndim != 1:
+                raise AssertionError(f"expected ndim 1, instead of {signal.ndim}")
+            signal_length = len(signal)
+            win_size = int(duration * sample_rate)
+            for begin in range(0, signal_length - win_size, win_size):
+                if np.sum(signal[begin: begin+win_size]) == 0:
+                    continue
+                row = {
+                    "epoch_idx": epoch_idx,
+                    "filename": filename.as_posix(),
+                    "raw_duration": round(raw_duration, 4),
+                    "offset": round(begin / sample_rate, 4),
+                    "duration": round(duration, 4),
+                }
+                yield row
+def main():
+    args = get_args()
+    file_dir = Path(args.file_dir)
+    file_dir.mkdir(exist_ok=True)
+    noise_dir = Path(args.noise_dir)
+    speech_dir = Path(args.speech_dir)
+    noise_generator = target_second_signal_generator(
+        noise_dir.as_posix(),
+        duration=args.duration,
+        sample_rate=args.target_sample_rate,
+        max_epoch=100000,
+    )
+    speech_generator = target_second_signal_generator(
+        speech_dir.as_posix(),
+        duration=args.duration,
+        sample_rate=args.target_sample_rate,
+        max_epoch=1,
+    )
+    dataset = list()
+    count = 0
+    process_bar = tqdm(desc="build dataset excel")
+    with open(args.train_dataset, "w", encoding="utf-8") as ftrain, open(args.valid_dataset, "w", encoding="utf-8") as fvalid:
+        for noise, speech in zip(noise_generator, speech_generator):
+            if count >= args.max_count:
+                break
+            noise_filename = noise["filename"]
+            noise_raw_duration = noise["raw_duration"]
+            noise_offset = noise["offset"]
+            noise_duration = noise["duration"]
+            speech_filename = speech["filename"]
+            speech_raw_duration = speech["raw_duration"]
+            speech_offset = speech["offset"]
+            speech_duration = speech["duration"]
+            random1 = random.random()
+            random2 = random.random()
+            row = {
+                "noise_filename": noise_filename,
+                "noise_raw_duration": noise_raw_duration,
+                "noise_offset": noise_offset,
+                "noise_duration": noise_duration,
+                "speech_filename": speech_filename,
+                "speech_raw_duration": speech_raw_duration,
+                "speech_offset": speech_offset,
+                "speech_duration": speech_duration,
+                "snr_db": random.uniform(args.min_snr_db, args.max_snr_db),
+                "random1": random1,
+            }
+            row = json.dumps(row, ensure_ascii=False)
+            if random2 < (1 / 300 / 1):
+                fvalid.write(f"{row}\n")
+            else:
+                ftrain.write(f"{row}\n")
+            count += 1
+            duration_seconds = count * args.duration
+            duration_hours = duration_seconds / 3600
+            process_bar.update(n=1)
+            process_bar.set_postfix({
+                # "duration_seconds": round(duration_seconds, 4),
+                "duration_hours": round(duration_hours, 4),
+            })
+    return
+if __name__ == "__main__":
+    main()

examples/frcrn/step_2_train_model.py ADDED Viewed

	@@ -0,0 +1,436 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import argparse
+import json
+import logging
+from logging.handlers import TimedRotatingFileHandler
+import os
+import platform
+from pathlib import Path
+import random
+import sys
+import shutil
+from typing import List
+pwd = os.path.abspath(os.path.dirname(__file__))
+sys.path.append(os.path.join(pwd, "../../"))
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from torch.utils.data.dataloader import DataLoader
+from tqdm import tqdm
+from toolbox.torch.utils.data.dataset.denoise_jsonl_dataset import DenoiseJsonlDataset
+from toolbox.torchaudio.losses.snr import NegativeSISNRLoss
+from toolbox.torchaudio.metrics.pesq import run_pesq_score
+from toolbox.torchaudio.models.frcrn.configuration_frcrn import FRCRNConfig
+from toolbox.torchaudio.models.frcrn.modeling_frcrn import FRCRN, FRCRNPretrainedModel
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--train_dataset", default="train.xlsx", type=str)
+    parser.add_argument("--valid_dataset", default="valid.xlsx", type=str)
+    parser.add_argument("--num_serialized_models_to_keep", default=10, type=int)
+    parser.add_argument("--patience", default=5, type=int)
+    parser.add_argument("--serialization_dir", default="serialization_dir", type=str)
+    parser.add_argument("--config_file", default="config.yaml", type=str)
+    args = parser.parse_args()
+    return args
+def logging_config(file_dir: str):
+    fmt = "%(asctime)s - %(name)s - %(levelname)s  %(filename)s:%(lineno)d >  %(message)s"
+    logging.basicConfig(format=fmt,
+                        datefmt="%m/%d/%Y %H:%M:%S",
+                        level=logging.INFO)
+    file_handler = TimedRotatingFileHandler(
+        filename=os.path.join(file_dir, "main.log"),
+        encoding="utf-8",
+        when="D",
+        interval=1,
+        backupCount=7
+    )
+    file_handler.setLevel(logging.INFO)
+    file_handler.setFormatter(logging.Formatter(fmt))
+    logger = logging.getLogger(__name__)
+    logger.addHandler(file_handler)
+    return logger
+class CollateFunction(object):
+    def __init__(self):
+        pass
+    def __call__(self, batch: List[dict]):
+        clean_audios = list()
+        noisy_audios = list()
+        for sample in batch:
+            # noise_wave: torch.Tensor = sample["noise_wave"]
+            clean_audio: torch.Tensor = sample["speech_wave"]
+            noisy_audio: torch.Tensor = sample["mix_wave"]
+            # snr_db: float = sample["snr_db"]
+            clean_audios.append(clean_audio)
+            noisy_audios.append(noisy_audio)
+        clean_audios = torch.stack(clean_audios)
+        noisy_audios = torch.stack(noisy_audios)
+        # assert
+        if torch.any(torch.isnan(clean_audios)) or torch.any(torch.isinf(clean_audios)):
+            raise AssertionError("nan or inf in clean_audios")
+        if torch.any(torch.isnan(noisy_audios)) or torch.any(torch.isinf(noisy_audios)):
+            raise AssertionError("nan or inf in noisy_audios")
+        return clean_audios, noisy_audios
+collate_fn = CollateFunction()
+def main():
+    args = get_args()
+    config = FRCRNConfig.from_pretrained(
+        pretrained_model_name_or_path=args.config_file,
+    )
+    serialization_dir = Path(args.serialization_dir)
+    serialization_dir.mkdir(parents=True, exist_ok=True)
+    logger = logging_config(serialization_dir)
+    random.seed(config.seed)
+    np.random.seed(config.seed)
+    torch.manual_seed(config.seed)
+    logger.info(f"set seed: {config.seed}")
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    n_gpu = torch.cuda.device_count()
+    logger.info(f"GPU available count: {n_gpu}; device: {device}")
+    # datasets
+    train_dataset = DenoiseJsonlDataset(
+        jsonl_file=args.train_dataset,
+        expected_sample_rate=config.sample_rate,
+        max_wave_value=32768.0,
+        min_snr_db=config.min_snr_db,
+        max_snr_db=config.max_snr_db,
+        # skip=225000,
+    )
+    valid_dataset = DenoiseJsonlDataset(
+        jsonl_file=args.valid_dataset,
+        expected_sample_rate=config.sample_rate,
+        max_wave_value=32768.0,
+        min_snr_db=config.min_snr_db,
+        max_snr_db=config.max_snr_db,
+    )
+    train_data_loader = DataLoader(
+        dataset=train_dataset,
+        batch_size=args.batch_size,
+        # shuffle=True,
+        sampler=None,
+        # Linux 系统中可以使用多个子进程加载数据, 而在 Windows 系统中不能.
+        num_workers=0 if platform.system() == "Windows" else os.cpu_count() // 2,
+        collate_fn=collate_fn,
+        pin_memory=False,
+        prefetch_factor=2,
+    )
+    valid_data_loader = DataLoader(
+        dataset=valid_dataset,
+        batch_size=args.batch_size,
+        # shuffle=True,
+        sampler=None,
+        # Linux 系统中可以使用多个子进程加载数据, 而在 Windows 系统中不能.
+        num_workers=0 if platform.system() == "Windows" else os.cpu_count() // 2,
+        collate_fn=collate_fn,
+        pin_memory=False,
+        prefetch_factor=2,
+    )
+    # models
+    logger.info(f"prepare models. config_file: {args.config_file}")
+    model = FRCRNPretrainedModel(config).to(device)
+    model.to(device)
+    model.train()
+    # optimizer
+    logger.info("prepare optimizer, lr_scheduler, loss_fn, categorical_accuracy")
+    optimizer = torch.optim.AdamW(model.get_params(weight_decay=config.weight_decay), config.lr)
+    # resume training
+    last_step_idx = -1
+    last_epoch = -1
+    for step_idx_str in serialization_dir.glob("steps-*"):
+        step_idx_str = Path(step_idx_str)
+        step_idx = step_idx_str.stem.split("-")[1]
+        step_idx = int(step_idx)
+        if step_idx > last_step_idx:
+            last_step_idx = step_idx
+    last_epoch = 1
+    if last_step_idx != -1:
+        logger.info(f"resume from steps-{last_step_idx}.")
+        model_pt = serialization_dir / f"steps-{last_step_idx}/model.pt"
+        optimizer_pth = serialization_dir / f"steps-{last_step_idx}/optimizer.pth"
+        logger.info(f"load state dict for model.")
+        with open(model_pt.as_posix(), "rb") as f:
+            state_dict = torch.load(f, map_location="cpu", weights_only=True)
+        model.load_state_dict(state_dict, strict=True)
+        logger.info(f"load state dict for optimizer.")
+        with open(optimizer_pth.as_posix(), "rb") as f:
+            state_dict = torch.load(f, map_location="cpu", weights_only=True)
+        optimizer.load_state_dict(state_dict)
+    if config.lr_scheduler == "CosineAnnealingLR":
+        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
+            optimizer,
+            last_epoch=last_epoch,
+            # T_max=10 * config.eval_steps,
+            # eta_min=0.01 * config.lr,
+            **config.lr_scheduler_kwargs,
+        )
+    elif config.lr_scheduler == "MultiStepLR":
+        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
+            optimizer,
+            last_epoch=last_epoch,
+            milestones=[10000, 20000, 30000, 40000, 50000], gamma=0.5
+        )
+    else:
+        raise AssertionError(f"invalid lr_scheduler: {config.lr_scheduler}")
+    neg_si_snr_loss_fn = NegativeSISNRLoss(reduction="mean").to(device)
+    # training loop
+    # state
+    average_pesq_score = 1000000000
+    average_loss = 1000000000
+    average_neg_si_snr_loss = 1000000000
+    average_mag_loss = 1000000000
+    average_pha_loss = 1000000000
+    model_list = list()
+    best_epoch_idx = None
+    best_step_idx = None
+    best_metric = None
+    patience_count = 0
+    step_idx = 0 if last_step_idx == -1 else last_step_idx
+    logger.info("training")
+    for epoch_idx in range(max(0, last_epoch+1), config.max_epochs):
+        # train
+        model.train()
+        total_pesq_score = 0.
+        total_loss = 0.
+        total_neg_si_snr_loss = 0.
+        total_map_loss = 0.
+        total_pha_loss = 0.
+        total_batches = 0.
+        progress_bar_train = tqdm(
+            initial=step_idx,
+            desc="Training; epoch-{}".format(epoch_idx),
+        )
+        for train_batch in train_data_loader:
+            clean_audios, noisy_audios = train_batch
+            clean_audios: torch.Tensor = clean_audios.to(device)
+            noisy_audios: torch.Tensor = noisy_audios.to(device)
+            est_spec, est_wav, est_mask = model.forward(noisy_audios)
+            denoise_audios = est_wav
+            neg_si_snr_loss = neg_si_snr_loss_fn.forward(denoise_audios, clean_audios)
+            map_loss, pha_loss = model.mag_pha_loss_fn(est_mask, clean_audios, noisy_audios)
+            loss = 0.5 * map_loss + 0.5 * pha_loss + 0.5 * neg_si_snr_loss
+            if torch.any(torch.isnan(loss)) or torch.any(torch.isinf(loss)):
+                logger.info(f"find nan or inf in loss.")
+                continue
+            denoise_audios_list_r = list(denoise_audios.detach().cpu().numpy())
+            clean_audios_list_r = list(clean_audios.detach().cpu().numpy())
+            pesq_score = run_pesq_score(clean_audios_list_r, denoise_audios_list_r, sample_rate=config.sample_rate, mode="nb")
+            optimizer.zero_grad()
+            loss.backward()
+            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.clip_grad_norm)
+            optimizer.step()
+            lr_scheduler.step()
+            total_pesq_score += pesq_score
+            total_loss += loss.item()
+            total_neg_si_snr_loss += neg_si_snr_loss.item()
+            total_map_loss += map_loss.item()
+            total_pha_loss += pha_loss.item()
+            total_batches += 1
+            average_pesq_score = round(total_pesq_score / total_batches, 4)
+            average_loss = round(total_loss / total_batches, 4)
+            average_neg_si_snr_loss = round(total_neg_si_snr_loss / total_batches, 4)
+            average_mag_loss = round(total_map_loss / total_batches, 4)
+            average_pha_loss = round(total_pha_loss / total_batches, 4)
+            progress_bar_train.update(1)
+            progress_bar_train.set_postfix({
+                "lr": lr_scheduler.get_last_lr()[0],
+                "pesq_score": average_pesq_score,
+                "loss": average_loss,
+                "neg_si_snr_loss": average_neg_si_snr_loss,
+                "mag_loss": average_mag_loss,
+                "pha_loss": average_pha_loss,
+            })
+            # evaluation
+            step_idx += 1
+            if step_idx % config.eval_steps == 0:
+                with torch.no_grad():
+                    torch.cuda.empty_cache()
+                    total_pesq_score = 0.
+                    total_loss = 0.
+                    total_neg_si_snr_loss = 0.
+                    total_map_loss = 0.
+                    total_pha_loss = 0.
+                    total_batches = 0.
+                    progress_bar_train.close()
+                    progress_bar_eval = tqdm(
+                        desc="Evaluation; steps-{}k".format(int(step_idx/1000)),
+                    )
+                    for eval_batch in valid_data_loader:
+                        clean_audios, noisy_audios = eval_batch
+                        clean_audios = clean_audios.to(device)
+                        noisy_audios = noisy_audios.to(device)
+                        est_spec, est_wav, est_mask = model.forward(noisy_audios)
+                        denoise_audios = est_wav
+                        neg_si_snr_loss = neg_si_snr_loss_fn.forward(denoise_audios, clean_audios)
+                        map_loss, pha_loss = model.mag_pha_loss_fn(est_mask, clean_audios, noisy_audios)
+                        loss = 0.5 * map_loss + 0.5 * pha_loss + 0.5 * neg_si_snr_loss
+                        if torch.any(torch.isnan(loss)) or torch.any(torch.isinf(loss)):
+                            logger.info(f"find nan or inf in loss.")
+                            continue
+                        denoise_audios_list_r = list(denoise_audios.detach().cpu().numpy())
+                        clean_audios_list_r = list(clean_audios.detach().cpu().numpy())
+                        pesq_score = run_pesq_score(clean_audios_list_r, denoise_audios_list_r, sample_rate=config.sample_rate, mode="nb")
+                        total_pesq_score += pesq_score
+                        total_loss += loss.item()
+                        total_neg_si_snr_loss += neg_si_snr_loss.item()
+                        total_map_loss += map_loss.item()
+                        total_pha_loss += pha_loss.item()
+                        total_batches += 1
+                        average_pesq_score = round(total_pesq_score / total_batches, 4)
+                        average_loss = round(total_loss / total_batches, 4)
+                        average_neg_si_snr_loss = round(total_neg_si_snr_loss / total_batches, 4)
+                        average_mag_loss = round(total_map_loss / total_batches, 4)
+                        average_pha_loss = round(total_pha_loss / total_batches, 4)
+                        progress_bar_eval.update(1)
+                        progress_bar_eval.set_postfix({
+                            "lr": lr_scheduler.get_last_lr()[0],
+                            "pesq_score": average_pesq_score,
+                            "loss": average_loss,
+                            "neg_si_snr_loss": average_neg_si_snr_loss,
+                            "mag_loss": average_mag_loss,
+                            "pha_loss": average_pha_loss,
+                        })
+                    total_pesq_score = 0.
+                    total_loss = 0.
+                    total_neg_si_snr_loss = 0.
+                    total_map_loss = 0.
+                    total_pha_loss = 0.
+                    total_batches = 0.
+                    progress_bar_eval.close()
+                    progress_bar_train = tqdm(
+                        initial=progress_bar_train.n,
+                        postfix=progress_bar_train.postfix,
+                        desc=progress_bar_train.desc,
+                    )
+                    # save path
+                    save_dir = serialization_dir / "steps-{}".format(step_idx)
+                    save_dir.mkdir(parents=True, exist_ok=False)
+                    # save models
+                    model.save_pretrained(save_dir.as_posix())
+                    model_list.append(save_dir)
+                    if len(model_list) >= args.num_serialized_models_to_keep:
+                        model_to_delete: Path = model_list.pop(0)
+                        shutil.rmtree(model_to_delete.as_posix())
+                    # save optim
+                    torch.save(optimizer.state_dict(), (save_dir / "optimizer.pth").as_posix())
+                    # save metric
+                    if best_metric is None:
+                        best_epoch_idx = epoch_idx
+                        best_step_idx = step_idx
+                        best_metric = average_pesq_score
+                    elif average_pesq_score > best_metric:
+                        # great is better.
+                        best_epoch_idx = epoch_idx
+                        best_step_idx = step_idx
+                        best_metric = average_pesq_score
+                    else:
+                        pass
+                    metrics = {
+                        "epoch_idx": epoch_idx,
+                        "best_epoch_idx": best_epoch_idx,
+                        "best_step_idx": best_step_idx,
+                        "pesq_score": average_pesq_score,
+                        "loss": average_loss,
+                        "neg_si_snr_loss": average_neg_si_snr_loss,
+                        "mag_loss": average_mag_loss,
+                        "pha_loss": average_pha_loss,
+                    }
+                    metrics_filename = save_dir / "metrics_epoch.json"
+                    with open(metrics_filename, "w", encoding="utf-8") as f:
+                        json.dump(metrics, f, indent=4, ensure_ascii=False)
+                    # save best
+                    best_dir = serialization_dir / "best"
+                    if best_epoch_idx == epoch_idx and best_step_idx == step_idx:
+                        if best_dir.exists():
+                            shutil.rmtree(best_dir)
+                        shutil.copytree(save_dir, best_dir)
+                    # early stop
+                    early_stop_flag = False
+                    if best_epoch_idx == epoch_idx and best_step_idx == step_idx:
+                        patience_count = 0
+                    else:
+                        patience_count += 1
+                    if patience_count >= args.patience:
+                        early_stop_flag = True
+                    # early stop
+                    if early_stop_flag:
+                        break
+    return
+if __name__ == "__main__":
+    main()

examples/frcrn/yaml/config.yaml ADDED Viewed

	@@ -0,0 +1,24 @@

+model_name: "frcrn"
+num_gpus: -1
+lr: 0.001
+max_epochs: 100
+weight_decay: 1.0e-05
+clip_grad_norm: 10.0
+seed: 1234
+sample_rate: 8000
+segment_size: 32000
+nfft: 512
+win_size: 512
+hop_size: 256
+win_type: hann
+use_complex_networks: true
+model_depth: 20
+model_complexity: 45
+num_workers: 4
+batch_size: 4

toolbox/torchaudio/losses/irm.py ADDED Viewed

	@@ -0,0 +1,111 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+from typing import List
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+class CIRMLoss(nn.Module):
+    def __init__(self,
+                 n_fft: int = 512,
+                 win_size: int = 512,
+                 hop_size: int = 256,
+                 center: bool = True,
+                 eps: float = 1e-8,
+                 reduction: str = "mean",
+                 ):
+        super(CIRMLoss, self).__init__()
+        self.n_fft = n_fft
+        self.win_size = win_size
+        self.hop_size = hop_size
+        self.center = center
+        self.eps = eps
+        self.reduction = reduction
+        self.window = nn.Parameter(torch.hann_window(win_size), requires_grad=False)
+        if reduction not in ("sum", "mean"):
+            raise AssertionError(f"param reduction must be sum or mean.")
+    def forward(self, clean: torch.Tensor, noisy: torch.Tensor, mask_real: torch.Tensor, mask_imag: torch.Tensor):
+        """
+        :param clean: waveform
+        :param noisy: waveform
+        :param mask_real: shape: [b, f, t]
+        :param mask_imag: shape: [b, f, t]
+        :return:
+        """
+        if noisy.shape != clean.shape:
+            raise AssertionError("Input signals must have the same shape")
+        # clean_stft, noisy_stft shape: [b, f, t]
+        clean_stft = torch.stft(
+            clean,
+            n_fft=self.n_fft,
+            win_length=self.win_size,
+            hop_length=self.hop_size,
+            window=self.window,
+            center=self.center,
+            pad_mode="reflect",
+            normalized=False,
+            return_complex=True
+        )
+        noisy_stft = torch.stft(
+            noisy,
+            n_fft=self.n_fft,
+            win_length=self.win_size,
+            hop_length=self.hop_size,
+            window=self.window,
+            center=self.center,
+            pad_mode="reflect",
+            normalized=False,
+            return_complex=True
+        )
+        # [b, f, t]
+        clean_stft_spec_real = torch.real(clean_stft)
+        clean_stft_spec_imag = torch.imag(clean_stft)
+        noisy_stft_spec_real = torch.real(noisy_stft)
+        noisy_stft_spec_imag = torch.imag(noisy_stft)
+        noisy_power = noisy_stft_spec_real ** 2 + noisy_stft_spec_imag ** 2
+        sr = clean_stft_spec_real
+        yr = noisy_stft_spec_real
+        si = clean_stft_spec_imag
+        yi = noisy_stft_spec_imag
+        y_pow = noisy_power
+        # (Sr * Yr + Si * Yi) / (Y_pow + 1e-8)
+        gth_mask_real = (sr * yr + si * yi) / (y_pow + self.eps)
+        # (Si * Yr - Sr * Yi) / (Y_pow + 1e-8)
+        gth_mask_imag = (sr * yr - si * yi) / (y_pow + self.eps)
+        gth_mask_real[gth_mask_real > 2] = 1
+        gth_mask_real[gth_mask_real < -2] = -1
+        gth_mask_imag[gth_mask_imag > 2] = 1
+        gth_mask_imag[gth_mask_imag < -2] = -1
+        amp_loss = F.mse_loss(gth_mask_real, mask_real)
+        phase_loss = F.mse_loss(gth_mask_imag, mask_imag)
+        loss = amp_loss + phase_loss
+        return loss
+def main():
+    batch_size = 2
+    signal_length = 16000
+    estimated_signal = torch.randn(batch_size, signal_length)
+    target_signal = torch.randn(batch_size, signal_length)
+    loss_fn = CIRMLoss()
+    loss = loss_fn.forward(estimated_signal, target_signal)
+    print(f"loss: {loss.item()}")
+    return
+if __name__ == "__main__":
+    main()

toolbox/torchaudio/losses/spectral.py CHANGED Viewed

@@ -346,6 +346,76 @@ class MultiResolutionSTFTLoss(torch.nn.Module):
         return loss
 def main():
     batch_size = 2
     signal_length = 16000
@@ -354,7 +424,8 @@ def main():
     # loss_fn = LSDLoss()
     # loss_fn = ComplexSpectralLoss()
-    loss_fn = MultiResolutionSTFTLoss()
     loss = loss_fn.forward(estimated_signal, target_signal)
     print(f"loss: {loss.item()}")

         return loss
+class WeightedMagnitudePhaseLoss(nn.Module):
+    """
+    Weighted sum of two STFT-domain MSE terms between a denoised and a clean waveform:
+    loss = mag_weight * mse(magnitude) + pha_weight * mse(phase).
+    """
+    def __init__(self,
+                 n_fft: int = 1024,
+                 win_size: int = 600,
+                 hop_size: int = 120,
+                 center: bool = True,
+                 reduction: str = "mean",
+                 mag_weight: float = 0.9,
+                 pha_weight: float = 0.3,
+                 ):
+        super().__init__()
+        self.n_fft = n_fft
+        self.win_size = win_size
+        self.hop_size = hop_size
+        self.center = center
+        self.reduction = reduction
+        self.mag_weight = mag_weight
+        self.pha_weight = pha_weight
+        # kept as a frozen parameter (no gradient) so it is part of the module state
+        self.window = nn.Parameter(torch.hann_window(win_size), requires_grad=False)
+    def _magnitude_and_phase(self, signal: torch.Tensor):
+        """
+        :param signal: torch.Tensor, batched waveform.
+        :return: (magnitude, phase) of the STFT, each of shape [b, f, t].
+        """
+        spec = torch.stft(
+            signal,
+            n_fft=self.n_fft,
+            win_length=self.win_size,
+            hop_length=self.hop_size,
+            window=self.window,
+            center=self.center,
+            pad_mode="reflect",
+            normalized=False,
+            return_complex=True
+        )
+        # parts shape: [b, f, t, 2], last axis is (real, imaginary)
+        parts = torch.view_as_real(spec)
+        real = parts[:, :, :, 0]
+        imag = parts[:, :, :, 1]
+        # small offsets keep the sqrt argument positive and the atan2 arguments away from (0, 0)
+        magnitude = torch.sqrt(parts.pow(2).sum(-1) + 1e-9)
+        phase = torch.atan2(imag + 1e-10, real + 1e-5)
+        return magnitude, phase
+    def forward(self, denoise: torch.Tensor, clean: torch.Tensor):
+        """
+        :param denoise: torch.Tensor, estimated waveform.
+        :param clean: torch.Tensor, reference waveform, same shape as denoise.
+        :return: torch.Tensor, weighted magnitude + phase loss.
+        """
+        if denoise.shape != clean.shape:
+            raise AssertionError("Input signals must have the same shape")
+        denoise_mag, denoise_pha = self._magnitude_and_phase(denoise)
+        clean_mag, clean_pha = self._magnitude_and_phase(clean)
+        mag_loss = F.mse_loss(denoise_mag, clean_mag, reduction=self.reduction)
+        pha_loss = F.mse_loss(denoise_pha, clean_pha, reduction=self.reduction)
+        return self.mag_weight * mag_loss + self.pha_weight * pha_loss
 def main():
     batch_size = 2
     signal_length = 16000
     # loss_fn = LSDLoss()
     # loss_fn = ComplexSpectralLoss()
+    # loss_fn = MultiResolutionSTFTLoss()
+    loss_fn = WeightedMagnitudePhaseLoss()
     loss = loss_fn.forward(estimated_signal, target_signal)
     print(f"loss: {loss.item()}")

toolbox/torchaudio/models/frcrn/complex_nn.py ADDED Viewed

	@@ -0,0 +1,258 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+from typing import Union, Tuple
+import torch
+import torch.nn as nn
+from toolbox.torchaudio.models.frcrn.uni_deep_fsmn import UniDeepFsmn
+class ComplexUniDeepFsmn(nn.Module):
+    """
+    Two stacked layers, each made of a (real, imaginary) pair of UniDeepFsmn modules
+    whose outputs are combined following the complex-multiplication pattern.
+    """
+    def __init__(self, input_dim: int, hidden_size: int, lorder: int = 20):
+        super().__init__()
+        self.fsmn_re_l1 = UniDeepFsmn(input_dim, hidden_size, lorder=lorder)
+        self.fsmn_im_l1 = UniDeepFsmn(input_dim, hidden_size, lorder=lorder)
+        self.fsmn_re_l2 = UniDeepFsmn(input_dim, hidden_size, lorder=lorder)
+        self.fsmn_im_l2 = UniDeepFsmn(input_dim, hidden_size, lorder=lorder)
+    def forward(self, x: torch.Tensor):
+        """
+        :param x: torch.Tensor, shape: [b, c, h, t, 2]; the last axis holds (real, imaginary).
+        :return: torch.Tensor, shape: [b, c, h, t, 2]
+        """
+        b, c, h, t, d = x.size()
+        # fold channels into the feature axis, then put time before features: [b, t, c*h, 2]
+        x = x.reshape(b, c * h, t, d).transpose(1, 2)
+        x_re, x_im = x[..., 0], x[..., 1]
+        # layer 1, each shape: [b, t, c*h]
+        real_l1 = self.fsmn_re_l1(x_re) - self.fsmn_im_l1(x_im)
+        imag_l1 = self.fsmn_re_l1(x_im) + self.fsmn_im_l1(x_re)
+        # layer 2, each shape: [b, t, c*h]
+        real_l2 = self.fsmn_re_l2(real_l1) - self.fsmn_im_l2(imag_l1)
+        imag_l2 = self.fsmn_re_l2(imag_l1) + self.fsmn_im_l2(real_l1)
+        # [b, t, c*h, 2] -> [b, c*h, t, 2] -> [b, c, h, t, 2]
+        output = torch.stack((real_l2, imag_l2), dim=-1).transpose(1, 2)
+        return output.reshape(b, c, h, t, d)
+class ComplexUniDeepFsmnL1(nn.Module):
+    """
+    Single complex-style layer built from a (real, imaginary) pair of UniDeepFsmn modules,
+    applied independently to every time step with the channel axis as the last (feature) axis.
+    """
+    def __init__(self, input_dim: int, hidden_size: int, lorder: int = 20):
+        super().__init__()
+        self.fsmn_re_l1 = UniDeepFsmn(input_dim, hidden_size, lorder=lorder)
+        self.fsmn_im_l1 = UniDeepFsmn(input_dim, hidden_size, lorder=lorder)
+    def forward(self, x: torch.Tensor):
+        """
+        :param x: torch.Tensor, shape: [b, c, h, t, 2]; the last axis holds (real, imaginary).
+        :return: torch.Tensor, shape: [b, c, h, t, 2]
+        """
+        b, c, h, t, d = x.size()
+        # swap channel and time axes, then merge batch and time: [b*t, h, c, 2]
+        x = x.transpose(1, 3).reshape(b * t, h, c, d)
+        x_re, x_im = x[..., 0], x[..., 1]
+        # each shape: [b*t, h, c]
+        real = self.fsmn_re_l1(x_re) - self.fsmn_im_l1(x_im)
+        imag = self.fsmn_re_l1(x_im) + self.fsmn_im_l1(x_re)
+        # [b*t, h, c, 2] -> [b, t, h, c, 2] -> [b, c, h, t, 2]
+        output = torch.stack((real, imag), dim=-1).reshape(b, t, h, c, d)
+        return output.transpose(1, 3)
+class ComplexConv2d(nn.Module):
+    """
+    Complex 2-D convolution built from two real nn.Conv2d layers (conv_re, conv_im):
+    real = conv_re(x_re) - conv_im(x_im), imaginary = conv_re(x_im) + conv_im(x_re).
+    """
+    # https://github.com/litcoderr/ComplexCNN/blob/master/complexcnn/modules.py
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: Union[int, Tuple[int, int]],
+                 stride: Union[int, Tuple[int, int]] = 1,
+                 padding: Union[int, Tuple[int, int]] = 0,
+                 dilation: Union[int, Tuple[int, int]] = 1,
+                 groups: int = 1,
+                 bias: bool = True,
+                 **kwargs
+                 ):
+        super().__init__()
+        # both branches share exactly the same hyper-parameters
+        conv_options = dict(
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+            **kwargs
+        )
+        self.conv_re = nn.Conv2d(in_channels, out_channels, kernel_size, **conv_options)
+        self.conv_im = nn.Conv2d(in_channels, out_channels, kernel_size, **conv_options)
+    def forward(self, x: torch.Tensor):
+        """
+        :param x: torch.Tensor, shape: [b, c, h, w, 2]; the last axis holds (real, imaginary).
+        :return: torch.Tensor, convolved tensor with (real, imaginary) stacked on the last axis.
+        """
+        x_re, x_im = x[..., 0], x[..., 1]
+        real = self.conv_re(x_re) - self.conv_im(x_im)
+        imaginary = self.conv_re(x_im) + self.conv_im(x_re)
+        return torch.stack((real, imaginary), dim=-1)
+class ComplexConvTranspose2d(nn.Module):
+    """
+    Complex 2-D transposed convolution built from two real nn.ConvTranspose2d layers:
+    real = tconv_re(x_re) - tconv_im(x_im), imaginary = tconv_re(x_im) + tconv_im(x_re).
+    """
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: Union[int, Tuple[int, int]],
+                 stride: Union[int, Tuple[int, int]] = 1,
+                 padding: Union[int, Tuple[int, int]] = 0,
+                 output_padding: Union[int, Tuple[int, int]] = 0,
+                 dilation: Union[int, Tuple[int, int]] = 1,
+                 groups: int = 1,
+                 bias=True,
+                 **kwargs
+                 ):
+        super().__init__()
+        # both branches share exactly the same hyper-parameters
+        tconv_options = dict(
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            groups=groups,
+            bias=bias,
+            dilation=dilation,
+            **kwargs
+        )
+        self.tconv_re = nn.ConvTranspose2d(in_channels, out_channels, **tconv_options)
+        self.tconv_im = nn.ConvTranspose2d(in_channels, out_channels, **tconv_options)
+    def forward(self, x: torch.Tensor):
+        """
+        :param x: torch.Tensor, shape: [b, c, h, w, 2]; the last axis holds (real, imaginary).
+        :return: torch.Tensor, up-sampled tensor with (real, imaginary) stacked on the last axis.
+        """
+        x_re, x_im = x[..., 0], x[..., 1]
+        real = self.tconv_re(x_re) - self.tconv_im(x_im)
+        imaginary = self.tconv_re(x_im) + self.tconv_im(x_re)
+        return torch.stack((real, imaginary), dim=-1)
+class ComplexBatchNorm2d(nn.Module):
+    """Batch normalisation applied separately to the real and imaginary parts (two nn.BatchNorm2d layers)."""
+    def __init__(self,
+                 num_features: int,
+                 eps: float = 1e-5,
+                 momentum: float = 0.1,
+                 affine: bool = True,
+                 track_running_stats: bool = True,
+                 **kwargs
+                 ):
+        super().__init__()
+        # both branches share exactly the same hyper-parameters
+        bn_options = dict(
+            num_features=num_features,
+            momentum=momentum,
+            affine=affine,
+            eps=eps,
+            track_running_stats=track_running_stats,
+            **kwargs
+        )
+        self.bn_re = nn.BatchNorm2d(**bn_options)
+        self.bn_im = nn.BatchNorm2d(**bn_options)
+    def forward(self, x: torch.Tensor):
+        """
+        :param x: torch.Tensor whose last axis holds (real, imaginary).
+        :return: torch.Tensor of the same layout, each part normalised by its own BatchNorm2d.
+        """
+        normed_re = self.bn_re(x[..., 0])
+        normed_im = self.bn_im(x[..., 1])
+        return torch.stack((normed_re, normed_im), dim=-1)
+def main():
+    """Smoke test: push a random [1, 1, 320, 200, 2] tensor through ComplexConv2d and print the output shape."""
+    x = torch.rand(size=(1, 1, 320, 200, 2))
+    conv2d = ComplexConv2d(
+        in_channels=1,
+        out_channels=128,
+        kernel_size=(5, 2),
+        stride=(2, 1),
+        padding=(0, 1),
+    )
+    print(conv2d.forward(x).shape)
+if __name__ == "__main__":
+    main()

toolbox/torchaudio/models/frcrn/configuration_frcrn.py ADDED Viewed

	@@ -0,0 +1,67 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+"""
+https://huggingface.co/spaces/alibabasglab/ClearVoice/blob/main/checkpoints/FRCRN_SE_16K/config.yaml
+https://huggingface.co/spaces/alibabasglab/ClearVoice/blob/main/config/inference/FRCRN_SE_16K.yaml
+"""
+from toolbox.torchaudio.configuration_utils import PretrainedConfig
+class FRCRNConfig(PretrainedConfig):
+    """
+    Configuration container for the FRCRN model and its training script.
+    Every constructor argument is stored unchanged as an attribute of the same name;
+    any extra keyword arguments are forwarded to PretrainedConfig.
+    FRCRNPretrainedModel in modeling_frcrn.py reads use_complex_networks, model_complexity,
+    model_depth, nfft, win_size, hop_size and win_type from this object.
+    """
+    def __init__(self,
+                 num_gpus: int = -1,
+                 lr: float = 0.001,
+                 max_epochs: int = 100,
+                 weight_decay: float = 0.00001,
+                 clip_grad_norm: float = 10.,
+                 seed: int = 1234,
+                 sample_rate: int = 8000,
+                 segment_size: int = 32000,
+                 nfft: int = 512,
+                 win_size: int = 512,
+                 hop_size: int = 256,
+                 win_type: str = "hann",
+                 use_complex_networks: bool = True,
+                 model_depth: int = 20,
+                 model_complexity: int = 45,
+                 num_workers: int = 4,
+                 batch_size: int = 4,
+                 **kwargs
+                 ):
+        super(FRCRNConfig, self).__init__(**kwargs)
+        # training / optimisation settings (presumably consumed by the training script -- confirm there)
+        self.num_gpus = num_gpus
+        self.lr = lr
+        self.max_epochs = max_epochs
+        self.weight_decay = weight_decay
+        self.clip_grad_norm = clip_grad_norm
+        self.seed = seed
+        # audio / STFT settings
+        self.sample_rate = sample_rate
+        self.segment_size = segment_size
+        self.nfft = nfft
+        self.win_size = win_size
+        self.hop_size = hop_size
+        self.win_type = win_type
+        # network settings passed on to FRCRN
+        # NOTE(review): the default model_depth here (20) differs from FRCRN's own default (14).
+        self.use_complex_networks = use_complex_networks
+        self.model_depth = model_depth
+        self.model_complexity = model_complexity
+        # data-loading settings
+        self.num_workers = num_workers
+        self.batch_size = batch_size
+def main():
+    """Dump a default FRCRNConfig to "config.yaml" (relative path)."""
+    FRCRNConfig().to_yaml_file("config.yaml")
+if __name__ == "__main__":
+    main()

toolbox/torchaudio/models/frcrn/conv_stft.py ADDED Viewed

	@@ -0,0 +1,147 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+"""
+https://github.com/modelscope/modelscope/blob/master/modelscope/models/audio/ans/conv_stft.py
+"""
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from scipy.signal import get_window
+def init_kernels(nfft: int, win_size: int, hop_size: int, win_type: str = None, inverse=False):
+    """
+    Build the conv1d kernel of a windowed real DFT (or of its pseudo-inverse).
+    :param nfft: int, number of FFT points; must be >= win_size.
+    :param win_size: int, window (frame) length in samples.
+    :param hop_size: int, unused here; kept so existing callers keep working.
+    :param win_type: str, window name understood by scipy.signal.get_window;
+    None or the string "None" selects a rectangular window.
+    :param inverse: bool, if True return the pseudo-inverse (synthesis) kernel.
+    :return: (kernel, window) as float32 tensors of shape
+    [2 * (nfft // 2 + 1), 1, win_size] and [1, win_size, 1].
+    :raises ValueError: if win_size > nfft.
+    """
+    # Without this check the mismatch only surfaces later as an obscure NumPy broadcast error.
+    if win_size > nfft:
+        raise ValueError(f"win_size ({win_size}) must not exceed nfft ({nfft}).")
+    if win_type is None or win_type == "None":
+        window = np.ones(win_size)
+    else:
+        # square root of the window: the same function builds the analysis and the synthesis kernel
+        window = get_window(win_type, win_size, fftbins=True) ** 0.5
+    # rows are time samples, columns are the nfft // 2 + 1 non-negative frequency bins
+    fourier_basis = np.fft.rfft(np.eye(nfft))[:win_size]
+    # real rows first, imaginary rows second: [2 * (nfft // 2 + 1), win_size]
+    kernel = np.concatenate([np.real(fourier_basis), np.imag(fourier_basis)], 1).T
+    if inverse:
+        kernel = np.linalg.pinv(kernel).T
+    # add the in-channel axis expected by conv1d / conv_transpose1d weights
+    kernel = (kernel * window)[:, None, :]
+    return (
+        torch.from_numpy(kernel.astype(np.float32)),
+        torch.from_numpy(window[None, :, None].astype(np.float32))
+    )
+class ConvSTFT(nn.Module):
+    """STFT implemented as a strided 1-D convolution with the kernel produced by init_kernels."""
+    def __init__(self,
+                 nfft: int,
+                 win_size: int,
+                 hop_size: int,
+                 win_type: str = "hamming",
+                 feature_type: str = "real",
+                 requires_grad: bool = False):
+        """
+        :param nfft: int, number of FFT points; has no default, but None selects the next power of two >= win_size.
+        :param win_size: int, window length in samples.
+        :param hop_size: int, hop between frames; used as the convolution stride.
+        :param win_type: str, window name passed to init_kernels.
+        :param feature_type: str, "complex" returns the stacked real/imaginary output, anything else (magnitude, phase).
+        :param requires_grad: bool, whether the Fourier kernel is trainable.
+        """
+        super().__init__()
+        self.nfft = int(2**np.ceil(np.log2(win_size))) if nfft is None else nfft
+        kernel, _ = init_kernels(self.nfft, win_size, hop_size, win_type)
+        self.weight = nn.Parameter(kernel, requires_grad=requires_grad)
+        self.win_size = win_size
+        self.hop_size = hop_size
+        self.stride = hop_size
+        self.dim = self.nfft
+        self.feature_type = feature_type
+    def forward(self, inputs: torch.Tensor):
+        """
+        :param inputs: torch.Tensor, shape: [b, n_samples] or [b, 1, n_samples]
+        :return: for feature_type "complex" a tensor [b, 2 * (nfft // 2 + 1), t] (real bins, then imaginary bins);
+        otherwise a (magnitude, phase) tuple, each [b, nfft // 2 + 1, t].
+        """
+        if inputs.dim() == 2:
+            inputs = inputs.unsqueeze(1)
+        outputs = F.conv1d(inputs, self.weight, stride=self.stride)
+        if self.feature_type == "complex":
+            return outputs
+        n_bins = self.dim // 2 + 1
+        real, imag = outputs[:, :n_bins, :], outputs[:, n_bins:, :]
+        return torch.sqrt(real**2 + imag**2), torch.atan2(imag, real)
+class ConviSTFT(nn.Module):
+    """Inverse STFT: strided transposed 1-D convolution followed by overlap-added window normalisation."""
+    def __init__(self,
+                 win_size: int,
+                 hop_size: int,
+                 nfft: int = None,
+                 win_type: str = "hamming",
+                 feature_type: str = "real",
+                 requires_grad: bool = False):
+        """
+        :param win_size: int, window length in samples.
+        :param hop_size: int, hop between frames; used as the transposed-convolution stride.
+        :param nfft: int, number of FFT points; None selects the next power of two >= win_size.
+        :param win_type: str, window name passed to init_kernels.
+        :param feature_type: str, stored on the module; forward does not read it.
+        :param requires_grad: bool, whether the synthesis kernel is trainable.
+        """
+        super().__init__()
+        self.nfft = int(2**np.ceil(np.log2(win_size))) if nfft is None else nfft
+        kernel, window = init_kernels(self.nfft, win_size, hop_size, win_type, inverse=True)
+        self.weight = nn.Parameter(kernel, requires_grad=requires_grad)
+        self.win_size = win_size
+        self.hop_size = hop_size
+        self.win_type = win_type
+        self.stride = hop_size
+        self.dim = self.nfft
+        self.feature_type = feature_type
+        self.register_buffer("window", window)
+        # identity kernel: a transposed convolution with it overlap-adds raw frames
+        self.register_buffer("enframe", torch.eye(win_size)[:, None, :])
+    def forward(self,
+                inputs: torch.Tensor,
+                phase: torch.Tensor = None):
+        """
+        :param inputs: torch.Tensor, shape: [b, n+2, t] (complex spec) or [b, n//2+1, t] (mags)
+        :param phase: torch.Tensor, shape: [b, n//2+1, t]; when given, inputs is treated as magnitudes.
+        :return: torch.Tensor, shape: [b, 1, n_samples]
+        """
+        if phase is not None:
+            # rebuild the stacked (real, imaginary) representation from polar form
+            inputs = torch.cat([inputs * torch.cos(phase), inputs * torch.sin(phase)], 1)
+        n_frames = inputs.size(-1)
+        outputs = F.conv_transpose1d(inputs, self.weight, stride=self.stride)
+        # this is from torch-stft: https://github.com/pseeth/torch-stft
+        # overlap-add the squared window to get the per-sample normalisation factor
+        window_sq = self.window.repeat(1, 1, n_frames)**2
+        coff = F.conv_transpose1d(window_sq, self.enframe, stride=self.stride)
+        return outputs / (coff + 1e-8)
+def main():
+    """Smoke test: run a random waveform through ConvSTFT and back through ConviSTFT, printing both shapes."""
+    # ConvSTFT declares nfft as a required argument, so it is passed explicitly;
+    # 512 is also what ConviSTFT derives by default for win_size=512.
+    stft = ConvSTFT(nfft=512, win_size=512, hop_size=200, feature_type="complex")
+    istft = ConviSTFT(win_size=512, hop_size=200, feature_type="complex")
+    mixture = torch.rand(size=(1, 8000*40), dtype=torch.float32)
+    spec = stft.forward(mixture)
+    # shape: [batch_size, freq_bins, time_steps]
+    print(spec.shape)
+    waveform = istft.forward(spec)
+    # shape: [batch_size, channels, num_samples]
+    print(waveform.shape)
+    return
+if __name__ == "__main__":
+    main()

toolbox/torchaudio/models/frcrn/modeling_frcrn.py CHANGED Viewed

@@ -2,9 +2,324 @@
 # -*- coding: utf-8 -*-
 """
 https://arxiv.org/abs/2206.07293
 """
-from modelscope.models.audio.ans.frcrn import FRCRN
 if __name__ == "__main__":
-    pass

 # -*- coding: utf-8 -*-
 """
 https://arxiv.org/abs/2206.07293
+https://github.com/modelscope/modelscope/blob/master/modelscope/models/audio/ans/frcrn.py
+https://huggingface.co/spaces/alibabasglab/ClearVoice/blob/main/models/frcrn_se/frcrn.py
 """
+import os
+from typing import Optional, Union
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from toolbox.torchaudio.configuration_utils import CONFIG_FILE
+from toolbox.torchaudio.models.frcrn.configuration_frcrn import FRCRNConfig
+from toolbox.torchaudio.models.frcrn.conv_stft import ConviSTFT, ConvSTFT
+from toolbox.torchaudio.models.frcrn.unet import UNet
+class FRCRN(nn.Module):
+    """ Frequency Recurrent CRN """
+    def __init__(self,
+                 use_complex_networks: bool = True,
+                 model_complexity: int = 45,
+                 model_depth: int = 14,
+                 padding_mode: str = "zeros",
+                 nfft: int = 640,
+                 win_size: int = 640,
+                 hop_size: int = 320,
+                 win_type: str = "hann",
+                 eps: float = 1e-8,
+                 ):
+        """
+        :param use_complex_networks: bool, Whether to use complex networks.
+        :param model_complexity: int, model size setting forwarded to UNet.
+        :param model_depth: int, depth setting forwarded to UNet (see UNet for the supported values).
+        :param padding_mode: str, Encoder's convolution padding mode. 'zeros', 'reflect'
+        :param nfft: int, number of Short Time Fourier Transform (STFT) points
+        :param win_size: int, length of window used for defining one frame of sample points
+        :param hop_size: int, number of samples the window is shifted between frames
+        :param win_type: str, windowing type used in STFT and iSTFT, eg. 'hann', 'hamming'
+        :param eps: float, added to the noisy power when building the target mask in mag_pha_loss_fn.
+        """
+        super().__init__()
+        self.freq_bins = nfft // 2 + 1
+        self.nfft = nfft
+        self.win_size = win_size
+        self.hop_size = hop_size
+        self.win_type = win_type
+        self.eps = eps
+        # Analysis and synthesis must use the same window; without win_type here ConvSTFT
+        # silently falls back to its own default ("hamming").
+        self.stft = ConvSTFT(
+            nfft=self.nfft,
+            win_size=self.win_size,
+            hop_size=self.hop_size,
+            win_type=self.win_type,
+            feature_type="complex",
+            requires_grad=False
+        )
+        self.istft = ConviSTFT(
+            nfft=self.nfft,
+            win_size=self.win_size,
+            hop_size=self.hop_size,
+            win_type=self.win_type,
+            feature_type="complex",
+            requires_grad=False
+        )
+        self.unet = UNet(
+            in_channels=1,
+            use_complex_networks=use_complex_networks,
+            model_complexity=model_complexity,
+            model_depth=model_depth,
+            padding_mode=padding_mode
+        )
+        self.unet2 = UNet(
+            in_channels=1,
+            use_complex_networks=use_complex_networks,
+            model_complexity=model_complexity,
+            model_depth=model_depth,
+            padding_mode=padding_mode
+        )
+    def _pad_to_frames(self, signal: torch.Tensor):
+        """
+        Zero-pad the last axis on the right so that (n_samples - win_size) is a multiple of hop_size.
+        :param signal: torch.Tensor, shape: [..., n_samples]
+        :return: torch.Tensor, shape: [..., n_samples + pad]
+        """
+        remainder = (signal.shape[-1] - self.win_size) % self.hop_size
+        if remainder > 0:
+            signal = F.pad(signal, pad=(0, self.hop_size - remainder), mode="constant", value=0)
+        return signal
+    def forward(self, noisy: torch.Tensor):
+        """
+        :param noisy: torch.Tensor, shape: [b, n_samples] or [b, c, n_samples]
+        :return: (est_spec, est_wav, est_mask) with shapes [b, n+2, t], [b, n_samples], [b, n+2, t]
+        """
+        if noisy.dim() == 2:
+            noisy = torch.unsqueeze(noisy, dim=1)
+        _, _, n_samples = noisy.shape
+        noisy = self._pad_to_frames(noisy)
+        # [batch_size, freq_bins * 2, time_steps]
+        cmp_spec = self.stft.forward(noisy)
+        # [batch_size, 1, freq_bins * 2, time_steps]
+        cmp_spec = torch.unsqueeze(cmp_spec, 1)
+        # [batch_size, 2, freq_bins, time_steps]
+        cmp_spec = torch.cat([
+            cmp_spec[:, :, :self.freq_bins, :],
+            cmp_spec[:, :, self.freq_bins:, :],
+        ], dim=1)
+        # [batch_size, 2, freq_bins, time_steps, 1]
+        cmp_spec = torch.unsqueeze(cmp_spec, dim=4)
+        cmp_spec = torch.transpose(cmp_spec, 1, 4)
+        # [batch_size, 1, freq_bins, time_steps, 2]
+        unet1_out = self.unet.forward(cmp_spec)
+        cmp_mask1 = torch.tanh(unet1_out)
+        unet2_out = self.unet2.forward(unet1_out)
+        cmp_mask2 = torch.tanh(unet2_out)
+        # the final mask is the sum of both stages' masks
+        cmp_mask2 = cmp_mask2 + cmp_mask1
+        est_spec, est_wav, est_mask = self.apply_mask(cmp_spec, cmp_mask2)
+        # est_wav shape: [b, n_samples]; drop the samples that were only added as padding
+        est_wav = est_wav[:, :n_samples]
+        return est_spec, est_wav, est_mask
+    def apply_mask(self,
+                   cmp_spec: torch.Tensor,
+                   cmp_mask: torch.Tensor,
+                   ):
+        """
+        Multiply the spectrum by the mask as complex numbers and synthesise the waveform.
+        :param cmp_spec: torch.Tensor, shape: [batch_size, 1, freq_bins, time_steps, 2]
+        :param cmp_mask: torch.Tensor, shape: [batch_size, 1, freq_bins, time_steps, 2]
+        :return: (est_spec, est_wav, cmp_mask) with shapes [b, n+2, t], [b, n_samples], [b, n+2, t]
+        """
+        est_spec = torch.cat(
+            tensors=[
+                cmp_spec[..., 0] * cmp_mask[..., 0] - cmp_spec[..., 1] * cmp_mask[..., 1],
+                cmp_spec[..., 0] * cmp_mask[..., 1] + cmp_spec[..., 1] * cmp_mask[..., 0]
+            ], dim=1
+        )
+        # est_spec shape: [b, 2, n//2+1, t]
+        est_spec = torch.cat(tensors=[est_spec[:, 0, :, :], est_spec[:, 1, :, :]], dim=1)
+        # est_spec shape: [b, n+2, t]
+        # cmp_mask shape: [b, 1, n//2+1, t, 2]
+        cmp_mask = torch.squeeze(cmp_mask, dim=1)
+        # cmp_mask shape: [b, n//2+1, t, 2]
+        cmp_mask = torch.cat(tensors=[cmp_mask[:, :, :, 0], cmp_mask[:, :, :, 1]], dim=1)
+        # cmp_mask shape: [b, n+2, t]
+        est_wav = self.istft(est_spec)
+        # est_wav shape: [b, 1, n_samples]
+        est_wav = torch.squeeze(est_wav, 1)
+        # est_wav shape: [b, n_samples]
+        return est_spec, est_wav, cmp_mask
+    def get_params(self, weight_decay=0.0):
+        """
+        Build optimizer parameter groups: weight_decay for weights, none for biases.
+        Configuring weight_decay on trainable parameters implements L2 regularization.
+        1. Prevents over-fitting: the L2 norm (sum of squares) of the parameters is added to the loss
+        as a penalty, which limits the size of the weights. The model then prefers smaller weights,
+        is less sensitive to the training data and generalizes better.
+        2. Controls model complexity: the decay acts directly on the update step,
+        weight = weight - lr * (gradient + weight_decay * weight),
+        i.e. an extra pull proportional to the current weight that restrains fast weight growth.
+        3. Depends on the optimizer implementation:
+        with SGD-style optimizers weight_decay is exactly L2 regularization;
+        with Adam the decay is coupled with the adaptive update and may be weakened by it;
+        AdamW decouples the decay from the learning-rate adaptation.
+        Notes:
+        too large a value causes under-fitting, too small a value regularizes weakly;
+        1e-4 to 1e-2 is a common range.
+        Some layers (e.g. BatchNorm) may need their own weight_decay through parameter groups.
+        :param weight_decay: float, decay applied to every parameter whose name does not contain "bias".
+        :return: list of two parameter-group dicts (weights, biases).
+        """
+        weights, biases = [], []
+        for name, param in self.named_parameters():
+            if "bias" in name:
+                biases += [param]
+            else:
+                weights += [param]
+        params = [{
+            'params': weights,
+            'weight_decay': weight_decay,
+        }, {
+            'params': biases,
+            'weight_decay': 0.0,
+        }]
+        return params
+    def mag_pha_loss_fn(self, est_mask: torch.Tensor, clean: torch.Tensor, noisy: torch.Tensor):
+        """
+        MSE between the estimated mask and the complex ratio mask computed from clean / noisy.
+        :param est_mask: torch.Tensor, shape: [b, n+2, t], as returned by forward.
+        :param clean: torch.Tensor, clean waveform, shape: [b, n_samples] or [b, 1, n_samples]
+        :param noisy: torch.Tensor, noisy waveform, same shape as clean.
+        :return: (amp_loss, phase_loss), MSE on the real and on the imaginary part of the mask.
+        """
+        # pad exactly like forward does, otherwise the frame count can differ from est_mask
+        clean_stft = self.stft(self._pad_to_frames(clean))
+        clean_re = clean_stft[:, :self.freq_bins, :]
+        clean_im = clean_stft[:, self.freq_bins:, :]
+        noisy_stft = self.stft(self._pad_to_frames(noisy))
+        noisy_re = noisy_stft[:, :self.freq_bins, :]
+        noisy_im = noisy_stft[:, self.freq_bins:, :]
+        noisy_power = noisy_re ** 2 + noisy_im ** 2
+        sr = clean_re
+        yr = noisy_re
+        si = clean_im
+        yi = noisy_im
+        y_pow = noisy_power
+        # target mask M = S / Y = S * conj(Y) / |Y|^2
+        # real part: (Sr * Yr + Si * Yi) / (Y_pow + eps)
+        gth_mask_re = (sr * yr + si * yi) / (y_pow + self.eps)
+        # imaginary part: (Si * Yr - Sr * Yi) / (Y_pow + eps)
+        gth_mask_im = (si * yr - sr * yi) / (y_pow + self.eps)
+        # outliers beyond +-2 are replaced by +-1
+        gth_mask_re[gth_mask_re > 2] = 1
+        gth_mask_re[gth_mask_re < -2] = -1
+        gth_mask_im[gth_mask_im > 2] = 1
+        gth_mask_im[gth_mask_im < -2] = -1
+        mask_re = est_mask[:, :self.freq_bins, :]
+        mask_im = est_mask[:, self.freq_bins:, :]
+        amp_loss = F.mse_loss(gth_mask_re, mask_re)
+        phase_loss = F.mse_loss(gth_mask_im, mask_im)
+        return amp_loss, phase_loss
+MODEL_FILE = "model.pt"
+class FRCRNPretrainedModel(FRCRN):
+    """FRCRN constructed from an FRCRNConfig, with helpers to load from / save to a directory."""
+    def __init__(self,
+                 config: FRCRNConfig,
+                 ):
+        super().__init__(
+            use_complex_networks=config.use_complex_networks,
+            model_complexity=config.model_complexity,
+            model_depth=config.model_depth,
+            nfft=config.nfft,
+            win_size=config.win_size,
+            hop_size=config.hop_size,
+            win_type=config.win_type,
+        )
+        self.config = config
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        """
+        :param pretrained_model_name_or_path: directory containing MODEL_FILE, or a checkpoint file path.
+        :param kwargs: forwarded to FRCRNConfig.from_pretrained.
+        :return: model with the checkpoint's state dict loaded (strict) on CPU.
+        """
+        config = FRCRNConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        model = cls(config)
+        ckpt_file = pretrained_model_name_or_path
+        if os.path.isdir(ckpt_file):
+            ckpt_file = os.path.join(ckpt_file, MODEL_FILE)
+        with open(ckpt_file, "rb") as f:
+            state_dict = torch.load(f, map_location="cpu", weights_only=True)
+        model.load_state_dict(state_dict, strict=True)
+        return model
+    def save_pretrained(self,
+                        save_directory: Union[str, os.PathLike],
+                        state_dict: Optional[dict] = None,
+                        ):
+        """
+        :param save_directory: directory to write into; created if missing.
+        :param state_dict: state dict to store; defaults to this model's own state dict.
+        :return: save_directory
+        """
+        if state_dict is None:
+            state_dict = self.state_dict()
+        os.makedirs(save_directory, exist_ok=True)
+        # weights
+        torch.save(state_dict, os.path.join(save_directory, MODEL_FILE))
+        # config
+        self.config.to_yaml_file(os.path.join(save_directory, CONFIG_FILE))
+        return save_directory
+def main():
+    """Smoke test: run FRCRN on 8000 random samples and print the shapes of est_spec, est_wav and est_mask."""
+    model = FRCRN(
+        use_complex_networks=True,
+        model_complexity=45,
+        model_depth=14,
+        padding_mode="zeros",
+        nfft=640,
+        win_size=640,
+        hop_size=320,
+        win_type="hann",
+    )
+    mixture = torch.rand(size=(1, 8000), dtype=torch.float32)
+    for output in model.forward(mixture):
+        print(output.shape)
 if __name__ == "__main__":
+    main()

toolbox/torchaudio/models/frcrn/unet.py ADDED Viewed

	@@ -0,0 +1,359 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+from typing import Union, Tuple
+import torch
+import torch.nn as nn
+from toolbox.torchaudio.models.frcrn import complex_nn
+class SELayer(nn.Module):
+    """
+    Squeeze-and-excitation style gating for tensors whose last axis holds (real, imaginary):
+    globally pooled real/imaginary channel statistics go through two small MLPs (fc_r, fc_i),
+    are combined following the complex-multiplication pattern, and scale the input.
+    """
+    def __init__(self, channels: int, reduction: int = 16):
+        super().__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.fc_r = self._make_gate(channels, reduction)
+        self.fc_i = self._make_gate(channels, reduction)
+    @staticmethod
+    def _make_gate(channels: int, reduction: int):
+        """Bottleneck MLP: channels -> channels // reduction -> channels, ending in a sigmoid."""
+        hidden = channels // reduction
+        return nn.Sequential(
+            nn.Linear(channels, hidden),
+            nn.ReLU(inplace=True),
+            nn.Linear(hidden, channels),
+            nn.Sigmoid()
+        )
+    def forward(self, x: torch.Tensor):
+        """
+        :param x: torch.Tensor, 5-D, shape: [b, c, d1, d2, 2]
+        :return: torch.Tensor, same shape as x.
+        """
+        b, c, _, _, _ = x.shape
+        # per-channel global averages of the real and the imaginary part, each [b, c]
+        pooled_re = self.avg_pool(x[..., 0]).view(b, c)
+        pooled_im = self.avg_pool(x[..., 1]).view(b, c)
+        gate_re = self.fc_r(pooled_re) - self.fc_i(pooled_im)
+        gate_im = self.fc_r(pooled_im) + self.fc_i(pooled_re)
+        # [b, c, 1, 1, 2], broadcast over the two middle axes
+        gate = torch.stack((gate_re, gate_im), dim=-1).view(b, c, 1, 1, 2)
+        return x * gate
+class Encoder(nn.Module):
+    """Convolution -> batch norm -> LeakyReLU block, in a real or a complex variant."""
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: Union[int, Tuple[int, int]],
+                 stride: Union[int, Tuple[int, int]],
+                 padding: Union[int, Tuple[int, int]] = None,
+                 use_complex_networks: bool = False,
+                 padding_mode: str = "zeros"
+                 ):
+        """
+        :param in_channels: int, input channels.
+        :param out_channels: int, output channels.
+        :param kernel_size: int or (int, int), convolution kernel size.
+        :param stride: int or (int, int), convolution stride.
+        :param padding: int or (int, int); None selects (k - 1) // 2 per kernel dimension.
+        :param use_complex_networks: bool, use the complex_nn layers instead of the torch.nn ones.
+        :param padding_mode: str, forwarded to the convolution.
+        """
+        super().__init__()
+        if padding is None:
+            # 'SAME' padding; kernel_size may be a single int as well as a pair
+            if isinstance(kernel_size, int):
+                padding = (kernel_size - 1) // 2
+            else:
+                padding = tuple((k - 1) // 2 for k in kernel_size)
+        if use_complex_networks:
+            conv = complex_nn.ComplexConv2d
+            bn = complex_nn.ComplexBatchNorm2d
+        else:
+            conv = nn.Conv2d
+            bn = nn.BatchNorm2d
+        self.conv = conv(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            padding_mode=padding_mode
+        )
+        self.bn = bn(out_channels)
+        self.relu = nn.LeakyReLU(inplace=True)
+    def forward(self, x: torch.Tensor):
+        """
+        :param x: torch.Tensor, input of the selected convolution type.
+        :return: torch.Tensor, activated feature map.
+        """
+        x = self.conv(x)
+        x = self.bn(x)
+        x = self.relu(x)
+        return x
+class Decoder(nn.Module):
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: Union[int, Tuple[int, int]],
+                 stride: Union[int, Tuple[int, int]],
+                 padding: Union[int, Tuple[int, int]] = (0, 0),
+                 use_complex_networks: bool = False,
+                 ):
+        super().__init__()
+        if use_complex_networks:
+            tconv = complex_nn.ComplexConvTranspose2d
+            bn = complex_nn.ComplexBatchNorm2d
+        else:
+            tconv = nn.ConvTranspose2d
+            bn = nn.BatchNorm2d
+        self.transconv = tconv(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding
+        )
+        self.bn = bn(out_channels)
+        self.relu = nn.LeakyReLU(inplace=True)
+    def forward(self, x):
+        x = self.transconv(x)
+        x = self.bn(x)
+        x = self.relu(x)
+        return x
+class UNetConfig14(object):
+    """
+    inputs x shape: [1, 1, 321, 2000, 2]
+    sample rate: 16000
+    nfft: 640
+    win_size: 640
+    hop_size: 320 (200ms)
+    """
+    def __init__(self, in_channels: int):
+        self.enc_channels = [in_channels, 128, 128, 128, 128, 128, 128, 128]
+        self.enc_kernel_sizes = [(5, 2), (5, 2), (5, 2), (5, 2), (5, 2), (5, 2), (2, 2)]
+        self.enc_strides = [(2, 1), (2, 1), (2, 1), (2, 1), (2, 1), (2, 1), (2, 1)]
+        self.enc_paddings = [(0, 1), (0, 1), (0, 1), (0, 1), (0, 1), (0, 1), (0, 1)]
+        self.dec_channels = [64, 128, 128, 128, 128, 128, 128, 1]
+        self.dec_kernel_sizes = [(2, 2), (5, 2), (5, 2), (5, 2), (6, 2), (5, 2), (5, 2)]
+        self.dec_strides = [(2, 1), (2, 1), (2, 1), (2, 1), (2, 1), (2, 1), (2, 1)]
+        self.dec_paddings = [(0, 1), (0, 1), (0, 1), (0, 1), (0, 1), (0, 1), (0, 1)]
+class UNetConfig10(object):
+    """
+    inputs x shape: [1, 1, 65, 200, 2]
+    sample rate: 8000
+    nfft: 128
+    win_size: 128
+    hop_size: 64 (8ms)
+    """
+    def __init__(self, in_channels: int):
+        self.enc_channels = [in_channels, 16, 32, 64, 128, 256]
+        self.enc_kernel_sizes = [(3, 3), (3, 3), (3, 3), (3, 3), (3, 3)]
+        self.enc_strides = [(2, 1), (2, 1), (2, 1), (2, 1), (2, 1)]
+        self.enc_paddings = [(0, 1), (0, 1), (0, 1), (0, 1), (0, 1)]
+        self.dec_channels = [128, 128, 64, 32, 16, 1]
+        self.dec_kernel_sizes = [(3, 3), (3, 3), (3, 3), (4, 3), (3, 3)]
+        self.dec_strides = [(2, 1), (2, 1), (2, 1), (2, 1), (2, 1)]
+        self.dec_paddings = [(0, 1), (0, 1), (0, 1), (0, 1), (0, 1)]
+class UNetConfig20(object):
+    """
+    inputs x shape: [1, 1, 257, 2000, 2]
+    sample rate: 8000
+    nfft: 512
+    win_size: 512
+    hop_size: 256 (32ms)
+    """
+    def __init__(self, in_channels: int, model_complexity: int):
+        self.enc_channels = [
+            in_channels,
+            model_complexity, model_complexity,
+            model_complexity * 2, model_complexity * 2,
+            model_complexity * 2, model_complexity * 2,
+            model_complexity * 2, model_complexity * 2,
+            model_complexity * 2,
+            128
+        ]
+        self.enc_kernel_sizes = [(7, 1), (1, 7), (6, 4), (7, 5), (5, 3),
+                                 (5, 3), (5, 3), (5, 3), (5, 3), (5, 3)]
+        self.enc_strides = [(1, 1), (1, 1), (2, 2), (2, 1), (2, 2),
+                            (2, 1), (2, 2), (2, 1), (2, 2), (2, 1)]
+        self.enc_paddings = [
+            (3, 0),
+            (0, 3),
+            None,  # (0, 2),
+            None,
+            None,  # (3,1),
+            None,  # (3,1),
+            None,  # (1,2),
+            None,
+            None,
+            None
+        ]
+        self.dec_channels = [
+            64,
+            model_complexity * 2,
+            model_complexity * 2, model_complexity * 2,
+            model_complexity * 2, model_complexity * 2,
+            model_complexity * 2, model_complexity * 2,
+            model_complexity, model_complexity,
+            1
+        ]
+        self.dec_kernel_sizes = [(4, 3), (4, 2), (4, 3), (4, 2), (4, 3),
+                                 (4, 2), (6, 3), (7, 4), (1, 7), (7, 1)]
+        self.dec_strides = [(2, 1), (2, 2), (2, 1), (2, 2), (2, 1),
+                            (2, 2), (2, 1), (2, 2), (1, 1), (1, 1)]
+        self.dec_paddings = [(1, 1), (1, 0), (1, 1), (1, 0), (1, 1),
+                             (1, 0), (2, 1), (2, 1), (0, 3), (3, 0)]
+class UNet(nn.Module):
+    def __init__(self,
+                 in_channels: int = 1,
+                 use_complex_networks: bool = False,
+                 model_complexity: int = 45,
+                 model_depth: int = 20,
+                 padding_mode: str = "zeros"
+                 ):
+        super().__init__()
+        if use_complex_networks:
+            model_complexity = int(model_complexity // 1.414)
+        # config
+        if model_depth == 14:
+            config = UNetConfig14(in_channels)
+        elif model_depth == 10:
+            config = UNetConfig10(in_channels)
+        elif model_depth == 20:
+            config = UNetConfig20(in_channels, model_complexity)
+        else:
+            raise AssertionError(f"Unknown model depth : {model_depth}")
+        self.model_length = model_depth // 2
+        self.fsmn = complex_nn.ComplexUniDeepFsmn(
+            config.enc_channels[-1],
+            config.enc_channels[-1]
+        )
+        # go down
+        self.encoder_layers = nn.ModuleList(modules=[])
+        for i in range(self.model_length):
+            encoder_layer = nn.Sequential(
+                complex_nn.ComplexUniDeepFsmnL1(
+                    config.enc_channels[i],
+                    config.enc_channels[i]
+                )
+                if i != 0 else nn.Identity(),
+                Encoder(
+                    config.enc_channels[i],
+                    config.enc_channels[i + 1],
+                    kernel_size=config.enc_kernel_sizes[i],
+                    stride=config.enc_strides[i],
+                    padding=config.enc_paddings[i],
+                    use_complex_networks=use_complex_networks,
+                    padding_mode=padding_mode
+                ),
+                SELayer(config.enc_channels[i + 1], reduction=8)
+            )
+            self.encoder_layers.append(encoder_layer)
+        self.decoder_layers = nn.ModuleList(modules=[])
+        for i in range(self.model_length):
+            decoder_layer = nn.Sequential(
+                Decoder(
+                    config.dec_channels[i] * 2,
+                    config.dec_channels[i + 1],
+                    kernel_size=config.dec_kernel_sizes[i],
+                    stride=config.dec_strides[i],
+                    padding=config.dec_paddings[i],
+                    use_complex_networks=use_complex_networks
+                ),
+                complex_nn.ComplexUniDeepFsmnL1(
+                    config.dec_channels[i + 1],
+                    config.dec_channels[i + 1]
+                )
+                if i < (self.model_length - 1) else nn.Identity(),
+                SELayer(
+                    config.dec_channels[i + 1],
+                    reduction=8
+                )
+                if i < (self.model_length - 2) else nn.Identity()
+            )
+            self.decoder_layers.append(decoder_layer)
+        if use_complex_networks:
+            conv = complex_nn.ComplexConv2d
+        else:
+            conv = nn.Conv2d
+        self.linear = conv(
+            in_channels=config.dec_channels[-1],
+            out_channels=1,
+            kernel_size=1,
+        )
+    def forward(self, inputs: torch.Tensor):
+        """
+        :param inputs: torch.Tensor, shape: [b, c, f, t, 2]
+        :return:
+        """
+        x = inputs
+        # go down
+        xs = list()
+        xs_se = list()
+        xs_se.append(x)
+        for encoder_layer in self.encoder_layers:
+            xs.append(x)
+            # print(f"x: {x.shape}")
+            x = encoder_layer.forward(x)
+            # print(f"x: {x.shape}")
+            xs_se.append(x)
+        # x shape: [b, c, 1, t', 2]
+        x = self.fsmn.forward(x)
+        # x shape: [b, c, 1, t', 2]
+        # print(f"fsmn")
+        p = x
+        for i, decoder_layers in enumerate(self.decoder_layers):
+            # print(f"x: {x.shape}")
+            p = decoder_layers.forward(p)
+            # print(f"p: {p.shape}")
+            if i == self.model_length - 1:
+                break
+            p = torch.cat(tensors=[p, xs_se[self.model_length - 1 - i]], dim=1)
+        # cmp_spec: [1, 1, 321, 200, 2]
+        # cmp_spec: [1, 1, 513, 200, 2]
+        cmp_spec = self.linear.forward(p)
+        return cmp_spec
+def main():
+    # [batch_size, 1, freq_bins, time_steps, 2]
+    x = torch.rand(size=(1, 1, 257, 2000, 2))
+    # x = torch.rand(size=(1, 1, 256, 2000, 2))
+    # x = torch.rand(size=(1, 1, 255, 2000, 2))
+    unet = UNet(
+        in_channels=1,
+        model_complexity=45,
+        model_depth=20,
+        use_complex_networks=True
+    )
+    print(unet)
+    result = unet.forward(x)
+    print(result.shape)
+    return
+if __name__ == "__main__":
+    main()

toolbox/torchaudio/models/frcrn/uni_deep_fsmn.py ADDED Viewed

	@@ -0,0 +1,71 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+"""
+https://github.com/modelscope/modelscope/blob/master/modelscope/models/audio/ans/layers/uni_deep_fsmn.py
+https://huggingface.co/spaces/alibabasglab/ClearVoice/blob/main/models/mossformer2_se/fsmn.py
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class UniDeepFsmn(nn.Module):
+    def __init__(self,
+                 input_dim: int,
+                 hidden_size: int,
+                 lorder: int = 1,
+                 ):
+        super(UniDeepFsmn, self).__init__()
+        self.input_dim = input_dim
+        self.hidden_size = hidden_size
+        self.lorder = lorder
+        self.linear = nn.Linear(input_dim, hidden_size)
+        self.project = nn.Linear(hidden_size, input_dim, bias=False)
+        self.conv1 = nn.Conv2d(
+            input_dim,
+            input_dim,
+            kernel_size=(lorder, 1),
+            stride=(1, 1),
+            groups=input_dim,
+            bias=False
+        )
+    def forward(self, inputs: torch.Tensor):
+        """
+        :param inputs: torch.Tensor, shape: [b, t, h]
+        :return: torch.Tensor, shape: [b, t, h]
+        """
+        x = F.relu(self.linear(inputs))
+        x = self.project(x)
+        x = torch.unsqueeze(x, 1)
+        # x shape: [b, 1, t, h]
+        x = x.permute(0, 3, 2, 1)
+        # x shape: [b, h, t, 1]
+        y = F.pad(x, [0, 0, self.lorder - 1, 0])
+        x = x + self.conv1(y)
+        x = x.permute(0, 3, 2, 1)
+        # x shape: [b, 1, t, h]
+        x = x.squeeze()
+        result = inputs + x
+        return result
+def main():
+    x = torch.rand(size=(1, 200, 32))
+    fsmn = UniDeepFsmn(
+        input_dim=32,
+        hidden_size=64,
+        lorder=3,
+    )
+    result = fsmn.forward(x)
+    print(result.shape)
+    return
+if __name__ == "__main__":
+    main()