HoneyTian committed on
Commit e86d760 · 1 Parent(s): b1eb75a
Files changed (31)
  1. examples/conv_tasnet/run.sh +170 -0
  2. examples/conv_tasnet/step_1_prepare_data.py +201 -0
  3. examples/conv_tasnet/step_2_train_model.py +413 -0
  4. examples/conv_tasnet/yaml/config.yaml +42 -0
  5. examples/data_preprocess/dns_challenge_to_8k/process_dns_challenge_clean_emotional_speech.py +90 -0
  6. examples/data_preprocess/dns_challenge_to_8k/process_dns_challenge_clean_read_speech.py +123 -0
  7. examples/data_preprocess/dns_challenge_to_8k/process_dns_challenge_demand.py +71 -0
  8. examples/data_preprocess/dns_challenge_to_8k/process_dns_challenge_impulse_responses.py +93 -0
  9. examples/data_preprocess/dns_challenge_to_8k/process_dns_challenge_noise.py +77 -0
  10. examples/data_preprocess/dns_challenge_to_8k/process_musan.py +8 -0
  11. examples/mpnet/run.sh +2 -2
  12. examples/nx_mpnet/yaml/config.yaml +5 -5
  13. main.py +8 -1
  14. requirements.txt +1 -0
  15. toolbox/torchaudio/losses/__init__.py +6 -0
  16. toolbox/torchaudio/losses/perceptual.py +75 -0
  17. toolbox/torchaudio/losses/snr.py +101 -0
  18. toolbox/torchaudio/losses/spectral.py +351 -0
  19. toolbox/torchaudio/metrics/__init__.py +6 -0
  20. toolbox/torchaudio/metrics/pesq.py +80 -0
  21. toolbox/torchaudio/models/conv_tasnet/configuration_conv_tasnet.py +52 -0
  22. toolbox/torchaudio/models/conv_tasnet/modeling_conv_tasnet.py +477 -2
  23. toolbox/torchaudio/models/conv_tasnet/utils.py +55 -0
  24. toolbox/torchaudio/models/conv_tasnet/yaml/config.yaml +17 -0
  25. toolbox/torchaudio/models/demucs/__init__.py +6 -0
  26. toolbox/torchaudio/models/demucs/configuration_demucs.py +51 -0
  27. toolbox/torchaudio/models/demucs/modeling_demucs.py +299 -0
  28. toolbox/torchaudio/models/demucs/resample.py +81 -0
  29. toolbox/torchaudio/models/nx_dfnet/configuration_nx_dfnet.py +102 -0
  30. toolbox/torchaudio/models/nx_dfnet/modeling_nx_dfnet.py +989 -0
  31. toolbox/torchaudio/models/nx_dfnet/utils.py +55 -0
examples/conv_tasnet/run.sh ADDED
@@ -0,0 +1,170 @@
1
+ #!/usr/bin/env bash
2
+
3
+ : <<'END'
4
+
5
+
6
+ sh run.sh --stage 2 --stop_stage 2 --system_version windows --file_folder_name file_dir --final_model_name mpnet-aishell-20250224 \
7
+ --noise_dir "E:/Users/tianx/HuggingDatasets/nx_noise/data/noise" \
8
+ --speech_dir "E:/programmer/asr_datasets/aishell/data_aishell/wav/train"
9
+
10
+
11
+ sh run.sh --stage 3 --stop_stage 3 --system_version centos --file_folder_name file_dir --final_model_name mpnet-aishell-20250224 \
12
+ --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise" \
13
+ --speech_dir "/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train"
14
+
15
+ sh run.sh --stage 2 --stop_stage 2 --system_version centos --file_folder_name file_dir --final_model_name nx-clean-unet-aishell-20250228 \
16
+ --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise" \
17
+ --speech_dir "/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train" \
18
+ --max_epochs 100
19
+
20
+
21
+ sh run.sh --stage 1 --stop_stage 2 --system_version centos --file_folder_name file_dir --final_model_name mpnet-nx-speech-20250224 \
22
+ --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise" \
23
+ --speech_dir "/data/tianxing/HuggingDatasets/nx_noise/data/speech" \
24
+ --max_epochs 100 --max_count 10000
25
+
26
+
27
+ END
28
+
29
+
30
+ # params
31
+ system_version="windows";
32
+ verbose=true;
33
+ stage=0 # start from 0 if you need to start from data preparation
34
+ stop_stage=9
35
+
36
+ work_dir="$(pwd)"
37
+ file_folder_name=file_folder_name
38
+ final_model_name=final_model_name
39
+ config_file="yaml/config.yaml"
40
+ limit=10
41
+
42
+ noise_dir=/data/tianxing/HuggingDatasets/nx_noise/data/noise
43
+ speech_dir=/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train
44
+
45
+ max_count=10000000
46
+
47
+ nohup_name=nohup.out
48
+
49
+ # model params
50
+ batch_size=64
51
+ max_epochs=200
52
+ save_top_k=10
53
+ patience=5
54
+
55
+
56
+ # parse options
57
+ while true; do
58
+ [ -z "${1:-}" ] && break; # break if there are no arguments
59
+ case "$1" in
60
+ --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
61
+ eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
62
+ old_value="$(eval echo \\$$name)";
63
+ if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
64
+ was_bool=true;
65
+ else
66
+ was_bool=false;
67
+ fi
68
+
69
+ # Set the variable to the right value-- the escaped quotes make it work if
70
+ # the option had spaces, like --cmd "queue.pl -sync y"
71
+ eval "${name}=\"$2\"";
72
+
73
+ # Check that Boolean-valued arguments are really Boolean.
74
+ if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
75
+ echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
76
+ exit 1;
77
+ fi
78
+ shift 2;
79
+ ;;
80
+
81
+ *) break;
82
+ esac
83
+ done
84
+
85
+ file_dir="${work_dir}/${file_folder_name}"
86
+ final_model_dir="${work_dir}/../../trained_models/${final_model_name}";
87
+ evaluation_audio_dir="${file_dir}/evaluation_audio"
88
+
89
+ dataset="${file_dir}/dataset.xlsx"
90
+ train_dataset="${file_dir}/train.xlsx"
91
+ valid_dataset="${file_dir}/valid.xlsx"
92
+
93
+ $verbose && echo "system_version: ${system_version}"
94
+ $verbose && echo "file_folder_name: ${file_folder_name}"
95
+
96
+ if [ $system_version == "windows" ]; then
97
+ alias python3='D:/Users/tianx/PycharmProjects/virtualenv/nx_denoise/Scripts/python.exe'
98
+ elif [ $system_version == "centos" ] || [ $system_version == "ubuntu" ]; then
99
+ #source /data/local/bin/nx_denoise/bin/activate
100
+ alias python3='/data/local/bin/nx_denoise/bin/python3'
101
+ fi
102
+
103
+
104
+ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
105
+ $verbose && echo "stage 1: prepare data"
106
+ cd "${work_dir}" || exit 1
107
+ python3 step_1_prepare_data.py \
108
+ --file_dir "${file_dir}" \
109
+ --noise_dir "${noise_dir}" \
110
+ --speech_dir "${speech_dir}" \
111
+ --train_dataset "${train_dataset}" \
112
+ --valid_dataset "${valid_dataset}" \
113
+ --max_count "${max_count}" \
114
+
115
+ fi
116
+
117
+
118
+ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
119
+ $verbose && echo "stage 2: train model"
120
+ cd "${work_dir}" || exit 1
121
+ python3 step_2_train_model.py \
122
+ --train_dataset "${train_dataset}" \
123
+ --valid_dataset "${valid_dataset}" \
124
+ --serialization_dir "${file_dir}" \
125
+ --config_file "${config_file}" \
126
+
127
+ fi
128
+
129
+
130
+ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
131
+ $verbose && echo "stage 3: test model"
132
+ cd "${work_dir}" || exit 1
133
+ python3 step_3_evaluation.py \
134
+ --valid_dataset "${valid_dataset}" \
135
+ --model_dir "${file_dir}/best" \
136
+ --evaluation_audio_dir "${evaluation_audio_dir}" \
137
+ --limit "${limit}" \
138
+
139
+ fi
140
+
141
+
142
+ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
143
+ $verbose && echo "stage 4: collect files"
144
+ cd "${work_dir}" || exit 1
145
+
146
+ mkdir -p ${final_model_dir}
147
+
148
+ cp "${file_dir}/best"/* "${final_model_dir}"
149
+ cp -r "${file_dir}/evaluation_audio" "${final_model_dir}"
150
+
151
+ cd "${final_model_dir}/.." || exit 1;
152
+
153
+ if [ -e "${final_model_name}.zip" ]; then
154
+ rm -rf "${final_model_name}_backup.zip"
155
+ mv "${final_model_name}.zip" "${final_model_name}_backup.zip"
156
+ fi
157
+
158
+ zip -r "${final_model_name}.zip" "${final_model_name}"
159
+ rm -rf "${final_model_name}"
160
+
161
+ fi
162
+
163
+
164
+ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
165
+ $verbose && echo "stage 5: clear file_dir"
166
+ cd "${work_dir}" || exit 1
167
+
168
+ rm -rf "${file_dir}";
169
+
170
+ fi
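Note: the usage examples in the heredoc at the top of this script still reference the mpnet and nx-clean-unet recipes. A hypothetical invocation for this conv_tasnet recipe (the final_model_name below is a placeholder, not taken from the commit; the data paths are the script defaults) would follow the same pattern:

sh run.sh --stage 1 --stop_stage 3 --system_version centos --file_folder_name file_dir --final_model_name conv-tasnet-aishell \
  --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise" \
  --speech_dir "/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train" \
  --max_epochs 100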
examples/conv_tasnet/step_1_prepare_data.py ADDED
@@ -0,0 +1,201 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import os
5
+ from pathlib import Path
6
+ import random
7
+ import sys
8
+ import shutil
9
+
10
+ pwd = os.path.abspath(os.path.dirname(__file__))
11
+ sys.path.append(os.path.join(pwd, "../../"))
12
+
13
+ import pandas as pd
14
+ from scipy.io import wavfile
15
+ from tqdm import tqdm
16
+ import librosa
17
+
18
+ from project_settings import project_path
19
+
20
+
21
+ def get_args():
22
+ parser = argparse.ArgumentParser()
23
+ parser.add_argument("--file_dir", default="./", type=str)
24
+
25
+ parser.add_argument(
26
+ "--noise_dir",
27
+ default=r"E:\Users\tianx\HuggingDatasets\nx_noise\data\noise",
28
+ type=str
29
+ )
30
+ parser.add_argument(
31
+ "--speech_dir",
32
+ default=r"E:\programmer\asr_datasets\aishell\data_aishell\wav\train",
33
+ type=str
34
+ )
35
+
36
+ parser.add_argument("--train_dataset", default="train.xlsx", type=str)
37
+ parser.add_argument("--valid_dataset", default="valid.xlsx", type=str)
38
+
39
+ parser.add_argument("--duration", default=2.0, type=float)
40
+ parser.add_argument("--min_snr_db", default=-10, type=float)
41
+ parser.add_argument("--max_snr_db", default=20, type=float)
42
+
43
+ parser.add_argument("--target_sample_rate", default=8000, type=int)
44
+
45
+ parser.add_argument("--max_count", default=10000, type=int)
46
+
47
+ args = parser.parse_args()
48
+ return args
49
+
50
+
51
+ def filename_generator(data_dir: str):
52
+ data_dir = Path(data_dir)
53
+ for filename in data_dir.glob("**/*.wav"):
54
+ yield filename.as_posix()
55
+
56
+
57
+ def target_second_signal_generator(data_dir: str, duration: float = 2.0, sample_rate: int = 8000):
58
+ data_dir = Path(data_dir)
59
+ for filename in data_dir.glob("**/*.wav"):
60
+ signal, _ = librosa.load(filename.as_posix(), sr=sample_rate)
61
+ raw_duration = librosa.get_duration(y=signal, sr=sample_rate)
62
+
63
+ if raw_duration < duration:
64
+ # print(f"duration less than {duration} s. skip filename: {filename.as_posix()}")
65
+ continue
66
+ if signal.ndim != 1:
67
+ raise AssertionError(f"expected ndim 1, instead of {signal.ndim}")
68
+
69
+ signal_length = len(signal)
70
+ win_size = int(duration * sample_rate)
71
+ for begin in range(0, signal_length - win_size, win_size):
72
+ row = {
73
+ "filename": filename.as_posix(),
74
+ "raw_duration": round(raw_duration, 4),
75
+ "offset": round(begin / sample_rate, 4),
76
+ "duration": round(duration, 4),
77
+ }
78
+ yield row
79
+
80
+
81
+ def get_dataset(args):
82
+ file_dir = Path(args.file_dir)
83
+ file_dir.mkdir(exist_ok=True)
84
+
85
+ noise_dir = Path(args.noise_dir)
86
+ speech_dir = Path(args.speech_dir)
87
+
88
+ noise_generator = target_second_signal_generator(
89
+ noise_dir.as_posix(),
90
+ duration=args.duration,
91
+ sample_rate=args.target_sample_rate
92
+ )
93
+ speech_generator = target_second_signal_generator(
94
+ speech_dir.as_posix(),
95
+ duration=args.duration,
96
+ sample_rate=args.target_sample_rate
97
+ )
98
+
99
+ dataset = list()
100
+
101
+ count = 0
102
+ process_bar = tqdm(desc="build dataset excel")
103
+ for noise, speech in zip(noise_generator, speech_generator):
104
+ if count >= args.max_count:
105
+ break
106
+
107
+ noise_filename = noise["filename"]
108
+ noise_raw_duration = noise["raw_duration"]
109
+ noise_offset = noise["offset"]
110
+ noise_duration = noise["duration"]
111
+
112
+ speech_filename = speech["filename"]
113
+ speech_raw_duration = speech["raw_duration"]
114
+ speech_offset = speech["offset"]
115
+ speech_duration = speech["duration"]
116
+
117
+ random1 = random.random()
118
+ random2 = random.random()
119
+
120
+ row = {
121
+ "noise_filename": noise_filename,
122
+ "noise_raw_duration": noise_raw_duration,
123
+ "noise_offset": noise_offset,
124
+ "noise_duration": noise_duration,
125
+
126
+ "speech_filename": speech_filename,
127
+ "speech_raw_duration": speech_raw_duration,
128
+ "speech_offset": speech_offset,
129
+ "speech_duration": speech_duration,
130
+
131
+ "snr_db": random.uniform(args.min_snr_db, args.max_snr_db),
132
+
133
+ "random1": random1,
134
+ "random2": random2,
135
+ "flag": "TRAIN" if random2 < 0.8 else "TEST",
136
+ }
137
+ dataset.append(row)
138
+ count += 1
139
+ duration_seconds = count * args.duration
140
+ duration_hours = duration_seconds / 3600
141
+
142
+ process_bar.update(n=1)
143
+ process_bar.set_postfix({
144
+ # "duration_seconds": round(duration_seconds, 4),
145
+ "duration_hours": round(duration_hours, 4),
146
+
147
+ })
148
+
149
+ dataset = pd.DataFrame(dataset)
150
+ dataset = dataset.sort_values(by=["random1"], ascending=False)
151
+ dataset.to_excel(
152
+ file_dir / "dataset.xlsx",
153
+ index=False,
154
+ )
155
+ return
156
+
157
+
158
+
159
+ def split_dataset(args):
160
+ """分割训练集, 测试集"""
161
+ file_dir = Path(args.file_dir)
162
+ file_dir.mkdir(exist_ok=True)
163
+
164
+ df = pd.read_excel(file_dir / "dataset.xlsx")
165
+
166
+ train = list()
167
+ test = list()
168
+
169
+ for i, row in df.iterrows():
170
+ flag = row["flag"]
171
+ if flag == "TRAIN":
172
+ train.append(row)
173
+ else:
174
+ test.append(row)
175
+
176
+ train = pd.DataFrame(train)
177
+ train.to_excel(
178
+ args.train_dataset,
179
+ index=False,
180
+ # encoding="utf_8_sig"
181
+ )
182
+ test = pd.DataFrame(test)
183
+ test.to_excel(
184
+ args.valid_dataset,
185
+ index=False,
186
+ # encoding="utf_8_sig"
187
+ )
188
+
189
+ return
190
+
191
+
192
+ def main():
193
+ args = get_args()
194
+
195
+ get_dataset(args)
196
+ split_dataset(args)
197
+ return
198
+
199
+
200
+ if __name__ == "__main__":
201
+ main()
examples/conv_tasnet/step_2_train_model.py ADDED
@@ -0,0 +1,413 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://github.com/yxlu-0102/MP-SENet/blob/main/train.py
5
+ """
6
+ import argparse
7
+ import json
8
+ import logging
9
+ from logging.handlers import TimedRotatingFileHandler
10
+ import os
11
+ import platform
12
+ from pathlib import Path
13
+ import random
14
+ import sys
15
+ import shutil
16
+ from typing import List
17
+
18
+ pwd = os.path.abspath(os.path.dirname(__file__))
19
+ sys.path.append(os.path.join(pwd, "../../"))
20
+
21
+ import numpy as np
22
+ import torch
23
+ import torch.nn as nn
24
+ from torch.nn import functional as F
25
+ from torch.utils.data.dataloader import DataLoader
26
+ from tqdm import tqdm
27
+
28
+ from toolbox.torch.utils.data.dataset.denoise_excel_dataset import DenoiseExcelDataset
29
+ from toolbox.torchaudio.models.conv_tasnet.configuration_conv_tasnet import ConvTasNetConfig
30
+ from toolbox.torchaudio.models.conv_tasnet.modeling_conv_tasnet import ConvTasNet, ConvTasNetPretrainedModel
31
+ from toolbox.torchaudio.losses.snr import NegativeSISNRLoss
32
+ from toolbox.torchaudio.losses.spectral import LSDLoss
33
+ from toolbox.torchaudio.losses.perceptual import NegSTOILoss
34
+ from toolbox.torchaudio.metrics.pesq import run_pesq_score
35
+
36
+
37
+ def get_args():
38
+ parser = argparse.ArgumentParser()
39
+ parser.add_argument("--train_dataset", default="train.xlsx", type=str)
40
+ parser.add_argument("--valid_dataset", default="valid.xlsx", type=str)
41
+
42
+ parser.add_argument("--max_epochs", default=100, type=int)
43
+
44
+ parser.add_argument("--num_serialized_models_to_keep", default=10, type=int)
45
+ parser.add_argument("--patience", default=5, type=int)
46
+ parser.add_argument("--serialization_dir", default="serialization_dir", type=str)
47
+
48
+ parser.add_argument("--config_file", default="config.yaml", type=str)
49
+
50
+ args = parser.parse_args()
51
+ return args
52
+
53
+
54
+ def logging_config(file_dir: str):
55
+ fmt = "%(asctime)s - %(name)s - %(levelname)s %(filename)s:%(lineno)d > %(message)s"
56
+
57
+ logging.basicConfig(format=fmt,
58
+ datefmt="%m/%d/%Y %H:%M:%S",
59
+ level=logging.INFO)
60
+ file_handler = TimedRotatingFileHandler(
61
+ filename=os.path.join(file_dir, "main.log"),
62
+ encoding="utf-8",
63
+ when="D",
64
+ interval=1,
65
+ backupCount=7
66
+ )
67
+ file_handler.setLevel(logging.INFO)
68
+ file_handler.setFormatter(logging.Formatter(fmt))
69
+ logger = logging.getLogger(__name__)
70
+ logger.addHandler(file_handler)
71
+
72
+ return logger
73
+
74
+
75
+ class CollateFunction(object):
76
+ def __init__(self):
77
+ pass
78
+
79
+ def __call__(self, batch: List[dict]):
80
+ clean_audios = list()
81
+ noisy_audios = list()
82
+
83
+ for sample in batch:
84
+ # noise_wave: torch.Tensor = sample["noise_wave"]
85
+ clean_audio: torch.Tensor = sample["speech_wave"]
86
+ noisy_audio: torch.Tensor = sample["mix_wave"]
87
+ # snr_db: float = sample["snr_db"]
88
+
89
+ clean_audios.append(clean_audio)
90
+ noisy_audios.append(noisy_audio)
91
+
92
+ clean_audios = torch.stack(clean_audios)
93
+ noisy_audios = torch.stack(noisy_audios)
94
+
95
+ # assert
96
+ if torch.any(torch.isnan(clean_audios)) or torch.any(torch.isinf(clean_audios)):
97
+ raise AssertionError("nan or inf in clean_audios")
98
+ if torch.any(torch.isnan(noisy_audios)) or torch.any(torch.isinf(noisy_audios)):
99
+ raise AssertionError("nan or inf in noisy_audios")
100
+ return clean_audios, noisy_audios
101
+
102
+
103
+ collate_fn = CollateFunction()
104
+
105
+
106
+ def main():
107
+ args = get_args()
108
+
109
+ config = ConvTasNetConfig.from_pretrained(
110
+ pretrained_model_name_or_path=args.config_file,
111
+ )
112
+
113
+ serialization_dir = Path(args.serialization_dir)
114
+ serialization_dir.mkdir(parents=True, exist_ok=True)
115
+
116
+ logger = logging_config(serialization_dir)
117
+
118
+ random.seed(config.seed)
119
+ np.random.seed(config.seed)
120
+ torch.manual_seed(config.seed)
121
+ logger.info(f"set seed: {config.seed}")
122
+
123
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
124
+ n_gpu = torch.cuda.device_count()
125
+ logger.info(f"GPU available count: {n_gpu}; device: {device}")
126
+
127
+ # datasets
128
+ train_dataset = DenoiseExcelDataset(
129
+ excel_file=args.train_dataset,
130
+ expected_sample_rate=8000,
131
+ max_wave_value=32768.0,
132
+ )
133
+ valid_dataset = DenoiseExcelDataset(
134
+ excel_file=args.valid_dataset,
135
+ expected_sample_rate=8000,
136
+ max_wave_value=32768.0,
137
+ )
138
+ train_data_loader = DataLoader(
139
+ dataset=train_dataset,
140
+ batch_size=config.batch_size,
141
+ shuffle=True,
142
+ sampler=None,
143
+ # On Linux, multiple worker processes can be used to load data; on Windows they cannot.
144
+ num_workers=0 if platform.system() == "Windows" else os.cpu_count() // 2,
145
+ collate_fn=collate_fn,
146
+ pin_memory=False,
147
+ prefetch_factor=16,
148
+ )
149
+ valid_data_loader = DataLoader(
150
+ dataset=valid_dataset,
151
+ batch_size=config.batch_size,
152
+ shuffle=True,
153
+ sampler=None,
154
+ # On Linux, multiple worker processes can be used to load data; on Windows they cannot.
155
+ num_workers=0 if platform.system() == "Windows" else os.cpu_count() // 2,
156
+ collate_fn=collate_fn,
157
+ pin_memory=False,
158
+ prefetch_factor=16,
159
+ )
160
+
161
+ # models
162
+ logger.info(f"prepare models. config_file: {args.config_file}")
163
+ model = ConvTasNetPretrainedModel(config).to(device)
164
+ model.to(device)
165
+ model.train()
166
+
167
+ # optimizer
168
+ logger.info("prepare optimizer, lr_scheduler, loss_fn, categorical_accuracy")
169
+ optimizer = torch.optim.AdamW(model.parameters(), config.learning_rate)
170
+
171
+ # resume training
172
+ last_epoch = -1
173
+ for epoch_i in serialization_dir.glob("epoch-*"):
174
+ epoch_i = Path(epoch_i)
175
+ epoch_idx = epoch_i.stem.split("-")[1]
176
+ epoch_idx = int(epoch_idx)
177
+ if epoch_idx > last_epoch:
178
+ last_epoch = epoch_idx
179
+
180
+ if last_epoch != -1:
181
+ logger.info(f"resume from epoch-{last_epoch}.")
182
+ model_pt = serialization_dir / f"epoch-{last_epoch}/model.pt"
183
+ optimizer_pth = serialization_dir / f"epoch-{last_epoch}/optimizer.pth"
184
+
185
+ logger.info(f"load state dict for model.")
186
+ with open(model_pt.as_posix(), "rb") as f:
187
+ state_dict = torch.load(f, map_location="cpu", weights_only=True)
188
+ model.load_state_dict(state_dict, strict=True)
189
+
190
+ logger.info(f"load state dict for optimizer.")
191
+ with open(optimizer_pth.as_posix(), "rb") as f:
192
+ state_dict = torch.load(f, map_location="cpu", weights_only=True)
193
+ optimizer.load_state_dict(state_dict)
194
+
195
+ lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
196
+ optimizer,
197
+ milestones=[10000, 20000, 30000, 40000, 50000], gamma=0.5
198
+ )
199
+
200
+ ae_loss_fn = nn.L1Loss(reduction="mean").to(device)
201
+ neg_si_snr_loss_fn = NegativeSISNRLoss(reduction="mean").to(device)
202
+ neg_stoi_loss_fn = NegSTOILoss(sample_rate=8000, reduction="mean").to(device)
203
+ lds_loss_fn = LSDLoss(reduction="mean").to(device)
204
+
205
+ # training loop
206
+
207
+ # state
208
+ average_pesq_score = 1000000000
209
+ average_loss = 1000000000
210
+ average_ae_loss = 1000000000
211
+ average_neg_si_snr_loss = 1000000000
212
+ average_neg_stoi_loss = 1000000000
213
+ average_lds_loss = 1000000000
214
+
215
+ model_list = list()
216
+ best_idx_epoch = None
217
+ best_metric = None
218
+ patience_count = 0
219
+
220
+ logger.info("training")
221
+ for idx_epoch in range(max(0, last_epoch+1), args.max_epochs):
222
+ # train
223
+ model.train()
224
+
225
+ total_pesq_score = 0.
226
+ total_loss = 0.
227
+ total_ae_loss = 0.
228
+ total_neg_si_snr_loss = 0.
229
+ total_neg_stoi_loss = 0.
230
+ total_lds_loss = 0.
231
+ total_batches = 0.
232
+ progress_bar = tqdm(
233
+ total=len(train_data_loader),
234
+ desc="Training; epoch: {}".format(idx_epoch),
235
+ )
236
+ for batch in train_data_loader:
237
+ clean_audios, noisy_audios = batch
238
+ clean_audios = clean_audios.to(device)
239
+ noisy_audios = noisy_audios.to(device)
240
+
241
+ denoise_audios = model.forward(noisy_audios)
242
+ denoise_audios = torch.squeeze(denoise_audios, dim=1)
243
+
244
+ ae_loss = ae_loss_fn.forward(denoise_audios, clean_audios)
245
+ neg_si_snr_loss = neg_si_snr_loss_fn.forward(denoise_audios, clean_audios)
246
+ neg_stoi_loss = neg_stoi_loss_fn.forward(denoise_audios, clean_audios)
247
+ lds_loss = lds_loss_fn.forward(denoise_audios, clean_audios)
248
+
249
+ loss = 0.25 * ae_loss + 0.25 * neg_si_snr_loss + 0.25 * neg_stoi_loss + 0.25 * lds_loss
250
+
251
+ denoise_audios_list_r = list(denoise_audios.detach().cpu().numpy())
252
+ clean_audios_list_r = list(clean_audios.detach().cpu().numpy())
253
+ pesq_score = run_pesq_score(clean_audios_list_r, denoise_audios_list_r, sample_rate=8000, mode="nb")
254
+
255
+ optimizer.zero_grad()
256
+ loss.backward()
257
+ optimizer.step()
258
+ lr_scheduler.step()
259
+
260
+ total_pesq_score += pesq_score
261
+ total_loss += loss.item()
262
+ total_ae_loss += ae_loss.item()
263
+ total_neg_si_snr_loss += neg_si_snr_loss.item()
264
+ total_neg_stoi_loss += neg_stoi_loss.item()
265
+ total_lds_loss += lds_loss.item()
266
+ total_batches += 1
267
+
268
+ average_pesq_score = round(total_pesq_score / total_batches, 4)
269
+ average_loss = round(total_loss / total_batches, 4)
270
+ average_ae_loss = round(total_ae_loss / total_batches, 4)
271
+ average_neg_si_snr_loss = round(total_neg_si_snr_loss / total_batches, 4)
272
+ average_neg_stoi_loss = round(total_neg_stoi_loss / total_batches, 4)
273
+ average_lds_loss = round(total_lds_loss / total_batches, 4)
274
+
275
+ progress_bar.update(1)
276
+ progress_bar.set_postfix({
277
+ "pesq_score": average_pesq_score,
278
+ "loss": average_loss,
279
+ "ae_loss": average_ae_loss,
280
+ "neg_si_snr_loss": average_neg_si_snr_loss,
281
+ "neg_stoi_loss": average_neg_stoi_loss,
282
+ "lds_loss": average_lds_loss,
283
+ })
284
+
285
+ # evaluation
286
+ model.eval()
287
+ torch.cuda.empty_cache()
288
+
289
+ total_pesq_score = 0.
290
+ total_loss = 0.
291
+ total_ae_loss = 0.
292
+ total_neg_si_snr_loss = 0.
293
+ total_neg_stoi_loss = 0.
294
+ total_lds_loss = 0.
295
+ total_batches = 0.
296
+
297
+ progress_bar = tqdm(
298
+ total=len(valid_data_loader),
299
+ desc="Evaluation; epoch: {}".format(idx_epoch),
300
+ )
301
+ with torch.no_grad():
302
+ for batch in valid_data_loader:
303
+ clean_audios, noisy_audios = batch
304
+ clean_audios = clean_audios.to(device)
305
+ noisy_audios = noisy_audios.to(device)
306
+
307
+ denoise_audios = model.forward(noisy_audios)
308
+ denoise_audios = torch.squeeze(denoise_audios, dim=1)
309
+
310
+ ae_loss = ae_loss_fn.forward(denoise_audios, clean_audios)
311
+ neg_si_snr_loss = neg_si_snr_loss_fn.forward(denoise_audios, clean_audios)
312
+ neg_stoi_loss = neg_stoi_loss_fn.forward(denoise_audios, clean_audios)
313
+ lds_loss = lds_loss_fn.forward(denoise_audios, clean_audios)
314
+
315
+ loss = 0.25 * ae_loss + 0.25 * neg_si_snr_loss + 0.25 * neg_stoi_loss + 0.25 * lds_loss
316
+
317
+ denoise_audios_list_r = list(denoise_audios.detach().cpu().numpy())
318
+ clean_audios_list_r = list(clean_audios.detach().cpu().numpy())
319
+ pesq_score = run_pesq_score(clean_audios_list_r, denoise_audios_list_r, sample_rate=8000, mode="nb")
320
+
321
+ total_pesq_score += pesq_score
322
+ total_loss += loss.item()
323
+ total_ae_loss += ae_loss.item()
324
+ total_neg_si_snr_loss += neg_si_snr_loss.item()
325
+ total_neg_stoi_loss += neg_stoi_loss.item()
326
+ total_lds_loss += lds_loss.item()
327
+ total_batches += 1
328
+
329
+ average_pesq_score = round(total_pesq_score / total_batches, 4)
330
+ average_loss = round(total_loss / total_batches, 4)
331
+ average_ae_loss = round(total_ae_loss / total_batches, 4)
332
+ average_neg_si_snr_loss = round(total_neg_si_snr_loss / total_batches, 4)
333
+ average_neg_stoi_loss = round(total_neg_stoi_loss / total_batches, 4)
334
+ average_lds_loss = round(total_lds_loss / total_batches, 4)
335
+
336
+ progress_bar.update(1)
337
+ progress_bar.set_postfix({
338
+ "pesq_score": average_pesq_score,
339
+ "loss": average_loss,
340
+ "ae_loss": average_ae_loss,
341
+ "neg_si_snr_loss": average_neg_si_snr_loss,
342
+ "neg_stoi_loss": average_neg_stoi_loss,
343
+ "lds_loss": average_lds_loss,
344
+ })
345
+
346
+ # scheduler
347
+ lr_scheduler.step()
348
+
349
+ # save path
350
+ epoch_dir = serialization_dir / "epoch-{}".format(idx_epoch)
351
+ epoch_dir.mkdir(parents=True, exist_ok=False)
352
+
353
+ # save models
354
+ model.save_pretrained(epoch_dir.as_posix())
355
+
356
+ model_list.append(epoch_dir)
357
+ if len(model_list) >= args.num_serialized_models_to_keep:
358
+ model_to_delete: Path = model_list.pop(0)
359
+ shutil.rmtree(model_to_delete.as_posix())
360
+
361
+ # save optim
362
+ torch.save(optimizer.state_dict(), (epoch_dir / "optimizer.pth").as_posix())
363
+
364
+ # save metric
365
+ if best_metric is None:
366
+ best_idx_epoch = idx_epoch
367
+ best_metric = average_loss
368
+ elif average_loss < best_metric:
369
+ # lower is better.
370
+ best_idx_epoch = idx_epoch
371
+ best_metric = average_loss
372
+ else:
373
+ pass
374
+
375
+ metrics = {
376
+ "idx_epoch": idx_epoch,
377
+ "best_idx_epoch": best_idx_epoch,
378
+ "pesq_score": average_pesq_score,
379
+ "loss": average_loss,
380
+ "ae_loss": average_ae_loss,
381
+ "neg_si_snr_loss": average_neg_si_snr_loss,
382
+ "neg_stoi_loss": average_neg_stoi_loss,
383
+ "lds_loss": average_lds_loss,
384
+ }
385
+ metrics_filename = epoch_dir / "metrics_epoch.json"
386
+ with open(metrics_filename, "w", encoding="utf-8") as f:
387
+ json.dump(metrics, f, indent=4, ensure_ascii=False)
388
+
389
+ # save best
390
+ best_dir = serialization_dir / "best"
391
+ if best_idx_epoch == idx_epoch:
392
+ if best_dir.exists():
393
+ shutil.rmtree(best_dir)
394
+ shutil.copytree(epoch_dir, best_dir)
395
+
396
+ # early stop
397
+ early_stop_flag = False
398
+ if best_idx_epoch == idx_epoch:
399
+ patience_count = 0
400
+ else:
401
+ patience_count += 1
402
+ if patience_count >= args.patience:
403
+ early_stop_flag = True
404
+
405
+ # early stop
406
+ if early_stop_flag:
407
+ break
408
+
409
+ return
410
+
411
+
412
+ if __name__ == "__main__":
413
+ main()
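For reference, the objective minimized in both the training and evaluation loops above is an equal-weighted sum of the four component losses, with the 0.25 weights taken directly from the code:

\mathcal{L} = 0.25\,\mathcal{L}_{\mathrm{L1}} + 0.25\,\mathcal{L}_{-\mathrm{SI\text{-}SNR}} + 0.25\,\mathcal{L}_{-\mathrm{STOI}} + 0.25\,\mathcal{L}_{\mathrm{LSD}}

where \mathcal{L}_{\mathrm{L1}} is the waveform L1 loss and the remaining terms are the negative SI-SNR, negative STOI, and log-spectral-distance losses from toolbox/torchaudio/losses.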
examples/conv_tasnet/yaml/config.yaml ADDED
@@ -0,0 +1,42 @@
1
+ model_name: "nx_clean_unet"
2
+
3
+ sample_rate: 8000
4
+ segment_size: 16000
5
+ n_fft: 512
6
+ win_size: 200
7
+ hop_size: 80
8
+
9
+ down_sampling_num_layers: 6
10
+ down_sampling_in_channels: 1
11
+ down_sampling_hidden_channels: 64
12
+ down_sampling_kernel_size: 4
13
+ down_sampling_stride: 2
14
+
15
+ causal_in_channels: 1
16
+ causal_out_channels: 1
17
+ causal_kernel_size: 3
18
+ causal_bias: false
19
+ causal_separable: true
20
+ causal_f_stride: 1
21
+ causal_num_layers: 5
22
+
23
+ tsfm_hidden_size: 256
24
+ tsfm_attention_heads: 8
25
+ tsfm_num_blocks: 6
26
+ tsfm_dropout_rate: 0.1
27
+ tsfm_max_length: 512
28
+ tsfm_chunk_size: 1
29
+ tsfm_num_left_chunks: 128
30
+ tsfm_num_right_chunks: 4
31
+
32
+ discriminator_dim: 32
33
+ discriminator_in_channel: 2
34
+
35
+ compress_factor: 0.3
36
+
37
+ batch_size: 64
38
+ learning_rate: 0.0005
39
+ adam_b1: 0.8
40
+ adam_b2: 0.99
41
+ lr_decay: 0.99
42
+ seed: 1234
examples/data_preprocess/dns_challenge_to_8k/process_dns_challenge_clean_emotional_speech.py ADDED
@@ -0,0 +1,90 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://github.com/microsoft/DNS-Challenge/blob/master/download-dns-challenge-3.sh
5
+
6
+ 1.2G
7
+ wget https://dns3public.blob.core.windows.net/dns3archive/DEMAND.tar.bz2
8
+
9
+ 14G
10
+ wget https://dns3public.blob.core.windows.net/dns3archive/datasets/datasets.noise.tar.bz2
11
+
12
+ 38G
13
+ wget https://dns3public.blob.core.windows.net/dns3archive/datasets/datasets.clean.read_speech.tar.bz2
14
+
15
+ 247M
16
+ wget https://dns3public.blob.core.windows.net/dns3archive/datasets/datasets.clean.emotional_speech.tar.bz2
17
+
18
+
19
+ """
20
+ import argparse
21
+ import os
22
+ from pathlib import Path
23
+ import sys
24
+
25
+ import numpy as np
26
+ from tqdm import tqdm
27
+
28
+ pwd = os.path.abspath(os.path.dirname(__file__))
29
+ sys.path.append(os.path.join(pwd, "../../"))
30
+
31
+ import librosa
32
+ from scipy.io import wavfile
33
+
34
+
35
+ def get_args():
36
+ parser = argparse.ArgumentParser()
37
+
38
+ parser.add_argument(
39
+ "--data_dir",
40
+ default=r"E:\programmer\asr_datasets\dns-challenge\datasets.clean.emotional_speech\datasets\clean\emotional_speech",
41
+ type=str
42
+ )
43
+ parser.add_argument(
44
+ "--output_dir",
45
+ default=r"E:\programmer\asr_datasets\denoise\dns-clean-emotional-speech-8k",
46
+ type=str
47
+ )
48
+ parser.add_argument("--sample_rate", default=8000, type=int)
49
+ args = parser.parse_args()
50
+ return args
51
+
52
+
53
+ def main():
54
+ args = get_args()
55
+
56
+ data_dir = Path(args.data_dir)
57
+ output_dir = Path(args.output_dir)
58
+ output_dir.mkdir(parents=True, exist_ok=True)
59
+
60
+ # finished_set
61
+ finished_set = set()
62
+ for filename in tqdm(output_dir.glob("**/*.wav")):
63
+ name = filename.stem
64
+ finished_set.add(name)
65
+ print(f"finished_set count: {len(finished_set)}")
66
+
67
+ for filename in tqdm(data_dir.glob("**/*.wav")):
68
+ label = filename.parts[-2]
69
+ name = filename.stem
70
+ # print(f"filename: {filename.as_posix()}")
71
+ if name in finished_set:
72
+ continue
73
+
74
+ signal, _ = librosa.load(filename.as_posix(), sr=args.sample_rate)
75
+
76
+ signal = signal * (1 << 15)
77
+ signal = np.array(signal, dtype=np.int16)
78
+
79
+ to_file = output_dir / f"{label}/{name}.wav"
80
+ to_file.parent.mkdir(parents=True, exist_ok=True)
81
+ wavfile.write(
82
+ to_file.as_posix(),
83
+ rate=args.sample_rate,
84
+ data=signal,
85
+ )
86
+ return
87
+
88
+
89
+ if __name__ == "__main__":
90
+ main()
examples/data_preprocess/dns_challenge_to_8k/process_dns_challenge_clean_read_speech.py ADDED
@@ -0,0 +1,123 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://github.com/microsoft/DNS-Challenge/blob/master/download-dns-challenge-3.sh
5
+
6
+ 1.2G
7
+ wget https://dns3public.blob.core.windows.net/dns3archive/DEMAND.tar.bz2
8
+
9
+ 14G
10
+ wget https://dns3public.blob.core.windows.net/dns3archive/datasets/datasets.noise.tar.bz2
11
+
12
+ 38G
13
+ wget https://dns3public.blob.core.windows.net/dns3archive/datasets/datasets.clean.read_speech.tar.bz2
14
+
15
+ 12G
16
+ wget https://dns3public.blob.core.windows.net/dns3archive/datasets/datasets.clean.french_data.tar.bz2
17
+
18
+ 43G
19
+ wget https://dns3public.blob.core.windows.net/dns3archive/datasets/datasets.clean.german_speech.tar.bz2
20
+
21
+ 7.9G
22
+ wget https://dns3public.blob.core.windows.net/dns3archive/datasets/datasets.clean.italian_speech.tar.bz2
23
+
24
+ 12G
25
+ wget https://dns3public.blob.core.windows.net/dns3archive/datasets/datasets.clean.mandarin_speech.tar.bz2
26
+
27
+ 3.1G
28
+ wget https://dns3public.blob.core.windows.net/dns3archive/datasets/datasets.clean.russian_speech.tar.bz2
29
+
30
+ 9.7G
31
+ wget https://dns3public.blob.core.windows.net/dns3archive/datasets/datasets.clean.spanish_speech.tar.bz2
32
+
33
+ 617M
34
+ wget https://dns3public.blob.core.windows.net/dns3archive/datasets/datasets.clean.singing_voice.tar.bz2
35
+
36
+ """
37
+ import argparse
38
+ import os
39
+ from pathlib import Path
40
+ import sys
41
+
42
+ import numpy as np
43
+ from tqdm import tqdm
44
+
45
+ pwd = os.path.abspath(os.path.dirname(__file__))
46
+ sys.path.append(os.path.join(pwd, "../../"))
47
+
48
+ import librosa
49
+ from scipy.io import wavfile
50
+
51
+
52
+ def get_args():
53
+ parser = argparse.ArgumentParser()
54
+
55
+ parser.add_argument(
56
+ "--data_dir",
57
+ # default=r"E:\programmer\asr_datasets\dns-challenge\datasets.clean.read_speech\datasets\clean",
58
+ # default=r"E:\programmer\asr_datasets\dns-challenge\datasets.clean.mandarin_speech\datasets\clean\mandarin_speech",
59
+ # default=r"E:\programmer\asr_datasets\dns-challenge\datasets.clean.singing_voice\datasets\clean\singing_voice",
60
+ # default=r"E:\programmer\asr_datasets\dns-challenge\datasets.clean.german_speech\datasets\clean\german_speech",
61
+ # default=r"E:\programmer\asr_datasets\dns-challenge\datasets.clean.italian_speech\datasets\clean\italian_speech",
62
+ default=r"E:\programmer\asr_datasets\dns-challenge\datasets.clean.russian_speech\datasets\clean\russian_speech",
63
+ type=str
64
+ )
65
+ parser.add_argument(
66
+ "--output_dir",
67
+ # default=r"E:\programmer\asr_datasets\denoise\dns-clean-read-speech-8k",
68
+ # default=r"E:\programmer\asr_datasets\denoise\dns-clean-mandarin-speech-8k",
69
+ # default=r"E:\programmer\asr_datasets\denoise\dns-clean-singing-voice-8k",
70
+ # default=r"E:\programmer\asr_datasets\denoise\dns-clean-german-speech-8k",
71
+ # default=r"E:\programmer\asr_datasets\denoise\dns-clean-italian-speech-8k",
72
+ default=r"E:\programmer\asr_datasets\denoise\dns-clean-russian-speech-8k",
73
+ type=str
74
+ )
75
+ parser.add_argument("--sample_rate", default=8000, type=int)
76
+ args = parser.parse_args()
77
+ return args
78
+
79
+
80
+ def main():
81
+ args = get_args()
82
+
83
+ data_dir = Path(args.data_dir)
84
+ output_dir = Path(args.output_dir)
85
+ output_dir.mkdir(parents=True, exist_ok=True)
86
+
87
+ # finished_set
88
+ finished_set = set()
89
+ for filename in tqdm(output_dir.glob("**/*.wav")):
90
+ name = filename.stem
91
+ finished_set.add(name)
92
+ print(f"finished_set count: {len(finished_set)}")
93
+
94
+ for filename in tqdm(data_dir.glob("**/*.wav")):
95
+ label = filename.parts[-2]
96
+ name = filename.stem
97
+ relative_name = filename.relative_to(data_dir)
98
+ # print(f"filename: {filename.as_posix()}")
99
+ if name in finished_set:
100
+ continue
101
+ finished_set.add(name)
102
+
103
+ try:
104
+ signal, _ = librosa.load(filename.as_posix(), sr=args.sample_rate)
105
+ except Exception:
106
+ print(f"skip file: {filename.as_posix()}")
107
+ continue
108
+
109
+ signal = signal * (1 << 15)
110
+ signal = np.array(signal, dtype=np.int16)
111
+
112
+ to_file = output_dir / relative_name.as_posix()
113
+ to_file.parent.mkdir(parents=True, exist_ok=True)
114
+ wavfile.write(
115
+ to_file.as_posix(),
116
+ rate=args.sample_rate,
117
+ data=signal,
118
+ )
119
+ return
120
+
121
+
122
+ if __name__ == "__main__":
123
+ main()
examples/data_preprocess/dns_challenge_to_8k/process_dns_challenge_demand.py ADDED
@@ -0,0 +1,71 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://github.com/microsoft/DNS-Challenge/blob/master/download-dns-challenge-3.sh
5
+
6
+ 1.2G
7
+ wget https://dns3public.blob.core.windows.net/dns3archive/DEMAND.tar.bz2
8
+
9
+ """
10
+ import argparse
11
+ import os
12
+ from pathlib import Path
13
+ import random
14
+ import sys
15
+ import shutil
16
+
17
+ import numpy as np
18
+
19
+ pwd = os.path.abspath(os.path.dirname(__file__))
20
+ sys.path.append(os.path.join(pwd, "../../"))
21
+
22
+ import librosa
23
+ from scipy.io import wavfile
24
+
25
+
26
+ def get_args():
27
+ parser = argparse.ArgumentParser()
28
+
29
+ parser.add_argument(
30
+ "--data_dir",
31
+ default=r"E:\programmer\asr_datasets\dns-challenge\DEMAND\demand",
32
+ type=str
33
+ )
34
+ parser.add_argument(
35
+ "--output_dir",
36
+ default=r"E:\programmer\asr_datasets\denoise\demand-8k",
37
+ type=str
38
+ )
39
+ parser.add_argument("--sample_rate", default=8000, type=int)
40
+ args = parser.parse_args()
41
+ return args
42
+
43
+
44
+ def main():
45
+ args = get_args()
46
+
47
+ data_dir = Path(args.data_dir)
48
+ output_dir = Path(args.output_dir)
49
+ output_dir.mkdir(parents=True, exist_ok=False)
50
+
51
+ for filename in data_dir.glob("**/ch01.wav"):
52
+ label = filename.parts[-2]
53
+ name = filename.stem
54
+
55
+ signal, _ = librosa.load(filename.as_posix(), sr=args.sample_rate)
56
+
57
+ signal = signal * (1 << 15)
58
+ signal = np.array(signal, dtype=np.int16)
59
+
60
+ to_file = output_dir / f"{label}/{name}.wav"
61
+ to_file.parent.mkdir(parents=True, exist_ok=True)
62
+ wavfile.write(
63
+ to_file.as_posix(),
64
+ rate=args.sample_rate,
65
+ data=signal,
66
+ )
67
+ return
68
+
69
+
70
+ if __name__ == '__main__':
71
+ main()
examples/data_preprocess/dns_challenge_to_8k/process_dns_challenge_impulse_responses.py ADDED
@@ -0,0 +1,93 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://github.com/microsoft/DNS-Challenge/blob/master/download-dns-challenge-3.sh
5
+
6
+ 1.2G
7
+ wget https://dns3public.blob.core.windows.net/dns3archive/DEMAND.tar.bz2
8
+
9
+ 14G
10
+ wget https://dns3public.blob.core.windows.net/dns3archive/datasets/datasets.noise.tar.bz2
11
+
12
+ 38G
13
+ wget https://dns3public.blob.core.windows.net/dns3archive/datasets/datasets.clean.read_speech.tar.bz2
14
+
15
+ 247M
16
+ wget https://dns3public.blob.core.windows.net/dns3archive/datasets/datasets.clean.emotional_speech.tar.bz2
17
+
18
+ 240M
19
+ wget https://dns3public.blob.core.windows.net/dns3archive/datasets/datasets.impulse_responses.tar.bz2
20
+
21
+
22
+ """
23
+ import argparse
24
+ import os
25
+ from pathlib import Path
26
+ import sys
27
+
28
+ import numpy as np
29
+ from tqdm import tqdm
30
+
31
+ pwd = os.path.abspath(os.path.dirname(__file__))
32
+ sys.path.append(os.path.join(pwd, "../../"))
33
+
34
+ import librosa
35
+ from scipy.io import wavfile
36
+
37
+
38
+ def get_args():
39
+ parser = argparse.ArgumentParser()
40
+
41
+ parser.add_argument(
42
+ "--data_dir",
43
+ default=r"E:\programmer\asr_datasets\dns-challenge\datasets.clean.emotional_speech\datasets\clean\emotional_speech",
44
+ type=str
45
+ )
46
+ parser.add_argument(
47
+ "--output_dir",
48
+ default=r"E:\programmer\asr_datasets\denoise\dns-clean-emotional-speech-8k",
49
+ type=str
50
+ )
51
+ parser.add_argument("--sample_rate", default=8000, type=int)
52
+ args = parser.parse_args()
53
+ return args
54
+
55
+
56
+ def main():
57
+ args = get_args()
58
+
59
+ data_dir = Path(args.data_dir)
60
+ output_dir = Path(args.output_dir)
61
+ output_dir.mkdir(parents=True, exist_ok=True)
62
+
63
+ # finished_set
64
+ finished_set = set()
65
+ for filename in tqdm(output_dir.glob("**/*.wav")):
66
+ name = filename.stem
67
+ finished_set.add(name)
68
+ print(f"finished_set count: {len(finished_set)}")
69
+
70
+ for filename in tqdm(data_dir.glob("**/*.wav")):
71
+ label = filename.parts[-2]
72
+ name = filename.stem
73
+ # print(f"filename: {filename.as_posix()}")
74
+ if name in finished_set:
75
+ continue
76
+
77
+ signal, _ = librosa.load(filename.as_posix(), sr=args.sample_rate)
78
+
79
+ signal = signal * (1 << 15)
80
+ signal = np.array(signal, dtype=np.int16)
81
+
82
+ to_file = output_dir / f"{label}/{name}.wav"
83
+ to_file.parent.mkdir(parents=True, exist_ok=True)
84
+ wavfile.write(
85
+ to_file.as_posix(),
86
+ rate=args.sample_rate,
87
+ data=signal,
88
+ )
89
+ return
90
+
91
+
92
+ if __name__ == "__main__":
93
+ main()
examples/data_preprocess/dns_challenge_to_8k/process_dns_challenge_noise.py ADDED
@@ -0,0 +1,77 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://github.com/microsoft/DNS-Challenge/blob/master/download-dns-challenge-3.sh
5
+
6
+ 1.2G
7
+ wget https://dns3public.blob.core.windows.net/dns3archive/DEMAND.tar.bz2
8
+
9
+ 14G
10
+ wget https://dns3public.blob.core.windows.net/dns3archive/datasets/datasets.noise.tar.bz2
11
+
12
+ 38G
13
+ wget https://dns3public.blob.core.windows.net/dns3archive/datasets/datasets.clean.read_speech.tar.bz2
14
+
15
+ """
16
+ import argparse
17
+ import os
18
+ from pathlib import Path
19
+ import sys
20
+
21
+ import numpy as np
22
+ from tqdm import tqdm
23
+
24
+ pwd = os.path.abspath(os.path.dirname(__file__))
25
+ sys.path.append(os.path.join(pwd, "../../"))
26
+
27
+ import librosa
28
+ from scipy.io import wavfile
29
+
30
+
31
+ def get_args():
32
+ parser = argparse.ArgumentParser()
33
+
34
+ parser.add_argument(
35
+ "--data_dir",
36
+ default=r"E:\programmer\asr_datasets\dns-challenge\datasets.noise\datasets",
37
+ type=str
38
+ )
39
+ parser.add_argument(
40
+ "--output_dir",
41
+ default=r"E:\programmer\asr_datasets\denoise\dns-noise-8k",
42
+ type=str
43
+ )
44
+ parser.add_argument("--sample_rate", default=8000, type=int)
45
+ args = parser.parse_args()
46
+ return args
47
+
48
+
49
+ def main():
50
+ args = get_args()
51
+
52
+ data_dir = Path(args.data_dir)
53
+ output_dir = Path(args.output_dir)
54
+ output_dir.mkdir(parents=True, exist_ok=True)
55
+
56
+ for filename in tqdm(data_dir.glob("**/*.wav")):
57
+ label = filename.parts[-2]
58
+ name = filename.stem
59
+ # print(f"filename: {filename.as_posix()}")
60
+
61
+ signal, _ = librosa.load(filename.as_posix(), sr=args.sample_rate)
62
+
63
+ signal = signal * (1 << 15)
64
+ signal = np.array(signal, dtype=np.int16)
65
+
66
+ to_file = output_dir / f"{label}/{name}.wav"
67
+ to_file.parent.mkdir(parents=True, exist_ok=True)
68
+ wavfile.write(
69
+ to_file.as_posix(),
70
+ rate=args.sample_rate,
71
+ data=signal,
72
+ )
73
+ return
74
+
75
+
76
+ if __name__ == '__main__':
77
+ main()
examples/data_preprocess/dns_challenge_to_8k/process_musan.py ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://www.openslr.org/17/
5
+ """
6
+
7
+ if __name__ == '__main__':
8
+ pass
examples/mpnet/run.sh CHANGED
@@ -17,10 +17,10 @@ sh run.sh --stage 5 --stop_stage 5 --system_version centos --file_folder_name fi
17
  --speech_dir "/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train"
18
 
19
 
20
- sh run.sh --stage 1 --stop_stage 2 --system_version centos --file_folder_name file_dir --final_model_name mpnet-nx-speech-20250224 \
21
  --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise" \
22
  --speech_dir "/data/tianxing/HuggingDatasets/nx_noise/data/speech" \
23
- --max_epochs 1
24
 
25
 
26
  END
 
17
  --speech_dir "/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train"
18
 
19
 
20
+ sh run.sh --stage 1 --stop_stage 2 --system_version centos --file_folder_name file_dir --final_model_name mpnet-nx-speech \
21
  --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise" \
22
  --speech_dir "/data/tianxing/HuggingDatasets/nx_noise/data/speech" \
23
+ --max_epochs 100
24
 
25
 
26
  END
examples/nx_mpnet/yaml/config.yaml CHANGED
@@ -15,15 +15,15 @@ mask_hidden_size: 64
15
  phase_num_blocks: 4
16
  phase_hidden_size: 64
17
 
18
- tsfm_hidden_size: 128
19
- tsfm_attention_heads: 8
20
- tsfm_num_blocks: 6
21
  tsfm_dropout_rate: 0.0
22
  tsfm_max_time_relative_position: 2048
23
  tsfm_max_freq_relative_position: 256
24
  tsfm_chunk_size: 1
25
- tsfm_num_left_chunks: 64
26
- tsfm_num_right_chunks: 32
27
 
28
  discriminator_dim: 32
29
  discriminator_in_channel: 2
 
15
  phase_num_blocks: 4
16
  phase_hidden_size: 64
17
 
18
+ tsfm_hidden_size: 64
19
+ tsfm_attention_heads: 4
20
+ tsfm_num_blocks: 4
21
  tsfm_dropout_rate: 0.0
22
  tsfm_max_time_relative_position: 2048
23
  tsfm_max_freq_relative_position: 256
24
  tsfm_chunk_size: 1
25
+ tsfm_num_left_chunks: 128
26
+ tsfm_num_right_chunks: 64
27
 
28
  discriminator_dim: 32
29
  discriminator_in_channel: 2
main.py CHANGED
@@ -67,6 +67,13 @@ denoise_engines = {
67
  project_path / "trained_models/mpnet-nx-speech-1-epoch.zip").as_posix()
68
  }
69
  },
70
  "mpnet-aishell-1-epoch": {
71
  "infer_cls": InferenceMPNet,
72
  "kwargs": {
@@ -187,7 +194,7 @@ def main():
187
  outputs=[shell_output],
188
  )
189
 
190
- # http://127.0.0.1:7864/
191
  blocks.queue().launch(
192
  share=False if platform.system() == "Windows" else False,
193
  server_name="127.0.0.1" if platform.system() == "Windows" else "0.0.0.0",
 
67
  project_path / "trained_models/mpnet-nx-speech-1-epoch.zip").as_posix()
68
  }
69
  },
70
+ "mpnet-nx-speech-20-epoch": {
71
+ "infer_cls": InferenceMPNet,
72
+ "kwargs": {
73
+ "pretrained_model_path_or_zip_file": (
74
+ project_path / "trained_models/mpnet-nx-speech-20-epoch.zip").as_posix()
75
+ }
76
+ },
77
  "mpnet-aishell-1-epoch": {
78
  "infer_cls": InferenceMPNet,
79
  "kwargs": {
 
194
  outputs=[shell_output],
195
  )
196
 
197
+ # http://127.0.0.1:7865/
198
  blocks.queue().launch(
199
  share=False if platform.system() == "Windows" else False,
200
  server_name="127.0.0.1" if platform.system() == "Windows" else "0.0.0.0",
requirements.txt CHANGED
@@ -12,3 +12,4 @@ torch-pesq==0.1.2
12
  torchmetrics==1.6.1
13
  torchmetrics[audio]==1.6.1
14
  einops==0.8.1
 
  torchmetrics==1.6.1
13
  torchmetrics[audio]==1.6.1
14
  einops==0.8.1
15
+ torch_stoi==0.2.3
toolbox/torchaudio/losses/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ if __name__ == '__main__':
6
+ pass
toolbox/torchaudio/losses/perceptual.py ADDED
@@ -0,0 +1,75 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://zhuanlan.zhihu.com/p/627039860
5
+ """
6
+ import torch
7
+ import torch.nn as nn
8
+ from torch_stoi import NegSTOILoss as TorchNegSTOILoss
9
+
10
+
11
+ class PMSQELoss(object):
12
+ """
13
+ A Deep Learning Loss Function based on the Perceptual Evaluation of the Speech Quality
14
+ https://sigmat.ugr.es/PMSQE/
15
+
16
+ On Loss Functions for Supervised Monaural Time-Domain Speech Enhancement
17
+ https://arxiv.org/abs/1909.01019
18
+
19
+ https://github.com/asteroid-team/asteroid/blob/master/asteroid/losses/pmsqe.py
20
+ """
21
+
22
+
23
+ class NegSTOILoss(nn.Module):
24
+ """
25
+ STOI (Short-Time Objective Intelligibility)
26
+ predicts how intelligible speech is from the correlation between time-domain and frequency-domain features of the signals.
27
+ Scores range from 0 to 1; higher scores indicate higher intelligibility.
28
+ It is well suited to evaluating intelligibility improvements in noisy conditions.
29
+
30
+ https://github.com/mpariente/pytorch_stoi
31
+ https://github.com/mpariente/pystoi
32
+ https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/nnet/loss/stoi_loss.py
33
+ """
34
+ def __init__(self,
35
+ sample_rate: int,
36
+ reduction: str = "mean",
37
+ ):
38
+ super(NegSTOILoss, self).__init__()
39
+ self.loss_fn = TorchNegSTOILoss(sample_rate=sample_rate)
40
+ self.reduction = reduction
41
+
42
+ if reduction not in ("sum", "mean"):
43
+ raise AssertionError(f"param reduction must be sum or mean.")
44
+
45
+ def forward(self, denoise: torch.Tensor, clean: torch.Tensor):
46
+
47
+ batch_loss = self.loss_fn.forward(denoise, clean)
48
+
49
+ if self.reduction == "mean":
50
+ loss = torch.mean(batch_loss)
51
+ elif self.reduction == "sum":
52
+ loss = torch.sum(batch_loss)
53
+ else:
54
+ raise AssertionError
55
+ return loss
56
+
57
+
58
+ def main():
59
+ sample_rate = 16000
60
+
61
+ loss_func = NegSTOILoss(
62
+ sample_rate=sample_rate,
63
+ reduction="mean",
64
+ )
65
+
66
+ denoise = torch.randn(2, sample_rate)
67
+ clean = torch.randn(2, sample_rate)
68
+
69
+ loss_batch = loss_func.forward(denoise, clean)
70
+ print(loss_batch)
71
+ return
72
+
73
+
74
+ if __name__ == "__main__":
75
+ main()
toolbox/torchaudio/losses/snr.py ADDED
@@ -0,0 +1,101 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://zhuanlan.zhihu.com/p/627039860
5
+ """
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+
10
+ class NegativeSNRLoss(nn.Module):
11
+ """
12
+ Signal-to-Noise Ratio
13
+ """
14
+ def __init__(self, eps: float = 1e-8):
15
+ super(NegativeSNRLoss, self).__init__()
16
+ self.eps = eps
17
+
18
+ def forward(self, denoise: torch.Tensor, clean: torch.Tensor):
19
+ """
20
+ Compute the SI-SNR loss between the estimated signal and the target signal.
21
+
22
+ :param denoise: The estimated signal (batch_size, signal_length)
23
+ :param clean: The target signal (batch_size, signal_length)
24
+ :return: The SI-SNR loss (batch_size,)
25
+ """
26
+ if denoise.shape != clean.shape:
27
+ raise AssertionError("Input signals must have the same shape")
28
+
29
+ denoise = denoise - torch.mean(denoise, dim=-1, keepdim=True)
30
+ clean = clean - torch.mean(clean, dim=-1, keepdim=True)
31
+
32
+ noise = denoise - clean
33
+
34
+ clean_power = torch.norm(clean, p=2, dim=-1) ** 2
35
+ noise_power = torch.norm(noise, p=2, dim=-1) ** 2
36
+
37
+ snr = 10 * torch.log10((clean_power + self.eps) / (noise_power + self.eps))
38
+
39
+ return -snr.mean()
40
+
41
+
42
+ class NegativeSISNRLoss(nn.Module):
43
+ """
44
+ Scale-Invariant Source-to-Noise Ratio
45
+
46
+ https://arxiv.org/abs/2206.07293
47
+ """
48
+ def __init__(self,
49
+ reduction: str = "mean",
50
+ eps: float = 1e-8,
51
+ ):
52
+ super(NegativeSISNRLoss, self).__init__()
53
+ self.reduction = reduction
54
+ self.eps = eps
55
+
56
+ def forward(self, denoise: torch.Tensor, clean: torch.Tensor):
57
+ """
58
+ Compute the SI-SNR loss between the estimated signal and the target signal.
59
+
60
+ :param denoise: The estimated signal (batch_size, signal_length)
61
+ :param clean: The target signal (batch_size, signal_length)
62
+ :return: The SI-SNR loss (batch_size,)
63
+ """
64
+ if denoise.shape != clean.shape:
65
+ raise AssertionError("Input signals must have the same shape")
66
+
67
+ denoise = denoise - torch.mean(denoise, dim=-1, keepdim=True)
68
+ clean = clean - torch.mean(clean, dim=-1, keepdim=True)
69
+
70
+ s_target = torch.sum(denoise * clean, dim=-1, keepdim=True) * clean / (torch.norm(clean, p=2, dim=-1, keepdim=True) ** 2 + self.eps)
71
+
72
+ e_noise = denoise - s_target
73
+
74
+ batch_si_snr = 10 * torch.log10(torch.norm(s_target, p=2, dim=-1) ** 2 / (torch.norm(e_noise, p=2, dim=-1) ** 2 + self.eps))
75
+ # si_snr shape: [batch_size,]
76
+
77
+ if self.reduction == "mean":
78
+ loss = torch.mean(batch_si_snr)
79
+ elif self.reduction == "sum":
80
+ loss = torch.sum(batch_si_snr)
81
+ else:
82
+ raise AssertionError
83
+ return -loss
84
+
85
+
86
+ def main():
87
+ batch_size = 2
88
+ signal_length = 16000
89
+ estimated_signal = torch.randn(batch_size, signal_length)
90
+ target_signal = torch.randn(batch_size, signal_length)
91
+
92
+ si_snr_loss = NegativeSISNRLoss()
93
+
94
+ loss = si_snr_loss.forward(estimated_signal, target_signal)
95
+ print(f"loss: {loss.item()}")
96
+
97
+ return
98
+
99
+
100
+ if __name__ == "__main__":
101
+ main()
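For reference, NegativeSISNRLoss above implements the scale-invariant SNR. After removing the mean from both signals it computes

s_{\mathrm{target}} = \frac{\langle \hat{s}, s \rangle}{\lVert s \rVert^{2} + \epsilon}\, s, \qquad e = \hat{s} - s_{\mathrm{target}}, \qquad \mathrm{SI\text{-}SNR} = 10 \log_{10} \frac{\lVert s_{\mathrm{target}} \rVert^{2}}{\lVert e \rVert^{2} + \epsilon}

where \hat{s} is the denoised estimate and s is the clean target; the loss is the negated mean (or sum) over the batch, so minimizing it maximizes SI-SNR.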
toolbox/torchaudio/losses/spectral.py ADDED
@@ -0,0 +1,351 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://zhuanlan.zhihu.com/p/627039860
5
+
6
+ https://github.com/facebookresearch/denoiser/blob/main/denoiser/stft_loss.py
7
+ """
8
+ from typing import List
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ from torch.nn import functional as F
13
+
14
+
15
+ class LSDLoss(nn.Module):
16
+ """
17
+ Log Spectral Distance
18
+
19
+ Mean square error of power spectrum
20
+ """
21
+ def __init__(self,
22
+ n_fft: int = 512,
23
+ win_size: int = 512,
24
+ hop_size: int = 256,
25
+ center: bool = True,
26
+ eps: float = 1e-8,
27
+ reduction: str = "mean",
28
+ ):
29
+ super(LSDLoss, self).__init__()
30
+ self.n_fft = n_fft
31
+ self.win_size = win_size
32
+ self.hop_size = hop_size
33
+ self.center = center
34
+ self.eps = eps
35
+ self.reduction = reduction
36
+
37
+ if reduction not in ("sum", "mean"):
38
+ raise AssertionError(f"param reduction must be sum or mean.")
39
+
40
+ def forward(self, denoise_power: torch.Tensor, clean_power: torch.Tensor):
41
+ """
42
+ :param denoise_power: The estimated signal (batch_size, signal_length)
43
+ :param clean_power: The target signal (batch_size, signal_length)
44
+ :return:
45
+ """
46
+ denoise_power = denoise_power + self.eps
47
+ clean_power = clean_power + self.eps
48
+
49
+ log_denoise_power = torch.log10(denoise_power)
50
+ log_clean_power = torch.log10(clean_power)
51
+
52
+ # mean_square_error shape: [b, f]
53
+ mean_square_error = torch.mean(torch.square(log_denoise_power - log_clean_power), dim=-1)
54
+
55
+ if self.reduction == "mean":
56
+ lsd_loss = torch.mean(mean_square_error)
57
+ elif self.reduction == "sum":
58
+ lsd_loss = torch.sum(mean_square_error)
59
+ else:
60
+ raise AssertionError
61
+ return lsd_loss
62
+
63
+
64
+ class ComplexSpectralLoss(nn.Module):
65
+ def __init__(self,
66
+ n_fft: int = 512,
67
+ win_size: int = 512,
68
+ hop_size: int = 256,
69
+ center: bool = True,
70
+ eps: float = 1e-8,
71
+ reduction: str = "mean",
72
+ factor_mag: float = 0.5,
73
+ factor_pha: float = 0.3,
74
+ factor_gra: float = 0.2,
75
+ ):
76
+ super().__init__()
77
+ self.n_fft = n_fft
78
+ self.win_size = win_size
79
+ self.hop_size = hop_size
80
+ self.center = center
81
+ self.eps = eps
82
+ self.reduction = reduction
83
+
84
+ self.factor_mag = factor_mag
85
+ self.factor_pha = factor_pha
86
+ self.factor_gra = factor_gra
87
+
88
+ if reduction not in ("sum", "mean"):
89
+ raise AssertionError(f"param reduction must be sum or mean.")
90
+
91
+ self.window = nn.Parameter(torch.hann_window(win_size), requires_grad=False)
92
+
93
+ def forward(self, denoise: torch.Tensor, clean: torch.Tensor):
94
+ """
95
+ :param denoise: The estimated signal (batch_size, signal_length)
96
+ :param clean: The target signal (batch_size, signal_length)
97
+ :return:
98
+ """
99
+ if denoise.shape != clean.shape:
100
+ raise AssertionError("Input signals must have the same shape")
101
+
102
+ # denoise_stft, clean_stft shape: [b, f, t]
103
+ denoise_stft = torch.stft(
104
+ denoise,
105
+ n_fft=self.n_fft,
106
+ win_length=self.win_size,
107
+ hop_length=self.hop_size,
108
+ window=self.window,
109
+ center=self.center,
110
+ pad_mode="reflect",
111
+ normalized=False,
112
+ return_complex=True
113
+ )
114
+ clean_stft = torch.stft(
115
+ clean,
116
+ n_fft=self.n_fft,
117
+ win_length=self.win_size,
118
+ hop_length=self.hop_size,
119
+ window=self.window,
120
+ center=self.center,
121
+ pad_mode="reflect",
122
+ normalized=False,
123
+ return_complex=True
124
+ )
125
+
126
+ # complex_diff shape: [b, f, t], dtype: torch.complex64
127
+ complex_diff = denoise_stft - clean_stft
128
+
129
+ # magnitude_diff, phase_diff shape: [b, f, t], dtype: torch.float32
130
+ magnitude_diff = torch.abs(complex_diff)
131
+ phase_diff = torch.angle(complex_diff)
132
+
133
+ # magnitude_loss, phase_loss shape: [b,]
134
+ magnitude_loss = torch.norm(magnitude_diff, p=2, dim=(-1, -2))
135
+ phase_loss = torch.norm(phase_diff, p=1, dim=(-1, -2))
136
+
137
+ # phase_grad shape: [b, f, t-1], dtype: torch.float32
138
+ phase_grad = torch.diff(torch.angle(denoise_stft), dim=-1)
139
+ grad_loss = torch.mean(torch.abs(phase_grad), dim=(-1, -2))
140
+
141
+ # loss, grad_loss shape: [b,]
142
+ batch_loss = self.factor_mag * magnitude_loss + self.factor_pha * phase_loss + self.factor_gra * grad_loss
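+ # batch_loss: weighted sum of the magnitude, phase and phase-gradient terms (default weights 0.5 / 0.3 / 0.2)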
143
+ # print(f"magnitude_loss: {magnitude_loss}")
144
+ # print(f"phase_loss: {phase_loss}")
145
+ # print(f"grad_loss: {grad_loss}")
146
+
147
+ if self.reduction == "mean":
148
+ loss = torch.mean(batch_loss)
149
+ elif self.reduction == "sum":
150
+ loss = torch.sum(batch_loss)
151
+ else:
152
+ raise AssertionError
153
+ return loss
154
+
155
+
156
+ class SpectralConvergenceLoss(torch.nn.Module):
157
+ """Spectral convergence loss module."""
158
+
159
+ def __init__(self,
160
+ reduction: str = "mean",
161
+ ):
162
+ super(SpectralConvergenceLoss, self).__init__()
163
+ self.reduction = reduction
164
+
165
+ if reduction not in ("sum", "mean"):
166
+ raise AssertionError(f"param reduction must be sum or mean.")
167
+
168
+ def forward(self,
169
+ denoise_magnitude: torch.Tensor,
170
+ clean_magnitude: torch.Tensor,
171
+ ):
172
+ """
173
+ :param denoise_magnitude: Tensor, shape: [batch_size, time_steps, freq_bins]
174
+ :param clean_magnitude: Tensor, shape: [batch_size, time_steps, freq_bins]
175
+ :return:
176
+ """
177
+ error_norm = torch.norm(denoise_magnitude - clean_magnitude, p="fro", dim=(-1, -2))
178
+ truth_norm = torch.norm(clean_magnitude, p="fro", dim=(-1, -2))
179
+ batch_loss = error_norm / truth_norm
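+ # spectral convergence: Frobenius norm of the magnitude error, normalized by the norm of the clean magnitude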
180
+ if self.reduction == "mean":
181
+ loss = torch.mean(batch_loss)
182
+ elif self.reduction == "sum":
183
+ loss = torch.sum(batch_loss)
184
+ else:
185
+ raise AssertionError
186
+ return loss
187
+
188
+
189
+ class LogSTFTMagnitudeLoss(torch.nn.Module):
190
+ """Log STFT magnitude loss module."""
191
+
192
+ def __init__(self,
193
+ reduction: str = "mean",
194
+ ):
195
+ super(LogSTFTMagnitudeLoss, self).__init__()
196
+ self.reduction = reduction
197
+
198
+ if reduction not in ("sum", "mean"):
199
+ raise AssertionError(f"param reduction must be sum or mean.")
200
+
201
+ def forward(self,
202
+ denoise_magnitude: torch.Tensor,
203
+ clean_magnitude: torch.Tensor,
204
+ ):
205
+ """
206
+ :param denoise_magnitude: Tensor, shape: [batch_size, time_steps, freq_bins]
207
+ :param clean_magnitude: Tensor, shape: [batch_size, time_steps, freq_bins]
208
+ :return:
209
+ """
210
+ return F.l1_loss(torch.log(denoise_magnitude), torch.log(clean_magnitude), reduction=self.reduction)
211
+
212
+
213
+ class STFTLoss(torch.nn.Module):
214
+ """STFT loss module."""
215
+
216
+ def __init__(self,
217
+ n_fft: int = 1024,
218
+ win_size: int = 600,
219
+ hop_size: int = 120,
220
+ center: bool = True,
221
+ reduction: str = "mean",
222
+ ):
223
+ super(STFTLoss, self).__init__()
224
+ self.n_fft = n_fft
225
+ self.win_size = win_size
226
+ self.hop_size = hop_size
227
+ self.center = center
228
+ self.reduction = reduction
229
+
230
+ self.window = nn.Parameter(torch.hann_window(win_size), requires_grad=False)
231
+
232
+ self.spectral_convergence_loss = SpectralConvergenceLoss(reduction=reduction)
233
+ self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss(reduction=reduction)
234
+
235
+ def forward(self, denoise: torch.Tensor, clean: torch.Tensor):
236
+ """
237
+ :param denoise:
238
+ :param clean:
239
+ :return:
240
+ """
241
+ if denoise.shape != clean.shape:
242
+ raise AssertionError("Input signals must have the same shape")
243
+
244
+ # denoise_stft, clean_stft shape: [b, f, t]
245
+ denoise_stft = torch.stft(
246
+ denoise,
247
+ n_fft=self.n_fft,
248
+ win_length=self.win_size,
249
+ hop_length=self.hop_size,
250
+ window=self.window,
251
+ center=self.center,
252
+ pad_mode="reflect",
253
+ normalized=False,
254
+ return_complex=True
255
+ )
256
+ clean_stft = torch.stft(
257
+ clean,
258
+ n_fft=self.n_fft,
259
+ win_length=self.win_size,
260
+ hop_length=self.hop_size,
261
+ window=self.window,
262
+ center=self.center,
263
+ pad_mode="reflect",
264
+ normalized=False,
265
+ return_complex=True
266
+ )
267
+
268
+ denoise_magnitude = torch.abs(denoise_stft)
269
+ clean_magnitude = torch.abs(clean_stft)
270
+
271
+ sc_loss = self.spectral_convergence_loss.forward(denoise_magnitude, clean_magnitude)
272
+ mag_loss = self.log_stft_magnitude_loss.forward(denoise_magnitude, clean_magnitude)
273
+
274
+ return sc_loss, mag_loss
275
+
276
+
277
+ class MultiResolutionSTFTLoss(torch.nn.Module):
278
+ """Multi resolution STFT loss module."""
279
+
280
+ def __init__(self,
281
+ fft_size_list: List[int] = None,
282
+ win_size_list: List[int] = None,
283
+ hop_size_list: List[int] = None,
284
+ factor_sc=0.1,
285
+ factor_mag=0.1,
286
+ ):
287
+ super(MultiResolutionSTFTLoss, self).__init__()
288
+ fft_size_list = fft_size_list or [1024, 2048, 512]
289
+ win_size_list = win_size_list or [600, 1200, 240]
290
+ hop_size_list = hop_size_list or [120, 240, 50]
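+ # the three default resolutions follow the multi-resolution STFT settings of the denoiser reference implementation linked above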
291
+
292
+ if not len(fft_size_list) == len(win_size_list) == len(hop_size_list):
293
+ raise AssertionError
294
+
295
+ loss_fn_list = list()
296
+ for n_fft, win_size, hop_size in zip(fft_size_list, win_size_list, hop_size_list):
297
+ loss_fn_list.append(
298
+ STFTLoss(
299
+ n_fft=n_fft,
300
+ win_size=win_size,
301
+ hop_size=hop_size,
302
+ )
303
+ )
304
+
305
+ self.loss_fn_list = nn.ModuleList(loss_fn_list)
306
+ self.factor_sc = factor_sc
307
+ self.factor_mag = factor_mag
308
+
309
+ def forward(self, denoise: torch.Tensor, clean: torch.Tensor):
310
+ """
311
+ :param denoise:
312
+ :param clean:
313
+ :return:
314
+ """
315
+ if denoise.shape != clean.shape:
316
+ raise AssertionError("Input signals must have the same shape")
317
+
318
+ sc_loss = 0.0
319
+ mag_loss = 0.0
320
+ for loss_fn in self.loss_fn_list:
321
+ sc_l, mag_l = loss_fn.forward(denoise, clean)
322
+ sc_loss += sc_l
323
+ mag_loss += mag_l
324
+ sc_loss = sc_loss / len(self.loss_fn_list)
325
+ mag_loss = mag_loss / len(self.loss_fn_list)
326
+
327
+ sc_loss = self.factor_sc * sc_loss
328
+ mag_loss = self.factor_mag * mag_loss
329
+
330
+ loss = sc_loss + mag_loss
331
+ return loss
332
+
333
+
334
+ def main():
335
+ batch_size = 2
336
+ signal_length = 16000
337
+ estimated_signal = torch.randn(batch_size, signal_length)
338
+ target_signal = torch.randn(batch_size, signal_length)
339
+
340
+ # loss_fn = LSDLoss()
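+ # note: LSDLoss expects power spectra rather than waveforms; a minimal sketch (STFT parameters are illustrative):
+ # estimated_power = torch.stft(estimated_signal, n_fft=512, return_complex=True).abs() ** 2
+ # target_power = torch.stft(target_signal, n_fft=512, return_complex=True).abs() ** 2
+ # loss = LSDLoss().forward(estimated_power, target_power)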
341
+ # loss_fn = ComplexSpectralLoss()
342
+ loss_fn = MultiResolutionSTFTLoss()
343
+
344
+ loss = loss_fn.forward(estimated_signal, target_signal)
345
+ print(f"loss: {loss.item()}")
346
+
347
+ return
348
+
349
+
350
+ if __name__ == "__main__":
351
+ main()
toolbox/torchaudio/metrics/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ if __name__ == '__main__':
6
+ pass
toolbox/torchaudio/metrics/pesq.py ADDED
@@ -0,0 +1,80 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ from joblib import Parallel, delayed
4
+ import numpy as np
5
+ from pesq import pesq
6
+ from typing import List
7
+
8
+ from pesq import cypesq
9
+
10
+
11
+ def run_pesq(clean_audio: np.ndarray,
12
+ noisy_audio: np.ndarray,
13
+ sample_rate: int = 16000,
14
+ mode: str = "wb",
15
+ ) -> float:
16
+ if sample_rate == 8000 and mode == "wb":
17
+ raise AssertionError(f"mode should be `nb` when sample_rate is 8000")
18
+ try:
19
+ pesq_score = pesq(sample_rate, clean_audio, noisy_audio, mode)
20
+ except cypesq.NoUtterancesError as e:
21
+ pesq_score = -1
22
+ except Exception as e:
23
+ print(f"pesq failed. error type: {type(e)}, error text: {str(e)}")
24
+ pesq_score = -1
25
+ return pesq_score
26
+
27
+
28
+ def run_batch_pesq(clean_audio_list: List[np.ndarray],
29
+ noisy_audio_list: List[np.ndarray],
30
+ sample_rate: int = 16000,
31
+ mode: str = "wb",
32
+ n_jobs: int = 4,
33
+ ) -> List[float]:
34
+ parallel = Parallel(n_jobs=n_jobs)
35
+
36
+ parallel_tasks = list()
37
+ for clean_audio, noisy_audio in zip(clean_audio_list, noisy_audio_list):
38
+ parallel_task = delayed(run_pesq)(clean_audio, noisy_audio, sample_rate, mode)
39
+ parallel_tasks.append(parallel_task)
40
+
41
+ pesq_score_list = parallel.__call__(parallel_tasks)
42
+ return pesq_score_list
43
+
44
+
45
+ def run_pesq_score(clean_audio_list: List[np.ndarray],
46
+ noisy_audio_list: List[np.ndarray],
47
+ sample_rate: int = 16000,
48
+ mode: str = "wb",
49
+ n_jobs: int = 4,
50
+ ) -> float:
51
+
52
+ pesq_score_list = run_batch_pesq(clean_audio_list=clean_audio_list,
53
+ noisy_audio_list=noisy_audio_list,
54
+ sample_rate=sample_rate,
55
+ mode=mode,
56
+ n_jobs=n_jobs,
57
+ )
58
+
59
+ pesq_score = np.mean(pesq_score_list)
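+ # average over utterances; note that failed utterances contribute a score of -1 to the mean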
60
+ return pesq_score
61
+
62
+
63
+ def main():
64
+ clean_audio = np.random.uniform(low=0, high=1, size=(2, 160000,))
65
+ noisy_audio = np.random.uniform(low=0, high=1, size=(2, 160000,))
66
+
67
+ clean_audio_list = list(clean_audio)
68
+ noisy_audio_list = list(noisy_audio)
69
+
70
+ pesq_score_list = run_batch_pesq(clean_audio_list, noisy_audio_list)
71
+ print(pesq_score_list)
72
+
73
+ pesq_score = run_pesq_score(clean_audio_list, noisy_audio_list)
74
+ print(pesq_score)
75
+
76
+ return
77
+
78
+
79
+ if __name__ == "__main__":
80
+ main()
toolbox/torchaudio/models/conv_tasnet/configuration_conv_tasnet.py ADDED
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ from typing import Tuple
4
+
5
+ from toolbox.torchaudio.configuration_utils import PretrainedConfig
6
+
7
+
8
+ class ConvTasNetConfig(PretrainedConfig):
9
+ """
10
+ https://github.com/kaituoxu/Conv-TasNet/blob/master/src/train.py
11
+ """
12
+ def __init__(self,
13
+ sample_rate: int = 8000,
14
+ segment_size: int = 4,
15
+
16
+ win_size: int = 20,
17
+
18
+ freq_bins: int = 256,
19
+ bottleneck_channels: int = 256,
20
+ num_speakers: int = 2,
21
+ num_blocks: int = 4,
22
+ num_sub_blocks: int = 8,
23
+ sub_blocks_channels: int = 512,
24
+ sub_blocks_kernel_size: int = 3,
25
+
26
+ norm_type: str = "gLN",
27
+ causal: bool = False,
28
+ mask_nonlinear: str = "relu",
29
+
30
+ **kwargs
31
+ ):
32
+ super(ConvTasNetConfig, self).__init__(**kwargs)
33
+ self.sample_rate = sample_rate
34
+ self.segment_size = segment_size
35
+
36
+ self.win_size = win_size
37
+
38
+ self.freq_bins = freq_bins
39
+ self.bottleneck_channels = bottleneck_channels
40
+ self.num_speakers = num_speakers
41
+ self.num_blocks = num_blocks
42
+ self.num_sub_blocks = num_sub_blocks
43
+ self.sub_blocks_channels = sub_blocks_channels
44
+ self.sub_blocks_kernel_size = sub_blocks_kernel_size
45
+
46
+ self.norm_type = norm_type
47
+ self.causal = causal
48
+ self.mask_nonlinear = mask_nonlinear
49
+
50
+
51
+ if __name__ == "__main__":
52
+ pass
toolbox/torchaudio/models/conv_tasnet/modeling_conv_tasnet.py CHANGED
@@ -2,8 +2,483 @@
2
  # -*- coding: utf-8 -*-
3
  """
4
  https://github.com/kaituoxu/Conv-TasNet/blob/master/src/conv_tasnet.py
 
 
5
  """
6
 
7
 
8
- if __name__ == '__main__':
9
- pass
 
2
  # -*- coding: utf-8 -*-
3
  """
4
  https://github.com/kaituoxu/Conv-TasNet/blob/master/src/conv_tasnet.py
5
+
6
+ https://pytorch.org/audio/2.5.0/generated/torchaudio.models.ConvTasNet.html
7
  """
8
+ import os
9
+ from typing import List, Optional, Union
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ from torch.nn import functional as F
14
+
15
+ from toolbox.torchaudio.configuration_utils import CONFIG_FILE
16
+ from toolbox.torchaudio.models.conv_tasnet.utils import overlap_and_add
17
+ from toolbox.torchaudio.models.conv_tasnet.configuration_conv_tasnet import ConvTasNetConfig
18
+
19
+
20
+ class ChannelwiseLayerNorm(nn.Module):
21
+ """Channel-wise Layer Normalization (cLN)"""
22
+ def __init__(self,
23
+ channels: int,
24
+ eps: float = 1e-8
25
+ ):
26
+ super(ChannelwiseLayerNorm, self).__init__()
27
+ self.gamma = nn.Parameter(torch.Tensor(1, channels, 1))
28
+ self.beta = nn.Parameter(torch.Tensor(1, channels, 1))
29
+ self.reset_parameters()
30
+
31
+ self.eps = eps
32
+
33
+ def reset_parameters(self):
34
+ self.gamma.data.fill_(1)
35
+ self.beta.data.zero_()
36
+
37
+ def forward(self, y):
38
+ """
39
+ :param y: Tensor, shape: [batch_size, channels, time_steps]
40
+ :return: gln_y: Tensor, shape: [batch_size, channels, time_steps]
41
+ """
42
+ # mean, var shape: [batch_size, 1, time_steps]
43
+ mean = torch.mean(y, dim=1, keepdim=True)
44
+ var = torch.var(y, dim=1, keepdim=True, unbiased=False)
45
+
46
+ cln_y = self.gamma * (y - mean) / torch.pow(var + self.eps, 0.5) + self.beta
47
+ return cln_y
48
+
49
+
50
+ class GlobalLayerNorm(nn.Module):
51
+ """Global Layer Normalization (gLN)"""
52
+ def __init__(self,
53
+ channels: int,
54
+ eps: float = 1e-8
55
+ ):
56
+ super(GlobalLayerNorm, self).__init__()
57
+ self.gamma = nn.Parameter(torch.Tensor(1, channels, 1))
58
+ self.beta = nn.Parameter(torch.Tensor(1, channels, 1))
59
+ self.reset_parameters()
60
+
61
+ self.eps = eps
62
+
63
+ def reset_parameters(self):
64
+ self.gamma.data.fill_(1)
65
+ self.beta.data.zero_()
66
+
67
+ def forward(self, y):
68
+ """
69
+ :param y: Tensor, shape: [batch_size, channels, time_steps]
70
+ :return: gln_y: Tensor, shape: [batch_size, channels, time_steps]
71
+ """
72
+ # mean, var shape: [batch_size, 1, 1]
73
+ mean = y.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True)
74
+ var = (torch.pow(y-mean, 2)).mean(dim=1, keepdim=True).mean(dim=2, keepdim=True)
75
+
76
+ gln_y = self.gamma * (y - mean) / torch.pow(var + self.eps, 0.5) + self.beta
77
+ return gln_y
78
+
79
+
80
+ def choose_norm(norm_type: str, channels: int):
81
+ """
82
+ The input of normalization will be (M, C, K), where M is batch size,
83
+ C is channel size and K is sequence length.
84
+ """
85
+ if norm_type == "gLN":
86
+ return GlobalLayerNorm(channels)
87
+ elif norm_type == "cLN":
88
+ return ChannelwiseLayerNorm(channels)
89
+ else: # norm_type == "BN":
90
+ # Given input (M, C, K), nn.BatchNorm1d(C) will accumulate statics
91
+ # along M and K, so this BN usage is right.
92
+ return nn.BatchNorm1d(channels)
93
+
94
+
95
+ class Chomp1d(nn.Module):
96
+ """
97
+ To ensure the output length is the same as the input.
98
+ """
99
+ def __init__(self, chomp_size: int):
100
+ super(Chomp1d, self).__init__()
101
+ self.chomp_size = chomp_size
102
+
103
+ def forward(self, x: torch.Tensor):
104
+ """
105
+ :param x: Tensor, shape: [batch_size, hidden_size, k_pad]
106
+ :return: Tensor, shape: [batch_size, hidden_size, k]
107
+ """
108
+ return x[:, :, :-self.chomp_size].contiguous()
109
+
110
+
111
+ class DepthwiseSeparableConv(nn.Module):
112
+ def __init__(self,
113
+ in_channels: int,
114
+ out_channels: int,
115
+ kernel_size: int,
116
+ stride: int,
117
+ padding: int,
118
+ dilation: int,
119
+ norm_type="gLN",
120
+ causal=False
121
+ ):
122
+ super(DepthwiseSeparableConv, self).__init__()
123
+ # Use `groups` option to implement depthwise convolution
124
+ # [M, H, K] -> [M, H, K]
125
+ self.depthwise_conv = nn.Conv1d(
126
+ in_channels=in_channels, out_channels=in_channels,
127
+ kernel_size=kernel_size, stride=stride,
128
+ padding=padding, dilation=dilation,
129
+ groups=in_channels, bias=False,
130
+ )
131
+
132
+ self.chomp = None
133
+ if causal:
134
+ self.chomp = Chomp1d(padding)
135
+
136
+ self.prelu = nn.PReLU()
137
+ self.norm = choose_norm(norm_type, in_channels)
138
+ # [M, H, K] -> [M, B, K]
139
+ self.pointwise_conv = nn.Conv1d(
140
+ in_channels=in_channels,
141
+ out_channels=out_channels,
142
+ kernel_size=1, bias=False
143
+ )
144
+
145
+ def forward(self, x: torch.Tensor):
146
+ """
147
+ :param x: Tensor, shape: [batch_size, hidden_size, k]
148
+ :return: Tensor, shape: [batch_size, b, k]
149
+ """
150
+ x = self.depthwise_conv.forward(x)
151
+ if self.chomp is not None:
152
+ x = self.chomp.forward(x)
153
+ x = self.prelu.forward(x)
154
+ x = self.norm.forward(x)
155
+ x = self.pointwise_conv.forward(x)
156
+
157
+ return x
158
+
159
+
160
+ class Encoder(nn.Module):
161
+ def __init__(self, win_size: int, freq_bins: int):
162
+ super(Encoder, self).__init__()
163
+ self.win_size = win_size
164
+ self.freq_bins = freq_bins
165
+
166
+ self.conv1d_U = nn.Conv1d(
167
+ in_channels=1,
168
+ out_channels=freq_bins,
169
+ kernel_size=win_size,
170
+ stride=win_size // 2,
171
+ bias=False
172
+ )
173
+
174
+ def forward(self, mixture):
175
+ """
176
+ :param mixture: Tensor, shape: [batch_size, num_samples]
177
+ :return: mixture_w, Tensor, shape: [batch_size, freq_bins, time_steps],
178
+ where time_steps = (num_samples - win_size) / (win_size / 2) + 1 = 2 * num_samples / win_size - 1
179
+ """
180
+ mixture = torch.unsqueeze(mixture, 1) # [M, 1, T]
181
+ mixture_w = F.relu(self.conv1d_U(mixture)) # [M, N, K]
182
+ return mixture_w
183
+
184
+
185
+ class Decoder(nn.Module):
186
+ def __init__(self, win_size: int, freq_bins: int):
187
+ super(Decoder, self).__init__()
188
+ self.win_size = win_size
189
+ self.freq_bins = freq_bins
190
+
191
+ self.basis_signals = nn.Linear(
192
+ in_features=freq_bins,
193
+ out_features=win_size,
194
+ bias=False
195
+ )
196
+
197
+ def forward(self,
198
+ mixture_w: torch.Tensor,
199
+ est_mask: torch.Tensor,
200
+ ):
201
+ """
202
+ :param mixture_w: Tensor, shape: [batch_size, freq_bins, time_steps],
203
+ where time_steps = (num_samples - win_size) / (win_size / 2) + 1 = 2 * num_samples / win_size - 1
204
+ :param est_mask: Tensor, shape: [batch_size, c, freq_bins, time_steps],
205
+ :return: Tensor, shape: [batch_size, c, num_samples],
206
+ """
207
+ source_w = torch.unsqueeze(mixture_w, 1) * est_mask
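+ # source_w shape: [batch_size, c, freq_bins, time_steps]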
208
+ source_w = torch.transpose(source_w, 2, 3)
209
+ est_source = self.basis_signals(source_w)
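+ # est_source shape: [batch_size, c, time_steps, win_size]; overlap-add with hop win_size//2 reconstructs the waveform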
210
+ est_source = overlap_and_add(est_source, self.win_size//2)
211
+ return est_source
212
+
213
+
214
+ class TemporalBlock(nn.Module):
215
+ def __init__(self,
216
+ in_channels: int,
217
+ out_channels: int,
218
+ kernel_size: int,
219
+ stride: int,
220
+ padding: int,
221
+ dilation: int,
222
+ norm_type="gLN",
223
+ causal=False
224
+ ):
225
+ super(TemporalBlock, self).__init__()
226
+ self.conv1x1 = nn.Conv1d(in_channels, out_channels, 1, bias=False)
227
+ self.prelu = nn.PReLU()
228
+ self.norm = choose_norm(norm_type, out_channels)
229
+ # [M, H, K] -> [M, B, K]
230
+ self.dsconv = DepthwiseSeparableConv(
231
+ out_channels, in_channels,
232
+ kernel_size, stride,
233
+ padding, dilation,
234
+ norm_type, causal,
235
+ )
236
+
237
+ def forward(self, x):
238
+ residual = x
239
+
240
+ x = self.conv1x1.forward(x)
241
+ x = self.prelu.forward(x)
242
+ x = self.norm.forward(x)
243
+ x = self.dsconv.forward(x)
244
+
245
+ out = x + residual
246
+ return out
247
+
248
+
249
+ class TemporalConvNet(nn.Module):
250
+ def __init__(self,
251
+ freq_bins: int = 256,
252
+ bottleneck_channels: int = 256,
253
+ num_speakers: int = 2,
254
+ num_blocks: int = 4,
255
+ num_sub_blocks: int = 8,
256
+ sub_blocks_channels: int = 512,
257
+ sub_blocks_kernel_size: int = 3,
258
+ norm_type: str = "gLN",
259
+ causal: bool = False,
260
+ mask_nonlinear: str = "relu",
261
+
262
+ ):
263
+ super(TemporalConvNet, self).__init__()
264
+ self.freq_bins = freq_bins
265
+ self.bottleneck_channels = bottleneck_channels
266
+ self.num_speakers = num_speakers
267
+
268
+ self.num_blocks = num_blocks
269
+ self.num_sub_blocks = num_sub_blocks
270
+ self.sub_blocks_channels = sub_blocks_channels
271
+ self.sub_blocks_kernel_size = sub_blocks_kernel_size
272
+
273
+ self.mask_nonlinear = mask_nonlinear
274
+
275
+ self.layer_norm = ChannelwiseLayerNorm(freq_bins)
276
+ self.bottleneck_conv1x1 = nn.Conv1d(freq_bins, bottleneck_channels, 1, bias=False)
277
+
278
+ self.temporal_conv_list = nn.ModuleList([])
279
+ for num_block_idx in range(num_blocks):
280
+ sub_blocks = list()
281
+ for num_sub_block_idx in range(num_sub_blocks):
282
+ dilation = 2 ** num_sub_block_idx
283
+ padding = (sub_blocks_kernel_size - 1) * dilation
284
+ if not causal:
285
+ padding = padding // 2
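+ # dilation doubles with each sub-block (1, 2, 4, ...); causal blocks pad the full amount and trim the future side with Chomp1d, non-causal blocks pad symmetrically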
286
+ temporal_block = TemporalBlock(
287
+ bottleneck_channels, sub_blocks_channels,
288
+ sub_blocks_kernel_size, stride=1,
289
+ padding=padding, dilation=dilation,
290
+ norm_type=norm_type, causal=causal,
291
+ )
292
+ sub_blocks.append(temporal_block)
293
+ self.temporal_conv_list.extend(sub_blocks)
294
+
295
+ self.mask_conv1x1 = nn.Conv1d(
296
+ in_channels=bottleneck_channels,
297
+ out_channels=num_speakers * freq_bins,
298
+ kernel_size=1,
299
+ bias=False,
300
+ )
301
+
302
+ def forward(self, mixture_w: torch.Tensor):
303
+ """
304
+ :param mixture_w: Tensor, shape: [batch_size, freq_bins, time_steps]
305
+ :return: est_mask: Tensor, shape: [batch_size, num_speakers, freq_bins, time_steps]
306
+ """
307
+ batch_size, freq_bins, time_steps = mixture_w.size()
308
+
309
+ x = self.layer_norm.forward(mixture_w)
310
+ x = self.bottleneck_conv1x1.forward(x)
311
+
312
+ for temporal_conv in self.temporal_conv_list:
313
+ x = temporal_conv.forward(x)
314
+
315
+ score = self.mask_conv1x1.forward(x)
316
+
317
+ # [M, C*N, K] -> [M, C, N, K]
318
+ score = score.view(batch_size, self.num_speakers, freq_bins, time_steps)
319
+
320
+ if self.mask_nonlinear == "softmax":
321
+ est_mask = F.softmax(score, dim=1)
322
+ elif self.mask_nonlinear == "relu":
323
+ est_mask = F.relu(score)
324
+ else:
325
+ raise ValueError("Unsupported mask non-linear function")
326
+
327
+ return est_mask
328
+
329
+
330
+ class ConvTasNet(nn.Module):
331
+ def __init__(self,
332
+ win_size: int = 20,
333
+ freq_bins: int = 256,
334
+ bottleneck_channels: int = 256,
335
+ num_speakers: int = 2,
336
+ num_blocks: int = 4,
337
+ num_sub_blocks: int = 8,
338
+ sub_blocks_channels: int = 512,
339
+ sub_blocks_kernel_size: int = 3,
340
+ norm_type: str = "gLN",
341
+ causal: bool = False,
342
+ mask_nonlinear: str = "relu",
343
+
344
+ ):
345
+ super(ConvTasNet, self).__init__()
346
+ self.win_size = win_size
347
+
348
+ self.freq_bins = freq_bins
349
+ self.bottleneck_channels = bottleneck_channels
350
+ self.num_speakers = num_speakers
351
+
352
+ self.num_blocks = num_blocks
353
+ self.num_sub_blocks = num_sub_blocks
354
+ self.sub_blocks_channels = sub_blocks_channels
355
+ self.sub_blocks_kernel_size = sub_blocks_kernel_size
356
+
357
+ self.norm_type = norm_type
358
+ self.causal = causal
359
+ self.mask_nonlinear = mask_nonlinear
360
+
361
+ self.encoder = Encoder(win_size, freq_bins)
362
+ self.separator = TemporalConvNet(
363
+ freq_bins=freq_bins,
364
+ bottleneck_channels=bottleneck_channels,
365
+ sub_blocks_channels=sub_blocks_channels,
366
+ sub_blocks_kernel_size=sub_blocks_kernel_size,
367
+ num_sub_blocks=num_sub_blocks,
368
+ num_blocks=num_blocks,
369
+ num_speakers=num_speakers,
370
+ norm_type=norm_type,
371
+ causal=causal,
372
+ mask_nonlinear=mask_nonlinear,
373
+ )
374
+ self.decoder = Decoder(win_size=win_size, freq_bins=freq_bins)
375
+
376
+ for p in self.parameters():
377
+ if p.dim() > 1:
378
+ nn.init.xavier_normal_(p)
379
+
380
+ def forward(self, mixture: torch.Tensor):
381
+ """
382
+ :param mixture: Tensor, shape: [batch_size, num_samples]
383
+ :return: est_source: Tensor, shape: [batch_size, c, num_samples]
384
+ """
385
+ # mixture shape: [batch_size, num_samples]
386
+ mixture_w = self.encoder.forward(mixture)
387
+ # mixture_w shape: [batch_size, freq_bins, time_steps]
388
+ est_mask = self.separator.forward(mixture_w)
389
+ # est_mask shape: [batch_size, num_speakers, freq_bins, time_steps]
390
+ est_source = self.decoder.forward(mixture_w, est_mask)
391
+
392
+ num_samples1 = mixture.size(-1)
393
+ num_samples2 = est_source.size(-1)
394
+ est_source = F.pad(est_source, (0, num_samples1 - num_samples2))
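+ # pad the decoded waveform back to the input length (the encoder/decoder striding can drop trailing samples)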
395
+ return est_source
396
+
397
+
398
+ MODEL_FILE = "model.pt"
399
+
400
+
401
+ class ConvTasNetPretrainedModel(ConvTasNet):
402
+ def __init__(self,
403
+ config: ConvTasNetConfig,
404
+ ):
405
+ super(ConvTasNetPretrainedModel, self).__init__(
406
+ win_size=config.win_size,
407
+ freq_bins=config.freq_bins,
408
+ bottleneck_channels=config.bottleneck_channels,
409
+ sub_blocks_channels=config.sub_blocks_channels,
410
+ sub_blocks_kernel_size=config.sub_blocks_kernel_size,
411
+ num_sub_blocks=config.num_sub_blocks,
412
+ num_blocks=config.num_blocks,
413
+ num_speakers=config.num_speakers,
414
+ norm_type=config.norm_type,
415
+ causal=config.causal,
416
+ mask_nonlinear=config.mask_nonlinear,
417
+ )
418
+ self.config = config
419
+
420
+ @classmethod
421
+ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
422
+ config = ConvTasNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
423
+
424
+ model = cls(config)
425
+
426
+ if os.path.isdir(pretrained_model_name_or_path):
427
+ ckpt_file = os.path.join(pretrained_model_name_or_path, MODEL_FILE)
428
+ else:
429
+ ckpt_file = pretrained_model_name_or_path
430
+
431
+ with open(ckpt_file, "rb") as f:
432
+ state_dict = torch.load(f, map_location="cpu", weights_only=True)
433
+ model.load_state_dict(state_dict, strict=True)
434
+ return model
435
+
436
+ def save_pretrained(self,
437
+ save_directory: Union[str, os.PathLike],
438
+ state_dict: Optional[dict] = None,
439
+ ):
440
+
441
+ model = self
442
+
443
+ if state_dict is None:
444
+ state_dict = model.state_dict()
445
+
446
+ os.makedirs(save_directory, exist_ok=True)
447
+
448
+ # save state dict
449
+ model_file = os.path.join(save_directory, MODEL_FILE)
450
+ torch.save(state_dict, model_file)
451
+
452
+ # save config
453
+ config_file = os.path.join(save_directory, CONFIG_FILE)
454
+ self.config.to_yaml_file(config_file)
455
+ return save_directory
456
+
457
+
458
+ def main():
459
+ config = ConvTasNetConfig()
460
+ tas_net = ConvTasNet(
461
+ win_size=config.win_size,
462
+ freq_bins=config.freq_bins,
463
+ bottleneck_channels=config.bottleneck_channels,
464
+ sub_blocks_channels=config.sub_blocks_channels,
465
+ sub_blocks_kernel_size=config.sub_blocks_kernel_size,
466
+ num_sub_blocks=config.num_sub_blocks,
467
+ num_blocks=config.num_blocks,
468
+ num_speakers=config.num_speakers,
469
+ norm_type=config.norm_type,
470
+ causal=config.causal,
471
+ mask_nonlinear=config.mask_nonlinear,
472
+ )
473
+
474
+ print(tas_net)
475
+
476
+ mixture = torch.rand(size=(1, 8000*4), dtype=torch.float32)
477
+
478
+ outputs = tas_net.forward(mixture)
479
+ print(outputs.shape)
480
+ return
481
 
482
 
483
+ if __name__ == "__main__":
484
+ main()
toolbox/torchaudio/models/conv_tasnet/utils.py ADDED
@@ -0,0 +1,55 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://github.com/kaituoxu/Conv-TasNet/blob/master/src/utils.py
5
+ """
6
+ import math
7
+ import torch
8
+
9
+
10
+ def overlap_and_add(signal: torch.Tensor, frame_step: int):
11
+ """
12
+ Reconstructs a signal from a framed representation.
13
+
14
+ Adds potentially overlapping frames of a signal with shape
15
+ `[..., frames, frame_length]`, offsetting subsequent frames by `frame_step`.
16
+ The resulting tensor has shape `[..., output_size]` where
17
+
18
+ output_size = (frames - 1) * frame_step + frame_length
19
+
20
+ Based on https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/contrib/signal/python/ops/reconstruction_ops.py
21
+
22
+ :param signal: Tensor, shape: [..., frames, frame_length]. All dimensions may be unknown, and rank must be at least 2.
23
+ :param frame_step: int, overlap offsets. Must be less than or equal to frame_length.
24
+ :return: Tensor, shape: [..., output_size].
25
+ containing the overlap-added frames of signal's inner-most two dimensions.
26
+ output_size = (frames - 1) * frame_step + frame_length
27
+ """
28
+ outer_dimensions = signal.size()[:-2]
29
+ frames, frame_length = signal.size()[-2:]
30
+
31
+ subframe_length = math.gcd(frame_length, frame_step) # gcd=Greatest Common Divisor
32
+ subframe_step = frame_step // subframe_length
33
+ subframes_per_frame = frame_length // subframe_length
34
+
35
+ output_size = frame_step * (frames - 1) + frame_length
36
+ output_subframes = output_size // subframe_length
37
+
38
+ subframe_signal = signal.view(*outer_dimensions, -1, subframe_length)
39
+
40
+ frame = torch.arange(0, output_subframes).unfold(0, subframes_per_frame, subframe_step)
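+ # frame: for each input frame, the output sub-frame indices it is scattered into (consumed by index_add_ below)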
41
+
42
+ frame = frame.clone().detach()
43
+ frame = frame.to(signal.device)
44
+ frame = frame.long()
45
+
46
+ frame = frame.contiguous().view(-1)
47
+
48
+ result = signal.new_zeros(*outer_dimensions, output_subframes, subframe_length)
49
+ result.index_add_(-2, frame, subframe_signal)
50
+ result = result.view(*outer_dimensions, -1)
51
+ return result
52
+
53
+
54
+ if __name__ == "__main__":
55
+ pass
toolbox/torchaudio/models/conv_tasnet/yaml/config.yaml ADDED
@@ -0,0 +1,17 @@
1
+ model_name: "conv_tasnet"
2
+
3
+ sample_rate: 8000
4
+ segment_size: 4
5
+
6
+ win_size: 20
7
+ freq_bins: 256
8
+ bottleneck_channels: 256
9
+ num_speakers: 2
10
+ num_blocks: 4
11
+ num_sub_blocks: 8
12
+ sub_blocks_channels: 512
13
+ sub_blocks_kernel_size: 3
14
+
15
+ norm_type: "gLN"
16
+ causal: false
17
+ mask_nonlinear: "relu"
toolbox/torchaudio/models/demucs/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ if __name__ == '__main__':
6
+ pass
toolbox/torchaudio/models/demucs/configuration_demucs.py ADDED
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ from toolbox.torchaudio.configuration_utils import PretrainedConfig
4
+
5
+
6
+ class DemucsConfig(PretrainedConfig):
7
+ def __init__(self,
8
+ sample_rate: int = 8000,
9
+
10
+ in_channels: int = 1,
11
+ out_channels: int = 1,
12
+ hidden_channels: int = 48,
13
+
14
+ depth: int = 5,
15
+ kernel_size: int = 8,
16
+ stride: int = 4,
17
+
18
+ causal: bool = True,
19
+ resample: int = 4,
20
+ growth: int = 2,
21
+
22
+ max_hidden: int = 10_000,
23
+ do_normalize: bool = True,
24
+ rescale: float = 0.1,
25
+ floor: float = 1e-3,
26
+
27
+ **kwargs
28
+ ):
29
+ super(DemucsConfig, self).__init__(**kwargs)
30
+ self.sample_rate = sample_rate
31
+
32
+ self.in_channels = in_channels
33
+ self.out_channels = out_channels
34
+ self.hidden_channels = hidden_channels
35
+
36
+ self.depth = depth
37
+ self.kernel_size = kernel_size
38
+ self.stride = stride
39
+
40
+ self.causal = causal
41
+ self.resample = resample
42
+ self.growth = growth
43
+
44
+ self.max_hidden = max_hidden
45
+ self.do_normalize = do_normalize
46
+ self.rescale = rescale
47
+ self.floor = floor
48
+
49
+
50
+ if __name__ == "__main__":
51
+ pass
toolbox/torchaudio/models/demucs/modeling_demucs.py ADDED
@@ -0,0 +1,299 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://arxiv.org/abs/2006.12847
5
+
6
+ https://github.com/facebookresearch/denoiser
7
+ """
8
+ import math
9
+ import os
10
+ from typing import List, Optional, Union
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ from torch.nn import functional as F
15
+
16
+ from toolbox.torchaudio.configuration_utils import CONFIG_FILE
17
+ from toolbox.torchaudio.models.demucs.configuration_demucs import DemucsConfig
18
+ from toolbox.torchaudio.models.demucs.resample import upsample2, downsample2
19
+
20
+
21
+ activation_layer_dict = {
22
+ "glu": nn.GLU,
23
+ "relu": nn.ReLU,
24
+ "identity": nn.Identity,
25
+ "sigmoid": nn.Sigmoid,
26
+ }
27
+
28
+
29
+ class BLSTM(nn.Module):
30
+ def __init__(self,
31
+ hidden_size: int,
32
+ num_layers: int = 2,
33
+ bidirectional: bool = True,
34
+ ):
35
+ super().__init__()
36
+ self.lstm = nn.LSTM(bidirectional=bidirectional,
37
+ num_layers=num_layers,
38
+ hidden_size=hidden_size,
39
+ input_size=hidden_size
40
+ )
41
+ self.linear = None
42
+ if bidirectional:
43
+ self.linear = nn.Linear(2 * hidden_size, hidden_size)
44
+
45
+ def forward(self,
46
+ x: torch.Tensor,
47
+ hx: torch.Tensor = None
48
+ ):
49
+ x, hx = self.lstm.forward(x, hx)
50
+ if self.linear:
51
+ x = self.linear(x)
52
+ return x, hx
53
+
54
+
55
+ def rescale_conv(conv, reference):
56
+ std = conv.weight.std().detach()
57
+ scale = (std / reference)**0.5
58
+ conv.weight.data /= scale
59
+ if conv.bias is not None:
60
+ conv.bias.data /= scale
61
+
62
+
63
+ def rescale_module(module, reference):
64
+ for sub in module.modules():
65
+ if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d)):
66
+ rescale_conv(sub, reference)
67
+
68
+
69
+ class DemucsModel(nn.Module):
70
+ def __init__(self,
71
+ in_channels: int = 1,
72
+ out_channels: int = 1,
73
+ hidden_channels: int = 48,
74
+ depth: int = 5,
75
+ kernel_size: int = 8,
76
+ stride: int = 4,
77
+ causal: bool = True,
78
+ resample: int = 4,
79
+ growth: int = 2,
80
+ max_hidden: int = 10_000,
81
+ do_normalize: bool = True,
82
+ rescale: float = 0.1,
83
+ floor: float = 1e-3,
84
+ ):
85
+ super(DemucsModel, self).__init__()
86
+
87
+ self.in_channels = in_channels
88
+ self.out_channels = out_channels
89
+ self.hidden_channels = hidden_channels
90
+
91
+ self.depth = depth
92
+ self.kernel_size = kernel_size
93
+ self.stride = stride
94
+
95
+ self.causal = causal
96
+
97
+ self.resample = resample
98
+ self.growth = growth
99
+ self.max_hidden = max_hidden
100
+ self.do_normalize = do_normalize
101
+ self.rescale = rescale
102
+ self.floor = floor
103
+
104
+ if resample not in [1, 2, 4]:
105
+ raise ValueError("Resample should be 1, 2 or 4.")
106
+
107
+ self.encoder = nn.ModuleList()
108
+ self.decoder = nn.ModuleList()
109
+
110
+ for index in range(depth):
111
+ encode = []
112
+ encode += [
113
+ nn.Conv1d(in_channels, hidden_channels, kernel_size, stride),
114
+ nn.ReLU(),
115
+ nn.Conv1d(hidden_channels, hidden_channels * 2, 1),
116
+ nn.GLU(1),
117
+ ]
118
+ self.encoder.append(nn.Sequential(*encode))
119
+
120
+ decode = []
121
+ decode += [
122
+ nn.Conv1d(hidden_channels, 2 * hidden_channels, 1),
123
+ nn.GLU(1),
124
+ nn.ConvTranspose1d(hidden_channels, out_channels, kernel_size, stride),
125
+ ]
126
+ if index > 0:
127
+ decode.append(nn.ReLU())
128
+ self.decoder.insert(0, nn.Sequential(*decode))
129
+ out_channels = hidden_channels
130
+ in_channels = hidden_channels
131
+ hidden_channels = min(int(growth * hidden_channels), max_hidden)
132
+
133
+ self.lstm = BLSTM(in_channels, bidirectional=not causal)
134
+
135
+ if rescale:
136
+ rescale_module(self, reference=rescale)
137
+
138
+ @staticmethod
139
+ def valid_length(length: int, depth: int, kernel_size: int, stride: int, resample: int):
140
+ """
141
+ Return the nearest valid length to use with the model so that
142
+ no time steps are left over after the convolutions, i.e. for all
143
+ layers, (input_length - kernel_size) % stride == 0.
144
+
145
+ If the mixture has a valid length, the estimated sources
146
+ will have exactly the same length.
147
+ """
148
+ length = math.ceil(length * resample)
149
+ for idx in range(depth):
150
+ length = math.ceil((length - kernel_size) / stride) + 1
151
+ length = max(length, 1)
152
+ for idx in range(depth):
153
+ length = (length - 1) * stride + kernel_size
154
+ length = int(math.ceil(length / resample))
155
+ return int(length)
156
+
157
+ def forward(self, noisy: torch.Tensor):
158
+ """
159
+ :param noisy: Tensor, shape: [batch_size, num_samples] or [batch_size, channels, num_samples]
160
+ :return:
161
+ """
162
+ if noisy.dim() == 2:
163
+ noisy = noisy.unsqueeze(1)
164
+ # noisy shape: [batch_size, channels, num_samples]
165
+
166
+ if self.do_normalize:
167
+ mono = noisy.mean(dim=1, keepdim=True)
168
+ std = mono.std(dim=-1, keepdim=True)
169
+ noisy = noisy / (self.floor + std)
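+ # normalize by the std of the mono mixture; the same std rescales the output at the end of forward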
170
+ else:
171
+ std = 1
172
+
173
+ _, _, length = noisy.shape
174
+ x = noisy
175
+
176
+ length_ = self.valid_length(length, self.depth, self.kernel_size, self.stride, self.resample)
177
+ x = F.pad(x, (0, length_ - length))
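+ # pad to the nearest valid length so that every encoder/decoder layer lines up without leftover samples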
178
+
179
+ if self.resample == 2:
180
+ x = upsample2(x)
181
+ elif self.resample == 4:
182
+ x = upsample2(x)
183
+ x = upsample2(x)
184
+
185
+ skips = []
186
+ for encode in self.encoder:
187
+ x = encode(x)
188
+ skips.append(x)
189
+ x = x.permute(2, 0, 1)
190
+ x, _ = self.lstm(x)
191
+ x = x.permute(1, 2, 0)
192
+
193
+ for decode in self.decoder:
194
+ skip = skips.pop(-1)
195
+ x = x + skip[..., :x.shape[-1]]
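+ # U-Net style skip connection from the matching encoder layer, cropped to the current length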
196
+ x = decode(x)
197
+
198
+ if self.resample == 2:
199
+ x = downsample2(x)
200
+ elif self.resample == 4:
201
+ x = downsample2(x)
202
+ x = downsample2(x)
203
+
204
+ x = x[..., :length]
205
+ return std * x
206
+
207
+
208
+ MODEL_FILE = "model.pt"
209
+
210
+
211
+ class DemucsPretrainedModel(DemucsModel):
212
+ def __init__(self,
213
+ config: DemucsConfig,
214
+ ):
215
+ super(DemucsPretrainedModel, self).__init__(
216
+ # sample_rate=config.sample_rate,
217
+ in_channels=config.in_channels,
218
+ out_channels=config.out_channels,
219
+ hidden_channels=config.hidden_channels,
220
+ depth=config.depth,
221
+ kernel_size=config.kernel_size,
222
+ stride=config.stride,
223
+ causal=config.causal,
224
+ resample=config.resample,
225
+ growth=config.growth,
226
+ max_hidden=config.max_hidden,
227
+ do_normalize=config.do_normalize,
228
+ rescale=config.rescale,
229
+ floor=config.floor,
230
+ )
231
+ self.config = config
232
+
233
+ @classmethod
234
+ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
235
+ config = DemucsConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
236
+
237
+ model = cls(config)
238
+
239
+ if os.path.isdir(pretrained_model_name_or_path):
240
+ ckpt_file = os.path.join(pretrained_model_name_or_path, MODEL_FILE)
241
+ else:
242
+ ckpt_file = pretrained_model_name_or_path
243
+
244
+ with open(ckpt_file, "rb") as f:
245
+ state_dict = torch.load(f, map_location="cpu", weights_only=True)
246
+ model.load_state_dict(state_dict, strict=True)
247
+ return model
248
+
249
+ def save_pretrained(self,
250
+ save_directory: Union[str, os.PathLike],
251
+ state_dict: Optional[dict] = None,
252
+ ):
253
+
254
+ model = self
255
+
256
+ if state_dict is None:
257
+ state_dict = model.state_dict()
258
+
259
+ os.makedirs(save_directory, exist_ok=True)
260
+
261
+ # save state dict
262
+ model_file = os.path.join(save_directory, MODEL_FILE)
263
+ torch.save(state_dict, model_file)
264
+
265
+ # save config
266
+ config_file = os.path.join(save_directory, CONFIG_FILE)
267
+ self.config.to_yaml_file(config_file)
268
+ return save_directory
269
+
270
+
271
+ def main():
272
+ config = DemucsConfig()
273
+ model = DemucsModel(
274
+ in_channels=config.in_channels,
275
+ out_channels=config.out_channels,
276
+ hidden_channels=config.hidden_channels,
277
+ depth=config.depth,
278
+ kernel_size=config.kernel_size,
279
+ stride=config.stride,
280
+ causal=config.causal,
281
+ resample=config.resample,
282
+ growth=config.growth,
283
+ max_hidden=config.max_hidden,
284
+ do_normalize=config.do_normalize,
285
+ rescale=config.rescale,
286
+ floor=config.floor,
287
+ )
288
+
289
+ print(model)
290
+
291
+ noisy = torch.rand(size=(1, 8000*4), dtype=torch.float32)
292
+
293
+ denoise = model.forward(noisy)
294
+ print(denoise.shape)
295
+ return
296
+
297
+
298
+ if __name__ == "__main__":
299
+ main()
toolbox/torchaudio/models/demucs/resample.py ADDED
@@ -0,0 +1,81 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ # All rights reserved.
5
+ #
6
+ # This source code is licensed under the license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+ # author: adefossez
9
+
10
+ import math
11
+
12
+ import torch as th
13
+ from torch.nn import functional as F
14
+
15
+
16
+ def sinc(t):
17
+ """sinc.
18
+
19
+ :param t: the input tensor
20
+ """
21
+ return th.where(t == 0, th.tensor(1., device=t.device, dtype=t.dtype), th.sin(t) / t)
22
+
23
+
24
+ def kernel_upsample2(zeros=56):
25
+ """kernel_upsample2.
26
+
27
+ """
28
+ win = th.hann_window(4 * zeros + 1, periodic=False)
29
+ winodd = win[1::2]
30
+ t = th.linspace(-zeros + 0.5, zeros - 0.5, 2 * zeros)
31
+ t *= math.pi
32
+ kernel = (sinc(t) * winodd).view(1, 1, -1)
33
+ return kernel
34
+
35
+
36
+ def upsample2(x, zeros=56):
37
+ """
38
+ Upsampling the input by 2 using sinc interpolation.
39
+ Smith, Julius, and Phil Gossett. "A flexible sampling-rate conversion method."
40
+ ICASSP'84. IEEE International Conference on Acoustics, Speech, and Signal Processing.
41
+ Vol. 9. IEEE, 1984.
42
+ """
43
+ *other, time = x.shape
44
+ kernel = kernel_upsample2(zeros).to(x)
45
+ out = F.conv1d(x.view(-1, 1, time), kernel, padding=zeros)[..., 1:].view(*other, time)
46
+ y = th.stack([x, out], dim=-1)
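+ # interleave the original and interpolated samples, doubling the length along the last dimension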
47
+ return y.view(*other, -1)
48
+
49
+
50
+ def kernel_downsample2(zeros=56):
51
+ """kernel_downsample2.
52
+
53
+ """
54
+ win = th.hann_window(4 * zeros + 1, periodic=False)
55
+ winodd = win[1::2]
56
+ t = th.linspace(-zeros + 0.5, zeros - 0.5, 2 * zeros)
57
+ t.mul_(math.pi)
58
+ kernel = (sinc(t) * winodd).view(1, 1, -1)
59
+ return kernel
60
+
61
+
62
+ def downsample2(x, zeros=56):
63
+ """
64
+ Downsampling the input by 2 using sinc interpolation.
65
+ Smith, Julius, and Phil Gossett. "A flexible sampling-rate conversion method."
66
+ ICASSP'84. IEEE International Conference on Acoustics, Speech, and Signal Processing.
67
+ Vol. 9. IEEE, 1984.
68
+ """
69
+ if x.shape[-1] % 2 != 0:
70
+ x = F.pad(x, (0, 1))
71
+ xeven = x[..., ::2]
72
+ xodd = x[..., 1::2]
73
+ *other, time = xodd.shape
74
+ kernel = kernel_downsample2(zeros).to(x)
75
+ out = xeven + F.conv1d(xodd.view(-1, 1, time), kernel, padding=zeros)[..., :-1].view(
76
+ *other, time)
77
+ return out.view(*other, -1).mul(0.5)
78
+
79
+
80
+ if __name__ == "__main__":
81
+ pass
toolbox/torchaudio/models/nx_dfnet/configuration_nx_dfnet.py ADDED
@@ -0,0 +1,102 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ from typing import Tuple
4
+
5
+ from toolbox.torchaudio.configuration_utils import PretrainedConfig
6
+
7
+
8
+ class NXDfNetConfig(PretrainedConfig):
9
+ def __init__(self,
10
+ sample_rate: int = 8000,
11
+ freq_bins: int = 256,
12
+ win_size: int = 200,
13
+ hop_size: int = 100,
14
+
15
+ conv_channels: int = 64,
16
+ conv_kernel_size_input: Tuple[int, int] = (3, 3),
17
+ conv_kernel_size_inner: Tuple[int, int] = (1, 3),
18
+ conv_lookahead: int = 0,
19
+
20
+ convt_kernel_size_inner: Tuple[int, int] = (1, 3),
21
+
22
+ embedding_hidden_size: int = 256,
23
+ encoder_combine_op: str = "concat",
24
+
25
+ encoder_emb_skip_op: str = "none",
26
+ encoder_emb_linear_groups: int = 16,
27
+ encoder_emb_hidden_size: int = 256,
28
+
29
+ encoder_linear_groups: int = 32,
30
+
31
+ lsnr_max: int = 30,
32
+ lsnr_min: int = -15,
33
+ norm_tau: float = 1.,
34
+
35
+ decoder_emb_num_layers: int = 3,
36
+ decoder_emb_skip_op: str = "none",
37
+ decoder_emb_linear_groups: int = 16,
38
+ decoder_emb_hidden_size: int = 256,
39
+
40
+ df_decoder_hidden_size: int = 256,
41
+ df_num_layers: int = 2,
42
+ df_order: int = 5,
43
+ df_bins: int = 96,
44
+ df_gru_skip: str = "grouped_linear",
45
+ df_decoder_linear_groups: int = 16,
46
+ df_pathway_kernel_size_t: int = 5,
47
+ df_lookahead: int = 2,
48
+
49
+ use_post_filter: bool = False,
50
+ **kwargs
51
+ ):
52
+ super(NXDfNetConfig, self).__init__(**kwargs)
53
+ # transform
54
+ self.sample_rate = sample_rate
55
+ self.freq_bins = freq_bins
56
+ self.win_size = win_size
57
+ self.hop_size = hop_size
58
+
59
+ # conv
60
+ self.conv_channels = conv_channels
61
+ self.conv_kernel_size_input = conv_kernel_size_input
62
+ self.conv_kernel_size_inner = conv_kernel_size_inner
63
+ self.conv_lookahead = conv_lookahead
64
+
65
+ self.convt_kernel_size_inner = convt_kernel_size_inner
66
+
67
+ self.embedding_hidden_size = embedding_hidden_size
68
+
69
+ # encoder
70
+ self.encoder_emb_skip_op = encoder_emb_skip_op
71
+ self.encoder_emb_linear_groups = encoder_emb_linear_groups
72
+ self.encoder_emb_hidden_size = encoder_emb_hidden_size
73
+
74
+ self.encoder_linear_groups = encoder_linear_groups
75
+ self.encoder_combine_op = encoder_combine_op
76
+
77
+ self.lsnr_max = lsnr_max
78
+ self.lsnr_min = lsnr_min
79
+ self.norm_tau = norm_tau
80
+
81
+ # decoder
82
+ self.decoder_emb_num_layers = decoder_emb_num_layers
83
+ self.decoder_emb_skip_op = decoder_emb_skip_op
84
+ self.decoder_emb_linear_groups = decoder_emb_linear_groups
85
+ self.decoder_emb_hidden_size = decoder_emb_hidden_size
86
+
87
+ # df decoder
88
+ self.df_decoder_hidden_size = df_decoder_hidden_size
89
+ self.df_num_layers = df_num_layers
90
+ self.df_order = df_order
91
+ self.df_bins = df_bins
92
+ self.df_gru_skip = df_gru_skip
93
+ self.df_decoder_linear_groups = df_decoder_linear_groups
94
+ self.df_pathway_kernel_size_t = df_pathway_kernel_size_t
95
+ self.df_lookahead = df_lookahead
96
+
97
+ # runtime
98
+ self.use_post_filter = use_post_filter
99
+
100
+
101
+ if __name__ == "__main__":
102
+ pass
toolbox/torchaudio/models/nx_dfnet/modeling_nx_dfnet.py ADDED
@@ -0,0 +1,989 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import os
4
+ import math
5
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn as nn
10
+ from torch.nn import functional as F
11
+ import torchaudio
12
+
13
+ from toolbox.torchaudio.models.nx_dfnet.utils import overlap_and_add
14
+ from toolbox.torchaudio.models.nx_dfnet.configuration_nx_dfnet import NXDfNetConfig
15
+ from toolbox.torchaudio.configuration_utils import CONFIG_FILE
16
+
17
+
18
+ MODEL_FILE = "model.pt"
19
+
20
+
21
+ norm_layer_dict = {
22
+ "batch_norm_2d": torch.nn.BatchNorm2d
23
+ }
24
+
25
+
26
+ activation_layer_dict = {
27
+ "relu": torch.nn.ReLU,
28
+ "identity": torch.nn.Identity,
29
+ "sigmoid": torch.nn.Sigmoid,
30
+ }
31
+
32
+
33
+ class CausalConv2d(nn.Sequential):
34
+ def __init__(self,
35
+ in_channels: int,
36
+ out_channels: int,
37
+ kernel_size: Union[int, Iterable[int]],
38
+ fstride: int = 1,
39
+ dilation: int = 1,
40
+ fpad: bool = True,
41
+ bias: bool = True,
42
+ separable: bool = False,
43
+ norm_layer: str = "batch_norm_2d",
44
+ activation_layer: str = "relu",
45
+ lookahead: int = 0
46
+ ):
47
+ """
48
+ Causal Conv2d by delaying the signal for any lookahead.
49
+
50
+ Expected input format: [batch_size, channels, time_steps, spec_dim]
51
+
52
+ :param in_channels:
53
+ :param out_channels:
54
+ :param kernel_size:
55
+ :param fstride:
56
+ :param dilation:
57
+ :param fpad:
58
+ """
59
+ super(CausalConv2d, self).__init__()
60
+ kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else tuple(kernel_size)
61
+
62
+ if fpad:
63
+ fpad_ = kernel_size[1] // 2 + dilation - 1
64
+ else:
65
+ fpad_ = 0
66
+
67
+ # for last 2 dim, pad (left, right, top, bottom).
68
+ pad = (0, 0, kernel_size[0] - 1 - lookahead, lookahead)
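+ # pad (kernel_size[0] - 1 - lookahead) past frames and lookahead future frames on the time axis, keeping the convolution causal up to the configured lookahead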
69
+
70
+ layers = list()
71
+ if any(x > 0 for x in pad):
72
+ layers.append(nn.ConstantPad2d(pad, 0.0))
73
+
74
+ groups = math.gcd(in_channels, out_channels) if separable else 1
75
+ if groups == 1:
76
+ separable = False
77
+ if max(kernel_size) == 1:
78
+ separable = False
79
+
80
+ layers.append(
81
+ nn.Conv2d(
82
+ in_channels,
83
+ out_channels,
84
+ kernel_size=kernel_size,
85
+ padding=(0, fpad_),
86
+ stride=(1, fstride), # stride over time is always 1
87
+ dilation=(1, dilation), # dilation over time is always 1
88
+ groups=groups,
89
+ bias=bias,
90
+ )
91
+ )
92
+
93
+ if separable:
94
+ layers.append(
95
+ nn.Conv2d(
96
+ out_channels,
97
+ out_channels,
98
+ kernel_size=1,
99
+ bias=False,
100
+ )
101
+ )
102
+
103
+ if norm_layer is not None:
104
+ norm_layer = norm_layer_dict[norm_layer]
105
+ layers.append(norm_layer(out_channels))
106
+
107
+ if activation_layer is not None:
108
+ activation_layer = activation_layer_dict[activation_layer]
109
+ layers.append(activation_layer())
110
+
111
+ super().__init__(*layers)
112
+
113
+ def forward(self, inputs):
114
+ for module in self:
115
+ inputs = module(inputs)
116
+ return inputs
117
+
118
+
119
+ class CausalConvTranspose2d(nn.Sequential):
120
+ def __init__(self,
121
+ in_channels: int,
122
+ out_channels: int,
123
+ kernel_size: Union[int, Iterable[int]],
124
+ fstride: int = 1,
125
+ dilation: int = 1,
126
+ fpad: bool = True,
127
+ bias: bool = True,
128
+ separable: bool = False,
129
+ norm_layer: str = "batch_norm_2d",
130
+ activation_layer: str = "relu",
131
+ lookahead: int = 0
132
+ ):
133
+ """
134
+ Causal ConvTranspose2d.
135
+
136
+ Expected input format: [batch_size, channels, time_steps, spec_dim]
137
+ """
138
+ super(CausalConvTranspose2d, self).__init__()
139
+
140
+ kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
141
+
142
+ if fpad:
143
+ fpad_ = kernel_size[1] // 2
144
+ else:
145
+ fpad_ = 0
146
+
147
+ # for last 2 dim, pad (left, right, top, bottom).
148
+ pad = (0, 0, kernel_size[0] - 1 - lookahead, lookahead)
149
+
150
+ layers = []
151
+ if any(x > 0 for x in pad):
152
+ layers.append(nn.ConstantPad2d(pad, 0.0))
153
+
154
+ groups = math.gcd(in_channels, out_channels) if separable else 1
155
+ if groups == 1:
156
+ separable = False
157
+
158
+ layers.append(
159
+ nn.ConvTranspose2d(
160
+ in_channels,
161
+ out_channels,
162
+ kernel_size=kernel_size,
163
+ padding=(kernel_size[0] - 1, fpad_ + dilation - 1),
164
+ output_padding=(0, fpad_),
165
+ stride=(1, fstride), # stride over time is always 1
166
+ dilation=(1, dilation), # dilation over time is always 1
167
+ groups=groups,
168
+ bias=bias,
169
+ )
170
+ )
171
+
172
+ if separable:
173
+ layers.append(
174
+ nn.Conv2d(
175
+ out_channels,
176
+ out_channels,
177
+ kernel_size=1,
178
+ bias=False,
179
+ )
180
+ )
181
+
182
+ if norm_layer is not None:
183
+ norm_layer = norm_layer_dict[norm_layer]
184
+ layers.append(norm_layer(out_channels))
185
+
186
+ if activation_layer is not None:
187
+ activation_layer = activation_layer_dict[activation_layer]
188
+ layers.append(activation_layer())
189
+
190
+ super().__init__(*layers)
191
+
192
+
193
+ class GroupedLinear(nn.Module):
194
+
195
+ def __init__(self, input_size: int, hidden_size: int, groups: int = 1):
196
+ super().__init__()
197
+ # self.weight: Tensor
198
+ self.input_size = input_size
199
+ self.hidden_size = hidden_size
200
+ self.groups = groups
201
+ assert input_size % groups == 0, f"Input size {input_size} not divisible by {groups}"
202
+ assert hidden_size % groups == 0, f"Hidden size {hidden_size} not divisible by {groups}"
203
+ self.ws = input_size // groups
204
+ self.register_parameter(
205
+ "weight",
206
+ torch.nn.Parameter(
207
+ torch.zeros(groups, input_size // groups, hidden_size // groups), requires_grad=True
208
+ ),
209
+ )
210
+ self.reset_parameters()
211
+
212
+ def reset_parameters(self):
213
+ nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5)) # type: ignore
214
+
215
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
216
+ # x: [..., I]
217
+ b, t, _ = x.shape
218
+ # new_shape = list(x.shape)[:-1] + [self.groups, self.ws]
219
+ new_shape = (b, t, self.groups, self.ws)
220
+ x = x.view(new_shape)
221
+ # The better way, but not supported by torchscript
222
+ # x = x.unflatten(-1, (self.groups, self.ws)) # [..., G, I/G]
223
+ x = torch.einsum("btgi,gih->btgh", x, self.weight) # [..., G, H/G]
224
+ x = x.flatten(2, 3) # [B, T, H]
225
+ return x
226
+
227
+ def __repr__(self):
228
+ cls = self.__class__.__name__
229
+ return f"{cls}(input_size: {self.input_size}, hidden_size: {self.hidden_size}, groups: {self.groups})"
230
+
231
+
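A minimal sketch of what GroupedLinear computes (sizes are arbitrary examples): the feature dimension is split into `groups` blocks, each block gets its own small weight matrix, and the per-group outputs are concatenated, which cuts the parameter count roughly by a factor of `groups`.

import torch

batch_size, time_steps = 2, 10
input_size, hidden_size, groups = 64, 96, 8

weight = torch.randn(groups, input_size // groups, hidden_size // groups)
x = torch.randn(batch_size, time_steps, input_size)

# Same computation as GroupedLinear.forward: split, per-group matmul, merge.
xg = x.view(batch_size, time_steps, groups, input_size // groups)
y = torch.einsum("btgi,gih->btgh", xg, weight).flatten(2, 3)

print(y.shape)                                    # torch.Size([2, 10, 96])
print(weight.numel(), input_size * hidden_size)   # 768 vs 6144 parameters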
232
+ class SqueezedGRU_S(nn.Module):
233
+ """
234
+ SGE net: Video object detection with squeezed GRU and information entropy map
235
+ https://arxiv.org/abs/2106.07224
236
+ """
237
+
238
+ def __init__(
239
+ self,
240
+ input_size: int,
241
+ hidden_size: int,
242
+ output_size: Optional[int] = None,
243
+ num_layers: int = 1,
244
+ linear_groups: int = 8,
245
+ batch_first: bool = True,
246
+ skip_op: str = "none",
247
+ activation_layer: str = "identity",
248
+ ):
249
+ super().__init__()
250
+ self.input_size = input_size
251
+ self.hidden_size = hidden_size
252
+
253
+ self.linear_in = nn.Sequential(
254
+ GroupedLinear(
255
+ input_size=input_size,
256
+ hidden_size=hidden_size,
257
+ groups=linear_groups,
258
+ ),
259
+ activation_layer_dict[activation_layer](),
260
+ )
261
+
262
+ # gru skip operator
263
+ self.gru_skip_op = None
264
+
265
+ if skip_op == "none":
266
+ self.gru_skip_op = None
267
+ elif skip_op == "identity":
268
+ if input_size != output_size:
269
+ raise AssertionError("Dimensions do not match")
270
+ self.gru_skip_op = nn.Identity()
271
+ elif skip_op == "grouped_linear":
272
+ self.gru_skip_op = GroupedLinear(
273
+ input_size=hidden_size,
274
+ hidden_size=hidden_size,
275
+ groups=linear_groups,
276
+ )
277
+ else:
278
+ raise NotImplementedError()
279
+
280
+ self.gru = nn.GRU(
281
+ input_size=hidden_size,
282
+ hidden_size=hidden_size,
283
+ num_layers=num_layers,
284
+ batch_first=batch_first,
285
+ bidirectional=False,
286
+ )
287
+
288
+ if output_size is not None:
289
+ self.linear_out = nn.Sequential(
290
+ GroupedLinear(
291
+ input_size=hidden_size,
292
+ hidden_size=output_size,
293
+ groups=linear_groups,
294
+ ),
295
+ activation_layer_dict[activation_layer](),
296
+ )
297
+ else:
298
+ self.linear_out = nn.Identity()
299
+
300
+ def forward(self, inputs: torch.Tensor, h=None) -> Tuple[torch.Tensor, torch.Tensor]:
301
+ x = self.linear_in(inputs)
302
+
303
+ x, h = self.gru.forward(x, h)
304
+
305
+ x = self.linear_out(x)
306
+
307
+ if self.gru_skip_op is not None:
308
+ x = x + self.gru_skip_op(inputs)
309
+
310
+ return x, h
311
+
312
+
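Functionally, SqueezedGRU_S compresses the features before the GRU and expands them afterwards, optionally adding a skip from the input. An ungrouped sketch of the same data flow with hypothetical sizes (the real module uses GroupedLinear instead of nn.Linear):

import torch
import torch.nn as nn

input_size, hidden_size, output_size = 256, 64, 256

linear_in = nn.Sequential(nn.Linear(input_size, hidden_size), nn.ReLU())
gru = nn.GRU(hidden_size, hidden_size, num_layers=1, batch_first=True)
linear_out = nn.Linear(hidden_size, output_size)

x = torch.randn(2, 50, input_size)      # [batch, time, features]
y, h = gru(linear_in(x))
y = linear_out(y)

print(y.shape, h.shape)  # torch.Size([2, 50, 256]) torch.Size([1, 2, 64])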
313
+ class Add(nn.Module):
314
+ def forward(self, a, b):
315
+ return a + b
316
+
317
+
318
+ class Concat(nn.Module):
319
+ def forward(self, a, b):
320
+ return torch.cat((a, b), dim=-1)
321
+
322
+
323
+ class DeepSTFT(nn.Module):
324
+ def __init__(self, win_size: int, freq_bins: int):
325
+ super(DeepSTFT, self).__init__()
326
+ self.win_size = win_size
327
+ self.freq_bins = freq_bins
328
+
329
+ self.conv1d_U = nn.Conv1d(
330
+ in_channels=1,
331
+ out_channels=freq_bins * 2,
332
+ kernel_size=win_size,
333
+ stride=win_size // 2,
334
+ bias=False
335
+ )
336
+
337
+ def forward(self, signal: torch.Tensor):
338
+ """
339
+ :param signal: Tensor, shape: [batch_size, num_samples]
340
+ :return: v, Tensor, shape: [batch_size, freq_bins, time_steps, 2],
341
+ where time_steps = (num_samples - win_size) / (win_size / 2) + 1 = 2 * num_samples / win_size - 1
342
+ """
343
+ signal = torch.unsqueeze(signal, 1)
344
+ # signal shape: [batch_size, 1, num_samples]
345
+ spec = F.relu(self.conv1d_U(signal))
346
+ # spec shape: [batch_size, freq_bins * 2, time_steps]
347
+ b, f2, t = spec.shape
348
+ spec = spec.view(b, f2//2, 2, t).permute(0, 1, 3, 2)
349
+ # spec shape: [batch_size, freq_bins, time_steps, 2]
350
+ return spec
351
+
352
+
353
+ class DeepISTFT(nn.Module):
354
+ def __init__(self, win_size: int, freq_bins: int):
355
+ super(DeepISTFT, self).__init__()
356
+ self.win_size = win_size
357
+ self.freq_bins = freq_bins
358
+
359
+ self.basis_signals = nn.Linear(
360
+ in_features=freq_bins * 2,
361
+ out_features=win_size,
362
+ bias=False
363
+ )
364
+
365
+ def forward(self,
366
+ spec: torch.Tensor,
367
+ ):
368
+ """
369
+ :param spec: Tensor, shape: [batch_size, freq_bins, time_steps, 2],
370
+ where time_steps = (num_samples - win_size) / (win_size / 2) + 1 = 2 * num_samples / win_size - 1
371
+ :return: Tensor, shape: [batch_size, 1, num_samples]
372
+ """
373
+ b, f, t, _ = spec.shape
374
+ # spec shape: [b, f, t, 2]
375
+ spec = spec.permute(0, 2, 1, 3)
376
+ # spec shape: [b, t, f, 2]
377
+ spec = spec.view(b, 1, t, -1)
378
+ # spec shape: [b, 1, t, f2]
379
+ signal = self.basis_signals(spec)
380
+ # signal shape: [b, 1, t, win_size]
381
+ signal = overlap_and_add(signal, self.win_size//2)
382
+ # signal shape: [b, 1, num_samples]
383
+ return signal
384
+
385
+
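The time_steps formula quoted in the DeepSTFT/DeepISTFT docstrings can be checked directly against the underlying Conv1d; win_size, freq_bins and num_samples below are assumed example values, not config defaults.

import torch
import torch.nn as nn

win_size, freq_bins, num_samples = 320, 256, 16000

conv1d_U = nn.Conv1d(1, freq_bins * 2, kernel_size=win_size, stride=win_size // 2, bias=False)
signal = torch.randn(4, 1, num_samples)

spec = conv1d_U(signal)
print(spec.shape)                        # torch.Size([4, 512, 99])
print(2 * num_samples // win_size - 1)   # 99, matches the docstring formula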
386
+ class Encoder(nn.Module):
387
+ def __init__(self, config: NXDfNetConfig):
388
+ super(Encoder, self).__init__()
389
+ self.embedding_input_size = config.conv_channels * config.freq_bins // 4
390
+ self.embedding_output_size = config.conv_channels * config.freq_bins // 4
391
+ self.embedding_hidden_size = config.embedding_hidden_size
392
+
393
+ self.spec_conv0 = CausalConv2d(
394
+ in_channels=1,
395
+ out_channels=config.conv_channels,
396
+ kernel_size=config.conv_kernel_size_input,
397
+ bias=False,
398
+ separable=True,
399
+ fstride=1,
400
+ lookahead=config.conv_lookahead,
401
+ )
402
+ self.spec_conv1 = CausalConv2d(
403
+ in_channels=config.conv_channels,
404
+ out_channels=config.conv_channels,
405
+ kernel_size=config.conv_kernel_size_inner,
406
+ bias=False,
407
+ separable=True,
408
+ fstride=2,
409
+ lookahead=config.conv_lookahead,
410
+ )
411
+ self.spec_conv2 = CausalConv2d(
412
+ in_channels=config.conv_channels,
413
+ out_channels=config.conv_channels,
414
+ kernel_size=config.conv_kernel_size_inner,
415
+ bias=False,
416
+ separable=True,
417
+ fstride=2,
418
+ lookahead=config.conv_lookahead,
419
+ )
420
+ self.spec_conv3 = CausalConv2d(
421
+ in_channels=config.conv_channels,
422
+ out_channels=config.conv_channels,
423
+ kernel_size=config.conv_kernel_size_inner,
424
+ bias=False,
425
+ separable=True,
426
+ fstride=1,
427
+ lookahead=config.conv_lookahead,
428
+ )
429
+
430
+ self.df_conv0 = CausalConv2d(
431
+ in_channels=2,
432
+ out_channels=config.conv_channels,
433
+ kernel_size=config.conv_kernel_size_input,
434
+ bias=False,
435
+ separable=True,
436
+ fstride=1,
437
+ )
438
+ self.df_conv1 = CausalConv2d(
439
+ in_channels=config.conv_channels,
440
+ out_channels=config.conv_channels,
441
+ kernel_size=config.conv_kernel_size_inner,
442
+ bias=False,
443
+ separable=True,
444
+ fstride=2,
445
+ )
446
+ self.df_fc_emb = nn.Sequential(
447
+ GroupedLinear(
448
+ config.conv_channels * config.df_bins // 2,
449
+ self.embedding_input_size,
450
+ groups=config.encoder_linear_groups
451
+ ),
452
+ nn.ReLU(inplace=True)
453
+ )
454
+
455
+ if config.encoder_combine_op == "concat":
456
+ self.embedding_input_size *= 2
457
+ self.combine = Concat()
458
+ else:
459
+ self.combine = Add()
460
+
461
+ # emb_gru
462
+ if config.freq_bins % 8 != 0:
463
+ raise AssertionError("freq_bins should be divisible by 8")
464
+
465
+ self.emb_gru = SqueezedGRU_S(
466
+ self.embedding_input_size,
467
+ self.embedding_hidden_size,
468
+ output_size=self.embedding_output_size,
469
+ num_layers=1,
470
+ batch_first=True,
471
+ skip_op=config.encoder_emb_skip_op,
472
+ linear_groups=config.encoder_emb_linear_groups,
473
+ activation_layer="relu",
474
+ )
475
+
476
+ # lsnr
477
+ self.lsnr_fc = nn.Sequential(
478
+ nn.Linear(self.embedding_output_size, 1),
479
+ nn.Sigmoid()
480
+ )
481
+ self.lsnr_scale = config.lsnr_max - config.lsnr_min
482
+ self.lsnr_offset = config.lsnr_min
483
+
484
+ def forward(self,
485
+ power_spec: torch.Tensor,
486
+ df_spec: torch.Tensor,
487
+ hidden_state: torch.Tensor = None,
488
+ ):
489
+ # power_spec shape: (batch_size, 1, time_steps, spec_dim)
490
+ e0 = self.spec_conv0.forward(power_spec)
491
+ e1 = self.spec_conv1.forward(e0)
492
+ e2 = self.spec_conv2.forward(e1)
493
+ e3 = self.spec_conv3.forward(e2)
494
+ # e0 shape: [batch_size, channels, time_steps, spec_dim]
495
+ # e1 shape: [batch_size, channels, time_steps, spec_dim // 2]
496
+ # e2 shape: [batch_size, channels, time_steps, spec_dim // 4]
497
+ # e3 shape: [batch_size, channels, time_steps, spec_dim // 4]
498
+
499
+ # df_spec, shape: (batch_size, 2, time_steps, df_bins)
500
+ c0 = self.df_conv0(df_spec)
501
+ c1 = self.df_conv1(c0)
502
+ # c0 shape: [batch_size, channels, time_steps, df_bins]
503
+ # c1 shape: [batch_size, channels, time_steps, df_bins // 2]
504
+
505
+ cemb = c1.permute(0, 2, 3, 1)
506
+ # cemb shape: [batch_size, time_steps, df_bins // 2, channels]
507
+ cemb = cemb.flatten(2)
508
+ # cemb shape: [batch_size, time_steps, df_bins // 2 * channels]
509
+ cemb = self.df_fc_emb(cemb)
510
+ # cemb shape: [batch_size, time_steps, spec_dim // 4 * channels]
511
+
512
+ # e3 shape: [batch_size, channels, time_steps, spec_dim // 4]
513
+ emb = e3.permute(0, 2, 3, 1)
514
+ # emb shape: [batch_size, time_steps, spec_dim // 4, channels]
515
+ emb = emb.flatten(2)
516
+ # emb shape: [batch_size, time_steps, spec_dim // 4 * channels]
517
+
518
+ emb = self.combine(emb, cemb)
519
+ # if concat; emb shape: [batch_size, time_steps, spec_dim // 4 * channels * 2]
520
+ # if add; emb shape: [batch_size, time_steps, spec_dim // 4 * channels]
521
+
522
+ emb, h = self.emb_gru.forward(emb, hidden_state)
523
+ # emb shape: [batch_size, time_steps, spec_dim // 4 * channels]
524
+ # h shape: [1, batch_size, embedding_hidden_size]
525
+
526
+ lsnr = self.lsnr_fc(emb) * self.lsnr_scale + self.lsnr_offset
527
+ # lsnr shape: [batch_size, time_steps, 1]
528
+
529
+ return e0, e1, e2, e3, emb, c0, lsnr, h
530
+
531
+
532
+ class Decoder(nn.Module):
533
+ def __init__(self, config: NXDfNetConfig):
534
+ super(Decoder, self).__init__()
535
+
536
+ if config.freq_bins % 8 != 0:
537
+ raise AssertionError("freq_bins should be divisible by 8")
538
+
539
+ self.emb_in_dim = config.conv_channels * config.freq_bins // 4
540
+ self.emb_out_dim = config.conv_channels * config.freq_bins // 4
541
+ self.emb_hidden_dim = config.decoder_emb_hidden_size
542
+
543
+ self.emb_gru = SqueezedGRU_S(
544
+ self.emb_in_dim,
545
+ self.emb_hidden_dim,
546
+ output_size=self.emb_out_dim,
547
+ num_layers=config.decoder_emb_num_layers - 1,
548
+ batch_first=True,
549
+ skip_op=config.decoder_emb_skip_op,
550
+ linear_groups=config.decoder_emb_linear_groups,
551
+ activation_layer="relu",
552
+ )
553
+ self.conv3p = CausalConv2d(
554
+ in_channels=config.conv_channels,
555
+ out_channels=config.conv_channels,
556
+ kernel_size=1,
557
+ bias=False,
558
+ separable=True,
559
+ fstride=1,
560
+ lookahead=config.conv_lookahead,
561
+ )
562
+ self.convt3 = CausalConv2d(
563
+ in_channels=config.conv_channels,
564
+ out_channels=config.conv_channels,
565
+ kernel_size=config.conv_kernel_size_inner,
566
+ bias=False,
567
+ separable=True,
568
+ fstride=1,
569
+ lookahead=config.conv_lookahead,
570
+ )
571
+ self.conv2p = CausalConv2d(
572
+ in_channels=config.conv_channels,
573
+ out_channels=config.conv_channels,
574
+ kernel_size=1,
575
+ bias=False,
576
+ separable=True,
577
+ fstride=1,
578
+ lookahead=config.conv_lookahead,
579
+ )
580
+ self.convt2 = CausalConvTranspose2d(
581
+ in_channels=config.conv_channels,
582
+ out_channels=config.conv_channels,
583
+ kernel_size=config.convt_kernel_size_inner,
584
+ bias=False,
585
+ separable=True,
586
+ fstride=2,
587
+ lookahead=config.conv_lookahead,
588
+ )
589
+ self.conv1p = CausalConv2d(
590
+ in_channels=config.conv_channels,
591
+ out_channels=config.conv_channels,
592
+ kernel_size=1,
593
+ bias=False,
594
+ separable=True,
595
+ fstride=1,
596
+ lookahead=config.conv_lookahead,
597
+ )
598
+ self.convt1 = CausalConvTranspose2d(
599
+ in_channels=config.conv_channels,
600
+ out_channels=config.conv_channels,
601
+ kernel_size=config.convt_kernel_size_inner,
602
+ bias=False,
603
+ separable=True,
604
+ fstride=2,
605
+ lookahead=config.conv_lookahead,
606
+ )
607
+ self.conv0p = CausalConv2d(
608
+ in_channels=config.conv_channels,
609
+ out_channels=config.conv_channels,
610
+ kernel_size=1,
611
+ bias=False,
612
+ separable=True,
613
+ fstride=1,
614
+ lookahead=config.conv_lookahead,
615
+ )
616
+ self.conv0_out = CausalConv2d(
617
+ in_channels=config.conv_channels,
618
+ out_channels=1,
619
+ kernel_size=config.conv_kernel_size_inner,
620
+ activation_layer="sigmoid",
621
+ bias=False,
622
+ separable=True,
623
+ fstride=1,
624
+ lookahead=config.conv_lookahead,
625
+ )
626
+
627
+ def forward(self, emb, e3, e2, e1, e0) -> torch.Tensor:
628
+ # Estimates erb mask
629
+ b, _, t, f8 = e3.shape
630
+
631
+ # emb shape: [batch_size, time_steps, (freq_dim // 4) * conv_channels]
632
+ emb, _ = self.emb_gru(emb)
633
+ # emb shape: [batch_size, conv_channels, time_steps, freq_dim // 4]
634
+ emb = emb.view(b, t, f8, -1).permute(0, 3, 1, 2)
635
+ e3 = self.convt3(self.conv3p(e3) + emb)
636
+ # e3 shape: [batch_size, conv_channels, time_steps, freq_dim // 4]
637
+ e2 = self.convt2(self.conv2p(e2) + e3)
638
+ # e2 shape: [batch_size, conv_channels, time_steps, freq_dim // 2]
639
+ e1 = self.convt1(self.conv1p(e1) + e2)
640
+ # e1 shape: [batch_size, conv_channels, time_steps, freq_dim]
641
+ mask = self.conv0_out(self.conv0p(e0) + e1)
642
+ # mask shape: [batch_size, 1, time_steps, freq_dim]
643
+ return mask
644
+
645
+
646
+ class DfDecoder(nn.Module):
647
+ def __init__(self, config: NXDfNetConfig):
648
+ super(DfDecoder, self).__init__()
649
+
650
+ self.embedding_input_size = config.conv_channels * config.freq_bins // 4
651
+ self.df_decoder_hidden_size = config.df_decoder_hidden_size
652
+ self.df_num_layers = config.df_num_layers
653
+
654
+ self.df_order = config.df_order
655
+
656
+ self.df_bins = config.df_bins
657
+ self.df_out_ch = config.df_order * 2
658
+
659
+ self.df_convp = CausalConv2d(
660
+ config.conv_channels,
661
+ self.df_out_ch,
662
+ fstride=1,
663
+ kernel_size=(config.df_pathway_kernel_size_t, 1),
664
+ separable=True,
665
+ bias=False,
666
+ )
667
+ self.df_gru = SqueezedGRU_S(
668
+ self.embedding_input_size,
669
+ self.df_decoder_hidden_size,
670
+ num_layers=self.df_num_layers,
671
+ batch_first=True,
672
+ skip_op="none",
673
+ activation_layer="relu",
674
+ )
675
+
676
+ if config.df_gru_skip == "none":
677
+ self.df_skip = None
678
+ elif config.df_gru_skip == "identity":
679
+ if config.embedding_hidden_size != config.df_decoder_hidden_size:
680
+ raise AssertionError("Dimensions do not match")
681
+ self.df_skip = nn.Identity()
682
+ elif config.df_gru_skip == "grouped_linear":
683
+ self.df_skip = GroupedLinear(
684
+ self.embedding_input_size,
685
+ self.df_decoder_hidden_size,
686
+ groups=config.df_decoder_linear_groups
687
+ )
688
+ else:
689
+ raise NotImplementedError()
690
+
691
+ self.df_out: nn.Module
692
+ out_dim = self.df_bins * self.df_out_ch
693
+
694
+ self.df_out = nn.Sequential(
695
+ GroupedLinear(
696
+ input_size=self.df_decoder_hidden_size,
697
+ hidden_size=out_dim,
698
+ groups=config.df_decoder_linear_groups
699
+ ),
700
+ nn.Tanh()
701
+ )
702
+ self.df_fc_a = nn.Sequential(
703
+ nn.Linear(self.df_decoder_hidden_size, 1),
704
+ nn.Sigmoid()
705
+ )
706
+
707
+ def forward(self, emb: torch.Tensor, c0: torch.Tensor) -> torch.Tensor:
708
+ # emb shape: [batch_size, time_steps, df_bins // 4 * channels]
709
+ b, t, _ = emb.shape
710
+ df_coefs, _ = self.df_gru(emb)
711
+ if self.df_skip is not None:
712
+ df_coefs = df_coefs + self.df_skip(emb)
713
+ # df_coefs shape: [batch_size, time_steps, df_decoder_hidden_size]
714
+
715
+ # c0 shape: [batch_size, channels, time_steps, df_bins]
716
+ c0 = self.df_convp(c0)
717
+ # c0 shape: [batch_size, df_order * 2, time_steps, df_bins]
718
+ c0 = c0.permute(0, 2, 3, 1)
719
+ # c0 shape: [batch_size, time_steps, df_bins, df_order * 2]
720
+
721
+ df_coefs = self.df_out(df_coefs) # [B, T, F*O*2], O: df_order
722
+ # df_coefs shape: [batch_size, time_steps, df_bins * df_order * 2]
723
+ df_coefs = df_coefs.view(b, t, self.df_bins, self.df_out_ch)
724
+ # df_coefs shape: [batch_size, time_steps, df_bins, df_order * 2]
725
+ df_coefs = df_coefs + c0
726
+ # df_coefs shape: [batch_size, time_steps, df_bins, df_order * 2]
727
+ return df_coefs
728
+
729
+
730
+ class DfOutputReshapeMF(nn.Module):
731
+ """Coefficients output reshape for multiframe/MultiFrameModule
732
+
733
+ Reshapes coefficients of shape [B, T, F, O*2] to [B, O, T, F, 2] as expected by the multi-frame filtering module.
734
+ """
735
+
736
+ def __init__(self, df_order: int, df_bins: int):
737
+ super().__init__()
738
+ self.df_order = df_order
739
+ self.df_bins = df_bins
740
+
741
+ def forward(self, coefs: torch.Tensor) -> torch.Tensor:
742
+ # [B, T, F, O*2] -> [B, O, T, F, 2]
743
+ new_shape = list(coefs.shape)
744
+ new_shape[-1] = -1
745
+ new_shape.append(2)
746
+ coefs = coefs.view(new_shape)
747
+ coefs = coefs.permute(0, 3, 1, 2, 4)
748
+ return coefs
749
+
750
+
751
+ class Mask(nn.Module):
752
+ def __init__(self, use_post_filter: bool = False, eps: float = 1e-12):
753
+ super().__init__()
754
+ self.use_post_filter = use_post_filter
755
+ self.eps = eps
756
+
757
+ def post_filter(self, mask: torch.Tensor, beta: float = 0.02) -> torch.Tensor:
758
+ """
759
+ Post-Filter
760
+
761
+ A Perceptually-Motivated Approach for Low-Complexity, Real-Time Enhancement of Fullband Speech.
762
+ https://arxiv.org/abs/2008.04259
763
+
764
+ :param mask: Real valued mask, typically of shape [B, C, T, F].
765
+ :param beta: Global gain factor.
766
+ :return:
767
+ """
768
+ mask_sin = mask * torch.sin(np.pi * mask / 2)
769
+ mask_pf = (1 + beta) * mask / (1 + beta * mask.div(mask_sin.clamp_min(self.eps)).pow(2))
770
+ return mask_pf
771
+
772
+ def forward(self, spec: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
773
+ # spec shape: [batch_size, 1, time_steps, freq_bins, 2]
774
+
775
+ if not self.training and self.use_post_filter:
776
+ mask = self.post_filter(mask)
777
+
778
+ # mask shape: [batch_size, 1, time_steps, freq_bins]
779
+ mask = mask.unsqueeze(4)
780
+ # mask shape: [batch_size, 1, time_steps, freq_bins, 1]
781
+ return spec * mask
782
+
783
+
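A quick numeric check of the post-filter above (standalone sketch using the default beta and eps): small mask values are attenuated further while values near 1 are left essentially unchanged.

import torch

mask = torch.tensor([0.1, 0.5, 0.9, 1.0])
beta, eps = 0.02, 1e-12

mask_sin = mask * torch.sin(torch.pi * mask / 2)
mask_pf = (1 + beta) * mask / (1 + beta * mask.div(mask_sin.clamp_min(eps)).pow(2))

print(mask_pf)  # roughly [0.06, 0.49, 0.90, 1.00]: low gains suppressed, high gains kept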
784
+ class DeepFiltering(nn.Module):
785
+ def __init__(self,
786
+ df_bins: int,
787
+ df_order: int,
788
+ lookahead: int = 0,
789
+ ):
790
+ super(DeepFiltering, self).__init__()
791
+ self.df_bins = df_bins
792
+ self.df_order = df_order
793
+ self.need_unfold = df_order > 1
794
+ self.lookahead = lookahead
795
+
796
+ self.pad = nn.ConstantPad2d((0, 0, df_order - 1 - lookahead, lookahead), 0.0)
797
+
798
+ def spec_unfold(self, spec: torch.Tensor):
799
+ """
800
+ Pads and unfolds the spectrogram according to frame_size.
801
+ :param spec: complex Tensor, Spectrogram of shape [B, C, T, F].
802
+ :return: Tensor, Unfolded spectrogram of shape [B, C, T, F, N], where N: frame_size.
803
+ """
804
+ if self.need_unfold:
805
+ # spec shape: [batch_size, 1, time_steps, freq_bins] (complex)
806
+ spec_pad = self.pad(spec)
807
+ # spec_pad shape: [batch_size, 1, time_steps_pad, freq_bins]
808
+ spec_unfold = spec_pad.unfold(2, self.df_order, 1)
809
+ # spec_unfold shape: [batch_size, 1, time_steps, freq_bins, df_order]
810
+ return spec_unfold
811
+ else:
812
+ return spec.unsqueeze(-1)
813
+
814
+ def forward(self,
815
+ spec: torch.Tensor,
816
+ coefs: torch.Tensor,
817
+ ):
818
+ # spec shape: [batch_size, 1, time_steps, freq_bins, 2]
819
+ spec = spec.contiguous()
820
+ spec_u = self.spec_unfold(torch.view_as_complex(spec))
821
+ # spec_u shape: [batch_size, 1, time_steps, freq_bins, df_order]
822
+
823
+ # coefs shape: [batch_size, df_order, time_steps, df_bins, 2]
824
+ coefs = torch.view_as_complex(coefs)
825
+ # coefs shape: [batch_size, df_order, time_steps, df_bins]
826
+ spec_f = spec_u.narrow(-2, 0, self.df_bins)
827
+ # spec_f shape: [batch_size, 1, time_steps, df_bins, df_order]
828
+
829
+ coefs = coefs.view(coefs.shape[0], -1, self.df_order, *coefs.shape[2:])
830
+ # coefs shape: [batch_size, 1, df_order, time_steps, df_bins]
831
+
832
+ spec_f = self.df(spec_f, coefs)
833
+ # spec_f shape: [batch_size, 1, time_steps, df_bins]
834
+
835
+ if self.training:
836
+ spec = spec.clone()
837
+ spec[..., :self.df_bins, :] = torch.view_as_real(spec_f)
838
+ # spec shape: [batch_size, 1, time_steps, freq_bins, 2]
839
+ return spec
840
+
841
+ @staticmethod
842
+ def df(spec: torch.Tensor, coefs: torch.Tensor) -> torch.Tensor:
843
+ """
844
+ Deep filter implementation using `torch.einsum`. Requires unfolded spectrogram.
845
+ :param spec: (complex Tensor). Spectrogram of shape [B, C, T, F, N].
846
+ :param coefs: (complex Tensor). Coefficients of shape [B, C, N, T, F].
847
+ :return: (complex Tensor). Spectrogram of shape [B, C, T, F].
848
+ """
849
+ return torch.einsum("...tfn,...ntf->...tf", spec, coefs)
850
+
851
+
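The einsum in DeepFiltering.df is an order-N complex FIR filter along the unfolded frame axis; for each (t, f) bin it is just a dot product over the df_order taps. A standalone equivalence check with random complex tensors (sizes are illustrative):

import torch

B, C, T, F, N = 1, 1, 10, 96, 5   # N = df_order

spec_u = torch.randn(B, C, T, F, N, dtype=torch.complex64)   # unfolded spectrogram
coefs = torch.randn(B, C, N, T, F, dtype=torch.complex64)    # filter coefficients

out = torch.einsum("...tfn,...ntf->...tf", spec_u, coefs)

# Explicit sum over the filter taps gives the same result.
ref = sum(spec_u[..., n] * coefs[:, :, n] for n in range(N))
print(out.shape, torch.allclose(out, ref))   # torch.Size([1, 1, 10, 96]) True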
852
+ class NXDfNet(nn.Module):
853
+ def __init__(self, config: NXDfNetConfig):
854
+ super(NXDfNet, self).__init__()
855
+ self.config = config
856
+
857
+ self.stft = DeepSTFT(win_size=config.win_size, freq_bins=config.freq_bins)
858
+ self.istft = DeepISTFT(win_size=config.win_size, freq_bins=config.freq_bins)
859
+
860
+ self.encoder = Encoder(config)
861
+ self.decoder = Decoder(config)
862
+
863
+ self.df_decoder = DfDecoder(config)
864
+ self.df_out_transform = DfOutputReshapeMF(config.df_order, config.df_bins)
865
+ self.df_op = DeepFiltering(
866
+ df_bins=config.df_bins,
867
+ df_order=config.df_order,
868
+ lookahead=config.df_lookahead,
869
+ )
870
+
871
+ self.mask = Mask(use_post_filter=config.use_post_filter)
872
+
873
+ def forward(self,
874
+ noisy: torch.Tensor,
875
+ ):
876
+ """
877
+ :param noisy: Tensor, shape: [batch_size, num_samples]
878
+ :return: denoise, Tensor, shape: [batch_size, 1, num_samples]
879
+ """
880
+ spec = self.stft.forward(noisy)
881
+ # spec shape: [batch_size, freq_bins, time_steps, 2]
882
+ power_spec = torch.sum(torch.square(spec), dim=-1)
883
+ power_spec = power_spec.unsqueeze(1).permute(0, 1, 3, 2)
884
+ # power_spec shape: [batch_size, freq_bins, time_steps]
885
+ # power_spec shape: [batch_size, 1, freq_bins, time_steps]
886
+ # power_spec shape: [batch_size, 1, time_steps, freq_bins]
887
+
888
+ df_spec = spec.permute(0, 3, 2, 1)
889
+ # df_spec shape: [batch_size, 2, time_steps, freq_bins]
890
+ df_spec = df_spec[..., :self.df_decoder.df_bins]
891
+ # df_spec shape: [batch_size, 2, time_steps, df_bins]
892
+
893
+ # spec shape: [batch_size, freq_bins, time_steps, 2]
894
+ spec = torch.transpose(spec, dim0=1, dim1=2)
895
+ # spec shape: [batch_size, time_steps, freq_bins, 2]
896
+ spec = torch.unsqueeze(spec, dim=1)
897
+ # spec shape: [batch_size, 1, time_steps, freq_bins, 2]
898
+
899
+ e0, e1, e2, e3, emb, c0, _, h = self.encoder.forward(power_spec, df_spec)
900
+
901
+ mask = self.decoder.forward(emb, e3, e2, e1, e0)
902
+ # mask shape: [batch_size, 1, time_steps, freq_bins]
903
+ if torch.any(mask > 1) or torch.any(mask < 0):
904
+ raise AssertionError
905
+
906
+ spec_m = self.mask.forward(spec, mask)
907
+
908
+ # lsnr shape: [batch_size, time_steps, 1]
909
+ # lsnr = torch.transpose(lsnr, dim0=2, dim1=1)
910
+ # lsnr shape: [batch_size, 1, time_steps]
911
+
912
+ df_coefs = self.df_decoder.forward(emb, c0)
913
+ df_coefs = self.df_out_transform(df_coefs)
914
+ # df_coefs shape: [batch_size, df_order, time_steps, df_bins, 2]
915
+
916
+ spec_e = self.df_op.forward(spec.clone(), df_coefs)
917
+ # spec_e shape: [batch_size, 1, time_steps, freq_bins, 2]
918
+
919
+ spec_e[..., self.df_decoder.df_bins:, :] = spec_m[..., self.df_decoder.df_bins:, :]
920
+
921
+ spec_e = torch.squeeze(spec_e, dim=1)
922
+ spec_e = spec_e.permute(0, 2, 1, 3)
923
+ # spec_e shape: [batch_size, freq_bins, time_steps, 2]
924
+
925
+ denoise = self.istft.forward(spec_e)
926
+ # denoise shape: [batch_size, 1, num_samples]
927
+ return denoise
928
+
929
+
930
+ class NXDfNetPretrainedModel(NXDfNet):
931
+ def __init__(self,
932
+ config: NXDfNetConfig,
933
+ ):
934
+ super(NXDfNetPretrainedModel, self).__init__(
935
+ config=config,
936
+ )
937
+
938
+ @classmethod
939
+ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
940
+ config = NXDfNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
941
+
942
+ model = cls(config)
943
+
944
+ if os.path.isdir(pretrained_model_name_or_path):
945
+ ckpt_file = os.path.join(pretrained_model_name_or_path, MODEL_FILE)
946
+ else:
947
+ ckpt_file = pretrained_model_name_or_path
948
+
949
+ with open(ckpt_file, "rb") as f:
950
+ state_dict = torch.load(f, map_location="cpu", weights_only=True)
951
+ model.load_state_dict(state_dict, strict=True)
952
+ return model
953
+
954
+ def save_pretrained(self,
955
+ save_directory: Union[str, os.PathLike],
956
+ state_dict: Optional[dict] = None,
957
+ ):
958
+
959
+ model = self
960
+
961
+ if state_dict is None:
962
+ state_dict = model.state_dict()
963
+
964
+ os.makedirs(save_directory, exist_ok=True)
965
+
966
+ # save state dict
967
+ model_file = os.path.join(save_directory, MODEL_FILE)
968
+ torch.save(state_dict, model_file)
969
+
970
+ # save config
971
+ config_file = os.path.join(save_directory, CONFIG_FILE)
972
+ self.config.to_yaml_file(config_file)
973
+ return save_directory
974
+
975
+
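Assuming the repository root is on PYTHONPATH, a save/load round trip with the pretrained wrapper might look like the sketch below (the import paths follow the file layout of this commit; the target directory is arbitrary):

import torch

from toolbox.torchaudio.models.nx_dfnet.configuration_nx_dfnet import NXDfNetConfig
from toolbox.torchaudio.models.nx_dfnet.modeling_nx_dfnet import NXDfNetPretrainedModel

config = NXDfNetConfig()
model = NXDfNetPretrainedModel(config)

# Writes the state dict and the yaml config, then restores both into a new instance.
save_dir = model.save_pretrained("trained_models/nx_dfnet_demo")
restored = NXDfNetPretrainedModel.from_pretrained(save_dir)

noisy = torch.randn(1, 16000)
print(restored.forward(noisy).shape)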
976
+ def main():
977
+
978
+ config = NXDfNetConfig()
979
+ model = NXDfNet(config=config)
980
+
981
+ inputs = torch.randn(size=(1, 16000), dtype=torch.float32)
982
+
983
+ denoise = model.forward(inputs)
984
+ print(denoise.shape)
985
+ return
986
+
987
+
988
+ if __name__ == "__main__":
989
+ main()
toolbox/torchaudio/models/nx_dfnet/utils.py ADDED
@@ -0,0 +1,55 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://github.com/kaituoxu/Conv-TasNet/blob/master/src/utils.py
5
+ """
6
+ import math
7
+ import torch
8
+
9
+
10
+ def overlap_and_add(signal: torch.Tensor, frame_step: int):
11
+ """
12
+ Reconstructs a signal from a framed representation.
13
+
14
+ Adds potentially overlapping frames of a signal with shape
15
+ `[..., frames, frame_length]`, offsetting subsequent frames by `frame_step`.
16
+ The resulting tensor has shape `[..., output_size]` where
17
+
18
+ output_size = (frames - 1) * frame_step + frame_length
19
+
20
+ Based on https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/contrib/signal/python/ops/reconstruction_ops.py
21
+
22
+ :param signal: Tensor, shape: [..., frames, frame_length]. All dimensions may be unknown, and rank must be at least 2.
23
+ :param frame_step: int, overlap offsets. Must be less than or equal to frame_length.
24
+ :return: Tensor, shape: [..., output_size].
25
+ containing the overlap-added frames of signal's inner-most two dimensions.
26
+ output_size = (frames - 1) * frame_step + frame_length
27
+ """
28
+ outer_dimensions = signal.size()[:-2]
29
+ frames, frame_length = signal.size()[-2:]
30
+
31
+ subframe_length = math.gcd(frame_length, frame_step) # gcd=Greatest Common Divisor
32
+ subframe_step = frame_step // subframe_length
33
+ subframes_per_frame = frame_length // subframe_length
34
+
35
+ output_size = frame_step * (frames - 1) + frame_length
36
+ output_subframes = output_size // subframe_length
37
+
38
+ subframe_signal = signal.view(*outer_dimensions, -1, subframe_length)
39
+
40
+ frame = torch.arange(0, output_subframes).unfold(0, subframes_per_frame, subframe_step)
41
+
42
+ frame = frame.clone().detach()
43
+ frame = frame.to(signal.device)
44
+ frame = frame.long()
45
+
46
+ frame = frame.contiguous().view(-1)
47
+
48
+ result = signal.new_zeros(*outer_dimensions, output_subframes, subframe_length)
49
+ result.index_add_(-2, frame, subframe_signal)
50
+ result = result.view(*outer_dimensions, -1)
51
+ return result
52
+
53
+
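A minimal usage check for overlap_and_add (run from within this module, or import it; frame sizes are arbitrary): the output length matches (frames - 1) * frame_step + frame_length.

import torch

frames, frame_length, frame_step = 9, 320, 160
signal = torch.randn(2, 1, frames, frame_length)

out = overlap_and_add(signal, frame_step)
print(out.shape)  # torch.Size([2, 1, 1600]) == (9 - 1) * 160 + 320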
54
+ if __name__ == "__main__":
55
+ pass