Spaces:

qgyd2021
/

cc_denoise

Running

App Files Community

HoneyTian committed on May 16

Commit

2171fed

1 Parent(s): a645af7

add dfnet2

Browse files

Files changed (19) hide show

examples/{clean_unet_aishell → clean_unet}/run.sh +0 -0
examples/{clean_unet_aishell → clean_unet}/step_1_prepare_data.py +0 -0
examples/{clean_unet_aishell → clean_unet}/step_2_train_model.py +0 -0
examples/{clean_unet_aishell → clean_unet}/step_3_evaluation.py +0 -0
examples/{clean_unet_aishell → clean_unet}/yaml/config.yaml +0 -0
examples/conv_tasnet/step_2_train_model.py +2 -0
examples/conv_tasnet_gan/run.sh +0 -156
examples/conv_tasnet_gan/step_1_prepare_data.py +0 -162
examples/conv_tasnet_gan/step_2_train_model.py +0 -582
examples/conv_tasnet_gan/yaml/config.yaml +0 -31
examples/conv_tasnet_gan/yaml/discriminator_config.yaml +0 -10
examples/dfnet/step_2_train_model.py +2 -0
examples/dfnet2/step_2_train_model.py +2 -0
examples/dtln/step_2_train_model.py +2 -0
examples/frcrn/step_2_train_model.py +2 -0
examples/lstm/step_2_train_model.py +2 -0
examples/rnnoise/step_2_train_model.py +3 -0
toolbox/torchaudio/models/dfnet2/modeling_dfnet2.py +3 -0
toolbox/torchaudio/modules/utils/ema.py +87 -0

examples/{clean_unet_aishell → clean_unet}/run.sh RENAMED Viewed

File without changes

examples/{clean_unet_aishell → clean_unet}/step_1_prepare_data.py RENAMED Viewed

File without changes

examples/{clean_unet_aishell → clean_unet}/step_2_train_model.py RENAMED Viewed

File without changes

examples/{clean_unet_aishell → clean_unet}/step_3_evaluation.py RENAMED Viewed

File without changes

examples/{clean_unet_aishell → clean_unet}/yaml/config.yaml RENAMED Viewed

File without changes

examples/conv_tasnet/step_2_train_model.py CHANGED Viewed

@@ -346,6 +346,7 @@ def main():
             # evaluation
             step_idx += 1
             if step_idx % config.eval_steps == 0:
                 with torch.no_grad():
                     torch.cuda.empty_cache()
@@ -499,6 +500,7 @@ def main():
                     # early stop
                     if early_stop_flag:
                         break
     return

             # evaluation
             step_idx += 1
             if step_idx % config.eval_steps == 0:
+                model.eval()
                 with torch.no_grad():
                     torch.cuda.empty_cache()
                     # early stop
                     if early_stop_flag:
                         break
+                model.train()
     return

examples/conv_tasnet_gan/run.sh DELETED Viewed

@@ -1,156 +0,0 @@
-#!/usr/bin/env bash
-: <<'END'
-sh run.sh --stage 2 --stop_stage 2 --system_version centos --file_folder_name file_dir --final_model_name conv-tasnet-dns3-20250319 \
---noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise/dns3-noise" \
---speech_dir "/data/tianxing/HuggingDatasets/nx_noise/data/speech/dns3-speech" \
---max_epochs 400
-END
-# params
-system_version="windows";
-verbose=true;
-stage=0 # start from 0 if you need to start from data preparation
-stop_stage=9
-work_dir="$(pwd)"
-file_folder_name=file_folder_name
-final_model_name=final_model_name
-config_file="yaml/config.yaml"
-discriminator_config_file="yaml/discriminator_config.yaml"
-limit=10
-noise_dir=/data/tianxing/HuggingDatasets/nx_noise/data/noise
-speech_dir=/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train
-max_count=10000000
-nohup_name=nohup.out
-# model params
-batch_size=64
-max_epochs=200
-save_top_k=10
-patience=5
-# parse options
-while true; do
-  [ -z "${1:-}" ] && break;  # break if there are no arguments
-  case "$1" in
-    --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
-      eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
-      old_value="(eval echo \\$$name)";
-      if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
-        was_bool=true;
-      else
-        was_bool=false;
-      fi
-      # Set the variable to the right value-- the escaped quotes make it work if
-      # the option had spaces, like --cmd "queue.pl -sync y"
-      eval "${name}=\"$2\"";
-      # Check that Boolean-valued arguments are really Boolean.
-      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
-        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
-        exit 1;
-      fi
-      shift 2;
-      ;;
-    *) break;
-  esac
-done
-file_dir="${work_dir}/${file_folder_name}"
-final_model_dir="${work_dir}/../../trained_models/${final_model_name}";
-evaluation_audio_dir="${file_dir}/evaluation_audio"
-train_dataset="${file_dir}/train.jsonl"
-valid_dataset="${file_dir}/valid.jsonl"
-$verbose && echo "system_version: ${system_version}"
-$verbose && echo "file_folder_name: ${file_folder_name}"
-if [ $system_version == "windows" ]; then
-  alias python3='D:/Users/tianx/PycharmProjects/virtualenv/nx_denoise/Scripts/python.exe'
-elif [ $system_version == "centos" ] || [ $system_version == "ubuntu" ]; then
-  #source /data/local/bin/nx_denoise/bin/activate
-  alias python3='/data/local/bin/nx_denoise/bin/python3'
-fi
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-  $verbose && echo "stage 1: prepare data"
-  cd "${work_dir}" || exit 1
-  python3 step_1_prepare_data.py \
-  --file_dir "${file_dir}" \
-  --noise_dir "${noise_dir}" \
-  --speech_dir "${speech_dir}" \
-  --train_dataset "${train_dataset}" \
-  --valid_dataset "${valid_dataset}" \
-  --max_count "${max_count}" \
-fi
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-  $verbose && echo "stage 2: train model"
-  cd "${work_dir}" || exit 1
-  python3 step_2_train_model.py \
-  --train_dataset "${train_dataset}" \
-  --valid_dataset "${valid_dataset}" \
-  --serialization_dir "${file_dir}" \
-  --config_file "${config_file}" \
-  --discriminator_config_file "${discriminator_config_file}" \
-fi
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
-  $verbose && echo "stage 3: test model"
-  cd "${work_dir}" || exit 1
-  python3 step_3_evaluation.py \
-  --valid_dataset "${valid_dataset}" \
-  --model_dir "${file_dir}/best" \
-  --evaluation_audio_dir "${evaluation_audio_dir}" \
-  --limit "${limit}" \
-fi
-if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
-  $verbose && echo "stage 4: collect files"
-  cd "${work_dir}" || exit 1
-  mkdir -p ${final_model_dir}
-  cp "${file_dir}/best"/* "${final_model_dir}"
-  cp -r "${file_dir}/evaluation_audio" "${final_model_dir}"
-  cd "${final_model_dir}/.." || exit 1;
-  if [ -e "${final_model_name}.zip" ]; then
-    rm -rf "${final_model_name}_backup.zip"
-    mv "${final_model_name}.zip" "${final_model_name}_backup.zip"
-  fi
-  zip -r "${final_model_name}.zip" "${final_model_name}"
-  rm -rf "${final_model_name}"
-fi
-if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-  $verbose && echo "stage 5: clear file_dir"
-  cd "${work_dir}" || exit 1
-  rm -rf "${file_dir}";
-fi

examples/conv_tasnet_gan/step_1_prepare_data.py DELETED Viewed

@@ -1,162 +0,0 @@
-#!/usr/bin/python3
-# -*- coding: utf-8 -*-
-import argparse
-import json
-import os
-from pathlib import Path
-import random
-import sys
-pwd = os.path.abspath(os.path.dirname(__file__))
-sys.path.append(os.path.join(pwd, "../../"))
-import librosa
-import numpy as np
-from tqdm import tqdm
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--file_dir", default="./", type=str)
-    parser.add_argument(
-        "--noise_dir",
-        default=r"E:\Users\tianx\HuggingDatasets\nx_noise\data\noise",
-        type=str
-    )
-    parser.add_argument(
-        "--speech_dir",
-        default=r"E:\programmer\asr_datasets\aishell\data_aishell\wav\train",
-        type=str
-    )
-    parser.add_argument("--train_dataset", default="train.jsonl", type=str)
-    parser.add_argument("--valid_dataset", default="valid.jsonl", type=str)
-    parser.add_argument("--duration", default=4.0, type=float)
-    parser.add_argument("--min_snr_db", default=-10, type=float)
-    parser.add_argument("--max_snr_db", default=20, type=float)
-    parser.add_argument("--target_sample_rate", default=8000, type=int)
-    parser.add_argument("--max_count", default=10000, type=int)
-    args = parser.parse_args()
-    return args
-def filename_generator(data_dir: str):
-    data_dir = Path(data_dir)
-    for filename in data_dir.glob("**/*.wav"):
-        yield filename.as_posix()
-def target_second_signal_generator(data_dir: str, duration: int = 2, sample_rate: int = 8000, max_epoch: int = 20000):
-    data_dir = Path(data_dir)
-    for epoch_idx in range(max_epoch):
-        for filename in data_dir.glob("**/*.wav"):
-            signal, _ = librosa.load(filename.as_posix(), sr=sample_rate)
-            raw_duration = librosa.get_duration(y=signal, sr=sample_rate)
-            if raw_duration < duration:
-                # print(f"duration less than {duration} s. skip filename: {filename.as_posix()}")
-                continue
-            if signal.ndim != 1:
-                raise AssertionError(f"expected ndim 1, instead of {signal.ndim}")
-            signal_length = len(signal)
-            win_size = int(duration * sample_rate)
-            for begin in range(0, signal_length - win_size, win_size):
-                if np.sum(signal[begin: begin+win_size]) == 0:
-                    continue
-                row = {
-                    "epoch_idx": epoch_idx,
-                    "filename": filename.as_posix(),
-                    "raw_duration": round(raw_duration, 4),
-                    "offset": round(begin / sample_rate, 4),
-                    "duration": round(duration, 4),
-                }
-                yield row
-def main():
-    args = get_args()
-    file_dir = Path(args.file_dir)
-    file_dir.mkdir(exist_ok=True)
-    noise_dir = Path(args.noise_dir)
-    speech_dir = Path(args.speech_dir)
-    noise_generator = target_second_signal_generator(
-        noise_dir.as_posix(),
-        duration=args.duration,
-        sample_rate=args.target_sample_rate,
-        max_epoch=100000,
-    )
-    speech_generator = target_second_signal_generator(
-        speech_dir.as_posix(),
-        duration=args.duration,
-        sample_rate=args.target_sample_rate,
-        max_epoch=1,
-    )
-    dataset = list()
-    count = 0
-    process_bar = tqdm(desc="build dataset excel")
-    with open(args.train_dataset, "w", encoding="utf-8") as ftrain, open(args.valid_dataset, "w", encoding="utf-8") as fvalid:
-        for noise, speech in zip(noise_generator, speech_generator):
-            if count >= args.max_count:
-                break
-            noise_filename = noise["filename"]
-            noise_raw_duration = noise["raw_duration"]
-            noise_offset = noise["offset"]
-            noise_duration = noise["duration"]
-            speech_filename = speech["filename"]
-            speech_raw_duration = speech["raw_duration"]
-            speech_offset = speech["offset"]
-            speech_duration = speech["duration"]
-            random1 = random.random()
-            random2 = random.random()
-            row = {
-                "noise_filename": noise_filename,
-                "noise_raw_duration": noise_raw_duration,
-                "noise_offset": noise_offset,
-                "noise_duration": noise_duration,
-                "speech_filename": speech_filename,
-                "speech_raw_duration": speech_raw_duration,
-                "speech_offset": speech_offset,
-                "speech_duration": speech_duration,
-                "snr_db": random.uniform(args.min_snr_db, args.max_snr_db),
-                "random1": random1,
-            }
-            row = json.dumps(row, ensure_ascii=False)
-            if random2 < (1 / 300 / 1):
-                fvalid.write(f"{row}\n")
-            else:
-                ftrain.write(f"{row}\n")
-            count += 1
-            duration_seconds = count * args.duration
-            duration_hours = duration_seconds / 3600
-            process_bar.update(n=1)
-            process_bar.set_postfix({
-                # "duration_seconds": round(duration_seconds, 4),
-                "duration_hours": round(duration_hours, 4),
-            })
-    return
-if __name__ == "__main__":
-    main()

examples/conv_tasnet_gan/step_2_train_model.py DELETED Viewed

@@ -1,582 +0,0 @@
-#!/usr/bin/python3
-# -*- coding: utf-8 -*-
-"""
-https://github.com/kaituoxu/Conv-TasNet/tree/master/src
-一般场景：
-目标 SI-SNR ≥ 10 dB，适用于电话通信、基础语音助手等。
-高要求场景（如医疗助听、语音识别）：
-需 SI-SNR ≥ 14 dB，并配合 PESQ ≥ 3.0 和 STOI ≥ 0.851812。
-DeepFilterNet2 模型在 DNS4 数据集，超过500小时的音频上训练了 100 个 epoch。
-https://arxiv.org/abs/2205.05474
-"""
-import argparse
-import json
-import logging
-from logging.handlers import TimedRotatingFileHandler
-import os
-import platform
-from pathlib import Path
-import random
-import sys
-import shutil
-from typing import List
-pwd = os.path.abspath(os.path.dirname(__file__))
-sys.path.append(os.path.join(pwd, "../../"))
-import numpy as np
-import torch
-import torch.nn as nn
-from torch.nn import functional as F
-from torch.utils.data.dataloader import DataLoader
-from tqdm import tqdm
-from toolbox.torch.utils.data.dataset.denoise_jsonl_dataset import DenoiseJsonlDataset
-from toolbox.torchaudio.models.conv_tasnet.configuration_conv_tasnet import ConvTasNetConfig
-from toolbox.torchaudio.models.conv_tasnet.modeling_conv_tasnet import ConvTasNet, ConvTasNetPretrainedModel
-from toolbox.torchaudio.models.discriminators.waveform_metric_discriminator.modeling_waveform_metric_discriminator import WaveformMetricDiscriminatorPretrainedModel
-from toolbox.torchaudio.models.discriminators.waveform_metric_discriminator.configuration_waveform_metric_discriminator import WaveformMetricDiscriminatorConfig
-from toolbox.torchaudio.models.nx_clean_unet.metrics import run_batch_pesq, run_pesq_score
-from toolbox.torchaudio.losses.snr import NegativeSISNRLoss
-from toolbox.torchaudio.losses.spectral import LSDLoss, MultiResolutionSTFTLoss
-from toolbox.torchaudio.losses.perceptual import NegSTOILoss, PesqLoss
-from toolbox.torchaudio.metrics.pesq import run_pesq_score
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--train_dataset", default="train.xlsx", type=str)
-    parser.add_argument("--valid_dataset", default="valid.xlsx", type=str)
-    parser.add_argument("--max_epochs", default=200, type=int)
-    parser.add_argument("--batch_size", default=64, type=int)
-    parser.add_argument("--num_serialized_models_to_keep", default=10, type=int)
-    parser.add_argument("--patience", default=5, type=int)
-    parser.add_argument("--serialization_dir", default="serialization_dir", type=str)
-    parser.add_argument("--seed", default=1234, type=int)
-    parser.add_argument("--config_file", default="config.yaml", type=str)
-    parser.add_argument("--discriminator_config_file", default="discriminator_config.yaml", type=str)
-    args = parser.parse_args()
-    return args
-def logging_config(file_dir: str):
-    fmt = "%(asctime)s - %(name)s - %(levelname)s  %(filename)s:%(lineno)d >  %(message)s"
-    logging.basicConfig(format=fmt,
-                        datefmt="%m/%d/%Y %H:%M:%S",
-                        level=logging.INFO)
-    file_handler = TimedRotatingFileHandler(
-        filename=os.path.join(file_dir, "main.log"),
-        encoding="utf-8",
-        when="D",
-        interval=1,
-        backupCount=7
-    )
-    file_handler.setLevel(logging.INFO)
-    file_handler.setFormatter(logging.Formatter(fmt))
-    logger = logging.getLogger(__name__)
-    logger.addHandler(file_handler)
-    return logger
-class CollateFunction(object):
-    def __init__(self):
-        pass
-    def __call__(self, batch: List[dict]):
-        clean_audios = list()
-        noisy_audios = list()
-        for sample in batch:
-            # noise_wave: torch.Tensor = sample["noise_wave"]
-            clean_audio: torch.Tensor = sample["speech_wave"]
-            noisy_audio: torch.Tensor = sample["mix_wave"]
-            # snr_db: float = sample["snr_db"]
-            clean_audios.append(clean_audio)
-            noisy_audios.append(noisy_audio)
-        clean_audios = torch.stack(clean_audios)
-        noisy_audios = torch.stack(noisy_audios)
-        # assert
-        if torch.any(torch.isnan(clean_audios)) or torch.any(torch.isinf(clean_audios)):
-            raise AssertionError("nan or inf in clean_audios")
-        if torch.any(torch.isnan(noisy_audios)) or torch.any(torch.isinf(noisy_audios)):
-            raise AssertionError("nan or inf in noisy_audios")
-        return clean_audios, noisy_audios
-collate_fn = CollateFunction()
-def main():
-    args = get_args()
-    config = ConvTasNetConfig.from_pretrained(
-        pretrained_model_name_or_path=args.config_file,
-    )
-    discriminator_config = WaveformMetricDiscriminatorConfig.from_pretrained(
-        pretrained_model_name_or_path=args.discriminator_config_file,
-    )
-    serialization_dir = Path(args.serialization_dir)
-    serialization_dir.mkdir(parents=True, exist_ok=True)
-    logger = logging_config(serialization_dir)
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    logger.info(f"set seed: {args.seed}")
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    n_gpu = torch.cuda.device_count()
-    logger.info(f"GPU available count: {n_gpu}; device: {device}")
-    # datasets
-    train_dataset = DenoiseJsonlDataset(
-        jsonl_file=args.train_dataset,
-        expected_sample_rate=config.sample_rate,
-        max_wave_value=32768.0,
-        min_snr_db=config.min_snr_db,
-        max_snr_db=config.max_snr_db,
-        # skip=825000,
-    )
-    valid_dataset = DenoiseJsonlDataset(
-        jsonl_file=args.valid_dataset,
-        expected_sample_rate=config.sample_rate,
-        max_wave_value=32768.0,
-        min_snr_db=config.min_snr_db,
-        max_snr_db=config.max_snr_db,
-    )
-    train_data_loader = DataLoader(
-        dataset=train_dataset,
-        batch_size=args.batch_size,
-        # shuffle=True,
-        sampler=None,
-        # Linux 系统中可以使用多个子进程加载数据, 而在 Windows 系统中不能.
-        num_workers=0 if platform.system() == "Windows" else os.cpu_count() // 2,
-        collate_fn=collate_fn,
-        pin_memory=False,
-        prefetch_factor=2,
-    )
-    valid_data_loader = DataLoader(
-        dataset=valid_dataset,
-        batch_size=args.batch_size,
-        # shuffle=True,
-        sampler=None,
-        # Linux 系统中可以使用多个子进程加载数据, 而在 Windows 系统中不能.
-        num_workers=0 if platform.system() == "Windows" else os.cpu_count() // 2,
-        collate_fn=collate_fn,
-        pin_memory=False,
-        prefetch_factor=2,
-    )
-    # models
-    logger.info(f"prepare models. config_file: {args.config_file}")
-    model = ConvTasNetPretrainedModel(config).to(device)
-    model.to(device)
-    model.train()
-    discriminator = WaveformMetricDiscriminatorPretrainedModel(discriminator_config).to(device)
-    discriminator.to(device)
-    discriminator.train()
-    # optimizer
-    logger.info("prepare optimizer, lr_scheduler, loss_fn, categorical_accuracy")
-    optimizer = torch.optim.AdamW(model.parameters(), config.lr, betas=[config.adam_b1, config.adam_b2])
-    discriminator_optimizer = torch.optim.AdamW(discriminator.parameters(), config.lr, betas=[config.adam_b1, config.adam_b2])
-    # resume training
-    last_step_idx = -1
-    last_epoch = -1
-    for step_idx_str in serialization_dir.glob("steps-*"):
-        step_idx_str = Path(step_idx_str)
-        step_idx = step_idx_str.stem.split("-")[1]
-        step_idx = int(step_idx)
-        if step_idx > last_step_idx:
-            last_step_idx = step_idx
-    if last_step_idx != -1:
-        logger.info(f"resume from steps-{last_step_idx}.")
-        model_pt = serialization_dir / f"steps-{last_step_idx}/model.pt"
-        optimizer_pth = serialization_dir / f"steps-{last_step_idx}/optimizer.pth"
-        discriminator_pt = serialization_dir / f"steps-{last_step_idx}/discriminator.pt"
-        discriminator_optimizer_pth = serialization_dir / f"steps-{last_step_idx}/discriminator_optimizer.pth"
-        logger.info(f"load state dict for model.")
-        with open(model_pt.as_posix(), "rb") as f:
-            state_dict = torch.load(f, map_location="cpu", weights_only=True)
-        model.load_state_dict(state_dict, strict=True)
-        if optimizer_pth.exists():
-            logger.info(f"load state dict for optimizer.")
-            with open(optimizer_pth.as_posix(), "rb") as f:
-                state_dict = torch.load(f, map_location="cpu", weights_only=True)
-            optimizer.load_state_dict(state_dict)
-        if discriminator_pt.exists():
-            logger.info(f"load state dict for discriminator.")
-            with open(model_pt.as_posix(), "rb") as f:
-                state_dict = torch.load(f, map_location="cpu", weights_only=True)
-            discriminator.load_state_dict(state_dict, strict=True)
-        if discriminator_optimizer_pth.exists():
-            logger.info(f"load state dict for discriminator_optimizer.")
-            with open(optimizer_pth.as_posix(), "rb") as f:
-                state_dict = torch.load(f, map_location="cpu", weights_only=True)
-            discriminator_optimizer.load_state_dict(state_dict)
-    if config.lr_scheduler == "CosineAnnealingLR":
-        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
-            optimizer,
-            last_epoch=last_epoch,
-            # T_max=10 * config.eval_steps,
-            # eta_min=0.01 * config.lr,
-            **config.lr_scheduler_kwargs,
-        )
-        discriminator_lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
-            discriminator_optimizer,
-            last_epoch=last_epoch,
-            # T_max=10 * config.eval_steps,
-            # eta_min=0.01 * config.lr,
-            **config.lr_scheduler_kwargs,
-        )
-    elif config.lr_scheduler == "MultiStepLR":
-        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
-            optimizer,
-            last_epoch=last_epoch,
-            milestones=[10000, 20000, 30000, 40000, 50000], gamma=0.5
-        )
-        discriminator_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
-            discriminator_optimizer,
-            last_epoch=last_epoch,
-            milestones=[10000, 20000, 30000, 40000, 50000], gamma=0.5
-        )
-    else:
-        raise AssertionError(f"invalid lr_scheduler: {config.lr_scheduler}")
-    ae_loss_fn = nn.L1Loss(reduction="mean").to(device)
-    neg_si_snr_loss_fn = NegativeSISNRLoss(reduction="mean").to(device)
-    neg_stoi_loss_fn = NegSTOILoss(sample_rate=config.sample_rate, reduction="mean").to(device)
-    mr_stft_loss_fn = MultiResolutionSTFTLoss(
-        fft_size_list=[256, 512, 1024],
-        win_size_list=[120, 240, 480],
-        hop_size_list=[25, 50, 100],
-        factor_sc=1.5,
-        factor_mag=1.0,
-        reduction="mean"
-    ).to(device)
-    pesq_loss_fn = PesqLoss(0.5, sample_rate=config.sample_rate).to(device)
-    # training loop
-    # state
-    average_pesq_score = 1000000000
-    average_loss = 1000000000
-    average_ae_loss = 1000000000
-    average_neg_si_snr_loss = 1000000000
-    average_neg_stoi_loss = 1000000000
-    average_mr_stft_loss = 1000000000
-    average_pesq_loss = 1000000000
-    average_discriminator_g_loss = 1000000000
-    average_discriminator_d_loss = 1000000000
-    model_list = list()
-    best_epoch_idx = None
-    best_step_idx = None
-    best_metric = None
-    patience_count = 0
-    step_idx = 0 if last_step_idx == -1 else last_step_idx
-    logger.info("training")
-    for epoch_idx in range(max(0, last_epoch+1), args.max_epochs):
-        # train
-        model.train()
-        total_pesq_score = 0.
-        total_loss = 0.
-        total_ae_loss = 0.
-        total_neg_si_snr_loss = 0.
-        total_neg_stoi_loss = 0.
-        total_mr_stft_loss = 0.
-        total_pesq_loss = 0.
-        total_discriminator_g_loss = 0.
-        total_discriminator_d_loss = 0.
-        total_batches = 0.
-        progress_bar_train = tqdm(
-            initial=step_idx,
-            desc="Training; epoch-{}".format(epoch_idx),
-        )
-        for train_batch in train_data_loader:
-            clean_audios, noisy_audios = train_batch
-            clean_audios: torch.Tensor = clean_audios.to(device)
-            noisy_audios: torch.Tensor = noisy_audios.to(device)
-            one_labels = torch.ones(clean_audios.shape[0]).to(device)
-            denoise_audios = model.forward(noisy_audios)
-            denoise_audios = torch.squeeze(denoise_audios, dim=1)
-            if torch.any(torch.isnan(denoise_audios)) or torch.any(torch.isinf(denoise_audios)):
-                raise AssertionError("nan or inf in denoise_audios")
-            # Discriminator
-            clean_audio_list = torch.split(clean_audios, 1, dim=0)
-            enhanced_audio_list = torch.split(denoise_audios, 1, dim=0)
-            clean_audio_list = [t.squeeze().detach().cpu().numpy() for t in clean_audio_list]
-            enhanced_audio_list = [t.squeeze().detach().cpu().numpy() for t in enhanced_audio_list]
-            pesq_score_list: List[float] = run_batch_pesq(clean_audio_list, enhanced_audio_list, sample_rate=config.sample_rate, mode="nb")
-            metric_r = discriminator.forward(clean_audios, clean_audios)
-            metric_g = discriminator.forward(denoise_audios.detach(), clean_audios)
-            loss_disc_r = F.mse_loss(one_labels, metric_r.flatten())
-            if -1 in pesq_score_list:
-                # print("-1 in batch_pesq_score!")
-                loss_disc_g = 0
-            else:
-                pesq_score_list: torch.FloatTensor = torch.tensor([(score - 1) / 3.5 for score in pesq_score_list], dtype=torch.float32)
-                loss_disc_g = F.mse_loss(pesq_score_list.to(device), metric_g.flatten())
-            discriminator_d_loss = loss_disc_r + loss_disc_g
-            discriminator_optimizer.zero_grad()
-            discriminator_d_loss.backward()
-            discriminator_optimizer.step()
-            discriminator_lr_scheduler.step()
-            # Generator
-            ae_loss = ae_loss_fn.forward(denoise_audios, clean_audios)
-            neg_si_snr_loss = neg_si_snr_loss_fn.forward(denoise_audios, clean_audios)
-            neg_stoi_loss = neg_stoi_loss_fn.forward(denoise_audios, clean_audios)
-            mr_stft_loss = mr_stft_loss_fn.forward(denoise_audios, clean_audios)
-            pesq_loss = pesq_loss_fn.forward(denoise_audios, clean_audios)
-            metric_g = discriminator.forward(denoise_audios, clean_audios)
-            discriminator_g_loss = F.mse_loss(metric_g.flatten(), one_labels)
-            loss = 1.0 * ae_loss + 0.8 * neg_si_snr_loss + 0.7 * mr_stft_loss + 0.5 * neg_stoi_loss + 0.5 * pesq_loss + 0.2 * discriminator_g_loss
-            if torch.any(torch.isnan(loss)) or torch.any(torch.isinf(loss)):
-                logger.info(f"find nan or inf in loss.")
-                continue
-            denoise_audios_list_r = list(denoise_audios.detach().cpu().numpy())
-            clean_audios_list_r = list(clean_audios.detach().cpu().numpy())
-            pesq_score = run_pesq_score(clean_audios_list_r, denoise_audios_list_r, sample_rate=config.sample_rate, mode="nb")
-            optimizer.zero_grad()
-            loss.backward()
-            optimizer.step()
-            lr_scheduler.step()
-            total_pesq_score += pesq_score
-            total_loss += loss.item()
-            total_ae_loss += ae_loss.item()
-            total_neg_si_snr_loss += neg_si_snr_loss.item()
-            total_neg_stoi_loss += neg_stoi_loss.item()
-            total_mr_stft_loss += mr_stft_loss.item()
-            total_pesq_loss += pesq_loss.item()
-            total_discriminator_g_loss += discriminator_g_loss.item()
-            total_discriminator_d_loss += discriminator_d_loss.item()
-            total_batches += 1
-            average_pesq_score = round(total_pesq_score / total_batches, 4)
-            average_loss = round(total_loss / total_batches, 4)
-            average_ae_loss = round(total_ae_loss / total_batches, 4)
-            average_neg_si_snr_loss = round(total_neg_si_snr_loss / total_batches, 4)
-            average_neg_stoi_loss = round(total_neg_stoi_loss / total_batches, 4)
-            average_mr_stft_loss = round(total_mr_stft_loss / total_batches, 4)
-            average_pesq_loss = round(total_pesq_loss / total_batches, 4)
-            average_discriminator_g_loss = round(total_discriminator_g_loss / total_batches, 4)
-            average_discriminator_d_loss = round(total_discriminator_d_loss / total_batches, 4)
-            progress_bar_train.update(1)
-            progress_bar_train.set_postfix({
-                "lr": lr_scheduler.get_last_lr()[0],
-                "pesq_score": average_pesq_score,
-                "loss": average_loss,
-                "ae_loss": average_ae_loss,
-                "neg_si_snr_loss": average_neg_si_snr_loss,
-                "neg_stoi_loss": average_neg_stoi_loss,
-                "mr_stft_loss": average_mr_stft_loss,
-                "pesq_loss": average_pesq_loss,
-                "disc_g_loss": average_discriminator_g_loss,
-                "disc_d_loss": average_discriminator_d_loss,
-            })
-            # evaluation
-            step_idx += 1
-            if step_idx % config.eval_steps == 0:
-                with torch.no_grad():
-                    torch.cuda.empty_cache()
-                    total_pesq_score = 0.
-                    total_loss = 0.
-                    total_ae_loss = 0.
-                    total_neg_si_snr_loss = 0.
-                    total_neg_stoi_loss = 0.
-                    total_mr_stft_loss = 0.
-                    total_pesq_loss = 0.
-                    total_batches = 0.
-                    progress_bar_train.close()
-                    progress_bar_eval = tqdm(
-                        desc="Evaluation; steps-{}k".format(int(step_idx/1000)),
-                    )
-                    for eval_batch in valid_data_loader:
-                        clean_audios, noisy_audios = eval_batch
-                        clean_audios = clean_audios.to(device)
-                        noisy_audios = noisy_audios.to(device)
-                        denoise_audios = model.forward(noisy_audios)
-                        denoise_audios = torch.squeeze(denoise_audios, dim=1)
-                        # Generator
-                        ae_loss = ae_loss_fn.forward(denoise_audios, clean_audios)
-                        neg_si_snr_loss = neg_si_snr_loss_fn.forward(denoise_audios, clean_audios)
-                        neg_stoi_loss = neg_stoi_loss_fn.forward(denoise_audios, clean_audios)
-                        mr_stft_loss = mr_stft_loss_fn.forward(denoise_audios, clean_audios)
-                        pesq_loss = pesq_loss_fn.forward(denoise_audios, clean_audios)
-                        loss = 1.0 * ae_loss + 0.8 * neg_si_snr_loss + 0.7 * mr_stft_loss + 0.5 * neg_stoi_loss + 0.5 * pesq_loss
-                        if torch.any(torch.isnan(loss)) or torch.any(torch.isinf(loss)):
-                            logger.info(f"find nan or inf in loss.")
-                            continue
-                        denoise_audios_list_r = list(denoise_audios.detach().cpu().numpy())
-                        clean_audios_list_r = list(clean_audios.detach().cpu().numpy())
-                        pesq_score = run_pesq_score(clean_audios_list_r, denoise_audios_list_r, sample_rate=config.sample_rate, mode="nb")
-                        total_pesq_score += pesq_score
-                        total_loss += loss.item()
-                        total_ae_loss += ae_loss.item()
-                        total_neg_si_snr_loss += neg_si_snr_loss.item()
-                        total_neg_stoi_loss += neg_stoi_loss.item()
-                        total_mr_stft_loss += mr_stft_loss.item()
-                        total_pesq_loss += pesq_loss.item()
-                        total_batches += 1
-                        average_pesq_score = round(total_pesq_score / total_batches, 4)
-                        average_loss = round(total_loss / total_batches, 4)
-                        average_ae_loss = round(total_ae_loss / total_batches, 4)
-                        average_neg_si_snr_loss = round(total_neg_si_snr_loss / total_batches, 4)
-                        average_neg_stoi_loss = round(total_neg_stoi_loss / total_batches, 4)
-                        average_mr_stft_loss = round(total_mr_stft_loss / total_batches, 4)
-                        average_pesq_loss = round(total_pesq_loss / total_batches, 4)
-                        progress_bar_eval.update(1)
-                        progress_bar_eval.set_postfix({
-                            "lr": lr_scheduler.get_last_lr()[0],
-                            "pesq_score": average_pesq_score,
-                            "loss": average_loss,
-                            "ae_loss": average_ae_loss,
-                            "neg_si_snr_loss": average_neg_si_snr_loss,
-                            "neg_stoi_loss": average_neg_stoi_loss,
-                            "mr_stft_loss": average_mr_stft_loss,
-                            "pesq_loss": average_pesq_loss,
-                        })
-                total_pesq_score = 0.
-                total_loss = 0.
-                total_ae_loss = 0.
-                total_neg_si_snr_loss = 0.
-                total_neg_stoi_loss = 0.
-                total_mr_stft_loss = 0.
-                total_pesq_loss = 0.
-                total_discriminator_g_loss = 0.
-                total_discriminator_d_loss = 0.
-                total_batches = 0.
-                progress_bar_eval.close()
-                progress_bar_train = tqdm(
-                    initial=progress_bar_train.n,
-                    postfix=progress_bar_train.postfix,
-                    desc=progress_bar_train.desc,
-                )
-                # save path
-                save_dir = serialization_dir / "steps-{}".format(step_idx)
-                save_dir.mkdir(parents=True, exist_ok=False)
-                # save models
-                model.save_pretrained(save_dir.as_posix())
-                discriminator.save_pretrained(save_dir.as_posix())
-                # save optim
-                torch.save(optimizer.state_dict(), (save_dir / "optimizer.pth").as_posix())
-                torch.save(discriminator_optimizer.state_dict(), (save_dir / "discriminator_optimizer.pth").as_posix())
-                model_list.append(save_dir)
-                if len(model_list) >= args.num_serialized_models_to_keep:
-                    model_to_delete: Path = model_list.pop(0)
-                    shutil.rmtree(model_to_delete.as_posix())
-                # save metric
-                if best_metric is None:
-                    best_epoch_idx = epoch_idx
-                    best_step_idx = step_idx
-                    best_metric = average_pesq_score
-                elif average_pesq_score > best_metric:
-                    # great is better.
-                    best_epoch_idx = epoch_idx
-                    best_step_idx = step_idx
-                    best_metric = average_pesq_score
-                else:
-                    pass
-                metrics = {
-                    "epoch_idx": epoch_idx,
-                    "best_epoch_idx": best_epoch_idx,
-                    "best_step_idx": best_step_idx,
-                    "pesq_score": average_pesq_score,
-                    "loss": average_loss,
-                    "ae_loss": average_ae_loss,
-                    "neg_si_snr_loss": average_neg_si_snr_loss,
-                    "neg_stoi_loss": average_neg_stoi_loss,
-                    "mr_stft_loss": average_mr_stft_loss,
-                    "pesq_loss": average_pesq_loss,
-                }
-                metrics_filename = save_dir / "metrics_epoch.json"
-                with open(metrics_filename, "w", encoding="utf-8") as f:
-                    json.dump(metrics, f, indent=4, ensure_ascii=False)
-                # save best
-                best_dir = serialization_dir / "best"
-                if best_epoch_idx == epoch_idx and best_step_idx == step_idx:
-                    if best_dir.exists():
-                        shutil.rmtree(best_dir)
-                    shutil.copytree(save_dir, best_dir)
-                # early stop
-                early_stop_flag = False
-                if best_epoch_idx == epoch_idx and best_step_idx == step_idx:
-                    patience_count = 0
-                else:
-                    patience_count += 1
-                if patience_count >= args.patience:
-                    early_stop_flag = True
-                # early stop
-                if early_stop_flag:
-                    break
-    return
-if __name__ == "__main__":
-    main()

examples/conv_tasnet_gan/yaml/config.yaml DELETED Viewed

@@ -1,31 +0,0 @@
-model_name: "conv_tasnet_gan"
-sample_rate: 8000
-segment_size: 4
-win_size: 20
-freq_bins: 256
-bottleneck_channels: 128
-num_speakers: 1
-num_blocks: 2
-num_sub_blocks: 4
-sub_blocks_channels: 256
-sub_blocks_kernel_size: 3
-norm_type: "gLN"
-causal: false
-mask_nonlinear: "relu"
-min_snr_db: -10
-max_snr_db: 20
-lr: 0.005
-adam_b1: 0.8
-adam_b2: 0.99
-lr_scheduler: "CosineAnnealingLR"
-lr_scheduler_kwargs:
-  T_max: 250000
-  eta_min: 0.00005
-eval_steps: 25000

examples/conv_tasnet_gan/yaml/discriminator_config.yaml DELETED Viewed

@@ -1,10 +0,0 @@
-model_name: "conv_tasnet_gan"
-sample_rate: 8000
-segment_size: 16000
-n_fft: 512
-win_size: 200
-hop_size: 80
-discriminator_dim: 32
-discriminator_in_channel: 2

examples/dfnet/step_2_train_model.py CHANGED Viewed

@@ -315,6 +315,7 @@ def main():
             # evaluation
             step_idx += 1
             if step_idx % config.eval_steps == 0:
                 with torch.no_grad():
                     torch.cuda.empty_cache()
@@ -451,6 +452,7 @@ def main():
                     # early stop
                     if early_stop_flag:
                         break
     return

             # evaluation
             step_idx += 1
             if step_idx % config.eval_steps == 0:
+                model.eval()
                 with torch.no_grad():
                     torch.cuda.empty_cache()
                     # early stop
                     if early_stop_flag:
                         break
+                model.train()
     return

examples/dfnet2/step_2_train_model.py CHANGED Viewed

@@ -318,6 +318,7 @@ def main():
             # evaluation
             step_idx += 1
             if step_idx % config.eval_steps == 0:
                 with torch.no_grad():
                     torch.cuda.empty_cache()
@@ -457,6 +458,7 @@ def main():
                     # early stop
                     if early_stop_flag:
                         break
     return

             # evaluation
             step_idx += 1
             if step_idx % config.eval_steps == 0:
+                model.eval()
                 with torch.no_grad():
                     torch.cuda.empty_cache()
                     # early stop
                     if early_stop_flag:
                         break
+                model.train()
     return

examples/dtln/step_2_train_model.py CHANGED Viewed

@@ -301,6 +301,7 @@ def main():
             # evaluation
             step_idx += 1
             if step_idx % config.eval_steps == 0:
                 with torch.no_grad():
                     torch.cuda.empty_cache()
@@ -424,6 +425,7 @@ def main():
                     # early stop
                     if early_stop_flag:
                         break
     return

             # evaluation
             step_idx += 1
             if step_idx % config.eval_steps == 0:
+                model.eval()
                 with torch.no_grad():
                     torch.cuda.empty_cache()
                     # early stop
                     if early_stop_flag:
                         break
+                model.train()
     return

examples/frcrn/step_2_train_model.py CHANGED Viewed

@@ -305,6 +305,7 @@ def main():
             # evaluation
             step_idx += 1
             if step_idx % config.eval_steps == 0:
                 with torch.no_grad():
                     torch.cuda.empty_cache()
@@ -428,6 +429,7 @@ def main():
                     # early stop
                     if early_stop_flag:
                         break
     return

             # evaluation
             step_idx += 1
             if step_idx % config.eval_steps == 0:
+                model.eval()
                 with torch.no_grad():
                     torch.cuda.empty_cache()
                     # early stop
                     if early_stop_flag:
                         break
+                model.train()
     return

examples/lstm/step_2_train_model.py CHANGED Viewed

@@ -314,6 +314,7 @@ def main():
             # evaluation
             step_idx += 1
             if step_idx % config.eval_steps == 0:
                 with torch.no_grad():
                     torch.cuda.empty_cache()
@@ -435,6 +436,7 @@ def main():
                     # early stop
                     if early_stop_flag:
                         break
     return

             # evaluation
             step_idx += 1
             if step_idx % config.eval_steps == 0:
+                model.eval()
                 with torch.no_grad():
                     torch.cuda.empty_cache()
                     # early stop
                     if early_stop_flag:
                         break
+                model.train()
     return

examples/rnnoise/step_2_train_model.py CHANGED Viewed

@@ -314,6 +314,7 @@ def main():
             # evaluation
             step_idx += 1
             if step_idx % config.eval_steps == 0:
                 with torch.no_grad():
                     torch.cuda.empty_cache()
@@ -435,6 +436,8 @@ def main():
                     # early stop
                     if early_stop_flag:
                         break
     return

             # evaluation
             step_idx += 1
             if step_idx % config.eval_steps == 0:
+                model.eval()
                 with torch.no_grad():
                     torch.cuda.empty_cache()
                     # early stop
                     if early_stop_flag:
                         break
+                model.train()
     return

toolbox/torchaudio/models/dfnet2/modeling_dfnet2.py CHANGED Viewed

@@ -1047,6 +1047,9 @@ class DfNet2(nn.Module):
         feat_spec = feat_spec[..., :self.df_decoder.df_bins]
         # feat_spec shape: [b, 2, t, df_bins]
         return spec, feat_erb, feat_spec
     def forward(self,

         feat_spec = feat_spec[..., :self.df_decoder.df_bins]
         # feat_spec shape: [b, 2, t, df_bins]
+        spec = spec.detach()
+        feat_erb = feat_erb.detach()
+        feat_spec = feat_spec.detach()
         return spec, feat_erb, feat_spec
     def forward(self,

toolbox/torchaudio/modules/utils/ema.py CHANGED Viewed

@@ -1,8 +1,95 @@
 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
 import torch.nn as nn
 class ExponentialMovingAverage(nn.Module):
     def __init__(self):
         super().__init__()

 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
+import math
+import numpy as np
 import torch.nn as nn
def _calculate_norm_alpha(sample_rate: int, hop_size: int, tau: float) -> float:
    """Exponential decay factor alpha for a given tau (decay window size [s]).

    One frame advances time by ``hop_size / sample_rate`` seconds, so the
    per-frame factor is ``exp(-dt / tau)``.
    """
    dt = hop_size / sample_rate
    return math.exp(-dt / tau)


def get_norm_alpha(sample_rate: int, hop_size: int, norm_tau: float) -> float:
    """Return the decay factor rounded to as few decimals as possible.

    The exact factor is rounded to 3 decimals; while the rounded value is
    not strictly below 1.0, one more decimal is used.

    Args:
        sample_rate: audio sample rate in Hz.
        hop_size: frame hop in samples.
        norm_tau: decay window size in seconds.

    Returns:
        The rounded decay factor, strictly below 1.0.

    Raises:
        ValueError: if the exact factor is already >= 1.0 (for example
            ``hop_size == 0`` or a negative ``norm_tau``). No amount of
            rounding can bring such a value below 1.0, so the search below
            would otherwise never terminate.
    """
    a_ = _calculate_norm_alpha(sample_rate=sample_rate, hop_size=hop_size, tau=norm_tau)
    if a_ >= 1.0:
        raise ValueError(
            "norm alpha must be < 1.0, got {} "
            "(sample_rate={}, hop_size={}, norm_tau={})".format(
                a_, sample_rate, hop_size, norm_tau
            )
        )

    precision = 3
    a = round(a_, precision)
    while a >= 1.0:
        # Rounding pushed the value up to 1.0; keep one more decimal.
        precision += 1
        a = round(a_, precision)
    return a
# Initial running-mean values at the first and last ERB bin.
MEAN_NORM_INIT = [-60., -90.]


def make_erb_norm_state(erb_bins: int, channels: int) -> np.ndarray:
    """Build the initial running-mean state used by ``erb_normalize``.

    The per-bin values ramp linearly from ``MEAN_NORM_INIT[0]`` to
    ``MEAN_NORM_INIT[1]`` and the row is repeated once per channel.

    Returns:
        Array of shape (channels, erb_bins).
    """
    state = np.linspace(MEAN_NORM_INIT[0], MEAN_NORM_INIT[1], erb_bins)
    state = np.expand_dims(state, axis=0)
    state = np.repeat(state, channels, axis=0)
    # state shape: (audio_channels, erb_bins)
    return state


def erb_normalize(erb_feat: np.ndarray, alpha: float, state: np.ndarray = None):
    """Subtract an exponential running mean from ERB features and scale by 1/40.

    For every frame the running mean is updated as
    ``state = x * (1 - alpha) + state * alpha`` and the output is
    ``(x - state) / 40``.

    Args:
        erb_feat: array of shape (batch_size, time_steps, erb_bins). It is
            copied; the caller's array is not modified.
        alpha: decay factor of the running mean.
        state: optional running mean with at least (batch_size, erb_bins)
            entries. It is updated IN PLACE, so passing the same array to
            successive calls continues the running mean. When ``None`` a
            fresh state from ``make_erb_norm_state`` is used.

    Returns:
        The normalized features, same shape as ``erb_feat``.

    Raises:
        ValueError: if ``state`` is smaller than (batch_size, erb_bins).
    """
    erb_feat = np.copy(erb_feat)
    batch_size, time_steps, erb_bins = erb_feat.shape

    if state is None:
        state = make_erb_norm_state(erb_bins, batch_size)

    # Basic slicing gives a view, so writes below land in the caller's state.
    running = state[:batch_size, :erb_bins]
    if running.shape != (batch_size, erb_bins):
        raise ValueError(
            "state shape {} is too small for features of shape {}".format(
                state.shape, erb_feat.shape
            )
        )

    # The recursion is sequential in time only; batch and bin axes are
    # independent, so each frame is processed as one array operation.
    for j in range(time_steps):
        frame = erb_feat[:, j, :]
        running[...] = frame * (1. - alpha) + running * alpha
        # Two separate steps keep the rounding identical to an element-wise
        # ``-=`` followed by ``/=`` for non-float64 feature arrays.
        frame[...] = frame - running
        frame[...] = frame / 40.

    return erb_feat
# Initial running-magnitude values at the first and last DF bin.
UNIT_NORM_INIT = [0.001, 0.0001]


def make_spec_norm_state(df_bins: int, channels: int) -> np.ndarray:
    """Build the initial running-magnitude state used by ``spec_normalize``.

    The per-bin values ramp linearly from ``UNIT_NORM_INIT[0]`` to
    ``UNIT_NORM_INIT[1]`` and the row is repeated once per channel.

    Returns:
        Array of shape (channels, df_bins).
    """
    state = np.linspace(UNIT_NORM_INIT[0], UNIT_NORM_INIT[1], df_bins)
    state = np.expand_dims(state, axis=0)
    state = np.repeat(state, channels, axis=0)
    # state shape: (audio_channels, df_bins)
    return state


def spec_normalize(spec_feat: np.ndarray, alpha: float, state: np.ndarray = None):
    """Divide spectral features by the square root of a running magnitude.

    For every frame the running magnitude is updated as
    ``state = |x| * (1 - alpha) + state * alpha`` and the output is
    ``x / sqrt(state)``.

    Args:
        spec_feat: array of shape (batch_size, time_steps, df_bins). It is
            copied; the caller's array is not modified.
        alpha: decay factor of the running magnitude.
        state: optional running magnitude with at least
            (batch_size, df_bins) entries. It is updated IN PLACE, so
            passing the same array to successive calls continues the
            running estimate. When ``None`` a fresh state from
            ``make_spec_norm_state`` is used.

    Returns:
        The normalized features, same shape as ``spec_feat``.

    Raises:
        ValueError: if ``state`` is smaller than (batch_size, df_bins).
    """
    spec_feat = np.copy(spec_feat)
    batch_size, time_steps, df_bins = spec_feat.shape

    if state is None:
        state = make_spec_norm_state(df_bins, batch_size)

    # Basic slicing gives a view, so writes below land in the caller's state.
    running = state[:batch_size, :df_bins]
    if running.shape != (batch_size, df_bins):
        raise ValueError(
            "state shape {} is too small for features of shape {}".format(
                state.shape, spec_feat.shape
            )
        )

    # The recursion is sequential in time only; batch and bin axes are
    # independent, so each frame is processed as one array operation.
    for j in range(time_steps):
        frame = spec_feat[:, j, :]
        running[...] = np.abs(frame) * (1. - alpha) + running * alpha
        frame[...] = frame / np.sqrt(running)

    return spec_feat
 class ExponentialMovingAverage(nn.Module):
     def __init__(self):
         super().__init__()