add microphone audio input
- examples/dfnet/step_2_train_model.py +7 -4
- examples/dfnet/yaml/config-512.yaml +0 -74
- examples/dfnet/yaml/config.yaml +14 -14
- examples/dtln/run.sh +156 -0
- examples/dtln/step_1_prepare_data.py +164 -0
- examples/dtln/step_2_train_model.py +428 -0
- examples/dtln/yaml/config.yaml +23 -0
- examples/{simple_lstm_irm_aishell → simple_lstm_irm}/run.sh +0 -0
- examples/{simple_lstm_irm_aishell → simple_lstm_irm}/step_1_prepare_data.py +0 -0
- examples/{simple_lstm_irm_aishell → simple_lstm_irm}/step_2_train_model.py +0 -2
- examples/{simple_lstm_irm_aishell → simple_lstm_irm}/step_3_evaluation.py +0 -0
- main.py +21 -5
- toolbox/torchaudio/models/clean_unet/modeling_clean_unet.py +1 -1
- toolbox/torchaudio/models/dfnet/conv_stft.py +0 -1
- toolbox/torchaudio/models/dtln/__init__.py +6 -0
- toolbox/torchaudio/models/dtln/configuration_dtln.py +66 -0
- toolbox/torchaudio/models/dtln/modeling_dtln.py +340 -0
- toolbox/torchaudio/models/dtln/yaml/config-160.yaml +23 -0
- toolbox/torchaudio/models/dtln/yaml/config-256.yaml +23 -0
- toolbox/torchaudio/models/frcrn/modeling_frcrn.py +2 -1
- toolbox/torchaudio/models/frcrn/unet.py +3 -1
- toolbox/torchaudio/models/simple_lstm_irm/modeling_simple_lstm_irm.py +0 -8
- toolbox/torchaudio/models/simple_lstm_irm/yaml/config.yaml +6 -8
- toolbox/torchaudio/models/tcnn/modeling_tcnn.py +336 -2
- toolbox/torchaudio/models/zip_enhancer/__init__.py +5 -0
- toolbox/torchaudio/models/zip_enhancer/modeling_zip_enhancer.py +154 -0
- toolbox/torchaudio/models/zip_enhancer/scaling.py +249 -0
- toolbox/torchaudio/models/zip_enhancer/zip_enhancer_layer.py +9 -0
- toolbox/torchaudio/models/zip_enhancer/zipformer.py +9 -0
- toolbox/torchaudio/modules/conv_stft.py +149 -0
- toolbox/torchaudio/modules/erb_bands.py +0 -124
examples/dfnet/step_2_train_model.py
CHANGED
@@ -1,5 +1,8 @@
 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
+"""
+https://github.com/Rikorose/DeepFilterNet
+"""
 import argparse
 import json
 import logging
@@ -25,8 +28,6 @@ from tqdm import tqdm
 from toolbox.torch.utils.data.dataset.denoise_jsonl_dataset import DenoiseJsonlDataset
 from toolbox.torchaudio.losses.snr import NegativeSISNRLoss
 from toolbox.torchaudio.losses.spectral import LSDLoss, MultiResolutionSTFTLoss
-from toolbox.torchaudio.losses.irm import IRMLoss
-from toolbox.torchaudio.losses.snr import LocalSNRLoss
 from toolbox.torchaudio.metrics.pesq import run_pesq_score
 from toolbox.torchaudio.models.dfnet.configuration_dfnet import DfNetConfig
 from toolbox.torchaudio.models.dfnet.modeling_dfnet import DfNet, DfNetPretrainedModel
@@ -34,8 +35,8 @@ from toolbox.torchaudio.models.dfnet.modeling_dfnet import DfNet, DfNetPretraine
 
 def get_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--train_dataset", default="train.
-    parser.add_argument("--valid_dataset", default="valid.
+    parser.add_argument("--train_dataset", default="train.jsonl", type=str)
+    parser.add_argument("--valid_dataset", default="valid.jsonl", type=str)
 
     parser.add_argument("--num_serialized_models_to_keep", default=15, type=int)
     parser.add_argument("--patience", default=10, type=int)
@@ -228,8 +229,10 @@ def main():
     # state
     average_pesq_score = 1000000000
     average_loss = 1000000000
+    average_mr_stft_loss = 1000000000
     average_neg_si_snr_loss = 1000000000
     average_mask_loss = 1000000000
+    average_lsnr_loss = 1000000000
 
     model_list = list()
     best_epoch_idx = None
examples/dfnet/yaml/config-512.yaml
DELETED
@@ -1,74 +0,0 @@
model_name: "dfnet"

# spec
sample_rate: 8000
n_fft: 512
win_length: 200
hop_length: 80

spec_bins: 256

# model
conv_channels: 64
conv_kernel_size_input:
- 3
- 3
conv_kernel_size_inner:
- 1
- 3
conv_lookahead: 0

convt_kernel_size_inner:
- 1
- 3

embedding_hidden_size: 256
encoder_combine_op: "concat"

encoder_emb_skip_op: "none"
encoder_emb_linear_groups: 16
encoder_emb_hidden_size: 256

encoder_linear_groups: 32

decoder_emb_num_layers: 3
decoder_emb_skip_op: "none"
decoder_emb_linear_groups: 16
decoder_emb_hidden_size: 256

df_decoder_hidden_size: 256
df_num_layers: 2
df_order: 5
df_bins: 96
df_gru_skip: "grouped_linear"
df_decoder_linear_groups: 16
df_pathway_kernel_size_t: 5
df_lookahead: 2

# lsnr
n_frame: 3
lsnr_max: 30
lsnr_min: -15
norm_tau: 1.

# data
min_snr_db: -10
max_snr_db: 20

# train
lr: 0.001
lr_scheduler: "CosineAnnealingLR"
lr_scheduler_kwargs:
  T_max: 250000
  eta_min: 0.0001

max_epochs: 100
clip_grad_norm: 10.0
seed: 1234

num_workers: 8
batch_size: 32
eval_steps: 10000

# runtime
use_post_filter: true
examples/dfnet/yaml/config.yaml
CHANGED
@@ -2,14 +2,14 @@ model_name: "dfnet"
 
 # spec
 sample_rate: 8000
-
-
-
+nfft: 512
+win_size: 200
+hop_size: 80
 
-spec_bins:
+spec_bins: 256
 
 # model
-conv_channels:
+conv_channels: 64
 conv_kernel_size_input:
 - 3
 - 3
@@ -22,26 +22,26 @@ convt_kernel_size_inner:
 - 1
 - 3
 
-embedding_hidden_size:
+embedding_hidden_size: 256
 encoder_combine_op: "concat"
 
 encoder_emb_skip_op: "none"
-encoder_emb_linear_groups:
-encoder_emb_hidden_size:
+encoder_emb_linear_groups: 16
+encoder_emb_hidden_size: 256
 
-encoder_linear_groups:
+encoder_linear_groups: 32
 
 decoder_emb_num_layers: 3
 decoder_emb_skip_op: "none"
-decoder_emb_linear_groups:
-decoder_emb_hidden_size:
+decoder_emb_linear_groups: 16
+decoder_emb_hidden_size: 256
 
-df_decoder_hidden_size:
+df_decoder_hidden_size: 256
 df_num_layers: 2
 df_order: 5
-df_bins:
+df_bins: 96
 df_gru_skip: "grouped_linear"
-df_decoder_linear_groups:
+df_decoder_linear_groups: 16
 df_pathway_kernel_size_t: 5
 df_lookahead: 2
 
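
The renamed keys pin down the STFT geometry the model now assumes. A quick shape check, as a minimal sketch (plain torch.stft with center=False; the repo's own conv_stft.py may pad differently):

import torch

# nfft=512 yields nfft // 2 + 1 = 257 frequency bins; the model keeps
# spec_bins=256 of them, and deep filtering acts on the lowest df_bins=96.
nfft, win_size, hop_size, sample_rate = 512, 200, 80, 8000
signal = torch.randn(1, sample_rate)  # one second at 8 kHz
spec = torch.stft(
    signal, n_fft=nfft, hop_length=hop_size, win_length=win_size,
    window=torch.hann_window(win_size), center=False, return_complex=True,
)
print(spec.shape)  # torch.Size([1, 257, 94]): 1 + (8000 - 512) // 80 = 94 frames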
examples/dtln/run.sh
ADDED
@@ -0,0 +1,156 @@
#!/usr/bin/env bash

: <<'END'

sh run.sh --stage 2 --stop_stage 2 --system_version windows --file_folder_name file_dir --final_model_name dfnet-nx-speech \
--noise_dir "E:/Users/tianx/HuggingDatasets/nx_noise/data/noise" \
--speech_dir "E:/Users/tianx/HuggingDatasets/nx_noise/data/speech"

sh run.sh --stage 2 --stop_stage 2 --system_version centos --file_folder_name file_dir --final_model_name dfnet-dns3 \
--noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise/dns3-noise" \
--speech_dir "/data/tianxing/HuggingDatasets/nx_noise/data/speech/dns3-speech"


END


# params
system_version="windows";
verbose=true;
stage=0  # start from 0 if you need to start from data preparation
stop_stage=9

work_dir="$(pwd)"
file_folder_name=file_folder_name
final_model_name=final_model_name
config_file="yaml/config.yaml"
limit=10

noise_dir=/data/tianxing/HuggingDatasets/nx_noise/data/noise
speech_dir=/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train

max_count=10000000

nohup_name=nohup.out

# model params
batch_size=64
max_epochs=200
save_top_k=10
patience=5


# parse options
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
  case "$1" in
    --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
      eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
      old_value="$(eval echo \$$name)";
      if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
        was_bool=true;
      else
        was_bool=false;
      fi

      # Set the variable to the right value-- the escaped quotes make it work if
      # the option had spaces, like --cmd "queue.pl -sync y"
      eval "${name}=\"$2\"";

      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      shift 2;
      ;;

    *) break;
  esac
done

file_dir="${work_dir}/${file_folder_name}"
final_model_dir="${work_dir}/../../trained_models/${final_model_name}";
evaluation_audio_dir="${file_dir}/evaluation_audio"

train_dataset="${file_dir}/train.jsonl"
valid_dataset="${file_dir}/valid.jsonl"

$verbose && echo "system_version: ${system_version}"
$verbose && echo "file_folder_name: ${file_folder_name}"

if [ $system_version == "windows" ]; then
  alias python3='D:/Users/tianx/PycharmProjects/virtualenv/nx_denoise/Scripts/python.exe'
elif [ $system_version == "centos" ] || [ $system_version == "ubuntu" ]; then
  #source /data/local/bin/nx_denoise/bin/activate
  alias python3='/data/local/bin/nx_denoise/bin/python3'
fi


if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  $verbose && echo "stage 1: prepare data"
  cd "${work_dir}" || exit 1
  python3 step_1_prepare_data.py \
    --file_dir "${file_dir}" \
    --noise_dir "${noise_dir}" \
    --speech_dir "${speech_dir}" \
    --train_dataset "${train_dataset}" \
    --valid_dataset "${valid_dataset}" \
    --max_count "${max_count}"

fi


if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  $verbose && echo "stage 2: train model"
  cd "${work_dir}" || exit 1
  python3 step_2_train_model.py \
    --train_dataset "${train_dataset}" \
    --valid_dataset "${valid_dataset}" \
    --serialization_dir "${file_dir}" \
    --config_file "${config_file}"

fi


if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
  $verbose && echo "stage 3: test model"
  cd "${work_dir}" || exit 1
  python3 step_3_evaluation.py \
    --valid_dataset "${valid_dataset}" \
    --model_dir "${file_dir}/best" \
    --evaluation_audio_dir "${evaluation_audio_dir}" \
    --limit "${limit}"

fi


if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
  $verbose && echo "stage 4: collect files"
  cd "${work_dir}" || exit 1

  mkdir -p ${final_model_dir}

  cp "${file_dir}/best"/* "${final_model_dir}"
  cp -r "${file_dir}/evaluation_audio" "${final_model_dir}"

  cd "${final_model_dir}/.." || exit 1;

  if [ -e "${final_model_name}.zip" ]; then
    rm -rf "${final_model_name}_backup.zip"
    mv "${final_model_name}.zip" "${final_model_name}_backup.zip"
  fi

  zip -r "${final_model_name}.zip" "${final_model_name}"
  rm -rf "${final_model_name}"

fi


if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
  $verbose && echo "stage 5: clear file_dir"
  cd "${work_dir}" || exit 1

  rm -rf "${file_dir}";

fi
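
The while/case block above is Kaldi-style option parsing: every `--name value` pair overrides a variable that must already have a default (dashes in the option name map to underscores), and boolean variables only accept true/false. For example,

    sh run.sh --stage 1 --stop_stage 3 --file_folder_name file_dir

runs data preparation, training, and evaluation in one pass, leaving every other default above untouched.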
examples/dtln/step_1_prepare_data.py
ADDED
@@ -0,0 +1,164 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import json
import os
from pathlib import Path
import random
import sys

pwd = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(pwd, "../../"))

import librosa
import numpy as np
from tqdm import tqdm


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--file_dir", default="./", type=str)

    parser.add_argument(
        "--noise_dir",
        default=r"E:\Users\tianx\HuggingDatasets\nx_noise\data\noise",
        type=str
    )
    parser.add_argument(
        "--speech_dir",
        default=r"E:\programmer\asr_datasets\aishell\data_aishell\wav\train",
        type=str
    )

    parser.add_argument("--train_dataset", default="train.jsonl", type=str)
    parser.add_argument("--valid_dataset", default="valid.jsonl", type=str)

    parser.add_argument("--duration", default=4.0, type=float)
    parser.add_argument("--min_snr_db", default=-10, type=float)
    parser.add_argument("--max_snr_db", default=20, type=float)

    parser.add_argument("--target_sample_rate", default=8000, type=int)

    parser.add_argument("--max_count", default=10000, type=int)

    args = parser.parse_args()
    return args


def filename_generator(data_dir: str):
    data_dir = Path(data_dir)
    for filename in data_dir.glob("**/*.wav"):
        yield filename.as_posix()


def target_second_signal_generator(data_dir: str, duration: int = 2, sample_rate: int = 8000, max_epoch: int = 20000):
    data_dir = Path(data_dir)
    for epoch_idx in range(max_epoch):
        for filename in data_dir.glob("**/*.wav"):
            signal, _ = librosa.load(filename.as_posix(), sr=sample_rate)
            raw_duration = librosa.get_duration(y=signal, sr=sample_rate)

            if raw_duration < duration:
                # print(f"duration less than {duration} s. skip filename: {filename.as_posix()}")
                continue
            if signal.ndim != 1:
                raise AssertionError(f"expected ndim 1, instead of {signal.ndim}")

            signal_length = len(signal)
            win_size = int(duration * sample_rate)
            for begin in range(0, signal_length - win_size, win_size):
                if np.sum(signal[begin: begin+win_size]) == 0:
                    continue
                row = {
                    "epoch_idx": epoch_idx,
                    "filename": filename.as_posix(),
                    "raw_duration": round(raw_duration, 4),
                    "offset": round(begin / sample_rate, 4),
                    "duration": round(duration, 4),
                }
                yield row


def main():
    args = get_args()

    file_dir = Path(args.file_dir)
    file_dir.mkdir(exist_ok=True)

    noise_dir = Path(args.noise_dir)
    speech_dir = Path(args.speech_dir)

    noise_generator = target_second_signal_generator(
        noise_dir.as_posix(),
        duration=args.duration,
        sample_rate=args.target_sample_rate,
        max_epoch=100000,
    )
    speech_generator = target_second_signal_generator(
        speech_dir.as_posix(),
        duration=args.duration,
        sample_rate=args.target_sample_rate,
        max_epoch=1,
    )

    dataset = list()

    count = 0
    process_bar = tqdm(desc="build dataset jsonl")
    with open(args.train_dataset, "w", encoding="utf-8") as ftrain, open(args.valid_dataset, "w", encoding="utf-8") as fvalid:
        for noise, speech in zip(noise_generator, speech_generator):
            if count >= args.max_count > 0:
                break

            noise_filename = noise["filename"]
            noise_raw_duration = noise["raw_duration"]
            noise_offset = noise["offset"]
            noise_duration = noise["duration"]

            speech_filename = speech["filename"]
            speech_raw_duration = speech["raw_duration"]
            speech_offset = speech["offset"]
            speech_duration = speech["duration"]

            random1 = random.random()
            random2 = random.random()

            row = {
                "count": count,

                "noise_filename": noise_filename,
                "noise_raw_duration": noise_raw_duration,
                "noise_offset": noise_offset,
                "noise_duration": noise_duration,

                "speech_filename": speech_filename,
                "speech_raw_duration": speech_raw_duration,
                "speech_offset": speech_offset,
                "speech_duration": speech_duration,

                "snr_db": random.uniform(args.min_snr_db, args.max_snr_db),

                "random1": random1,
            }
            row = json.dumps(row, ensure_ascii=False)
            if random2 < (1 / 300 / 1):
                fvalid.write(f"{row}\n")
            else:
                ftrain.write(f"{row}\n")

            count += 1
            duration_seconds = count * args.duration
            duration_hours = duration_seconds / 3600

            process_bar.update(n=1)
            process_bar.set_postfix({
                # "duration_seconds": round(duration_seconds, 4),
                "duration_hours": round(duration_hours, 4),

            })

    return


if __name__ == "__main__":
    main()
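
Each line of train.jsonl / valid.jsonl pairs one fixed-duration noise excerpt with one speech excerpt. A representative row (field names from the `row` dict above; the values and paths here are made up for illustration):

{"count": 0, "noise_filename": ".../nx_noise/data/noise/a.wav", "noise_raw_duration": 12.5, "noise_offset": 4.0, "noise_duration": 4.0, "speech_filename": ".../wav/train/S0002/b.wav", "speech_raw_duration": 5.2, "speech_offset": 0.0, "speech_duration": 4.0, "snr_db": 7.31, "random1": 0.42}

About 1 row in 300 (`random2 < 1 / 300`) goes to the validation split; the rest go to training.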
examples/dtln/step_2_train_model.py
ADDED
@@ -0,0 +1,428 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
https://github.com/Rikorose/DeepFilterNet
"""
import argparse
import json
import logging
from logging.handlers import TimedRotatingFileHandler
import os
import platform
from pathlib import Path
import random
import sys
import shutil
from typing import List

pwd = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(pwd, "../../"))

import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data.dataloader import DataLoader
from tqdm import tqdm

from toolbox.torch.utils.data.dataset.denoise_jsonl_dataset import DenoiseJsonlDataset
from toolbox.torchaudio.losses.snr import NegativeSISNRLoss
from toolbox.torchaudio.losses.spectral import LSDLoss, MultiResolutionSTFTLoss
from toolbox.torchaudio.metrics.pesq import run_pesq_score
from toolbox.torchaudio.models.dtln.configuration_dtln import DTLNConfig
from toolbox.torchaudio.models.dtln.modeling_dtln import DTLNModel, DTLNPretrainedModel


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_dataset", default="train.jsonl", type=str)
    parser.add_argument("--valid_dataset", default="valid.jsonl", type=str)

    parser.add_argument("--num_serialized_models_to_keep", default=15, type=int)
    parser.add_argument("--patience", default=10, type=int)
    parser.add_argument("--serialization_dir", default="serialization_dir", type=str)

    parser.add_argument("--config_file", default="config.yaml", type=str)

    args = parser.parse_args()
    return args


def logging_config(file_dir: str):
    fmt = "%(asctime)s - %(name)s - %(levelname)s %(filename)s:%(lineno)d > %(message)s"

    logging.basicConfig(format=fmt,
                        datefmt="%m/%d/%Y %H:%M:%S",
                        level=logging.INFO)
    file_handler = TimedRotatingFileHandler(
        filename=os.path.join(file_dir, "main.log"),
        encoding="utf-8",
        when="D",
        interval=1,
        backupCount=7
    )
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(logging.Formatter(fmt))
    logger = logging.getLogger(__name__)
    logger.addHandler(file_handler)

    return logger


class CollateFunction(object):
    def __init__(self):
        pass

    def __call__(self, batch: List[dict]):
        clean_audios = list()
        noisy_audios = list()
        snr_db_list = list()

        for sample in batch:
            # noise_wave: torch.Tensor = sample["noise_wave"]
            clean_audio: torch.Tensor = sample["speech_wave"]
            noisy_audio: torch.Tensor = sample["mix_wave"]
            # snr_db: float = sample["snr_db"]

            clean_audios.append(clean_audio)
            noisy_audios.append(noisy_audio)

        clean_audios = torch.stack(clean_audios)
        noisy_audios = torch.stack(noisy_audios)

        # assert
        if torch.any(torch.isnan(clean_audios)) or torch.any(torch.isinf(clean_audios)):
            raise AssertionError("nan or inf in clean_audios")
        if torch.any(torch.isnan(noisy_audios)) or torch.any(torch.isinf(noisy_audios)):
            raise AssertionError("nan or inf in noisy_audios")
        return clean_audios, noisy_audios


collate_fn = CollateFunction()


def main():
    args = get_args()

    config = DTLNConfig.from_pretrained(
        pretrained_model_name_or_path=args.config_file,
    )

    serialization_dir = Path(args.serialization_dir)
    serialization_dir.mkdir(parents=True, exist_ok=True)

    logger = logging_config(serialization_dir)

    random.seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    logger.info(f"set seed: {config.seed}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info(f"GPU available count: {n_gpu}; device: {device}")

    # datasets
    train_dataset = DenoiseJsonlDataset(
        jsonl_file=args.train_dataset,
        expected_sample_rate=config.sample_rate,
        max_wave_value=32768.0,
        min_snr_db=config.min_snr_db,
        max_snr_db=config.max_snr_db,
        # skip=225000,
    )
    valid_dataset = DenoiseJsonlDataset(
        jsonl_file=args.valid_dataset,
        expected_sample_rate=config.sample_rate,
        max_wave_value=32768.0,
        min_snr_db=config.min_snr_db,
        max_snr_db=config.max_snr_db,
    )
    train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_size=config.batch_size,
        # shuffle=True,
        sampler=None,
        # On Linux the data can be loaded by multiple subprocesses; on Windows it cannot.
        num_workers=0 if platform.system() == "Windows" else os.cpu_count() // 2,
        collate_fn=collate_fn,
        pin_memory=False,
        prefetch_factor=None if platform.system() == "Windows" else 2,
    )
    valid_data_loader = DataLoader(
        dataset=valid_dataset,
        batch_size=config.batch_size,
        # shuffle=True,
        sampler=None,
        # On Linux the data can be loaded by multiple subprocesses; on Windows it cannot.
        num_workers=0 if platform.system() == "Windows" else os.cpu_count() // 2,
        collate_fn=collate_fn,
        pin_memory=False,
        prefetch_factor=None if platform.system() == "Windows" else 2,
    )

    # models
    logger.info(f"prepare models. config_file: {args.config_file}")
    model = DTLNPretrainedModel(config).to(device)
    model.to(device)
    model.train()

    # optimizer
    logger.info("prepare optimizer, lr_scheduler, loss_fn, evaluation_metric")
    optimizer = torch.optim.AdamW(model.parameters(), config.lr)

    # resume training
    last_step_idx = -1
    last_epoch = -1
    for step_idx_str in serialization_dir.glob("steps-*"):
        step_idx_str = Path(step_idx_str)
        step_idx = step_idx_str.stem.split("-")[1]
        step_idx = int(step_idx)
        if step_idx > last_step_idx:
            last_step_idx = step_idx
            # last_epoch = 1

    if last_step_idx != -1:
        logger.info(f"resume from steps-{last_step_idx}.")
        model_pt = serialization_dir / f"steps-{last_step_idx}/model.pt"

        logger.info(f"load state dict for model.")
        with open(model_pt.as_posix(), "rb") as f:
            state_dict = torch.load(f, map_location="cpu", weights_only=True)
        model.load_state_dict(state_dict, strict=True)

    if config.lr_scheduler == "CosineAnnealingLR":
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            last_epoch=last_epoch,
            # T_max=10 * config.eval_steps,
            # eta_min=0.01 * config.lr,
            **config.lr_scheduler_kwargs,
        )
    elif config.lr_scheduler == "MultiStepLR":
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            last_epoch=last_epoch,
            milestones=[10000, 20000, 30000, 40000, 50000], gamma=0.5
        )
    else:
        raise AssertionError(f"invalid lr_scheduler: {config.lr_scheduler}")

    neg_si_snr_loss_fn = NegativeSISNRLoss(reduction="mean").to(device)
    mr_stft_loss_fn = MultiResolutionSTFTLoss(
        fft_size_list=[256, 512, 1024],
        win_size_list=[256, 512, 1024],
        hop_size_list=[128, 256, 512],
        factor_sc=1.5,
        factor_mag=1.0,
        reduction="mean"
    ).to(device)

    # training loop

    # state
    average_pesq_score = 1000000000
    average_loss = 1000000000
    average_mr_stft_loss = 1000000000
    average_neg_si_snr_loss = 1000000000

    model_list = list()
    best_epoch_idx = None
    best_step_idx = None
    best_metric = None
    patience_count = 0

    step_idx = 0 if last_step_idx == -1 else last_step_idx

    logger.info("training")
    for epoch_idx in range(max(0, last_epoch+1), config.max_epochs):
        # train
        model.train()

        total_pesq_score = 0.
        total_loss = 0.
        total_mr_stft_loss = 0.
        total_neg_si_snr_loss = 0.
        total_batches = 0.

        progress_bar_train = tqdm(
            initial=step_idx,
            desc="Training; epoch-{}".format(epoch_idx),
        )
        for train_batch in train_data_loader:
            clean_audios, noisy_audios = train_batch
            clean_audios: torch.Tensor = clean_audios.to(device)
            noisy_audios: torch.Tensor = noisy_audios.to(device)

            denoise_audios = model.forward(noisy_audios)

            mr_stft_loss = mr_stft_loss_fn.forward(denoise_audios, clean_audios)
            neg_si_snr_loss = neg_si_snr_loss_fn.forward(denoise_audios, clean_audios)

            loss = 1.0 * mr_stft_loss + 1.0 * neg_si_snr_loss
            if torch.any(torch.isnan(loss)) or torch.any(torch.isinf(loss)):
                logger.info(f"found nan or inf in loss.")
                continue

            denoise_audios_list_r = list(denoise_audios.detach().cpu().numpy())
            clean_audios_list_r = list(clean_audios.detach().cpu().numpy())
            pesq_score = run_pesq_score(clean_audios_list_r, denoise_audios_list_r, sample_rate=config.sample_rate, mode="nb")

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.clip_grad_norm)
            optimizer.step()
            lr_scheduler.step()

            total_pesq_score += pesq_score
            total_loss += loss.item()
            total_mr_stft_loss += mr_stft_loss.item()
            total_neg_si_snr_loss += neg_si_snr_loss.item()
            total_batches += 1

            average_pesq_score = round(total_pesq_score / total_batches, 4)
            average_loss = round(total_loss / total_batches, 4)
            average_mr_stft_loss = round(total_mr_stft_loss / total_batches, 4)
            average_neg_si_snr_loss = round(total_neg_si_snr_loss / total_batches, 4)

            progress_bar_train.update(1)
            progress_bar_train.set_postfix({
                "lr": lr_scheduler.get_last_lr()[0],
                "pesq_score": average_pesq_score,
                "loss": average_loss,
                "mr_stft_loss": average_mr_stft_loss,
                "neg_si_snr_loss": average_neg_si_snr_loss,
            })

            # evaluation
            step_idx += 1
            if step_idx % config.eval_steps == 0:
                with torch.no_grad():
                    torch.cuda.empty_cache()

                    total_pesq_score = 0.
                    total_loss = 0.
                    total_mr_stft_loss = 0.
                    total_neg_si_snr_loss = 0.
                    total_batches = 0.

                    progress_bar_train.close()
                    progress_bar_eval = tqdm(
                        desc="Evaluation; steps-{}k".format(int(step_idx/1000)),
                    )
                    for eval_batch in valid_data_loader:
                        clean_audios, noisy_audios = eval_batch
                        clean_audios: torch.Tensor = clean_audios.to(device)
                        noisy_audios: torch.Tensor = noisy_audios.to(device)

                        denoise_audios = model.forward(noisy_audios)

                        mr_stft_loss = mr_stft_loss_fn.forward(denoise_audios, clean_audios)
                        neg_si_snr_loss = neg_si_snr_loss_fn.forward(denoise_audios, clean_audios)

                        loss = 1.0 * mr_stft_loss + 1.0 * neg_si_snr_loss
                        if torch.any(torch.isnan(loss)) or torch.any(torch.isinf(loss)):
                            logger.info(f"found nan or inf in loss.")
                            continue

                        denoise_audios_list_r = list(denoise_audios.detach().cpu().numpy())
                        clean_audios_list_r = list(clean_audios.detach().cpu().numpy())
                        pesq_score = run_pesq_score(clean_audios_list_r, denoise_audios_list_r, sample_rate=config.sample_rate, mode="nb")

                        total_pesq_score += pesq_score
                        total_loss += loss.item()
                        total_mr_stft_loss += mr_stft_loss.item()
                        total_neg_si_snr_loss += neg_si_snr_loss.item()
                        total_batches += 1

                        average_pesq_score = round(total_pesq_score / total_batches, 4)
                        average_loss = round(total_loss / total_batches, 4)
                        average_mr_stft_loss = round(total_mr_stft_loss / total_batches, 4)
                        average_neg_si_snr_loss = round(total_neg_si_snr_loss / total_batches, 4)

                        progress_bar_eval.update(1)
                        progress_bar_eval.set_postfix({
                            "lr": lr_scheduler.get_last_lr()[0],
                            "pesq_score": average_pesq_score,
                            "loss": average_loss,
                            "mr_stft_loss": average_mr_stft_loss,
                            "neg_si_snr_loss": average_neg_si_snr_loss,
                        })

                    total_pesq_score = 0.
                    total_loss = 0.
                    total_mr_stft_loss = 0.
                    total_neg_si_snr_loss = 0.
                    total_batches = 0.

                    progress_bar_eval.close()
                    progress_bar_train = tqdm(
                        initial=progress_bar_train.n,
                        postfix=progress_bar_train.postfix,
                        desc=progress_bar_train.desc,
                    )

                    # save path
                    save_dir = serialization_dir / "steps-{}".format(step_idx)
                    save_dir.mkdir(parents=True, exist_ok=False)

                    # save models
                    model.save_pretrained(save_dir.as_posix())

                    model_list.append(save_dir)
                    if len(model_list) >= args.num_serialized_models_to_keep:
                        model_to_delete: Path = model_list.pop(0)
                        shutil.rmtree(model_to_delete.as_posix())

                    # save metric
                    if best_metric is None:
                        best_epoch_idx = epoch_idx
                        best_step_idx = step_idx
                        best_metric = average_pesq_score
                    elif average_pesq_score >= best_metric:
                        # greater is better.
                        best_epoch_idx = epoch_idx
                        best_step_idx = step_idx
                        best_metric = average_pesq_score
                    else:
                        pass

                    metrics = {
                        "epoch_idx": epoch_idx,
                        "best_epoch_idx": best_epoch_idx,
                        "best_step_idx": best_step_idx,
                        "pesq_score": average_pesq_score,
                        "loss": average_loss,
                        "mr_stft_loss": average_mr_stft_loss,
                        "neg_si_snr_loss": average_neg_si_snr_loss,
                    }
                    metrics_filename = save_dir / "metrics_epoch.json"
                    with open(metrics_filename, "w", encoding="utf-8") as f:
                        json.dump(metrics, f, indent=4, ensure_ascii=False)

                    # save best
                    best_dir = serialization_dir / "best"
                    if best_epoch_idx == epoch_idx and best_step_idx == step_idx:
                        if best_dir.exists():
                            shutil.rmtree(best_dir)
                        shutil.copytree(save_dir, best_dir)

                    # early stop
                    early_stop_flag = False
                    if best_epoch_idx == epoch_idx and best_step_idx == step_idx:
                        patience_count = 0
                    else:
                        patience_count += 1
                        if patience_count >= args.patience:
                            early_stop_flag = True

                    # early stop
                    if early_stop_flag:
                        break

    return


if __name__ == "__main__":
    main()
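
Stage 2 of run.sh invokes this script as:

    python3 step_2_train_model.py \
      --train_dataset "${file_dir}/train.jsonl" \
      --valid_dataset "${file_dir}/valid.jsonl" \
      --serialization_dir "${file_dir}" \
      --config_file yaml/config.yaml

Every `eval_steps` training steps it snapshots to `steps-{N}/`, keeps at most `num_serialized_models_to_keep` snapshots, mirrors the best-PESQ snapshot into `best/`, and stops early after `patience` evaluations without a new best.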
examples/dtln/yaml/config.yaml
ADDED
@@ -0,0 +1,23 @@
model_name: "DTLN"

sample_rate: 8000
fft_size: 256
hop_size: 128
win_type: hann

max_snr_db: 20
min_snr_db: -10

encoder_size: 256

max_epochs: 100
batch_size: 4
num_workers: 4
seed: 1234
eval_steps: 25000

lr: 0.001
lr_scheduler: CosineAnnealingLR
lr_scheduler_kwargs: {}

clip_grad_norm: 10.0
examples/{simple_lstm_irm_aishell → simple_lstm_irm}/run.sh
RENAMED
File without changes

examples/{simple_lstm_irm_aishell → simple_lstm_irm}/step_1_prepare_data.py
RENAMED
File without changes
examples/{simple_lstm_irm_aishell → simple_lstm_irm}/step_2_train_model.py
RENAMED
@@ -15,8 +15,6 @@ import sys
 import shutil
 from typing import List
 
-from torch import dtype
-
 pwd = os.path.abspath(os.path.dirname(__file__))
 sys.path.append(os.path.join(pwd, "../../"))
 
examples/{simple_lstm_irm_aishell → simple_lstm_irm}/step_3_evaluation.py
RENAMED
File without changes
main.py
CHANGED
@@ -6,6 +6,7 @@ import logging
 from pathlib import Path
 import platform
 import shutil
+from typing import Tuple
 import zipfile
 
 import gradio as gr
@@ -83,8 +84,17 @@ def load_denoise_model(infer_cls, **kwargs):
     return infer_engine
 
 
-def when_click_denoise_button(
+def when_click_denoise_button(noisy_audio_file_t = None, noisy_audio_microphone_t = None, engine: str = None):
+    if noisy_audio_file_t is None and noisy_audio_microphone_t is None:
+        raise gr.Error(f"audio file and microphone are both null.")
+    if noisy_audio_file_t is not None and noisy_audio_microphone_t is not None:
+        gr.Warning(f"both audio file and microphone audio are provided; the audio file takes priority.")
+
+    noisy_audio_t: Tuple = noisy_audio_file_t or noisy_audio_microphone_t
+
     sample_rate, signal = noisy_audio_t
+
+    # Test: with the microphone input the reported sample rate is 44100, but the signal is actually sampled at 8000.
     logger.info(f"run denoise; engine: {engine}, sample_rate: {sample_rate}, signal dtype: {signal.dtype}, signal shape: {signal.shape}")
 
     noisy_audio = np.array(signal / (1 << 15), dtype=np.float32)
@@ -140,7 +150,8 @@ def main():
     for filename in examples_dir.glob("**/*.wav"):
         examples.append([
             filename.as_posix(),
-
+            None,
+            denoise_engine_choices[0],
         ])
 
     # ui
@@ -150,7 +161,12 @@ def main():
         with gr.TabItem("denoise"):
             with gr.Row():
                 with gr.Column(variant="panel", scale=5):
-
+                    with gr.Tabs():
+                        with gr.TabItem("file"):
+                            dn_noisy_audio_file = gr.Audio(label="noisy_audio")
+                        with gr.TabItem("microphone"):
+                            dn_noisy_audio_microphone = gr.Audio(sources="microphone", label="noisy_audio")
+
                     dn_engine = gr.Dropdown(choices=denoise_engine_choices, value=denoise_engine_choices[0], label="engine")
                     dn_button = gr.Button(variant="primary")
                 with gr.Column(variant="panel", scale=5):
@@ -158,12 +174,12 @@
 
         dn_button.click(
             when_click_denoise_button,
-            inputs=[
+            inputs=[dn_noisy_audio_file, dn_noisy_audio_microphone, dn_engine],
             outputs=[dn_enhanced_audio]
         )
         gr.Examples(
            examples=examples,
-            inputs=[
+            inputs=[dn_noisy_audio_file, dn_noisy_audio_microphone, dn_engine],
             outputs=[dn_enhanced_audio],
             fn=when_click_denoise_button,
             # cache_examples=True,
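
This is the heart of the commit: two gr.Audio sources in tabs feeding one handler that takes whichever input is non-empty. A stripped-down, self-contained sketch of the same pattern (component and function names here are illustrative, not the app's):

import gradio as gr

def denoise(file_t=None, mic_t=None):
    if file_t is None and mic_t is None:
        raise gr.Error("no audio provided.")
    sample_rate, signal = file_t or mic_t  # the uploaded file wins when both exist
    return sample_rate, signal  # a real handler would enhance `signal` here

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("file"):
            audio_file = gr.Audio(label="noisy_audio")
        with gr.TabItem("microphone"):
            audio_mic = gr.Audio(sources="microphone", label="noisy_audio")
    button = gr.Button(variant="primary")
    enhanced = gr.Audio(label="enhanced_audio")
    button.click(denoise, inputs=[audio_file, audio_mic], outputs=[enhanced])

# demo.launch()

Passing both components as inputs keeps the handler signature stable regardless of which tab is active; Gradio simply sends None for the untouched one.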
toolbox/torchaudio/models/clean_unet/modeling_clean_unet.py
CHANGED
@@ -278,7 +278,7 @@ def main():
     print_size(model, keyword="tsfm")
 
     input_data = torch.ones([4, 1, int(4.5 * 16000)])
-    output = model(input_data)
+    output = model.forward(input_data)
     print(output.shape)
 
     # y = torch.rand([4, 1, int(4.5 * 16000)])
toolbox/torchaudio/models/dfnet/conv_stft.py
CHANGED
@@ -8,7 +8,6 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from scipy.signal import get_window
-from sympy.physics.units import power
 
 
 def init_kernels(nfft: int, win_size: int, hop_size: int, win_type: str = None, inverse=False):
toolbox/torchaudio/models/dtln/__init__.py
ADDED
@@ -0,0 +1,6 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-


if __name__ == "__main__":
    pass
toolbox/torchaudio/models/dtln/configuration_dtln.py
ADDED
@@ -0,0 +1,66 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from toolbox.torchaudio.configuration_utils import PretrainedConfig


class DTLNConfig(PretrainedConfig):
    def __init__(self,
                 sample_rate: int = 8000,
                 fft_size: int = 200,
                 hop_size: int = 80,
                 win_type: str = "hann",

                 encoder_size: int = 256,

                 min_snr_db: float = -10,
                 max_snr_db: float = 20,

                 lr: float = 0.001,
                 lr_scheduler: str = "CosineAnnealingLR",
                 lr_scheduler_kwargs: dict = None,

                 max_epochs: int = 100,
                 clip_grad_norm: float = 10.,
                 seed: int = 1234,

                 num_workers: int = 4,
                 batch_size: int = 4,
                 eval_steps: int = 25000,
                 **kwargs
                 ):
        super(DTLNConfig, self).__init__(**kwargs)
        # transform
        self.sample_rate = sample_rate
        self.fft_size = fft_size
        self.hop_size = hop_size
        self.win_type = win_type

        # model params
        self.encoder_size = encoder_size

        # data snr
        self.min_snr_db = min_snr_db
        self.max_snr_db = max_snr_db

        # train
        self.lr = lr
        self.lr_scheduler = lr_scheduler
        self.lr_scheduler_kwargs = lr_scheduler_kwargs or dict()

        self.max_epochs = max_epochs
        self.clip_grad_norm = clip_grad_norm
        self.seed = seed

        self.num_workers = num_workers
        self.batch_size = batch_size
        self.eval_steps = eval_steps


def main():
    config = DTLNConfig()
    config.to_yaml_file("config.yaml")
    return


if __name__ == "__main__":
    main()
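
The config round-trips through YAML: main() above writes config.yaml, and the training script reads it back. A minimal sketch, assuming `from_pretrained` on this repo's PretrainedConfig accepts a YAML file path, as step_2_train_model.py's usage suggests:

from toolbox.torchaudio.models.dtln.configuration_dtln import DTLNConfig

# values here match examples/dtln/yaml/config.yaml
config = DTLNConfig(fft_size=256, hop_size=128)
config.to_yaml_file("config.yaml")

reloaded = DTLNConfig.from_pretrained("config.yaml")
assert reloaded.fft_size == 256 and reloaded.hop_size == 128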
toolbox/torchaudio/models/dtln/modeling_dtln.py
ADDED
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
"""
|
4 |
+
https://github.com/AkenoSyuRi/DTLNPytorch
|
5 |
+
"""
|
6 |
+
import os
|
7 |
+
from typing import Optional, Union
|
8 |
+
|
9 |
+
import torch
|
10 |
+
import torch.nn as nn
|
11 |
+
from torch.nn import functional as F
|
12 |
+
|
13 |
+
from toolbox.torchaudio.configuration_utils import CONFIG_FILE
|
14 |
+
from toolbox.torchaudio.modules.conv_stft import ConvSTFT, ConviSTFT
|
15 |
+
from toolbox.torchaudio.models.dtln.configuration_dtln import DTLNConfig
|
16 |
+
|
17 |
+
|
18 |
+
class InstantLayerNormalization(nn.Module):
|
19 |
+
"""
|
20 |
+
Class implementing instant layer normalization. It can also be called
|
21 |
+
channel-wise layer normalization and was proposed by
|
22 |
+
Luo & Mesgarani (https://arxiv.org/abs/1809.07454v2)
|
23 |
+
"""
|
24 |
+
|
25 |
+
def __init__(self, channels):
|
26 |
+
super(InstantLayerNormalization, self).__init__()
|
27 |
+
self.epsilon = 1e-7
|
28 |
+
self.gamma = nn.Parameter(torch.ones(1, 1, channels), requires_grad=True)
|
29 |
+
self.beta = nn.Parameter(torch.zeros(1, 1, channels), requires_grad=True)
|
30 |
+
self.register_parameter("gamma", self.gamma)
|
31 |
+
self.register_parameter("beta", self.beta)
|
32 |
+
|
33 |
+
def forward(self, inputs: torch.Tensor):
|
34 |
+
# calculate mean of each frame
|
35 |
+
mean = torch.mean(inputs, dim=-1, keepdim=True)
|
36 |
+
|
37 |
+
# calculate variance of each frame
|
38 |
+
variance = torch.mean(torch.square(inputs - mean), dim=-1, keepdim=True)
|
39 |
+
# calculate standard deviation
|
40 |
+
std = torch.sqrt(variance + self.epsilon)
|
41 |
+
outputs = (inputs - mean) / std
|
42 |
+
# scale with gamma
|
43 |
+
outputs = outputs * self.gamma
|
44 |
+
# add the bias beta
|
45 |
+
outputs = outputs + self.beta
|
46 |
+
# return output
|
47 |
+
return outputs
|
48 |
+
|
49 |
+
|
50 |
+
class SeperationBlock(nn.Module):
|
51 |
+
def __init__(self,
|
52 |
+
input_size: int = 257,
|
53 |
+
hidden_size: int = 128,
|
54 |
+
dropout: float = 0.25,
|
55 |
+
):
|
56 |
+
super(SeperationBlock, self).__init__()
|
57 |
+
self.rnn1 = nn.LSTM(input_size=input_size,
|
58 |
+
hidden_size=hidden_size,
|
59 |
+
num_layers=1,
|
60 |
+
batch_first=True,
|
61 |
+
dropout=0.0,
|
62 |
+
bidirectional=False,
|
63 |
+
)
|
64 |
+
self.rnn2 = nn.LSTM(input_size=hidden_size,
|
65 |
+
hidden_size=hidden_size,
|
66 |
+
num_layers=1,
|
67 |
+
batch_first=True,
|
68 |
+
dropout=0.0,
|
69 |
+
bidirectional=False,
|
70 |
+
)
|
71 |
+
self.drop = nn.Dropout(dropout)
|
72 |
+
|
73 |
+
self.dense = nn.Linear(hidden_size, input_size)
|
74 |
+
self.sigmoid = nn.Sigmoid()
|
75 |
+
|
76 |
+
def forward(self, x: torch.Tensor, in_states: torch.Tensor = None):
|
77 |
+
if in_states is None:
|
78 |
+
hx1 = None
|
79 |
+
hx2 = None
|
80 |
+
else:
|
81 |
+
h1_in, c1_in = in_states[:1, :, :, 0], in_states[:1, :, :, 1]
|
82 |
+
h2_in, c2_in = in_states[1:, :, :, 0], in_states[1:, :, :, 1]
|
83 |
+
hx1 = (h1_in, c1_in)
|
84 |
+
hx2 = (h2_in, c2_in)
|
85 |
+
|
86 |
+
x1, (h1, c1) = self.rnn1.forward(x, hx=hx1)
|
87 |
+
x1 = self.drop(x1)
|
88 |
+
x2, (h2, c2) = self.rnn2.forward(x1, hx=hx2)
|
89 |
+
x2 = self.drop(x2)
|
90 |
+
|
91 |
+
mask = self.dense(x2)
|
92 |
+
mask = self.sigmoid(mask)
|
93 |
+
|
94 |
+
h = torch.cat((h1, h2), dim=0)
|
95 |
+
c = torch.cat((c1, c2), dim=0)
|
96 |
+
out_states = torch.stack((h, c), dim=-1)
|
97 |
+
return mask, out_states
|
98 |
+
|
99 |
+
|
100 |
+
MODEL_FILE = "model.pt"
|
101 |
+
|
102 |
+
|
103 |
+
class DTLNModel(nn.Module):
|
104 |
+
def __init__(self,
|
105 |
+
fft_size: int = 512,
|
106 |
+
hop_size: int = 128,
|
107 |
+
win_type: str = "hamming",
|
108 |
+
encoder_size: int = 256,
|
109 |
+
):
|
110 |
+
super(DTLNModel, self).__init__()
|
111 |
+
+        self.fft_size = fft_size
+        self.hop_size = hop_size
+        self.encoder_size = encoder_size
+
+        self.stft = ConvSTFT(
+            nfft=fft_size,
+            win_size=fft_size,
+            hop_size=hop_size,
+            win_type=win_type,
+            power=None,
+            requires_grad=False
+        )
+        self.istft = ConviSTFT(
+            nfft=fft_size,
+            win_size=fft_size,
+            hop_size=hop_size,
+            win_type=win_type,
+            requires_grad=False
+        )
+
+        self.sep1 = SeperationBlock(input_size=(fft_size // 2 + 1),
+                                    hidden_size=128,
+                                    dropout=0.25,
+                                    )
+
+        self.encoder_conv1 = nn.Conv1d(in_channels=fft_size,
+                                       out_channels=self.encoder_size,
+                                       kernel_size=1,
+                                       stride=1,
+                                       bias=False,
+                                       )
+
+        # self.encoder_norm1 = nn.InstanceNorm1d(num_features=self.encoder_size, eps=1e-7, affine=True)
+        self.encoder_norm1 = InstantLayerNormalization(channels=self.encoder_size)
+
+        self.sep2 = SeperationBlock(input_size=self.encoder_size,
+                                    hidden_size=128,
+                                    dropout=0.25,
+                                    )
+
+        self.decoder_conv1 = nn.Conv1d(in_channels=self.encoder_size,
+                                       out_channels=fft_size,
+                                       kernel_size=1,
+                                       stride=1,
+                                       bias=False,
+                                       )
+
+    def signal_prepare(self, signal: torch.Tensor) -> torch.Tensor:
+        if signal.dim() == 2:
+            signal = torch.unsqueeze(signal, dim=1)
+        _, _, n_samples = signal.shape
+        remainder = (n_samples - self.fft_size) % self.hop_size
+        if remainder > 0:
+            n_samples_pad = self.hop_size - remainder
+            signal = F.pad(signal, pad=(0, n_samples_pad), mode="constant", value=0)
+        return signal, n_samples
+
+    def forward(self,
+                noisy: torch.Tensor,
+                ):
+        noisy, num_samples = self.signal_prepare(noisy)
+        batch_size, _, num_samples_pad = noisy.shape
+        # print(f"num_samples: {num_samples}, num_samples_pad: {num_samples_pad}")
+
+        denoise_frame, _, _ = self.forward_chunk(noisy)
+        denoise = self.denoise_frame_to_denoise(denoise_frame, batch_size, num_samples_pad)
+        # denoise shape: [b, num_samples_pad]
+
+        denoise = denoise[:, :num_samples]
+        # denoise shape: [b, num_samples]
+        return denoise
+
+    def forward_chunk(self,
+                      noisy: torch.Tensor,
+                      in_state1: torch.Tensor = None,
+                      in_state2: torch.Tensor = None,
+                      ):
+        # noisy shape: [b, num_samples]
+        spec = self.stft.forward(noisy)
+        # spec shape: [b, f, t], torch.complex64
+        # t = (num_samples - win_size) / hop_size + 1
+        spec = torch.view_as_real(spec)
+        # spec shape: [b, f, t, 2]
+        real = spec[..., 0]
+        imag = spec[..., 1]
+        mag = torch.sqrt(real ** 2 + imag ** 2)
+        phase = torch.atan2(imag, real)
+        # shape: [b, f, t]
+        mag = mag.permute(0, 2, 1)
+        phase = phase.permute(0, 2, 1)
+        # shape: [b, t, f]
+
+        mask, out_state1 = self.sep1.forward(mag, in_state1)
+        # mask shape: [b, t, f]
+        estimated_mag = mask * mag
+
+        s1_stft = estimated_mag * torch.exp((1j * phase))
+        # s1_stft shape: [b, t, f], torch.complex64
+        y1 = torch.fft.irfft2(s1_stft, dim=-1)
+        # y1 shape: [b, t, fft_size], torch.float32
+        y1 = y1.permute(0, 2, 1)
+        # y1 shape: [b, fft_size, t]
+
+        encoded_f = self.encoder_conv1.forward(y1)
+        # shape: [b, c, t]
+        encoded_f = encoded_f.permute(0, 2, 1)
+        # shape: [b, t, c]
+        encoded_f_norm = self.encoder_norm1.forward(encoded_f)
+        # shape: [b, t, c]
+
+        mask_2, out_state2 = self.sep2.forward(encoded_f_norm, in_state2)
+        # shape: [b, t, c]
+        estimated = mask_2 * encoded_f
+        estimated = estimated.permute(0, 2, 1)
+        # shape: [b, c, t]
+
+        denoise_frame = self.decoder_conv1.forward(estimated)
+        # shape: [b, fft_size, t]
+
+        return denoise_frame, out_state1, out_state2
+
+    def denoise_frame_to_denoise(self, denoise_frame: torch.Tensor, batch_size: int, num_samples: int):
+        # overlap and add
+
+        # denoise_frame shape: [b, fft_size, t]
+        denoise = torch.nn.functional.fold(
+            denoise_frame,
+            output_size=(num_samples, 1),
+            kernel_size=(self.fft_size, 1),
+            padding=(0, 0),
+            stride=(self.hop_size, 1),
+        )
+        # denoise shape: [b, 1, num_samples, 1]
+        denoise = denoise.reshape(batch_size, -1)
+        # denoise shape: [b, num_samples]
+        return denoise
+
+
+class DTLNPretrainedModel(DTLNModel):
+    def __init__(self,
+                 config: DTLNConfig,
+                 ):
+        super(DTLNPretrainedModel, self).__init__(
+            fft_size=config.fft_size,
+            hop_size=config.hop_size,
+            win_type=config.win_type,
+            encoder_size=config.encoder_size,
+        )
+        # keep a reference to the config; save_pretrained below relies on it
+        self.config = config
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        config = DTLNConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+
+        model = cls(config)
+
+        if os.path.isdir(pretrained_model_name_or_path):
+            ckpt_file = os.path.join(pretrained_model_name_or_path, MODEL_FILE)
+        else:
+            ckpt_file = pretrained_model_name_or_path
+
+        with open(ckpt_file, "rb") as f:
+            state_dict = torch.load(f, map_location="cpu", weights_only=True)
+        model.load_state_dict(state_dict, strict=True)
+        return model
+
+    def save_pretrained(self,
+                        save_directory: Union[str, os.PathLike],
+                        state_dict: Optional[dict] = None,
+                        ):
+
+        model = self
+
+        if state_dict is None:
+            state_dict = model.state_dict()
+
+        os.makedirs(save_directory, exist_ok=True)
+
+        # save state dict
+        model_file = os.path.join(save_directory, MODEL_FILE)
+        torch.save(state_dict, model_file)
+
+        # save config
+        config_file = os.path.join(save_directory, CONFIG_FILE)
+        self.config.to_yaml_file(config_file)
+        return save_directory
+
+
+def main():
+    fft_size = 512
+    hop_size = 128
+
+    model = DTLNModel(fft_size=fft_size, hop_size=hop_size)
+
+    noisy = torch.randn(size=(1, 16000), dtype=torch.float32)
+    batch_size, num_samples = noisy.shape
+
+    denoise = model.forward(noisy)
+    print(f"denoise.shape: {denoise.shape}")
+
+    t = (num_samples - fft_size) // hop_size + 1
+
+    denoise_list = list()
+    out_state1 = None
+    out_state2 = None
+    denoise_cache = torch.zeros(size=(batch_size, fft_size - hop_size,), dtype=noisy.dtype)
+    for i in range(t):
+        begin = i * hop_size
+        end = begin + fft_size
+        sub_noisy = noisy[:, begin: end]
+        with torch.no_grad():
+            sub_denoise_frame, out_state1, out_state2 = model.forward_chunk(sub_noisy, out_state1, out_state2)
+        # sub_denoise_frame shape: [b, fft_size, 1]
+        sub_denoise_frame = sub_denoise_frame[:, :, 0]
+        # sub_denoise_frame shape: [b, fft_size]
+
+        # overlap-add: the cache holds the yet-unfinished tail of earlier frames
+        # and is aligned with the first (fft_size - hop_size) samples of this frame
+        sub_denoise_frame[:, :fft_size - hop_size] += denoise_cache
+        denoise_out = sub_denoise_frame[:, :hop_size]
+        denoise_cache = sub_denoise_frame[:, hop_size:]
+        # denoise_cache shape: [b, fft_size - hop_size]
+
+        denoise_list.append(denoise_out)
+
+    # flush the remaining overlap so the total length matches the offline forward
+    denoise_list.append(denoise_cache)
+    denoise = torch.concat(denoise_list, dim=-1)
+    print(f"denoise.shape: {denoise.shape}")
+    return
+
+
+if __name__ == "__main__":
+    main()
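
The chunked loop in main() above reproduces what denoise_frame_to_denoise does offline with F.fold. Below is a minimal self-contained sketch of just that overlap-add bookkeeping, with a random tensor standing in for the model's frame output so it runs without any of the DTLN classes:

import torch
import torch.nn.functional as F

fft_size, hop_size, t = 512, 128, 20
frames = torch.randn(1, fft_size, t)  # stand-in for denoise_frame: [b, fft_size, t]

# offline path: fold performs the overlap-add over all frames at once
num_samples = (t - 1) * hop_size + fft_size
offline = F.fold(
    frames,
    output_size=(num_samples, 1),
    kernel_size=(fft_size, 1),
    stride=(hop_size, 1),
).reshape(1, -1)

# streaming path: keep only the (fft_size - hop_size) overlap as a cache
cache = torch.zeros(1, fft_size - hop_size)
chunks = []
for i in range(t):
    frame = frames[:, :, i].clone()
    frame[:, :fft_size - hop_size] += cache  # add the overlap left by earlier frames
    chunks.append(frame[:, :hop_size])       # the first hop_size samples are final
    cache = frame[:, hop_size:]              # the rest still awaits later frames
chunks.append(cache)                         # flush the tail after the last frame
streaming = torch.cat(chunks, dim=-1)

print(torch.allclose(offline, streaming, atol=1e-6))  # expected: True

The final flush is what makes the chunked output line up sample-for-sample with the fold-based path.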
toolbox/torchaudio/models/dtln/yaml/config-160.yaml
ADDED
@@ -0,0 +1,23 @@
+model_name: "DTLN"
+
+sample_rate: 8000
+fft_size: 160
+hop_size: 80
+win_type: hann
+
+max_snr_db: 20
+min_snr_db: -10
+
+encoder_size: 256
+
+max_epochs: 100
+batch_size: 4
+num_workers: 4
+seed: 1234
+eval_steps: 25000
+
+lr: 0.001
+lr_scheduler: CosineAnnealingLR
+lr_scheduler_kwargs: {}
+
+clip_grad_norm: 10.0
toolbox/torchaudio/models/dtln/yaml/config-256.yaml
ADDED
@@ -0,0 +1,23 @@
+model_name: "DTLN"
+
+sample_rate: 8000
+fft_size: 256
+hop_size: 128
+win_type: hann
+
+max_snr_db: 20
+min_snr_db: -10
+
+encoder_size: 256
+
+max_epochs: 100
+batch_size: 4
+num_workers: 4
+seed: 1234
+eval_steps: 25000
+
+lr: 0.001
+lr_scheduler: CosineAnnealingLR
+lr_scheduler_kwargs: {}
+
+clip_grad_norm: 10.0
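
The two configs differ only in the analysis window. At the 8000 Hz sample rate they declare, the durations work out as follows (my arithmetic, not values stored anywhere in the repo):

# fft_size=160, hop_size=80  -> 20 ms window, 10 ms hop
# fft_size=256, hop_size=128 -> 32 ms window, 16 ms hop
sample_rate = 8000
for fft_size, hop_size in [(160, 80), (256, 128)]:
    print(f"fft_size={fft_size}: "
          f"window {1000 * fft_size / sample_rate:.0f} ms, "
          f"hop {1000 * hop_size / sample_rate:.0f} ms")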
toolbox/torchaudio/models/frcrn/modeling_frcrn.py
CHANGED
@@ -97,9 +97,10 @@ class FRCRN(nn.Module):
             n_samples_pad = self.hop_size - remainder
             noisy = F.pad(noisy, pad=(0, n_samples_pad), mode="constant", value=0)
 
-        # [batch_size, freq_bins * 2,
+        # [batch_size, freq_bins * 2, num_samples]
         cmp_spec = self.stft.forward(noisy)
         # [batch_size, 1, freq_bins * 2, time_steps]
+        # time_steps = (num_samples - win_size) / hop_size + 1
         cmp_spec = torch.unsqueeze(cmp_spec, 1)
 
         # [batch_size, 2, freq_bins, time_steps]
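
A quick worked instance of the time_steps comment added above, using illustrative numbers rather than FRCRN's actual configuration (after the padding applied in the preceding lines, the division is exact):

# e.g. 2 s of 16 kHz audio with win_size=640, hop_size=320 (illustrative only)
num_samples = 32000
win_size, hop_size = 640, 320
time_steps = (num_samples - win_size) // hop_size + 1
print(time_steps)  # 99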
toolbox/torchaudio/models/frcrn/unet.py
CHANGED
@@ -71,6 +71,7 @@ class Encoder(nn.Module):
         self.relu = nn.LeakyReLU(inplace=True)
 
     def forward(self, x: torch.Tensor):
+        # x shape: [b, c, f, t, 2]
         x = self.conv(x)
         x = self.bn(x)
         x = self.relu(x)
@@ -351,7 +352,8 @@ def main():
     # result = unet.forward(x)
     # print(result.shape)
 
-    x = torch.rand(size=(1, 1, 65, 2000, 2))
+    # x = torch.rand(size=(1, 1, 65, 2000, 2))
+    x = torch.rand(size=(1, 1, 65, 200, 2))
     unet = UNet(
         in_channels=1,
         model_complexity=-1,
toolbox/torchaudio/models/simple_lstm_irm/modeling_simple_lstm_irm.py
CHANGED
@@ -38,16 +38,10 @@ class SimpleLstmIRM(nn.Module):
                  num_layers: int = 2,
                  batch_first: bool = True,
                  dropout: float = 0.4,
-                 lookback: int = 3,
-                 lookahead: int = 3,
                  ):
         super(SimpleLstmIRM, self).__init__()
         self.num_bins = num_bins
         self.hidden_size = hidden_size
-        self.lookback = lookback
-        self.lookahead = lookahead
-
-        # self.n_frames = lookback + 1 + lookahead
 
         self.lstm = nn.LSTM(input_size=num_bins,
                             hidden_size=hidden_size,
@@ -75,8 +69,6 @@ class SimpleLstmIRMPretrainedModel(SimpleLstmIRM):
         super(SimpleLstmIRMPretrainedModel, self).__init__(
             num_bins=config.num_bins,
             hidden_size=config.hidden_size,
-            lookback=config.lookback,
-            lookahead=config.lookahead,
         )
         self.config = config
 
toolbox/torchaudio/models/simple_lstm_irm/yaml/config.yaml
CHANGED
@@ -2,15 +2,13 @@ model_name: "simple_lstm_irm"
 
 # spec
 sample_rate: 8000
-n_fft:
-win_length:
+n_fft: 320
+win_length: 320
 hop_length: 80
 
 # model
-num_bins:
-hidden_size:
-num_layers:
+num_bins: 161
+hidden_size: 512
+num_layers: 3
 batch_first: true
-dropout: 0.
-lookback: 3
-lookahead: 3
+dropout: 0.1
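
The filled-in num_bins follows from n_fft: a one-sided STFT with n_fft points has n_fft // 2 + 1 frequency bins, and the IRM needs one mask value per bin. A one-line consistency check:

n_fft = 320
assert n_fft // 2 + 1 == 161  # matches num_bins in the config above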
toolbox/torchaudio/models/tcnn/modeling_tcnn.py
CHANGED
@@ -2,6 +2,8 @@
 # -*- coding: utf-8 -*-
 """
 https://github.com/LXP-Never/TCNN
+https://github.com/LXP-Never/TCNN/blob/main/TCNN_model.py
+https://github.com/HardeyPandya/Temporal-Convolutional-Neural-Network-Single-Channel-Speech-Enhancement
 
 https://ieeexplore.ieee.org/abstract/document/8683634
 
@@ -9,7 +11,339 @@ https://ieeexplore.ieee.org/abstract/document/8683634
 https://github.com/WenzheLiu-Speech/awesome-speech-enhancement
 
 """
+from typing import Union
 
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from torch.nn.common_types import _size_1_t, _size_2_t, _size_3_t
 
-
-
+
+class Chomp1d(nn.Module):
+    def __init__(self, chomp_size: int):
+        super(Chomp1d, self).__init__()
+        self.chomp_size = chomp_size
+
+    def forward(self, x: torch.Tensor):
+        return x[:, :, :-self.chomp_size].contiguous()
+
+
+class DepthwiseSeparableConv(nn.Module):
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: _size_1_t,
+                 stride: _size_1_t = 1,
+                 padding: Union[str, _size_1_t] = 0,
+                 dilation: _size_1_t = 1,
+                 causal: bool = False,
+                 ):
+        super(DepthwiseSeparableConv, self).__init__()
+        # Use `groups` option to implement depthwise convolution
+        self.depthwise_conv = nn.Conv1d(
+            in_channels=in_channels, out_channels=in_channels,
+            kernel_size=kernel_size, stride=stride,
+            padding=padding, dilation=dilation,
+            groups=in_channels,
+            bias=False,
+        )
+        self.chomp1d = Chomp1d(padding) if causal else nn.Identity()
+        self.prelu = nn.PReLU()
+        self.norm = nn.BatchNorm1d(in_channels)
+        self.pointwise_conv = nn.Conv1d(
+            in_channels=in_channels, out_channels=out_channels,
+            kernel_size=1,
+            bias=False,
+        )
+
+    def forward(self, x: torch.Tensor):
+        # x shape: [b, c, t]
+        x = self.depthwise_conv.forward(x)
+        # x shape: [b, c, t_pad]
+        x = self.chomp1d(x)
+        # x shape: [b, c, t]
+        x = self.prelu(x)
+        x = self.norm(x)
+        x = self.pointwise_conv.forward(x)
+        return x
+
+
+class ResBlock(nn.Module):
+    def __init__(self,
+                 in_channels: int,
+                 hidden_channels: int,
+                 kernel_size: _size_1_t,
+                 dilation: _size_1_t = 1,
+                 ):
+        super(ResBlock, self).__init__()
+
+        self.conv1d = nn.Conv1d(in_channels=in_channels, out_channels=hidden_channels, kernel_size=1)
+        self.prelu = nn.PReLU(num_parameters=1)
+        self.norm = nn.BatchNorm1d(num_features=hidden_channels)
+        self.sconv = DepthwiseSeparableConv(
+            in_channels=hidden_channels,
+            out_channels=in_channels,
+            kernel_size=kernel_size,
+            stride=1,
+            padding=(kernel_size - 1) * dilation,
+            dilation=dilation,
+            causal=True,
+        )
+
+    def forward(self, inputs: torch.Tensor):
+        x = inputs
+        # x shape: [b, in_channels, t]
+        x = self.conv1d.forward(x)
+        # x shape: [b, out_channels, t]
+        x = self.prelu(x)
+        x = self.norm(x)
+        # x shape: [b, out_channels, t]
+        x = self.sconv.forward(x)
+        # x shape: [b, in_channels, t]
+        result = x + inputs
+        return result
+
+
+class TCNNBlock(nn.Module):
+    def __init__(self,
+                 in_channels: int,
+                 hidden_channels: int,
+                 kernel_size: int = 3,
+                 init_dilation: int = 2,
+                 num_layers: int = 6
+                 ):
+        super(TCNNBlock, self).__init__()
+        self.layers = nn.ModuleList(modules=[])
+        for i in range(num_layers):
+            dilation_size = init_dilation ** i
+            # in_channels = in_channels if i == 0 else out_channels
+
+            self.layers.append(
+                ResBlock(
+                    in_channels,
+                    hidden_channels,
+                    kernel_size,
+                    dilation=dilation_size,
+                )
+            )
+
+    def forward(self, x: torch.Tensor):
+        for layer in self.layers:
+            # x shape: [b, c, t]
+            x = layer.forward(x)
+            # x shape: [b, c, t]
+        return x
+
+
+class TCNN(nn.Module):
+    def __init__(self):
+        super(TCNN, self).__init__()
+        self.win_size = 320
+        self.hop_size = 160
+
+        self.conv2d_1 = nn.Sequential(
+            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(3, 5), stride=(1, 1), padding=(1, 2)),
+            nn.BatchNorm2d(num_features=16),
+            nn.PReLU()
+        )
+        self.conv2d_2 = nn.Sequential(
+            nn.Conv2d(in_channels=16, out_channels=16, kernel_size=(3, 5), stride=(1, 2), padding=(1, 2)),
+            nn.BatchNorm2d(num_features=16),
+            nn.PReLU()
+        )
+        self.conv2d_3 = nn.Sequential(
+            nn.Conv2d(in_channels=16, out_channels=16, kernel_size=(3, 5), stride=(1, 2), padding=(1, 1)),
+            nn.BatchNorm2d(num_features=16),
+            nn.PReLU()
+        )
+        self.conv2d_4 = nn.Sequential(
+            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=(3, 5), stride=(1, 2), padding=(1, 1)),
+            nn.BatchNorm2d(num_features=32),
+            nn.PReLU()
+        )
+        self.conv2d_5 = nn.Sequential(
+            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(3, 5), stride=(1, 2), padding=(1, 1)),
+            nn.BatchNorm2d(num_features=32),
+            nn.PReLU()
+        )
+        self.conv2d_6 = nn.Sequential(
+            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(3, 5), stride=(1, 2), padding=(1, 1)),
+            nn.BatchNorm2d(num_features=64),
+            nn.PReLU()
+        )
+        self.conv2d_7 = nn.Sequential(
+            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3, 5), stride=(1, 2), padding=(1, 1)),
+            nn.BatchNorm2d(num_features=64),
+            nn.PReLU()
+        )
+
+        # 256 = 64 * 4
+        self.tcnn_block_1 = TCNNBlock(in_channels=256, hidden_channels=512, kernel_size=3, init_dilation=2, num_layers=6)
+        self.tcnn_block_2 = TCNNBlock(in_channels=256, hidden_channels=512, kernel_size=3, init_dilation=2, num_layers=6)
+        self.tcnn_block_3 = TCNNBlock(in_channels=256, hidden_channels=512, kernel_size=3, init_dilation=2, num_layers=6)
+
+        self.dconv2d_7 = nn.Sequential(
+            nn.ConvTranspose2d(in_channels=128, out_channels=64, kernel_size=(3, 5), stride=(1, 2), padding=(1, 1),
+                               output_padding=(0, 0)),
+            nn.BatchNorm2d(num_features=64),
+            nn.PReLU()
+        )
+        self.dconv2d_6 = nn.Sequential(
+            nn.ConvTranspose2d(in_channels=128, out_channels=32, kernel_size=(3, 5), stride=(1, 2), padding=(1, 1),
+                               output_padding=(0, 0)),
+            nn.BatchNorm2d(num_features=32),
+            nn.PReLU()
+        )
+        self.dconv2d_5 = nn.Sequential(
+            nn.ConvTranspose2d(in_channels=64, out_channels=32, kernel_size=(3, 5), stride=(1, 2), padding=(1, 1),
+                               output_padding=(0, 0)),
+            nn.BatchNorm2d(num_features=32),
+            nn.PReLU()
+        )
+        self.dconv2d_4 = nn.Sequential(
+            nn.ConvTranspose2d(in_channels=64, out_channels=16, kernel_size=(3, 5), stride=(1, 2), padding=(1, 1),
+                               output_padding=(0, 0)),
+            nn.BatchNorm2d(num_features=16),
+            nn.PReLU()
+        )
+        self.dconv2d_3 = nn.Sequential(
+            nn.ConvTranspose2d(in_channels=32, out_channels=16, kernel_size=(3, 5), stride=(1, 2), padding=(1, 1),
+                               output_padding=(0, 1)),
+            nn.BatchNorm2d(num_features=16),
+            nn.PReLU()
+        )
+        self.dconv2d_2 = nn.Sequential(
+            nn.ConvTranspose2d(in_channels=32, out_channels=16, kernel_size=(3, 5), stride=(1, 2), padding=(1, 2),
+                               output_padding=(0, 1)),
+            nn.BatchNorm2d(num_features=16),
+            nn.PReLU()
+        )
+        self.dconv2d_1 = nn.Sequential(
+            nn.ConvTranspose2d(in_channels=32, out_channels=1, kernel_size=(3, 5), stride=(1, 1), padding=(1, 2),
+                               output_padding=(0, 0)),
+            nn.BatchNorm2d(num_features=1),
+            nn.PReLU()
+        )
+
+    def signal_prepare(self, signal: torch.Tensor) -> torch.Tensor:
+        if signal.dim() == 2:
+            signal = torch.unsqueeze(signal, dim=1)
+        _, _, n_samples = signal.shape
+        remainder = (n_samples - self.win_size) % self.hop_size
+        if remainder > 0:
+            n_samples_pad = self.hop_size - remainder
+            signal = F.pad(signal, pad=(0, n_samples_pad), mode="constant", value=0)
+        return signal, n_samples
+
+    def forward(self,
+                noisy: torch.Tensor,
+                ):
+        noisy, num_samples = self.signal_prepare(noisy)
+        batch_size, _, num_samples_pad = noisy.shape
+
+        # n_frame = (num_samples_pad - self.win_size) / self.hop_size + 1
+
+        # unfold
+        # noisy shape: [b, 1, num_samples_pad]
+        noisy = noisy.unsqueeze(1)
+        # noisy shape: [b, 1, 1, num_samples_pad]
+        noisy_frame = torch.nn.functional.unfold(
+            input=noisy,
+            kernel_size=(1, self.win_size),
+            padding=(0, 0),
+            stride=(1, self.hop_size),
+        )
+        # noisy_frame shape: [b, win_size, n_frame]
+        noisy_frame = noisy_frame.unsqueeze(1)
+        # noisy_frame shape: [b, 1, win_size, n_frame]
+        noisy_frame = noisy_frame.permute(0, 1, 3, 2)
+        # noisy_frame shape: [b, 1, n_frame, win_size]
+
+        denoise_frame = self.forward_chunk(noisy_frame)
+        # denoise_frame shape: [b, c, n_frame, win_size]
+        denoise_frame = denoise_frame.squeeze(1)
+        # denoise_frame shape: [b, n_frame, win_size]
+        denoise = self.denoise_frame_to_denoise(denoise_frame, batch_size, num_samples_pad)
+        # denoise shape: [b, num_samples_pad]
+
+        denoise = denoise[:, :num_samples]
+        # denoise shape: [b, num_samples]
+        return denoise
+
+    def forward_chunk(self, inputs: torch.Tensor):
+        # inputs shape: [b, c, t, segment_length]
+        conv2d_1 = self.conv2d_1(inputs)
+        conv2d_2 = self.conv2d_2(conv2d_1)
+        conv2d_3 = self.conv2d_3(conv2d_2)
+        conv2d_4 = self.conv2d_4(conv2d_3)
+        conv2d_5 = self.conv2d_5(conv2d_4)
+        conv2d_6 = self.conv2d_6(conv2d_5)
+        conv2d_7 = self.conv2d_7(conv2d_6)
+        # shape: [b, c, t, 4]
+
+        reshape_1 = conv2d_7.permute(0, 1, 3, 2)
+        # shape: [b, c, 4, t]
+        batch_size, C, frame_len, frame_num = reshape_1.shape
+        reshape_1 = reshape_1.reshape(batch_size, C * frame_len, frame_num)
+        # shape: [b, c*4, t]
+
+        tcnn_block_1 = self.tcnn_block_1.forward(reshape_1)
+        tcnn_block_2 = self.tcnn_block_2.forward(tcnn_block_1)
+        tcnn_block_3 = self.tcnn_block_3.forward(tcnn_block_2)
+
+        # shape: [b, c*4, t]
+        reshape_2 = tcnn_block_3.reshape(batch_size, C, frame_len, frame_num)
+        reshape_2 = reshape_2.permute(0, 1, 3, 2)
+        # shape: [b, c, t, 4]
+
+        dconv2d_7 = self.dconv2d_7(torch.cat((conv2d_7, reshape_2), dim=1))
+        dconv2d_6 = self.dconv2d_6(torch.cat((conv2d_6, dconv2d_7), dim=1))
+        dconv2d_5 = self.dconv2d_5(torch.cat((conv2d_5, dconv2d_6), dim=1))
+        dconv2d_4 = self.dconv2d_4(torch.cat((conv2d_4, dconv2d_5), dim=1))
+        dconv2d_3 = self.dconv2d_3(torch.cat((conv2d_3, dconv2d_4), dim=1))
+        dconv2d_2 = self.dconv2d_2(torch.cat((conv2d_2, dconv2d_3), dim=1))
+        dconv2d_1 = self.dconv2d_1(torch.cat((conv2d_1, dconv2d_2), dim=1))
+
+        return dconv2d_1
+
+    def denoise_frame_to_denoise(self, denoise_frame: torch.Tensor, batch_size: int, num_samples: int):
+        # overlap and add
+        # https://github.com/HardeyPandya/Temporal-Convolutional-Neural-Network-Single-Channel-Speech-Enhancement/blob/main/TCNN/util/utils.py#L40
+
+        b, t, f = denoise_frame.shape
+        if f != self.win_size:
+            raise AssertionError
+
+        denoise = torch.zeros(size=(b, num_samples), dtype=denoise_frame.dtype)
+        count = torch.zeros(size=(b, num_samples), dtype=torch.float32)
+
+        start = 0
+        end = start + self.win_size
+        for i in range(t):
+            denoise[..., start:end] += denoise_frame[:, i, :]
+            count[..., start:end] += 1.
+
+            start += self.hop_size
+            end = start + self.win_size
+
+        denoise = denoise / count
+        return denoise
+
+
+def main():
+    model = TCNN()
+
+    x = torch.randn(64, 1, 5, 320)
+    # x = torch.randn(64, 1, 5, 160)
+    y = model.forward_chunk(x)
+    print("output", y.shape)
+
+    noisy = torch.randn(size=(2, 16000), dtype=torch.float32)
+    denoise = model.forward(noisy)
+    print(f"denoise.shape: {denoise.shape}")
+
+    return
+
+
+if __name__ == "__main__":
+    main()
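
Each TCNNBlock above stacks six causal ResBlocks whose dilations double per layer, so temporal context grows geometrically. A back-of-the-envelope receptive-field estimate for the three stacked blocks (my own arithmetic, not a figure quoted in the TCNN paper):

# each causal layer with kernel k and dilation d adds (k - 1) * d frames of
# left context; the dilations here are 2**0 .. 2**5
kernel_size, init_dilation, num_layers, num_blocks = 3, 2, 6, 3
per_block = sum((kernel_size - 1) * init_dilation ** i for i in range(num_layers))
receptive_field = 1 + num_blocks * per_block
print(per_block, receptive_field)  # 126 frames per block, 379 frames in total

With the model's hop_size of 160, 379 frames correspond to roughly 379 * 160 ≈ 60,000 input samples of left context.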
toolbox/torchaudio/models/zip_enhancer/__init__.py
ADDED
@@ -0,0 +1,5 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+if __name__ == '__main__':
+    pass
toolbox/torchaudio/models/zip_enhancer/modeling_zip_enhancer.py
ADDED
@@ -0,0 +1,154 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+"""
+https://arxiv.org/abs/2501.05183
+https://zipenhancer.github.io/ZipEnhancer/
+
+https://modelscope.cn/models/iic/speech_zipenhancer_ans_multiloss_16k_base
+
+https://github.com/boreas-l/zipEnhancer
+"""
+import torch
+import torch.nn as nn
+
+
+class DenseBlockV2(nn.Module):
+    def __init__(self, config, kernel_size=(2, 3), depth=4):
+        super(DenseBlockV2, self).__init__()
+        self.config = config
+        self.depth = depth
+
+        self.dense_block = nn.ModuleList([])
+        for i in range(depth):
+            dil = 2 ** i
+            pad_length = kernel_size[0] + (dil - 1) * (kernel_size[0] - 1) - 1
+            dense_conv = nn.Sequential(
+                nn.ConstantPad2d((1, 1, pad_length, 0), value=0.),
+                nn.Conv2d(
+                    config.dense_channel * (i + 1),
+                    config.dense_channel,
+                    kernel_size,
+                    dilation=(dil, 1)
+                ),
+                nn.InstanceNorm2d(config.dense_channel, affine=True),
+                nn.PReLU(config.dense_channel)
+            )
+            self.dense_block.append(dense_conv)
+
+    def forward(self, x):
+        skip = x
+        # b, c, t, f
+        for i in range(self.depth):
+            _x = skip
+            x = self.dense_block[i](_x)
+            # print(x.size())
+            skip = torch.cat([x, skip], dim=1)
+        return x
+
+
+class DenseEncoder(nn.Module):
+
+    def __init__(self, config, in_channel):
+        super(DenseEncoder, self).__init__()
+        self.config = config
+        self.dense_conv_1 = nn.Sequential(
+            nn.Conv2d(in_channel, config.dense_channel, (1, 1)),
+            nn.InstanceNorm2d(config.dense_channel, affine=True),
+            nn.PReLU(config.dense_channel)
+        )
+
+        self.dense_block = DenseBlockV2(config, depth=4)
+
+        encoder_pad_kersize = (0, 1)
+        # Here pad was originally (0, 0); now changed to (0, 1)
+        self.dense_conv_2 = nn.Sequential(
+            nn.Conv2d(
+                config.dense_channel,
+                config.dense_channel,
+                kernel_size=(1, 3),
+                stride=(1, 2),
+                padding=encoder_pad_kersize
+            ),
+            nn.InstanceNorm2d(config.dense_channel, affine=True),
+            nn.PReLU(config.dense_channel)
+        )
+
+    def forward(self, x):
+        """
+        Forward pass of the DenseEncoder module.
+
+        Args:
+            x (Tensor): Input tensor of shape [B, C=in_channel, T, F].
+
+        Returns:
+            Tensor: Output tensor after passing through the dense encoder. Maybe: [B, C=dense_channel, T, F // 2].
+        """
+        # print("x: {}".format(x.size()))
+        x = self.dense_conv_1(x)  # [b, 64, T, F]
+        if self.dense_block is not None:
+            x = self.dense_block(x)  # [b, 64, T, F]
+        x = self.dense_conv_2(x)  # [b, 64, T, F//2]
+        return x
+
+
+class ZipEnhancer(nn.Module):
+
+    def __init__(self, config):
+        super(ZipEnhancer, self).__init__()
+        self.config = config
+
+        num_tsconformers = config.num_tsconformers
+        self.num_tscblocks = num_tsconformers
+
+        self.dense_encoder = DenseEncoder(config, in_channel=2)
+
+        self.TSConformer = Zipformer2DualPathEncoder(
+            output_downsampling_factor=1,
+            dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)),
+            **config.former_conf
+        )
+
+        self.mask_decoder = MappingDecoder(config, out_channel=config.model_num_spks)
+        self.phase_decoder = PhaseDecoder(config, out_channel=config.model_num_spks)
+
+    def forward(self, noisy_mag, noisy_pha):  # [B, F, T]
+        """
+        Forward pass of the ZipEnhancer module.
+
+        Args:
+            noisy_mag (torch.Tensor): Noisy magnitude input torch.tensor of shape [B, F, T].
+            noisy_pha (torch.Tensor): Noisy phase input torch.tensor of shape [B, F, T].
+
+        Returns:
+            Tuple: denoised magnitude, denoised phase, denoised complex representation,
+            (optional) predicted noise components, and other auxiliary information.
+        """
+        others = dict()
+
+        noisy_mag = noisy_mag.unsqueeze(-1).permute(0, 3, 2, 1)  # [B, 1, T, F]
+        noisy_pha = noisy_pha.unsqueeze(-1).permute(0, 3, 2, 1)  # [B, 1, T, F]
+        x = torch.cat((noisy_mag, noisy_pha), dim=1)  # [B, 2, T, F]
+        x = self.dense_encoder(x)
+
+        # [B, C, T, F]
+        x = self.TSConformer(x)
+
+        pred_mag = self.mask_decoder(x)
+        pred_pha = self.phase_decoder(x)
+        # b, c, t, f -> b, 1, t, f -> b, f, t, 1 -> b, f, t
+        denoised_mag = pred_mag[:, 0, :, :].unsqueeze(1).permute(0, 3, 2, 1).squeeze(-1)
+        # b, f, t
+        denoised_pha = pred_pha[:, 0, :, :].unsqueeze(1).permute(0, 3, 2, 1).squeeze(-1)
+        # b, f, t
+        denoised_com = torch.stack((denoised_mag * torch.cos(denoised_pha),
+                                    denoised_mag * torch.sin(denoised_pha)),
+                                   dim=-1)
+
+        return denoised_mag, denoised_pha, denoised_com, None, others
+
+
+if __name__ == "__main__":
+    pass
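
The magnitude/phase recombination at the end of forward is a plain polar-to-rectangular conversion; a standalone check (shapes are illustrative) that it agrees with torch.polar:

import torch

mag = torch.rand(2, 201, 50)                     # [b, f, t]
pha = torch.rand(2, 201, 50) * 2 * torch.pi - torch.pi

com = torch.stack((mag * torch.cos(pha),
                   mag * torch.sin(pha)), dim=-1)
ref = torch.view_as_real(torch.polar(mag, pha))  # the built-in equivalent
print(torch.allclose(com, ref, atol=1e-6))       # expected: True

Note that Zipformer2DualPathEncoder, MappingDecoder and PhaseDecoder are not defined anywhere in this commit (zipformer.py below is still a stub), so ZipEnhancer itself cannot be instantiated yet.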
toolbox/torchaudio/models/zip_enhancer/scaling.py
ADDED
@@ -0,0 +1,249 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+"""
+https://github.com/boreas-l/zipEnhancer/blob/main/models/layers/scaling.py
+"""
+import logging
+import random
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+
+def logaddexp_onnx(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+    max_value = torch.max(x, y)
+    diff = torch.abs(x - y)
+    return max_value + torch.log1p(torch.exp(-diff))
+
+
+# RuntimeError: Exporting the operator logaddexp to ONNX opset version
+# 14 is not supported. Please feel free to request support or submit
+# a pull request on PyTorch GitHub.
+#
+# The following function is to solve the above error when exporting
+# models to ONNX via torch.jit.trace()
+def logaddexp(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+    if torch.jit.is_scripting():
+        # Note: We cannot use torch.jit.is_tracing() here as it also
+        # matches torch.onnx.export().
+        return torch.logaddexp(x, y)
+    elif torch.onnx.is_in_onnx_export():
+        return logaddexp_onnx(x, y)
+    else:
+        # for torch.jit.trace()
+        return torch.logaddexp(x, y)
+
+
+class PiecewiseLinear(object):
+    """
+    Piecewise linear function, from float to float, specified as a nonempty list of (x, y) pairs
+    with the x values in order. x values < [initial x] or > [final x] are mapped to
+    [initial y], [final y] respectively.
+    """
+
+    def __init__(self, *args):
+        assert len(args) >= 1, len(args)
+        if len(args) == 1 and isinstance(args[0], PiecewiseLinear):
+            self.pairs = list(args[0].pairs)
+        else:
+            self.pairs = [(float(x), float(y)) for x, y in args]
+
+        for x, y in self.pairs:
+            assert isinstance(x, (float, int)), type(x)
+            assert isinstance(y, (float, int)), type(y)
+
+        for i in range(len(self.pairs) - 1):
+            assert self.pairs[i + 1][0] > self.pairs[i][0], (
+                i,
+                self.pairs[i],
+                self.pairs[i + 1],
+            )
+
+    def __str__(self):
+        # e.g. 'PiecewiseLinear((0., 10.), (100., 0.))'
+        return f'PiecewiseLinear({str(self.pairs)[1:-1]})'
+
+    def __call__(self, x):
+        if x <= self.pairs[0][0]:
+            return self.pairs[0][1]
+        elif x >= self.pairs[-1][0]:
+            return self.pairs[-1][1]
+        else:
+            cur_x, cur_y = self.pairs[0]
+            for i in range(1, len(self.pairs)):
+                next_x, next_y = self.pairs[i]
+                if cur_x <= x <= next_x:
+                    return cur_y + (next_y - cur_y) * (x - cur_x) / (next_x - cur_x)
+                cur_x, cur_y = next_x, next_y
+            assert False
+
+    def __mul__(self, alpha):
+        return PiecewiseLinear(*[(x, y * alpha) for x, y in self.pairs])
+
+    def __add__(self, x):
+        if isinstance(x, (float, int)):
+            return PiecewiseLinear(*[(p[0], p[1] + x) for p in self.pairs])
+        s, x = self.get_common_basis(x)
+        return PiecewiseLinear(*[(sp[0], sp[1] + xp[1])
+                                 for sp, xp in zip(s.pairs, x.pairs)])
+
+    def max(self, x):
+        if isinstance(x, (float, int)):
+            x = PiecewiseLinear((0, x))
+        s, x = self.get_common_basis(x, include_crossings=True)
+        return PiecewiseLinear(*[(sp[0], max(sp[1], xp[1]))
+                                 for sp, xp in zip(s.pairs, x.pairs)])
+
+    def min(self, x):
+        if isinstance(x, float) or isinstance(x, int):
+            x = PiecewiseLinear((0, x))
+        s, x = self.get_common_basis(x, include_crossings=True)
+        return PiecewiseLinear(*[(sp[0], min(sp[1], xp[1]))
+                                 for sp, xp in zip(s.pairs, x.pairs)])
+
+    def __eq__(self, other):
+        return self.pairs == other.pairs
+
+    def get_common_basis(self,
+                         p: 'PiecewiseLinear',
+                         include_crossings: bool = False):
+        """
+        Returns (self_mod, p_mod) which are equivalent piecewise linear
+        functions to self and p, but with the same x values.
+
+        p: the other piecewise linear function
+        include_crossings: if true, include in the x values positions
+            where the functions indicated by self and p cross.
+        """
+        assert isinstance(p, PiecewiseLinear), type(p)
+
+        # get sorted x-values without repetition.
+        x_vals = sorted(set([x for x, _ in self.pairs] + [x for x, _ in p.pairs]))
+        y_vals1 = [self(x) for x in x_vals]
+        y_vals2 = [p(x) for x in x_vals]
+
+        if include_crossings:
+            extra_x_vals = []
+            for i in range(len(x_vals) - 1):
+                _compare_results1 = (y_vals1[i] > y_vals2[i])
+                _compare_results2 = (y_vals1[i + 1] > y_vals2[i + 1])
+                if _compare_results1 != _compare_results2:
+                    # if ((y_vals1[i] > y_vals2[i]) !=
+                    #         (y_vals1[i + 1] > y_vals2[i + 1])):
+                    # if the two lines in this subsegment potentially cross each other.
+                    diff_cur = abs(y_vals1[i] - y_vals2[i])
+                    diff_next = abs(y_vals1[i + 1] - y_vals2[i + 1])
+                    # `pos`, between 0 and 1, gives the relative x position,
+                    # with 0 being x_vals[i] and 1 being x_vals[i+1].
+                    pos = diff_cur / (diff_cur + diff_next)
+                    extra_x_val = x_vals[i] + pos * (x_vals[i + 1] - x_vals[i])
+                    extra_x_vals.append(extra_x_val)
+            if len(extra_x_vals) > 0:
+                x_vals = sorted(set(x_vals + extra_x_vals))
+            y_vals1 = [self(x) for x in x_vals]
+            y_vals2 = [p(x) for x in x_vals]
+        return (
+            PiecewiseLinear(*zip(x_vals, y_vals1)),
+            PiecewiseLinear(*zip(x_vals, y_vals2)),
+        )
+
+
+class ScheduledFloat(torch.nn.Module):
+    """
+    This object is a torch.nn.Module only because we want it to show up in [top_level module].modules();
+    it does not have a working forward() function. You are supposed to cast it to float, as
+    in, float(parent_module.whatever), and use it as something like a dropout prob.
+
+    It is a floating point value whose value changes depending on the batch count of the
+    training loop. It is a piecewise linear function where you specify the (x, y) pairs
+    in sorted order on x; x corresponds to the batch index. For batch-index values before the
+    first x or after the last x, we just use the first or last y value.
+
+    Example:
+        self.dropout = ScheduledFloat((0.0, 0.2), (4000.0, 0.0), default=0.0)
+
+    `default` is used when self.batch_count is not set or not in training mode or in
+    torch.jit scripting mode.
+    """
+
+    def __init__(self, *args, default: float = 0.0):
+        super().__init__()
+        # self.batch_count and self.name will be written to in the training loop.
+        self.batch_count = None
+        self.name = None
+        self.default = default
+        self.schedule = PiecewiseLinear(*args)
+
+    def extra_repr(self) -> str:
+        return (
+            f'batch_count={self.batch_count}, schedule={str(self.schedule.pairs[1:-1])}'
+        )
+
+    def __float__(self):
+        batch_count = self.batch_count
+        if (batch_count is None or not self.training
+                or torch.jit.is_scripting() or torch.jit.is_tracing()):
+            return float(self.default)
+        else:
+            ans = self.schedule(self.batch_count)
+            if random.random() < 0.0002:
+                logging.info(
+                    f'ScheduledFloat: name={self.name}, batch_count={self.batch_count}, ans={ans}'
+                )
+            return ans
+
+    def __add__(self, x):
+        if isinstance(x, float) or isinstance(x, int):
+            return ScheduledFloat(self.schedule + x, default=self.default)
+        else:
+            return ScheduledFloat(
+                self.schedule + x.schedule, default=self.default + x.default)
+
+    def max(self, x):
+        if isinstance(x, float) or isinstance(x, int):
+            return ScheduledFloat(self.schedule.max(x), default=self.default)
+        else:
+            return ScheduledFloat(
+                self.schedule.max(x.schedule),
+                default=max(self.default, x.default))
+
+
+FloatLike = Union[float, ScheduledFloat]
+
+
+class SoftmaxFunction(torch.autograd.Function):
+    """
+    Tries to handle half-precision derivatives in a randomized way that should
+    be more accurate for training than the default behavior.
+    """
+
+    @staticmethod
+    def forward(ctx, x: torch.Tensor, dim: int):
+        ans = x.softmax(dim=dim)
+        # if x dtype is float16, x.softmax() returns a float32 because
+        # (presumably) that op does not support float16, and autocast
+        # is enabled.
+        if torch.is_autocast_enabled():
+            ans = ans.to(torch.float16)
+        ctx.save_for_backward(ans)
+        ctx.x_dtype = x.dtype
+        ctx.dim = dim
+        return ans
+
+    @staticmethod
+    def backward(ctx, ans_grad: torch.Tensor):
+        (ans,) = ctx.saved_tensors
+        with torch.cuda.amp.autocast(enabled=False):
+            ans_grad = ans_grad.to(torch.float32)
+            ans = ans.to(torch.float32)
+            x_grad = ans_grad * ans
+            x_grad = x_grad - ans * x_grad.sum(dim=ctx.dim, keepdim=True)
+            return x_grad, None
+
+
+if __name__ == "__main__":
+    pass
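
To see what the ScheduledFloat((0.0, 0.3), (20000.0, 0.1)) dropout schedule used by ZipEnhancer evaluates to over training, the underlying PiecewiseLinear can be probed directly (the import path assumes this repo's layout):

from toolbox.torchaudio.models.zip_enhancer.scaling import PiecewiseLinear

schedule = PiecewiseLinear((0.0, 0.3), (20000.0, 0.1))
for step in [0, 10000, 20000, 50000]:
    print(step, schedule(step))
# 0     -> 0.3   (initial value)
# 10000 -> ~0.2  (midpoint of the linear ramp)
# 20000 -> 0.1
# 50000 -> 0.1   (clamped at the final value)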
toolbox/torchaudio/models/zip_enhancer/zip_enhancer_layer.py
ADDED
@@ -0,0 +1,9 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+"""
+https://github.com/boreas-l/zipEnhancer/blob/main/models/layers/zipenhancer_layer.py
+"""
+
+
+if __name__ == "__main__":
+    pass
toolbox/torchaudio/models/zip_enhancer/zipformer.py
ADDED
@@ -0,0 +1,9 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+"""
+https://github.com/boreas-l/zipEnhancer/blob/main/models/layers/zipformer.py
+"""
+
+
+if __name__ == "__main__":
+    pass
toolbox/torchaudio/modules/conv_stft.py
ADDED
@@ -0,0 +1,149 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+"""
+https://github.com/modelscope/modelscope/blob/master/modelscope/models/audio/ans/conv_stft.py
+"""
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from scipy.signal import get_window
+
+
+def init_kernels(nfft: int, win_size: int, hop_size: int, win_type: str = None, inverse=False):
+    if win_type == "None" or win_type is None:
+        window = np.ones(win_size)
+    else:
+        window = get_window(win_type, win_size, fftbins=True)**0.5
+
+    fourier_basis = np.fft.rfft(np.eye(nfft))[:win_size]
+    real_kernel = np.real(fourier_basis)
+    image_kernel = np.imag(fourier_basis)
+    kernel = np.concatenate([real_kernel, image_kernel], 1).T
+
+    if inverse:
+        kernel = np.linalg.pinv(kernel).T
+
+    kernel = kernel * window
+    kernel = kernel[:, None, :]
+    result = (
+        torch.from_numpy(kernel.astype(np.float32)),
+        torch.from_numpy(window[None, :, None].astype(np.float32))
+    )
+    return result
+
+
+class ConvSTFT(nn.Module):
+
+    def __init__(self,
+                 nfft: int,
+                 win_size: int,
+                 hop_size: int,
+                 win_type: str = "hamming",
+                 power: int = None,
+                 requires_grad: bool = False):
+        super(ConvSTFT, self).__init__()
+
+        if nfft is None:
+            self.nfft = int(2**np.ceil(np.log2(win_size)))
+        else:
+            self.nfft = nfft
+
+        kernel, _ = init_kernels(self.nfft, win_size, hop_size, win_type)
+        self.weight = nn.Parameter(kernel, requires_grad=requires_grad)
+
+        self.win_size = win_size
+        self.hop_size = hop_size
+
+        self.stride = hop_size
+        self.dim = self.nfft
+        self.power = power
+
+    def forward(self, inputs: torch.Tensor):
+        if inputs.dim() == 2:
+            inputs = torch.unsqueeze(inputs, 1)
+
+        matrix = F.conv1d(inputs, self.weight, stride=self.stride)
+        dim = self.dim // 2 + 1
+        real = matrix[:, :dim, :]
+        imag = matrix[:, dim:, :]
+        spec = torch.complex(real, imag)
+        # spec shape: [b, f, t], torch.complex64
+
+        if self.power is None:
+            return spec
+        elif self.power == 1:
+            mags = torch.sqrt(real**2 + imag**2)
+            # phase = torch.atan2(imag, real)
+            return mags
+        elif self.power == 2:
+            power = real**2 + imag**2
+            return power
+        else:
+            raise AssertionError
+
+
+class ConviSTFT(nn.Module):
+
+    def __init__(self,
+                 win_size: int,
+                 hop_size: int,
+                 nfft: int = None,
+                 win_type: str = "hamming",
+                 requires_grad: bool = False):
+        super(ConviSTFT, self).__init__()
+        if nfft is None:
+            self.nfft = int(2**np.ceil(np.log2(win_size)))
+        else:
+            self.nfft = nfft
+
+        kernel, window = init_kernels(self.nfft, win_size, hop_size, win_type, inverse=True)
+        self.weight = nn.Parameter(kernel, requires_grad=requires_grad)
+
+        self.win_size = win_size
+        self.hop_size = hop_size
+        self.win_type = win_type
+
+        self.stride = hop_size
+        self.dim = self.nfft
+
+        self.register_buffer("window", window)
+        self.register_buffer("enframe", torch.eye(win_size)[:, None, :])
+
+    def forward(self,
+                inputs: torch.Tensor):
+        """
+        :param inputs: torch.Tensor, shape: [b, f, t]
+        :return:
+        """
+        inputs = torch.view_as_real(inputs)
+        matrix = torch.concat(tensors=[inputs[..., 0], inputs[..., 1]], dim=1)
+
+        waveform = F.conv_transpose1d(matrix, self.weight, stride=self.stride)
+
+        # this is from torch-stft: https://github.com/pseeth/torch-stft
+        t = self.window.repeat(1, 1, matrix.size(-1))**2
+        coff = F.conv_transpose1d(t, self.enframe, stride=self.stride)
+        waveform = waveform / (coff + 1e-8)
+        return waveform
+
+
+def main():
+    stft = ConvSTFT(nfft=512, win_size=512, hop_size=200, power=None)
+    istft = ConviSTFT(nfft=512, win_size=512, hop_size=200)
+
+    mixture = torch.rand(size=(1, 8000*40), dtype=torch.float32)
+
+    spec = stft.forward(mixture)
+    # shape: [batch_size, freq_bins, time_steps]
+    print(f"spec.shape: {spec.shape}, spec.dtype: {spec.dtype}")
+
+    waveform = istft.forward(spec)
+    # shape: [batch_size, channels, num_samples]
+    print(f"waveform.shape: {waveform.shape}, waveform.dtype: {waveform.dtype}")
+
+    return
+
+
+if __name__ == "__main__":
+    main()
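
Beyond the shape check in main(), a round-trip test is a useful sanity check for this STFT pair: away from the signal edges, istft(stft(x)) should approximately reproduce x. The margin and tolerance below are my own choices, not values from the repo:

import torch

from toolbox.torchaudio.modules.conv_stft import ConvSTFT, ConviSTFT

stft = ConvSTFT(nfft=512, win_size=512, hop_size=200, power=None)
istft = ConviSTFT(nfft=512, win_size=512, hop_size=200)

x = torch.rand(size=(1, 16000), dtype=torch.float32)
y = istft.forward(stft.forward(x)).squeeze(1)

# compare only the interior: the first/last window suffers edge effects, and the
# inverse is slightly shorter because trailing samples that do not fill a whole
# frame are dropped by the forward transform
n = min(x.shape[-1], y.shape[-1])
margin = 512
print(torch.allclose(x[:, margin:n - margin], y[:, margin:n - margin], atol=1e-3))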
toolbox/torchaudio/modules/erb_bands.py
DELETED
@@ -1,124 +0,0 @@
-#!/usr/bin/python3
-# -*- coding: utf-8 -*-
-import math
-
-import numpy as np
-
-
-def freq2erb(freq_hz: float) -> float:
-    """
-    https://www.cnblogs.com/LXP-Never/p/16011229.html
-    1 / (24.7 * 9.265) = 0.00436976
-    """
-    return 9.265 * math.log(freq_hz / (24.7 * 9.265) + 1)
-
-
-def erb2freq(n_erb: float) -> float:
-    return 24.7 * 9.265 * (math.exp(n_erb / 9.265) - 1)
-
-
-def get_erb_widths(sample_rate: int, fft_size: int, erb_bins: int, min_freq_bins_for_erb: int) -> np.ndarray:
-    """
-    https://github.com/Rikorose/DeepFilterNet/blob/main/libDF/src/lib.rs
-    :param sample_rate:
-    :param fft_size:
-    :param erb_bins: number of erb (Equivalent Rectangular Bandwidth) bands.
-    :param min_freq_bins_for_erb: Minimum number of frequency bands per erb band
-    :return:
-    """
-    nyq_freq = sample_rate / 2.
-    freq_width: float = sample_rate / fft_size
-
-    min_erb: float = freq2erb(0.)
-    max_erb: float = freq2erb(nyq_freq)
-
-    erb = [0] * erb_bins
-    step = (max_erb - min_erb) / erb_bins
-
-    prev_freq_bin = 0
-    freq_over = 0
-    for i in range(1, erb_bins + 1):
-        f = erb2freq(min_erb + i * step)
-        freq_bin = int(round(f / freq_width))
-        freq_bins = freq_bin - prev_freq_bin - freq_over
-
-        if freq_bins < min_freq_bins_for_erb:
-            freq_over = min_freq_bins_for_erb - freq_bins
-            freq_bins = min_freq_bins_for_erb
-        else:
-            freq_over = 0
-        erb[i - 1] = freq_bins
-        prev_freq_bin = freq_bin
-
-    erb[erb_bins - 1] += 1
-    too_large = sum(erb) - (fft_size / 2 + 1)
-    if too_large > 0:
-        erb[erb_bins - 1] -= too_large
-    return np.array(erb, dtype=np.uint64)
-
-
-def get_erb_filter_bank(erb_widths: np.ndarray,
-                        sample_rate: int,
-                        normalized: bool = True,
-                        inverse: bool = False,
-                        ):
-    num_freq_bins = int(np.sum(erb_widths))
-    num_erb_bins = len(erb_widths)
-
-    fb: np.ndarray = np.zeros(shape=(num_freq_bins, num_erb_bins))
-
-    points = np.cumsum([0] + erb_widths.tolist()).astype(int)[:-1]
-    for i, (b, w) in enumerate(zip(points.tolist(), erb_widths.tolist())):
-        fb[b: b + w, i] = 1
-
-    if inverse:
-        fb = fb.T
-        if not normalized:
-            fb /= np.sum(fb, axis=1, keepdims=True)
-    else:
-        if normalized:
-            fb /= np.sum(fb, axis=0)
-    return fb
-
-
-def spec2erb(spec: np.ndarray, erb_fb: np.ndarray, db: bool = True):
-    """
-    ERB filterbank and transform to decibel scale.
-
-    :param spec: Spectrum of shape [B, C, T, F].
-    :param erb_fb: ERB filterbank array of shape [B] containing the ERB widths,
-        where B are the number of ERB bins.
-    :param db: Whether to transform the output into decibel scale. Defaults to `True`.
-    :return:
-    """
-    # complex spec to power spec. (real * real + image * image)
-    spec_ = np.abs(spec) ** 2
-
-    # spec to erb feature.
-    erb_feat = np.matmul(spec_, erb_fb)
-
-    if db:
-        erb_feat = 10 * np.log10(erb_feat + 1e-10)
-
-    erb_feat = np.array(erb_feat, dtype=np.float32)
-    return erb_feat
-
-
-def main():
-    erb_widths = get_erb_widths(
-        sample_rate=8000,
-        fft_size=512,
-        erb_bins=32,
-        min_freq_bins_for_erb=2,
-    )
-    erb_fb = get_erb_filter_bank(
-        erb_widths=erb_widths,
-        sample_rate=8000,
-    )
-    print(erb_fb.shape)
-
-    return
-
-
-if __name__ == "__main__":
-    main()