Spaces:

qgyd2021
/

cc_denoise

Running

App Files Files Community

HoneyTian committed on Mar 3

Commit

365fc03

1 Parent(s): 6de113d

update

Browse files

Files changed (8) hide show

examples/nx_clean_unet/run.sh +1 -1
examples/nx_clean_unet/step_3_evaluation.py +54 -1
examples/nx_clean_unet/yaml/config.yaml +4 -4
toolbox/torchaudio/models/nx_clean_unet/enhanced_audio.wav +0 -0
toolbox/torchaudio/models/nx_clean_unet/inference_nx_clean_unet.py +95 -0
toolbox/torchaudio/models/nx_clean_unet/modeling_nx_clean_unet.py +38 -0
toolbox/torchaudio/models/nx_clean_unet/transformer/transformer.py +9 -7
toolbox/torchaudio/models/nx_clean_unet/yaml/config.yaml +14 -6

examples/nx_clean_unet/run.sh CHANGED Viewed

@@ -12,7 +12,7 @@ sh run.sh --stage 3 --stop_stage 3 --system_version centos --file_folder_name fi
 --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise" \
 --speech_dir "/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train"
-sh run.sh --stage 2 --stop_stage 2 --system_version centos --file_folder_name file_dir2 --final_model_name nx-clean-unet-aishell-20250228 \
 --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise" \
 --speech_dir "/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train" \
 --max_epochs 100

 --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise" \
 --speech_dir "/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train"
+sh run.sh --stage 2 --stop_stage 2 --system_version centos --file_folder_name file_dir --final_model_name nx-clean-unet-aishell-20250228 \
 --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise" \
 --speech_dir "/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train" \
 --max_epochs 100

examples/nx_clean_unet/step_3_evaluation.py CHANGED Viewed

@@ -1,6 +1,59 @@
 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
 if __name__ == '__main__':
-    pass

 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
+import argparse
+import logging
+import os
+from pathlib import Path
+import sys
+import uuid
+pwd = os.path.abspath(os.path.dirname(__file__))
+sys.path.append(os.path.join(pwd, "../../"))
+import librosa
+import numpy as np
+import pandas as pd
+from scipy.io import wavfile
+import torch
+import torch.nn as nn
+import torchaudio
+from tqdm import tqdm
+from toolbox.torchaudio.models.mpnet.configuration_mpnet import MPNetConfig
+from toolbox.torchaudio.models.mpnet.modeling_mpnet import MPNetPretrainedModel
+from toolbox.torchaudio.models.mpnet.utils import mag_pha_stft, mag_pha_istft
+def get_args():
+    """
+    Parse the command line arguments of the evaluation script.
+    :return: argparse.Namespace with the attributes valid_dataset, model_dir,
+             evaluation_audio_dir (all str) and limit (int).
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--valid_dataset", default="valid.xlsx", type=str)
+    parser.add_argument("--model_dir", default="serialization_dir/best", type=str)
+    parser.add_argument("--evaluation_audio_dir", default="evaluation_audio_dir", type=str)
+    parser.add_argument("--limit", default=10, type=int)
+    args = parser.parse_args()
+    return args
+def logging_config():
+    """
+    Configure root logging at INFO level and return this module's logger.
+    :return: logging.Logger named after the current module (``__name__``).
+    """
+    fmt = "%(asctime)s - %(name)s - %(levelname)s  %(filename)s:%(lineno)d >  %(message)s"
+    logging.basicConfig(format=fmt,
+                        datefmt="%m/%d/%Y %H:%M:%S",
+                        level=logging.INFO)
+    # NOTE(review): this handler is configured but never attached to any logger,
+    # so it has no effect; output comes from the handler installed by basicConfig.
+    stream_handler = logging.StreamHandler()
+    stream_handler.setLevel(logging.INFO)
+    stream_handler.setFormatter(logging.Formatter(fmt))
+    logger = logging.getLogger(__name__)
+    return logger
+def main():
+    # Placeholder entry point: it returns immediately and does not yet call
+    # get_args() or logging_config().
+    return
 if __name__ == '__main__':
+    main()

examples/nx_clean_unet/yaml/config.yaml CHANGED Viewed

@@ -12,13 +12,13 @@ down_sampling_hidden_channels: 64
 down_sampling_kernel_size: 4
 down_sampling_stride: 2
-tsfm_hidden_size: 256
 tsfm_attention_heads: 4
 tsfm_num_blocks: 6
 tsfm_dropout_rate: 0.1
-tsfm_max_length: 1024
-tsfm_chunk_size: 1
-tsfm_num_left_chunks: 128
 discriminator_dim: 32
 discriminator_in_channel: 2

 down_sampling_kernel_size: 4
 down_sampling_stride: 2
+tsfm_hidden_size: 64
 tsfm_attention_heads: 4
 tsfm_num_blocks: 6
 tsfm_dropout_rate: 0.1
+tsfm_max_length: 5120
+tsfm_chunk_size: 4
+tsfm_num_left_chunks: 64
 discriminator_dim: 32
 discriminator_in_channel: 2

toolbox/torchaudio/models/nx_clean_unet/enhanced_audio.wav ADDED Viewed

Binary file (417 kB). View file

toolbox/torchaudio/models/nx_clean_unet/inference_nx_clean_unet.py ADDED Viewed

	@@ -0,0 +1,95 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import logging
+from pathlib import Path
+import shutil
+import tempfile
+import zipfile
+import librosa
+import numpy as np
+import torch
+import torchaudio
+from project_settings import project_path
+from toolbox.torchaudio.models.nx_clean_unet.configuration_nx_clean_unet import NXCleanUNetConfig
+from toolbox.torchaudio.models.nx_clean_unet.modeling_nx_clean_unet import NXCleanUNetPretrainedModel, MODEL_FILE
+logger = logging.getLogger("toolbox")
+class InferenceNXCleanUNet(object):
+    def __init__(self, pretrained_model_path_or_zip_file: str, device: str = "cpu"):
+        self.pretrained_model_path_or_zip_file = pretrained_model_path_or_zip_file
+        self.device = torch.device(device)
+        logger.info(f"loading model; model_file: {self.pretrained_model_path_or_zip_file}")
+        config, model = self.load_models(self.pretrained_model_path_or_zip_file)
+        logger.info(f"model loading completed; model_file: {self.pretrained_model_path_or_zip_file}")
+        self.config = config
+        self.model = model
+        self.model.to(device)
+        self.model.eval()
+    def load_models(self, model_path: str):
+        model_path = Path(model_path)
+        if model_path.name.endswith(".zip"):
+            with zipfile.ZipFile(model_path.as_posix(), "r") as f_zip:
+                out_root = Path(tempfile.gettempdir()) / "nx_denoise"
+                out_root.mkdir(parents=True, exist_ok=True)
+                f_zip.extractall(path=out_root)
+            model_path = out_root / model_path.stem
+        config = NXCleanUNetConfig.from_pretrained(
+            pretrained_model_name_or_path=model_path.as_posix(),
+        )
+        model = NXCleanUNetPretrainedModel.from_pretrained(
+            pretrained_model_name_or_path=model_path.as_posix(),
+        )
+        model.to(self.device)
+        model.eval()
+        shutil.rmtree(model_path)
+        return config, model
+    def enhancement_by_tensor(self, noisy_audio: torch.Tensor) -> torch.Tensor:
+        if torch.max(noisy_audio) > 1 or torch.min(noisy_audio) < -1:
+            raise AssertionError(f"The value range of audio samples should be between -1 and 1.")
+        # noisy_audio shape: [batch_size, num_samples]
+        noisy_audios = noisy_audio.to(self.device)
+        with torch.no_grad():
+            enhanced_audios = self.model.forward_chunk_by_chunk(noisy_audios)
+            # enhanced_audio shape: [batch_size, n_samples]
+            # enhanced_audios = torch.squeeze(enhanced_audios, dim=1)
+        enhanced_audio = enhanced_audios[0]
+        # enhanced_audio shape: [num_samples,]
+        return enhanced_audio
+def main():
+    """
+    Demo: enhance one example recording and write the result to enhanced_audio.wav.
+    """
+    # NOTE(review): a Path is passed here although the constructor annotates str;
+    # load_models wraps the value in Path(), so both work.
+    model_zip_file = project_path / "trained_models/nx-clean-unet-44-epoch.zip"
+    infer_nx_clean_unet = InferenceNXCleanUNet(model_zip_file)
+    sample_rate = 8000
+    noisy_audio_file = project_path / "data/examples/ai_agent/dfaaf264-b5e3-4ca2-b5cb-5b6d637d962d_section_3.wav"
+    # librosa resamples the file to sample_rate while loading.
+    noisy_audio, _ = librosa.load(
+        noisy_audio_file.as_posix(),
+        sr=sample_rate,
+    )
+    # noisy_audio = noisy_audio[int(7*sample_rate):int(9*sample_rate)]
+    noisy_audio = torch.tensor(noisy_audio, dtype=torch.float32)
+    # Add a leading batch dimension, as enhancement_by_tensor expects [batch_size, num_samples].
+    noisy_audio = noisy_audio.unsqueeze(dim=0)
+    enhanced_audio = infer_nx_clean_unet.enhancement_by_tensor(noisy_audio)
+    # The output path is relative, so the file lands in the current working directory.
+    filename = "enhanced_audio.wav"
+    torchaudio.save(filename, enhanced_audio.detach().cpu().unsqueeze(dim=0), sample_rate)
+    return
+if __name__ == '__main__':
+    main()

toolbox/torchaudio/models/nx_clean_unet/modeling_nx_clean_unet.py CHANGED Viewed

@@ -213,9 +213,47 @@ class NXCleanUNet(nn.Module):
         # enhanced_audios shape: [batch_size, 1, n_samples]
         enhanced_audios = torch.squeeze(enhanced_audios, dim=1)
         return enhanced_audios
 MODEL_FILE = "generator.pt"

         # enhanced_audios shape: [batch_size, 1, n_samples]
         enhanced_audios = torch.squeeze(enhanced_audios, dim=1)
+        # enhanced_audios shape: [batch_size, n_samples]
         return enhanced_audios
+    def forward_chunk_by_chunk(self, noisy_audios: torch.Tensor):
+        """
+        Enhance audio, running the transformer through its forward_chunk_by_chunk method.
+        The input is zero-padded on the right to the length returned by get_padding_length,
+        passed through down_sampling, the transformer and up_sampling, and then cut
+        back to the original number of samples.
+        :param noisy_audios: shape [batch_size, n_samples].
+        :return: enhanced audios, shape [batch_size, n_samples].
+        """
+        # noisy_audios shape: [batch_size, n_samples]
+        noisy_audios = torch.unsqueeze(noisy_audios, dim=1)
+        # noisy_audios shape: [batch_size, 1, n_samples]
+        n_samples = noisy_audios.shape[-1]
+        # NOTE(review): get_padding_length is defined outside this view; presumably it
+        # returns a length >= n_samples that suits the down-sampling stack - confirm.
+        padded_length = get_padding_length(
+            n_samples,
+            num_layers=self.config.down_sampling_num_layers,
+            kernel_size=self.config.down_sampling_kernel_size,
+            stride=self.config.down_sampling_stride,
+        )
+        noisy_audios_padded = F.pad(input=noisy_audios, pad=(0, padded_length - n_samples), mode="constant", value=0)
+        bottle_neck = self.down_sampling.forward(noisy_audios_padded)
+        # bottle_neck shape: [batch_size, channels, time_steps]
+        bottle_neck = torch.transpose(bottle_neck, dim0=-2, dim1=-1)
+        # bottle_neck shape: [batch_size, time_steps, input_size]
+        bottle_neck = self.transformer.forward_chunk_by_chunk(bottle_neck)
+        # bottle_neck shape: [batch_size, time_steps, input_size]
+        bottle_neck = torch.transpose(bottle_neck, dim0=-2, dim1=-1)
+        # bottle_neck shape: [batch_size, channels, time_steps]
+        enhanced_audios = self.up_sampling.forward(bottle_neck)
+        # Drop the samples that correspond to the right padding added above.
+        enhanced_audios = enhanced_audios[:, :, :n_samples]
+        # enhanced_audios shape: [batch_size, 1, n_samples]
+        enhanced_audios = torch.squeeze(enhanced_audios, dim=1)
+        # enhanced_audios shape: [batch_size, n_samples]
+        return enhanced_audios
 MODEL_FILE = "generator.pt"

toolbox/torchaudio/models/nx_clean_unet/transformer/transformer.py CHANGED Viewed

@@ -509,13 +509,14 @@ class TransformerEncoder(nn.Module):
         # position_embedding shape: [1, time_steps, hidden_size]
         r_att_cache = []
-        for encoder_layer in self.encoder_layer_list:
             xs, new_att_cache = encoder_layer.forward(
                 x=xs, mask=attention_mask,
                 position_embedding=position_embedding,
-                attention_cache=attention_cache,
             )
             r_att_cache.append(new_att_cache[:, :, self.chunk_size:, :])
         r_att_cache = torch.cat(r_att_cache, dim=0)
@@ -528,8 +529,9 @@ class TransformerEncoder(nn.Module):
         batch_size, time_steps, _ = xs.shape
-        offset = 0
-        attention_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device)
         attention_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool, device=xs.device)
         outputs = []
@@ -538,15 +540,15 @@ class TransformerEncoder(nn.Module):
             end = begin + self.chunk_size
             chunk_xs = xs[:, begin:end, :]
-            ys, att_cache = self.forward_chunk(
                 xs=chunk_xs, attention_mask=attention_mask,
-                offset=offset, attention_cache=attention_cache
             )
             # xs shape: [batch_size, chunk_size, hidden_size]
             ys = self.output_linear.forward(ys)
             # xs shape: [batch_size, chunk_size, input_size]
-            offset += self.chunk_size
             outputs.append(ys)
         ys = torch.cat(outputs, 1)

         # position_embedding shape: [1, time_steps, hidden_size]
         r_att_cache = []
+        for idx, encoder_layer in enumerate(self.encoder_layer_list):
             xs, new_att_cache = encoder_layer.forward(
                 x=xs, mask=attention_mask,
                 position_embedding=position_embedding,
+                attention_cache=attention_cache[idx: idx+1],
             )
             r_att_cache.append(new_att_cache[:, :, self.chunk_size:, :])
+            # r_att_cache.append(new_att_cache)
         r_att_cache = torch.cat(r_att_cache, dim=0)
         batch_size, time_steps, _ = xs.shape
+        # [num_blocks, attention_heads, num_left_chunks, dim]
+        # attention_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device)
+        attention_cache: torch.Tensor = torch.zeros((6, 8, 128, 256),  device=xs.device)
         attention_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool, device=xs.device)
         outputs = []
             end = begin + self.chunk_size
             chunk_xs = xs[:, begin:end, :]
+            ys, attention_cache = self.forward_chunk(
                 xs=chunk_xs, attention_mask=attention_mask,
+                offset=0, attention_cache=attention_cache
             )
             # xs shape: [batch_size, chunk_size, hidden_size]
             ys = self.output_linear.forward(ys)
             # xs shape: [batch_size, chunk_size, input_size]
             outputs.append(ys)
         ys = torch.cat(outputs, 1)

toolbox/torchaudio/models/nx_clean_unet/yaml/config.yaml CHANGED Viewed

@@ -6,21 +6,29 @@ n_fft: 512
 win_size: 200
 hop_size: 80
 down_sampling_num_layers: 5
 down_sampling_in_channels: 1
 down_sampling_hidden_channels: 64
 down_sampling_kernel_size: 4
 down_sampling_stride: 2
-tsfm_hidden_size: 1024
-tsfm_attention_heads: 8
 tsfm_num_blocks: 6
 tsfm_dropout_rate: 0.1
-tsfm_max_length: 1024
-tsfm_chunk_size: 1
-tsfm_num_left_chunks: 128
-discriminator_dim: 16
 discriminator_in_channel: 2
 compress_factor: 0.3

 win_size: 200
 hop_size: 80
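+# The overall down-sampling factor is 2**down_sampling_num_layers.
+# For example, 2**5 = 32 means that 32 input samples become one time step after down-sampling,
+# so one step lasts 32 / sample_rate = 0.004 seconds (which implies sample_rate = 8000).
+# Hence tsfm_chunk_size=4 corresponds to 16 ms and tsfm_chunk_size=8 to 32 ms.
+# Assuming the model looks back about 1 second each time:
+# tsfm_chunk_size=1, tsfm_num_left_chunks: 256
+# tsfm_chunk_size=4, tsfm_num_left_chunks: 64
+# tsfm_chunk_size=8, tsfm_num_left_chunks: 32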
 down_sampling_num_layers: 5
 down_sampling_in_channels: 1
 down_sampling_hidden_channels: 64
 down_sampling_kernel_size: 4
 down_sampling_stride: 2
+tsfm_hidden_size: 64
+tsfm_attention_heads: 4
 tsfm_num_blocks: 6
 tsfm_dropout_rate: 0.1
+tsfm_max_length: 5120
+tsfm_chunk_size: 4
+tsfm_num_left_chunks: 64
+discriminator_dim: 32
 discriminator_in_channel: 2
 compress_factor: 0.3