Spaces:

qgyd2021
/

nx_denoise

Running

File size: 9,630 Bytes

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import logging
import os
from pathlib import Path
import sys
import uuid

# Directory containing this script; its grandparent directory is appended to
# sys.path so that the `toolbox` package imported below can be resolved
# (presumably the repository root -- confirm against the project layout).
pwd = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(pwd, "../../"))

import librosa
import numpy as np
import pandas as pd
from scipy.io import wavfile
import torch
import torch.nn as nn
import torchaudio
from tqdm import tqdm

from toolbox.torchaudio.models.spectrum_dfnet.modeling_spectrum_dfnet import SpectrumDfNetPretrainedModel


def get_args():
    """
    Parse the command-line options of the evaluation script.

    Options (all optional):
      --valid_dataset         spreadsheet read with pandas.read_excel in main().
      --model_dir             directory handed to the model's from_pretrained().
      --evaluation_audio_dir  directory under which the audio files are written.
      --limit                 integer bound on the evaluation loop in main().

    :return: argparse.Namespace holding the parsed values.
    """
    option_table = (
        ("--valid_dataset", "valid.xlsx", str),
        ("--model_dir", "serialization_dir/best", str),
        ("--evaluation_audio_dir", "evaluation_audio_dir", str),
        ("--limit", 10, int),
    )

    parser = argparse.ArgumentParser()
    for flag, default_value, value_type in option_table:
        parser.add_argument(flag, default=default_value, type=value_type)

    return parser.parse_args()


def logging_config():
    """
    Configure root logging at INFO level and return this module's logger.

    logging.basicConfig() attaches a stream handler to the root logger (when
    the root logger has no handlers yet), so no additional StreamHandler is
    created here: attaching a second one would emit every record twice.

    :return: logging.Logger named after this module.
    """
    fmt = "%(asctime)s - %(name)s - %(levelname)s  %(filename)s:%(lineno)d >  %(message)s"

    logging.basicConfig(format=fmt,
                        datefmt="%m/%d/%Y %H:%M:%S",
                        level=logging.INFO)

    return logging.getLogger(__name__)


def mix_speech_and_noise(speech: np.ndarray, noise: np.ndarray, snr_db: float):
    """
    Mix a speech clip with a noise clip at the requested signal-to-noise ratio.

    Both clips are truncated to the length of the shorter one. The noise is
    rescaled so that mean(speech ** 2) / mean(scaled_noise ** 2) equals
    10 ** (snr_db / 10), then added to the speech.

    If the (truncated) noise is empty or entirely zero it cannot be rescaled;
    the truncated speech is returned unchanged instead of an all-NaN array.

    :param speech: 1-D waveform (the caller passes float32 samples).
    :param noise: 1-D waveform of the noise.
    :param snr_db: target speech-to-noise power ratio in decibels.
    :return: np.ndarray of length min(len(speech), len(noise)).
    """
    length = min(len(speech), len(noise))
    speech = np.asarray(speech)[:length]
    noise = np.asarray(noise)[:length]

    noise_energy = np.mean(np.square(noise)) if length > 0 else 0.0
    if noise_energy == 0:
        # Dividing by a zero RMS would turn every sample of the mix into NaN.
        return speech.copy()

    speech_power = np.mean(np.square(speech))
    # Noise power that yields the requested SNR for this speech power.
    noise_power = speech_power / (10 ** (snr_db / 10))

    noise_adjusted = np.sqrt(noise_power) * noise / np.sqrt(noise_energy)

    return speech + noise_adjusted


# Framing shared by the analysis and synthesis transforms so that the inverse
# transform matches the forward ones: 512-point FFT, 200-sample Hamming window,
# 80-sample hop.
_STFT_FRAMING = dict(
    n_fft=512,
    win_length=200,
    hop_length=80,
    window_fn=torch.hamming_window,
)


# Power spectrogram (power=2.0).
stft_power = torchaudio.transforms.Spectrogram(power=2.0, **_STFT_FRAMING)


# Complex-valued spectrogram (power=None).
stft_complex = torchaudio.transforms.Spectrogram(power=None, **_STFT_FRAMING)


# Inverse transform used to turn spectrograms back into waveforms.
istft = torchaudio.transforms.InverseSpectrogram(**_STFT_FRAMING)


def enhance(mix_spec_complex: torch.Tensor,
            speech_spec_prediction: torch.Tensor,
            speech_irm_prediction: torch.Tensor,
            ):
    """
    Reconstruct the enhanced speech waveform and the residual noise waveform.

    The speech waveform is the inverse STFT of the predicted speech spectrum.
    The noise waveform is the inverse STFT of the mixture spectrum multiplied
    by the complementary mask ``1 - speech_irm_prediction``.

    All inputs are detached and moved to the CPU before use, so tensors that
    live on another device or carry gradients are accepted.

    :param mix_spec_complex: complex spectrogram of the noisy mixture.
    :param speech_spec_prediction: complex speech spectrogram from the model.
    :param speech_irm_prediction: predicted speech mask; must broadcast
        against ``mix_spec_complex``.
    :return: tuple ``(speech_wave, noise_wave)`` of CPU tensors.
    """
    mix_spec_complex = mix_spec_complex.detach().cpu()
    speech_spec_prediction = speech_spec_prediction.detach().cpu()
    speech_irm_prediction = speech_irm_prediction.detach().cpu()

    # Whatever the mask does not attribute to speech is treated as noise.
    noise_spec = mix_spec_complex * (1.0 - speech_irm_prediction)

    speech_wave = istft.forward(speech_spec_prediction)
    noise_wave = istft.forward(noise_spec)

    return speech_wave, noise_wave


def save_audios(noise_wave: torch.Tensor,
                speech_wave: torch.Tensor,
                mix_wave: torch.Tensor,
                speech_wave_enhanced: torch.Tensor,
                noise_wave_enhanced: torch.Tensor,
                output_dir: str,
                sample_rate: int = 8000,
                ):
    """
    Write the five waveforms of one example into a fresh sub-directory.

    A directory named with a random UUID is created under ``output_dir`` and
    each tensor is saved there as a WAV file with ``torchaudio.save``.

    :param noise_wave: noise input waveform.
    :param speech_wave: clean speech waveform.
    :param mix_wave: noisy mixture waveform.
    :param speech_wave_enhanced: speech waveform produced by the model.
    :param noise_wave_enhanced: noise waveform estimated by the model.
    :param output_dir: parent directory; created if it does not exist.
    :param sample_rate: sample rate written to the WAV files.
    :return: POSIX-style path of the directory that was created.
    """
    example_dir = Path(output_dir) / str(uuid.uuid4())
    example_dir.mkdir(parents=True, exist_ok=True)

    # Kept in the original write order: inputs first, model outputs last.
    wave_files = (
        ("noise_wave.wav", noise_wave),
        ("speech_wave.wav", speech_wave),
        ("mix_wave.wav", mix_wave),
        ("speech_wave_enhanced.wav", speech_wave_enhanced),
        ("noise_wave_enhanced.wav", noise_wave_enhanced),
    )
    for wav_name, wave in wave_files:
        torchaudio.save(example_dir / wav_name, wave, sample_rate)

    return example_dir.as_posix()


def _append_frequency_bin(spec: torch.Tensor, fill_value: float) -> torch.Tensor:
    """Return the 3-D tensor `spec` with one bin of `fill_value` appended along dim 1."""
    batch_size, _, time_steps = spec.shape
    extra_bin = fill_value * torch.ones(
        size=(batch_size, 1, time_steps),
        dtype=spec.dtype,
        device=spec.device,
    )
    return torch.concat([spec, extra_bin], dim=1)


def main():
    """
    Evaluate a pretrained SpectrumDfNet model on rows of the validation sheet.

    For each of the first ``--limit`` rows of ``--valid_dataset`` the noise and
    speech clips are loaded at 8 kHz, mixed at the row's ``snr_db``, and passed
    through the model. The MSE between the predicted and the target ideal
    ratio mask (IRM) is accumulated and shown as a running average on the
    progress bar. The input and enhanced waveforms of every example are
    written under ``--evaluation_audio_dir``.

    Expected columns: noise_filename, noise_offset, noise_duration,
    speech_filename, speech_offset, speech_duration, snr_db.
    """
    args = get_args()

    logger = logging_config()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("GPU available count: {}; device: {}".format(n_gpu, device))

    logger.info("prepare model")
    model = SpectrumDfNetPretrainedModel.from_pretrained(
        pretrained_model_name_or_path=args.model_dir,
    )
    model.to(device)
    model.eval()

    logger.info("prepare loss_fn")
    irm_mse_loss = nn.MSELoss(
        reduction="mean",
    )

    logger.info("read excel")
    df = pd.read_excel(args.valid_dataset)

    # The bar total reflects the number of rows that will actually be evaluated.
    num_rows = max(0, min(len(df), args.limit))

    total_loss = 0.
    total_examples = 0.
    with tqdm(total=num_rows, desc="Evaluation") as progress_bar:
        for processed, (_, row) in enumerate(df.iterrows()):
            # Checked before processing so that exactly `limit` rows are evaluated.
            if processed >= args.limit:
                break

            noise_wave, _ = librosa.load(
                row["noise_filename"],
                sr=8000,
                offset=row["noise_offset"],
                duration=row["noise_duration"],
            )
            speech_wave, _ = librosa.load(
                row["speech_filename"],
                sr=8000,
                offset=row["speech_offset"],
                duration=row["speech_duration"],
            )
            mix_wave: np.ndarray = mix_speech_and_noise(
                speech=speech_wave,
                noise=noise_wave,
                snr_db=row["snr_db"],
            )

            # The mix is as long as the shorter clip; trim both sources to it so
            # that all spectrograms below share the same number of frames.
            num_samples = len(mix_wave)
            noise_wave = torch.tensor(noise_wave[:num_samples], dtype=torch.float32).unsqueeze(dim=0)
            speech_wave = torch.tensor(speech_wave[:num_samples], dtype=torch.float32).unsqueeze(dim=0)
            mix_wave: torch.Tensor = torch.tensor(mix_wave, dtype=torch.float32).unsqueeze(dim=0)

            # Drop the last frequency bin before the model sees the spectra
            # (n_fft=512 should give 257 one-sided bins, leaving 256 -- confirm
            # against the model configuration).
            noise_spec: torch.Tensor = stft_power.forward(noise_wave)[:, :-1, :]
            speech_spec: torch.Tensor = stft_power.forward(speech_wave)[:, :-1, :]
            mix_spec_complex: torch.Tensor = stft_complex.forward(mix_wave)[:, :-1, :]
            # mix_spec_complex: complex tensor, [batch_size, freq_dim, time_steps]

            # IRM target. Bins where both spectra are zero would be 0 / 0 = NaN
            # and would poison the running loss, so they are set to 0 instead.
            # NOTE(review): noise_spec comes from the unscaled noise clip, not
            # the SNR-scaled noise that is actually in the mix -- confirm this
            # matches the training target.
            total_power = noise_spec + speech_spec
            speech_irm = torch.where(
                total_power > 0,
                speech_spec / total_power,
                torch.zeros_like(total_power),
            )

            mix_spec_complex = mix_spec_complex.to(device)
            speech_irm_target = speech_irm.to(device)

            with torch.no_grad():
                speech_spec_prediction, speech_irm_prediction, _lsnr_prediction = model.forward(mix_spec_complex)
                # The model returns the spectrum with a trailing (real, imag) axis.
                speech_spec_prediction = torch.view_as_complex(speech_spec_prediction)

                loss = irm_mse_loss.forward(speech_irm_prediction, speech_irm_target)

            # Restore the dropped frequency bin so the inverse STFT gets its
            # full set of bins: zeros for the spectra, a neutral 0.5 for the mask.
            mix_spec_complex = _append_frequency_bin(mix_spec_complex, 0.0)
            speech_spec_prediction = _append_frequency_bin(speech_spec_prediction, 0.0)
            speech_irm_prediction = _append_frequency_bin(speech_irm_prediction, 0.5)

            speech_wave_enhanced, noise_wave_enhanced = enhance(mix_spec_complex, speech_spec_prediction, speech_irm_prediction)
            save_audios(noise_wave, speech_wave, mix_wave, speech_wave_enhanced, noise_wave_enhanced, args.evaluation_audio_dir)

            total_loss += loss.item()
            total_examples += mix_spec_complex.size(0)

            evaluation_loss = round(total_loss / total_examples, 4)

            progress_bar.update(1)
            progress_bar.set_postfix({
                "evaluation_loss": evaluation_loss,
            })

    return


# Run the evaluation only when this file is executed as a script.
if __name__ == '__main__':
    main()