Spaces:

qgyd2021
/

cc_denoise

Running

App Files Files Community

HoneyTian commited on Mar 21

Commit

88b2fbf

1 Parent(s): 7f9e32d

update

Browse files

Files changed (3) hide show

examples/conv_tasnet/step_1_prepare_data.py +55 -100
main.py +7 -0
toolbox/torch/utils/data/dataset/denoise_jsonl_dataset.py +133 -0

examples/conv_tasnet/step_1_prepare_data.py CHANGED Viewed

@@ -1,22 +1,18 @@
 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
 import argparse
 import os
 from pathlib import Path
 import random
 import sys
-import shutil
 pwd = os.path.abspath(os.path.dirname(__file__))
 sys.path.append(os.path.join(pwd, "../../"))
-import pandas as pd
-from scipy.io import wavfile
 from tqdm import tqdm
 import librosa
-from project_settings import project_path
 def get_args():
     parser = argparse.ArgumentParser()
@@ -33,8 +29,8 @@ def get_args():
         type=str
     )
-    parser.add_argument("--train_dataset", default="train.xlsx", type=str)
-    parser.add_argument("--valid_dataset", default="valid.xlsx", type=str)
     parser.add_argument("--duration", default=2.0, type=float)
     parser.add_argument("--min_snr_db", default=-10, type=float)
@@ -80,7 +76,9 @@ def target_second_signal_generator(data_dir: str, duration: int = 2, sample_rate
                 yield row
-def get_dataset(args):
     file_dir = Path(args.file_dir)
     file_dir.mkdir(exist_ok=True)
@@ -104,99 +102,56 @@ def get_dataset(args):
     count = 0
     process_bar = tqdm(desc="build dataset excel")
-    for noise, speech in zip(noise_generator, speech_generator):
-        if count >= args.max_count:
-            break
-        noise_filename = noise["filename"]
-        noise_raw_duration = noise["raw_duration"]
-        noise_offset = noise["offset"]
-        noise_duration = noise["duration"]
-        speech_filename = speech["filename"]
-        speech_raw_duration = speech["raw_duration"]
-        speech_offset = speech["offset"]
-        speech_duration = speech["duration"]
-        random1 = random.random()
-        random2 = random.random()
-        row = {
-            "noise_filename": noise_filename,
-            "noise_raw_duration": noise_raw_duration,
-            "noise_offset": noise_offset,
-            "noise_duration": noise_duration,
-            "speech_filename": speech_filename,
-            "speech_raw_duration": speech_raw_duration,
-            "speech_offset": speech_offset,
-            "speech_duration": speech_duration,
-            "snr_db": random.uniform(args.min_snr_db, args.max_snr_db),
-            "random1": random1,
-            "random2": random2,
-            "flag": "TRAIN" if random2 < 0.8 else "TEST",
-        }
-        dataset.append(row)
-        count += 1
-        duration_seconds = count * args.duration
-        duration_hours = duration_seconds / 3600
-        process_bar.update(n=1)
-        process_bar.set_postfix({
-            # "duration_seconds": round(duration_seconds, 4),
-            "duration_hours": round(duration_hours, 4),
-        })
-    dataset = pd.DataFrame(dataset)
-    dataset = dataset.sort_values(by=["random1"], ascending=False)
-    dataset.to_excel(
-        file_dir / "dataset.xlsx",
-        index=False,
-    )
-    return
-def split_dataset(args):
-    """分割训练集, 测试集"""
-    file_dir = Path(args.file_dir)
-    file_dir.mkdir(exist_ok=True)
-    df = pd.read_excel(file_dir / "dataset.xlsx")
-    train = list()
-    test = list()
-    for i, row in df.iterrows():
-        flag = row["flag"]
-        if flag == "TRAIN":
-            train.append(row)
-        else:
-            test.append(row)
-    train = pd.DataFrame(train)
-    train.to_excel(
-        args.train_dataset,
-        index=False,
-        # encoding="utf_8_sig"
-    )
-    test = pd.DataFrame(test)
-    test.to_excel(
-        args.valid_dataset,
-        index=False,
-        # encoding="utf_8_sig"
-    )
-    return
-def main():
-    args = get_args()
-    get_dataset(args)
-    split_dataset(args)
     return

 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
 import argparse
+import json
 import os
 from pathlib import Path
 import random
 import sys
 pwd = os.path.abspath(os.path.dirname(__file__))
 sys.path.append(os.path.join(pwd, "../../"))
 from tqdm import tqdm
 import librosa
 def get_args():
     parser = argparse.ArgumentParser()
         type=str
     )
+    parser.add_argument("--train_dataset", default="train.jsonl", type=str)
+    parser.add_argument("--valid_dataset", default="valid.jsonl", type=str)
     parser.add_argument("--duration", default=2.0, type=float)
     parser.add_argument("--min_snr_db", default=-10, type=float)
                 yield row
+def main():
+    args = get_args()
     file_dir = Path(args.file_dir)
     file_dir.mkdir(exist_ok=True)
     count = 0
     process_bar = tqdm(desc="build dataset excel")
+    with open(args.train_dataset, "w", encoding="utf-8") as ftrain, open(args.valid_dataset, "w", encoding="utf-8") as fvalid:
+        for noise, speech in zip(noise_generator, speech_generator):
+            if count >= args.max_count:
+                break
+            noise_filename = noise["filename"]
+            noise_raw_duration = noise["raw_duration"]
+            noise_offset = noise["offset"]
+            noise_duration = noise["duration"]
+            speech_filename = speech["filename"]
+            speech_raw_duration = speech["raw_duration"]
+            speech_offset = speech["offset"]
+            speech_duration = speech["duration"]
+            random1 = random.random()
+            random2 = random.random()
+            row = {
+                "noise_filename": noise_filename,
+                "noise_raw_duration": noise_raw_duration,
+                "noise_offset": noise_offset,
+                "noise_duration": noise_duration,
+                "speech_filename": speech_filename,
+                "speech_raw_duration": speech_raw_duration,
+                "speech_offset": speech_offset,
+                "speech_duration": speech_duration,
+                "snr_db": random.uniform(args.min_snr_db, args.max_snr_db),
+                "random1": random1,
+            }
+            row = json.dumps(row, ensure_ascii=False)
+            if random2 < 0.8:
+                ftrain.write(f"{row}\n")
+            else:
+                fvalid.write(f"{row}\n")
+            count += 1
+            duration_seconds = count * args.duration
+            duration_hours = duration_seconds / 3600
+            process_bar.update(n=1)
+            process_bar.set_postfix({
+                # "duration_seconds": round(duration_seconds, 4),
+                "duration_hours": round(duration_hours, 4),
+            })
     return

main.py CHANGED Viewed

@@ -74,6 +74,13 @@ denoise_engines = {
                     project_path / "trained_models/mpnet-nx-speech-20-epoch.zip").as_posix()
         }
     },
     "mpnet-aishell-1-epoch": {
         "infer_cls": InferenceMPNet,
         "kwargs": {

                     project_path / "trained_models/mpnet-nx-speech-20-epoch.zip").as_posix()
         }
     },
+    "mpnet-nx-speech-33-epoch-best": {
+        "infer_cls": InferenceMPNet,
+        "kwargs": {
+            "pretrained_model_path_or_zip_file": (
+                    project_path / "trained_models/mpnet-nx-speech-33-epoch-best.zip").as_posix()
+        }
+    },
     "mpnet-aishell-1-epoch": {
         "infer_cls": InferenceMPNet,
         "kwargs": {

toolbox/torch/utils/data/dataset/denoise_jsonl_dataset.py ADDED Viewed

	@@ -0,0 +1,133 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import json
+import os
+import librosa
+import numpy as np
+import pandas as pd
+from scipy.io import wavfile
+import torch
+import torchaudio
+from torch.utils.data import Dataset
+from tqdm import tqdm
+class DenoiseJsonlDataset(Dataset):
+    def __init__(self,
+                 jsonl_file: str,
+                 expected_sample_rate: int,
+                 resample: bool = False,
+                 max_wave_value: float = 1.0,
+                 ):
+        self.jsonl_file = jsonl_file
+        self.expected_sample_rate = expected_sample_rate
+        self.resample = resample
+        self.max_wave_value = max_wave_value
+        self.samples = self.load_samples(jsonl_file)
+    @staticmethod
+    def load_samples(filename: str):
+        samples = list()
+        with open(filename, "r", encoding="utf-8") as f:
+            for row in f:
+                row = json.loads(row)
+                noise_filename = row["noise_filename"]
+                noise_raw_duration = row["noise_raw_duration"]
+                noise_offset = row["noise_offset"]
+                noise_duration = row["noise_duration"]
+                speech_filename = row["speech_filename"]
+                speech_raw_duration = row["speech_raw_duration"]
+                speech_offset = row["speech_offset"]
+                speech_duration = row["speech_duration"]
+                snr_db = row["snr_db"]
+                row = {
+                    "noise_filename": noise_filename,
+                    "noise_raw_duration": noise_raw_duration,
+                    "noise_offset": noise_offset,
+                    "noise_duration": noise_duration,
+                    "speech_filename": speech_filename,
+                    "speech_raw_duration": speech_raw_duration,
+                    "speech_offset": speech_offset,
+                    "speech_duration": speech_duration,
+                    "snr_db": snr_db,
+                }
+                samples.append(row)
+        return samples
+    def __getitem__(self, index):
+        sample = self.samples[index]
+        noise_filename = sample["noise_filename"]
+        noise_offset = sample["noise_offset"]
+        noise_duration = sample["noise_duration"]
+        speech_filename = sample["speech_filename"]
+        speech_offset = sample["speech_offset"]
+        speech_duration = sample["speech_duration"]
+        snr_db = sample["snr_db"]
+        noise_wave = self.filename_to_waveform(noise_filename, noise_offset, noise_duration)
+        speech_wave = self.filename_to_waveform(speech_filename, speech_offset, speech_duration)
+        mix_wave, noise_wave_adjusted = self.mix_speech_and_noise(
+            speech=speech_wave.numpy(),
+            noise=noise_wave.numpy(),
+            snr_db=snr_db,
+        )
+        mix_wave = torch.tensor(mix_wave, dtype=torch.float32)
+        noise_wave_adjusted = torch.tensor(noise_wave_adjusted, dtype=torch.float32)
+        result = {
+            "noise_wave": noise_wave_adjusted,
+            "speech_wave": speech_wave,
+            "mix_wave": mix_wave,
+            "snr_db": snr_db,
+        }
+        return result
+    def __len__(self):
+        return len(self.samples)
+    def filename_to_waveform(self, filename: str, offset: float, duration: float):
+        try:
+            waveform, sample_rate = librosa.load(
+                filename,
+                sr=self.expected_sample_rate,
+                offset=offset,
+                duration=duration,
+            )
+        except ValueError as e:
+            print(f"load failed. error type: {type(e)}, error text: {str(e)}, filename: {filename}")
+            raise e
+        waveform = torch.tensor(waveform, dtype=torch.float32)
+        return waveform
+    @staticmethod
+    def mix_speech_and_noise(speech: np.ndarray, noise: np.ndarray, snr_db: float):
+        l1 = len(speech)
+        l2 = len(noise)
+        l = min(l1, l2)
+        speech = speech[:l]
+        noise = noise[:l]
+        # np.float32, value between (-1, 1).
+        speech_power = np.mean(np.square(speech))
+        noise_power = speech_power / (10 ** (snr_db / 10))
+        noise_adjusted = np.sqrt(noise_power) * noise / np.sqrt(np.mean(noise ** 2))
+        noisy_signal = speech + noise_adjusted
+        return noisy_signal, noise_adjusted
+if __name__ == '__main__':
+    # Nothing to run: executing this module directly as a script is a no-op.
+    pass