Spaces:
Running
Running
update
Browse files- .gitignore +1 -0
- examples/dfnet2/run.sh +10 -16
- examples/dfnet2/step_1_prepare_data.py +46 -38
- examples/dfnet2/yaml/config.yaml +3 -3
.gitignore
CHANGED
@@ -21,5 +21,6 @@
|
|
21 |
|
22 |
**/*.wav
|
23 |
**/*.xlsx
|
|
|
24 |
|
25 |
requirements-python-3-9-9.txt
|
|
|
21 |
|
22 |
**/*.wav
|
23 |
**/*.xlsx
|
24 |
+
**/*.jsonl
|
25 |
|
26 |
requirements-python-3-9-9.txt
|
examples/dfnet2/run.sh
CHANGED
@@ -3,20 +3,15 @@
|
|
3 |
: <<'END'
|
4 |
|
5 |
sh run.sh --stage 2 --stop_stage 2 --system_version windows --file_folder_name file_dir --final_model_name dfnet-nx-speech \
|
6 |
-
--
|
7 |
-
--
|
|
|
8 |
|
9 |
-
sh run.sh --stage 2 --stop_stage 2 --system_version centos --file_folder_name file_dir --final_model_name dfnet2-nx-dns3 \
|
10 |
-
--noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise" \
|
11 |
-
--speech_dir "/data/tianxing/HuggingDatasets/nx_noise/data/speech/dns3-speech"
|
12 |
-
|
13 |
-
sh run.sh --stage 2 --stop_stage 2 --system_version centos --file_folder_name file_dir --final_model_name dfnet2-nx2 \
|
14 |
-
--noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise/nx-noise" \
|
15 |
-
--speech_dir "/data/tianxing/HuggingDatasets/nx_noise/data/speech/nx-speech2"
|
16 |
|
17 |
sh run.sh --stage 2 --stop_stage 2 --system_version centos --file_folder_name dfnet2-nx2-dns3 --final_model_name dfnet2-nx2-dns3 \
|
18 |
-
--
|
19 |
-
--
|
|
|
20 |
|
21 |
|
22 |
END
|
@@ -34,8 +29,8 @@ final_model_name=final_model_name
|
|
34 |
config_file="yaml/config.yaml"
|
35 |
limit=10
|
36 |
|
37 |
-
|
38 |
-
|
39 |
|
40 |
max_count=-1
|
41 |
|
@@ -99,9 +94,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
|
99 |
$verbose && echo "stage 1: prepare data"
|
100 |
cd "${work_dir}" || exit 1
|
101 |
python3 step_1_prepare_data.py \
|
102 |
-
--
|
103 |
-
--
|
104 |
-
--speech_dir "${speech_dir}" \
|
105 |
--train_dataset "${train_dataset}" \
|
106 |
--valid_dataset "${valid_dataset}" \
|
107 |
--max_count "${max_count}" \
|
|
|
3 |
: <<'END'
|
4 |
|
5 |
sh run.sh --stage 2 --stop_stage 2 --system_version windows --file_folder_name file_dir --final_model_name dfnet-nx-speech \
|
6 |
+
--noise_patterns "/data/tianxing/HuggingDatasets/nx_noise/data/noise/**/*.wav" \
|
7 |
+
--speech_patterns "/data/tianxing/HuggingDatasets/nx_noise/data/speech/dns3-speech/**/*.wav \
|
8 |
+
/data/tianxing/HuggingDatasets/nx_noise/data/speech/nx-speech2/**/*.wav"
|
9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
sh run.sh --stage 2 --stop_stage 2 --system_version centos --file_folder_name dfnet2-nx2-dns3 --final_model_name dfnet2-nx2-dns3 \
|
12 |
+
--noise_patterns "/data/tianxing/HuggingDatasets/nx_noise/data/noise/**/*.wav" \
|
13 |
+
--speech_patterns "/data/tianxing/HuggingDatasets/nx_noise/data/speech/dns3-speech/**/*.wav \
|
14 |
+
/data/tianxing/HuggingDatasets/nx_noise/data/speech/nx-speech2/**/*.wav"
|
15 |
|
16 |
|
17 |
END
|
|
|
29 |
config_file="yaml/config.yaml"
|
30 |
limit=10
|
31 |
|
32 |
+
noise_patterns=/data/tianxing/HuggingDatasets/nx_noise/data/noise/**/*.wav
|
33 |
+
speech_patterns=/data/tianxing/HuggingDatasets/nx_noise/data/speech/**/*.wav
|
34 |
|
35 |
max_count=-1
|
36 |
|
|
|
94 |
$verbose && echo "stage 1: prepare data"
|
95 |
cd "${work_dir}" || exit 1
|
96 |
python3 step_1_prepare_data.py \
|
97 |
+
--noise_patterns "${noise_patterns}" \
|
98 |
+
--speech_patterns "${speech_patterns}" \
|
|
|
99 |
--train_dataset "${train_dataset}" \
|
100 |
--valid_dataset "${valid_dataset}" \
|
101 |
--max_count "${max_count}" \
|
examples/dfnet2/step_1_prepare_data.py
CHANGED
@@ -1,11 +1,13 @@
|
|
1 |
#!/usr/bin/python3
|
2 |
# -*- coding: utf-8 -*-
|
3 |
import argparse
|
|
|
4 |
import json
|
5 |
import os
|
6 |
from pathlib import Path
|
7 |
import random
|
8 |
import sys
|
|
|
9 |
|
10 |
pwd = os.path.abspath(os.path.dirname(__file__))
|
11 |
sys.path.append(os.path.join(pwd, "../../"))
|
@@ -17,16 +19,14 @@ from tqdm import tqdm
|
|
17 |
|
18 |
def get_args():
|
19 |
parser = argparse.ArgumentParser()
|
20 |
-
parser.add_argument("--file_dir", default="./", type=str)
|
21 |
-
|
22 |
parser.add_argument(
|
23 |
-
"--
|
24 |
-
default=r"
|
25 |
type=str
|
26 |
)
|
27 |
parser.add_argument(
|
28 |
-
"--
|
29 |
-
default=r"
|
30 |
type=str
|
31 |
)
|
32 |
|
@@ -51,58 +51,66 @@ def filename_generator(data_dir: str):
|
|
51 |
yield filename.as_posix()
|
52 |
|
53 |
|
54 |
-
def target_second_signal_generator(
|
55 |
-
|
|
|
|
|
|
|
56 |
for epoch_idx in range(max_epoch):
|
57 |
-
for
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
if signal.ndim != 1:
|
65 |
-
raise AssertionError(f"expected ndim 1, instead of {signal.ndim}")
|
66 |
-
|
67 |
-
signal_length = len(signal)
|
68 |
-
win_size = int(duration * sample_rate)
|
69 |
-
for begin in range(0, signal_length - win_size, win_size):
|
70 |
-
if np.sum(signal[begin: begin+win_size]) == 0:
|
71 |
continue
|
72 |
-
|
73 |
-
"
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
|
81 |
|
82 |
def main():
|
83 |
args = get_args()
|
84 |
|
85 |
-
|
86 |
-
|
|
|
|
|
|
|
|
|
87 |
|
88 |
-
|
89 |
-
|
|
|
|
|
90 |
|
91 |
noise_generator = target_second_signal_generator(
|
92 |
-
|
93 |
duration=args.duration,
|
94 |
sample_rate=args.target_sample_rate,
|
95 |
max_epoch=100000,
|
96 |
)
|
97 |
speech_generator = target_second_signal_generator(
|
98 |
-
|
99 |
duration=args.duration,
|
100 |
sample_rate=args.target_sample_rate,
|
101 |
max_epoch=1,
|
102 |
)
|
103 |
|
104 |
-
dataset = list()
|
105 |
-
|
106 |
count = 0
|
107 |
process_bar = tqdm(desc="build dataset jsonl")
|
108 |
with open(args.train_dataset, "w", encoding="utf-8") as ftrain, open(args.valid_dataset, "w", encoding="utf-8") as fvalid:
|
|
|
1 |
#!/usr/bin/python3
|
2 |
# -*- coding: utf-8 -*-
|
3 |
import argparse
|
4 |
+
from glob import glob
|
5 |
import json
|
6 |
import os
|
7 |
from pathlib import Path
|
8 |
import random
|
9 |
import sys
|
10 |
+
from typing import List
|
11 |
|
12 |
pwd = os.path.abspath(os.path.dirname(__file__))
|
13 |
sys.path.append(os.path.join(pwd, "../../"))
|
|
|
19 |
|
20 |
def get_args():
|
21 |
parser = argparse.ArgumentParser()
|
|
|
|
|
22 |
parser.add_argument(
|
23 |
+
"--noise_patterns",
|
24 |
+
default=r"D:\Users\tianx\HuggingDatasets\nx_noise\data\noise\**\*.wav",
|
25 |
type=str
|
26 |
)
|
27 |
parser.add_argument(
|
28 |
+
"--speech_patterns",
|
29 |
+
default=r"D:\Users\tianx\HuggingDatasets\nx_noise\data\speech\**\*.wav",
|
30 |
type=str
|
31 |
)
|
32 |
|
|
|
51 |
yield filename.as_posix()
|
52 |
|
53 |
|
54 |
+
def target_second_signal_generator(filename_patterns: List[str],
                                   duration: int = 2,
                                   sample_rate: int = 8000,
                                   max_epoch: int = 20000
                                   ):
    """Yield metadata rows describing fixed-length windows of audio files.

    For every epoch, every glob pattern is expanded (recursively) and each
    matching file is loaded with librosa at ``sample_rate``. The signal is
    cut into consecutive, non-overlapping windows of ``duration`` seconds and
    one row is yielded per window that contains at least one non-zero sample.

    :param filename_patterns: glob patterns (``**`` is supported) that select
        the audio files to scan.
    :param duration: window length in seconds.
    :param sample_rate: sample rate passed to ``librosa.load``.
    :param max_epoch: number of full passes over all patterns.
    :return: generator of dicts with the keys ``epoch_idx``, ``filename``,
        ``raw_duration``, ``offset`` and ``duration``.
    :raises ValueError: if ``duration * sample_rate`` is not a positive
        number of samples.
    :raises AssertionError: if a loaded signal is not one-dimensional.
    """
    # The window size does not depend on the file, so compute it once.
    win_size = int(duration * sample_rate)
    if win_size <= 0:
        raise ValueError(
            f"duration * sample_rate must be at least one sample, "
            f"got duration={duration}, sample_rate={sample_rate}"
        )

    for epoch_idx in range(max_epoch):
        for filename_pattern in filename_patterns:
            for filename in glob(filename_pattern, recursive=True):
                signal, _ = librosa.load(filename, sr=sample_rate)
                raw_duration = librosa.get_duration(y=signal, sr=sample_rate)

                # Files shorter than one window cannot produce any row.
                if raw_duration < duration:
                    continue
                if signal.ndim != 1:
                    raise AssertionError(f"expected ndim 1, instead of {signal.ndim}")

                signal_length = len(signal)
                # "+ 1" keeps the last complete window: without it a signal
                # whose length is an exact multiple of win_size (including a
                # file exactly `duration` seconds long) loses its final window.
                for begin in range(0, signal_length - win_size + 1, win_size):
                    # Skip all-zero (digitally silent) windows. np.any is used
                    # instead of comparing the sum with 0 because non-silent
                    # samples can cancel out to a zero sum.
                    if not np.any(signal[begin: begin + win_size]):
                        continue
                    row = {
                        "epoch_idx": epoch_idx,
                        "filename": filename,
                        "raw_duration": round(raw_duration, 4),
                        "offset": round(begin / sample_rate, 4),
                        "duration": round(duration, 4),
                    }
                    yield row
|
84 |
|
85 |
|
86 |
def main():
|
87 |
args = get_args()
|
88 |
|
89 |
+
noise_patterns = args.noise_patterns
|
90 |
+
noise_patterns = noise_patterns.split(" ")
|
91 |
+
print(f"noise_patterns: {noise_patterns}")
|
92 |
+
speech_patterns = args.speech_patterns
|
93 |
+
speech_patterns = speech_patterns.split(" ")
|
94 |
+
print(f"speech_patterns: {speech_patterns}")
|
95 |
|
96 |
+
train_dataset = Path(args.train_dataset)
|
97 |
+
valid_dataset = Path(args.valid_dataset)
|
98 |
+
train_dataset.parent.mkdir(parents=True, exist_ok=True)
|
99 |
+
valid_dataset.parent.mkdir(parents=True, exist_ok=True)
|
100 |
|
101 |
noise_generator = target_second_signal_generator(
|
102 |
+
noise_patterns,
|
103 |
duration=args.duration,
|
104 |
sample_rate=args.target_sample_rate,
|
105 |
max_epoch=100000,
|
106 |
)
|
107 |
speech_generator = target_second_signal_generator(
|
108 |
+
speech_patterns,
|
109 |
duration=args.duration,
|
110 |
sample_rate=args.target_sample_rate,
|
111 |
max_epoch=1,
|
112 |
)
|
113 |
|
|
|
|
|
114 |
count = 0
|
115 |
process_bar = tqdm(desc="build dataset jsonl")
|
116 |
with open(args.train_dataset, "w", encoding="utf-8") as ftrain, open(args.valid_dataset, "w", encoding="utf-8") as fvalid:
|
examples/dfnet2/yaml/config.yaml
CHANGED
@@ -48,12 +48,12 @@ df_lookahead: 2
|
|
48 |
|
49 |
# lsnr
|
50 |
n_frame: 3
|
51 |
-
|
52 |
-
|
53 |
norm_tau: 1.
|
54 |
|
55 |
# data
|
56 |
-
min_snr_db: -
|
57 |
max_snr_db: 20
|
58 |
|
59 |
# train
|
|
|
48 |
|
49 |
# lsnr
|
50 |
n_frame: 3
|
51 |
+
max_local_snr: 30
|
52 |
+
min_local_snr: -15
|
53 |
norm_tau: 1.
|
54 |
|
55 |
# data
|
56 |
+
min_snr_db: -10
|
57 |
max_snr_db: 20
|
58 |
|
59 |
# train
|