HoneyTian committed on
Commit
d32c7e7
·
1 Parent(s): 7b61e4d
.gitignore CHANGED
@@ -21,5 +21,6 @@
21
 
22
  **/*.wav
23
  **/*.xlsx
 
24
 
25
  requirements-python-3-9-9.txt
 
21
 
22
  **/*.wav
23
  **/*.xlsx
24
+ **/*.jsonl
25
 
26
  requirements-python-3-9-9.txt
examples/dfnet2/run.sh CHANGED
@@ -3,20 +3,15 @@
3
  : <<'END'
4
 
5
  sh run.sh --stage 2 --stop_stage 2 --system_version windows --file_folder_name file_dir --final_model_name dfnet-nx-speech \
6
- --noise_dir "E:/Users/tianx/HuggingDatasets/nx_noise/data/noise" \
7
- --speech_dir "E:/Users/tianx/HuggingDatasets/nx_noise/data/speech"
 
8
 
9
- sh run.sh --stage 2 --stop_stage 2 --system_version centos --file_folder_name file_dir --final_model_name dfnet2-nx-dns3 \
10
- --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise" \
11
- --speech_dir "/data/tianxing/HuggingDatasets/nx_noise/data/speech/dns3-speech"
12
-
13
- sh run.sh --stage 2 --stop_stage 2 --system_version centos --file_folder_name file_dir --final_model_name dfnet2-nx2 \
14
- --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise/nx-noise" \
15
- --speech_dir "/data/tianxing/HuggingDatasets/nx_noise/data/speech/nx-speech2"
16
 
17
  sh run.sh --stage 2 --stop_stage 2 --system_version centos --file_folder_name dfnet2-nx2-dns3 --final_model_name dfnet2-nx2-dns3 \
18
- --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise/" \
19
- --speech_dir "/data/tianxing/HuggingDatasets/nx_noise/data/speech/"
 
20
 
21
 
22
  END
@@ -34,8 +29,8 @@ final_model_name=final_model_name
34
  config_file="yaml/config.yaml"
35
  limit=10
36
 
37
- noise_dir=/data/tianxing/HuggingDatasets/nx_noise/data/noise
38
- speech_dir=/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train
39
 
40
  max_count=-1
41
 
@@ -99,9 +94,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
99
  $verbose && echo "stage 1: prepare data"
100
  cd "${work_dir}" || exit 1
101
  python3 step_1_prepare_data.py \
102
- --file_dir "${file_dir}" \
103
- --noise_dir "${noise_dir}" \
104
- --speech_dir "${speech_dir}" \
105
  --train_dataset "${train_dataset}" \
106
  --valid_dataset "${valid_dataset}" \
107
  --max_count "${max_count}" \
 
3
  : <<'END'
4
 
5
  sh run.sh --stage 2 --stop_stage 2 --system_version windows --file_folder_name file_dir --final_model_name dfnet-nx-speech \
6
+ --noise_patterns "/data/tianxing/HuggingDatasets/nx_noise/data/noise/**/*.wav" \
7
+ --speech_patterns "/data/tianxing/HuggingDatasets/nx_noise/data/speech/dns3-speech/**/*.wav \
8
+ /data/tianxing/HuggingDatasets/nx_noise/data/speech/nx-speech2/**/*.wav"
9
 
 
 
 
 
 
 
 
10
 
11
  sh run.sh --stage 2 --stop_stage 2 --system_version centos --file_folder_name dfnet2-nx2-dns3 --final_model_name dfnet2-nx2-dns3 \
12
+ --noise_patterns "/data/tianxing/HuggingDatasets/nx_noise/data/noise/**/*.wav" \
13
+ --speech_patterns "/data/tianxing/HuggingDatasets/nx_noise/data/speech/dns3-speech/**/*.wav \
14
+ /data/tianxing/HuggingDatasets/nx_noise/data/speech/nx-speech2/**/*.wav"
15
 
16
 
17
  END
 
29
  config_file="yaml/config.yaml"
30
  limit=10
31
 
32
+ noise_patterns=/data/tianxing/HuggingDatasets/nx_noise/data/noise/**/*.wav
33
+ speech_patterns=/data/tianxing/HuggingDatasets/nx_noise/data/speech/**/*.wav
34
 
35
  max_count=-1
36
 
 
94
  $verbose && echo "stage 1: prepare data"
95
  cd "${work_dir}" || exit 1
96
  python3 step_1_prepare_data.py \
97
+ --noise_patterns "${noise_patterns}" \
98
+ --speech_patterns "${speech_patterns}" \
 
99
  --train_dataset "${train_dataset}" \
100
  --valid_dataset "${valid_dataset}" \
101
  --max_count "${max_count}" \
examples/dfnet2/step_1_prepare_data.py CHANGED
@@ -1,11 +1,13 @@
1
  #!/usr/bin/python3
2
  # -*- coding: utf-8 -*-
3
  import argparse
 
4
  import json
5
  import os
6
  from pathlib import Path
7
  import random
8
  import sys
 
9
 
10
  pwd = os.path.abspath(os.path.dirname(__file__))
11
  sys.path.append(os.path.join(pwd, "../../"))
@@ -17,16 +19,14 @@ from tqdm import tqdm
17
 
18
  def get_args():
19
  parser = argparse.ArgumentParser()
20
- parser.add_argument("--file_dir", default="./", type=str)
21
-
22
  parser.add_argument(
23
- "--noise_dir",
24
- default=r"E:\Users\tianx\HuggingDatasets\nx_noise\data\noise",
25
  type=str
26
  )
27
  parser.add_argument(
28
- "--speech_dir",
29
- default=r"E:\programmer\asr_datasets\aishell\data_aishell\wav\train",
30
  type=str
31
  )
32
 
@@ -51,58 +51,66 @@ def filename_generator(data_dir: str):
51
  yield filename.as_posix()
52
 
53
 
54
- def target_second_signal_generator(data_dir: str, duration: int = 2, sample_rate: int = 8000, max_epoch: int = 20000):
55
- data_dir = Path(data_dir)
 
 
 
56
  for epoch_idx in range(max_epoch):
57
- for filename in data_dir.glob("**/*.wav"):
58
- signal, _ = librosa.load(filename.as_posix(), sr=sample_rate)
59
- raw_duration = librosa.get_duration(y=signal, sr=sample_rate)
60
-
61
- if raw_duration < duration:
62
- # print(f"duration less than {duration} s. skip filename: {filename.as_posix()}")
63
- continue
64
- if signal.ndim != 1:
65
- raise AssertionError(f"expected ndim 1, instead of {signal.ndim}")
66
-
67
- signal_length = len(signal)
68
- win_size = int(duration * sample_rate)
69
- for begin in range(0, signal_length - win_size, win_size):
70
- if np.sum(signal[begin: begin+win_size]) == 0:
71
  continue
72
- row = {
73
- "epoch_idx": epoch_idx,
74
- "filename": filename.as_posix(),
75
- "raw_duration": round(raw_duration, 4),
76
- "offset": round(begin / sample_rate, 4),
77
- "duration": round(duration, 4),
78
- }
79
- yield row
 
 
 
 
 
 
 
 
80
 
81
 
82
  def main():
83
  args = get_args()
84
 
85
- file_dir = Path(args.file_dir)
86
- file_dir.mkdir(exist_ok=True)
 
 
 
 
87
 
88
- noise_dir = Path(args.noise_dir)
89
- speech_dir = Path(args.speech_dir)
 
 
90
 
91
  noise_generator = target_second_signal_generator(
92
- noise_dir.as_posix(),
93
  duration=args.duration,
94
  sample_rate=args.target_sample_rate,
95
  max_epoch=100000,
96
  )
97
  speech_generator = target_second_signal_generator(
98
- speech_dir.as_posix(),
99
  duration=args.duration,
100
  sample_rate=args.target_sample_rate,
101
  max_epoch=1,
102
  )
103
 
104
- dataset = list()
105
-
106
  count = 0
107
  process_bar = tqdm(desc="build dataset jsonl")
108
  with open(args.train_dataset, "w", encoding="utf-8") as ftrain, open(args.valid_dataset, "w", encoding="utf-8") as fvalid:
 
1
  #!/usr/bin/python3
2
  # -*- coding: utf-8 -*-
3
  import argparse
4
+ from glob import glob
5
  import json
6
  import os
7
  from pathlib import Path
8
  import random
9
  import sys
10
+ from typing import List
11
 
12
  pwd = os.path.abspath(os.path.dirname(__file__))
13
  sys.path.append(os.path.join(pwd, "../../"))
 
19
 
20
  def get_args():
21
  parser = argparse.ArgumentParser()
 
 
22
  parser.add_argument(
23
+ "--noise_patterns",
24
+ default=r"D:\Users\tianx\HuggingDatasets\nx_noise\data\noise\**\*.wav",
25
  type=str
26
  )
27
  parser.add_argument(
28
+ "--speech_patterns",
29
+ default=r"D:\Users\tianx\HuggingDatasets\nx_noise\data\speech\**\*.wav",
30
  type=str
31
  )
32
 
 
51
  yield filename.as_posix()
52
 
53
 
54
def target_second_signal_generator(filename_patterns: List[str],
                                   duration: int = 2,
                                   sample_rate: int = 8000,
                                   max_epoch: int = 20000
                                   ):
    """
    Yield one descriptor per fixed-length window of every matched audio file.

    Every file matched by ``filename_patterns`` is loaded with ``librosa.load``
    at ``sample_rate`` and cut into consecutive, non-overlapping windows of
    ``duration`` seconds. Files shorter than ``duration`` and windows whose
    samples are all zero are skipped. The whole file list is traversed
    ``max_epoch`` times.

    :param filename_patterns: glob patterns; ``**`` is expanded recursively.
        A bare string is accepted and treated as a single pattern.
    :param duration: window length in seconds.
    :param sample_rate: sample rate passed to ``librosa.load``.
    :param max_epoch: number of passes over the matched files.
    :return: generator of dicts with the keys ``epoch_idx``, ``filename``,
        ``raw_duration``, ``offset`` and ``duration``.
    :raises AssertionError: if a loaded signal is not one-dimensional.
    """
    # Iterating a bare string would glob each character separately.
    if isinstance(filename_patterns, str):
        filename_patterns = [filename_patterns]

    # Expand the patterns once instead of re-walking the directory tree on
    # every epoch; the relative order of the matches is unchanged.
    filenames = [
        filename
        for filename_pattern in filename_patterns
        for filename in glob(filename_pattern, recursive=True)
    ]
    if not filenames:
        # Nothing matched: avoid spinning through max_epoch empty passes.
        return

    win_size = int(duration * sample_rate)

    for epoch_idx in range(max_epoch):
        for filename in filenames:
            signal, _ = librosa.load(filename, sr=sample_rate)
            raw_duration = librosa.get_duration(y=signal, sr=sample_rate)

            if raw_duration < duration:
                # Too short to hold even one full window.
                continue
            if signal.ndim != 1:
                raise AssertionError(f"expected ndim 1, instead of {signal.ndim}")

            signal_length = len(signal)
            # "+ 1" keeps the last complete window; without it a signal whose
            # length is an exact multiple of win_size loses its final window.
            for begin in range(0, signal_length - win_size + 1, win_size):
                # Skip windows that contain only zeros. np.any is used rather
                # than a sum so samples that merely cancel out are kept.
                if not np.any(signal[begin: begin + win_size]):
                    continue
                row = {
                    "epoch_idx": epoch_idx,
                    "filename": filename,
                    "raw_duration": round(raw_duration, 4),
                    "offset": round(begin / sample_rate, 4),
                    "duration": round(duration, 4),
                }
                yield row
84
 
85
 
86
  def main():
87
  args = get_args()
88
 
89
+ noise_patterns = args.noise_patterns
90
+ noise_patterns = noise_patterns.split(" ")
91
+ print(f"noise_patterns: {noise_patterns}")
92
+ speech_patterns = args.speech_patterns
93
+ speech_patterns = speech_patterns.split(" ")
94
+ print(f"speech_patterns: {speech_patterns}")
95
 
96
+ train_dataset = Path(args.train_dataset)
97
+ valid_dataset = Path(args.valid_dataset)
98
+ train_dataset.parent.mkdir(parents=True, exist_ok=True)
99
+ valid_dataset.parent.mkdir(parents=True, exist_ok=True)
100
 
101
  noise_generator = target_second_signal_generator(
102
+ noise_patterns,
103
  duration=args.duration,
104
  sample_rate=args.target_sample_rate,
105
  max_epoch=100000,
106
  )
107
  speech_generator = target_second_signal_generator(
108
+ speech_patterns,
109
  duration=args.duration,
110
  sample_rate=args.target_sample_rate,
111
  max_epoch=1,
112
  )
113
 
 
 
114
  count = 0
115
  process_bar = tqdm(desc="build dataset jsonl")
116
  with open(args.train_dataset, "w", encoding="utf-8") as ftrain, open(args.valid_dataset, "w", encoding="utf-8") as fvalid:
examples/dfnet2/yaml/config.yaml CHANGED
@@ -48,12 +48,12 @@ df_lookahead: 2
48
 
49
  # lsnr
50
  n_frame: 3
51
- lsnr_max: 30
52
- lsnr_min: -15
53
  norm_tau: 1.
54
 
55
  # data
56
- min_snr_db: -15
57
  max_snr_db: 20
58
 
59
  # train
 
48
 
49
  # lsnr
50
  n_frame: 3
51
+ max_local_snr: 30
52
+ min_local_snr: -15
53
  norm_tau: 1.
54
 
55
  # data
56
+ min_snr_db: -10
57
  max_snr_db: 20
58
 
59
  # train