# File size: 7,084 Bytes
# ca690f1
# ################################
# Model: Neural SI-SNR Estimator with Pool training strategy (https://arxiv.org/pdf/2110.10812.pdf)
# Dataset : LibriMix and WHAMR!
# ################################
#
# Basic parameters
# Seed needs to be set at top of yaml, before objects with parameters are made
#
seed: 1234
# Seed torch's RNG from `seed`, so that editing the value above is enough to
# change the seed that is actually applied.
__set_seed: !apply:torch.manual_seed [!ref <seed>]
# Data params
# Root folder of the mixture dataset, e.g. '/yourpath/Libri2Mix/'
# NOTE(review): this comment used to describe wsj0-mix layouts (folder ending
# in 2speakers / 3speakers); the value below points at Libri2Mix -- confirm
# which layout the data-preparation code expects.
data_folder: /miniscratch/subakany/LibriMixData_new/Libri2Mix/
# Folder with the source utterances -- only needed if dynamic mixing is used
# e.g. /yourpath/LibriSpeech/train-clean-360_processed/
# the original audio needs to be converted to 8k first;
# you can do this conversion with the script ../meta/preprocess_dynamic_mixing.py
base_folder_dm: /miniscratch/subakany/LibriMixData_new/LibriSpeech/train-clean-360_processed/
# presumably the folder of WHAMR! room impulse responses (wav) -- TODO confirm
rir_path: /miniscratch/subakany/whamr_rirs_wav
# Output locations. The paths below are derived from <experiment_name> and
# <seed>, so renaming the experiment or changing the seed moves them together
# instead of requiring the same long path to be edited on every line.
experiment_name: snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators
output_folder: !ref results/<experiment_name>/<seed>
train_log: !ref <output_folder>/train_log.txt
save_folder: !ref <output_folder>/save
# CSV files for the Libri2Mix train / dev / test splits
train_data: !ref <save_folder>/libri2mix_train-360.csv
valid_data: !ref <save_folder>/libri2mix_dev.csv
test_data: !ref <save_folder>/libri2mix_test.csv
# WHAM!/WHAMR! data. The CSV paths are derived from <save_folder> rather than
# repeating the full experiment path.
wsj_data_folder: /network/tmp1/subakany/wham_original
train_wsj_data: !ref <save_folder>/wham_tr.csv
test_wsj_data: !ref <save_folder>/wham_tt.csv
base_folder_dm_whamr: /network/tmp1/subakany/wsj0-processed/si_tr_s
use_whamr_train: true
# NOTE(review): presumably the fraction of training examples drawn from WHAMR!
# when use_whamr_train is true -- confirm against the training script
whamr_proportion: 0.6
test_onwsj: false
skip_prep: false
ckpt_interval_minutes: 60
# Experiment params
auto_mix_prec: false # Set it to true for mixed precision
test_only: false
num_spks: 2 # number of speakers per mixture (the wsj0-3mix setup uses 3)
progressbar: true
save_audio: false # Save estimated sources on disk
# Also passed as `sample_rate` to the speedperturb / wavedrop augmenters
sample_rate: 8000
# Training parameters
# N_epochs matches the `limit` given to epoch_counter
N_epochs: 200
# batch_size matches the value in dataloader_opts
batch_size: 1
# lr matches the value given to the Adam optimizer
lr: 0.0001
clip_grad_norm: 5
loss_upper_lim: 999999 # this is the upper limit for an acceptable loss
# if true, the training sequences are cut to a specified length
limit_training_signal_len: false
# this is the length of sequences if we choose to limit
# the signal length of training sequences
# (only meaningful when limit_training_signal_len is true)
training_signal_len: 32000000
# Set it to true to dynamically create mixtures at training time
dynamic_mixing: true
use_wham_noise: true
use_reverb_augment: true
# Parameters for data augmentation
# NOTE(review): use_wavedrop / use_speedperturb presumably switch the
# `wavedrop` / `speedperturb` objects on or off -- confirm in the training script
use_wavedrop: false
use_speedperturb: true
use_speedperturb_sameforeachsource: false
use_rand_shift: false
# NOTE(review): shift bounds, presumably in samples and only used when
# use_rand_shift is true -- confirm
min_shift: -8000
max_shift: 8000
# Speed perturbation only: frequency dropping and chunk dropping are disabled
# (probability 0.0). The constructor arguments are nested under the key so
# they are passed to TimeDomainSpecAugment instead of becoming top-level keys.
speedperturb: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  perturb_prob: 1.0
  drop_freq_prob: 0.0
  drop_chunk_prob: 0.0
  sample_rate: !ref <sample_rate>
  speeds: [95, 100, 105]
# Frequency and chunk dropping only: speed perturbation is disabled
# (probability 0.0). The constructor arguments are nested under the key so
# they are passed to TimeDomainSpecAugment instead of becoming top-level keys.
wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  perturb_prob: 0.0
  drop_freq_prob: 1.0
  drop_chunk_prob: 1.0
  sample_rate: !ref <sample_rate>
# loss thresholding -- this thresholds the training loss
threshold_byloss: true
# NOTE(review): threshold value used when threshold_byloss is true; its units
# and direction depend on the loss in the training script -- confirm
threshold: -30
# Encoder parameters
# NOTE(review): nothing visible in this file references these four values; the
# `encoder` object below sets its own kernel sizes, strides and channel
# counts. Presumably they are read by the training script or by the separator
# models -- confirm before changing or removing them.
N_encoder_out: 256
out_channels: 256
kernel_size: 16
kernel_stride: 8
# Dataloader options
# The options are nested under the key; batch_size follows the top-level
# training parameter so the two cannot drift apart.
dataloader_opts:
  batch_size: !ref <batch_size>
  num_workers: 0
# Specifying the network
# NOTE(review): snrmin / snrmax are presumably the SNR range that the sigmoid
# output of encoder_out is mapped onto -- confirm in the training script
snrmin: 0
snrmax: 10
# Only referenced by the commented-out classifier_out block in this file
out_n_neurons: 16
use_snr_compression: true
separation_norm_type: stnorm
# Disabled filterbank front-end, kept for reference:
# compute_features: !new:speechbrain.lobes.features.Fbank
#   n_mels: !ref <n_mels>
#   left_frames: 0
#   right_frames: 0
#   deltas: False
latent_dim: 128
# Only referenced by the commented-out classifier_enc block in this file
n_inp: 256
# Convolutional encoder: five Conv1d layers (kernel 4, 'valid' padding) with a
# ReLU after each of the first four. It takes 2 input channels and produces
# 128 channels; the first layer has stride 1 and the remaining four stride 2.
# Each layer's arguments are nested under that layer so they reach its
# constructor instead of colliding as duplicate top-level keys.
# The &id006 anchor is kept because `modules` and `checkpointer` alias it.
encoder: &id006 !new:speechbrain.nnet.containers.Sequential
  input_shape: [null, 2, null]
  cnn1: !new:speechbrain.nnet.CNN.Conv1d
    in_channels: 2
    kernel_size: 4
    out_channels: 128
    stride: 1
    skip_transpose: true
    padding: valid
  relu1: !new:torch.nn.ReLU
  cnn2: !new:speechbrain.nnet.CNN.Conv1d
    in_channels: 128
    kernel_size: 4
    out_channels: 128
    stride: 2
    skip_transpose: true
    padding: valid
  relu2: !new:torch.nn.ReLU
  cnn3: !new:speechbrain.nnet.CNN.Conv1d
    in_channels: 128
    kernel_size: 4
    out_channels: 128
    stride: 2
    skip_transpose: true
    padding: valid
  relu3: !new:torch.nn.ReLU
  cnn4: !new:speechbrain.nnet.CNN.Conv1d
    in_channels: 128
    kernel_size: 4
    out_channels: 128
    stride: 2
    skip_transpose: true
    padding: valid
  relu4: !new:torch.nn.ReLU
  cnn5: !new:speechbrain.nnet.CNN.Conv1d
    in_channels: 128
    kernel_size: 4
    out_channels: 128
    stride: 2
    skip_transpose: true
    padding: valid
# Statistics pooling object, constructed without arguments.
# NOTE(review): treated here as its own top-level object rather than a layer of
# `encoder`; presumably the calling code applies it between `encoder` and
# `encoder_out` -- confirm.
stat_pooling: !new:speechbrain.nnet.pooling.StatisticsPooling
# Disabled alternatives (ECAPA-TDNN embedding + classifier, and a single
# linear output layer), kept for reference:
# classifier_enc: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
#   input_size: !ref <n_inp>
#   channels: [1024, 1024, 1024, 1024, 3072]
#   kernel_sizes: [5, 3, 3, 3, 1]
#   dilations: [1, 2, 3, 4, 1]
#   attention_channels: 128
#   lin_neurons: 192
# classifier_out: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
#   input_size: 192
#   out_neurons: !ref <out_n_neurons>
#
# classifier_out: !new:speechbrain.nnet.linear.Linear
#   input_size: 256
#   n_neurons: 1
# Output head: Linear(256 -> 256), ReLU, Linear(256 -> 1), Sigmoid.
# Each layer's arguments are nested under that layer so they reach its
# constructor instead of colliding as duplicate top-level keys.
# The &id007 anchor is kept because `modules` and `checkpointer` alias it.
encoder_out: &id007 !new:speechbrain.nnet.containers.Sequential
  # lr_scheduler: !ref <lr_scheduler>
  input_shape: [null, 256]
  layer1: !new:speechbrain.nnet.linear.Linear
    input_size: 256
    n_neurons: 256
  relu: !new:torch.nn.ReLU
  layer2: !new:speechbrain.nnet.linear.Linear
    input_size: 256
    n_neurons: 1
  sigm: !new:torch.nn.Sigmoid
classifier_loss: !new:torch.nn.CrossEntropyLoss
# Adam is referenced by name (`!name:`) with its keyword arguments nested
# below; lr follows the top-level training parameter.
optimizer: !name:torch.optim.Adam
  lr: !ref <lr>
  weight_decay: 0
loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper
# Learning-rate scheduler; its arguments are nested under the key so they are
# passed to ReduceLROnPlateau rather than parsed as top-level keys.
lr_scheduler: !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
  factor: 0.5
  patience: 2
  dont_halve_until_epoch: 95
# Epoch counter; the limit follows N_epochs so the two cannot drift apart.
# The &id008 anchor is kept because `checkpointer` aliases it.
epoch_counter: &id008 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: !ref <N_epochs>
# The two objects exposed under `modules`; both entries alias the objects
# defined above and must be nested under the key to form the mapping.
modules:
  encoder: *id006
  encoder_out: *id007
# Checkpointer writing into <save_folder>; it tracks the epoch counter and the
# two model objects. `recoverables` is a nested mapping of name -> object.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: !ref <save_folder>
  recoverables:
    counter: *id008
    encoder: *id006
    encoder_out: *id007
# File logger writing to the path defined by the top-level `train_log` key.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: !ref <train_log>
# NOTE(review): presumably the number of pre-trained separator checkpoints used
# per separation model when building the training pool -- confirm
num_separators_per_model: 3
# Machine-specific absolute path; adjust for your environment
separator_base_folder: /home/mila/s/subakany/speechbrain_new/recipes/WHAMandWHAMR/separation/results/
# Pretrainer for the two model objects; `loadables` is a nested mapping of
# name -> object and must sit under the key to be passed to the constructor.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    encoder: !ref <encoder>
    encoder_out: !ref <encoder_out>
|