# File size: 6,964 Bytes
# (Web-viewer residue removed here: alternating revision markers b72ec2c / f3588ae
# and the 1-211 line-number gutter. Neither was part of the YAML document.)
# Generated 2024-06-04 from:
# /content/speechbrain/recipes/VoxCeleb/SpeakerRec/hparams/train_x_vectors.yaml
# yamllint disable
# ################################
# Model: Speaker identification with x-vectors
# Authors: Hwidong Na & Mirco Ravanelli
# ################################
# Basic parameters
# The seed is declared once; the seeding call and every run directory are
# derived from it with !ref so they cannot drift apart when the seed changes.
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]
output_folder: !ref results/xvect_augment/<seed>
save_folder: !ref <output_folder>/save
# Base location from which the pretrainer's checkpoint paths are built.
pretrained_path: Definite/hwaja_insic
train_log: !ref <output_folder>/train_log.txt
# Data for augmentation
# Each URL is written on the same line as its key and quoted, so it is always
# read as one string and never depends on the indentation of a continuation line.
NOISE_DATASET_URL: 'https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1'
RIR_DATASET_URL: 'https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1'
# Data files
data_folder: ./Voxceleb  # e.g. /path/to/Voxceleb
# The noisy sequences and the impulse responses used for data augmentation
# will automatically be downloaded into these two folders.
data_folder_noise: !ref <data_folder>/noise
data_folder_rir: !ref <data_folder>/rir
# CSV manifests are kept in the run's save folder.
train_annotation: !ref <save_folder>/train.csv
valid_annotation: !ref <save_folder>/dev.csv
noise_annotation: !ref <save_folder>/noise.csv
rir_annotation: !ref <save_folder>/rir.csv
# Official VoxCeleb verification splits:
#   VoxCeleb1 (cleaned):   https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt
#   VoxCeleb1-H (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_hard2.txt
#   VoxCeleb1-E (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_all2.txt
# The VoxCeleb1-E and VoxCeleb1-H lists are drawn from the VoxCeleb1 training
# set, so no VoxCeleb1 file may be used for training when testing on them.
verification_file: ./Voxceleb/save/veri_test.txt
split_ratio:
  - 90
  - 10
skip_prep: true
ckpt_interval_minutes: 15  # save checkpoint every N min
# Training parameters
# -- schedule: epoch budget plus the start (lr) and end (lr_final) learning rates
number_of_epochs: 1
lr: 0.001
lr_final: 0.0001
# -- batching
batch_size: 16
shuffle: true
# -- audio
sample_rate: 16000
sentence_len: 3.0  # seconds
random_chunk: false
# Feature parameters
n_mels: 24
left_frames: 0
right_frames: 0
deltas: false

# Number of speakers (output size of the classifier)
# Reference values: 1211 for vox1, 5994 for vox2, 7205 for vox1+vox2
out_n_neurons: 1349

# Embedding size and number of data-loading workers
emb_dim: 512
num_workers: 4
# Data loader options. The three entries are nested under the key (instead of
# repeating top-level keys) and reference the training parameters of the same
# name, so each value is declared in exactly one place.
dataloader_options:
  batch_size: !ref <batch_size>
  shuffle: !ref <shuffle>
  num_workers: !ref <num_workers>
# Functions
# Fbank feature extractor; every argument references the feature parameter of
# the same name. Anchored as id005 for reuse by the modules mapping.
compute_features: &id005 !new:speechbrain.lobes.features.Fbank
  n_mels: !ref <n_mels>
  left_frames: !ref <left_frames>
  right_frames: !ref <right_frames>
  deltas: !ref <deltas>
# X-vector embedding network (anchored as id006).
# in_channels follows n_mels and lin_neurons follows emb_dim, so the network
# stays in step with the feature extractor and the classifier input.
embedding_model: &id006 !new:speechbrain.lobes.models.Xvector.Xvector
  in_channels: !ref <n_mels>
  activation: !name:torch.nn.LeakyReLU
  tdnn_blocks: 5
  # One entry per TDNN block in each of the three lists below.
  tdnn_channels: [512, 512, 512, 512, 1500]
  tdnn_kernel_sizes: [5, 3, 3, 1, 1]
  tdnn_dilations: [1, 2, 3, 1, 1]
  lin_neurons: !ref <emb_dim>
# Speaker classifier on top of the embeddings (anchored as id007).
# Its input width is the embedding size and its output width is the number of
# speakers; both are referenced rather than repeated.
classifier: &id007 !new:speechbrain.lobes.models.Xvector.Classifier
  input_shape: [null, null, !ref <emb_dim>]
  activation: !name:torch.nn.LeakyReLU
  lin_blocks: 1
  lin_neurons: !ref <emb_dim>
  out_neurons: !ref <out_n_neurons>
# Epoch counter (anchored as id009); its limit is the configured epoch budget.
epoch_counter: &id009 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: !ref <number_of_epochs>
############################## Augmentations ###################################
# Download and prepare the dataset of noisy sequences for augmentation.
# URL, destination folder and manifest reference the top-level settings, so the
# download step and the AddNoise augmentation cannot point at different files.
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
  URL: !ref <NOISE_DATASET_URL>
  dest_folder: !ref <data_folder_noise>
  ext: wav
  csv_file: !ref <noise_annotation>
# Add noise to input signal (anchored as id001).
# Noise is read from the noise manifest; snr_low / snr_high bound the SNR used.
add_noise: &id001 !new:speechbrain.augment.time_domain.AddNoise
  csv_file: !ref <noise_annotation>
  snr_low: 0
  snr_high: 15
  noise_sample_rate: !ref <sample_rate>
  clean_sample_rate: !ref <sample_rate>
  num_workers: !ref <num_workers>
# Download and prepare the dataset of room impulse responses for augmentation.
# URL, destination folder and manifest reference the top-level settings, so the
# download step and the AddReverb augmentation cannot point at different files.
prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
  URL: !ref <RIR_DATASET_URL>
  dest_folder: !ref <data_folder_rir>
  ext: wav
  csv_file: !ref <rir_annotation>
# Add reverberation to input signal (anchored as id002).
# Impulse responses are read from the RIR manifest.
add_reverb: &id002 !new:speechbrain.augment.time_domain.AddReverb
  csv_file: !ref <rir_annotation>
  reverb_sample_rate: !ref <sample_rate>
  clean_sample_rate: !ref <sample_rate>
  num_workers: !ref <num_workers>
# Frequency drop: randomly drops a number of frequency bands to zero
# (anchored as id003). All five settings are arguments of the DropFreq object
# and are therefore nested beneath it.
drop_freq: &id003 !new:speechbrain.augment.time_domain.DropFreq
  drop_freq_low: 0
  drop_freq_high: 1
  drop_freq_count_low: 1
  drop_freq_count_high: 3
  drop_freq_width: 0.05
# Time drop: randomly drops a number of temporal chunks (anchored as id004).
# The length and count bounds are arguments of the DropChunk object and are
# therefore nested beneath it.
drop_chunk: &id004 !new:speechbrain.augment.time_domain.DropChunk
  drop_length_low: 1000
  drop_length_high: 2000
  drop_count_low: 1
  drop_count_high: 5
# Augmenter: Combines previously defined augmentations to perform data augmentation.
# min_augmentations == max_augmentations == 4 with augment_prob 1.0, i.e. all
# four listed augmentations are configured to be used.
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
  parallel_augment: true
  concat_original: true
  min_augmentations: 4
  max_augmentations: 4
  augment_prob: 1.0
  augmentations:
    - *id001  # add_noise
    - *id002  # add_reverb
    - *id003  # drop_freq
    - *id004  # drop_chunk
# Normalizer configured with norm_type 'sentence'; std_norm is disabled.
# Anchored as id008 so the modules mapping, the checkpointer and the pretrainer
# all share this one object.
mean_var_norm: &id008 !new:speechbrain.processing.features.InputNormalization
  norm_type: sentence
  std_norm: false
# Second normalizer, configured with norm_type 'global'; std_norm is disabled.
# NOTE(review): presumably applied to embeddings, going by the key name — confirm
# against the training script.
mean_var_norm_emb: !new:speechbrain.processing.features.InputNormalization
  norm_type: global
  std_norm: false
# Categorical label encoder instance (no constructor arguments).
label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder

# Named modules. Each entry is an alias to the object anchored earlier in this
# file; the entries are nested so they do not collide with the top-level keys
# of the same names.
modules:
  compute_features: *id005
  embedding_model: *id006
  classifier: *id007
  mean_var_norm: *id008
# Loss function and optimizer
compute_cost: !name:speechbrain.nnet.losses.nll_loss
# compute_error: !name:speechbrain.nnet.losses.classification_error
# Adam optimizer; lr and weight_decay are its arguments and are nested beneath
# it. The learning rate references the top-level lr.
opt_class: !name:torch.optim.Adam
  lr: !ref <lr>
  weight_decay: 0.000002
# Linear learning-rate scheduler running from lr to lr_final over the
# configured number of epochs; all three values reference the training
# parameters instead of repeating them.
lr_annealing: !new:speechbrain.nnet.schedulers.LinearScheduler
  initial_value: !ref <lr>
  final_value: !ref <lr_final>
  epoch_count: !ref <number_of_epochs>
# Logging + checkpoints
# File logger writing to the train_log path declared in the basic parameters.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: !ref <train_log>
# Metric-stats factory built around classification_error.
# `reduction: batch` is an argument of the metric callable, so it is nested one
# level deeper than `metric`.
error_stats: !name:speechbrain.utils.metric_stats.MetricStats
  metric: !name:speechbrain.nnet.losses.classification_error
    reduction: batch
# Checkpointer writing into the run's save folder. The recoverables are aliases
# to the objects anchored earlier in this file (model, classifier, normalizer,
# epoch counter).
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: !ref <save_folder>
  recoverables:
    embedding_model: *id006
    classifier: *id007
    normalizer: *id008
    counter: *id009
# Pretrainer: maps each loadable object to the file it is loaded from under
# pretrained_path. Every key in `paths` has a matching key in `loadables`.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    embedding_model: *id006
    mean_var_norm: *id008
    classifier: *id007
    label_encoder: !ref <label_encoder>
  paths:
    embedding_model: !ref <pretrained_path>/embedding_model.ckpt
    # File is named normalizer.ckpt: the checkpointer's recoverables register
    # this same object (*id008) under the key `normalizer`.
    mean_var_norm: !ref <pretrained_path>/normalizer.ckpt
    classifier: !ref <pretrained_path>/classifier.ckpt
    label_encoder: !ref <pretrained_path>/label_encoder.txt