# Generated 2024-06-04 from:
# /content/speechbrain/recipes/VoxCeleb/SpeakerRec/hparams/train_x_vectors.yaml
# yamllint disable
# ################################
# Model: Speaker identification with x-vectors
# Authors: Hwidong Na & Mirco Ravanelli
# ################################
#
# NOTE: this is a generated (resolved) hyperparameter file. Most values that
# appear inside the object definitions below are literal copies of the basic
# parameters declared first (e.g. n_mels, lr, save_folder); when editing by
# hand, keep the copies in sync.

# Basic parameters
seed: 1986
__set_seed: !apply:torch.manual_seed [1986]
output_folder: results/xvect_augment/1986
save_folder: results/xvect_augment/1986/save
# Location the pretrainer paths at the bottom of this file are built from.
pretrained_path: Definite/hwaja_insic
train_log: results/xvect_augment/1986/train_log.txt

# Data for augmentation
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
RIR_DATASET_URL: https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1

# Data files
data_folder: ./Voxceleb  # e.g. /path/to/Voxceleb
data_folder_noise: ./Voxceleb/noise  # The noisy sequences for data augmentation will automatically be downloaded here.
data_folder_rir: ./Voxceleb/rir  # The impulse responses used for data augmentation will automatically be downloaded here.
train_annotation: results/xvect_augment/1986/save/train.csv
valid_annotation: results/xvect_augment/1986/save/dev.csv
noise_annotation: results/xvect_augment/1986/save/noise.csv
rir_annotation: results/xvect_augment/1986/save/rir.csv

# Use the following links for the official voxceleb splits:
# VoxCeleb1 (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt
# VoxCeleb1-H (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_hard2.txt
# VoxCeleb1-E (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_all2.txt.
# VoxCeleb1-E and VoxCeleb1-H lists are drawn from the VoxCeleb1 training set.
# Therefore you cannot use any files in VoxCeleb1 for training if you are using these lists for testing.
verification_file: ./Voxceleb/save/veri_test.txt

split_ratio: [90, 10]
skip_prep: true
ckpt_interval_minutes: 15  # save checkpoint every N min

# Training parameters
number_of_epochs: 1
batch_size: 16
lr: 0.001
lr_final: 0.0001

sample_rate: 16000
sentence_len: 3.0  # seconds
shuffle: true
random_chunk: false

# Feature parameters
n_mels: 24
left_frames: 0
right_frames: 0
deltas: false

# Number of speakers
out_n_neurons: 1349  # 1211 for vox1, 5994 for vox2, 7205 for vox1+vox2
emb_dim: 512

num_workers: 4
dataloader_options:
  batch_size: 16
  shuffle: true
  num_workers: 4

# Functions
compute_features: &id005 !new:speechbrain.lobes.features.Fbank
  n_mels: 24
  left_frames: 0
  right_frames: 0
  deltas: false

embedding_model: &id006 !new:speechbrain.lobes.models.Xvector.Xvector
  in_channels: 24  # same value as n_mels
  activation: !name:torch.nn.LeakyReLU
  tdnn_blocks: 5
  tdnn_channels: [512, 512, 512, 512, 1500]
  tdnn_kernel_sizes: [5, 3, 3, 1, 1]
  tdnn_dilations: [1, 2, 3, 1, 1]
  lin_neurons: 512  # same value as emb_dim

classifier: &id007 !new:speechbrain.lobes.models.Xvector.Classifier
  input_shape: [null, null, 512]
  activation: !name:torch.nn.LeakyReLU
  lin_blocks: 1
  lin_neurons: 512
  out_neurons: 1349  # same value as out_n_neurons

epoch_counter: &id009 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: 1  # same value as number_of_epochs

############################## Augmentations ###################################

# Download and prepare the dataset of noisy sequences for augmentation
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
  URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
  dest_folder: ./Voxceleb/noise
  ext: wav
  csv_file: results/xvect_augment/1986/save/noise.csv

# Add noise to input signal
add_noise: &id001 !new:speechbrain.augment.time_domain.AddNoise
  csv_file: results/xvect_augment/1986/save/noise.csv
  snr_low: 0
  snr_high: 15
  noise_sample_rate: 16000
  clean_sample_rate: 16000
  num_workers: 4

# Download and prepare the dataset of room impulse responses for augmentation
prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
  URL: https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1
  dest_folder: ./Voxceleb/rir
  ext: wav
  csv_file: results/xvect_augment/1986/save/rir.csv

# Add reverberation to input signal
add_reverb: &id002 !new:speechbrain.augment.time_domain.AddReverb
  csv_file: results/xvect_augment/1986/save/rir.csv
  reverb_sample_rate: 16000
  clean_sample_rate: 16000
  num_workers: 4

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq: &id003 !new:speechbrain.augment.time_domain.DropFreq
  drop_freq_low: 0
  drop_freq_high: 1
  drop_freq_count_low: 1
  drop_freq_count_high: 3
  drop_freq_width: 0.05

# Time drop: randomly drops a number of temporal chunks.
drop_chunk: &id004 !new:speechbrain.augment.time_domain.DropChunk
  drop_length_low: 1000
  drop_length_high: 2000
  drop_count_low: 1
  drop_count_high: 5

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
  parallel_augment: true
  concat_original: true
  min_augmentations: 4
  max_augmentations: 4
  augment_prob: 1.0
  augmentations: [*id001, *id002, *id003, *id004]

# Normalization applied to the input features (shared by modules, the
# checkpointer and the pretrainer through the &id008 anchor).
mean_var_norm: &id008 !new:speechbrain.processing.features.InputNormalization
  norm_type: sentence
  std_norm: false

# Separate normalizer instance with global statistics; it is not referenced
# by any other entry in this file.
mean_var_norm_emb: !new:speechbrain.processing.features.InputNormalization
  norm_type: global
  std_norm: false

label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder

modules:
  compute_features: *id005
  embedding_model: *id006
  classifier: *id007
  mean_var_norm: *id008

# Cost + optimization
compute_cost: !name:speechbrain.nnet.losses.nll_loss
# compute_error: !name:speechbrain.nnet.losses.classification_error

opt_class: !name:torch.optim.Adam
  lr: 0.001
  weight_decay: 0.000002

lr_annealing: !new:speechbrain.nnet.schedulers.LinearScheduler
  initial_value: 0.001  # same value as lr
  final_value: 0.0001  # same value as lr_final
  epoch_count: 1

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: results/xvect_augment/1986/train_log.txt

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
  metric: !name:speechbrain.nnet.losses.classification_error
    reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: results/xvect_augment/1986/save
  recoverables:
    embedding_model: *id006
    classifier: *id007
    normalizer: *id008
    counter: *id009

# Pretrained-parameter loading. Each key under `loadables` has a matching key
# under `paths`; all paths are built from `pretrained_path` declared at the
# top of this file. The normalizer file is named normalizer.ckpt, matching the
# `normalizer` recoverable name used by the checkpointer above.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    embedding_model: *id006
    mean_var_norm: *id008
    classifier: *id007
    label_encoder: !ref <label_encoder>
  paths:
    embedding_model: !ref <pretrained_path>/embedding_model.ckpt
    mean_var_norm: !ref <pretrained_path>/normalizer.ckpt
    classifier: !ref <pretrained_path>/classifier.ckpt
    label_encoder: !ref <pretrained_path>/label_encoder.txt