# File size: 6,964 Bytes

# Generated 2024-06-04 from:
# /content/speechbrain/recipes/VoxCeleb/SpeakerRec/hparams/train_x_vectors.yaml
# yamllint disable
# ################################
# Model: Speaker identification with x-vectors
# Authors: Hwidong Na & Mirco Ravanelli
# ################################

# Basic parameters
# Random seed; the same literal is passed to torch.manual_seed on the next
# line and is embedded in the output paths below.
seed: 1986
# HyperPyYAML `!apply` tag naming torch.manual_seed with the argument list [1986].
__set_seed: !apply:torch.manual_seed [1986]
# Output locations for this run; save_folder and train_log sit under output_folder.
output_folder: results/xvect_augment/1986
save_folder: results/xvect_augment/1986/save
# Prefix of every entry in the pretrainer `paths` mapping at the end of this file.
# NOTE(review): looks like a model-hub repo id rather than a local path - confirm.
pretrained_path: Definite/hwaja_insic
train_log: results/xvect_augment/1986/train_log.txt

# Data for augmentation
# Download URLs for the augmentation corpora. The same two URLs are repeated
# as literals in prepare_noise_data and prepare_rir_data further down.
# Folded scalars with strip chomping (`>-`) keep each URL a single string
# without a trailing newline and without relying on an implicit multi-line
# plain scalar.
NOISE_DATASET_URL: >-
  https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
RIR_DATASET_URL: >-
  https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1

# Data files
# Dataset root, e.g. /path/to/Voxceleb
data_folder: ./Voxceleb
# The noisy sequences for data augmentation will automatically be downloaded here.
data_folder_noise: ./Voxceleb/noise
# The impulse responses used for data augmentation will automatically be downloaded here.
data_folder_rir: ./Voxceleb/rir
# CSV annotation files; all four paths are inside the run's save folder.
# The noise and RIR paths are repeated as `csv_file` in the augmentation
# entries further down.
train_annotation: results/xvect_augment/1986/save/train.csv
valid_annotation: results/xvect_augment/1986/save/dev.csv
noise_annotation: results/xvect_augment/1986/save/noise.csv
rir_annotation: results/xvect_augment/1986/save/rir.csv

# Use the following links for the official voxceleb splits:
# VoxCeleb1 (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt
# VoxCeleb1-H (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_hard2.txt
# VoxCeleb1-E (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_all2.txt
# VoxCeleb1-E and VoxCeleb1-H lists are drawn from the VoxCeleb1 training set.
# Therefore you cannot use any files in VoxCeleb1 for training if you are using these lists for testing.
# Verification list; this is a local path, whereas the comments above give
# URLs for the official lists.
verification_file: ./Voxceleb/save/veri_test.txt

# NOTE(review): presumably the train/dev split in percent - confirm against
# the data preparation code.
split_ratio: [90, 10]
# NOTE(review): presumably tells the recipe to skip data preparation - confirm.
skip_prep: true
ckpt_interval_minutes: 15 # save checkpoint every N min

# Training parameters
# NOTE: this file is a resolved dump (see the "Generated ... from" header), so
# the values below are repeated as plain literals elsewhere in the file;
# editing one here does not change the copies.
number_of_epochs: 1  # same literal as epoch_counter.limit and lr_annealing.epoch_count
batch_size: 16  # same literal as dataloader_options.batch_size
lr: 0.001  # same literal as opt_class.lr and lr_annealing.initial_value
lr_final: 0.0001  # same literal as lr_annealing.final_value

sample_rate: 16000  # same literal as the *_sample_rate arguments of add_noise / add_reverb
sentence_len: 3.0 # seconds
shuffle: true  # same literal as dataloader_options.shuffle
# No other entry in this file repeats this key.
# NOTE(review): presumably read directly by the training script - confirm.
random_chunk: false

# Feature parameters
# Feature settings. compute_features below repeats these four values as
# literals, and embedding_model.in_channels repeats the n_mels value.
n_mels: 24
left_frames: 0
right_frames: 0
deltas: false

# Number of speakers
# Number of output classes; classifier.out_neurons carries the same literal.
# Reference sizes: 1211 for vox1, 5994 for vox2, 7205 for vox1+vox2.
out_n_neurons: 1349
# Same literal as embedding_model.lin_neurons and the last entry of
# classifier.input_shape.
emb_dim: 512

# Worker count; the same literal appears in dataloader_options, add_noise
# and add_reverb.
num_workers: 4
# Data loader settings; the three values repeat batch_size, shuffle and
# num_workers from above as literals.
dataloader_options:
  batch_size: 16
  shuffle: true
  num_workers: 4

# Functions
# Feature extractor: HyperPyYAML `!new` node for speechbrain.lobes.features.Fbank
# with the keyword arguments below (same literals as the "Feature parameters"
# section). Anchor &id005 is aliased by `modules.compute_features`.
compute_features: &id005 !new:speechbrain.lobes.features.Fbank
  n_mels: 24
  left_frames: 0
  right_frames: 0
  deltas: false

# Embedding network: `!new` node for speechbrain.lobes.models.Xvector.Xvector.
# Anchor &id006 is aliased by `modules`, the checkpointer and the pretrainer.
embedding_model: &id006 !new:speechbrain.lobes.models.Xvector.Xvector
  in_channels: 24  # same literal as n_mels
  activation: !name:torch.nn.LeakyReLU
  tdnn_blocks: 5
  # The three lists below each have five entries, matching tdnn_blocks.
  tdnn_channels: [512, 512, 512, 512, 1500]
  tdnn_kernel_sizes: [5, 3, 3, 1, 1]
  tdnn_dilations: [1, 2, 3, 1, 1]
  lin_neurons: 512  # same literal as emb_dim

# Classification head: `!new` node for speechbrain.lobes.models.Xvector.Classifier.
# Anchor &id007 is aliased by `modules`, the checkpointer and the pretrainer.
classifier: &id007 !new:speechbrain.lobes.models.Xvector.Classifier
  # Last entry repeats emb_dim; the first two entries are left as null.
  input_shape: [null, null, 512]
  activation: !name:torch.nn.LeakyReLU
  lin_blocks: 1
  lin_neurons: 512
  out_neurons: 1349  # same literal as out_n_neurons

# Epoch counter: `!new` node for speechbrain.utils.epoch_loop.EpochCounter.
# Anchor &id009 is aliased by the checkpointer under the key `counter`.
epoch_counter: &id009 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: 1  # same literal as number_of_epochs

############################## Augmentations ###################################

# Download and prepare the dataset of noisy sequences for augmentation
# `!name` node referring to prepare_dataset_from_URL with the keyword
# arguments below preset.
# NOTE(review): per the HyperPyYAML docs `!name` yields a callable rather than
# calling it at load time - confirm the training script invokes it.
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
  # Same URL as NOISE_DATASET_URL; written as a folded scalar (`>-`) so it is
  # one explicit string with no trailing newline.
  URL: >-
    https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
  dest_folder: ./Voxceleb/noise  # same literal as data_folder_noise
  ext: wav
  # Same path as noise_annotation and add_noise.csv_file.
  csv_file: results/xvect_augment/1986/save/noise.csv


# Add noise to input signal
# `!new` node for speechbrain.augment.time_domain.AddNoise.
# Anchor &id001 is the first entry of wav_augment.augmentations.
add_noise: &id001 !new:speechbrain.augment.time_domain.AddNoise
  # Same path as prepare_noise_data.csv_file and noise_annotation.
  csv_file: results/xvect_augment/1986/save/noise.csv
  # NOTE(review): presumably the signal-to-noise range in dB - confirm.
  snr_low: 0
  snr_high: 15
  # Both rates repeat the sample_rate literal.
  noise_sample_rate: 16000
  clean_sample_rate: 16000
  num_workers: 4

# Download and prepare the dataset of room impulse responses for augmentation
# `!name` node referring to prepare_dataset_from_URL with the keyword
# arguments below preset.
# NOTE(review): per the HyperPyYAML docs `!name` yields a callable rather than
# calling it at load time - confirm the training script invokes it.
prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
  # Same URL as RIR_DATASET_URL; written as a folded scalar (`>-`) so it is
  # one explicit string with no trailing newline.
  URL: >-
    https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1
  dest_folder: ./Voxceleb/rir  # same literal as data_folder_rir
  ext: wav
  # Same path as rir_annotation and add_reverb.csv_file.
  csv_file: results/xvect_augment/1986/save/rir.csv

# Add reverberation to input signal
# `!new` node for speechbrain.augment.time_domain.AddReverb.
# Anchor &id002 is the second entry of wav_augment.augmentations.
add_reverb: &id002 !new:speechbrain.augment.time_domain.AddReverb
  # Same path as prepare_rir_data.csv_file and rir_annotation.
  csv_file: results/xvect_augment/1986/save/rir.csv
  # Both rates repeat the sample_rate literal.
  reverb_sample_rate: 16000
  clean_sample_rate: 16000
  num_workers: 4

# Frequency drop: randomly drops a number of frequency bands to zero.
# `!new` node for speechbrain.augment.time_domain.DropFreq.
# Anchor &id003 is the third entry of wav_augment.augmentations.
drop_freq: &id003 !new:speechbrain.augment.time_domain.DropFreq
  # NOTE(review): 0..1 looks like a normalized frequency range - confirm units.
  drop_freq_low: 0
  drop_freq_high: 1
  # Between one and three bands per application, going by the key names.
  drop_freq_count_low: 1
  drop_freq_count_high: 3
  drop_freq_width: 0.05

# Time drop: randomly drops a number of temporal chunks.
# `!new` node for speechbrain.augment.time_domain.DropChunk.
# Anchor &id004 is the fourth entry of wav_augment.augmentations.
drop_chunk: &id004 !new:speechbrain.augment.time_domain.DropChunk
  # NOTE(review): lengths are presumably in samples - confirm units.
  drop_length_low: 1000
  drop_length_high: 2000
  # Between one and five chunks per application, going by the key names.
  drop_count_low: 1
  drop_count_high: 5

# Augmenter: Combines previously defined augmentations to perform data augmentation
# `!new` node for speechbrain.augment.augmenter.Augmenter.
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
  parallel_augment: true
  concat_original: true
  # min and max both equal the number of entries in `augmentations` (four).
  min_augmentations: 4
  max_augmentations: 4
  augment_prob: 1.0
  # Aliases, in order: add_noise, add_reverb, drop_freq, drop_chunk.
  augmentations: [*id001, *id002, *id003, *id004]

# Input normalization: `!new` node for InputNormalization with the two
# arguments below. Anchor &id008 is aliased by `modules.mean_var_norm`, by the
# checkpointer (as `normalizer`) and by the pretrainer.
mean_var_norm: &id008 !new:speechbrain.processing.features.InputNormalization
  norm_type: sentence
  std_norm: false

# Second InputNormalization node, configured with `global` instead of
# `sentence` normalization. It has no anchor and is not listed in `modules`,
# the checkpointer or the pretrainer in this file.
# NOTE(review): presumably applied to embeddings by the consuming script - confirm.
mean_var_norm_emb: !new:speechbrain.processing.features.InputNormalization
  norm_type: global
  std_norm: false

# `!new` node for CategoricalEncoder, created with no arguments. The pretrainer
# at the end of this file refers to it via `!ref <label_encoder>`.
label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder

# Named collection of the objects defined above; each value is an alias, so
# it refers to the same node as the corresponding anchor:
# &id005 compute_features, &id006 embedding_model, &id007 classifier,
# &id008 mean_var_norm.
modules:
  compute_features: *id005
  embedding_model: *id006
  classifier: *id007
  mean_var_norm: *id008
# Cost function: `!name` node referring to speechbrain.nnet.losses.nll_loss.
compute_cost: !name:speechbrain.nnet.losses.nll_loss
# Disabled entry; error_stats further down uses the same classification_error function.
# compute_error: !name:speechbrain.nnet.losses.classification_error

# Optimizer: `!name` node referring to torch.optim.Adam with lr and
# weight_decay preset.
opt_class: !name:torch.optim.Adam
  lr: 0.001  # same literal as lr
  weight_decay: 0.000002

# Learning-rate schedule: `!new` node for LinearScheduler.
lr_annealing: !new:speechbrain.nnet.schedulers.LinearScheduler
  initial_value: 0.001  # same literal as lr
  final_value: 0.0001  # same literal as lr_final
  epoch_count: 1  # same literal as number_of_epochs

# Logging + checkpoints
# Training log writer: `!new` node for FileTrainLogger.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: results/xvect_augment/1986/train_log.txt  # same path as train_log

# Error statistics: `!name` node referring to MetricStats, whose `metric`
# argument is itself a `!name` node for classification_error with
# `reduction: batch` preset.
error_stats: !name:speechbrain.utils.metric_stats.MetricStats
  metric: !name:speechbrain.nnet.losses.classification_error
    reduction: batch

# Checkpointing: `!new` node for speechbrain.utils.checkpoints.Checkpointer.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: results/xvect_augment/1986/save  # same path as save_folder
  # Aliases: &id006 embedding_model, &id007 classifier, &id008 mean_var_norm
  # (registered here under the key `normalizer`), &id009 epoch_counter.
  recoverables:
    embedding_model: *id006
    classifier: *id007
    normalizer: *id008
    counter: *id009

# Pretrainer configuration: `!new` node for
# speechbrain.utils.parameter_transfer.Pretrainer. `loadables` names the
# target objects and `paths` gives the file for each one; both mappings use
# the same four keys.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  # Aliases: &id006 embedding_model, &id008 mean_var_norm, &id007 classifier.
  # label_encoder has no anchor, so it is referenced with `!ref` instead.
  loadables:
    embedding_model: *id006
    mean_var_norm: *id008
    classifier: *id007
    label_encoder: !ref <label_encoder>
  # Every file is looked up under <pretrained_path>.
  paths:
    embedding_model: !ref <pretrained_path>/embedding_model.ckpt
    # The file name follows the checkpointer's `normalizer` key for the same
    # object (&id008), not the `mean_var_norm` key used here.
    mean_var_norm: !ref <pretrained_path>/normalizer.ckpt
    classifier: !ref <pretrained_path>/classifier.ckpt
    label_encoder: !ref <pretrained_path>/label_encoder.txt