# ############################################################################
# Model: VAD (CRDNN) + ECAPA-TDNN speaker embeddings
# Author:
# ############################################################################
#
# HyperPyYAML configuration: `!new:` instantiates the named class, `!name:`
# passes a callable to be built later, and `!ref <key>` re-uses the value of
# another key defined in this file.
#
# NOTE(review): the `<...>` targets of the `!ref` tags below were
# reconstructed from the key names defined in this file -- confirm each one
# against the upstream recipe before relying on this configuration.

# Feature parameters
sample_rate: 16000
time_resolution: 0.01  # in seconds (e.g., 0.01 = 10 ms)
n_fft: 400
n_mels_vad: 40
batch_size: 512

# VAD parameters
cnn1_channels: 16
cnn2_channels: 32
cnn_kernelsize: (3, 3)
rnn_layers: 2
rnn_neurons: 32
rnn_bidirectional: True
dnn_blocks: 1
dnn_neurons: 16
output_neurons_vad: 1

# ECAPA_TDNN
n_mels_ecapa: 80
out_neurons_ecapa: 7205
emb_dim: 192

dataloader_opts:
    batch_size: !ref <batch_size>

# VAD objects
compute_fbank_vad: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels_vad>
    hop_length: !ref <time_resolution> * 1000  # in ms

mean_var_norm_vad: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence

cnn: !new:speechbrain.nnet.containers.Sequential
    input_shape: [null, null, !ref <n_mels_vad>]
    norm1: !name:speechbrain.nnet.normalization.LayerNorm
    cnn1: !name:speechbrain.lobes.models.CRDNN.CNN_Block
        channels: !ref <cnn1_channels>
        kernel_size: !ref <cnn_kernelsize>
    cnn2: !name:speechbrain.lobes.models.CRDNN.CNN_Block
        channels: !ref <cnn2_channels>
        kernel_size: !ref <cnn_kernelsize>

rnn: !new:speechbrain.nnet.RNN.GRU
    # NOTE(review): 320 is hard-coded; presumably the flattened CNN output
    # size for the settings above -- confirm if the CNN parameters change.
    input_shape: [null, null, 320]
    hidden_size: !ref <rnn_neurons>
    num_layers: !ref <rnn_layers>
    bidirectional: !ref <rnn_bidirectional>

dnn: !new:speechbrain.nnet.containers.Sequential
    input_shape: [null, null, !ref <rnn_neurons> * 2]
    dnn1: !name:speechbrain.lobes.models.CRDNN.DNN_Block
        neurons: !ref <dnn_neurons>
    dnn2: !name:speechbrain.lobes.models.CRDNN.DNN_Block
        neurons: !ref <dnn_neurons>
    lin: !name:speechbrain.nnet.linear.Linear
        n_neurons: !ref <output_neurons_vad>
        bias: False

##########################################################
# ECAPA_TDNN objects
compute_fbank_ecapa: !new:speechbrain.lobes.features.Fbank
    n_mels: !ref <n_mels_ecapa>

mean_var_norm_ecapa: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False

embedding_model: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
    input_size: !ref <n_mels_ecapa>
    channels: [1024, 1024, 1024, 1024, 3072]
    kernel_sizes: [5, 3, 3, 3, 1]
    dilations: [1, 2, 3, 4, 1]
    attention_channels: 128
    lin_neurons: 192

mean_var_norm_emb: !new:speechbrain.processing.features.InputNormalization
    norm_type: global
    std_norm: False

#####################
# The three VAD stages are grouped in a single ModuleList under the key `vad`.
vad: !new:torch.nn.ModuleList
    - [!ref <cnn>, !ref <rnn>, !ref <dnn>]

#####################
modules:
    compute_fbank_vad: !ref <compute_fbank_vad>
    compute_fbank_ecapa: !ref <compute_fbank_ecapa>
    cnn: !ref <cnn>
    rnn: !ref <rnn>
    dnn: !ref <dnn>
    mean_var_norm_vad: !ref <mean_var_norm_vad>
    mean_var_norm_ecapa: !ref <mean_var_norm_ecapa>
    embedding_model: !ref <embedding_model>
    mean_var_norm_emb: !ref <mean_var_norm_emb>

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        vad: !ref <vad>
        embedding_model: !ref <embedding_model>
        mean_var_norm_vad: !ref <mean_var_norm_vad>
        mean_var_norm_emb: !ref <mean_var_norm_emb>