# ################################
# Model: Best-RQ
# Authors: Jarod Duret 2024
# ################################

sample_rate: 16000
n_fft: 512
n_mels: 80
win_length: 32
hop_length: 10

####################### Model parameters ###########################
# Transformer
d_model: 768
nhead: 8
num_encoder_layers: 12
num_decoder_layers: 0
d_ffn: 2048
transformer_dropout: 0.1
activation: !name:torch.nn.GELU
output_neurons: 5000
encoder_layerdrop: 0.0

compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>
    hop_length: !ref <hop_length>
    win_length: !ref <win_length>

normalizer: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    update_until_epoch: 0

############################## Models ################################

latent_extractor: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
    input_shape: (8, 10, 80)
    num_blocks: 2
    num_layers_per_block: 1
    out_channels: (64, 32)
    kernel_sizes: (3, 3)
    strides: (2, 2)
    residuals: (False, False)

latent_encoder: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR
    input_size: 640
    tgt_vocab: !ref <output_neurons>
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    d_ffn: !ref <d_ffn>
    dropout: !ref <transformer_dropout>
    activation: !ref <activation>
    conformer_activation: !ref <activation>
    encoder_module: conformer
    attention_type: RelPosMHAXL
    normalize_before: True
    causal: False
    layerdrop_prob: !ref <encoder_layerdrop>

# We must call an encoder wrapper so the decoder isn't run (we don't have any)
encoder_wrapper: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
    transformer: !ref <latent_encoder>

# encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
#     latent_extractor: !ref <latent_extractor>
#     encoder_wrapper: !ref <encoder_wrapper>

model: !new:torch.nn.ModuleList
    - [!ref <latent_extractor>, !ref <encoder_wrapper>]

modules:
    normalizer: !ref <normalizer>
    extractor: !ref <latent_extractor>
    encoder: !ref <encoder_wrapper>

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
        normalizer: !ref <normalizer>
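
# Usage sketch (kept as a comment so the YAML stays valid; the filename and
# overrides below are illustrative assumptions, not part of the recipe).
# The file is meant to be instantiated through hyperpyyaml, which resolves the
# !new:, !name:, and !ref <...> tags into live Python objects:
#
#     from hyperpyyaml import load_hyperpyyaml
#
#     with open("BEST-RQ.yaml") as f:
#         # Optional overrides replace top-level keys before object creation.
#         hparams = load_hyperpyyaml(f, overrides={"d_model": 512})
#
#     model = hparams["model"]          # torch.nn.ModuleList of extractor + encoder
#     fbank = hparams["compute_features"]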