# ############################################################################
# Model: HuBERT encoder with an attentional RNN decoder
#        (pretrained weights: speechbrain/SLU-direct-SLURP-hubert-enc)
#
# HyperPyYAML inference config: `!new:` instantiates a class with the nested
# keys as arguments, and `!ref <key>` points at another top-level entry.
# ############################################################################

# Hparams NEEDED
HPARAMS_NEEDED: ["beam_searcher"]

# Modules Needed
# NOTE(review): `modules:` at the bottom of this file only exposes `hubert`
# and `beam_searcher`; "decoder" and "seq_lin" are not keys there. Confirm
# against the interface class that consumes this list.
MODULES_NEEDED: ["hubert", "decoder", "seq_lin"]

# HuggingFace hub id of the encoder; change it to benchmark different models.
# The key keeps its historical `wav2vec2_` name although the default value is
# a HuBERT checkpoint.
wav2vec2_hub: facebook/hubert-base-ls960

# Pretrain folder (HuggingFace)
pretrained_path: speechbrain/SLU-direct-SLURP-hubert-enc

# Model parameters
encoder_dim: 768
output_neurons: 58  # matches the tokenizer file name `tokenizer_58_unigram`
emb_size: 128
dec_neurons: 512
dec_attn_dim: 512
dec_layer: 3

# Encoder. Its weights come from the pretrainer below (`hubert.ckpt`).
hubert: !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
  source: !ref <wav2vec2_hub>
  output_norm: true
  freeze: true
  pretrain: false
  save_path: wav2vec2_checkpoints

# Embedding table for the output tokens fed back into the decoder.
output_emb: !new:speechbrain.nnet.embedding.Embedding
  num_embeddings: !ref <output_neurons>
  embedding_dim: !ref <emb_size>

# Attentional LSTM decoder over the encoder states.
dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
  enc_dim: !ref <encoder_dim>
  input_size: !ref <emb_size>
  rnn_type: lstm
  attn_type: content
  hidden_size: !ref <dec_neurons>
  attn_dim: !ref <dec_attn_dim>
  num_layers: !ref <dec_layer>
  scaling: 1.0
  dropout: 0.0

# Projection from decoder hidden size to the output vocabulary.
seq_lin: !new:speechbrain.nnet.linear.Linear
  input_size: !ref <dec_neurons>
  n_neurons: !ref <output_neurons>

# Beam search wired to the embedding, decoder and projection defined above.
beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearcher
  embedding: !ref <output_emb>
  decoder: !ref <dec>
  linear: !ref <seq_lin>
  bos_index: 0
  eos_index: 0
  min_decode_ratio: 0.0
  max_decode_ratio: 10.0
  beam_size: 80
  eos_threshold: 1.5
  temperature: 1.25
  using_max_attn_shift: false
  max_attn_shift: 30
  coverage_penalty: 0.0

# Decoder-side modules grouped into one ModuleList so they share a single
# checkpoint (`model.ckpt`); the encoder is checkpointed separately.
model: !new:torch.nn.ModuleList
  - [!ref <output_emb>, !ref <dec>, !ref <seq_lin>]

modules:
  hubert: !ref <hubert>
  beam_searcher: !ref <beam_searcher>

tokenizer: !new:sentencepiece.SentencePieceProcessor

# Maps each loadable object to the checkpoint file it is restored from.
# Keys under `loadables` and `paths` must match one-to-one.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    hubert: !ref <hubert>
    model: !ref <model>
    tokenizer: !ref <tokenizer>
  paths:
    hubert: !ref <pretrained_path>/hubert.ckpt
    model: !ref <pretrained_path>/model.ckpt
    tokenizer: !ref <pretrained_path>/tokenizer_58_unigram.model