File size: 2,293 Bytes
8789192
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# ############################################################################
# Model: HuBERT base encoder + attentional LSTM decoder for direct SLU (SLURP)
# ############################################################################


# Hyperparameter names required from this file. `beam_searcher` is defined
# further down as a top-level key.
HPARAMS_NEEDED: ["beam_searcher"]
# Module names required from this file.
# NOTE(review): the `modules` mapping below only registers "hubert" and
# "beam_searcher"; "decoder" and "seq_lin" are not keys of `modules` (the
# decoder is defined under the key `dec`). Confirm whether the loading
# interface actually reads this list before relying on it.
MODULES_NEEDED: ["hubert", "decoder", "seq_lin"]

# HuggingFace Hub identifier of the encoder checkpoint, passed to `hubert` as
# `source`. The key is named `wav2vec2_hub`, but the value points to a HuBERT
# model; change it to benchmark different models.
wav2vec2_hub: facebook/hubert-base-ls960

# Pretrain folder (HuggingFace): prefix of every checkpoint path listed under
# `pretrainer.paths`.
pretrained_path: speechbrain/SLU-direct-SLURP-hubert-enc

# Model dimensions shared by the component definitions below.
# Encoder feature size, passed to the decoder as `enc_dim`.
encoder_dim: 768
# Size of the output symbol set: rows of `output_emb` and units of `seq_lin`.
output_neurons: 58
# Embedding dimension, used as the decoder `input_size`.
emb_size: 128
# Decoder hidden size, also the `seq_lin` input size.
dec_neurons: 512
# Attention dimension of the decoder (`attn_dim`).
dec_attn_dim: 512
# Number of decoder layers (`num_layers`).
dec_layer: 3

# Encoder wrapper built from the HuggingFace checkpoint named by
# `wav2vec2_hub`. It is registered under `modules`, and `pretrainer` loads
# `<pretrained_path>/hubert.ckpt` into it.
hubert: !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
  source: !ref <wav2vec2_hub>
  save_path: wav2vec2_checkpoints
  # NOTE(review): presumably skips loading the upstream HuggingFace weights
  # because the pretrainer supplies them — confirm against the class docs.
  pretrain: false
  freeze: true
  output_norm: true

# Embedding layer with `output_neurons` entries of size `emb_size`; handed to
# `beam_searcher` as `embedding` and bundled into `model`.
output_emb: !new:speechbrain.nnet.embedding.Embedding
  num_embeddings: !ref <output_neurons>
  embedding_dim: !ref <emb_size>

# Attentional RNN decoder (LSTM cells, content attention). Takes `emb_size`
# inputs and `encoder_dim` encoder features; handed to `beam_searcher` as
# `decoder` and bundled into `model`.
dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
  enc_dim: !ref <encoder_dim>
  input_size: !ref <emb_size>
  rnn_type: lstm
  attn_type: content
  hidden_size: !ref <dec_neurons>
  attn_dim: !ref <dec_attn_dim>
  num_layers: !ref <dec_layer>
  scaling: 1.0
  # Dropout probability of zero.
  dropout: 0.0

# Linear layer mapping the decoder hidden size (`dec_neurons`) to
# `output_neurons` outputs; handed to `beam_searcher` as `linear` and bundled
# into `model`.
seq_lin: !new:speechbrain.nnet.linear.Linear
  input_size: !ref <dec_neurons>
  n_neurons: !ref <output_neurons>

# Beam searcher wired to the embedding, attentional decoder and linear layer
# defined above. It is listed in HPARAMS_NEEDED and registered in `modules`.
beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearcher
  embedding: !ref <output_emb>
  decoder: !ref <dec>
  linear: !ref <seq_lin>
  # BOS and EOS share index 0.
  bos_index: 0
  eos_index: 0
  # NOTE(review): presumably bounds on the decoded length relative to the
  # encoder sequence length — confirm against the S2SRNNBeamSearcher docs.
  min_decode_ratio: 0.0
  max_decode_ratio: 10.0
  beam_size: 80
  eos_threshold: 1.5
  temperature: 1.25
  # The attention-shift limit is switched off; `max_attn_shift` is presumably
  # ignored while the flag is false — TODO confirm.
  using_max_attn_shift: false
  max_attn_shift: 30
  # Coverage penalty weight of zero.
  coverage_penalty: 0.

# Embedding, decoder and output layer grouped into one ModuleList; the single
# positional argument is the list of modules. `pretrainer` loads `model.ckpt`
# into this object.
model: !new:torch.nn.ModuleList
  - [!ref <output_emb>, !ref <dec>, !ref <seq_lin>]

# Named modules made available by this file (presumably consumed by the
# inference interface that loads it — confirm).
modules:
  hubert: !ref <hubert>
  beam_searcher: !ref <beam_searcher>

# SentencePiece processor constructed with no arguments; `pretrainer` loads
# `tokenizer_58_unigram.model` into it.
tokenizer: !new:sentencepiece.SentencePieceProcessor


# Pretrained-parameter loader: every object named in `loadables` has a file
# under the same key in `paths`, and all files live under `pretrained_path`.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    hubert: !ref <hubert>
    model: !ref <model>
    tokenizer: !ref <tokenizer>
  paths:
    hubert: !ref <pretrained_path>/hubert.ckpt
    model: !ref <pretrained_path>/model.ckpt
    tokenizer: !ref <pretrained_path>/tokenizer_58_unigram.model