runner:
  total_steps: 200000
  gradient_clipping: 1
  gradient_accumulate_steps: 48
  log_step: 100
  eval_step: 250
  save_step: 250
  max_keep: 1
  eval_dataloaders:
    - dev
    - test

optimizer:
  name: AdamW
  lr: 2.0e-4

# comment out the whole scheduler block
# to disable learning rate scheduling
scheduler:
  name: linear_schedule_with_warmup
  num_warmup_steps: 1000

downstream_expert:
  datarc:
    file_path: /path/to/audio_slu
    train_speakers:
      - Aditi
      - Amy
      # - Brian
      # - Emma
      # - Geraint
      # - Ivy
      # - Joanna
      # - Joey
      # - Justin
      # - Kendra
      # - Kimberly
      # - Matthew
      # - Nicole
      # - Raveena
      # - Russell
      # - Salli
    test_speakers:
      - Aditi
      # - Amy
      # - Brian
      # - Emma
      # - Geraint
      # - Ivy
      # - Joanna
      # - Joey
      # - Justin
      # - Kendra
      # - Kimberly
      # - Matthew
      # - Nicole
      # - Raveena
      # - Russell
      # - Salli
    num_workers: 12
    train_batch_size: 1
    eval_batch_size: 1

  modelrc:
    module: TransformerEncoder
    hparams:
      hidden_size: 512                   # Size of the encoder layers and the pooler layer.
      num_hidden_layers: 2               # Number of hidden layers in the Transformer encoder.
      num_attention_heads: 8             # Number of attention heads for each attention layer in the Transformer encoder.
      intermediate_size: 2048            # The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
      hidden_act: "gelu"                 # The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
      hidden_dropout_prob: 0.1           # The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
      attention_probs_dropout_prob: 0.1  # The dropout ratio for the attention probabilities.
      initializer_range: 0.02            # The stddev of the truncated_normal_initializer for initializing all weight matrices.
      layer_norm_eps: 1.0e-12            # The epsilon used by LayerNorm.
      share_layer: False                 # Share layer weights.
      max_input_length: 0                # Maximum input length (0 for no restriction).
      pre_layer_norm: False              # Apply the pre-layer normalization technique introduced in https://arxiv.org/abs/2002.04745
    input_dim: 512
    agg_module: SAP
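If this configuration is stored as a YAML file (e.g. `config.yaml`, a name assumed here), the sketch below shows how its nested structure could be loaded and inspected with PyYAML. It is a minimal illustration only, not the loader of any particular training toolkit:

```python
import yaml

# Load the experiment configuration (assumes the YAML above is saved as config.yaml).
with open("config.yaml") as f:
    config = yaml.safe_load(f)

runner = config["runner"]
optimizer = config["optimizer"]
modelrc = config["downstream_expert"]["modelrc"]

print(runner["total_steps"])                     # 200000
print(optimizer["name"], optimizer["lr"])        # AdamW 0.0002
print(modelrc["module"], modelrc["agg_module"])  # TransformerEncoder SAP
print(modelrc["hparams"]["hidden_size"])         # 512

# The scheduler block is optional (comment it out to disable LR scheduling),
# so guard the lookup instead of indexing directly.
scheduler = config.get("scheduler")
if scheduler is not None:
    print(scheduler["name"], scheduler["num_warmup_steps"])
```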