runner:
  total_steps: 200000
  gradient_clipping: 1
  gradient_accumulate_steps: 48
  log_step: 100
  eval_step: 250
  save_step: 250
  max_keep: 1
  eval_dataloaders:
    - dev
    - test

optimizer:
  name: AdamW
  lr: 2.0e-4

# comment out the whole scheduler block
# to disable learning rate scheduling
scheduler:
  name: linear_schedule_with_warmup
  num_warmup_steps: 1000

downstream_expert:
  datarc:
    file_path: /path/to/audio_slu
    train_speakers:
      - Aditi
      - Amy
      # - Brian
      # - Emma
      # - Geraint
      # - Ivy
      # - Joanna
      # - Joey
      # - Justin
      # - Kendra
      # - Kimberly
      # - Matthew
      # - Nicole
      # - Raveena
      # - Russell
      # - Salli
    test_speakers:
      - Aditi
      # - Amy
      # - Brian
      # - Emma
      # - Geraint
      # - Ivy
      # - Joanna
      # - Joey
      # - Justin
      # - Kendra
      # - Kimberly
      # - Matthew
      # - Nicole
      # - Raveena
      # - Russell
      # - Salli
    num_workers: 12
    train_batch_size: 1
    eval_batch_size: 1

  modelrc:
    module: TransformerEncoder
    hparams:
      hidden_size: 512                   # Size of the encoder layers and the pooler layer.
      num_hidden_layers: 2               # Number of hidden layers in the Transformer encoder.
      num_attention_heads: 8             # Number of attention heads for each attention layer in the Transformer encoder.
      intermediate_size: 2048            # The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
      hidden_act: "gelu"                 # The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
      hidden_dropout_prob: 0.1           # The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
      attention_probs_dropout_prob: 0.1  # The dropout ratio for the attention probabilities.
      initializer_range: 0.02            # The stddev of the truncated_normal_initializer for initializing all weight matrices.
      layer_norm_eps: 1.0e-12            # The epsilon used by LayerNorm.
      share_layer: False                 # Share layer weights.
      max_input_length: 0                # Maximum input length (0 for no restriction).
      pre_layer_norm: False              # Apply the pre-layer normalization technique introduced in https://arxiv.org/abs/2002.04745
    input_dim: 512
    agg_module: SAP
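If this configuration is stored as a YAML file (e.g. `config.yaml`, a name assumed here), the sketch below shows how its nested structure could be loaded and inspected with PyYAML. It is a minimal illustration only, not the loader of any particular training toolkit:

```python
import yaml

# Load the experiment configuration (assumes the YAML above is saved as config.yaml).
with open("config.yaml") as f:
    config = yaml.safe_load(f)

runner = config["runner"]
optimizer = config["optimizer"]
modelrc = config["downstream_expert"]["modelrc"]

print(runner["total_steps"])                     # 200000
print(optimizer["name"], optimizer["lr"])        # AdamW 0.0002
print(modelrc["module"], modelrc["agg_module"])  # TransformerEncoder SAP
print(modelrc["hparams"]["hidden_size"])         # 512

# The scheduler block is optional (comment it out to disable LR scheduling),
# so guard the lookup instead of indexing directly.
scheduler = config.get("scheduler")
if scheduler is not None:
    print(scheduler["name"], scheduler["num_warmup_steps"])
```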