# Data-augmentation switches. All three are disabled in this configuration.
augmentation:
  augment_pitch_shift: false
  augment_time_stretch: false
  noise_injection: false
# Dataset manifest paths and audio feature settings.
data:
  # NOTE(review): normalize_audio (below) is false, so this type may be
  # unused in this run - confirm against the data loader.
  audio_normalization_type: standard
  eval_dataset_path: /mnt/disk4/datasets/librispeech/valid.tsv
  feature_type: mel
  # Presumably in samples: 160 / 16000 Hz would be a 10 ms hop - confirm units.
  hop_length: 160
  # Presumably seconds of audio per example - confirm units against the loader.
  max_length: 12.5
  n_fft: 400
  # Same value as model.input_dim (80); keep the two in sync.
  n_mels: 80
  # NOTE(review): feature_type is mel, so this MFCC count is likely unused
  # here - confirm.
  n_mfcc: 13
  normalize_audio: false
  normalize_features: true
  # Padding and truncation to max_length are both enabled.
  pad_to_max_length: true
  sample_rate: 16000
  train_dataset_path: /mnt/disk4/datasets/librispeech/train.tsv
  truncate_to_max_length: true
  # Same value as n_fft (400). Presumably in samples, i.e. 25 ms at
  # 16000 Hz - confirm units.
  win_length: 400
# Evaluation schedule and model-selection metric.
evaluation:
  # Early stopping is off. NOTE(review): patience (below) presumably has no
  # effect while this is false - confirm in the training loop.
  early_stopping: false
  # Unit (epochs vs. steps) is not visible in this file - TODO confirm.
  eval_interval: 1
  metric: validation_accuracy
  # false: going by the key name, a higher metric value counts as better.
  metric_lower_is_better: false
  patience: 10
# Inference settings.
inference:
  batch_size: 1
  # Same directory as logging.save_dir; update both together when relocating.
  checkpoint_path: /mnt/disk3/bestrq_ckpts/bestrq-mel-pt-subsampled/12L-ls960-V8K-P16-M.05NM8-MEL/
# Logging and checkpoint output settings.
logging:
  # Unit (epochs vs. steps) is not visible in this file - TODO confirm.
  checkpoint_interval: 1
  # Ends in the same run tag as save_dir but sits under a separate
  # "logs-mel-pt-subsampled" parent directory.
  log_dir: /mnt/disk3/bestrq_ckpts/logs-mel-pt-subsampled/12L-ls960-V8K-P16-M.05NM8-MEL/
  log_interval: 4
  # Same directory as inference.checkpoint_path.
  save_dir: /mnt/disk3/bestrq_ckpts/bestrq-mel-pt-subsampled/12L-ls960-V8K-P16-M.05NM8-MEL/
# Input-masking settings.
masking:
  bert_style_masking: false
  # The "M.05NM8" part of training.comet_experiment_name appears to mirror
  # mask_prob=0.05 and mask_len=8 - rename the run if either value changes.
  mask_len: 8
  mask_prob: 0.05
  mask_type: random
# Model architecture hyperparameters.
model:
  conformer_depth: 12
  # heads (8) x dim_head (96) = 768, the same value as conformer_dim.
  # NOTE(review): whether the model requires this equality is not visible
  # here - confirm before changing any of the three.
  conformer_dim: 768
  dim_head: 96
  dropout: 0.1
  ff_mult: 4
  heads: 8
  # Same value as data.n_mels (80); keep the two in sync.
  input_dim: 80
  kernel_size: 31
  proj_dim: 16
  quantizer_simvq_mode: false
  # Same value as conformer_dim (768).
  subsampled_dim: 768
  use_subsampling: true
  vocab_size: 8192
# Optimisation, runtime, and experiment-tracking settings.
training:
  accelerate_config: configs/accelerate_2GPU_config.yaml
  # 128 x gradient_accumulation_steps (32) = 4096 examples per optimizer step
  # if this is a global batch size. NOTE(review): per-device vs. global is
  # not visible in this file - confirm.
  batch_size: 128
  # Run tag, also used as the last path component of logging.log_dir,
  # logging.save_dir and inference.checkpoint_path. It appears to encode
  # conformer_depth=12 (12L), vocab_size=8192 (V8K), proj_dim=16 (P16),
  # mask_prob=0.05 / mask_len=8 (M.05NM8) and feature_type=mel (MEL).
  comet_experiment_name: 12L-ls960-V8K-P16-M.05NM8-MEL
  comet_project_name: bestrq-pt-1124
  gradient_accumulation_steps: 32
  gradient_clipping: 5.0
  learning_rate: 0.001
  lr_scheduler: warmup_linear
  max_checkpoints: 5
  mixed_precision: false
  # multi_gpu is enabled and accelerate_config points at a "2GPU" file.
  multi_gpu: true
  num_epochs: 500
  num_workers: 16
  optimizer: adamw
  use_comet: true
  use_cuda: true
  # NOTE(review): presumably the fraction of total training steps spent in
  # warmup for the warmup_linear scheduler - confirm.
  warmup_ratio: 0.05
  weight_decay: 0.05