---
# Configuration for run 12L-ls960-V8K-P16-M.05NM8-MEL.
# NOTE(review): the run name appears to encode several values set below
# (12L -> model.conformer_depth 12, V8K -> model.vocab_size 8192,
# P16 -> model.proj_dim 16, M.05 -> masking.mask_prob 0.05,
# NM8 -> masking.mask_len 8, MEL -> data.feature_type mel). If any of those
# values change, the name and the paths that embed it probably need to change
# too — confirm the naming scheme with the run owner.

# Audio augmentation switches; every augmentation is disabled in this run.
augmentation:
  augment_pitch_shift: false
  augment_time_stretch: false
  noise_injection: false

# Dataset locations and feature-extraction settings.
data:
  audio_normalization_type: standard
  eval_dataset_path: /mnt/disk4/datasets/librispeech/valid.tsv
  feature_type: mel
  # Presumably in samples: 160 / 16000 Hz = 10 ms hop — TODO confirm units.
  hop_length: 160
  # NOTE(review): unit not visible here (presumably seconds) — confirm.
  max_length: 12.5
  n_fft: 400
  # Equals model.input_dim (80).
  n_mels: 80
  # NOTE(review): presumably unused while feature_type is mel — verify.
  n_mfcc: 13
  normalize_audio: false
  normalize_features: true
  pad_to_max_length: true
  sample_rate: 16000
  train_dataset_path: /mnt/disk4/datasets/librispeech/train.tsv
  truncate_to_max_length: true
  # Same value as n_fft; presumably samples (400 / 16000 Hz = 25 ms).
  win_length: 400

# Validation schedule and model-selection metric.
evaluation:
  early_stopping: false
  eval_interval: 1
  metric: validation_accuracy
  # false: higher validation_accuracy is treated as better.
  metric_lower_is_better: false
  # NOTE(review): presumably ignored while early_stopping is false — verify.
  patience: 10

# Inference settings; checkpoint_path is the same directory as
# logging.save_dir.
inference:
  batch_size: 1
  checkpoint_path: /mnt/disk3/bestrq_ckpts/bestrq-mel-pt-subsampled/12L-ls960-V8K-P16-M.05NM8-MEL/

# Log and checkpoint output locations and intervals.
logging:
  checkpoint_interval: 1
  log_dir: /mnt/disk3/bestrq_ckpts/logs-mel-pt-subsampled/12L-ls960-V8K-P16-M.05NM8-MEL/
  log_interval: 4
  save_dir: /mnt/disk3/bestrq_ckpts/bestrq-mel-pt-subsampled/12L-ls960-V8K-P16-M.05NM8-MEL/

# Input masking settings.
masking:
  bert_style_masking: false
  mask_len: 8
  mask_prob: 0.05
  mask_type: random

# Model architecture hyperparameters.
model:
  conformer_depth: 12
  # Equals heads * dim_head (8 * 96 = 768) and subsampled_dim (768).
  conformer_dim: 768
  dim_head: 96
  dropout: 0.1
  ff_mult: 4
  heads: 8
  # Equals data.n_mels (80).
  input_dim: 80
  kernel_size: 31
  proj_dim: 16
  quantizer_simvq_mode: false
  subsampled_dim: 768
  use_subsampling: true
  vocab_size: 8192

# Optimisation, scheduling, hardware and experiment-tracking settings.
training:
  accelerate_config: configs/accelerate_2GPU_config.yaml
  batch_size: 128
  comet_experiment_name: 12L-ls960-V8K-P16-M.05NM8-MEL
  comet_project_name: bestrq-pt-1124
  gradient_accumulation_steps: 32
  gradient_clipping: 5.0
  learning_rate: 0.001
  lr_scheduler: warmup_linear
  max_checkpoints: 5
  mixed_precision: false
  multi_gpu: true
  num_epochs: 500
  num_workers: 16
  optimizer: adamw
  use_comet: true
  use_cuda: true
  # NOTE(review): presumably a fraction of total training steps — confirm.
  warmup_ratio: 0.05
  weight_decay: 0.05