---
# Experiment configuration for run "12L-ls960-V8K-P16-M.05NM8-MEL"
# (name taken from training.comet_experiment_name and the checkpoint/log paths).
# Keys are grouped into sections; within each section they are kept in
# alphabetical order.

# Data augmentation switches; all are disabled for this run.
augmentation:
  augment_pitch_shift: false
  augment_time_stretch: false
  noise_injection: false

# Dataset locations and feature-extraction parameters.
data:
  audio_normalization_type: standard
  eval_dataset_path: /mnt/disk4/datasets/librispeech/valid.tsv
  feature_type: mel
  # NOTE(review): if hop_length/n_fft/win_length are in samples, then at
  # sample_rate 16000 they are a 10 ms hop and a 25 ms window — confirm units.
  hop_length: 160
  # NOTE(review): presumably seconds; verify against the data loader.
  max_length: 12.5
  n_fft: 400
  # Same value as model.input_dim.
  n_mels: 80
  # NOTE(review): presumably unused while feature_type is "mel" — confirm.
  n_mfcc: 13
  normalize_audio: false
  normalize_features: true
  pad_to_max_length: true
  sample_rate: 16000
  train_dataset_path: /mnt/disk4/datasets/librispeech/train.tsv
  truncate_to_max_length: true
  win_length: 400

# Validation schedule and model-selection metric.
evaluation:
  # Early stopping is off, so `patience` below presumably has no effect in
  # this run — TODO confirm.
  early_stopping: false
  eval_interval: 1
  metric: validation_accuracy
  metric_lower_is_better: false
  patience: 10

# Inference-time settings.
inference:
  batch_size: 1
  # Same directory as logging.save_dir.
  checkpoint_path: /mnt/disk3/bestrq_ckpts/bestrq-mel-pt-subsampled/12L-ls960-V8K-P16-M.05NM8-MEL/

# Logging and checkpoint output.
logging:
  checkpoint_interval: 1
  log_dir: /mnt/disk3/bestrq_ckpts/logs-mel-pt-subsampled/12L-ls960-V8K-P16-M.05NM8-MEL/
  log_interval: 4
  save_dir: /mnt/disk3/bestrq_ckpts/bestrq-mel-pt-subsampled/12L-ls960-V8K-P16-M.05NM8-MEL/

# Input masking parameters.
masking:
  bert_style_masking: false
  mask_len: 8
  mask_prob: 0.05
  mask_type: random

# Model architecture.
model:
  conformer_depth: 12
  conformer_dim: 768
  # heads * dim_head = 8 * 96 = 768, which equals conformer_dim.
  dim_head: 96
  dropout: 0.1
  ff_mult: 4
  heads: 8
  # Same value as data.n_mels.
  input_dim: 80
  kernel_size: 31
  proj_dim: 16
  quantizer_simvq_mode: false
  subsampled_dim: 768
  use_subsampling: true
  vocab_size: 8192

# Optimisation, scheduling, and experiment tracking.
training:
  accelerate_config: configs/accelerate_2GPU_config.yaml
  # NOTE(review): batch_size * gradient_accumulation_steps = 128 * 32 = 4096;
  # whether batch_size is per-process or global is not visible here — confirm.
  batch_size: 128
  comet_experiment_name: 12L-ls960-V8K-P16-M.05NM8-MEL
  comet_project_name: bestrq-pt-1124
  gradient_accumulation_steps: 32
  gradient_clipping: 5.0
  learning_rate: 0.001
  lr_scheduler: warmup_linear
  max_checkpoints: 5
  mixed_precision: false
  multi_gpu: true
  num_epochs: 500
  num_workers: 16
  optimizer: adamw
  use_comet: true
  use_cuda: true
  warmup_ratio: 0.05
  weight_decay: 0.05