# BEST-RQ-12L-LS960 / config.yaml
# Source: MorenoLaQuatra — "Added model, config and complete checkpoint" (commit f85a01d)
---
# Data-augmentation switches — all disabled for this pretraining run.
augmentation:
  augment_pitch_shift: false
  augment_time_stretch: false
  noise_injection: false
# Dataset locations and acoustic feature-extraction settings.
data:
  audio_normalization_type: standard
  eval_dataset_path: /mnt/disk4/datasets/librispeech/valid.tsv
  feature_type: mel                # mel filterbanks (n_mfcc below likely unused here — confirm in loader)
  hop_length: 160                  # samples between frames = 10 ms at sample_rate 16000
  max_length: 12.5                 # presumably seconds of audio — verify against the dataset code
  n_fft: 400
  n_mels: 80                       # must match model.input_dim
  n_mfcc: 13
  normalize_audio: false
  normalize_features: true
  pad_to_max_length: true
  sample_rate: 16000
  train_dataset_path: /mnt/disk4/datasets/librispeech/train.tsv
  truncate_to_max_length: true
  win_length: 400                  # samples per window = 25 ms at 16 kHz (same as n_fft)
# Validation schedule and (disabled) early-stopping criterion.
evaluation:
  early_stopping: false
  eval_interval: 1                 # presumably epochs between evaluations — confirm in trainer
  metric: validation_accuracy
  metric_lower_is_better: false    # higher accuracy is better
  patience: 10                     # only relevant when early_stopping is enabled
# Checkpoint and batching used when running the model for inference.
inference:
  batch_size: 1
  # Same directory as logging.save_dir — inference loads what training saves.
  checkpoint_path: /mnt/disk3/bestrq_ckpts/bestrq-mel-pt-subsampled/12L-ls960-V8K-P16-M.05NM8-MEL/
# Logging and checkpointing destinations / intervals.
logging:
  checkpoint_interval: 1           # presumably epochs between checkpoint saves — confirm in trainer
  log_dir: /mnt/disk3/bestrq_ckpts/logs-mel-pt-subsampled/12L-ls960-V8K-P16-M.05NM8-MEL/
  log_interval: 4
  save_dir: /mnt/disk3/bestrq_ckpts/bestrq-mel-pt-subsampled/12L-ls960-V8K-P16-M.05NM8-MEL/
# BEST-RQ input-masking configuration (the M.05NM8 part of the experiment name).
masking:
  bert_style_masking: false
  mask_len: 8                      # mask span length — units (frames) unverified from this file
  mask_prob: 0.05
  mask_type: random
# Conformer encoder and random-projection quantizer hyperparameters.
model:
  conformer_depth: 12              # the "12L" in the experiment name
  conformer_dim: 768
  dim_head: 96                     # heads (8) x dim_head (96) = conformer_dim (768)
  dropout: 0.1
  ff_mult: 4
  heads: 8
  input_dim: 80                    # matches data.n_mels
  kernel_size: 31
  proj_dim: 16                     # quantizer projection dim (P16 in the experiment name)
  quantizer_simvq_mode: false
  subsampled_dim: 768
  use_subsampling: true
  vocab_size: 8192                 # codebook size (V8K in the experiment name)
# Optimizer, LR schedule and distributed-training settings.
training:
  accelerate_config: configs/accelerate_2GPU_config.yaml
  batch_size: 128                  # presumably per forward pass, before gradient accumulation — confirm
  # Quoted: plain scalar starts with a digit and contains '.', so force string type.
  comet_experiment_name: "12L-ls960-V8K-P16-M.05NM8-MEL"
  comet_project_name: bestrq-pt-1124
  gradient_accumulation_steps: 32
  gradient_clipping: 5.0
  learning_rate: 0.001
  lr_scheduler: warmup_linear
  max_checkpoints: 5
  mixed_precision: false
  multi_gpu: true                  # paired with the 2-GPU accelerate config above
  num_epochs: 500
  num_workers: 16
  optimizer: adamw
  use_comet: true
  use_cuda: true
  warmup_ratio: 0.05               # fraction of training used for LR warmup
  weight_decay: 0.05