---
# Training / evaluation configuration.
# Top-level sections: runner, optimizer, (optional) scheduler, specaug,
# downstream_expert.

runner:
  total_steps: 200000
  gradient_clipping: 1
  gradient_accumulate_steps: 1

  # Intervals below are expressed in steps (per the `_step` key suffix).
  log_step: 100
  eval_step: 2000
  save_step: 500
  max_keep: 1
  # `dev-clean` is also the name of a split key under
  # downstream_expert.datarc; presumably entries here must match one of
  # those keys -- confirm against the consumer.
  eval_dataloaders:
    - dev-clean

optimizer:
  name: TorchOptim
  torch_optim_name: Adam
  lr: 1.0e-4

# Learning-rate scheduling is currently disabled: the whole scheduler block
# is commented out. Uncomment it to enable scheduling; comment the whole
# block again to disable it.
# scheduler:
#   name: linear_schedule_with_warmup
#   num_warmup_steps: 1400

# Comment out the whole specaug block to disable specaug on the
# representation.
specaug:
  adaptive: false
  adaptive_number_ratio: 0.04
  adaptive_size_ratio: 0.04
  max_n_time_masks: 20
  apply_time_warp: true
  apply_time_mask: true
  apply_freq_mask: true
  time_warp_window: 5
  time_mask_width_range: [0, 40]
  freq_mask_width_range: [0, 50]
  num_freq_mask: 4
  num_time_mask: 2

downstream_expert:
  datarc:
    # Split name -> list of LibriSpeech subset names.
    train: ['train-clean-100']
    dev-clean: ['dev-clean']
    dev-other: ['dev-other']
    test-clean: ['test-clean']
    test-other: ['test-other']

    num_workers: 12
    train_batch_size: 32
    batch_size: 32
    eval_batch_size: 1

    # NOTE: `libri_root` is a machine-specific absolute path; point it at
    # your local LibriSpeech copy before running.
    libri_root: '/home/leo/d/datasets/LibriSpeech'
    bucket_file: './data/librispeech/len_for_bucket'
    dict_path: './downstream/asr/char.dict'

    # NOTE(review): `zero_infinity` and `decoder_args` are placed under
    # `datarc` here -- confirm this nesting against the code that reads
    # this config.
    zero_infinity: true

    decoder_args:
      # See https://github.com/flashlight/text/blob/main/flashlight/lib/text/decoder/LexiconDecoder.h#L20-L30
      # for what the options mean. The Python binding exposes the same
      # options as the C++ API.
      # KenLM is a fast LM query implementation, and it can be powered by:
      # 1. official LibriSpeech 4-gram LM: the 4-gram.arpa file on http://www.openslr.org/11
      # 2. fairseq style, letter-based lexicon: https://dl.fbaipublicfiles.com/fairseq/wav2vec/librispeech_lexicon.lst

      # This is the quoted string 'None', not a YAML null.
      decoder_type: 'None'
      nbest: 1
      criterion: 'ctc'
      beam: 5
      beam_threshold: 25
      # NOTE: machine-specific absolute paths; adjust for your environment.
      kenlm_model: '/home/leo/d/datasets/4-gram.arpa'
      lexicon: '/home/leo/d/datasets/librispeech_lexicon.lst'
      lm_weight: 2
      word_score: -1
      # NOTE(review): YAML loads this as the *string* "-math.inf", not a
      # float; presumably the consumer evaluates it as a Python expression
      # -- confirm. The YAML-native spelling of negative infinity is `-.inf`.
      unk_weight: -math.inf
      sil_weight: 0

  modelrc:
    project_dim: 1024
    # `RNNs` is the name of one of the sibling sections below; presumably
    # `select` picks which of them is used -- confirm against the consumer.
    select: RNNs

    Wav2Letter:
      total_rate: 320

    RNNs:
      total_rate: -1
      module: 'LSTM'  # 'LSTM'/'GRU'
      bidirection: true
      # The list-valued options below all carry two entries.
      dim: [1024, 1024]
      dropout: [0.2, 0.2]
      layer_norm: [false, false]
      proj: [false, false]  # Linear projection + Tanh after each rnn layer
      sample_rate: [1, 1]
      sample_style: 'concat'  # 'drop'/'concat'