preprocessor: s2t
preprocessor_conf:
    text_prev_name: text_prev
    text_ctc_name: text_ctc
    fs: 16000
    na_symbol: "<na>"
    speech_length: 30
    speech_resolution: 0.02
    speech_init_silence: 30
    text_prev_apply_prob: 0.0
    time_apply_prob: 0.0
    notime_symbol: "<notimestamps>"
    first_time_symbol: "<0.00>"
    last_time_symbol: "<30.00>"

frontend_conf:
    n_fft: 512
    win_length: 400
    hop_length: 160

specaug: specaug
specaug_conf:
    apply_time_warp: false
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 27
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_ratio_range:
    - 0.
    - 0.05
    num_time_mask: 5

normalize: global_mvn
normalize_conf:
    stats_file: /espnet/egs2/owsm_v1/s2t1/exp/s2t_stats_raw_bpe20000/train/feats_stats.npz

encoder: transformer
encoder_conf:
    output_size: 768        # dimension of attention
    attention_heads: 12
    linear_units: 3072      # the number of units of position-wise feed forward
    num_blocks: 12          # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d2    # encoder architecture type
    normalize_before: true

decoder: adptransformer
decoder_conf:
    attention_heads: 12
    linear_units: 3072
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1

model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false
    sym_na: "<na>"

optim: adamw
optim_conf:
    lr: 0.00055
    betas:
    - 0.9
    - 0.98
    eps: 1.0e-06
    weight_decay: 0.0
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 10000

# 4 GPUs/node x 8 nodes = 32 A100s
batch_type: unsorted
batch_size: 5
accum_grad: 4
num_iters_per_epoch: 40000
max_epoch: 10
patience: none
init: none
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 1
use_amp: true
num_workers: 4

init_param:
- /espnet/egs2/owsm_v1/s2t1/exp/s2t_train_raw_bpe20000/valid.acc.ave.pth
ignore_init_mismatch: false
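
# ---- Notes ----
# Frame-rate sanity check, derived from the values above and assuming
# conv2d2 is ESPnet's Conv2dSubsampling2 (subsampling factor 2):
# hop_length 160 at fs 16000 is a 10 ms STFT hop, so one encoder frame
# spans 2 x 10 ms = 20 ms, matching speech_resolution: 0.02. A 30 s
# segment (speech_length) thus corresponds to 30 / 0.02 = 1500 frames.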
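#
# Effective batch size, assuming the 32-GPU setup from the comment above
# and that batch_size counts utterances per GPU worker:
# batch_size 5 x accum_grad 4 x 32 GPUs = 640 utterances per optimizer step.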
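#
# warmuplr is ESPnet's Noam-style warmup schedule; with these settings
#   lr(step) = 0.00055 * 10000^0.5 * min(step^-0.5, step * 10000^-1.5),
# which rises linearly to the peak 0.00055 at step 10000 and then decays
# proportionally to step^-0.5.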
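#
# With ctc_weight: 0.3 the training objective is the standard ESPnet hybrid
#   loss = 0.3 * loss_ctc + 0.7 * loss_att,
# i.e. attention cross-entropy (with label smoothing lsm_weight: 0.1)
# dominates, and CTC acts as an auxiliary alignment loss.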