# Generated 2022-05-27 from:
# /data/n.abdoumohamed/dvoice-africa/speechbrain/recipes/DVoice/ASR/CTC/hparams/train_amharic.yaml
# yamllint disable
# ################################
# Model: wav2vec2 + DNN + CTC
# Augmentation: SpecAugment
# Authors: Titouan Parcollet 2021
# ################################

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1249
# Calls torch.manual_seed with the seed above when the file is loaded.
__set_seed: !!python/object/apply:torch.manual_seed [1249]
output_folder: results/wav2vec2_ctc_AMHARIC/1249
wer_file: results/wav2vec2_ctc_AMHARIC/1249/wer.txt
save_folder: results/wav2vec2_ctc_AMHARIC/1249/save
train_log: results/wav2vec2_ctc_AMHARIC/1249/train_log.txt

# HuggingFace hub identifier of the pretrained multilingual XLSR-53 wav2vec2.
wav2vec2_hub: facebook/wav2vec2-large-xlsr-53

# Data files
data_folder: ASR/AMHARIC/data  # e.g, /localscratch/cv-corpus-5.1-2020-06-22/fr
train_csv_file: ASR/AMHARIC/data/train.csv  # Standard CommonVoice .tsv files
dev_csv_file: ASR/AMHARIC/data/dev.csv  # Standard CommonVoice .tsv files
test_csv_file: ASR/AMHARIC/data/test.csv  # Standard CommonVoice .tsv files
accented_letters: true
language: amharic
# NOTE(review): unlike save_folder, the three paths below do not contain the
# seed directory (1249) -- confirm this is intended.
train_csv: results/wav2vec2_ctc_AMHARIC/save/train.csv
valid_csv: results/wav2vec2_ctc_AMHARIC/save/dev.csv
test_csv: results/wav2vec2_ctc_AMHARIC/save/test.csv
skip_prep: false  # Skip data preparation
data_augmentation: false  # Skip data augmentation

# We remove utterances longer than `avoid_if_longer_than` seconds (set to 15.0
# just below) from the train/dev/test sets, as longer sentences certainly
# correspond to "open microphones".
avoid_if_longer_than: 15.0

# Training parameters
number_of_epochs: 30
number_of_ctc_epochs: 15
lr: 1.0
lr_wav2vec: 0.0001
ctc_weight: 0.3
sorting: ascending
auto_mix_prec: false
sample_rate: 16000
ckpt_interval_minutes: 30  # save checkpoint every N min

# With data_parallel batch_size is split into N jobs
# With DDP batch_size is multiplied by N jobs
# Must be 6 per GPU to fit 16GB of VRAM
batch_size: 4
test_batch_size: 4

dataloader_options:
  batch_size: 4
  num_workers: 2
test_dataloader_options:
  batch_size: 4
  num_workers: 2

# BPE parameters
token_type: char  # ["unigram", "bpe", "char"]
character_coverage: 1.0

# Model parameters
activation: !name:torch.nn.LeakyReLU
wav2vec_output_dim: 1024
dnn_neurons: 1024
freeze_wav2vec: false

# Outputs
output_neurons: 224  # BPE size, index(blank/eos/bos) = 0

# Decoding parameters
# Be sure that the bos and eos index match with the BPEs ones
blank_index: 0
bos_index: 1
eos_index: 2
min_decode_ratio: 0.0
max_decode_ratio: 1.0
beam_size: 80
eos_threshold: 1.5
using_max_attn_shift: true
max_attn_shift: 140
ctc_weight_decode: 0.0
temperature: 1.50

#
# Functions and classes
#
# The &idNNN anchors let the same object be referenced again further down
# (in `modules`, `model` and `checkpointer`) through *idNNN aliases.
epoch_counter: &id007 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: 30

augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  sample_rate: 16000
  speeds: [95, 100, 105]

# Three Linear + BatchNorm1d + LeakyReLU stages, with dropout after the first
# two; the input feature size (1024) equals wav2vec_output_dim.
enc: &id002 !new:speechbrain.nnet.containers.Sequential
  input_shape: [null, null, 1024]
  linear1: !name:speechbrain.nnet.linear.Linear
    n_neurons: 1024
    bias: true
  bn1: !name:speechbrain.nnet.normalization.BatchNorm1d
  activation: !new:torch.nn.LeakyReLU
  drop: !new:torch.nn.Dropout
    p: 0.15
  linear2: !name:speechbrain.nnet.linear.Linear
    n_neurons: 1024
    bias: true
  bn2: !name:speechbrain.nnet.normalization.BatchNorm1d
  activation2: !new:torch.nn.LeakyReLU
  drop2: !new:torch.nn.Dropout
    p: 0.15
  linear3: !name:speechbrain.nnet.linear.Linear
    n_neurons: 1024
    bias: true
  bn3: !name:speechbrain.nnet.normalization.BatchNorm1d
  activation3: !new:torch.nn.LeakyReLU

wav2vec2: &id001 !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
  source: facebook/wav2vec2-large-xlsr-53
  output_norm: true
  freeze: false
  save_path: results/wav2vec2_ctc_AMHARIC/1249/save/wav2vec2_checkpoint

#####
# Uncomment this block if you prefer to use a Fairseq pretrained model instead
# of a HuggingFace one. Here, we provide an URL that is obtained from the
# Fairseq github for the multilingual XLSR.
#
#wav2vec2_url: https://dl.fbaipublicfiles.com/fairseq/wav2vec/xlsr_53_56k.pt
#wav2vec2: !new:speechbrain.lobes.models.fairseq_wav2vec.FairseqWav2Vec2
#    pretrained_path: !ref <wav2vec2_url>
#    output_norm: True
#    freeze: False
#    save_path: !ref <save_folder>/wav2vec2_checkpoint/model.pt
#####

# Output projection from the 1024-dim encoder features to the 224 output
# units (same value as output_neurons).
ctc_lin: &id003 !new:speechbrain.nnet.linear.Linear
  input_size: 1024
  n_neurons: 224

log_softmax: !new:speechbrain.nnet.activations.Softmax
  apply_log: true

ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
  blank_index: 0

modules:
  wav2vec2: *id001
  enc: *id002
  ctc_lin: *id003

# `model` groups enc and ctc_lin only; wav2vec2 is listed separately and has
# its own optimizer class and scheduler below.
model: &id004 !new:torch.nn.ModuleList
  - [*id002, *id003]

model_opt_class: !name:torch.optim.Adadelta
  lr: 1.0
  rho: 0.95
  eps: 1.e-8

wav2vec_opt_class: !name:torch.optim.Adam
  lr: 0.0001

lr_annealing_model: &id005 !new:speechbrain.nnet.schedulers.NewBobScheduler
  initial_value: 1.0
  improvement_threshold: 0.0025
  annealing_factor: 0.8
  patient: 0

lr_annealing_wav2vec: &id006 !new:speechbrain.nnet.schedulers.NewBobScheduler
  initial_value: 0.0001
  improvement_threshold: 0.0025
  annealing_factor: 0.9
  patient: 0

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: results/wav2vec2_ctc_AMHARIC/1249/save
  recoverables:
    wav2vec2: *id001
    model: *id004
    scheduler_model: *id005
    scheduler_wav2vec: *id006
    counter: *id007

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: results/wav2vec2_ctc_AMHARIC/1249/train_log.txt

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
  split_tokens: true