{
  "name": "default_config",
  "n_gpu": 1,
  "text_encoder": {
    "type": "CTCCharTextEncoder",
    "args": {
      "kenlm_model_path": "hw_asr/text_encoder/lower_3-gram.arpa",
      "unigrams_path": "hw_asr/text_encoder/librispeech-fixed-vocab.txt"
    }
  },
  "preprocessing": {
    "sr": 16000,
    "spectrogram": {
      "type": "MelSpectrogram",
      "args": {
        "n_mels": 256
      }
    },
    "log_spec": true
  },
  "augmentations": {
    "random_apply_p": 0.6,
    "wave": [
      {
        "type": "AddColoredNoise",
        "args": {
          "p": 1,
          "sample_rate": 16000
        }
      },
      {
        "type": "Gain",
        "args": {
          "p": 0.8,
          "sample_rate": 16000
        }
      },
      {
        "type": "HighPassFilter",
        "args": {
          "p": 0,
          "sample_rate": 16000
        }
      },
      {
        "type": "LowPassFilter",
        "args": {
          "p": 0,
          "sample_rate": 16000
        }
      },
      {
        "type": "PitchShift",
        "args": {
          "p": 0.8,
          "min_transpose_semitones": -2,
          "max_transpose_semitones": 2,
          "sample_rate": 16000
        }
      },
      {
        "type": "PolarityInversion",
        "args": {
          "p": 0.8,
          "sample_rate": 16000
        }
      },
      {
        "type": "Shift",
        "args": {
          "p": 0.8,
          "sample_rate": 16000
        }
      }
    ],
    "spectrogram": [
      {
        "type": "TimeMasking",
        "args": {
          "time_mask_param": 80,
          "p": 0.05
        }
      },
      {
        "type": "FrequencyMasking",
        "args": {
          "freq_mask_param": 80
        }
      }
    ]
  },
  "arch": {
    "type": "DeepSpeech2Model",
    "args": {
      "n_feats": 256,
      "n_rnn_layers": 6,
      "rnn_hidden_size": 512,
      "rnn_dropout": 0.2
    }
  },
  "data": {
    "train": {
      "batch_size": 128,
      "num_workers": 4,
      "datasets": [
        {
          "type": "LibrispeechDataset",
          "args": {
            "part": "train-clean-100",
            "max_audio_length": 40.0,
            "max_text_length": 400
          }
        },
        {
          "type": "LibrispeechDataset",
          "args": {
            "part": "train-clean-360",
            "max_audio_length": 40.0,
            "max_text_length": 400
          }
        },
        {
          "type": "LibrispeechDataset",
          "args": {
            "part": "train-other-500",
            "max_audio_length": 40.0,
            "max_text_length": 400
          }
        }
      ]
    },
    "val": {
      "batch_size": 64,
      "num_workers": 4,
      "datasets": [
        {
          "type": "LibrispeechDataset",
          "args": {
            "part": "dev-clean"
          }
        }
      ]
    },
    "test-other": {
      "batch_size": 64,
      "num_workers": 4,
      "datasets": [
        {
          "type": "LibrispeechDataset",
          "args": {
            "part": "test-other"
          }
        }
      ]
    },
    "test-clean": {
      "batch_size": 64,
      "num_workers": 4,
      "datasets": [
        {
          "type": "LibrispeechDataset",
          "args": {
            "part": "test-clean"
          }
        }
      ]
    }
  },
  "optimizer": {
    "type": "AdamW",
    "args": {
      "lr": 0.0003,
      "weight_decay": 1e-05
    }
  },
  "loss": {
    "type": "CTCLoss",
    "args": {}
  },
  "metrics": [
    {
      "type": "ArgmaxWERMetric",
      "args": {
        "name": "WER (argmax)"
      }
    },
    {
      "type": "ArgmaxCERMetric",
      "args": {
        "name": "CER (argmax)"
      }
    },
    {
      "type": "BeamSearchWERMetric",
      "args": {
        "beam_size": 4,
        "name": "WER (beam search)"
      }
    },
    {
      "type": "BeamSearchCERMetric",
      "args": {
        "beam_size": 4,
        "name": "CER (beam search)"
      }
    },
    {
      "type": "LanguageModelWERMetric",
      "args": {
        "name": "WER (LM)"
      }
    },
    {
      "type": "LanguageModelCERMetric",
      "args": {
        "name": "CER (LM)"
      }
    }
  ],
  "lr_scheduler": {
    "type": "OneCycleLR",
    "args": {
      "steps_per_epoch": 1000,
      "epochs": 50,
      "anneal_strategy": "cos",
      "max_lr": 0.0003,
      "pct_start": 0.1
    }
  },
  "trainer": {
    "epochs": 50,
    "save_dir": "saved/",
    "save_period": 5,
    "verbosity": 2,
    "monitor": "min val_loss",
    "early_stop": 100,
    "visualize": "wandb",
    "wandb_project": "asr_project",
    "len_epoch": 1000,
    "grad_norm_clip": 10
  }
}