{
  "name": "default_config",
  "n_gpu": 1,
  "text_encoder": {
    "type": "CTCCharTextEncoder",
    "args": {
      "kenlm_model_path": "hw_asr/text_encoder/3-gram.arpa",
      "unigrams_path": "hw_asr/text_encoder/librispeech-vocab.txt"
    }
  },
  "preprocessing": {
    "sr": 16000,
    "spectrogram": {
      "type": "MelSpectrogram",
      "args": {
        "n_mels": 256
      }
    },
    "log_spec": true
  },
  "augmentations": {
    "random_apply_p": 0.6,
    "wave": [
      {"type": "AddColoredNoise", "args": {"p": 1, "sample_rate": 16000}},
      {"type": "Gain", "args": {"p": 0.8, "sample_rate": 16000}},
      {"type": "HighPassFilter", "args": {"p": 0, "sample_rate": 16000}},
      {"type": "LowPassFilter", "args": {"p": 0, "sample_rate": 16000}},
      {"type": "PitchShift", "args": {"p": 0.8, "min_transpose_semitones": -2, "max_transpose_semitones": 2, "sample_rate": 16000}},
      {"type": "PolarityInversion", "args": {"p": 0.8, "sample_rate": 16000}},
      {"type": "Shift", "args": {"p": 0.8, "sample_rate": 16000}}
    ],
    "spectrogram": []
  },
  "arch": {
    "type": "DeepSpeech2Model",
    "args": {
      "n_feats": 256,
      "n_rnn_layers": 5,
      "rnn_hidden_size": 512,
      "rnn_dropout": 0.2
    }
  },
  "data": {
    "train": {
      "batch_size": 128,
      "num_workers": 4,
      "datasets": [
        {
          "type": "LibrispeechDataset",
          "args": {
            "part": "train-clean-100",
            "max_audio_length": 40.0,
            "max_text_length": 400
          }
        },
        {
          "type": "LibrispeechDataset",
          "args": {
            "part": "train-clean-360",
            "max_audio_length": 40.0,
            "max_text_length": 400
          }
        },
        {
          "type": "LibrispeechDataset",
          "args": {
            "part": "train-other-500",
            "max_audio_length": 40.0,
            "max_text_length": 400
          }
        }
      ]
    },
    "val": {
      "batch_size": 64,
      "num_workers": 4,
      "datasets": [
        {
          "type": "LibrispeechDataset",
          "args": {
            "part": "dev-clean"
          }
        }
      ]
    },
    "test-other": {
      "batch_size": 64,
      "num_workers": 4,
      "datasets": [
        {
          "type": "LibrispeechDataset",
          "args": {
            "part": "test-other"
          }
        }
      ]
    },
    "test-clean": {
      "batch_size": 64,
      "num_workers": 4,
      "datasets": [
        {
          "type": "LibrispeechDataset",
          "args": {
            "part": "test-clean"
          }
        }
      ]
    }
  },
  "optimizer": {
    "type": "AdamW",
    "args": {
      "lr": 5e-4,
      "weight_decay": 1e-3
    }
  },
  "loss": {
    "type": "CTCLoss",
    "args": {}
  },
  "metrics": [
    {
      "type": "ArgmaxWERMetric",
      "args": {
        "name": "WER (argmax)"
      }
    },
    {
      "type": "ArgmaxCERMetric",
      "args": {
        "name": "CER (argmax)"
      }
    },
    {
      "type": "BeamSearchWERMetric",
      "args": {
        "beam_size": 4,
        "name": "WER (beam search)"
      }
    },
    {
      "type": "BeamSearchCERMetric",
      "args": {
        "beam_size": 4,
        "name": "CER (beam search)"
      }
    },
    {
      "type": "LanguageModelWERMetric",
      "args": {
        "name": "WER (LM)"
      }
    },
    {
      "type": "LanguageModelCERMetric",
      "args": {
        "name": "CER (LM)"
      }
    }
  ],
  "lr_scheduler": {
    "type": "OneCycleLR",
    "args": {
      "steps_per_epoch": 1000,
      "epochs": 50,
      "anneal_strategy": "cos",
      "max_lr": 5e-4,
      "pct_start": 0.1
    }
  },
  "trainer": {
    "epochs": 50,
    "save_dir": "saved/",
    "save_period": 5,
    "verbosity": 2,
    "monitor": "min val_loss",
    "early_stop": 100,
    "visualize": "wandb",
    "wandb_project": "asr_project",
    "len_epoch": 1000,
    "grad_norm_clip": 10
  }
}
|