# Feature extractor component: the class to use and its constructor arguments.
# NOTE(review): nesting was reconstructed from the class_path/init_args pattern
# after table residue ("| |") and indentation loss — confirm against the loader.
feature_extractor:
  class_path: vocos.feature_extractors.EncodecFeatures
  init_args:
    encodec_model: encodec_24khz
    # Four bandwidth entries; backbone.init_args.adanorm_num_embeddings is
    # annotated as len(bandwidths), so keep the two in sync.
    bandwidths: [1.5, 3.0, 6.0, 12.0]
    train_codebooks: false
# Backbone component: the class to use and its constructor arguments.
# NOTE(review): nesting was reconstructed from the class_path/init_args pattern
# after table residue ("| |") and indentation loss — confirm against the loader.
backbone:
  class_path: vocos.models.VocosBackbone
  init_args:
    input_channels: 128
    dim: 384
    intermediate_dim: 1152
    num_layers: 8
    adanorm_num_embeddings: 4  # len(bandwidths)
# Head component: the class to use and its constructor arguments.
# NOTE(review): nesting was reconstructed from the class_path/init_args pattern
# after table residue ("|") and indentation loss — confirm against the loader.
head:
  class_path: vocos.heads.ISTFTHead
  init_args:
    # Same value as backbone.init_args.dim (384); presumably the two must
    # agree — TODO confirm.
    dim: 384
    n_fft: 1280
    hop_length: 320
    padding: same