{
  "model_type": "indextts",
  "bigvgan": {
    "adam_b1": 0.8,
    "adam_b2": 0.99,
    "lr_decay": 0.999998,
    "seed": 1234,
    "resblock": "1",
    "upsample_rates": [
      4,
      4,
      4,
      4,
      2,
      2
    ],
    "upsample_kernel_sizes": [
      8,
      8,
      4,
      4,
      4,
      4
    ],
    "upsample_initial_channel": 1536,
    "resblock_kernel_sizes": [
      3,
      7,
      11
    ],
    "resblock_dilation_sizes": [
      [
        1,
        3,
        5
      ],
      [
        1,
        3,
        5
      ],
      [
        1,
        3,
        5
      ]
    ],
    "feat_upsample": false,
    "speaker_embedding_dim": 512,
    "cond_d_vector_in_each_upsampling_layer": true,
    "gpt_dim": 1024,
    "activation": "snakebeta",
    "snake_logscale": true,
    "use_cqtd_instead_of_mrd": true,
    "cqtd_filters": 128,
    "cqtd_max_filters": 1024,
    "cqtd_filters_scale": 1,
    "cqtd_dilations": [
      1,
      2,
      4
    ],
    "cqtd_hop_lengths": [
      512,
      256,
      256
    ],
    "cqtd_n_octaves": [
      9,
      9,
      9
    ],
    "cqtd_bins_per_octaves": [
      24,
      36,
      48
    ],
    "resolutions": [
      [
        1024,
        120,
        600
      ],
      [
        2048,
        240,
        1200
      ],
      [
        512,
        50,
        240
      ]
    ],
    "mpd_reshapes": [
      2,
      3,
      5,
      7,
      11
    ],
    "use_spectral_norm": false,
    "discriminator_channel_mult": 1,
    "use_multiscale_melloss": true,
    "lambda_melloss": 15,
    "clip_grad_norm": 1000,
    "segment_size": 16384,
    "num_mels": 100,
    "num_freq": 1025,
    "n_fft": 1024,
    "hop_size": 256,
    "win_size": 1024,
    "sampling_rate": 24000,
    "fmin": 0,
    "fmax": null,
    "fmax_for_loss": null,
    "mel_type": "pytorch",
    "num_workers": 2,
    "dist_config": {
      "dist_backend": "nccl",
      "dist_url": "tcp://localhost:54321",
      "world_size": 1
    }
  },
  "bigvgan_checkpoint": "bigvgan_generator.pth",
  "dataset": {
    "bpe_model": "checkpoints/bpe.model",
    "sample_rate": 24000,
    "squeeze": false,
    "mel": {
      "sample_rate": 24000,
      "n_fft": 1024,
      "hop_length": 256,
      "win_length": 1024,
      "n_mels": 100,
      "mel_fmin": 0,
      "normalize": false
    }
  },
  "dvae_checkpoint": "dvae.pth",
  "gpt": {
    "model_dim": 1024,
    "max_mel_tokens": 605,
    "max_text_tokens": 402,
    "heads": 16,
    "use_mel_codes_as_input": true,
    "mel_length_compression": 1024,
    "layers": 20,
    "number_text_tokens": 12000,
    "number_mel_codes": 8194,
    "start_mel_token": 8192,
    "stop_mel_token": 8193,
    "start_text_token": 0,
    "stop_text_token": 1,
    "train_solo_embeddings": false,
    "condition_type": "conformer_perceiver",
    "condition_module": {
      "output_size": 512,
      "linear_units": 2048,
      "attention_heads": 8,
      "num_blocks": 6,
      "input_layer": "conv2d2",
      "perceiver_mult": 2
    }
  },
  "gpt_checkpoint": "gpt.pth",
  "vqvae": {
    "channels": 100,
    "num_tokens": 8192,
    "hidden_dim": 512,
    "num_resnet_blocks": 3,
    "codebook_dim": 512,
    "num_layers": 2,
    "positional_dims": 1,
    "kernel_size": 3,
    "smooth_l1_loss": true,
    "use_transposed_convs": false
  }
}