IndexTTS / config.json
{
  "model_type": "indextts",
  "bigvgan": {
    "adam_b1": 0.8,
    "adam_b2": 0.99,
    "lr_decay": 0.999998,
    "seed": 1234,
    "resblock": "1",
    "upsample_rates": [4, 4, 4, 4, 2, 2],
    "upsample_kernel_sizes": [8, 8, 4, 4, 4, 4],
    "upsample_initial_channel": 1536,
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    "feat_upsample": false,
    "speaker_embedding_dim": 512,
    "cond_d_vector_in_each_upsampling_layer": true,
    "gpt_dim": 1024,
    "activation": "snakebeta",
    "snake_logscale": true,
    "use_cqtd_instead_of_mrd": true,
    "cqtd_filters": 128,
    "cqtd_max_filters": 1024,
    "cqtd_filters_scale": 1,
    "cqtd_dilations": [1, 2, 4],
    "cqtd_hop_lengths": [512, 256, 256],
    "cqtd_n_octaves": [9, 9, 9],
    "cqtd_bins_per_octaves": [24, 36, 48],
    "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
    "mpd_reshapes": [2, 3, 5, 7, 11],
    "use_spectral_norm": false,
    "discriminator_channel_mult": 1,
    "use_multiscale_melloss": true,
    "lambda_melloss": 15,
    "clip_grad_norm": 1000,
    "segment_size": 16384,
    "num_mels": 100,
    "num_freq": 1025,
    "n_fft": 1024,
    "hop_size": 256,
    "win_size": 1024,
    "sampling_rate": 24000,
    "fmin": 0,
    "fmax": null,
    "fmax_for_loss": null,
    "mel_type": "pytorch",
    "num_workers": 2,
    "dist_config": {
      "dist_backend": "nccl",
      "dist_url": "tcp://localhost:54321",
      "world_size": 1
    }
  },
  "bigvgan_checkpoint": "bigvgan_generator.pth",
  "dataset": {
    "bpe_model": "checkpoints/bpe.model",
    "sample_rate": 24000,
    "squeeze": false,
    "mel": {
      "sample_rate": 24000,
      "n_fft": 1024,
      "hop_length": 256,
      "win_length": 1024,
      "n_mels": 100,
      "mel_fmin": 0,
      "normalize": false
    }
  },
  "dvae_checkpoint": "dvae.pth",
  "gpt": {
    "model_dim": 1024,
    "max_mel_tokens": 605,
    "max_text_tokens": 402,
    "heads": 16,
    "use_mel_codes_as_input": true,
    "mel_length_compression": 1024,
    "layers": 20,
    "number_text_tokens": 12000,
    "number_mel_codes": 8194,
    "start_mel_token": 8192,
    "stop_mel_token": 8193,
    "start_text_token": 0,
    "stop_text_token": 1,
    "train_solo_embeddings": false,
    "condition_type": "conformer_perceiver",
    "condition_module": {
      "output_size": 512,
      "linear_units": 2048,
      "attention_heads": 8,
      "num_blocks": 6,
      "input_layer": "conv2d2",
      "perceiver_mult": 2
    }
  },
  "gpt_checkpoint": "gpt.pth",
  "vqvae": {
    "channels": 100,
    "num_tokens": 8192,
    "hidden_dim": 512,
    "num_resnet_blocks": 3,
    "codebook_dim": 512,
    "num_layers": 2,
    "positional_dims": 1,
    "kernel_size": 3,
    "smooth_l1_loss": true,
    "use_transposed_convs": false
  }
}
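
As a quick orientation to how the sub-configs relate, here is a small, hypothetical sanity-check script (not part of the repository) that loads the file and verifies a few relations implied by the values above: the BigVGAN generator's total upsampling factor (4 * 4 * 4 * 4 * 2 * 2 = 1024) matches the GPT's mel_length_compression, and the GPT's 8194-entry mel vocabulary appears to be the 8192-code VQ codebook plus the start/stop tokens.

import json
from math import prod

# Hypothetical check script; assumes config.json is in the working directory.
with open("config.json") as f:
    cfg = json.load(f)

bigvgan, gpt, vqvae = cfg["bigvgan"], cfg["gpt"], cfg["vqvae"]

# Total vocoder upsampling: 4 * 4 * 4 * 4 * 2 * 2 = 1024 samples per input
# frame, equal to the GPT's mel_length_compression.
assert prod(bigvgan["upsample_rates"]) == gpt["mel_length_compression"]

# The GPT mel vocabulary looks like the VQ codebook plus two special tokens:
# 8192 codes, start token 8192, stop token 8193 -> 8194 entries.
assert gpt["number_mel_codes"] == vqvae["num_tokens"] + 2
assert gpt["start_mel_token"] == vqvae["num_tokens"]
assert gpt["stop_mel_token"] == vqvae["num_tokens"] + 1

# Mel frame rate implied by the STFT settings: 24000 / 256 = 93.75 frames/s.
print(bigvgan["sampling_rate"] / bigvgan["hop_size"], "mel frames per second")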
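
The dataset.mel block, together with "mel_type": "pytorch", suggests mel features computed with a PyTorch frontend. Below is a minimal sketch using torchaudio under that assumption; the config only fixes the STFT and mel-filter geometry, so the power, normalization, and mel-scale conventions chosen here are assumptions, and the input file name is a placeholder.

import torchaudio

# Sketch of a mel frontend matching dataset.mel; power/norm defaults are
# assumptions, since the config specifies only the STFT/mel geometry.
mel_fn = torchaudio.transforms.MelSpectrogram(
    sample_rate=24000,  # dataset.mel.sample_rate
    n_fft=1024,         # dataset.mel.n_fft
    win_length=1024,    # dataset.mel.win_length
    hop_length=256,     # dataset.mel.hop_length
    n_mels=100,         # dataset.mel.n_mels
    f_min=0.0,          # dataset.mel.mel_fmin; fmax is null -> Nyquist
)

wav, sr = torchaudio.load("sample.wav")  # hypothetical input file
wav = torchaudio.functional.resample(wav, sr, 24000)
mels = mel_fn(wav)  # shape: (channels, 100, frames)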