Spaces:
Running
Running
{ | |
"base_config": "config/comosvc.json", | |
"model_type": "DiffComoSVC", | |
"dataset": [ | |
"m4singer", | |
"opencpop", | |
"opensinger", | |
"svcc", | |
"vctk" | |
], | |
"dataset_path": { | |
// TODO: Fill in the path of each dataset listed below | |
"m4singer": "[M4Singer dataset path]", | |
"opencpop": "[Opencpop dataset path]", | |
"opensinger": "[OpenSinger dataset path]", | |
"svcc": "[SVCC dataset path]", | |
"vctk": "[VCTK dataset path]" | |
}, | |
// TODO: Fill in the output log path | |
"log_dir": "[Your path to save logs and checkpoints]", | |
"preprocess": { | |
// TODO: Fill in the output data path | |
"processed_dir": "[Your path to save processed data]", | |
// Config for feature extraction | |
"extract_mel": true, | |
"extract_pitch": true, | |
"extract_energy": true, | |
"extract_whisper_feature": true, | |
"extract_contentvec_feature": true, | |
"extract_wenet_feature": false, | |
"whisper_batch_size": 30, // decrease it if your GPU is out of memory | |
"contentvec_batch_size": 1, | |
// Fill in the paths of the pretrained content-feature models (ContentVec, WeNet, Whisper) | |
"contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt", | |
"wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt", | |
"wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml", | |
"whisper_model": "medium", | |
"whisper_model_path": "pretrained/whisper/medium.pt", | |
// Config for feature usage | |
"use_mel": true, | |
"use_min_max_norm_mel": true, | |
"use_frame_pitch": true, | |
"use_frame_energy": true, | |
"use_spkid": true, | |
"use_whisper": true, | |
"use_contentvec": true, | |
"use_wenet": false, | |
"n_mel": 100, | |
"sample_rate": 24000 | |
}, | |
"model": { | |
"teacher_model_path":"[Your_teacher_model_checkpoint].bin", | |
"condition_encoder": { | |
// Config for feature usage | |
"use_whisper": true, | |
"use_contentvec": true, | |
"use_wenet": false, | |
"whisper_dim": 1024, | |
"contentvec_dim": 256, | |
"wenet_dim": 512, | |
"use_singer_encoder": false, | |
"pitch_min": 50, | |
"pitch_max": 1100 | |
}, | |
"comosvc":{ | |
"distill": false, | |
// conformer encoder | |
"input_dim": 384, | |
"output_dim": 100, | |
"n_heads": 2, | |
"n_layers": 6, | |
"filter_channels":512, | |
"dropout":0.1, | |
// karras diffusion | |
"P_mean": -1.2, | |
"P_std": 1.2, | |
"sigma_data": 0.5, | |
"sigma_min": 0.002, | |
"sigma_max": 80, | |
"rho": 7, | |
"n_timesteps": 40, | |
}, | |
"diffusion": { | |
// Diffusion step encoder | |
"step_encoder": { | |
"dim_raw_embedding": 128, | |
"dim_hidden_layer": 512, | |
"activation": "SiLU", | |
"num_layer": 2, | |
"max_period": 10000 | |
}, | |
// Diffusion decoder | |
"model_type": "bidilconv", | |
// Options for "model_type" above: bidilconv, unet2d (TODO: unet1d) | |
"bidilconv": { | |
"base_channel": 384, | |
"n_res_block": 20, | |
"conv_kernel_size": 3, | |
"dilation_cycle_length": 4, | |
// Special case: a dilation_cycle_length of 1 means no dilation | |
"conditioner_size": 100 | |
} | |
} | |
}, | |
"train": { | |
"batch_size": 64, | |
"gradient_accumulation_step": 1, | |
"max_epoch": -1, // -1 means no limit | |
"save_checkpoint_stride": [ | |
50, | |
50 | |
], | |
"keep_last": [ | |
5, | |
-1 | |
], | |
"run_eval": [ | |
false, | |
true | |
], | |
"adamw": { | |
"lr": 4.0e-4 | |
}, | |
"reducelronplateau": { | |
"factor": 0.8, | |
"patience": 10, | |
"min_lr": 1.0e-4 | |
}, | |
"dataloader": { | |
"num_worker": 8, | |
"pin_memory": true | |
}, | |
"sampler": { | |
"holistic_shuffle": false, | |
"drop_last": true | |
} | |
}, | |
"inference": { | |
"comosvc": { | |
"inference_steps": 40 | |
} | |
} | |
} |