{
"base_config": "egs/tts/NaturalSpeech2/exp_config_base.json",
"dataset": [
"libritts",
],
"exp_name": "ns2_wenet_16_3.8_new",
"log_dir": "ckpts/tts",
"model": {
"diffusion": {
"beta_max": 20,
"beta_min": 0.05,
"diffusion_type": "diffusion",
"noise_factor": 1.0,
"ode_solver": "euler",
"sigma": 1.0,
"wavenet": {
"attn_head": 8,
"cross_attn_per_layer": 3,
"dilation_cycle": 2,
"drop_out": 0.2,
"hidden_size": 512,
"input_size": 128,
"num_layers": 40,
"out_size": 128,
},
},
"inference_step": 500,
"latent_dim": 128,
"prior_encoder": {
"duration_predictor": {
"attn_head": 8,
"conv_layers": 30,
"cross_attn_per_layer": 3,
"drop_out": 0.5,
"filter_size": 512,
"input_size": 512,
"kernel_size": 3,
},
"encoder": {
"conv_filter_size": 2048,
"conv_kernel_size": 9,
"encoder_dropout": 0.2,
"encoder_head": 8,
"encoder_hidden": 512,
"encoder_layer": 6,
"use_cln": true,
},
"pitch_bins_num": 512,
"pitch_max": 1100,
"pitch_min": 50,
"pitch_predictor": {
"attn_head": 8,
"conv_layers": 30,
"cross_attn_per_layer": 3,
"drop_out": 0.5,
"filter_size": 512,
"input_size": 512,
"kernel_size": 5,
},
"vocab_size": 100,
},
"prompt_encoder": {
"conv_filter_size": 2048,
"conv_kernel_size": 9,
"encoder_dropout": 0.2,
"encoder_head": 8,
"encoder_hidden": 512,
"encoder_layer": 6,
"use_cln": false,
},
"query_emb": {
"head_num": 8,
"hidden_size": 512,
"query_token_num": 32,
},
},
"model_type": "NaturalSpeech2",
"preprocess": {
"align_mel_duration": false,
"audio_dir": "audios",
"bits": 8,
"clip_mode": "start",
"code_dir": "code",
"contentvec_dir": "contentvec",
"data_augment": false,
"dur_dir": "durs",
"duration_dir": "duration",
"emo2id": "emo2id.json",
"energy_dir": "energys",
"energy_extract_mode": "from_mel",
"energy_norm": false,
"energy_remove_outlier": false,
"extract_acoustic_token": false,
"extract_amplitude_phase": false,
"extract_audio": false,
"extract_contentvec_feature": false,
"extract_duration": false,
"extract_energy": false,
"extract_label": false,
"extract_linear_spec": false,
"extract_mcep": false,
"extract_mel": false,
"extract_mert_feature": false,
"extract_phone": false,
"extract_pitch": false,
"extract_uv": false,
"extract_wenet_feature": false,
"extract_whisper_feature": false,
"file_lst": "file.lst",
"fmax": 12000,
"fmin": 0,
"hop_size": 120,
"imaginary_dir": "imaginarys",
"lab_dir": "labs",
"label_dir": "labels",
"lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
"linear_dir": "linears",
"log_amplitude_dir": "log_amplitudes",
"mcep_dir": "mcep",
"mel_dir": "mels",
"mel_extract_mode": "",
"mel_min_max_norm": false,
"melspec_dir": "mel",
"metadata_dir": "metadata",
"min_level_db": -115,
"n_fft": 1024,
"n_mel": 80,
"num_silent_frames": 8,
"phase_dir": "phases",
"phone_dir": "phones",
"phone_energy_dir": "phone_energys",
"phone_extractor": "espeak",
"phone_pitch_dir": "phone_pitches",
"phone_seq_file": "phone_seq_file",
"pitch_dir": "pitch",
"pitch_extractor": "parselmouth",
"pitch_norm": false,
"pitch_remove_outlier": false,
"processed_dir": "data",
"raw_data": "raw_data",
"read_metadata": true,
"real_dir": "reals",
"ref_level_db": 20,
"sample_rate": 24000,
"spk2id": "spk2id.json",
"symbols_dict": "symbols.dict",
"train_file": "train.json",
"trim_fft_size": 512,
"trim_hop_size": 128,
"trim_silence": false,
"trim_top_db": 30,
"trimmed_wav_dir": "trimmed_wavs",
"use_amplitude_phase": false,
"use_audio": false,
"use_code": true,
"use_cross_reference": true,
"use_dur": false,
"use_duration": true,
"use_emoid": false,
"use_frame_duration": false,
"use_frame_energy": false,
"use_frame_pitch": false,
"use_lab": false,
"use_label": false,
"use_len": true,
"use_linear": false,
"use_log_scale_energy": false,
"use_log_scale_pitch": false,
"use_mel": false,
"use_min_max_norm_mel": false,
"use_one_hot": false,
"use_phn_seq": false,
"use_phone": true,
"use_phone_duration": false,
"use_phone_energy": false,
"use_phone_pitch": false,
"use_pitch": true,
"use_spkid": true,
"use_text": false,
"use_uv": false,
"use_wav": false,
"use_wenet": false,
"utt2emo": "utt2emo",
"utt2spk": "utt2spk",
"uv_dir": "uvs",
"valid_file": "test.json",
"wav_dir": "wavs",
"wenet_dir": "wenet",
"win_size": 480,
},
"supported_model_type": [
"GANVocoder",
"Fastspeech2",
"DiffSVC",
"Transformer",
"EDM",
"CD",
],
"task_type": "",
"train": {
"adam": {
"lr": 0.0001,
},
"adamw": {
"lr": 0.0004,
},
"batch_size": 12,
"dataloader": {
"num_worker": 16,
"pin_memory": true,
},
"ddp": true,
"diff_ce_loss_lambda": 0.5,
"diff_noise_loss_lambda": 1.0,
"epochs": 5000,
"gradient_accumulation_step": 1,
"keep_checkpoint_max": 100,
"keep_last": [
1000
],
"lr_scheduler": "cosine",
"lr_warmup_steps": 5000,
"max_epoch": 5000,
"max_sentences": 32,
"max_steps": 1000000,
"max_tokens": 7500,
"multi_speaker_training": false,
"num_train_steps": 800000,
"optimizer": "AdamW",
"random_seed": 114,
"reducelronplateau": {
"factor": 0.8,
"min_lr": 0.0001,
"patience": 10,
},
"run_eval": [
true
],
"sampler": {
"drop_last": true,
"holistic_shuffle": true,
},
"save_checkpoint_stride": [
1
],
"save_checkpoints_steps": 2000,
"save_summary_steps": 500,
"scheduler": "ReduceLROnPlateau",
"total_training_steps": 800000,
"tracker": [
"tensorboard",
],
"train_feature_dirs": [
"/path/labels_with_dur_75",
"/path/mels_16k_75",
"/path/mos38_normed_encodec_16",
"/path/norm_wavs.scp",
],
"train_fileid_list_path": "/path/train_pure_3.8.txt",
"use_dynamic_batchsize": false,
"valid_feature_dirs": [
"/path/labels_with_dur_75",
"/path/mels_16k_75",
"/path/mos38_normed_encodec_16",
"/path/norm_wavs.scp",
],
"valid_fileid_list_path": "/path/test.txt",
"valid_interval": 2000,
},
"use_custom_dataset": false,
}