Spaces:

waysolong
/

text_to_speech

Sleeping

text_to_speech / examples /biaobei /config.yaml

wuxulong19950206

add files

9270314 over 1 year ago

3.51 kB


	# Data manifests for the training split, plus one `emb_typeN` entry per
	# embedding input stream.
	# NOTE(review): indentation was lost in this copy of the file; the nesting
	# below (dataset -> train -> emb_typeN) is reconstructed from the key names.
	# Confirm against the code that reads this config.
	dataset:
	  train:
	    wav_scp: './train/wav.scp'
	    mel_scp: './train/mel.scp'
	    dur_scp: './train/dur.scp'
	    emb_type1:
	      _name: 'pinyin'
	      scp: './train/py.scp'
	      vocab: 'py.vocab'
	    emb_type2:
	      _name: 'graphic'
	      scp: './train/gp.scp'
	      vocab: 'gp.vocab'

	    emb_type3:
	      _name: 'speaker'
	      scp: './train/spk.scp'
	      # Explicit null instead of a bare empty value: doesn't need vocab.
	      vocab: null

	# Training-loop settings: batching, gradient clipping/accumulation,
	# output directories, and the step intervals for periodic actions.
	training:
	  batch_size: 16
	  batch_split: 1
	  epochs: 10000
	  grad_clip_thresh: 1.0
	  acc_step: 1
	  checkpoint_path: "./checkpoints/"
	  log_path: "./log/"
	  checkpoint_step: 5000
	  synth_step: 5000
	  log_step: 20
	  num_workers: 8

	  # NOTE(review): placed under `training` next to the other *_step keys;
	  # the flattened source does not show which level this key belongs to.
	  # Confirm against the code that reads it.
	  evaluation_step: 1000

	# Optimizer selection; the `params` mapping holds the optimizer's own
	# arguments (presumably forwarded to its constructor -- confirm).
	optimizer:
	  type: Adam
	  n_warm_up_step: 2000
	  # lr_decrease_step: 10000
	  # lr_decrease_factor:
	  params:
	    betas: [0.9, 0.98]
	    # The explicit !!float tags matter: YAML 1.1 loaders such as PyYAML
	    # resolve exponent literals without a decimal point (1e-9, 1e-4) as
	    # strings, not floats.
	    eps: !!float 1e-9
	    weight_decay: !!float 0.0
	    lr: !!float 1e-4
	# Learning-rate schedule; the `params` mapping holds the scheduler's own
	# arguments.
	# NOTE(review): kept at the same level as `optimizer`; the flattened source
	# does not show whether it was nested inside it -- confirm.
	lr_scheduler:
	  type: CyclicLR
	  params:
	    # Explicit !!float: PyYAML would otherwise read 1e-7 / 1e-4 as strings.
	    base_lr: !!float 1e-7
	    max_lr: !!float 1e-4
	    step_size_up: 5000
	    step_size_down: 8000
	    # Presumably disabled because the configured optimizer is Adam, which
	    # has no `momentum` parameter to cycle -- confirm.
	    cycle_momentum: false

	# Vocoder selection. `type` names one of the per-vocoder sections below;
	# each section carries that vocoder's own settings.
	vocoder:
	  type: VocGan  # choose one of the following
	  MelGAN:
	    checkpoint: ~/checkpoints/melgan/melgan_ljspeech.pth
	    config: ~/checkpoints/melgan/default.yaml
	    device: cpu
	  VocGan:
	    checkpoint: checkpoints  # ~/checkpoints/ljspeech_29de09d_4000.pt
	    denoise: true
	    device: cpu
	  HiFiGAN:
	    # You need to download the checkpoint and set the params here.
	    checkpoint: ~/checkpoints/VCTK_V3/generator_v3
	    device: cpu
	  Waveglow:
	    checkpoint: ~/checkpoints/waveglow_256channels_universal_v5_state_dict.pt
	    sigma: 1.0
	    denoiser_strength: 0.0  # try 0.1
	    # Original hint was "try cpu if out of memory"; the value is already cpu.
	    device: cpu

	# Mel filter-bank feature extraction settings.
	fbank:
	  sample_rate: 16000
	  n_fft: 1024
	  hop_length: 256
	  win_length: 1024
	  max_wav_value: 32768.0
	  n_mels: 80
	  fmin: 0.0
	  # 8000.0 is sample_rate / 2 for the 16000 Hz rate configured above.
	  # NOTE(review): the original comment said "should be 11025", which is
	  # half of 22050 Hz, not of 16000 Hz -- confirm which sample rate the
	  # data actually uses before changing either value.
	  fmax: 8000.0
	  mel_mean: -6.0304103

	# Encoder selection; `conf` holds the arguments for the chosen encoder type.
	encoder:
	  encoder_type: 'FS2TransformerEncoder'
	  conf:
	    n_layers: 4
	    n_heads: 2
	    hidden_dim: 256
	    dropout: 0.25
	    d_inner: 1024
	    max_len: 2048

	# Decoder selection and its arguments. Unlike `encoder`, there is no `conf`
	# sub-mapping here: the arguments sit directly under `decoder`.
	decoder:
	  decoder_type: 'FS2TransformerDecoder'
	  input_dim: 256  # should be the same as the output of encoder
	  n_layers: 4
	  n_heads: 2
	  hidden_dim: 256
	  d_inner: 1024
	  dropout: 0.25
	  max_len: 2048  # max len of seq, for position embedding pre-computation

	  # Reference signature:
	  # (class) Decoder(input_dim: int = 256, n_layers: int = 4, n_heads: int = 2,
	  # hidden_dim: int = 256, d_inner: int = 1024, dropout: float = 0.5, max_len: int = 2048)
	# Post-net selection applied after the decoder.
	postnet:
	  postnet_type: 'PostUNet'  # 'PostUNet', 'PostNet1d'
	# Speaker embedding settings.
	speaker_embedding:
	  enable: true
	  vocab: null  # explicit null; no vocab file is given for speakers
	  vocab_size: 1
	  weight: 1.0
	  dim: 256

	# Utterance-level embedding (currently disabled via `enable: false`).
	# The key spelling "utterence" is kept unchanged because it is the name
	# the consuming code looks up.
	utterence_embedding:
	  enable: false
	  type: 'lstm'  # resnet
	  feature_config:
	    type: 'mel'
	    n_mels: 80
	    # NOTE(review): 22050 differs from fbank.sample_rate (16000) in this
	    # file -- confirm which is intended before enabling this section.
	    sampling_rate: 22050
	    n_fft: 1024
	    hop_length: 256
	    win_length: 1024

	  model_config:
	    n_layers: 3
	    bidirectional: true

	# Hanzi (character) embedding settings; uses the gp.vocab vocabulary file.
	hanzi_embedding:
	  enable: true
	  type: embedding
	  vocab: './gp.vocab'
	  dim: 256
	  weight: 0.5
	  max_seq_len: 100

	# Pinyin embedding settings; uses the py.vocab vocabulary file.
	pinyin_embedding:
	  enable: true
	  type: embedding
	  vocab: './py.vocab'
	  dim: 256
	  weight: 1.0
	  max_seq_len: 100

	# Duration predictor settings.
	duration_predictor:
	  input_dim: 256  # should be the same as encoder hidden_dim
	  filter_size: 256
	  kernel_size: 3
	  dropout: 0.5
	  # NOTE(review): annotated "for aishell3", but this config sits under
	  # examples/biaobei -- confirm the mean was recomputed for this dataset.
	  duration_mean: 21.517294924096635

	# F0 (pitch) predictor settings (currently disabled via `enable: false`).
	f0_predictor:
	  enable: false
	  filter_size: 256
	  kernel_size: 3
	  dropout: 0.5
	  n_bins: 256
	# Synthesis-time options.
	synthesis:
	  normalize: true  # normalize the sound volume