# MNP-SVC-v2-VCTK / config.yaml
# TylorShine's picture
# Upload models
# c2d0a87 verified
# Data and feature-extraction settings.
# NOTE(review): the nesting was reconstructed from a copy whose indentation
# had been stripped. Every key from `block_size` through `volume_window_size`
# is in alphabetical order, which is why they are all placed under `data`
# -- confirm against the code that reads this file.
data:
  block_size: 512
  duration: 1.5
  encoder: dpwavlmbase
  encoder_ckpt: models/pretrained/dphubert/DPWavLM-sp0.75.pth
  encoder_hop_size: 320
  encoder_out_channels: 768
  encoder_sample_rate: 16000
  # Audio file extensions listed for this config.
  extensions:
    - wav
    - flac
    - mp3
    - m4a
  f0_extractor: rmvpe
  # Presumably the pitch range in Hz -- TODO confirm units with the consumer.
  f0_max: 1200
  f0_min: 65
  sampling_rate: 44100
  spk_embed_channels: 256
  spk_embed_encoder: pyannote.audio
  # NOTE(review): this path starts with `./` while `encoder_ckpt` does not;
  # both are relative paths.
  spk_embed_encoder_ckpt: ./models/pretrained/pyannote.audio/wespeaker-voxceleb-resnet34-LM/pytorch_model.bin
  spk_embed_encoder_sample_rate: 16000
  volume_window_size: 8
# Top-level device selector; the configured value is `cuda`.
device: cuda
# Environment settings.
# NOTE(review): `gpu_id` is nested under `env` because it breaks the
# alphabetical order of the top-level keys (env -> loss) -- confirm.
env:
  gpu_id: 0
# Loss settings.
# NOTE(review): nesting reconstructed from a copy with stripped indentation;
# the keys `beta` .. `use_multi_scale_log_freq` are alphabetical, so all of
# them are placed under `loss` -- confirm against the consumer.
loss:
  beta: 1.0
  fft_max: 2048
  fft_min: 128
  gamma: 0.0
  # NOTE(review): this list contains 32 and 64, which are below `fft_min`
  # (128). Which of `fft_min`/`fft_max` or `n_ffts` takes effect is not
  # visible from this file.
  n_ffts:
    - 32
    - 64
    - 128
    - 256
    - 512
    - 1024
    - 2048
  overlap: 0.5
  use_multi_scale_log_freq: true
# Model settings.
# NOTE(review): nesting reconstructed from a copy with stripped indentation;
# the keys `f0_input_variance` .. `win_length` are alphabetical, so all of
# them are placed under `model` -- confirm against the consumer.
model:
  f0_input_variance: 0.0
  f0_offset_size_downsamples: 16
  harmonic_env_size_downsamples: 16
  no_use_embed_conv: false
  noise_env_size_downsamples: 16
  noise_seed: 289
  noise_to_harmonic_phase: false
  type: CombSubMinimumNoisedPhase
  units_hidden_channels: 256
  # NOTE(review): the flattened text was `- - 10` followed by `- 11`, which
  # matches a block-style dump of [[10, 11]] with its indentation removed.
  # Read literally without indentation it would be [[10], 11] -- confirm
  # that a single [10, 11] pair is what the consumer expects.
  units_layers:
    - [10, 11]
  use_add_noise_env: false
  use_discriminator: true
  use_f0_offset: false
  use_harmonic_env: true
  use_noise_env: false
  use_speaker_embed: true
  win_length: 2048
# Training settings.
# NOTE(review): nesting reconstructed from a copy with stripped indentation.
# `log_with` and `total_limit` are placed one level deeper because each one
# breaks the otherwise alphabetical order of the keys under `train`
# -- confirm against the consumer.
train:
  accelerator:
    log_with: tensorboard
  accelerator_project_config:
    total_limit: 10
  allow_tf32: true
  amp_dtype: fp32
  batch_size: 32
  cache_all_data: true
  cache_device: cpu
  cache_fp16: true
  epochs: 100
  frame_hop_random_max: 64
  frame_hop_random_min: 32
  interval_log: 10
  interval_val: 2000
  loss_variation: 0.1
  low_similar_loss_variation: 0.7
  lr: 0.0003
  num_workers: 2
  only_u2c_stack: false
  save_states: true
  # Scheduler-related values (`sched_*`); which scheduler consumes them is
  # not visible from this file.
  sched_cooldown: 2
  sched_factor: 0.5
  sched_gamma: 0.99999
  sched_min_lr: 1.0e-05
  sched_patience: 50
  sched_threshold: 1.0e-05
  sched_threshold_mode: rel
  weight_decay: 1.0e-05