---
# Training configuration with six top-level sections:
# data, device, env, loss, model, train.
# Keys are kept in alphabetical order within each mapping.

data:
  block_size: 512
  duration: 1.5
  encoder: dpwavlmbase
  encoder_ckpt: models/pretrained/dphubert/DPWavLM-sp0.75.pth
  encoder_hop_size: 320
  encoder_out_channels: 768
  encoder_sample_rate: 16000
  # File extensions listed for the dataset (written without a leading dot).
  extensions:
    - wav
    - flac
    - mp3
    - m4a
  f0_extractor: rmvpe
  f0_max: 1200
  f0_min: 65
  sampling_rate: 44100
  spk_embed_channels: 256
  spk_embed_encoder: pyannote.audio
  # NOTE(review): this path starts with "./" while encoder_ckpt above does
  # not; both are relative paths — confirm they resolve from the same
  # working directory.
  spk_embed_encoder_ckpt: ./models/pretrained/pyannote.audio/wespeaker-voxceleb-resnet34-LM/pytorch_model.bin
  spk_embed_encoder_sample_rate: 16000
  volume_window_size: 8

device: cuda

env:
  gpu_id: 0

loss:
  beta: 1.0
  fft_max: 2048
  fft_min: 128
  gamma: 0.0
  # NOTE(review): 32 and 64 are below fft_min (128) — confirm whether the
  # loss reads n_ffts, the fft_min/fft_max range, or both.
  n_ffts:
    - 32
    - 64
    - 128
    - 256
    - 512
    - 1024
    - 2048
  overlap: 0.5
  use_multi_scale_log_freq: true

model:
  f0_input_variance: 0.0
  f0_offset_size_downsamples: 16
  harmonic_env_size_downsamples: 16
  no_use_embed_conv: false
  noise_env_size_downsamples: 16
  noise_seed: 289
  noise_to_harmonic_phase: false
  type: CombSubMinimumNoisedPhase
  units_hidden_channels: 256
  # A list containing one inner list.
  # NOTE(review): reconstructed as [[10, 11]] from a flattened source —
  # confirm the intended nesting against the consumer.
  units_layers:
    - [10, 11]
  use_add_noise_env: false
  use_discriminator: true
  use_f0_offset: false
  use_harmonic_env: true
  use_noise_env: false
  use_speaker_embed: true
  win_length: 2048

train:
  # NOTE(review): accelerator / accelerator_project_config look like keyword
  # arguments for Hugging Face Accelerate — verify against the training
  # script.
  accelerator:
    log_with: tensorboard
  accelerator_project_config:
    total_limit: 10
  allow_tf32: true
  amp_dtype: fp32
  batch_size: 32
  cache_all_data: true
  cache_device: cpu
  cache_fp16: true
  epochs: 100
  frame_hop_random_max: 64
  frame_hop_random_min: 32
  interval_log: 10
  interval_val: 2000
  loss_variation: 0.1
  low_similar_loss_variation: 0.7
  lr: 0.0003
  num_workers: 2
  only_u2c_stack: false
  save_states: true
  sched_cooldown: 2
  sched_factor: 0.5
  sched_gamma: 0.99999
  sched_min_lr: 1.0e-05
  sched_patience: 50
  sched_threshold: 1.0e-05
  sched_threshold_mode: rel
  weight_decay: 1.0e-05