# Source: fishspeech2/fish_speech/configs/firefly_gan_vq.yaml
# Repository page header: user pineconeT94, commit 8b14bed ("first commit").
# Root object of this config: `_target_` names the FireflyArchitecture class.
# NOTE(review): `_target_` looks like the Hydra `instantiate` convention
# (dotted import path of the class to build) - confirm against the loader.
_target_: fish_speech.models.vqgan.modules.firefly.FireflyArchitecture
# Spectrogram section; `_target_` names the LogMelSpectrogram class.
# The parameters are nested under `spec_transform` so that its `_target_` and
# `hop_length` are not duplicate top-level keys shared with other sections.
spec_transform:
  _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
  sample_rate: 44100
  # Same value as backbone.input_channels (160).
  n_mels: 160
  n_fft: 2048
  # Same value as head.hop_length (512).
  hop_length: 512
  win_length: 2048
# Encoder section; `_target_` names the ConvNeXtEncoder class.
# The parameters are nested under `backbone` so that its `_target_` is not a
# duplicate top-level key.
backbone:
  _target_: fish_speech.models.vqgan.modules.firefly.ConvNeXtEncoder
  # Same value as spec_transform.n_mels (160).
  input_channels: 160
  # `depths` and `dims` both have four entries - presumably one per encoder
  # stage; confirm against ConvNeXtEncoder.
  depths: [3, 3, 9, 3]
  dims: [128, 256, 384, 512]
  drop_path_rate: 0.2
  kernel_size: 7
# Generator section; `_target_` names the HiFiGANGenerator class.
# The parameters are nested under `head` so that its `_target_` and
# `hop_length` are not duplicate top-level keys shared with other sections.
head:
  _target_: fish_speech.models.vqgan.modules.firefly.HiFiGANGenerator
  # Same value as spec_transform.hop_length (512).
  hop_length: 512
  # The rates multiply to 8 * 8 * 2 * 2 * 2 = 512, i.e. hop_length.
  upsample_rates: [8, 8, 2, 2, 2]  # aka. strides
  # Each kernel size here is twice the rate at the same position.
  upsample_kernel_sizes: [16, 16, 4, 4, 4]
  resblock_kernel_sizes: [3, 7, 11]
  resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
  # 512 equals the last entry of backbone.dims and quantizer.input_dim;
  # presumably the generator's input channel count - confirm.
  num_mels: 512
  upsample_initial_channel: 512
  pre_conv_kernel_size: 13
  post_conv_kernel_size: 13
# Quantizer section; `_target_` names the DownsampleFiniteScalarQuantize class.
# The parameters are nested under `quantizer` so that its `_target_` is not a
# duplicate top-level key.
quantizer:
  _target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
  # Same value as the last entry of backbone.dims (512).
  input_dim: 512
  n_groups: 8
  n_codebooks: 1
  # NOTE(review): presumably the number of quantization levels per scalar
  # dimension - confirm against DownsampleFiniteScalarQuantize.
  levels: [8, 5, 5, 5]
  downsample_factor: [2, 2]