File size: 2,132 Bytes
9d61c9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
from dataclasses import dataclass


# TODO: DEPRECATED!
@dataclass
class PostNetConfig:
    """Typed record holding four post-net hyperparameters.

    The class stores values only: it has no defaults, no validation and no
    methods beyond what ``@dataclass`` generates, so all four fields must be
    supplied, positionally in the order declared or by keyword.

    NOTE(review): the TODO above marks this class as deprecated; its
    replacement is not visible in this file -- confirm before adding new users.
    """
    # Dropout probability; presumably applied inside the post-net -- TODO confirm.
    p_dropout: float
    # Presumably the channel width of the post-net layers -- TODO confirm.
    postnet_embedding_dim: int
    # Presumably the convolution kernel width -- TODO confirm.
    postnet_kernel_size: int
    # Presumably the number of stacked convolution layers -- TODO confirm.
    postnet_n_convolutions: int

# Module-level PostNetConfig preset. The identifier's spelling is kept as-is
# because it is the name importers of this module would reference.
postnet_expetimental = PostNetConfig(
    postnet_n_convolutions=3,
    postnet_kernel_size=5,
    postnet_embedding_dim=512,
    p_dropout=0.1,
)

# TODO: DEPRECATED!
@dataclass
class DiffusionConfig:
    """Typed record of model, denoiser, transformer and loss settings.

    The class stores values only: it has no defaults, no validation and no
    methods beyond what ``@dataclass`` generates, so every field must be
    supplied by the caller. What each value means is decided by the consuming
    code, which is not visible in this file; the per-field notes below are
    therefore assumptions to confirm, not guarantees.

    NOTE(review): the TODO above marks this class as deprecated; its
    replacement is not visible in this file -- confirm before adding new users.
    """
    # model parameters
    model: str  # variant selector; the presets in this file pass "shallow"
    n_mel_channels: int  # presumably the number of mel-spectrogram bins -- TODO confirm
    multi_speaker: bool
    # denoiser parameters
    residual_channels: int
    residual_layers: int
    denoiser_dropout: float
    noise_schedule_naive: str  # schedule name; the presets in this file pass "vpsde"
    timesteps: int
    shallow_timesteps: int
    min_beta: float
    max_beta: float  # NOTE(review): the presets in this file pass the int 40 here
    s: float  # NOTE(review): meaning not visible here; the presets pass 0.008
    pe_scale: int
    keep_bins: int
    # transformer params
    encoder_hidden: int
    decoder_hidden: int
    speaker_embed_dim: int  # presets annotate this as "Speaker_emb + lang_emb"
    # loss params
    noise_loss: str  # loss selector; the presets in this file pass "l1"


# DiffusionConfig preset. Carries the same values as ``diff_multi`` except for
# ``speaker_embed_dim`` (1025 here).
diff_en = DiffusionConfig(
    # -- model --
    multi_speaker=True,
    n_mel_channels=100,
    model="shallow",
    # -- denoiser --
    # Earlier alternatives for residual_channels, kept for reference: 256, 384.
    residual_layers=20,
    residual_channels=100,
    denoiser_dropout=0.2,
    pe_scale=1000,
    keep_bins=80,
    noise_schedule_naive="vpsde",
    shallow_timesteps=1,
    timesteps=10,
    max_beta=40,
    min_beta=0.1,
    s=0.008,
    # -- transformer --
    # Earlier alternative for encoder_hidden, kept for reference: 100.
    decoder_hidden=512,
    encoder_hidden=512,
    speaker_embed_dim=1025,  # Speaker_emb + lang_emb
    # -- loss --
    noise_loss="l1",
)

# DiffusionConfig preset. Carries the same values as ``diff_en`` except for
# ``speaker_embed_dim`` (1280 here).
diff_multi = DiffusionConfig(
    # -- model --
    multi_speaker=True,
    n_mel_channels=100,
    model="shallow",
    # -- denoiser --
    # Earlier alternative for residual_channels, kept for reference: 256.
    residual_layers=20,
    residual_channels=100,
    denoiser_dropout=0.2,
    keep_bins=80,
    pe_scale=1000,
    noise_schedule_naive="vpsde",
    shallow_timesteps=1,
    timesteps=10,
    max_beta=40,
    min_beta=0.1,
    s=0.008,
    # -- transformer --
    decoder_hidden=512,
    encoder_hidden=512,
    speaker_embed_dim=1280,  # Speaker_emb + lang_emb
    # -- loss --
    noise_loss="l1",
)