hydrusbeta committed
Commit · 6a63476
1 Parent(s): 2dd5b2b
Upload Starlight Glimmer tortoise model
tortoise/Starlight Glimmer/09092023_061233_train.yaml
ADDED
@@ -0,0 +1,141 @@
name: 'Starlight Glimmer'
model: extensibletrainer
scale: 1
gpu_ids: [0] # Manually edit this if the GPU you want to train on is not your primary, as this will set the env var that exposes CUDA devices
start_step: 0
checkpointing_enabled: true
fp16: False
bitsandbytes: True
gpus: 1

datasets:
  train:
    name: training
    n_workers: 2
    batch_size: 128
    mode: paired_voice_audio
    path: ./training/Starlight Glimmer/train.txt
    fetcher_mode: ['lj']
    phase: train
    max_wav_length: 255995 # ~11.6 seconds
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: True
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: True
    tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json # ./models/tortoise/bpe_lowercase_asr_256.json
    load_aligned_codes: False
  val:
    name: validation
    n_workers: 2
    batch_size: 8
    mode: paired_voice_audio
    path: ./training/Starlight Glimmer/validation.txt
    fetcher_mode: ['lj']
    phase: val
    max_wav_length: 255995
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: True
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: True
    tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json # ./models/tortoise/bpe_lowercase_asr_256.json
    load_aligned_codes: False

steps:
  gpt_train:
    training: gpt
    loss_log_buffer: 500

    # Generally follows the recipe from the DALLE paper.
    optimizer: adamw # this should be adamw_zero if you're using distributed training
    optimizer_params:
      lr: !!float 3e-05 # originally: 1e-4
      weight_decay: !!float 1e-2
      beta1: 0.9
      beta2: 0.96
    clip_grad_eps: 4

    injectors:
      paired_to_mel:
        type: torch_mel_spectrogram
        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
        in: wav
        out: paired_mel
      paired_cond_to_mel:
        type: for_each
        subtype: torch_mel_spectrogram
        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
        in: conditioning
        out: paired_conditioning_mel
      to_codes:
        type: discrete_token
        in: paired_mel
        out: paired_mel_codes
        dvae_config: "./models/tortoise/train_diffusion_vocoder_22k_level.yml"
      paired_fwd_text:
        type: generator
        generator: gpt
        in: [paired_conditioning_mel, padded_text, text_lengths, paired_mel_codes, wav_lengths]
        out: [loss_text_ce, loss_mel_ce, logits]
    losses:
      text_ce:
        type: direct
        weight: 0.01
        key: loss_text_ce
      mel_ce:
        type: direct
        weight: 1
        key: loss_mel_ce

networks:
  gpt:
    type: generator
    which_model_G: unified_voice2
    kwargs:
      layers: 30 # originally: 8
      model_dim: 1024 # originally: 512
      heads: 16 # originally: 8
      max_text_tokens: 402 # originally: 120
      max_mel_tokens: 604 # originally: 250
      max_conditioning_inputs: 2 # originally: 1
      mel_length_compression: 1024
      number_text_tokens: 256 # supposed to be 255 for newer unified_voice files
      number_mel_codes: 8194
      start_mel_token: 8192
      stop_mel_token: 8193
      start_text_token: 255
      train_solo_embeddings: False # missing in uv3/4
      use_mel_codes_as_input: True # ditto
      checkpointing: True
      tortoise_compat: True
      # freeze_everything_but_position_embeddings: True

path:
  strict_load: true
  pretrain_model_gpt: './models/tortoise/autoregressive.pth'
  # resume_state: ''

train:
  niter: 416
  warmup_iter: -1
  mega_batch_factor: 16
  val_freq: 52

  ema_enabled: false # I really don't think EMA matters

  default_lr_scheme: MultiStepLR
  gen_lr_steps: [8, 16, 36, 72, 100, 132, 200]
  lr_gamma: 0.5

eval:
  pure: False
  output_state: gen

logger:
  save_checkpoint_freq: 52
  visuals: [gen, mel]
  visual_debug_rate: 52
  is_mel_spectrogram: true
tortoise/Starlight Glimmer/208_gpt.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:666dd8e0565d2c01bcc20ecd0d4224b4cbeed189d6124078c65888a29aa39e21
size 1716990037
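
Because 208_gpt.pth is tracked with Git LFS, the diff above only records a pointer: the blob's sha256 and its size in bytes (1716990037, roughly 1.7 GB). A small sketch, assuming the checkpoint has already been pulled locally (the local path is hypothetical), that verifies a download against this pointer:

import hashlib
import os

EXPECTED_OID = "666dd8e0565d2c01bcc20ecd0d4224b4cbeed189d6124078c65888a29aa39e21"
EXPECTED_SIZE = 1716990037
path = "tortoise/Starlight Glimmer/208_gpt.pth"  # hypothetical local path

# Cheap check first: the byte size recorded in the pointer.
assert os.path.getsize(path) == EXPECTED_SIZE, "size does not match the LFS pointer"

# Then hash the file in 1 MiB chunks and compare against the pointer's oid.
sha256 = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha256.update(chunk)
assert sha256.hexdigest() == EXPECTED_OID, "sha256 does not match the LFS pointer"
print("208_gpt.pth matches the LFS pointer")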