hydrusbeta committed
Commit 6a63476 · 1 Parent(s): 2dd5b2b

Upload Starlight Glimmer tortoise model

tortoise/Starlight Glimmer/09092023_061233_train.yaml ADDED
@@ -0,0 +1,141 @@
+ name: 'Starlight Glimmer'
+ model: extensibletrainer
+ scale: 1
+ gpu_ids: [0] # Manually edit this if the GPU you want to train on is not your primary, as this will set the env var that exposes CUDA devices
+ start_step: 0
+ checkpointing_enabled: true
+ fp16: False
+ bitsandbytes: True
+ gpus: 1
+
+ datasets:
+   train:
+     name: training
+     n_workers: 2
+     batch_size: 128
+     mode: paired_voice_audio
+     path: ./training/Starlight Glimmer/train.txt
+     fetcher_mode: ['lj']
+     phase: train
+     max_wav_length: 255995 # ~11.6 seconds
+     max_text_length: 200
+     sample_rate: 22050
+     load_conditioning: True
+     num_conditioning_candidates: 2
+     conditioning_length: 44000
+     use_bpe_tokenizer: True
+     tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json # ./models/tortoise/bpe_lowercase_asr_256.json
+     load_aligned_codes: False
+   val:
+     name: validation
+     n_workers: 2
+     batch_size: 8
+     mode: paired_voice_audio
+     path: ./training/Starlight Glimmer/validation.txt
+     fetcher_mode: ['lj']
+     phase: val
+     max_wav_length: 255995
+     max_text_length: 200
+     sample_rate: 22050
+     load_conditioning: True
+     num_conditioning_candidates: 2
+     conditioning_length: 44000
+     use_bpe_tokenizer: True
+     tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json # ./models/tortoise/bpe_lowercase_asr_256.json
+     load_aligned_codes: False
+
+ steps:
+   gpt_train:
+     training: gpt
+     loss_log_buffer: 500
+
+     # Generally follows the recipe from the DALLE paper.
+     optimizer: adamw # this should be adamw_zero if you're using distributed training
+     optimizer_params:
+       lr: !!float 3e-05 # originally: 1e-4
+       weight_decay: !!float 1e-2
+       beta1: 0.9
+       beta2: 0.96
+     clip_grad_eps: 4
+
+     injectors:
+       paired_to_mel:
+         type: torch_mel_spectrogram
+         mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
+         in: wav
+         out: paired_mel
+       paired_cond_to_mel:
+         type: for_each
+         subtype: torch_mel_spectrogram
+         mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
+         in: conditioning
+         out: paired_conditioning_mel
+       to_codes:
+         type: discrete_token
+         in: paired_mel
+         out: paired_mel_codes
+         dvae_config: "./models/tortoise/train_diffusion_vocoder_22k_level.yml"
+       paired_fwd_text:
+         type: generator
+         generator: gpt
+         in: [paired_conditioning_mel, padded_text, text_lengths, paired_mel_codes, wav_lengths]
+         out: [loss_text_ce, loss_mel_ce, logits]
+     losses:
+       text_ce:
+         type: direct
+         weight: 0.01
+         key: loss_text_ce
+       mel_ce:
+         type: direct
+         weight: 1
+         key: loss_mel_ce
+
+ networks:
+   gpt:
+     type: generator
+     which_model_G: unified_voice2
+     kwargs:
+       layers: 30 # originally: 8
+       model_dim: 1024 # originally: 512
+       heads: 16 # originally: 8
+       max_text_tokens: 402 # originally: 120
+       max_mel_tokens: 604 # originally: 250
+       max_conditioning_inputs: 2 # originally: 1
+       mel_length_compression: 1024
+       number_text_tokens: 256 # supposed to be 255 for newer unified_voice files
+       number_mel_codes: 8194
+       start_mel_token: 8192
+       stop_mel_token: 8193
+       start_text_token: 255
+       train_solo_embeddings: False # missing in uv3/4
+       use_mel_codes_as_input: True # ditto
+       checkpointing: True
+       tortoise_compat: True
+       # freeze_everything_but_position_embeddings: True
+
+ path:
+   strict_load: true
+   pretrain_model_gpt: './models/tortoise/autoregressive.pth'
+   # resume_state: ''
+
+ train:
+   niter: 416
+   warmup_iter: -1
+   mega_batch_factor: 16
+   val_freq: 52
+
+   ema_enabled: false # I really don't think EMA matters
+
+   default_lr_scheme: MultiStepLR
+   gen_lr_steps: [8, 16, 36, 72, 100, 132, 200]
+   lr_gamma: 0.5
+
+ eval:
+   pure: False
+   output_state: gen
+
+ logger:
+   save_checkpoint_freq: 52
+   visuals: [gen, mel]
+   visual_debug_rate: 52
+   is_mel_spectrogram: true
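
The train block above amounts to a step-decay schedule: the base learning rate of 3e-05 is halved (lr_gamma: 0.5) each time the iteration count reaches one of the gen_lr_steps milestones, ending near 2.3e-07 after all seven decays within the 416-iteration run. The config itself is consumed by the trainer named under model: extensibletrainer, so the following is only a minimal PyTorch sketch of an equivalent optimizer/scheduler pair, using a dummy parameter group instead of the real GPT network:

import torch

# Dummy parameter group; the actual trainer builds this from the "gpt" network above.
params = [torch.nn.Parameter(torch.zeros(1))]

# Mirrors optimizer_params (adamw, lr 3e-05, betas 0.9/0.96, weight_decay 1e-2)
# and the MultiStepLR settings (gen_lr_steps, lr_gamma) from the YAML.
optimizer = torch.optim.AdamW(params, lr=3e-05, betas=(0.9, 0.96), weight_decay=1e-2)
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer, milestones=[8, 16, 36, 72, 100, 132, 200], gamma=0.5
)

for step in range(416):  # niter: 416
    optimizer.step()
    scheduler.step()
    if step + 1 in {8, 16, 36, 72, 100, 132, 200}:
        print(f"after step {step + 1}: lr = {scheduler.get_last_lr()[0]:.2e}")

With val_freq and save_checkpoint_freq both set to 52, this run validates and checkpoints eight times over its 416 iterations (416 / 52 = 8), which is consistent with the uploaded 208_gpt.pth being one of those intermediate checkpoints.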
tortoise/Starlight Glimmer/208_gpt.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:666dd8e0565d2c01bcc20ecd0d4224b4cbeed189d6124078c65888a29aa39e21
+ size 1716990037
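
The second file is a Git LFS pointer rather than the checkpoint itself: cloning with git lfs replaces it with the actual 208_gpt.pth (1,716,990,037 bytes, roughly 1.7 GB). A small sketch for checking a downloaded copy against the pointer, assuming the file sits at the repository path shown:

import hashlib
import os

# Assumed local path; adjust to wherever Git LFS (or a manual download) placed the file.
checkpoint_path = "tortoise/Starlight Glimmer/208_gpt.pth"

# Values copied from the LFS pointer above.
expected_sha256 = "666dd8e0565d2c01bcc20ecd0d4224b4cbeed189d6124078c65888a29aa39e21"
expected_size = 1716990037

digest = hashlib.sha256()
with open(checkpoint_path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        digest.update(chunk)

assert os.path.getsize(checkpoint_path) == expected_size, "size mismatch"
assert digest.hexdigest() == expected_sha256, "sha256 mismatch"
print("208_gpt.pth matches the LFS pointer")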