update
- .gitignore +1 -0
- .models/clip.pth +0 -3
- api.py +11 -22
- data/mel_norms.pth +0 -0
- do_tts.py +5 -1
- models/arch_util.py +1 -2
- models/clvp.py +1 -1
- models/xtransformers.py +0 -47
- read.py +10 -12
- requirements.txt +1 -2
.gitignore
CHANGED
@@ -130,3 +130,4 @@ dmypy.json
 .pyre/
 
 .idea/*
+.models/*
.models/clip.pth
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8ab5a7751b6098b7e57528b5d812ea2ffbaa16f1b36c02e143c501c74900140d
-size 271601435
api.py
CHANGED
@@ -23,9 +23,11 @@ from utils.tokenizer import VoiceBpeTokenizer, lev_distance
 pbar = None
 def download_models():
     MODELS = {
-        '…
-        '…
-        '…
+        'autoregressive.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/autoregressive.pth',
+        'clvp.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/clip.pth',
+        'cvvp.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/cvvp.pth',
+        'diffusion_decoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/diffusion_decoder.pth',
+        'vocoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/vocoder.pth',
     }
     os.makedirs('.models', exist_ok=True)
     def show_progress(block_num, block_size, total_size):
@@ -162,25 +164,12 @@ class TextToSpeech:
                                           train_solo_embeddings=False,
                                           average_conditioning_embeddings=True).cpu().eval()
         self.autoregressive.load_state_dict(torch.load('.models/autoregressive.pth'))
-        '''
-        self.autoregressive = UnifiedVoice(max_mel_tokens=2048, max_text_tokens=1024, max_conditioning_inputs=1, layers=42,
-                                           model_dim=1152, heads=18, number_text_tokens=256, train_solo_embeddings=False,
-                                           average_conditioning_embeddings=True, types=2).cpu().eval()
-        self.autoregressive.load_state_dict(torch.load('X:\\dlas\\experiments\\train_gpt_tts_xl\\models\\15250_gpt_ema.pth'))
-        '''
-
-        self.autoregressive_for_diffusion = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
-                                                         model_dim=1024,
-                                                         heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
-                                                         train_solo_embeddings=False,
-                                                         average_conditioning_embeddings=True).cpu().eval()
-        self.autoregressive_for_diffusion.load_state_dict(torch.load('.models/autoregressive.pth'))
 
         self.clvp = CLVP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=12,
                          text_seq_len=350, text_heads=8,
                          num_speech_tokens=8192, speech_enc_depth=12, speech_heads=8, speech_seq_len=430,
                          use_xformers=True).cpu().eval()
-        self.clvp.load_state_dict(torch.load('.models/…
+        self.clvp.load_state_dict(torch.load('.models/clvp.pth'))
 
         self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
                          speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
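Note: the new MODELS mapping in the first hunk resolves each checkpoint name to a Hugging Face URL, and download_models creates .models/ before fetching. A minimal sketch of how such a mapping could be fetched follows; the download loop itself is not part of the hunks shown here, so the urlretrieve call and the skip-if-present check are assumptions, not a copy of the real function.

    import os
    import urllib.request

    # Two entries copied from the MODELS mapping above; the remaining checkpoints follow the same pattern.
    MODELS = {
        'autoregressive.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/autoregressive.pth',
        'clvp.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/clip.pth',
    }

    os.makedirs('.models', exist_ok=True)
    for model_name, url in MODELS.items():
        target = os.path.join('.models', model_name)
        if os.path.exists(target):
            continue  # assumption: an already-downloaded checkpoint is not re-fetched
        print(f'Downloading {model_name} from {url}...')
        urllib.request.urlretrieve(url, target)  # a reporthook such as show_progress could be passed as the third argument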
@@ -213,7 +202,7 @@ class TextToSpeech:
             'ultra_fast': {'num_autoregressive_samples': 32, 'diffusion_iterations': 16, 'cond_free': False},
             'fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 32},
             'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 128},
-            'high_quality': {'num_autoregressive_samples': 512, 'diffusion_iterations': …
+            'high_quality': {'num_autoregressive_samples': 512, 'diffusion_iterations': 1024},
         }
         kwargs.update(presets[preset])
         return self.tts(text, voice_samples, **kwargs)
@@ -281,11 +270,11 @@ class TextToSpeech:
         # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
         # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
         # results, but will increase memory usage.
-        self.…
-        best_latents = self.…
-                                           torch.tensor([best_results.shape[-1]*self.…
+        self.autoregressive = self.autoregressive.cuda()
+        best_latents = self.autoregressive(conds, text, torch.tensor([text.shape[-1]], device=conds.device), best_results,
+                                           torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=conds.device),
                                            return_latent=True, clip_inputs=False)
-        self.…
+        self.autoregressive = self.autoregressive.cpu()
 
         print("Performing vocoding..")
         wav_candidates = []
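Note: the preset table now covers four sampling budgets, with 'high_quality' mapped to 512 autoregressive samples and 1024 diffusion iterations. A hedged usage sketch follows; the import paths, the conditioning-clip path, and the text string are assumptions for illustration only.

    from api import TextToSpeech          # assumed import path for this repository
    from utils.audio import load_audio    # assumed import path; do_tts.py and read.py use the same helper

    tts = TextToSpeech()
    conds = [load_audio('voices/patrick_stewart/1.wav', 22050)]  # hypothetical conditioning clip
    # 'high_quality' pulls num_autoregressive_samples=512 and diffusion_iterations=1024 from the preset table above.
    gen = tts.tts_with_preset("Example sentence to synthesize.", conds, preset='high_quality')

The last hunk also drops the separate autoregressive_for_diffusion copy: the retained self.autoregressive is moved onto the GPU only for the latent re-computation and back to the CPU afterwards, so only one large model occupies VRAM at a time.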
data/mel_norms.pth
CHANGED
Binary files a/data/mel_norms.pth and b/data/mel_norms.pth differ
do_tts.py
CHANGED
@@ -11,6 +11,10 @@ if __name__ == '__main__':
     parser.add_argument('--text', type=str, help='Text to speak.', default="I am a language model that has learned to speak.")
     parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
                                                   'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='patrick_stewart')
+    parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
+    parser.add_argument('--voice_diversity_intelligibility_slider', type=float,
+                        help='How to balance vocal diversity with the quality/intelligibility of the spoken text. 0 means highly diverse voice (not recommended), 1 means maximize intelligibility',
+                        default=.5)
     parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/')
     args = parser.parse_args()
     os.makedirs(args.output_path, exist_ok=True)
@@ -25,6 +29,6 @@ if __name__ == '__main__':
     for cond_path in cond_paths:
         c = load_audio(cond_path, 22050)
         conds.append(c)
-    gen = tts.tts_with_preset(args.text, conds, preset=…
+    gen = tts.tts_with_preset(args.text, conds, preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
     torchaudio.save(os.path.join(args.output_path, f'{voice}.wav'), gen.squeeze(0).cpu(), 24000)
 
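Note: the two new do_tts.py flags are passed straight through to tts_with_preset. A rough, hedged equivalent of what the script now does for a single voice; the import lines sit above the hunks shown here, so their exact paths (and the conditioning-clip path) are assumptions.

    import os
    import torchaudio
    from api import TextToSpeech          # assumed import path
    from utils.audio import load_audio    # assumed import path

    tts = TextToSpeech()
    voice = 'patrick_stewart'
    cond_paths = [f'voices/{voice}/1.wav']                      # hypothetical conditioning clip
    conds = [load_audio(p, 22050) for p in cond_paths]
    gen = tts.tts_with_preset("I am a language model that has learned to speak.",
                              conds,
                              preset='standard',                # --preset
                              clvp_cvvp_slider=0.5)             # --voice_diversity_intelligibility_slider
    os.makedirs('results/', exist_ok=True)
    torchaudio.save(os.path.join('results/', f'{voice}.wav'), gen.squeeze(0).cpu(), 24000)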
models/arch_util.py
CHANGED
@@ -5,8 +5,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torchaudio
-from …
-from x_transformers.x_transformers import RelativePositionBias
+from models.xtransformers import ContinuousTransformerWrapper, RelativePositionBias
 
 
 def zero_module(module):
models/clvp.py
CHANGED
@@ -2,10 +2,10 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import einsum
-from x_transformers import Encoder
 
 from models.arch_util import CheckpointedXTransformerEncoder
 from models.transformer import Transformer
+from models.xtransformers import Encoder
 
 
 def exists(val):
models/xtransformers.py
CHANGED
@@ -1253,50 +1253,3 @@ class ContinuousTransformerWrapper(nn.Module):
             return tuple(res)
         return res[0]
 
-
-class XTransformer(nn.Module):
-    def __init__(
-            self,
-            *,
-            dim,
-            tie_token_emb=False,
-            **kwargs
-    ):
-        super().__init__()
-        enc_kwargs, kwargs = groupby_prefix_and_trim('enc_', kwargs)
-        dec_kwargs, kwargs = groupby_prefix_and_trim('dec_', kwargs)
-
-        assert 'dim' not in enc_kwargs and 'dim' not in dec_kwargs, 'dimension of either encoder or decoder must be set with `dim` keyword'
-        enc_transformer_kwargs = pick_and_pop(['num_tokens', 'max_seq_len'], enc_kwargs)
-        enc_transformer_kwargs['emb_dropout'] = enc_kwargs.pop('emb_dropout', 0)
-        enc_transformer_kwargs['num_memory_tokens'] = enc_kwargs.pop('num_memory_tokens', None)
-        enc_transformer_kwargs['use_pos_emb'] = enc_kwargs.pop('use_pos_emb', True)
-
-        dec_transformer_kwargs = pick_and_pop(['num_tokens', 'max_seq_len'], dec_kwargs)
-        dec_transformer_kwargs['emb_dropout'] = dec_kwargs.pop('emb_dropout', 0)
-        dec_transformer_kwargs['use_pos_emb'] = dec_kwargs.pop('use_pos_emb', True)
-
-        self.encoder = TransformerWrapper(
-            **enc_transformer_kwargs,
-            attn_layers=Encoder(dim=dim, **enc_kwargs)
-        )
-
-        self.decoder = TransformerWrapper(
-            **dec_transformer_kwargs,
-            attn_layers=Decoder(dim=dim, cross_attend=True, **dec_kwargs)
-        )
-
-        if tie_token_emb:
-            self.decoder.token_emb = self.encoder.token_emb
-
-        self.decoder = AutoregressiveWrapper(self.decoder)
-
-    @torch.no_grad()
-    def generate(self, seq_in, seq_out_start, seq_len, src_mask=None, src_attn_mask=None, **kwargs):
-        encodings = self.encoder(seq_in, mask=src_mask, attn_mask=src_attn_mask, return_embeddings=True)
-        return self.decoder.generate(seq_out_start, seq_len, context=encodings, context_mask=src_mask, **kwargs)
-
-    def forward(self, src, tgt, src_mask=None, tgt_mask=None, src_attn_mask=None):
-        enc = self.encoder(src, mask=src_mask, attn_mask=src_attn_mask, return_embeddings=True)
-        out = self.decoder(tgt, context=enc, mask=tgt_mask, context_mask=src_mask)
-        return out
read.py
CHANGED
@@ -28,11 +28,14 @@ def split_and_recombine_text(texts, desired_length=200, max_len=300):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--textfile', type=str, help='A file containing the text to read.', default="data/…
+    parser.add_argument('--textfile', type=str, help='A file containing the text to read.', default="data/riding_hood.txt")
     parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
                                                   'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='patrick_stewart')
     parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/longform/')
-    parser.add_argument('--…
+    parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
+    parser.add_argument('--voice_diversity_intelligibility_slider', type=float,
+                        help='How to balance vocal diversity with the quality/intelligibility of the spoken text. 0 means highly diverse voice (not recommended), 1 means maximize intelligibility',
+                        default=.5)
     args = parser.parse_args()
 
     outpath = args.output_path
@@ -60,16 +63,11 @@ if __name__ == '__main__':
     if not cond_paths:
         print('Error: no valid voices specified. Try again.')
 
-
+    conds = []
+    for cond_path in cond_paths:
+        c = load_audio(cond_path, 22050)
+        conds.append(c)
     for j, text in enumerate(texts):
-        conds = []
-        for cond_path in cond_paths:
-            c = load_audio(cond_path, 22050)
-            conds.append(c)
-        gen = tts.tts_with_preset(text, conds, preset=args.generation_preset)
+        gen = tts.tts_with_preset(text, conds, preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
         torchaudio.save(os.path.join(voice_outpath, f'{j}.wav'), gen.squeeze(0).cpu(), 24000)
 
-        priors.append(torchaudio.functional.resample(gen, 24000, 22050).squeeze(0))
-        while len(priors) > 2:
-            priors.pop(0)
-
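Note: loading the conditioning clips is hoisted out of the per-chunk loop, so each voice's clips are decoded once per run rather than once per text chunk, and the interim priors bookkeeping is dropped. With the surrounding variables (cond_paths, texts, voice_outpath, tts, args) taken from the rest of read.py and the load_audio import path assumed, the reshaped loop reads roughly as follows.

    import os
    import torchaudio
    from utils.audio import load_audio    # assumed import path, matching the rest of the repository

    conds = []
    for cond_path in cond_paths:
        c = load_audio(cond_path, 22050)   # decode each conditioning clip once per voice
        conds.append(c)

    for j, text in enumerate(texts):
        gen = tts.tts_with_preset(text, conds, preset=args.preset,
                                  clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
        torchaudio.save(os.path.join(voice_outpath, f'{j}.wav'), gen.squeeze(0).cpu(), 24000)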
requirements.txt
CHANGED
@@ -6,5 +6,4 @@ tokenizers
 inflect
 progressbar
 einops
-unidecode
-x-transformers
+unidecode