Spaces:
Sleeping
Sleeping
updated dockerfile and added scripts
Browse files- Dockerfile +36 -47
- scripts/tortoise_tts.py +390 -0
- setup.py +40 -0
- tortoise_tts.ipynb +268 -0
Dockerfile
CHANGED
@@ -1,47 +1,36 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
ENV
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
echo "
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
conda install
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
# Make port 8501 available to the world outside this container
|
38 |
-
EXPOSE 8501
|
39 |
-
|
40 |
-
# Define environment variable
|
41 |
-
ENV NAME tortoise-tts
|
42 |
-
|
43 |
-
# List the contents of the /app directory
|
44 |
-
RUN ls -al /app
|
45 |
-
|
46 |
-
# Run the application
|
47 |
-
CMD ["streamlit", "run", "app.py"]
|
|
|
1 |
+
FROM nvidia/cuda:12.2.0-base-ubuntu22.04
|
2 |
+
|
3 |
+
COPY . /app
|
4 |
+
|
5 |
+
RUN apt-get update && \
|
6 |
+
apt-get install -y --allow-unauthenticated --no-install-recommends \
|
7 |
+
wget \
|
8 |
+
git \
|
9 |
+
&& apt-get autoremove -y \
|
10 |
+
&& apt-get clean -y \
|
11 |
+
&& rm -rf /var/lib/apt/lists/*
|
12 |
+
|
13 |
+
ENV HOME "/root"
|
14 |
+
ENV CONDA_DIR "${HOME}/miniconda"
|
15 |
+
ENV PATH="$CONDA_DIR/bin":$PATH
|
16 |
+
ENV CONDA_AUTO_UPDATE_CONDA=false
|
17 |
+
ENV PIP_DOWNLOAD_CACHE="$HOME/.pip/cache"
|
18 |
+
ENV TORTOISE_MODELS_DIR
|
19 |
+
|
20 |
+
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh \
|
21 |
+
&& bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u \
|
22 |
+
&& "${CONDA_DIR}/bin/conda" init bash \
|
23 |
+
&& rm -f /tmp/miniconda3.sh \
|
24 |
+
&& echo ". '${CONDA_DIR}/etc/profile.d/conda.sh'" >> "${HOME}/.profile"
|
25 |
+
|
26 |
+
# --login option used to source bashrc (thus activating conda env) at every RUN statement
|
27 |
+
SHELL ["/bin/bash", "--login", "-c"]
|
28 |
+
|
29 |
+
RUN conda create --name tortoise python=3.9 numba inflect \
|
30 |
+
&& conda activate tortoise \
|
31 |
+
&& conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia \
|
32 |
+
&& conda install transformers=4.29.2 \
|
33 |
+
&& conda install streamlit \
|
34 |
+
&& cd /app \
|
35 |
+
&& python setup.py install \
|
36 |
+
&& streamlit run app.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/tortoise_tts.py
ADDED
@@ -0,0 +1,390 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# AGPL: a notification must be added stating that changes have been made to that file.
|
3 |
+
|
4 |
+
import os
|
5 |
+
import sys
|
6 |
+
import tempfile
|
7 |
+
from dataclasses import dataclass
|
8 |
+
from pathlib import Path
|
9 |
+
from typing import Literal, Optional
|
10 |
+
|
11 |
+
import torch
|
12 |
+
import torchaudio
|
13 |
+
from simple_parsing import ArgumentParser, field
|
14 |
+
|
15 |
+
from tortoise.api import MODELS_DIR, TextToSpeech
|
16 |
+
from tortoise.utils.audio import load_audio
|
17 |
+
from tortoise.utils.diffusion import SAMPLERS
|
18 |
+
from tortoise.models.vocoder import VocConf
|
19 |
+
|
20 |
+
|
21 |
+
@dataclass
|
22 |
+
class General:
|
23 |
+
"""General options"""
|
24 |
+
|
25 |
+
text: str = field(positional=True, nargs="*", metavar="text")
|
26 |
+
"""Text to speak. If omitted, text is read from stdin."""
|
27 |
+
|
28 |
+
voice: str = field(default="random", alias=["-v"])
|
29 |
+
"""Selects the voice to use for generation. Use the & character to join two voices together.
|
30 |
+
Use a comma to perform inference on multiple voices. Set to "all" to use all available voices.
|
31 |
+
Note that multiple voices require the --output-dir option to be set."""
|
32 |
+
|
33 |
+
voices_dir: Optional[str] = field(default=None, alias=["-V"])
|
34 |
+
"""Path to directory containing extra voices to be loaded. Use a comma to specify multiple directories."""
|
35 |
+
|
36 |
+
preset: Literal["ultra_fast", "fast", "standard", "high_quality"] = field(
|
37 |
+
default="fast", alias=["-p"]
|
38 |
+
)
|
39 |
+
"""Which voice quality preset to use."""
|
40 |
+
|
41 |
+
quiet: bool = field(default=False, alias=["-q"])
|
42 |
+
"""Suppress all output."""
|
43 |
+
|
44 |
+
voicefixer: bool = field(default=True)
|
45 |
+
"""Enable/Disable voicefixer"""
|
46 |
+
|
47 |
+
|
48 |
+
@dataclass
|
49 |
+
class Output:
|
50 |
+
"""Output options"""
|
51 |
+
|
52 |
+
list_voices: bool = field(default=False, alias=["-l"])
|
53 |
+
"""List available voices and exit."""
|
54 |
+
|
55 |
+
play: bool = field(default=False, alias=["-P"])
|
56 |
+
"""Play the audio (requires pydub)."""
|
57 |
+
|
58 |
+
output: Optional[Path] = field(default=None, alias=["-o"])
|
59 |
+
"""Save the audio to a file."""
|
60 |
+
|
61 |
+
output_dir: Path = field(default=Path("results/"), alias=["-O"])
|
62 |
+
"""Save the audio to a directory as individual segments."""
|
63 |
+
|
64 |
+
|
65 |
+
@dataclass
|
66 |
+
class MultiOutput:
|
67 |
+
"""Multi-output options"""
|
68 |
+
|
69 |
+
candidates: int = 1
|
70 |
+
"""How many output candidates to produce per-voice. Note that only the first candidate is used in the combined output."""
|
71 |
+
|
72 |
+
regenerate: Optional[str] = None
|
73 |
+
"""Comma-separated list of clip numbers to re-generate."""
|
74 |
+
|
75 |
+
skip_existing: bool = False
|
76 |
+
"""Set to skip re-generating existing clips."""
|
77 |
+
|
78 |
+
|
79 |
+
@dataclass
|
80 |
+
class Advanced:
|
81 |
+
"""Advanced options"""
|
82 |
+
|
83 |
+
produce_debug_state: bool = False
|
84 |
+
"""Whether or not to produce debug_states in current directory, which can aid in reproducing problems."""
|
85 |
+
|
86 |
+
seed: Optional[int] = None
|
87 |
+
"""Random seed which can be used to reproduce results."""
|
88 |
+
|
89 |
+
models_dir: str = MODELS_DIR
|
90 |
+
"""Where to find pretrained model checkpoints. Tortoise automatically downloads these to
|
91 |
+
~/.cache/tortoise/.models, so this should only be specified if you have custom checkpoints."""
|
92 |
+
|
93 |
+
text_split: Optional[str] = None
|
94 |
+
"""How big chunks to split the text into, in the format <desired_length>,<max_length>."""
|
95 |
+
|
96 |
+
disable_redaction: bool = False
|
97 |
+
"""Normally text enclosed in brackets is automatically redacted from the spoken output
|
98 |
+
(but are still rendered by the model), this can be used for prompt engineering.
|
99 |
+
Set this to disable this behavior."""
|
100 |
+
|
101 |
+
device: Optional[str] = None
|
102 |
+
"""Device to use for inference."""
|
103 |
+
|
104 |
+
batch_size: Optional[int] = None
|
105 |
+
"""Batch size to use for inference. If omitted, the batch size is set based on available GPU memory."""
|
106 |
+
|
107 |
+
vocoder: Literal["Univnet", "BigVGAN", "BigVGAN_Base"] = "BigVGAN_Base"
|
108 |
+
"""Pretrained vocoder to be used.
|
109 |
+
Univnet - tortoise original
|
110 |
+
BigVGAN - 112M model
|
111 |
+
BigVGAN_Base - 14M model
|
112 |
+
"""
|
113 |
+
|
114 |
+
ar_checkpoint: Optional[str] = None
|
115 |
+
"""Path to a checkpoint to use for the autoregressive model. If omitted, the default checkpoint is used."""
|
116 |
+
|
117 |
+
clvp_checkpoint: Optional[str] = None
|
118 |
+
"""Path to a checkpoint to use for the CLVP model. If omitted, the default checkpoint is used."""
|
119 |
+
|
120 |
+
diff_checkpoint: Optional[str] = None
|
121 |
+
"""Path to a checkpoint to use for the diffusion model. If omitted, the default checkpoint is used."""
|
122 |
+
|
123 |
+
|
124 |
+
@dataclass
|
125 |
+
class Tuning:
|
126 |
+
"""Tuning options (overrides preset settings)"""
|
127 |
+
|
128 |
+
num_autoregressive_samples: Optional[int] = None
|
129 |
+
"""Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
|
130 |
+
As TorToiSe is a probabilistic model, more samples means a higher probability of creating something "great"."""
|
131 |
+
|
132 |
+
temperature: Optional[float] = None
|
133 |
+
"""The softmax temperature of the autoregressive model."""
|
134 |
+
|
135 |
+
length_penalty: Optional[float] = None
|
136 |
+
"""A length penalty applied to the autoregressive decoder. Higher settings cause the model to produce more terse outputs."""
|
137 |
+
|
138 |
+
repetition_penalty: Optional[float] = None
|
139 |
+
"""A penalty that prevents the autoregressive decoder from repeating itself during decoding.
|
140 |
+
Can be used to reduce the incidence of long silences or "uhhhhhhs", etc."""
|
141 |
+
|
142 |
+
top_p: Optional[float] = None
|
143 |
+
"""P value used in nucleus sampling. 0 to 1. Lower values mean the decoder produces more "likely" (aka boring) outputs."""
|
144 |
+
|
145 |
+
max_mel_tokens: Optional[int] = None
|
146 |
+
"""Restricts the output length. 1 to 600. Each unit is 1/20 of a second."""
|
147 |
+
|
148 |
+
cvvp_amount: Optional[float] = None
|
149 |
+
"""How much the CVVP model should influence the output.
|
150 |
+
Increasing this can in some cases reduce the likelihood of multiple speakers."""
|
151 |
+
|
152 |
+
diffusion_iterations: Optional[int] = None
|
153 |
+
"""Number of diffusion steps to perform. More steps means the network has more chances to iteratively
|
154 |
+
refine the output, which should theoretically mean a higher quality output.
|
155 |
+
Generally a value above 250 is not noticeably better, however."""
|
156 |
+
|
157 |
+
cond_free: Optional[bool] = None
|
158 |
+
"""Whether or not to perform conditioning-free diffusion. Conditioning-free diffusion performs two forward passes for
|
159 |
+
each diffusion step: one with the outputs of the autoregressive model and one with no conditioning priors. The output
|
160 |
+
of the two is blended according to the cond_free_k value below. Conditioning-free diffusion is the real deal, and
|
161 |
+
dramatically improves realism."""
|
162 |
+
|
163 |
+
cond_free_k: Optional[float] = None
|
164 |
+
"""Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
|
165 |
+
As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
|
166 |
+
Formula is: output=cond_present_output*(cond_free_k+1)-cond_absent_output*cond_free_k"""
|
167 |
+
|
168 |
+
diffusion_temperature: Optional[float] = None
|
169 |
+
"""Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
|
170 |
+
are the "mean" prediction of the diffusion network and will sound bland and smeared."""
|
171 |
+
|
172 |
+
|
173 |
+
@dataclass
|
174 |
+
class Speed:
|
175 |
+
"""New/speed options"""
|
176 |
+
|
177 |
+
low_vram: bool = False
|
178 |
+
"""re-enable default offloading behaviour of tortoise"""
|
179 |
+
|
180 |
+
half: bool = False
|
181 |
+
"""enable autocast to half precision for autoregressive model"""
|
182 |
+
|
183 |
+
no_cache: bool = False
|
184 |
+
"""disable kv_cache usage. This should really only be used if you are very low on vram."""
|
185 |
+
|
186 |
+
sampler: Optional[str] = field(default=None, choices=SAMPLERS)
|
187 |
+
"""override the sampler used for diffusion (default depends on --preset)"""
|
188 |
+
|
189 |
+
original_tortoise: bool = False
|
190 |
+
"""ensure results are identical to original tortoise-tts repo"""
|
191 |
+
|
192 |
+
|
193 |
+
if __name__ == "__main__":
|
194 |
+
parser = ArgumentParser(
|
195 |
+
description="TorToiSe is a text-to-speech program that is capable of synthesizing speech "
|
196 |
+
"in multiple voices with realistic prosody and intonation."
|
197 |
+
)
|
198 |
+
# bugs out for some reason
|
199 |
+
# parser.add_argument(
|
200 |
+
# "--web",
|
201 |
+
# action="store_true",
|
202 |
+
# help="launch the webui (doesn't pass it the other arguments)",
|
203 |
+
# )
|
204 |
+
parser.add_arguments(General, "general")
|
205 |
+
parser.add_arguments(Output, "output")
|
206 |
+
parser.add_arguments(MultiOutput, "multi_output")
|
207 |
+
parser.add_arguments(Advanced, "advanced")
|
208 |
+
parser.add_arguments(Tuning, "tuning")
|
209 |
+
parser.add_arguments(Speed, "speed")
|
210 |
+
|
211 |
+
usage_examples = f"""
|
212 |
+
Examples:
|
213 |
+
|
214 |
+
Read text using random voice and place it in a file:
|
215 |
+
|
216 |
+
{parser.prog} -o hello.wav "Hello, how are you?"
|
217 |
+
|
218 |
+
Read text from stdin and play it using the tom voice:
|
219 |
+
|
220 |
+
echo "Say it like you mean it!" | {parser.prog} -P -v tom
|
221 |
+
|
222 |
+
Read a text file using multiple voices and save the audio clips to a directory:
|
223 |
+
|
224 |
+
{parser.prog} -O /tmp/tts-results -v tom,emma <textfile.txt
|
225 |
+
"""
|
226 |
+
|
227 |
+
# show usage even when Ctrl+C is pressed early
|
228 |
+
try:
|
229 |
+
args = parser.parse_args()
|
230 |
+
except SystemExit as e:
|
231 |
+
if e.code == 0:
|
232 |
+
print(usage_examples)
|
233 |
+
sys.exit(e.code)
|
234 |
+
# bugs out for some reason
|
235 |
+
# if args.web:
|
236 |
+
# from importlib import import_module
|
237 |
+
# app = import_module("app")
|
238 |
+
# sys.exit(app.main())
|
239 |
+
|
240 |
+
from tortoise.inference import (
|
241 |
+
check_pydub,
|
242 |
+
get_all_voices,
|
243 |
+
get_seed,
|
244 |
+
parse_multiarg_text,
|
245 |
+
parse_voice_str,
|
246 |
+
split_text,
|
247 |
+
validate_output_dir,
|
248 |
+
voice_loader,
|
249 |
+
save_gen_with_voicefix
|
250 |
+
)
|
251 |
+
|
252 |
+
# get voices
|
253 |
+
all_voices, extra_voice_dirs = get_all_voices(args.general.voices_dir)
|
254 |
+
if args.output.list_voices:
|
255 |
+
for v in all_voices:
|
256 |
+
print(v)
|
257 |
+
sys.exit(0)
|
258 |
+
selected_voices = parse_voice_str(args.general.voice, all_voices)
|
259 |
+
voice_generator = voice_loader(selected_voices, extra_voice_dirs)
|
260 |
+
|
261 |
+
# parse text
|
262 |
+
if not args.general.text:
|
263 |
+
print("reading text from stdin!")
|
264 |
+
text = parse_multiarg_text(args.general.text)
|
265 |
+
texts = split_text(text, args.advanced.text_split)
|
266 |
+
|
267 |
+
output_dir = validate_output_dir(
|
268 |
+
args.output.output_dir, selected_voices, args.multi_output.candidates
|
269 |
+
)
|
270 |
+
|
271 |
+
# error out early if pydub isn't installed
|
272 |
+
pydub = check_pydub(args.output.play)
|
273 |
+
|
274 |
+
seed = get_seed(args.advanced.seed)
|
275 |
+
verbose = not args.general.quiet
|
276 |
+
|
277 |
+
vocoder = getattr(VocConf, args.advanced.vocoder)
|
278 |
+
if verbose:
|
279 |
+
print("Loading tts...")
|
280 |
+
tts = TextToSpeech(
|
281 |
+
models_dir=args.advanced.models_dir,
|
282 |
+
enable_redaction=not args.advanced.disable_redaction,
|
283 |
+
device=args.advanced.device,
|
284 |
+
autoregressive_batch_size=args.advanced.batch_size,
|
285 |
+
high_vram=not args.speed.low_vram,
|
286 |
+
kv_cache=not args.speed.no_cache,
|
287 |
+
ar_checkpoint=args.advanced.ar_checkpoint,
|
288 |
+
clvp_checkpoint=args.advanced.clvp_checkpoint,
|
289 |
+
diff_checkpoint=args.advanced.diff_checkpoint,
|
290 |
+
vocoder=vocoder,
|
291 |
+
)
|
292 |
+
|
293 |
+
gen_settings = {
|
294 |
+
"use_deterministic_seed": seed,
|
295 |
+
"verbose": verbose,
|
296 |
+
"k": args.multi_output.candidates,
|
297 |
+
"preset": args.general.preset,
|
298 |
+
}
|
299 |
+
tuning_options = [
|
300 |
+
"num_autoregressive_samples",
|
301 |
+
"temperature",
|
302 |
+
"length_penalty",
|
303 |
+
"repetition_penalty",
|
304 |
+
"top_p",
|
305 |
+
"max_mel_tokens",
|
306 |
+
"cvvp_amount",
|
307 |
+
"diffusion_iterations",
|
308 |
+
"cond_free",
|
309 |
+
"cond_free_k",
|
310 |
+
"diffusion_temperature",
|
311 |
+
]
|
312 |
+
for option in tuning_options:
|
313 |
+
if getattr(args.tuning, option) is not None:
|
314 |
+
gen_settings[option] = getattr(args.tuning, option)
|
315 |
+
|
316 |
+
speed_options = [
|
317 |
+
"sampler",
|
318 |
+
"original_tortoise",
|
319 |
+
"half",
|
320 |
+
]
|
321 |
+
for option in speed_options:
|
322 |
+
if getattr(args.speed, option) is not None:
|
323 |
+
gen_settings[option] = getattr(args.speed, option)
|
324 |
+
|
325 |
+
total_clips = len(texts) * len(selected_voices)
|
326 |
+
regenerate_clips = (
|
327 |
+
[int(x) for x in args.multi_output.regenerate.split(",")]
|
328 |
+
if args.multi_output.regenerate
|
329 |
+
else None
|
330 |
+
)
|
331 |
+
for voice_idx, (voice, voice_samples, conditioning_latents) in enumerate(
|
332 |
+
voice_generator
|
333 |
+
):
|
334 |
+
audio_parts = []
|
335 |
+
for text_idx, text in enumerate(texts):
|
336 |
+
clip_name = f'{"-".join(voice)}_{text_idx:02d}'
|
337 |
+
if args.output.output_dir:
|
338 |
+
first_clip = os.path.join(args.output.output_dir, f"{clip_name}_00.wav")
|
339 |
+
if (
|
340 |
+
args.multi_output.skip_existing
|
341 |
+
or (regenerate_clips and text_idx not in regenerate_clips)
|
342 |
+
) and os.path.exists(first_clip):
|
343 |
+
audio_parts.append(load_audio(first_clip, 24000))
|
344 |
+
if verbose:
|
345 |
+
print(f"Skipping {clip_name}")
|
346 |
+
continue
|
347 |
+
if verbose:
|
348 |
+
print(
|
349 |
+
f"Rendering {clip_name} ({(voice_idx * len(texts) + text_idx + 1)} of {total_clips})..."
|
350 |
+
)
|
351 |
+
print(" " + text)
|
352 |
+
gen = tts.tts_with_preset(
|
353 |
+
text,
|
354 |
+
voice_samples=voice_samples,
|
355 |
+
conditioning_latents=conditioning_latents,
|
356 |
+
**gen_settings,
|
357 |
+
)
|
358 |
+
gen = gen if args.multi_output.candidates > 1 else [gen]
|
359 |
+
for candidate_idx, audio in enumerate(gen):
|
360 |
+
audio = audio.squeeze(0).cpu()
|
361 |
+
if candidate_idx == 0:
|
362 |
+
audio_parts.append(audio)
|
363 |
+
if args.output.output_dir:
|
364 |
+
filename = f"{clip_name}_{candidate_idx:02d}.wav"
|
365 |
+
save_gen_with_voicefix(audio, os.path.join(args.output.output_dir, filename), squeeze=False, voicefixer=args.general.voicefixer)
|
366 |
+
|
367 |
+
audio = torch.cat(audio_parts, dim=-1)
|
368 |
+
if args.output.output_dir:
|
369 |
+
filename = f'{"-".join(voice)}_combined.wav'
|
370 |
+
save_gen_with_voicefix(
|
371 |
+
audio,
|
372 |
+
os.path.join(args.output.output_dir, filename),
|
373 |
+
squeeze=False,
|
374 |
+
voicefixer=args.general.voicefixer,
|
375 |
+
)
|
376 |
+
elif args.output.output:
|
377 |
+
filename = args.output.output or os.tmp
|
378 |
+
save_gen_with_voicefix(audio, filename, squeeze=False, voicefixer=args.general.voicefixer)
|
379 |
+
elif args.output.play:
|
380 |
+
print("WARNING: cannot use voicefixer with --play")
|
381 |
+
f = tempfile.NamedTemporaryFile(suffix=".wav", delete=True)
|
382 |
+
torchaudio.save(f.name, audio, 24000)
|
383 |
+
pydub.playback.play(pydub.AudioSegment.from_wav(f.name))
|
384 |
+
|
385 |
+
if args.advanced.produce_debug_state:
|
386 |
+
os.makedirs("debug_states", exist_ok=True)
|
387 |
+
dbg_state = (seed, texts, voice_samples, conditioning_latents, args)
|
388 |
+
torch.save(
|
389 |
+
dbg_state, os.path.join("debug_states", f'debug_{"-".join(voice)}.pth')
|
390 |
+
)
|
setup.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import setuptools
|
2 |
+
|
3 |
+
with open("README.md", "r", encoding="utf-8") as fh:
|
4 |
+
long_description = fh.read()
|
5 |
+
|
6 |
+
setuptools.setup(
|
7 |
+
name="TorToiSe",
|
8 |
+
packages=setuptools.find_packages(),
|
9 |
+
version="2.8.0",
|
10 |
+
author="James Betker",
|
11 |
+
author_email="[email protected]",
|
12 |
+
description="A high quality multi-voice text-to-speech library",
|
13 |
+
long_description=long_description,
|
14 |
+
long_description_content_type="text/markdown",
|
15 |
+
url="https://github.com/neonbjb/tortoise-tts",
|
16 |
+
project_urls={},
|
17 |
+
scripts=[
|
18 |
+
'scripts/tortoise_tts.py',
|
19 |
+
],
|
20 |
+
include_package_data=True,
|
21 |
+
install_requires=[
|
22 |
+
'tqdm',
|
23 |
+
'rotary_embedding_torch',
|
24 |
+
'inflect',
|
25 |
+
'progressbar',
|
26 |
+
'einops',
|
27 |
+
'unidecode',
|
28 |
+
'scipy',
|
29 |
+
'librosa',
|
30 |
+
'transformers==4.29.2',
|
31 |
+
'tokenizers',
|
32 |
+
'deepspeed==0.8.3',
|
33 |
+
],
|
34 |
+
classifiers=[
|
35 |
+
"Programming Language :: Python :: 3",
|
36 |
+
"License :: OSI Approved :: Apache Software License",
|
37 |
+
"Operating System :: OS Independent",
|
38 |
+
],
|
39 |
+
python_requires=">=3.6",
|
40 |
+
)
|
tortoise_tts.ipynb
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {
|
6 |
+
"id": "_pIZ3ZXNp7cf"
|
7 |
+
},
|
8 |
+
"source": [
|
9 |
+
"Welcome to Tortoise! 🐢🐢🐢🐢\n",
|
10 |
+
"\n",
|
11 |
+
"Before you begin, I **strongly** recommend you turn on a GPU runtime.\n",
|
12 |
+
"\n",
|
13 |
+
"There's a reason this is called \"Tortoise\" - this model takes up to a minute to perform inference for a single sentence on a GPU. Expect waits on the order of hours on a CPU."
|
14 |
+
]
|
15 |
+
},
|
16 |
+
{
|
17 |
+
"cell_type": "code",
|
18 |
+
"execution_count": null,
|
19 |
+
"metadata": {
|
20 |
+
"colab": {
|
21 |
+
"base_uri": "https://localhost:8080/",
|
22 |
+
"height": 1000
|
23 |
+
},
|
24 |
+
"id": "JrK20I32grP6",
|
25 |
+
"outputId": "9711e23e-3bfc-4cb0-c030-25a1cf460972"
|
26 |
+
},
|
27 |
+
"outputs": [],
|
28 |
+
"source": [
|
29 |
+
"!git clone https://github.com/DjKesu/tortoise-tts-fast-cloning.git\n",
|
30 |
+
"%cd tortoise-tts-fast-cloning\n",
|
31 |
+
"!pip3 install -r requirements.txt --no-deps\n",
|
32 |
+
"!pip3 install -e .\n",
|
33 |
+
"!pip3 install git+https://github.com/152334H/BigVGAN.git"
|
34 |
+
]
|
35 |
+
},
|
36 |
+
{
|
37 |
+
"cell_type": "code",
|
38 |
+
"execution_count": null,
|
39 |
+
"metadata": {},
|
40 |
+
"outputs": [],
|
41 |
+
"source": [
|
42 |
+
"!pip uninstall transformers\n",
|
43 |
+
"!pip install transformers==4.29.2"
|
44 |
+
]
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"cell_type": "markdown",
|
48 |
+
"metadata": {
|
49 |
+
"id": "zRW4p3ftjZ3Y"
|
50 |
+
},
|
51 |
+
"source": [
|
52 |
+
"## **Restart the runtime!**\n",
|
53 |
+
"## Ctrl+M for Colab"
|
54 |
+
]
|
55 |
+
},
|
56 |
+
{
|
57 |
+
"cell_type": "code",
|
58 |
+
"execution_count": null,
|
59 |
+
"metadata": {
|
60 |
+
"id": "Gen09NM4hONQ"
|
61 |
+
},
|
62 |
+
"outputs": [],
|
63 |
+
"source": [
|
64 |
+
"#@title # Setup\n",
|
65 |
+
"# Imports used through the rest of the notebook.\n",
|
66 |
+
"import torch\n",
|
67 |
+
"import torchaudio\n",
|
68 |
+
"import torch.nn as nn\n",
|
69 |
+
"import torch.nn.functional as F\n",
|
70 |
+
"\n",
|
71 |
+
"import IPython\n",
|
72 |
+
"\n",
|
73 |
+
"from tortoise.api import TextToSpeech\n",
|
74 |
+
"from tortoise.utils.audio import load_audio, load_voice, load_voices\n",
|
75 |
+
"\n",
|
76 |
+
"# This will download all the models used by Tortoise from the HuggingFace hub.\n",
|
77 |
+
"tts = TextToSpeech()"
|
78 |
+
]
|
79 |
+
},
|
80 |
+
{
|
81 |
+
"cell_type": "code",
|
82 |
+
"execution_count": null,
|
83 |
+
"metadata": {
|
84 |
+
"id": "bt_aoxONjfL2"
|
85 |
+
},
|
86 |
+
"outputs": [],
|
87 |
+
"source": [
|
88 |
+
"# This is the text that will be spoken.\n",
|
89 |
+
"text = \"Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?\" #@param {type:\"string\"}\n",
|
90 |
+
"#@markdown Show code for multiline text input\n",
|
91 |
+
"# Here's something for the poetically inclined.. (set text=)\n",
|
92 |
+
"\"\"\"\n",
|
93 |
+
"Then took the other, as just as fair,\n",
|
94 |
+
"And having perhaps the better claim,\n",
|
95 |
+
"Because it was grassy and wanted wear;\n",
|
96 |
+
"Though as for that the passing there\n",
|
97 |
+
"Had worn them really about the same,\"\"\"\n",
|
98 |
+
"\n",
|
99 |
+
"# Pick a \"preset mode\" to determine quality. Options: {\"ultra_fast\", \"fast\" (default), \"standard\", \"high_quality\"}. See docs in api.py\n",
|
100 |
+
"# added very_fast preset param option, since it involves resolution with dpm++2m, expected to give best, fastest results\n",
|
101 |
+
"preset = \"ultra_fast\" #@param [\"ultra_fast\", \"fast\", \"standard\", \"high_quality\", \"very_fast\"]"
|
102 |
+
]
|
103 |
+
},
|
104 |
+
{
|
105 |
+
"cell_type": "code",
|
106 |
+
"execution_count": null,
|
107 |
+
"metadata": {
|
108 |
+
"colab": {
|
109 |
+
"base_uri": "https://localhost:8080/",
|
110 |
+
"height": 211
|
111 |
+
},
|
112 |
+
"id": "SSleVnRAiEE2",
|
113 |
+
"outputId": "45b950c7-5c39-4075-bb34-0a76bf19e1bc"
|
114 |
+
},
|
115 |
+
"outputs": [],
|
116 |
+
"source": [
|
117 |
+
"#@markdown Tortoise will attempt to mimic voices you provide. It comes pre-packaged\n",
|
118 |
+
"#@markdown with some voices you might recognize.\n",
|
119 |
+
"\n",
|
120 |
+
"#@markdown Let's list all the voices available. These are just some random clips I've gathered\n",
|
121 |
+
"#@markdown from the internet as well as a few voices from the training dataset.\n",
|
122 |
+
"#@markdown Feel free to add your own clips to the voices/ folder.\n",
|
123 |
+
"#@markdown My voice clips are currently stored under voices/krish/; the cells below play these random ramblings of my voice.\n",
|
124 |
+
"#@markdown each cell is the samples used, skip unless you wanna listen to them\n",
|
125 |
+
"%cd tortoise-tts-fast-cloning\n",
|
126 |
+
"%ls tortoise/voices/krish\n",
|
127 |
+
"import IPython\n",
|
128 |
+
"IPython.display.Audio('tortoise/voices/krish/1.wav')"
|
129 |
+
]
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"cell_type": "code",
|
133 |
+
"execution_count": null,
|
134 |
+
"metadata": {},
|
135 |
+
"outputs": [],
|
136 |
+
"source": [
|
137 |
+
"%cd tortoise-tts-fast-cloning\n",
|
138 |
+
"%ls tortoise/voices/krish\n",
|
139 |
+
"import IPython\n",
|
140 |
+
"IPython.display.Audio('tortoise/voices/krish/2.wav')"
|
141 |
+
]
|
142 |
+
},
|
143 |
+
{
|
144 |
+
"cell_type": "code",
|
145 |
+
"execution_count": null,
|
146 |
+
"metadata": {},
|
147 |
+
"outputs": [],
|
148 |
+
"source": [
|
149 |
+
"%cd tortoise-tts-fast-cloning\n",
|
150 |
+
"%ls tortoise/voices/krish\n",
|
151 |
+
"import IPython\n",
|
152 |
+
"IPython.display.Audio('tortoise/voices/krish/3.wav')"
|
153 |
+
]
|
154 |
+
},
|
155 |
+
{
|
156 |
+
"cell_type": "code",
|
157 |
+
"execution_count": null,
|
158 |
+
"metadata": {},
|
159 |
+
"outputs": [],
|
160 |
+
"source": [
|
161 |
+
"%cd tortoise-tts-fast-cloning\n",
|
162 |
+
"%ls tortoise/voices/krish\n",
|
163 |
+
"import IPython\n",
|
164 |
+
"IPython.display.Audio('tortoise/voices/krish/4.wav')"
|
165 |
+
]
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"cell_type": "code",
|
169 |
+
"execution_count": null,
|
170 |
+
"metadata": {
|
171 |
+
"cellView": "form",
|
172 |
+
"colab": {
|
173 |
+
"base_uri": "https://localhost:8080/",
|
174 |
+
"height": 192
|
175 |
+
},
|
176 |
+
"id": "KEXOKjIvn6NW",
|
177 |
+
"outputId": "90c803f3-0b9b-4f24-ccbc-d3f3dcbde48c"
|
178 |
+
},
|
179 |
+
"outputs": [],
|
180 |
+
"source": [
|
181 |
+
"#@markdown Pick one of the voices from the output above\n",
|
182 |
+
"voice = 'krish' #@param {type:\"string\"}\n",
|
183 |
+
"\n",
|
184 |
+
"#@markdown Load it and send it through Tortoise.\n",
|
185 |
+
"voice_samples, conditioning_latents = load_voice(voice)\n",
|
186 |
+
"print(voice_samples)\n",
|
187 |
+
"# conditioning_latents = tts.get_conditioning_latents(\n",
|
188 |
+
"# voice_samples,\n",
|
189 |
+
"# return_mels=False, # Set to True if you want mel spectrograms to be returned\n",
|
190 |
+
"# latent_averaging_mode=1, # Choose the mode (0, 1, or 2) as needed\n",
|
191 |
+
"# original_tortoise=False, # Set to True or False as needed\n",
|
192 |
+
"# )\n",
|
193 |
+
"gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents, \n",
|
194 |
+
" preset=preset)\n",
|
195 |
+
"torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)\n",
|
196 |
+
"IPython.display.Audio('generated.wav')"
|
197 |
+
]
|
198 |
+
},
|
199 |
+
{
|
200 |
+
"cell_type": "code",
|
201 |
+
"execution_count": null,
|
202 |
+
"metadata": {
|
203 |
+
"colab": {
|
204 |
+
"base_uri": "https://localhost:8080/",
|
205 |
+
"height": 41
|
206 |
+
},
|
207 |
+
"id": "VQgw3KeV8Yqb",
|
208 |
+
"outputId": "13db770e-3fcc-4b27-ab78-07a603a299d9"
|
209 |
+
},
|
210 |
+
"outputs": [],
|
211 |
+
"source": [
|
212 |
+
"#@markdown Optionally, use your own voice by running the next two cells. Change the name of the voice to the one you want before running,\n",
|
213 |
+
"#@markdown then upload at least 2 audio clips. Each must be a WAV file, 6-10 seconds long.\n",
|
214 |
+
"CUSTOM_VOICE_NAME = \"custom\"\n",
|
215 |
+
"\n",
|
216 |
+
"import os\n",
|
217 |
+
"from google.colab import files\n",
|
218 |
+
"\n",
|
219 |
+
"custom_voice_folder = f\"tortoise/voices/{CUSTOM_VOICE_NAME}\"\n",
|
220 |
+
"os.makedirs(custom_voice_folder)\n",
|
221 |
+
"for i, file_data in enumerate(files.upload().values()):\n",
|
222 |
+
" with open(os.path.join(custom_voice_folder, f'{i}.wav'), 'wb') as f:\n",
|
223 |
+
" f.write(file_data)"
|
224 |
+
]
|
225 |
+
},
|
226 |
+
{
|
227 |
+
"cell_type": "code",
|
228 |
+
"execution_count": null,
|
229 |
+
"metadata": {
|
230 |
+
"id": "jJnJwv3R9uWT"
|
231 |
+
},
|
232 |
+
"outputs": [],
|
233 |
+
"source": [
|
234 |
+
"# Generate speech with the custom voice.\n",
|
235 |
+
"voice_samples, conditioning_latents = load_voices(CUSTOM_VOICE_NAME)\n",
|
236 |
+
"gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents, \n",
|
237 |
+
" preset=preset)\n",
|
238 |
+
"torchaudio.save(f'generated-{CUSTOM_VOICE_NAME}.wav', gen.squeeze(0).cpu(), 24000)\n",
|
239 |
+
"IPython.display.Audio(f'generated-{CUSTOM_VOICE_NAME}.wav')"
|
240 |
+
]
|
241 |
+
}
|
242 |
+
],
|
243 |
+
"metadata": {
|
244 |
+
"accelerator": "GPU",
|
245 |
+
"colab": {
|
246 |
+
"provenance": []
|
247 |
+
},
|
248 |
+
"kernelspec": {
|
249 |
+
"display_name": "Python 3 (ipykernel)",
|
250 |
+
"language": "python",
|
251 |
+
"name": "python3"
|
252 |
+
},
|
253 |
+
"language_info": {
|
254 |
+
"codemirror_mode": {
|
255 |
+
"name": "ipython",
|
256 |
+
"version": 3
|
257 |
+
},
|
258 |
+
"file_extension": ".py",
|
259 |
+
"mimetype": "text/x-python",
|
260 |
+
"name": "python",
|
261 |
+
"nbconvert_exporter": "python",
|
262 |
+
"pygments_lexer": "ipython3",
|
263 |
+
"version": "3.9.16"
|
264 |
+
}
|
265 |
+
},
|
266 |
+
"nbformat": 4,
|
267 |
+
"nbformat_minor": 4
|
268 |
+
}
|