djkesu committed on
Commit
db268fd
·
1 Parent(s): da16777

updated dockerfile and added scripts

Browse files
Files changed (4) hide show
  1. Dockerfile +36 -47
  2. scripts/tortoise_tts.py +390 -0
  3. setup.py +40 -0
  4. tortoise_tts.ipynb +268 -0
Dockerfile CHANGED
@@ -1,47 +1,36 @@
1
- # Use an official Python runtime as a parent image
2
- FROM nvidia/cuda:11.7.0-base-ubuntu20.04
3
-
4
- # Set the working directory
5
- WORKDIR /app
6
-
7
- # Install git, wget, build-essential
8
- RUN apt-get update && apt-get install -y git wget build-essential
9
-
10
- # Install Miniconda
11
- RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
12
- bash Miniconda3-latest-Linux-x86_64.sh -b -p /miniconda && \
13
- rm Miniconda3-latest-Linux-x86_64.sh
14
- ENV PATH="/miniconda/bin:${PATH}"
15
-
16
- # Clone the repository
17
- RUN git clone https://github.com/DjKesu/tortoise-tts-fast-cloning /app/tortoise5c
18
-
19
- # Change the working directory to the tortoise-tts-fast directory
20
- WORKDIR /app/tortoise5c
21
-
22
- # Create the Conda environment
23
- RUN conda create -n tts5x python=3.8 && \
24
- echo "source activate tts5x" > ~/.bashrc
25
- ENV PATH /miniconda/envs/ttts-fast/bin:$PATH
26
-
27
- # Set the shell for the following commands to use the Conda environment "ttts-fast"
28
- SHELL ["conda", "run", "-n", "ttts-fast", "/bin/bash", "-c"]
29
-
30
- # Install the necessary packages
31
- RUN conda install -y pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 -c pytorch -c nvidia && \
32
- conda install -c anaconda gdbm && \
33
- pip install -e . && \
34
- pip install git+https://github.com/152334H/BigVGAN.git && \
35
- pip install streamlit
36
-
37
- # Make port 8501 available to the world outside this container
38
- EXPOSE 8501
39
-
40
- # Define environment variable
41
- ENV NAME tortoise-tts
42
-
43
- # List the contents of the /app directory
44
- RUN ls -al /app
45
-
46
- # Run the application
47
- CMD ["streamlit", "run", "app.py"]
 
1
# Image for the Tortoise TTS Streamlit app.
# The CUDA base image supplies the NVIDIA user-space bits; PyTorch and its
# CUDA runtime are installed from conda further down.
FROM nvidia/cuda:12.2.0-base-ubuntu22.04

COPY . /app

# wget is needed to download the Miniconda installer below.
RUN apt-get update && \
    apt-get install -y --allow-unauthenticated --no-install-recommends \
    wget \
    git \
    && apt-get autoremove -y \
    && apt-get clean -y \
    && rm -rf /var/lib/apt/lists/*

ENV HOME "/root"
ENV CONDA_DIR "${HOME}/miniconda"
ENV PATH="$CONDA_DIR/bin":$PATH
ENV CONDA_AUTO_UPDATE_CONDA=false
ENV PIP_DOWNLOAD_CACHE="$HOME/.pip/cache"
# ENV requires a value; a bare `ENV TORTOISE_MODELS_DIR` fails the build.
# This path mirrors the default cache location documented for --models-dir
# in scripts/tortoise_tts.py (~/.cache/tortoise/.models).
ENV TORTOISE_MODELS_DIR="${HOME}/.cache/tortoise/.models"

RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh \
    && bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u \
    && "${CONDA_DIR}/bin/conda" init bash \
    && rm -f /tmp/miniconda3.sh \
    && echo ". '${CONDA_DIR}/etc/profile.d/conda.sh'" >> "${HOME}/.profile"

# --login makes bash read ~/.profile (which sources conda.sh, see above) for
# every RUN statement, so `conda activate` is available.
SHELL ["/bin/bash", "--login", "-c"]

# -y keeps conda from waiting on a confirmation prompt during the build.
RUN conda create -y --name tortoise python=3.9 numba inflect \
    && conda activate tortoise \
    && conda install -y pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia \
    && conda install -y transformers=4.29.2 \
    && conda install -y streamlit \
    && cd /app \
    && python setup.py install

WORKDIR /app

# Streamlit's default port.
EXPOSE 8501

# Start the app when the container runs. Launching it inside RUN (as before)
# would block the image build forever and leave the image without a command.
CMD ["/bin/bash", "--login", "-c", "conda activate tortoise && streamlit run app.py"]
 
 
 
 
 
 
 
 
 
 
 
scripts/tortoise_tts.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # AGPL: a notification must be added stating that changes have been made to that file.
3
+
4
+ import os
5
+ import sys
6
+ import tempfile
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+ from typing import Literal, Optional
10
+
11
+ import torch
12
+ import torchaudio
13
+ from simple_parsing import ArgumentParser, field
14
+
15
+ from tortoise.api import MODELS_DIR, TextToSpeech
16
+ from tortoise.utils.audio import load_audio
17
+ from tortoise.utils.diffusion import SAMPLERS
18
+ from tortoise.models.vocoder import VocConf
19
+
20
+
21
# NOTE(review): simple_parsing is understood to use the class docstring and the
# string literal following each field as the --help text, so those strings are
# user-facing output rather than plain comments -- confirm before rewording.
@dataclass
class General:
    """General options"""

    text: str = field(positional=True, nargs="*", metavar="text")
    """Text to speak. If omitted, text is read from stdin."""

    voice: str = field(default="random", alias=["-v"])
    """Selects the voice to use for generation. Use the & character to join two voices together.
    Use a comma to perform inference on multiple voices. Set to "all" to use all available voices.
    Note that multiple voices require the --output-dir option to be set."""

    voices_dir: Optional[str] = field(default=None, alias=["-V"])
    """Path to directory containing extra voices to be loaded. Use a comma to specify multiple directories."""

    preset: Literal["ultra_fast", "fast", "standard", "high_quality"] = field(default="fast", alias=["-p"])
    """Which voice quality preset to use."""

    quiet: bool = field(default=False, alias=["-q"])
    """Suppress all output."""

    voicefixer: bool = field(default=True)
    """Enable/Disable voicefixer"""
46
+
47
+
48
# Destination flags for the generated audio: list voices, play, single file,
# or a directory of per-clip files.
@dataclass
class Output:
    """Output options"""

    list_voices: bool = field(alias=["-l"], default=False)
    """List available voices and exit."""

    play: bool = field(alias=["-P"], default=False)
    """Play the audio (requires pydub)."""

    output: Optional[Path] = field(alias=["-o"], default=None)
    """Save the audio to a file."""

    # Unlike `output`, this has a non-empty default, so it is always truthy
    # unless the caller explicitly overrides it with an empty value.
    output_dir: Path = field(alias=["-O"], default=Path("results/"))
    """Save the audio to a directory as individual segments."""
63
+
64
+
65
# Options that only matter when several clips/candidates are written to disk.
@dataclass
class MultiOutput:
    """Multi-output options"""

    # Number of alternative renderings requested for each voice.
    candidates: int = 1
    """How many output candidates to produce per-voice. Note that only the first candidate is used in the combined output."""

    # Kept as a raw string here; the script splits it on commas itself.
    regenerate: Optional[str] = None
    """Comma-separated list of clip numbers to re-generate."""

    skip_existing: bool = False
    """Set to skip re-generating existing clips."""
77
+
78
+
79
# Less commonly needed switches: debugging, reproducibility, model locations,
# device/batch selection and vocoder choice.
@dataclass
class Advanced:
    """Advanced options"""

    produce_debug_state: bool = False
    """Whether or not to produce debug_states in current directory, which can aid in reproducing problems."""

    seed: Optional[int] = None
    """Random seed which can be used to reproduce results."""

    # Default comes from tortoise.api.MODELS_DIR.
    models_dir: str = MODELS_DIR
    """Where to find pretrained model checkpoints. Tortoise automatically downloads these to
    ~/.cache/tortoise/.models, so this should only be specified if you have custom checkpoints."""

    text_split: Optional[str] = None
    """How big chunks to split the text into, in the format <desired_length>,<max_length>."""

    disable_redaction: bool = False
    """Normally text enclosed in brackets are automatically redacted from the spoken output
    (but are still rendered by the model), this can be used for prompt engineering.
    Set this to disable this behavior."""

    device: Optional[str] = None
    """Device to use for inference."""

    batch_size: Optional[int] = None
    """Batch size to use for inference. If omitted, the batch size is set based on available GPU memory."""

    # The chosen name is looked up as an attribute on VocConf by the script.
    vocoder: Literal["Univnet", "BigVGAN", "BigVGAN_Base"] = "BigVGAN_Base"
    """Pretrained vocoder to be used.
    Univnet - tortoise original
    BigVGAN - 112M model
    BigVGAN_Base - 14M model
    """

    ar_checkpoint: Optional[str] = None
    """Path to a checkpoint to use for the autoregressive model. If omitted, the default checkpoint is used."""

    clvp_checkpoint: Optional[str] = None
    """Path to a checkpoint to use for the CLVP model. If omitted, the default checkpoint is used."""

    diff_checkpoint: Optional[str] = None
    """Path to a checkpoint to use for the diffusion model. If omitted, the default checkpoint is used."""
122
+
123
+
124
# Every field defaults to None, meaning "not specified": the script only
# forwards the values that are not None, so unset options fall back to whatever
# the selected preset provides.
@dataclass
class Tuning:
    """Tuning options (overrides preset settings)"""

    num_autoregressive_samples: Optional[int] = None
    """Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
    As TorToiSe is a probabilistic model, more samples means a higher probability of creating something "great"."""

    temperature: Optional[float] = None
    """The softmax temperature of the autoregressive model."""

    length_penalty: Optional[float] = None
    """A length penalty applied to the autoregressive decoder. Higher settings cause the model to produce more terse outputs."""

    repetition_penalty: Optional[float] = None
    """A penalty that prevents the autoregressive decoder from repeating itself during decoding.
    Can be used to reduce the incidence of long silences or "uhhhhhhs", etc."""

    top_p: Optional[float] = None
    """P value used in nucleus sampling. 0 to 1. Lower values mean the decoder produces more "likely" (aka boring) outputs."""

    max_mel_tokens: Optional[int] = None
    """Restricts the output length. 1 to 600. Each unit is 1/20 of a second."""

    cvvp_amount: Optional[float] = None
    """How much the CVVP model should influence the output.
    Increasing this can in some cases reduce the likelihood of multiple speakers."""

    diffusion_iterations: Optional[int] = None
    """Number of diffusion steps to perform. More steps means the network has more chances to iteratively
    refine the output, which should theoretically mean a higher quality output.
    Generally a value above 250 is not noticeably better, however."""

    cond_free: Optional[bool] = None
    """Whether or not to perform conditioning-free diffusion. Conditioning-free diffusion performs two forward passes for
    each diffusion step: one with the outputs of the autoregressive model and one with no conditioning priors. The output
    of the two is blended according to the cond_free_k value below. Conditioning-free diffusion is the real deal, and
    dramatically improves realism."""

    cond_free_k: Optional[float] = None
    """Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
    As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
    Formula is: output=cond_present_output*(cond_free_k+1)-cond_absent_output*cond_free_k"""

    diffusion_temperature: Optional[float] = None
    """Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
    are the "mean" prediction of the diffusion network and will sound bland and smeared."""
171
+
172
+
173
# Switches trading memory, speed and fidelity against each other.
@dataclass
class Speed:
    """New/speed options"""

    low_vram: bool = False
    """re-enable default offloading behaviour of tortoise"""

    half: bool = False
    """enable autocast to half precision for autoregressive model"""

    no_cache: bool = False
    """disable kv_cache usage. This should really only be used if you are very low on vram."""

    # Allowed values come from tortoise.utils.diffusion.SAMPLERS.
    sampler: Optional[str] = field(choices=SAMPLERS, default=None)
    """override the sampler used for diffusion (default depends on --preset)"""

    original_tortoise: bool = False
    """ensure results are identical to original tortoise-tts repo"""
191
+
192
+
193
if __name__ == "__main__":
    # Command-line entry point: parse options, load the requested voices,
    # render every text chunk with every voice, then save and/or play the result.
    parser = ArgumentParser(
        description="TorToiSe is a text-to-speech program that is capable of synthesizing speech "
        "in multiple voices with realistic prosody and intonation."
    )
    # NOTE: a "--web" flag (launch the webui via `app.main()`, ignoring the
    # other arguments) used to be registered here; it is disabled because it
    # "bugs out for some reason".
    parser.add_arguments(General, "general")
    parser.add_arguments(Output, "output")
    parser.add_arguments(MultiOutput, "multi_output")
    parser.add_arguments(Advanced, "advanced")
    parser.add_arguments(Tuning, "tuning")
    parser.add_arguments(Speed, "speed")

    usage_examples = f"""
Examples:

Read text using random voice and place it in a file:

    {parser.prog} -o hello.wav "Hello, how are you?"

Read text from stdin and play it using the tom voice:

    echo "Say it like you mean it!" | {parser.prog} -P -v tom

Read a text file using multiple voices and save the audio clips to a directory:

    {parser.prog} -O /tmp/tts-results -v tom,emma <textfile.txt
"""

    # parse_args() exits via SystemExit. Print the examples only when the exit
    # code is 0 (e.g. after --help), then re-exit with the same code.
    try:
        args = parser.parse_args()
    except SystemExit as e:
        if e.code == 0:
            print(usage_examples)
        sys.exit(e.code)

    # Imported after argument parsing -- presumably so that --help and usage
    # errors return without first loading the inference helpers.
    from tortoise.inference import (
        check_pydub,
        get_all_voices,
        get_seed,
        parse_multiarg_text,
        parse_voice_str,
        save_gen_with_voicefix,
        split_text,
        validate_output_dir,
        voice_loader,
    )

    # get voices
    all_voices, extra_voice_dirs = get_all_voices(args.general.voices_dir)
    if args.output.list_voices:
        for v in all_voices:
            print(v)
        sys.exit(0)
    selected_voices = parse_voice_str(args.general.voice, all_voices)
    voice_generator = voice_loader(selected_voices, extra_voice_dirs)

    # parse text
    if not args.general.text:
        print("reading text from stdin!")
    text = parse_multiarg_text(args.general.text)
    texts = split_text(text, args.advanced.text_split)

    output_dir = validate_output_dir(
        args.output.output_dir, selected_voices, args.multi_output.candidates
    )

    # error out early if pydub isn't installed
    pydub = check_pydub(args.output.play)

    seed = get_seed(args.advanced.seed)
    verbose = not args.general.quiet

    # The --vocoder choice names an attribute of VocConf.
    vocoder = getattr(VocConf, args.advanced.vocoder)
    if verbose:
        print("Loading tts...")
    tts = TextToSpeech(
        models_dir=args.advanced.models_dir,
        enable_redaction=not args.advanced.disable_redaction,
        device=args.advanced.device,
        autoregressive_batch_size=args.advanced.batch_size,
        high_vram=not args.speed.low_vram,
        kv_cache=not args.speed.no_cache,
        ar_checkpoint=args.advanced.ar_checkpoint,
        clvp_checkpoint=args.advanced.clvp_checkpoint,
        diff_checkpoint=args.advanced.diff_checkpoint,
        vocoder=vocoder,
    )

    gen_settings = {
        "use_deterministic_seed": seed,
        "verbose": verbose,
        "k": args.multi_output.candidates,
        "preset": args.general.preset,
    }
    # Only forward tuning/speed options the user actually set (not None).
    tuning_options = [
        "num_autoregressive_samples",
        "temperature",
        "length_penalty",
        "repetition_penalty",
        "top_p",
        "max_mel_tokens",
        "cvvp_amount",
        "diffusion_iterations",
        "cond_free",
        "cond_free_k",
        "diffusion_temperature",
    ]
    for option in tuning_options:
        value = getattr(args.tuning, option)
        if value is not None:
            gen_settings[option] = value

    speed_options = [
        "sampler",
        "original_tortoise",
        "half",
    ]
    for option in speed_options:
        value = getattr(args.speed, option)
        if value is not None:
            gen_settings[option] = value

    total_clips = len(texts) * len(selected_voices)
    regenerate_clips = (
        [int(x) for x in args.multi_output.regenerate.split(",")]
        if args.multi_output.regenerate
        else None
    )
    for voice_idx, (voice, voice_samples, conditioning_latents) in enumerate(
        voice_generator
    ):
        audio_parts = []
        for text_idx, text in enumerate(texts):
            clip_name = f'{"-".join(voice)}_{text_idx:02d}'
            if args.output.output_dir:
                # Reuse an already rendered first candidate when asked to skip
                # existing clips, or when this clip is not in the regenerate list.
                first_clip = os.path.join(args.output.output_dir, f"{clip_name}_00.wav")
                if (
                    args.multi_output.skip_existing
                    or (regenerate_clips and text_idx not in regenerate_clips)
                ) and os.path.exists(first_clip):
                    audio_parts.append(load_audio(first_clip, 24000))
                    if verbose:
                        print(f"Skipping {clip_name}")
                    continue
            if verbose:
                print(
                    f"Rendering {clip_name} ({(voice_idx * len(texts) + text_idx + 1)} of {total_clips})..."
                )
                print(" " + text)
            gen = tts.tts_with_preset(
                text,
                voice_samples=voice_samples,
                conditioning_latents=conditioning_latents,
                **gen_settings,
            )
            # Treat the single-candidate result as a one-element list so both
            # cases share the loop below.
            gen = gen if args.multi_output.candidates > 1 else [gen]
            for candidate_idx, audio in enumerate(gen):
                audio = audio.squeeze(0).cpu()
                if candidate_idx == 0:
                    # Only the first candidate goes into the combined output.
                    audio_parts.append(audio)
                if args.output.output_dir:
                    filename = f"{clip_name}_{candidate_idx:02d}.wav"
                    save_gen_with_voicefix(
                        audio,
                        os.path.join(args.output.output_dir, filename),
                        squeeze=False,
                        voicefixer=args.general.voicefixer,
                    )

        audio = torch.cat(audio_parts, dim=-1)
        # These are independent checks rather than an if/elif chain: output_dir
        # defaults to "results/", so with elif the -o and -P options could
        # never take effect.
        if args.output.output_dir:
            filename = f'{"-".join(voice)}_combined.wav'
            save_gen_with_voicefix(
                audio,
                os.path.join(args.output.output_dir, filename),
                squeeze=False,
                voicefixer=args.general.voicefixer,
            )
        if args.output.output:
            save_gen_with_voicefix(
                audio,
                args.output.output,
                squeeze=False,
                voicefixer=args.general.voicefixer,
            )
        if args.output.play:
            print("WARNING: cannot use voicefixer with --play")
            # The context manager closes (and thereby deletes) the temporary
            # file once playback has finished.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as f:
                torchaudio.save(f.name, audio, 24000)
                pydub.playback.play(pydub.AudioSegment.from_wav(f.name))

        if args.advanced.produce_debug_state:
            os.makedirs("debug_states", exist_ok=True)
            dbg_state = (seed, texts, voice_samples, conditioning_latents, args)
            torch.save(
                dbg_state, os.path.join("debug_states", f'debug_{"-".join(voice)}.pth')
            )
setup.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import setuptools

# README.md is used as the long description of the package.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="TorToiSe",
    packages=setuptools.find_packages(),
    version="2.8.0",
    author="James Betker",
    author_email="[email protected]",
    description="A high quality multi-voice text-to-speech library",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/neonbjb/tortoise-tts",
    project_urls={},
    # Installed as a command-line script alongside the package.
    scripts=[
        "scripts/tortoise_tts.py",
    ],
    include_package_data=True,
    install_requires=[
        "tqdm",
        "rotary_embedding_torch",
        "inflect",
        "progressbar",
        "einops",
        "unidecode",
        "scipy",
        "librosa",
        "transformers==4.29.2",
        "tokenizers",
        "deepspeed==0.8.3",
    ],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
    ],
    # scripts/tortoise_tts.py imports typing.Literal, which was added in
    # Python 3.8, so older interpreters cannot run the installed script.
    python_requires=">=3.8",
)
tortoise_tts.ipynb ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "_pIZ3ZXNp7cf"
7
+ },
8
+ "source": [
9
+ "Welcome to Tortoise! 🐢🐢🐢🐢\n",
10
+ "\n",
11
+ "Before you begin, I **strongly** recommend you turn on a GPU runtime.\n",
12
+ "\n",
13
+ "There's a reason this is called \"Tortoise\" - this model takes up to a minute to perform inference for a single sentence on a GPU. Expect waits on the order of hours on a CPU."
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": null,
19
+ "metadata": {
20
+ "colab": {
21
+ "base_uri": "https://localhost:8080/",
22
+ "height": 1000
23
+ },
24
+ "id": "JrK20I32grP6",
25
+ "outputId": "9711e23e-3bfc-4cb0-c030-25a1cf460972"
26
+ },
27
+ "outputs": [],
28
+ "source": [
29
+ "!git clone https://github.com/DjKesu/tortoise-tts-fast-cloning.git\n",
30
+ "%cd tortoise-tts-fast-cloning\n",
31
+ "!pip3 install -r requirements.txt --no-deps\n",
32
+ "!pip3 install -e .\n",
33
+ "!pip3 install git+https://github.com/152334H/BigVGAN.git"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": null,
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "!pip uninstall transformers\n",
43
+ "!pip install transformers==4.29.2"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "markdown",
48
+ "metadata": {
49
+ "id": "zRW4p3ftjZ3Y"
50
+ },
51
+ "source": [
52
+ "## **Restart the runtime!**\n",
53
+ "## Ctrl+M for Colab"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": null,
59
+ "metadata": {
60
+ "id": "Gen09NM4hONQ"
61
+ },
62
+ "outputs": [],
63
+ "source": [
64
+ "#@title # Setup\n",
65
+ "# Imports used through the rest of the notebook.\n",
66
+ "import torch\n",
67
+ "import torchaudio\n",
68
+ "import torch.nn as nn\n",
69
+ "import torch.nn.functional as F\n",
70
+ "\n",
71
+ "import IPython\n",
72
+ "\n",
73
+ "from tortoise.api import TextToSpeech\n",
74
+ "from tortoise.utils.audio import load_audio, load_voice, load_voices\n",
75
+ "\n",
76
+ "# This will download all the models used by Tortoise from the HuggingFace hub.\n",
77
+ "tts = TextToSpeech()"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": null,
83
+ "metadata": {
84
+ "id": "bt_aoxONjfL2"
85
+ },
86
+ "outputs": [],
87
+ "source": [
88
+ "# This is the text that will be spoken.\n",
89
+ "text = \"Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?\" #@param {type:\"string\"}\n",
90
+ "#@markdown Show code for multiline text input\n",
91
+ "# Here's something for the poetically inclined.. (set text=)\n",
92
+ "\"\"\"\n",
93
+ "Then took the other, as just as fair,\n",
94
+ "And having perhaps the better claim,\n",
95
+ "Because it was grassy and wanted wear;\n",
96
+ "Though as for that the passing there\n",
97
+ "Had worn them really about the same,\"\"\"\n",
98
+ "\n",
99
+ "# Pick a \"preset mode\" to determine quality. Options: {\"ultra_fast\", \"fast\" (default), \"standard\", \"high_quality\"}. See docs in api.py\n",
100
+ "# Added a very_fast preset option; since it involves resolution with dpm++2m, it is expected to give the best, fastest results.\n",
101
+ "preset = \"ultra_fast\" #@param [\"ultra_fast\", \"fast\", \"standard\", \"high_quality\", \"very_fast\"]"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": null,
107
+ "metadata": {
108
+ "colab": {
109
+ "base_uri": "https://localhost:8080/",
110
+ "height": 211
111
+ },
112
+ "id": "SSleVnRAiEE2",
113
+ "outputId": "45b950c7-5c39-4075-bb34-0a76bf19e1bc"
114
+ },
115
+ "outputs": [],
116
+ "source": [
117
+ "#@markdown Tortoise will attempt to mimic voices you provide. It comes pre-packaged\n",
118
+ "#@markdown with some voices you might recognize.\n",
119
+ "\n",
120
+ "#@markdown Let's list all the voices available. These are just some random clips I've gathered\n",
121
+ "#@markdown from the internet as well as a few voices from the training dataset.\n",
122
+ "#@markdown Feel free to add your own clips to the voices/ folder.\n",
123
+ "#@markdown Currently stored my voice clips under voices/krish/ and displaying the random rumblings of my voice.\n",
124
+ "#@markdown each cell is the samples used, skip unless you wanna listen to them\n",
125
+ "%cd tortoise-tts-fast-cloning\n",
126
+ "%ls tortoise/voices/krish\n",
127
+ "import IPython\n",
128
+ "IPython.display.Audio('tortoise/voices/krish/1.wav')"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": null,
134
+ "metadata": {},
135
+ "outputs": [],
136
+ "source": [
137
+ "%cd tortoise-tts-fast-cloning\n",
138
+ "%ls tortoise/voices/krish\n",
139
+ "import IPython\n",
140
+ "IPython.display.Audio('tortoise/voices/krish/2.wav')"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": null,
146
+ "metadata": {},
147
+ "outputs": [],
148
+ "source": [
149
+ "%cd tortoise-tts-fast-cloning\n",
150
+ "%ls tortoise/voices/krish\n",
151
+ "import IPython\n",
152
+ "IPython.display.Audio('tortoise/voices/krish/3.wav')"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": null,
158
+ "metadata": {},
159
+ "outputs": [],
160
+ "source": [
161
+ "%cd tortoise-tts-fast-cloning\n",
162
+ "%ls tortoise/voices/krish\n",
163
+ "import IPython\n",
164
+ "IPython.display.Audio('tortoise/voices/krish/4.wav')"
165
+ ]
166
+ },
167
+ {
168
+ "cell_type": "code",
169
+ "execution_count": null,
170
+ "metadata": {
171
+ "cellView": "form",
172
+ "colab": {
173
+ "base_uri": "https://localhost:8080/",
174
+ "height": 192
175
+ },
176
+ "id": "KEXOKjIvn6NW",
177
+ "outputId": "90c803f3-0b9b-4f24-ccbc-d3f3dcbde48c"
178
+ },
179
+ "outputs": [],
180
+ "source": [
181
+ "#@markdown Pick one of the voices from the output above\n",
182
+ "voice = 'krish' #@param {type:\"string\"}\n",
183
+ "\n",
184
+ "#@markdown Load it and send it through Tortoise.\n",
185
+ "voice_samples, conditioning_latents = load_voice(voice)\n",
186
+ "print(voice_samples)\n",
187
+ "# conditioning_latents = tts.get_conditioning_latents(\n",
188
+ "# voice_samples,\n",
189
+ "# return_mels=False, # Set to True if you want mel spectrograms to be returned\n",
190
+ "# latent_averaging_mode=1, # Choose the mode (0, 1, or 2) as needed\n",
191
+ "# original_tortoise=False, # Set to True or False as needed\n",
192
+ "# )\n",
193
+ "gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents, \n",
194
+ " preset=preset)\n",
195
+ "torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)\n",
196
+ "IPython.display.Audio('generated.wav')"
197
+ ]
198
+ },
199
+ {
200
+ "cell_type": "code",
201
+ "execution_count": null,
202
+ "metadata": {
203
+ "colab": {
204
+ "base_uri": "https://localhost:8080/",
205
+ "height": 41
206
+ },
207
+ "id": "VQgw3KeV8Yqb",
208
+ "outputId": "13db770e-3fcc-4b27-ab78-07a603a299d9"
209
+ },
210
+ "outputs": [],
211
+ "source": [
212
+ "#@markdown Optionally, upload and use your own voice by running the next two cells. Change the name of the voice to the one you want before running.\n",
213
+ "#@markdown Upload at least 2 audio clips. Each must be a WAV file, 6-10 seconds long.\n",
214
+ "CUSTOM_VOICE_NAME = \"custom\"\n",
215
+ "\n",
216
+ "import os\n",
217
+ "from google.colab import files\n",
218
+ "\n",
219
+ "custom_voice_folder = f\"tortoise/voices/{CUSTOM_VOICE_NAME}\"\n",
220
+ "os.makedirs(custom_voice_folder)\n",
221
+ "for i, file_data in enumerate(files.upload().values()):\n",
222
+ " with open(os.path.join(custom_voice_folder, f'{i}.wav'), 'wb') as f:\n",
223
+ " f.write(file_data)"
224
+ ]
225
+ },
226
+ {
227
+ "cell_type": "code",
228
+ "execution_count": null,
229
+ "metadata": {
230
+ "id": "jJnJwv3R9uWT"
231
+ },
232
+ "outputs": [],
233
+ "source": [
234
+ "# Generate speech with the custom voice.\n",
235
+ "voice_samples, conditioning_latents = load_voices(CUSTOM_VOICE_NAME)\n",
236
+ "gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents, \n",
237
+ " preset=preset)\n",
238
+ "torchaudio.save(f'generated-{CUSTOM_VOICE_NAME}.wav', gen.squeeze(0).cpu(), 24000)\n",
239
+ "IPython.display.Audio(f'generated-{CUSTOM_VOICE_NAME}.wav')"
240
+ ]
241
+ }
242
+ ],
243
+ "metadata": {
244
+ "accelerator": "GPU",
245
+ "colab": {
246
+ "provenance": []
247
+ },
248
+ "kernelspec": {
249
+ "display_name": "Python 3 (ipykernel)",
250
+ "language": "python",
251
+ "name": "python3"
252
+ },
253
+ "language_info": {
254
+ "codemirror_mode": {
255
+ "name": "ipython",
256
+ "version": 3
257
+ },
258
+ "file_extension": ".py",
259
+ "mimetype": "text/x-python",
260
+ "name": "python",
261
+ "nbconvert_exporter": "python",
262
+ "pygments_lexer": "ipython3",
263
+ "version": "3.9.16"
264
+ }
265
+ },
266
+ "nbformat": 4,
267
+ "nbformat_minor": 4
268
+ }