Spaces:

djkesu
/

tortoise5c

Running

App Files Files Community

djkesu commited on Sep 25, 2023

Commit

db268fd

1 Parent(s): da16777

updated dockerfile and added scripts

Browse files

Files changed (4) hide show

Dockerfile +36 -47
scripts/tortoise_tts.py +390 -0
setup.py +40 -0
tortoise_tts.ipynb +268 -0

Dockerfile CHANGED Viewed

@@ -1,47 +1,36 @@
-# Use an official Python runtime as a parent image
-FROM nvidia/cuda:11.7.0-base-ubuntu20.04
-# Set the working directory
-WORKDIR /app
-# Install git, wget, build-essential
-RUN apt-get update && apt-get install -y git wget build-essential
-# Install Miniconda
-RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
-    bash Miniconda3-latest-Linux-x86_64.sh -b -p /miniconda && \
-    rm Miniconda3-latest-Linux-x86_64.sh
-ENV PATH="/miniconda/bin:${PATH}"
-# Clone the repository
-RUN git clone https://github.com/DjKesu/tortoise-tts-fast-cloning /app/tortoise5c
-# Change the working directory to the tortoise-tts-fast directory
-WORKDIR /app/tortoise5c
-# Create the Conda environment
-RUN conda create -n tts5x python=3.8 && \
-    echo "source activate tts5x" > ~/.bashrc
-ENV PATH /miniconda/envs/ttts-fast/bin:$PATH
-# Set the shell for the following commands to use the Conda environment "ttts-fast"
-SHELL ["conda", "run", "-n", "ttts-fast", "/bin/bash", "-c"]
-# Install the necessary packages
-RUN conda install -y pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 -c pytorch -c nvidia && \
-    conda install -c anaconda gdbm && \
-    pip install -e . && \
-    pip install git+https://github.com/152334H/BigVGAN.git && \
-    pip install streamlit
-# Make port 8501 available to the world outside this container
-EXPOSE 8501
-# Define environment variable
-ENV NAME tortoise-tts
-# List the contents of the /app directory
-RUN ls -al /app
-# Run the application
-CMD ["streamlit", "run", "app.py"]

+FROM nvidia/cuda:12.2.0-base-ubuntu22.04
+COPY . /app
+RUN apt-get update && \
+    apt-get install -y --allow-unauthenticated --no-install-recommends \
+    wget \
+    git \
+    && apt-get autoremove -y \
+    && apt-get clean -y \
+    && rm -rf /var/lib/apt/lists/*
+ENV HOME "/root"
+ENV CONDA_DIR "${HOME}/miniconda"
+ENV PATH="$CONDA_DIR/bin":$PATH
+ENV CONDA_AUTO_UPDATE_CONDA=false
+ENV PIP_DOWNLOAD_CACHE="$HOME/.pip/cache"
+ENV TORTOISE_MODELS_DIR
+RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh \
+    && bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u \
+    && "${CONDA_DIR}/bin/conda" init bash \
+    && rm -f /tmp/miniconda3.sh \
+    && echo ". '${CONDA_DIR}/etc/profile.d/conda.sh'" >> "${HOME}/.profile"
+# --login option used to source bashrc (thus activating conda env) at every RUN statement
+SHELL ["/bin/bash", "--login", "-c"]
+RUN conda create --name tortoise python=3.9 numba inflect \
+    && conda activate tortoise \
+    && conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia \
+    && conda install transformers=4.29.2 \
+    && conda install streamlit \
+    && cd /app \
+    && python setup.py install \
+    && streamlit run app.py

scripts/tortoise_tts.py ADDED Viewed

	@@ -0,0 +1,390 @@

+#!/usr/bin/env python3
+# AGPL: a notification must be added stating that changes have been made to that file.
+import os
+import sys
+import tempfile
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Literal, Optional
+import torch
+import torchaudio
+from simple_parsing import ArgumentParser, field
+from tortoise.api import MODELS_DIR, TextToSpeech
+from tortoise.utils.audio import load_audio
+from tortoise.utils.diffusion import SAMPLERS
+from tortoise.models.vocoder import VocConf
+@dataclass
+class General:
+    """General options"""
+    text: str = field(positional=True, nargs="*", metavar="text")
+    """Text to speak. If omitted, text is read from stdin."""
+    voice: str = field(default="random", alias=["-v"])
+    """Selects the voice to use for generation. Use the & character to join two voices together.
+    Use a comma to perform inference on multiple voices. Set to "all" to use all available voices.
+    Note that multiple voices require the --output-dir option to be set."""
+    voices_dir: Optional[str] = field(default=None, alias=["-V"])
+    """Path to directory containing extra voices to be loaded. Use a comma to specify multiple directories."""
+    preset: Literal["ultra_fast", "fast", "standard", "high_quality"] = field(
+        default="fast", alias=["-p"]
+    )
+    """Which voice quality preset to use."""
+    quiet: bool = field(default=False, alias=["-q"])
+    """Suppress all output."""
+    voicefixer: bool = field(default=True)
+    """Enable/Disable voicefixer"""
+@dataclass
+class Output:
+    """Output options"""
+    list_voices: bool = field(default=False, alias=["-l"])
+    """List available voices and exit."""
+    play: bool = field(default=False, alias=["-P"])
+    """Play the audio (requires pydub)."""
+    output: Optional[Path] = field(default=None, alias=["-o"])
+    """Save the audio to a file."""
+    output_dir: Path = field(default=Path("results/"), alias=["-O"])
+    """Save the audio to a directory as individual segments."""
+@dataclass
+class MultiOutput:
+    """Multi-output options"""
+    candidates: int = 1
+    """How many output candidates to produce per-voice. Note that only the first candidate is used in the combined output."""
+    regenerate: Optional[str] = None
+    """Comma-separated list of clip numbers to re-generate."""
+    skip_existing: bool = False
+    """Set to skip re-generating existing clips."""
+@dataclass
+class Advanced:
+    """Advanced options"""
+    produce_debug_state: bool = False
+    """Whether or not to produce debug_states in current directory, which can aid in reproducing problems."""
+    seed: Optional[int] = None
+    """Random seed which can be used to reproduce results."""
+    models_dir: str = MODELS_DIR
+    """Where to find pretrained model checkpoints. Tortoise automatically downloads these to
+    ~/.cache/tortoise/.models, so this should only be specified if you have custom checkpoints."""
+    text_split: Optional[str] = None
+    """How big chunks to split the text into, in the format <desired_length>,<max_length>."""
+    disable_redaction: bool = False
+    """Normally text enclosed in brackets are automatically redacted from the spoken output
+    (but are still rendered by the model), this can be used for prompt engineering.
+    Set this to disable this behavior."""
+    device: Optional[str] = None
+    """Device to use for inference."""
+    batch_size: Optional[int] = None
+    """Batch size to use for inference. If omitted, the batch size is set based on available GPU memory."""
+    vocoder: Literal["Univnet", "BigVGAN", "BigVGAN_Base"] = "BigVGAN_Base"
+    """Pretrained vocoder to be used.
+    Univnet - tortoise original
+    BigVGAN - 112M model
+    BigVGAN_Base - 14M model
+    """
+    ar_checkpoint: Optional[str] = None
+    """Path to a checkpoint to use for the autoregressive model. If omitted, the default checkpoint is used."""
+    clvp_checkpoint: Optional[str] = None
+    """Path to a checkpoint to use for the CLVP model. If omitted, the default checkpoint is used."""
+    diff_checkpoint: Optional[str] = None
+    """Path to a checkpoint to use for the diffusion model. If omitted, the default checkpoint is used."""
+@dataclass
+class Tuning:
+    """Tuning options (overrides preset settings)"""
+    num_autoregressive_samples: Optional[int] = None
+    """Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
+    As TorToiSe is a probabilistic model, more samples means a higher probability of creating something "great"."""
+    temperature: Optional[float] = None
+    """The softmax temperature of the autoregressive model."""
+    length_penalty: Optional[float] = None
+    """A length penalty applied to the autoregressive decoder. Higher settings causes the model to produce more terse outputs."""
+    repetition_penalty: Optional[float] = None
+    """A penalty that prevents the autoregressive decoder from repeating itself during decoding.
+    Can be used to reduce the incidence of long silences or "uhhhhhhs", etc."""
+    top_p: Optional[float] = None
+    """P value used in nucleus sampling. 0 to 1. Lower values mean the decoder produces more "likely" (aka boring) outputs."""
+    max_mel_tokens: Optional[int] = None
+    """Restricts the output length. 1 to 600. Each unit is 1/20 of a second."""
+    cvvp_amount: Optional[float] = None
+    """How much the CVVP model should influence the output.
+    Increasing this can in some cases reduce the likelihood of multiple speakers."""
+    diffusion_iterations: Optional[int] = None
+    """Number of diffusion steps to perform.  More steps means the network has more chances to iteratively
+    refine the output, which should theoretically mean a higher quality output.
+    Generally a value above 250 is not noticeably better, however."""
+    cond_free: Optional[bool] = None
+    """Whether or not to perform conditioning-free diffusion. Conditioning-free diffusion performs two forward passes for
+    each diffusion step: one with the outputs of the autoregressive model and one with no conditioning priors. The output
+    of the two is blended according to the cond_free_k value below. Conditioning-free diffusion is the real deal, and
+    dramatically improves realism."""
+    cond_free_k: Optional[float] = None
+    """Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
+    As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
+    Formula is: output=cond_present_output*(cond_free_k+1)-cond_absenct_output*cond_free_k"""
+    diffusion_temperature: Optional[float] = None
+    """Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
+    are the "mean" prediction of the diffusion network and will sound bland and smeared."""
+@dataclass
+class Speed:
+    """New/speed options"""
+    low_vram: bool = False
+    """re-enable default offloading behaviour of tortoise"""
+    half: bool = False
+    """enable autocast to half precision for autoregressive model"""
+    no_cache: bool = False
+    """disable kv_cache usage. This should really only be used if you are very low on vram."""
+    sampler: Optional[str] = field(default=None, choices=SAMPLERS)
+    """override the sampler used for diffusion (default depends on --preset)"""
+    original_tortoise: bool = False
+    """ensure results are identical to original tortoise-tts repo"""
+if __name__ == "__main__":
+    parser = ArgumentParser(
+        description="TorToiSe is a text-to-speech program that is capable of synthesizing speech "
+        "in multiple voices with realistic prosody and intonation."
+    )
+    # bugs out for some reason
+    # parser.add_argument(
+    #     "--web",
+    #     action="store_true",
+    #     help="launch the webui (doesn't pass it the other arguments)",
+    # )
+    parser.add_arguments(General, "general")
+    parser.add_arguments(Output, "output")
+    parser.add_arguments(MultiOutput, "multi_output")
+    parser.add_arguments(Advanced, "advanced")
+    parser.add_arguments(Tuning, "tuning")
+    parser.add_arguments(Speed, "speed")
+    usage_examples = f"""
+    Examples:
+    Read text using random voice and place it in a file:
+        {parser.prog} -o hello.wav "Hello, how are you?"
+    Read text from stdin and play it using the tom voice:
+        echo "Say it like you mean it!" | {parser.prog} -P -v tom
+    Read a text file using multiple voices and save the audio clips to a directory:
+        {parser.prog} -O /tmp/tts-results -v tom,emma <textfile.txt
+    """
+    # show usage even when Ctrl+C is pressed early
+    try:
+        args = parser.parse_args()
+    except SystemExit as e:
+        if e.code == 0:
+            print(usage_examples)
+        sys.exit(e.code)
+    # bugs out for some reason
+    # if args.web:
+    #     from importlib import import_module
+    #     app = import_module("app")
+    #     sys.exit(app.main())
+    from tortoise.inference import (
+        check_pydub,
+        get_all_voices,
+        get_seed,
+        parse_multiarg_text,
+        parse_voice_str,
+        split_text,
+        validate_output_dir,
+        voice_loader,
+        save_gen_with_voicefix
+    )
+    # get voices
+    all_voices, extra_voice_dirs = get_all_voices(args.general.voices_dir)
+    if args.output.list_voices:
+        for v in all_voices:
+            print(v)
+        sys.exit(0)
+    selected_voices = parse_voice_str(args.general.voice, all_voices)
+    voice_generator = voice_loader(selected_voices, extra_voice_dirs)
+    # parse text
+    if not args.general.text:
+        print("reading text from stdin!")
+    text = parse_multiarg_text(args.general.text)
+    texts = split_text(text, args.advanced.text_split)
+    output_dir = validate_output_dir(
+        args.output.output_dir, selected_voices, args.multi_output.candidates
+    )
+    # error out early if pydub isn't installed
+    pydub = check_pydub(args.output.play)
+    seed = get_seed(args.advanced.seed)
+    verbose = not args.general.quiet
+    vocoder = getattr(VocConf, args.advanced.vocoder)
+    if verbose:
+        print("Loading tts...")
+    tts = TextToSpeech(
+        models_dir=args.advanced.models_dir,
+        enable_redaction=not args.advanced.disable_redaction,
+        device=args.advanced.device,
+        autoregressive_batch_size=args.advanced.batch_size,
+        high_vram=not args.speed.low_vram,
+        kv_cache=not args.speed.no_cache,
+        ar_checkpoint=args.advanced.ar_checkpoint,
+        clvp_checkpoint=args.advanced.clvp_checkpoint,
+        diff_checkpoint=args.advanced.diff_checkpoint,
+        vocoder=vocoder,
+    )
+    gen_settings = {
+        "use_deterministic_seed": seed,
+        "verbose": verbose,
+        "k": args.multi_output.candidates,
+        "preset": args.general.preset,
+    }
+    tuning_options = [
+        "num_autoregressive_samples",
+        "temperature",
+        "length_penalty",
+        "repetition_penalty",
+        "top_p",
+        "max_mel_tokens",
+        "cvvp_amount",
+        "diffusion_iterations",
+        "cond_free",
+        "cond_free_k",
+        "diffusion_temperature",
+    ]
+    for option in tuning_options:
+        if getattr(args.tuning, option) is not None:
+            gen_settings[option] = getattr(args.tuning, option)
+    speed_options = [
+        "sampler",
+        "original_tortoise",
+        "half",
+    ]
+    for option in speed_options:
+        if getattr(args.speed, option) is not None:
+            gen_settings[option] = getattr(args.speed, option)
+    total_clips = len(texts) * len(selected_voices)
+    regenerate_clips = (
+        [int(x) for x in args.multi_output.regenerate.split(",")]
+        if args.multi_output.regenerate
+        else None
+    )
+    for voice_idx, (voice, voice_samples, conditioning_latents) in enumerate(
+        voice_generator
+    ):
+        audio_parts = []
+        for text_idx, text in enumerate(texts):
+            clip_name = f'{"-".join(voice)}_{text_idx:02d}'
+            if args.output.output_dir:
+                first_clip = os.path.join(args.output.output_dir, f"{clip_name}_00.wav")
+                if (
+                    args.multi_output.skip_existing
+                    or (regenerate_clips and text_idx not in regenerate_clips)
+                ) and os.path.exists(first_clip):
+                    audio_parts.append(load_audio(first_clip, 24000))
+                    if verbose:
+                        print(f"Skipping {clip_name}")
+                    continue
+            if verbose:
+                print(
+                    f"Rendering {clip_name} ({(voice_idx * len(texts) + text_idx + 1)} of {total_clips})..."
+                )
+                print("  " + text)
+            gen = tts.tts_with_preset(
+                text,
+                voice_samples=voice_samples,
+                conditioning_latents=conditioning_latents,
+                **gen_settings,
+            )
+            gen = gen if args.multi_output.candidates > 1 else [gen]
+            for candidate_idx, audio in enumerate(gen):
+                audio = audio.squeeze(0).cpu()
+                if candidate_idx == 0:
+                    audio_parts.append(audio)
+                if args.output.output_dir:
+                    filename = f"{clip_name}_{candidate_idx:02d}.wav"
+                    save_gen_with_voicefix(audio, os.path.join(args.output.output_dir, filename), squeeze=False, voicefixer=args.general.voicefixer)
+        audio = torch.cat(audio_parts, dim=-1)
+        if args.output.output_dir:
+            filename = f'{"-".join(voice)}_combined.wav'
+            save_gen_with_voicefix(
+                audio,
+                os.path.join(args.output.output_dir, filename),
+                squeeze=False,
+                voicefixer=args.general.voicefixer,
+            )
+        elif args.output.output:
+            filename = args.output.output or os.tmp
+            save_gen_with_voicefix(audio, filename, squeeze=False, voicefixer=args.general.voicefixer)
+        elif args.output.play:
+            print("WARNING: cannot use voicefixer with --play")
+            f = tempfile.NamedTemporaryFile(suffix=".wav", delete=True)
+            torchaudio.save(f.name, audio, 24000)
+            pydub.playback.play(pydub.AudioSegment.from_wav(f.name))
+        if args.advanced.produce_debug_state:
+            os.makedirs("debug_states", exist_ok=True)
+            dbg_state = (seed, texts, voice_samples, conditioning_latents, args)
+            torch.save(
+                dbg_state, os.path.join("debug_states", f'debug_{"-".join(voice)}.pth')
+            )

setup.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import setuptools
+with open("README.md", "r", encoding="utf-8") as fh:
+    long_description = fh.read()
+setuptools.setup(
+    name="TorToiSe",
+    packages=setuptools.find_packages(),
+    version="2.8.0",
+    author="James Betker",
+    author_email="[email protected]",
+    description="A high quality multi-voice text-to-speech library",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/neonbjb/tortoise-tts",
+    project_urls={},
+    scripts=[
+        'scripts/tortoise_tts.py',
+    ],
+    include_package_data=True,
+    install_requires=[
+        'tqdm',
+        'rotary_embedding_torch',
+        'inflect',
+        'progressbar',
+        'einops',
+        'unidecode',
+        'scipy',
+        'librosa',
+        'transformers==4.29.2',
+        'tokenizers',
+        'deepspeed==0.8.3',
+    ],
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: Apache Software License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires=">=3.6",
+)

tortoise_tts.ipynb ADDED Viewed

	@@ -0,0 +1,268 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "_pIZ3ZXNp7cf"
+   },
+   "source": [
+    "Welcome to Tortoise! 🐢🐢🐢🐢\n",
+    "\n",
+    "Before you begin, I **strongly** recommend you turn on a GPU runtime.\n",
+    "\n",
+    "There's a reason this is called \"Tortoise\" - this model takes up to a minute to perform inference for a single sentence on a GPU. Expect waits on the order of hours on a CPU."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 1000
+    },
+    "id": "JrK20I32grP6",
+    "outputId": "9711e23e-3bfc-4cb0-c030-25a1cf460972"
+   },
+   "outputs": [],
+   "source": [
+    "!git clone https://github.com/DjKesu/tortoise-tts-fast-cloning.git\n",
+    "%cd tortoise-tts-fast-cloning\n",
+    "!pip3 install -r requirements.txt --no-deps\n",
+    "!pip3 install -e .\n",
+    "!pip3 install git+https://github.com/152334H/BigVGAN.git"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip uninstall transformers\n",
+    "!pip install transformers==4.29.2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "zRW4p3ftjZ3Y"
+   },
+   "source": [
+    "## **Restart the runtime!**\n",
+    "## Ctrl+M for Colab"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Gen09NM4hONQ"
+   },
+   "outputs": [],
+   "source": [
+    "#@title # Setup\n",
+    "# Imports used through the rest of the notebook.\n",
+    "import torch\n",
+    "import torchaudio\n",
+    "import torch.nn as nn\n",
+    "import torch.nn.functional as F\n",
+    "\n",
+    "import IPython\n",
+    "\n",
+    "from tortoise.api import TextToSpeech\n",
+    "from tortoise.utils.audio import load_audio, load_voice, load_voices\n",
+    "\n",
+    "# This will download all the models used by Tortoise from the HuggingFace hub.\n",
+    "tts = TextToSpeech()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "bt_aoxONjfL2"
+   },
+   "outputs": [],
+   "source": [
+    "# This is the text that will be spoken.\n",
+    "text = \"Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?\" #@param {type:\"string\"}\n",
+    "#@markdown Show code for multiline text input\n",
+    "# Here's something for the poetically inclined.. (set text=)\n",
+    "\"\"\"\n",
+    "Then took the other, as just as fair,\n",
+    "And having perhaps the better claim,\n",
+    "Because it was grassy and wanted wear;\n",
+    "Though as for that the passing there\n",
+    "Had worn them really about the same,\"\"\"\n",
+    "\n",
+    "# Pick a \"preset mode\" to determine quality. Options: {\"ultra_fast\", \"fast\" (default), \"standard\", \"high_quality\"}. See docs in api.py\n",
+    "# added very_fast preset param option, since it involves resulution with dpm++2m, expected to give best,fastest results\n",
+    "preset = \"ultra_fast\" #@param [\"ultra_fast\", \"fast\", \"standard\", \"high_quality\", \"very_fast\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 211
+    },
+    "id": "SSleVnRAiEE2",
+    "outputId": "45b950c7-5c39-4075-bb34-0a76bf19e1bc"
+   },
+   "outputs": [],
+   "source": [
+    "#@markdown Tortoise will attempt to mimic voices you provide. It comes pre-packaged\n",
+    "#@markdown with some voices you might recognize.\n",
+    "\n",
+    "#@markdown Let's list all the voices available. These are just some random clips I've gathered\n",
+    "#@markdown from the internet as well as a few voices from the training dataset.\n",
+    "#@markdown Feel free to add your own clips to the voices/ folder.\n",
+    "#@markdown Currently stored my voice clips under voices/krish/ and displaying the random rumblings of my voice.\n",
+    "#@markdown each cell is the samples used, skip unless you wanna listen to them\n",
+    "%cd tortoise-tts-fast-cloning\n",
+    "%ls tortoise/voices/krish\n",
+    "import IPython\n",
+    "IPython.display.Audio('tortoise/voices/krish/1.wav')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%cd tortoise-tts-fast-cloning\n",
+    "%ls tortoise/voices/krish\n",
+    "import IPython\n",
+    "IPython.display.Audio('tortoise/voices/krish/2.wav')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%cd tortoise-tts-fast-cloning\n",
+    "%ls tortoise/voices/krish\n",
+    "import IPython\n",
+    "IPython.display.Audio('tortoise/voices/krish/3.wav')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%cd tortoise-tts-fast-cloning\n",
+    "%ls tortoise/voices/krish\n",
+    "import IPython\n",
+    "IPython.display.Audio('tortoise/voices/krish/4.wav')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 192
+    },
+    "id": "KEXOKjIvn6NW",
+    "outputId": "90c803f3-0b9b-4f24-ccbc-d3f3dcbde48c"
+   },
+   "outputs": [],
+   "source": [
+    "#@markdown Pick one of the voices from the output above\n",
+    "voice = 'krish' #@param {type:\"string\"}\n",
+    "\n",
+    "#@markdown Load it and send it through Tortoise.\n",
+    "voice_samples, conditioning_latents = load_voice(voice)\n",
+    "print(voice_samples)\n",
+    "# conditioning_latents = tts.get_conditioning_latents(\n",
+    "#     voice_samples,\n",
+    "#     return_mels=False,  # Set to True if you want mel spectrograms to be returned\n",
+    "#     latent_averaging_mode=1,  # Choose the mode (0, 1, or 2) as needed\n",
+    "#     original_tortoise=False,  # Set to True or False as needed\n",
+    "# )\n",
+    "gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents, \n",
+    "                          preset=preset)\n",
+    "torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)\n",
+    "IPython.display.Audio('generated.wav')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 41
+    },
+    "id": "VQgw3KeV8Yqb",
+    "outputId": "13db770e-3fcc-4b27-ab78-07a603a299d9"
+   },
+   "outputs": [],
+   "source": [
+    "#@markdown Optionally, upload use your own voice by running the next two cells. Change the name of the voice to a voice you want before running\n",
+    "#@markdown you upload at least 2 audio clips. They must be a WAV file, 6-10 seconds long.\n",
+    "CUSTOM_VOICE_NAME = \"custom\"\n",
+    "\n",
+    "import os\n",
+    "from google.colab import files\n",
+    "\n",
+    "custom_voice_folder = f\"tortoise/voices/{CUSTOM_VOICE_NAME}\"\n",
+    "os.makedirs(custom_voice_folder)\n",
+    "for i, file_data in enumerate(files.upload().values()):\n",
+    "  with open(os.path.join(custom_voice_folder, f'{i}.wav'), 'wb') as f:\n",
+    "    f.write(file_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "jJnJwv3R9uWT"
+   },
+   "outputs": [],
+   "source": [
+    "# Generate speech with the custotm voice.\n",
+    "voice_samples, conditioning_latents = load_voices(CUSTOM_VOICE_NAME)\n",
+    "gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents, \n",
+    "                          preset=preset)\n",
+    "torchaudio.save(f'generated-{CUSTOM_VOICE_NAME}.wav', gen.squeeze(0).cpu(), 24000)\n",
+    "IPython.display.Audio(f'generated-{CUSTOM_VOICE_NAME}.wav')"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}