MasonCrinr
commited on
Commit
·
762a084
1
Parent(s):
30a0b0e
Upload 580 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +3 -0
- .github/workflows +39 -0
- .gitignore +135 -0
- Advanced_Usage.md +103 -0
- CHANGELOG.md +36 -0
- CITATION.cff +10 -0
- Dockerfile +34 -0
- LICENSE +201 -0
- MANIFEST.in +2 -0
- README.md +227 -0
- examples/favorite_riding_hood.mp3 +0 -0
- examples/favorites/atkins_mha.mp3 +0 -0
- examples/favorites/atkins_omicron.mp3 +0 -0
- examples/favorites/atkins_value.mp3 +0 -0
- examples/favorites/daniel_craig_dumbledore.mp3 +0 -0
- examples/favorites/daniel_craig_training_ethics.mp3 +0 -0
- examples/favorites/dotrice_stop_for_death.mp3 +0 -0
- examples/favorites/emma_stone_courage.mp3 +0 -0
- examples/favorites/emma_stone_training_ethics.mp3 +0 -0
- examples/favorites/halle_barry_dumbledore.mp3 +0 -0
- examples/favorites/halle_barry_oar_to_oar.mp3 +0 -0
- examples/favorites/henry_cavill_metallic_hydrogen.mp3 +0 -0
- examples/favorites/kennard_road_not_taken.mp3 +0 -0
- examples/favorites/morgan_freeman_metallic_hydrogen.mp3 +0 -0
- examples/favorites/myself_gatsby.mp3 +0 -0
- examples/favorites/patrick_stewart_omicron.mp3 +0 -0
- examples/favorites/patrick_stewart_secret_of_life.mp3 +0 -0
- examples/favorites/robert_deniro_review.mp3 +0 -0
- examples/favorites/william_shatner_spacecraft_interview.mp3 +0 -0
- examples/finetuned/lj/1.mp3 +0 -0
- examples/finetuned/lj/2.mp3 +0 -0
- examples/finetuned/lj/3.mp3 +0 -0
- examples/finetuned/lj/4.mp3 +0 -0
- examples/naturalspeech_comparison/fibers/naturalspeech.mp3 +0 -0
- examples/naturalspeech_comparison/fibers/tortoise.mp3 +0 -0
- examples/naturalspeech_comparison/lax/naturalspeech.mp3 +0 -0
- examples/naturalspeech_comparison/lax/tortoise.mp3 +0 -0
- examples/naturalspeech_comparison/maltby/naturalspeech.mp3 +0 -0
- examples/naturalspeech_comparison/maltby/tortoise.mp3 +0 -0
- examples/prompting/angry.mp3 +0 -0
- examples/prompting/happy.mp3 +0 -0
- examples/prompting/sad.mp3 +0 -0
- examples/prompting/scared.mp3 +0 -0
- examples/riding_hood/angelina.mp3 +0 -0
- examples/riding_hood/craig.mp3 +0 -0
- examples/riding_hood/deniro.mp3 +0 -0
- examples/riding_hood/emma.mp3 +0 -0
- examples/riding_hood/freeman.mp3 +0 -0
- examples/riding_hood/geralt.mp3 +0 -0
- examples/riding_hood/halle.mp3 +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
tortoise/voices/angie/2.wav filter=lfs diff=lfs merge=lfs -text
|
37 |
+
tortoise/voices/deniro/2.wav filter=lfs diff=lfs merge=lfs -text
|
38 |
+
tortoise/voices/train_lescault/lescault_new4.wav filter=lfs diff=lfs merge=lfs -text
|
.github/workflows
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# This workflow will upload a Python Package using Twine when a release is created
|
2 |
+
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
|
3 |
+
|
4 |
+
# This workflow uses actions that are not certified by GitHub.
|
5 |
+
# They are provided by a third-party and are governed by
|
6 |
+
# separate terms of service, privacy policy, and support
|
7 |
+
# documentation.
|
8 |
+
|
9 |
+
name: Upload Python Package
|
10 |
+
|
11 |
+
on:
|
12 |
+
release:
|
13 |
+
types: [published]
|
14 |
+
|
15 |
+
permissions:
|
16 |
+
contents: read
|
17 |
+
|
18 |
+
jobs:
|
19 |
+
deploy:
|
20 |
+
|
21 |
+
runs-on: ubuntu-latest
|
22 |
+
|
23 |
+
steps:
|
24 |
+
- uses: actions/checkout@v3
|
25 |
+
- name: Set up Python
|
26 |
+
uses: actions/setup-python@v3
|
27 |
+
with:
|
28 |
+
python-version: '3.x'
|
29 |
+
- name: Install dependencies
|
30 |
+
run: |
|
31 |
+
python -m pip install --upgrade pip
|
32 |
+
pip install build
|
33 |
+
- name: Build package
|
34 |
+
run: python -m build
|
35 |
+
- name: Publish package
|
36 |
+
uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
|
37 |
+
with:
|
38 |
+
user: __token__
|
39 |
+
password: ${{ secrets.PYPI_API_TOKEN }}
|
.gitignore
ADDED
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
pip-wheel-metadata/
|
24 |
+
share/python-wheels/
|
25 |
+
*.egg-info/
|
26 |
+
.installed.cfg
|
27 |
+
*.egg
|
28 |
+
MANIFEST
|
29 |
+
|
30 |
+
# PyInstaller
|
31 |
+
# Usually these files are written by a python script from a template
|
32 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
33 |
+
*.manifest
|
34 |
+
*.spec
|
35 |
+
|
36 |
+
# Installer logs
|
37 |
+
pip-log.txt
|
38 |
+
pip-delete-this-directory.txt
|
39 |
+
|
40 |
+
# Unit test / coverage reports
|
41 |
+
htmlcov/
|
42 |
+
.tox/
|
43 |
+
.nox/
|
44 |
+
.coverage
|
45 |
+
.coverage.*
|
46 |
+
.cache
|
47 |
+
nosetests.xml
|
48 |
+
coverage.xml
|
49 |
+
*.cover
|
50 |
+
*.py,cover
|
51 |
+
.hypothesis/
|
52 |
+
.pytest_cache/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
target/
|
76 |
+
|
77 |
+
# Jupyter Notebook
|
78 |
+
.ipynb_checkpoints
|
79 |
+
|
80 |
+
# IPython
|
81 |
+
profile_default/
|
82 |
+
ipython_config.py
|
83 |
+
|
84 |
+
# pyenv
|
85 |
+
.python-version
|
86 |
+
|
87 |
+
# pipenv
|
88 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
89 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
90 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
91 |
+
# install all needed dependencies.
|
92 |
+
#Pipfile.lock
|
93 |
+
|
94 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
95 |
+
__pypackages__/
|
96 |
+
|
97 |
+
# Celery stuff
|
98 |
+
celerybeat-schedule
|
99 |
+
celerybeat.pid
|
100 |
+
|
101 |
+
# SageMath parsed files
|
102 |
+
*.sage.py
|
103 |
+
|
104 |
+
# Environments
|
105 |
+
.env
|
106 |
+
.venv
|
107 |
+
env/
|
108 |
+
venv/
|
109 |
+
ENV/
|
110 |
+
env.bak/
|
111 |
+
venv.bak/
|
112 |
+
|
113 |
+
# Spyder project settings
|
114 |
+
.spyderproject
|
115 |
+
.spyproject
|
116 |
+
|
117 |
+
# Rope project settings
|
118 |
+
.ropeproject
|
119 |
+
|
120 |
+
# mkdocs documentation
|
121 |
+
/site
|
122 |
+
|
123 |
+
# mypy
|
124 |
+
.mypy_cache/
|
125 |
+
.dmypy.json
|
126 |
+
dmypy.json
|
127 |
+
|
128 |
+
# Pyre type checker
|
129 |
+
.pyre/
|
130 |
+
|
131 |
+
.idea/*
|
132 |
+
.models/*
|
133 |
+
.custom/*
|
134 |
+
results/*
|
135 |
+
debug_states/*
|
Advanced_Usage.md
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Advanced Usage
|
2 |
+
|
3 |
+
### Generation settings
|
4 |
+
|
5 |
+
Tortoise is primarily an autoregressive decoder model combined with a diffusion model. Both of these have a lot of knobs
|
6 |
+
that can be turned that I've abstracted away for the sake of ease of use. I did this by generating thousands of clips using
|
7 |
+
various permutations of the settings and using a metric for voice realism and intelligibility to measure their effects. I've
|
8 |
+
set the defaults to the best overall settings I was able to find. For specific use-cases, it might be effective to play with
|
9 |
+
these settings (and it's very likely that I missed something!)
|
10 |
+
|
11 |
+
These settings are not available in the normal scripts packaged with Tortoise. They are available, however, in the API. See
|
12 |
+
```api.tts``` for a full list.
|
13 |
+
|
14 |
+
### Prompt engineering
|
15 |
+
|
16 |
+
Some people have discovered that it is possible to do prompt engineering with Tortoise! For example, you can evoke emotion
|
17 |
+
by including things like "I am really sad," before your text. I've built an automated redaction system that you can use to
|
18 |
+
take advantage of this. It works by attempting to redact any text in the prompt surrounded by brackets. For example, the
|
19 |
+
prompt "\[I am really sad,\] Please feed me." will only speak the words "Please feed me" (with a sad tonality).
|
20 |
+
|
21 |
+
### Playing with the voice latent
|
22 |
+
|
23 |
+
Tortoise ingests reference clips by feeding them through individually through a small submodel that produces a point latent,
|
24 |
+
then taking the mean of all of the produced latents. The experimentation I have done has indicated that these point latents
|
25 |
+
are quite expressive, affecting everything from tone to speaking rate to speech abnormalities.
|
26 |
+
|
27 |
+
This lends itself to some neat tricks. For example, you can combine feed two different voices to tortoise and it will output
|
28 |
+
what it thinks the "average" of those two voices sounds like.
|
29 |
+
|
30 |
+
#### Generating conditioning latents from voices
|
31 |
+
|
32 |
+
Use the script `get_conditioning_latents.py` to extract conditioning latents for a voice you have installed. This script
|
33 |
+
will dump the latents to a .pth pickle file. The file will contain a single tuple, (autoregressive_latent, diffusion_latent).
|
34 |
+
|
35 |
+
Alternatively, use the api.TextToSpeech.get_conditioning_latents() to fetch the latents.
|
36 |
+
|
37 |
+
#### Using raw conditioning latents to generate speech
|
38 |
+
|
39 |
+
After you've played with them, you can use them to generate speech by creating a subdirectory in voices/ with a single
|
40 |
+
".pth" file containing the pickled conditioning latents as a tuple (autoregressive_latent, diffusion_latent).
|
41 |
+
|
42 |
+
## Tortoise-detect
|
43 |
+
|
44 |
+
Out of concerns that this model might be misused, I've built a classifier that tells the likelihood that an audio clip
|
45 |
+
came from Tortoise.
|
46 |
+
|
47 |
+
This classifier can be run on any computer, usage is as follows:
|
48 |
+
|
49 |
+
```commandline
|
50 |
+
python tortoise/is_this_from_tortoise.py --clip=<path_to_suspicious_audio_file>
|
51 |
+
```
|
52 |
+
|
53 |
+
This model has 100% accuracy on the contents of the results/ and voices/ folders in this repo. Still, treat this classifier
|
54 |
+
as a "strong signal". Classifiers can be fooled and it is likewise not impossible for this classifier to exhibit false
|
55 |
+
positives.
|
56 |
+
|
57 |
+
## Model architecture
|
58 |
+
|
59 |
+
Tortoise TTS is inspired by OpenAI's DALLE, applied to speech data and using a better decoder. It is made up of 5 separate
|
60 |
+
models that work together. I've assembled a write-up of the system architecture here:
|
61 |
+
[https://nonint.com/2022/04/25/tortoise-architectural-design-doc/](https://nonint.com/2022/04/25/tortoise-architectural-design-doc/)
|
62 |
+
|
63 |
+
## Training
|
64 |
+
|
65 |
+
These models were trained on my "homelab" server with 8 RTX 3090s over the course of several months. They were trained on a dataset consisting of
|
66 |
+
~50k hours of speech data, most of which was transcribed by [ocotillo](http://www.github.com/neonbjb/ocotillo). Training was done on my own
|
67 |
+
[DLAS](https://github.com/neonbjb/DL-Art-School) trainer.
|
68 |
+
|
69 |
+
I currently do not have plans to release the training configurations or methodology. See the next section..
|
70 |
+
|
71 |
+
## Ethical Considerations
|
72 |
+
|
73 |
+
Tortoise v2 works considerably better than I had planned. When I began hearing some of the outputs of the last few versions, I began
|
74 |
+
wondering whether or not I had an ethically unsound project on my hands. The ways in which a voice-cloning text-to-speech system
|
75 |
+
could be misused are many. It doesn't take much creativity to think up how.
|
76 |
+
|
77 |
+
After some thought, I have decided to go forward with releasing this. Following are the reasons for this choice:
|
78 |
+
|
79 |
+
1. It is primarily good at reading books and speaking poetry. Other forms of speech do not work well.
|
80 |
+
2. It was trained on a dataset which does not have the voices of public figures. While it will attempt to mimic these voices if they are provided as references, it does not do so in such a way that most humans would be fooled.
|
81 |
+
3. The above points could likely be resolved by scaling up the model and the dataset. For this reason, I am currently withholding details on how I trained the model, pending community feedback.
|
82 |
+
4. I am releasing a separate classifier model which will tell you whether a given audio clip was generated by Tortoise or not. See `tortoise-detect` above.
|
83 |
+
5. If I, a tinkerer with a BS in computer science with a ~$15k computer can build this, then any motivated corporation or state can as well. I would prefer that it be in the open and everyone know the kinds of things ML can do.
|
84 |
+
|
85 |
+
### Diversity
|
86 |
+
|
87 |
+
The diversity expressed by ML models is strongly tied to the datasets they were trained on.
|
88 |
+
|
89 |
+
Tortoise was trained primarily on a dataset consisting of audiobooks. I made no effort to
|
90 |
+
balance diversity in this dataset. For this reason, Tortoise will be particularly poor at generating the voices of minorities
|
91 |
+
or of people who speak with strong accents.
|
92 |
+
|
93 |
+
## Looking forward
|
94 |
+
|
95 |
+
Tortoise v2 is about as good as I think I can do in the TTS world with the resources I have access to. A phenomenon that happens when
|
96 |
+
training very large models is that as parameter count increases, the communication bandwidth needed to support distributed training
|
97 |
+
of the model increases multiplicatively. On enterprise-grade hardware, this is not an issue: GPUs are attached together with
|
98 |
+
exceptionally wide buses that can accommodate this bandwidth. I cannot afford enterprise hardware, though, so I am stuck.
|
99 |
+
|
100 |
+
I want to mention here
|
101 |
+
that I think Tortoise could be a **lot** better. The three major components of Tortoise are either vanilla Transformer Encoder stacks
|
102 |
+
or Decoder stacks. Both of these types of models have a rich experimental history with scaling in the NLP realm. I see no reason
|
103 |
+
to believe that the same is not true of TTS.
|
CHANGELOG.md
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Changelog
|
2 |
+
#### v3.0.0; 2023/10/18
|
3 |
+
- Added fast inference for tortoise with HiFi Decoder (inspired by xtts by [coquiTTS](https://github.com/coqui-ai/TTS) 🐸, check out their multilingual model for noncommercial uses)
|
4 |
+
#### v2.8.0; 2023/9/13
|
5 |
+
- Added custom tokenizer for non-english models
|
6 |
+
#### v2.7.0; 2023/7/26
|
7 |
+
- Bug fixes
|
8 |
+
- Added Apple Silicon Support
|
9 |
+
- Updated Transformer version
|
10 |
+
#### v2.6.0; 2023/7/26
|
11 |
+
- Bug fixes
|
12 |
+
|
13 |
+
#### v2.5.0; 2023/7/09
|
14 |
+
- Added kv_cache support 5x faster
|
15 |
+
- Added deepspeed support 10x faster
|
16 |
+
- Added half precision support
|
17 |
+
|
18 |
+
#### v2.4.0; 2022/5/17
|
19 |
+
- Removed CVVP model. Found that it does not, in fact, make an appreciable difference in the output.
|
20 |
+
- Add better debugging support; existing tools now spit out debug files which can be used to reproduce bad runs.
|
21 |
+
|
22 |
+
#### v2.3.0; 2022/5/12
|
23 |
+
- New CLVP-large model for further improved decoding guidance.
|
24 |
+
- Improvements to read.py and do_tts.py (new options)
|
25 |
+
|
26 |
+
#### v2.2.0; 2022/5/5
|
27 |
+
- Added several new voices from the training set.
|
28 |
+
- Automated redaction. Wrap the text you want to use to prompt the model but not be spoken in brackets.
|
29 |
+
- Bug fixes
|
30 |
+
|
31 |
+
#### v2.1.0; 2022/5/2
|
32 |
+
- Added ability to produce totally random voices.
|
33 |
+
- Added ability to download voice conditioning latent via a script, and then use a user-provided conditioning latent.
|
34 |
+
- Added ability to use your own pretrained models.
|
35 |
+
- Refactored directory structures.
|
36 |
+
- Performance improvements & bug fixes.
|
CITATION.cff
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
cff-version: 1.3.0
|
2 |
+
message: "If you use this software, please cite it as below."
|
3 |
+
authors:
|
4 |
+
- family-names: "Betker"
|
5 |
+
given-names: "James"
|
6 |
+
orcid: "https://orcid.org/my-orcid?orcid=0000-0003-3259-4862"
|
7 |
+
title: "TorToiSe text-to-speech"
|
8 |
+
version: 2.0
|
9 |
+
date-released: 2022-04-28
|
10 |
+
url: "https://github.com/neonbjb/tortoise-tts"
|
Dockerfile
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM nvidia/cuda:12.2.0-base-ubuntu22.04
|
2 |
+
|
3 |
+
COPY . /app
|
4 |
+
|
5 |
+
RUN apt-get update && \
|
6 |
+
apt-get install -y --allow-unauthenticated --no-install-recommends \
|
7 |
+
wget \
|
8 |
+
git \
|
9 |
+
&& apt-get autoremove -y \
|
10 |
+
&& apt-get clean -y \
|
11 |
+
&& rm -rf /var/lib/apt/lists/*
|
12 |
+
|
13 |
+
ENV HOME "/root"
|
14 |
+
ENV CONDA_DIR "${HOME}/miniconda"
|
15 |
+
ENV PATH="$CONDA_DIR/bin":$PATH
|
16 |
+
ENV CONDA_AUTO_UPDATE_CONDA=false
|
17 |
+
ENV PIP_DOWNLOAD_CACHE="$HOME/.pip/cache"
|
18 |
+
ENV TORTOISE_MODELS_DIR
|
19 |
+
|
20 |
+
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh \
|
21 |
+
&& bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u \
|
22 |
+
&& "${CONDA_DIR}/bin/conda" init bash \
|
23 |
+
&& rm -f /tmp/miniconda3.sh \
|
24 |
+
&& echo ". '${CONDA_DIR}/etc/profile.d/conda.sh'" >> "${HOME}/.profile"
|
25 |
+
|
26 |
+
# --login option used to source bashrc (thus activating conda env) at every RUN statement
|
27 |
+
SHELL ["/bin/bash", "--login", "-c"]
|
28 |
+
|
29 |
+
RUN conda create --name tortoise python=3.9 numba inflect \
|
30 |
+
&& conda activate tortoise \
|
31 |
+
&& conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia \
|
32 |
+
&& conda install transformers=4.29.2 \
|
33 |
+
&& cd /app \
|
34 |
+
&& python setup.py install
|
LICENSE
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Apache License
|
2 |
+
Version 2.0, January 2004
|
3 |
+
http://www.apache.org/licenses/
|
4 |
+
|
5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
6 |
+
|
7 |
+
1. Definitions.
|
8 |
+
|
9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
11 |
+
|
12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
13 |
+
the copyright owner that is granting the License.
|
14 |
+
|
15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
16 |
+
other entities that control, are controlled by, or are under common
|
17 |
+
control with that entity. For the purposes of this definition,
|
18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
19 |
+
direction or management of such entity, whether by contract or
|
20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
22 |
+
|
23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
24 |
+
exercising permissions granted by this License.
|
25 |
+
|
26 |
+
"Source" form shall mean the preferred form for making modifications,
|
27 |
+
including but not limited to software source code, documentation
|
28 |
+
source, and configuration files.
|
29 |
+
|
30 |
+
"Object" form shall mean any form resulting from mechanical
|
31 |
+
transformation or translation of a Source form, including but
|
32 |
+
not limited to compiled object code, generated documentation,
|
33 |
+
and conversions to other media types.
|
34 |
+
|
35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
36 |
+
Object form, made available under the License, as indicated by a
|
37 |
+
copyright notice that is included in or attached to the work
|
38 |
+
(an example is provided in the Appendix below).
|
39 |
+
|
40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
41 |
+
form, that is based on (or derived from) the Work and for which the
|
42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
44 |
+
of this License, Derivative Works shall not include works that remain
|
45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
46 |
+
the Work and Derivative Works thereof.
|
47 |
+
|
48 |
+
"Contribution" shall mean any work of authorship, including
|
49 |
+
the original version of the Work and any modifications or additions
|
50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
54 |
+
means any form of electronic, verbal, or written communication sent
|
55 |
+
to the Licensor or its representatives, including but not limited to
|
56 |
+
communication on electronic mailing lists, source code control systems,
|
57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
59 |
+
excluding communication that is conspicuously marked or otherwise
|
60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
61 |
+
|
62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
64 |
+
subsequently incorporated within the Work.
|
65 |
+
|
66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
71 |
+
Work and such Derivative Works in Source or Object form.
|
72 |
+
|
73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
76 |
+
(except as stated in this section) patent license to make, have made,
|
77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
78 |
+
where such license applies only to those patent claims licensable
|
79 |
+
by such Contributor that are necessarily infringed by their
|
80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
82 |
+
institute patent litigation against any entity (including a
|
83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
84 |
+
or a Contribution incorporated within the Work constitutes direct
|
85 |
+
or contributory patent infringement, then any patent licenses
|
86 |
+
granted to You under this License for that Work shall terminate
|
87 |
+
as of the date such litigation is filed.
|
88 |
+
|
89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
90 |
+
Work or Derivative Works thereof in any medium, with or without
|
91 |
+
modifications, and in Source or Object form, provided that You
|
92 |
+
meet the following conditions:
|
93 |
+
|
94 |
+
(a) You must give any other recipients of the Work or
|
95 |
+
Derivative Works a copy of this License; and
|
96 |
+
|
97 |
+
(b) You must cause any modified files to carry prominent notices
|
98 |
+
stating that You changed the files; and
|
99 |
+
|
100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
101 |
+
that You distribute, all copyright, patent, trademark, and
|
102 |
+
attribution notices from the Source form of the Work,
|
103 |
+
excluding those notices that do not pertain to any part of
|
104 |
+
the Derivative Works; and
|
105 |
+
|
106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
107 |
+
distribution, then any Derivative Works that You distribute must
|
108 |
+
include a readable copy of the attribution notices contained
|
109 |
+
within such NOTICE file, excluding those notices that do not
|
110 |
+
pertain to any part of the Derivative Works, in at least one
|
111 |
+
of the following places: within a NOTICE text file distributed
|
112 |
+
as part of the Derivative Works; within the Source form or
|
113 |
+
documentation, if provided along with the Derivative Works; or,
|
114 |
+
within a display generated by the Derivative Works, if and
|
115 |
+
wherever such third-party notices normally appear. The contents
|
116 |
+
of the NOTICE file are for informational purposes only and
|
117 |
+
do not modify the License. You may add Your own attribution
|
118 |
+
notices within Derivative Works that You distribute, alongside
|
119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
120 |
+
that such additional attribution notices cannot be construed
|
121 |
+
as modifying the License.
|
122 |
+
|
123 |
+
You may add Your own copyright statement to Your modifications and
|
124 |
+
may provide additional or different license terms and conditions
|
125 |
+
for use, reproduction, or distribution of Your modifications, or
|
126 |
+
for any such Derivative Works as a whole, provided Your use,
|
127 |
+
reproduction, and distribution of the Work otherwise complies with
|
128 |
+
the conditions stated in this License.
|
129 |
+
|
130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
132 |
+
by You to the Licensor shall be under the terms and conditions of
|
133 |
+
this License, without any additional terms or conditions.
|
134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
135 |
+
the terms of any separate license agreement you may have executed
|
136 |
+
with Licensor regarding such Contributions.
|
137 |
+
|
138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
140 |
+
except as required for reasonable and customary use in describing the
|
141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
142 |
+
|
143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
144 |
+
agreed to in writing, Licensor provides the Work (and each
|
145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
147 |
+
implied, including, without limitation, any warranties or conditions
|
148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
150 |
+
appropriateness of using or redistributing the Work and assume any
|
151 |
+
risks associated with Your exercise of permissions under this License.
|
152 |
+
|
153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
154 |
+
whether in tort (including negligence), contract, or otherwise,
|
155 |
+
unless required by applicable law (such as deliberate and grossly
|
156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
157 |
+
liable to You for damages, including any direct, indirect, special,
|
158 |
+
incidental, or consequential damages of any character arising as a
|
159 |
+
result of this License or out of the use or inability to use the
|
160 |
+
Work (including but not limited to damages for loss of goodwill,
|
161 |
+
work stoppage, computer failure or malfunction, or any and all
|
162 |
+
other commercial damages or losses), even if such Contributor
|
163 |
+
has been advised of the possibility of such damages.
|
164 |
+
|
165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
168 |
+
or other liability obligations and/or rights consistent with this
|
169 |
+
License. However, in accepting such obligations, You may act only
|
170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
171 |
+
of any other Contributor, and only if You agree to indemnify,
|
172 |
+
defend, and hold each Contributor harmless for any liability
|
173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
174 |
+
of your accepting any such warranty or additional liability.
|
175 |
+
|
176 |
+
END OF TERMS AND CONDITIONS
|
177 |
+
|
178 |
+
APPENDIX: How to apply the Apache License to your work.
|
179 |
+
|
180 |
+
To apply the Apache License to your work, attach the following
|
181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
182 |
+
replaced with your own identifying information. (Don't include
|
183 |
+
the brackets!) The text should be enclosed in the appropriate
|
184 |
+
comment syntax for the file format. We also recommend that a
|
185 |
+
file or class name and description of purpose be included on the
|
186 |
+
same "printed page" as the copyright notice for easier
|
187 |
+
identification within third-party archives.
|
188 |
+
|
189 |
+
Copyright [yyyy] [name of copyright owner]
|
190 |
+
|
191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
192 |
+
you may not use this file except in compliance with the License.
|
193 |
+
You may obtain a copy of the License at
|
194 |
+
|
195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
196 |
+
|
197 |
+
Unless required by applicable law or agreed to in writing, software
|
198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
200 |
+
See the License for the specific language governing permissions and
|
201 |
+
limitations under the License.
|
MANIFEST.in
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
recursive-include tortoise/data *
|
2 |
+
recursive-include tortoise/voices *
|
README.md
ADDED
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# TorToiSe
|
2 |
+
|
3 |
+
Tortoise is a text-to-speech program built with the following priorities:
|
4 |
+
|
5 |
+
1. Strong multi-voice capabilities.
|
6 |
+
2. Highly realistic prosody and intonation.
|
7 |
+
|
8 |
+
This repo contains all the code needed to run Tortoise TTS in inference mode.
|
9 |
+
|
10 |
+
Manuscript: https://arxiv.org/abs/2305.07243
|
11 |
+
## Hugging Face space
|
12 |
+
|
13 |
+
A live demo is hosted on Hugging Face Spaces. If you'd like to avoid a queue, please duplicate the Space and add a GPU. Please note that CPU-only spaces do not work for this demo.
|
14 |
+
|
15 |
+
https://huggingface.co/spaces/Manmay/tortoise-tts
|
16 |
+
|
17 |
+
## Install via pip
|
18 |
+
```bash
|
19 |
+
pip install tortoise-tts
|
20 |
+
```
|
21 |
+
|
22 |
+
If you would like to install the latest development version, you can also install it directly from the git repository:
|
23 |
+
|
24 |
+
```bash
|
25 |
+
pip install git+https://github.com/neonbjb/tortoise-tts
|
26 |
+
```
|
27 |
+
|
28 |
+
## What's in a name?
|
29 |
+
|
30 |
+
I'm naming my speech-related repos after Mojave desert flora and fauna. Tortoise is a bit tongue in cheek: this model
|
31 |
+
is insanely slow. It leverages both an autoregressive decoder **and** a diffusion decoder; both known for their low
|
32 |
+
sampling rates. On a K80, expect to generate a medium sized sentence every 2 minutes.
|
33 |
+
|
34 |
+
well..... not so slow anymore now we can get a **0.25-0.3 RTF** on 4GB vram and with streaming we can get < **500 ms** latency !!!
|
35 |
+
|
36 |
+
## Demos
|
37 |
+
|
38 |
+
See [this page](http://nonint.com/static/tortoise_v2_examples.html) for a large list of example outputs.
|
39 |
+
|
40 |
+
A cool application of Tortoise + GPT-3 (not affiliated with this repository): https://twitter.com/lexman_ai. Unfortunately, this project no longer seems to be active.
|
41 |
+
|
42 |
+
## Usage guide
|
43 |
+
|
44 |
+
### Local installation
|
45 |
+
|
46 |
+
If you want to use this on your own computer, you must have an NVIDIA GPU.
|
47 |
+
|
48 |
+
On Windows, I **highly** recommend using the Conda installation path. I have been told that if you do not do this, you
|
49 |
+
will spend a lot of time chasing dependency problems.
|
50 |
+
|
51 |
+
First, install miniconda: https://docs.conda.io/en/latest/miniconda.html
|
52 |
+
|
53 |
+
Then run the following commands, using anaconda prompt as the terminal (or any other terminal configured to work with conda)
|
54 |
+
|
55 |
+
This will:
|
56 |
+
1. create conda environment with minimal dependencies specified
|
57 |
+
1. activate the environment
|
58 |
+
1. install pytorch with the command provided here: https://pytorch.org/get-started/locally/
|
59 |
+
1. clone tortoise-tts
|
60 |
+
1. change the current directory to tortoise-tts
|
61 |
+
1. run tortoise python setup install script
|
62 |
+
|
63 |
+
```shell
|
64 |
+
conda create --name tortoise python=3.9 numba inflect
|
65 |
+
conda activate tortoise
|
66 |
+
conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia
|
67 |
+
conda install transformers=4.29.2
|
68 |
+
git clone https://github.com/neonbjb/tortoise-tts.git
|
69 |
+
cd tortoise-tts
|
70 |
+
python setup.py install
|
71 |
+
```
|
72 |
+
|
73 |
+
Optionally, pytorch can be installed in the base environment, so that other conda environments can use it too. To do this, simply run the `conda install pytorch...` line before activating the tortoise environment.
|
74 |
+
|
75 |
+
> **Note:** When you want to use tortoise-tts, you will always have to ensure the `tortoise` conda environment is activated.
|
76 |
+
|
77 |
+
If you are on windows, you may also need to install pysoundfile: `conda install -c conda-forge pysoundfile`
|
78 |
+
|
79 |
+
### Docker
|
80 |
+
|
81 |
+
An easy way to hit the ground running and a good jumping off point depending on your use case.
|
82 |
+
|
83 |
+
```sh
|
84 |
+
git clone https://github.com/neonbjb/tortoise-tts.git
|
85 |
+
cd tortoise-tts
|
86 |
+
|
87 |
+
docker build . -t tts
|
88 |
+
|
89 |
+
docker run --gpus all \
|
90 |
+
-e TORTOISE_MODELS_DIR=/models \
|
91 |
+
-v /mnt/user/data/tortoise_tts/models:/models \
|
92 |
+
-v /mnt/user/data/tortoise_tts/results:/results \
|
93 |
+
-v /mnt/user/data/.cache/huggingface:/root/.cache/huggingface \
|
94 |
+
-v /root:/work \
|
95 |
+
-it tts
|
96 |
+
```
|
97 |
+
This gives you an interactive terminal in an environment that's ready to do some tts. Now you can explore the different interfaces that tortoise exposes for tts.
|
98 |
+
|
99 |
+
For example:
|
100 |
+
|
101 |
+
```sh
|
102 |
+
cd app
|
103 |
+
conda activate tortoise
|
104 |
+
time python tortoise/do_tts.py \
|
105 |
+
--output_path /results \
|
106 |
+
--preset ultra_fast \
|
107 |
+
--voice geralt \
|
108 |
+
    --text "Time flies like an arrow; fruit flies like a banana."
|
109 |
+
```
|
110 |
+
|
111 |
+
## Apple Silicon
|
112 |
+
|
113 |
+
On macOS 13+ with M1/M2 chips you need to install the nightly version of PyTorch. As stated on the official page, you can do:
|
114 |
+
|
115 |
+
```shell
|
116 |
+
pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
|
117 |
+
```
|
118 |
+
|
119 |
+
Be sure to do that after you activate the environment. If you don't use conda the commands would look like this:
|
120 |
+
|
121 |
+
```shell
|
122 |
+
python3.10 -m venv .venv
|
123 |
+
source .venv/bin/activate
|
124 |
+
pip install numba inflect psutil
|
125 |
+
pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
|
126 |
+
pip install transformers
|
127 |
+
git clone https://github.com/neonbjb/tortoise-tts.git
|
128 |
+
cd tortoise-tts
|
129 |
+
pip install .
|
130 |
+
```
|
131 |
+
|
132 |
+
Be aware that DeepSpeed is disabled on Apple Silicon since it does not work. The flag `--use_deepspeed` is ignored.
|
133 |
+
You may need to prepend `PYTORCH_ENABLE_MPS_FALLBACK=1` to the commands below to make them work since MPS does not support all the operations in Pytorch.
|
134 |
+
|
135 |
+
|
136 |
+
### do_tts.py
|
137 |
+
|
138 |
+
This script allows you to speak a single phrase with one or more voices.
|
139 |
+
```shell
|
140 |
+
python tortoise/do_tts.py --text "I'm going to speak this" --voice random --preset fast
|
141 |
+
```
|
142 |
+
### faster inference read.py
|
143 |
+
|
144 |
+
This script provides tools for reading large amounts of text.
|
145 |
+
|
146 |
+
```shell
|
147 |
+
python tortoise/read_fast.py --textfile <your text to be read> --voice random
|
148 |
+
```
|
149 |
+
|
150 |
+
### read.py
|
151 |
+
|
152 |
+
This script provides tools for reading large amounts of text.
|
153 |
+
|
154 |
+
```shell
|
155 |
+
python tortoise/read.py --textfile <your text to be read> --voice random
|
156 |
+
```
|
157 |
+
|
158 |
+
This will break up the textfile into sentences, and then convert them to speech one at a time. It will output a series
|
159 |
+
of spoken clips as they are generated. Once all the clips are generated, it will combine them into a single file and
|
160 |
+
output that as well.
|
161 |
+
|
162 |
+
Sometimes Tortoise screws up an output. You can re-generate any bad clips by re-running `read.py` with the --regenerate
|
163 |
+
argument.
|
164 |
+
|
165 |
+
### API
|
166 |
+
|
167 |
+
Tortoise can be used programmatically, like so:
|
168 |
+
|
169 |
+
```python
|
170 |
+
reference_clips = [utils.audio.load_audio(p, 22050) for p in clips_paths]
|
171 |
+
tts = api.TextToSpeech()
|
172 |
+
pcm_audio = tts.tts_with_preset("your text here", voice_samples=reference_clips, preset='fast')
|
173 |
+
```
|
174 |
+
|
175 |
+
To use deepspeed:
|
176 |
+
|
177 |
+
```python
|
178 |
+
reference_clips = [utils.audio.load_audio(p, 22050) for p in clips_paths]
|
179 |
+
tts = api.TextToSpeech(use_deepspeed=True)
|
180 |
+
pcm_audio = tts.tts_with_preset("your text here", voice_samples=reference_clips, preset='fast')
|
181 |
+
```
|
182 |
+
|
183 |
+
To use kv cache:
|
184 |
+
|
185 |
+
```python
|
186 |
+
reference_clips = [utils.audio.load_audio(p, 22050) for p in clips_paths]
|
187 |
+
tts = api.TextToSpeech(kv_cache=True)
|
188 |
+
pcm_audio = tts.tts_with_preset("your text here", voice_samples=reference_clips, preset='fast')
|
189 |
+
```
|
190 |
+
|
191 |
+
To run model in float16:
|
192 |
+
|
193 |
+
```python
|
194 |
+
reference_clips = [utils.audio.load_audio(p, 22050) for p in clips_paths]
|
195 |
+
tts = api.TextToSpeech(half=True)
|
196 |
+
pcm_audio = tts.tts_with_preset("your text here", voice_samples=reference_clips, preset='fast')
|
197 |
+
```
|
198 |
+
For faster runs, use all three:
|
199 |
+
|
200 |
+
```python
|
201 |
+
reference_clips = [utils.audio.load_audio(p, 22050) for p in clips_paths]
|
202 |
+
tts = api.TextToSpeech(use_deepspeed=True, kv_cache=True, half=True)
|
203 |
+
pcm_audio = tts.tts_with_preset("your text here", voice_samples=reference_clips, preset='fast')
|
204 |
+
```
|
205 |
+
|
206 |
+
## Acknowledgements
|
207 |
+
|
208 |
+
This project has garnered more praise than I expected. I am standing on the shoulders of giants, though, and I want to
|
209 |
+
credit a few of the amazing folks in the community that have helped make this happen:
|
210 |
+
|
211 |
+
- Hugging Face, who wrote the GPT model and the generate API used by Tortoise, and who hosts the model weights.
|
212 |
+
- [Ramesh et al](https://arxiv.org/pdf/2102.12092.pdf) who authored the DALLE paper, which is the inspiration behind Tortoise.
|
213 |
+
- [Nichol and Dhariwal](https://arxiv.org/pdf/2102.09672.pdf) who authored the (revision of) the code that drives the diffusion model.
|
214 |
+
- [Jang et al](https://arxiv.org/pdf/2106.07889.pdf) who developed and open-sourced univnet, the vocoder this repo uses.
|
215 |
+
- [Kim and Jung](https://github.com/mindslab-ai/univnet) who implemented univnet pytorch model.
|
216 |
+
- [lucidrains](https://github.com/lucidrains) who writes awesome open source pytorch models, many of which are used here.
|
217 |
+
- [Patrick von Platen](https://huggingface.co/patrickvonplaten) whose guides on setting up wav2vec were invaluable to building my dataset.
|
218 |
+
|
219 |
+
## Notice
|
220 |
+
|
221 |
+
Tortoise was built entirely by the author (James Betker) using their own hardware. Their employer was not involved in any facet of Tortoise's development.
|
222 |
+
|
223 |
+
## License
|
224 |
+
|
225 |
+
Tortoise TTS is licensed under the Apache 2.0 license.
|
226 |
+
|
227 |
+
If you use this repo or the ideas therein for your research, please cite it! A BibTeX entry can be found in the right pane on GitHub.
|
examples/favorite_riding_hood.mp3
ADDED
Binary file (970 kB). View file
|
|
examples/favorites/atkins_mha.mp3
ADDED
Binary file (31.6 kB). View file
|
|
examples/favorites/atkins_omicron.mp3
ADDED
Binary file (41.3 kB). View file
|
|
examples/favorites/atkins_value.mp3
ADDED
Binary file (18.5 kB). View file
|
|
examples/favorites/daniel_craig_dumbledore.mp3
ADDED
Binary file (24 kB). View file
|
|
examples/favorites/daniel_craig_training_ethics.mp3
ADDED
Binary file (48.9 kB). View file
|
|
examples/favorites/dotrice_stop_for_death.mp3
ADDED
Binary file (28.8 kB). View file
|
|
examples/favorites/emma_stone_courage.mp3
ADDED
Binary file (34.1 kB). View file
|
|
examples/favorites/emma_stone_training_ethics.mp3
ADDED
Binary file (48 kB). View file
|
|
examples/favorites/halle_barry_dumbledore.mp3
ADDED
Binary file (21.5 kB). View file
|
|
examples/favorites/halle_barry_oar_to_oar.mp3
ADDED
Binary file (40.9 kB). View file
|
|
examples/favorites/henry_cavill_metallic_hydrogen.mp3
ADDED
Binary file (32 kB). View file
|
|
examples/favorites/kennard_road_not_taken.mp3
ADDED
Binary file (28.5 kB). View file
|
|
examples/favorites/morgan_freeman_metallic_hydrogen.mp3
ADDED
Binary file (35.4 kB). View file
|
|
examples/favorites/myself_gatsby.mp3
ADDED
Binary file (28.1 kB). View file
|
|
examples/favorites/patrick_stewart_omicron.mp3
ADDED
Binary file (37.6 kB). View file
|
|
examples/favorites/patrick_stewart_secret_of_life.mp3
ADDED
Binary file (36.5 kB). View file
|
|
examples/favorites/robert_deniro_review.mp3
ADDED
Binary file (36.1 kB). View file
|
|
examples/favorites/william_shatner_spacecraft_interview.mp3
ADDED
Binary file (47.3 kB). View file
|
|
examples/finetuned/lj/1.mp3
ADDED
Binary file (38.2 kB). View file
|
|
examples/finetuned/lj/2.mp3
ADDED
Binary file (26.1 kB). View file
|
|
examples/finetuned/lj/3.mp3
ADDED
Binary file (18.5 kB). View file
|
|
examples/finetuned/lj/4.mp3
ADDED
Binary file (22.9 kB). View file
|
|
examples/naturalspeech_comparison/fibers/naturalspeech.mp3
ADDED
Binary file (33.1 kB). View file
|
|
examples/naturalspeech_comparison/fibers/tortoise.mp3
ADDED
Binary file (33.5 kB). View file
|
|
examples/naturalspeech_comparison/lax/naturalspeech.mp3
ADDED
Binary file (41 kB). View file
|
|
examples/naturalspeech_comparison/lax/tortoise.mp3
ADDED
Binary file (42.3 kB). View file
|
|
examples/naturalspeech_comparison/maltby/naturalspeech.mp3
ADDED
Binary file (35.1 kB). View file
|
|
examples/naturalspeech_comparison/maltby/tortoise.mp3
ADDED
Binary file (36.9 kB). View file
|
|
examples/prompting/angry.mp3
ADDED
Binary file (7.63 kB). View file
|
|
examples/prompting/happy.mp3
ADDED
Binary file (8.3 kB). View file
|
|
examples/prompting/sad.mp3
ADDED
Binary file (6.29 kB). View file
|
|
examples/prompting/scared.mp3
ADDED
Binary file (6.86 kB). View file
|
|
examples/riding_hood/angelina.mp3
ADDED
Binary file (866 kB). View file
|
|
examples/riding_hood/craig.mp3
ADDED
Binary file (826 kB). View file
|
|
examples/riding_hood/deniro.mp3
ADDED
Binary file (851 kB). View file
|
|
examples/riding_hood/emma.mp3
ADDED
Binary file (807 kB). View file
|
|
examples/riding_hood/freeman.mp3
ADDED
Binary file (943 kB). View file
|
|
examples/riding_hood/geralt.mp3
ADDED
Binary file (788 kB). View file
|
|
examples/riding_hood/halle.mp3
ADDED
Binary file (785 kB). View file
|
|